aboutsummaryrefslogtreecommitdiffstats
path: root/net/packet
diff options
context:
space:
mode:
author Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
commit 8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/packet
parent 406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'net/packet')
-rw-r--r-- net/packet/Kconfig 8
-rw-r--r-- net/packet/Makefile 2
-rw-r--r-- net/packet/af_packet.c 1211
-rw-r--r-- net/packet/diag.c 242
-rw-r--r-- net/packet/internal.h 122
5 files changed, 229 insertions, 1356 deletions
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
index cc55b35f80e..0060e3b396b 100644
--- a/net/packet/Kconfig
+++ b/net/packet/Kconfig
@@ -14,11 +14,3 @@ config PACKET
14 be called af_packet. 14 be called af_packet.
15 15
16 If unsure, say Y. 16 If unsure, say Y.
17
18config PACKET_DIAG
19 tristate "Packet: sockets monitoring interface"
20 depends on PACKET
21 default n
22 ---help---
23 Support for PF_PACKET sockets monitoring interface used by the ss tool.
24 If unsure, say Y.
diff --git a/net/packet/Makefile b/net/packet/Makefile
index 9df61347a3c..81183eabfde 100644
--- a/net/packet/Makefile
+++ b/net/packet/Makefile
@@ -3,5 +3,3 @@
3# 3#
4 4
5obj-$(CONFIG_PACKET) += af_packet.o 5obj-$(CONFIG_PACKET) += af_packet.o
6obj-$(CONFIG_PACKET_DIAG) += af_packet_diag.o
7af_packet_diag-y += diag.o
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e639645e8fe..fabb4fafa28 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -40,10 +40,6 @@
40 * byte arrays at the end of sockaddr_ll 40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq. 41 * and packet_mreq.
42 * Johann Baudy : Added TX RING. 42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
47 * 43 *
48 * This program is free software; you can redistribute it and/or 44 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License 45 * modify it under the terms of the GNU General Public License
@@ -73,6 +69,7 @@
73#include <net/sock.h> 69#include <net/sock.h>
74#include <linux/errno.h> 70#include <linux/errno.h>
75#include <linux/timer.h> 71#include <linux/timer.h>
72#include <asm/system.h>
76#include <asm/uaccess.h> 73#include <asm/uaccess.h>
77#include <asm/ioctls.h> 74#include <asm/ioctls.h>
78#include <asm/page.h> 75#include <asm/page.h>
@@ -93,8 +90,6 @@
93#include <net/inet_common.h> 90#include <net/inet_common.h>
94#endif 91#endif
95 92
96#include "internal.h"
97
98/* 93/*
99 Assumptions: 94 Assumptions:
100 - if device has no dev->hard_header routine, it adds and removes ll header 95 - if device has no dev->hard_header routine, it adds and removes ll header
@@ -148,6 +143,14 @@ dev->hard_header == NULL (ll header is added by device, we cannot control it)
148 143
149/* Private packet socket structures. */ 144/* Private packet socket structures. */
150 145
146struct packet_mclist {
147 struct packet_mclist *next;
148 int ifindex;
149 int count;
150 unsigned short type;
151 unsigned short alen;
152 unsigned char addr[MAX_ADDR_LEN];
153};
151/* identical to struct packet_mreq except it has 154/* identical to struct packet_mreq except it has
152 * a longer address field. 155 * a longer address field.
153 */ 156 */
@@ -158,55 +161,77 @@ struct packet_mreq_max {
158 unsigned char mr_address[MAX_ADDR_LEN]; 161 unsigned char mr_address[MAX_ADDR_LEN];
159}; 162};
160 163
161static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, 164static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162 int closing, int tx_ring); 165 int closing, int tx_ring);
163 166
167struct pgv {
168 char *buffer;
169};
164 170
165#define V3_ALIGNMENT (8) 171struct packet_ring_buffer {
166 172 struct pgv *pg_vec;
167#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) 173 unsigned int head;
168 174 unsigned int frames_per_block;
169#define BLK_PLUS_PRIV(sz_of_priv) \ 175 unsigned int frame_size;
170 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) 176 unsigned int frame_max;
171 177
172#define PGV_FROM_VMALLOC 1 178 unsigned int pg_vec_order;
179 unsigned int pg_vec_pages;
180 unsigned int pg_vec_len;
173 181
174#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) 182 atomic_t pending;
175#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) 183};
176#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
177#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
178#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
179#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
180#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
181 184
182struct packet_sock; 185struct packet_sock;
183static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); 186static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
184 187
185static void *packet_previous_frame(struct packet_sock *po,
186 struct packet_ring_buffer *rb,
187 int status);
188static void packet_increment_head(struct packet_ring_buffer *buff);
189static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
190 struct tpacket_block_desc *);
191static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
192 struct packet_sock *);
193static void prb_retire_current_block(struct tpacket_kbdq_core *,
194 struct packet_sock *, unsigned int status);
195static int prb_queue_frozen(struct tpacket_kbdq_core *);
196static void prb_open_block(struct tpacket_kbdq_core *,
197 struct tpacket_block_desc *);
198static void prb_retire_rx_blk_timer_expired(unsigned long);
199static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
200static void prb_init_blk_timer(struct packet_sock *,
201 struct tpacket_kbdq_core *,
202 void (*func) (unsigned long));
203static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
204static void prb_clear_rxhash(struct tpacket_kbdq_core *,
205 struct tpacket3_hdr *);
206static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
207 struct tpacket3_hdr *);
208static void packet_flush_mclist(struct sock *sk); 188static void packet_flush_mclist(struct sock *sk);
209 189
190struct packet_fanout;
191struct packet_sock {
192 /* struct sock has to be the first member of packet_sock */
193 struct sock sk;
194 struct packet_fanout *fanout;
195 struct tpacket_stats stats;
196 struct packet_ring_buffer rx_ring;
197 struct packet_ring_buffer tx_ring;
198 int copy_thresh;
199 spinlock_t bind_lock;
200 struct mutex pg_vec_lock;
201 unsigned int running:1, /* prot_hook is attached*/
202 auxdata:1,
203 origdev:1,
204 has_vnet_hdr:1;
205 int ifindex; /* bound device */
206 __be16 num;
207 struct packet_mclist *mclist;
208 atomic_t mapped;
209 enum tpacket_versions tp_version;
210 unsigned int tp_hdrlen;
211 unsigned int tp_reserve;
212 unsigned int tp_loss:1;
213 unsigned int tp_tstamp;
214 struct packet_type prot_hook ____cacheline_aligned_in_smp;
215};
216
217#define PACKET_FANOUT_MAX 256
218
219struct packet_fanout {
220#ifdef CONFIG_NET_NS
221 struct net *net;
222#endif
223 unsigned int num_members;
224 u16 id;
225 u8 type;
226 u8 defrag;
227 atomic_t rr_cur;
228 struct list_head list;
229 struct sock *arr[PACKET_FANOUT_MAX];
230 spinlock_t lock;
231 atomic_t sk_ref;
232 struct packet_type prot_hook ____cacheline_aligned_in_smp;
233};
234
210struct packet_skb_cb { 235struct packet_skb_cb {
211 unsigned int origlen; 236 unsigned int origlen;
212 union { 237 union {
@@ -217,14 +242,10 @@ struct packet_skb_cb {
217 242
218#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) 243#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
219 244
220#define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) 245static inline struct packet_sock *pkt_sk(struct sock *sk)
221#define GET_PBLOCK_DESC(x, bid) \ 246{
222 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) 247 return (struct packet_sock *)sk;
223#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ 248}
224 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
225#define GET_NEXT_PRB_BLK_NUM(x) \
226 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
227 ((x)->kactive_blk_num+1) : 0)
228 249
229static void __fanout_unlink(struct sock *sk, struct packet_sock *po); 250static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
230static void __fanout_link(struct sock *sk, struct packet_sock *po); 251static void __fanout_link(struct sock *sk, struct packet_sock *po);
@@ -304,9 +325,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
304 h.h2->tp_status = status; 325 h.h2->tp_status = status;
305 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 326 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
306 break; 327 break;
307 case TPACKET_V3:
308 default: 328 default:
309 WARN(1, "TPACKET version not supported.\n"); 329 pr_err("TPACKET version not supported\n");
310 BUG(); 330 BUG();
311 } 331 }
312 332
@@ -331,9 +351,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
331 case TPACKET_V2: 351 case TPACKET_V2:
332 flush_dcache_page(pgv_to_page(&h.h2->tp_status)); 352 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
333 return h.h2->tp_status; 353 return h.h2->tp_status;
334 case TPACKET_V3:
335 default: 354 default:
336 WARN(1, "TPACKET version not supported.\n"); 355 pr_err("TPACKET version not supported\n");
337 BUG(); 356 BUG();
338 return 0; 357 return 0;
339 } 358 }
@@ -363,672 +382,14 @@ static void *packet_lookup_frame(struct packet_sock *po,
363 return h.raw; 382 return h.raw;
364} 383}
365 384
366static void *packet_current_frame(struct packet_sock *po, 385static inline void *packet_current_frame(struct packet_sock *po,
367 struct packet_ring_buffer *rb, 386 struct packet_ring_buffer *rb,
368 int status) 387 int status)
369{ 388{
370 return packet_lookup_frame(po, rb, rb->head, status); 389 return packet_lookup_frame(po, rb, rb->head, status);
371} 390}
372 391
373static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) 392static inline void *packet_previous_frame(struct packet_sock *po,
374{
375 del_timer_sync(&pkc->retire_blk_timer);
376}
377
378static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
379 int tx_ring,
380 struct sk_buff_head *rb_queue)
381{
382 struct tpacket_kbdq_core *pkc;
383
384 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
385
386 spin_lock(&rb_queue->lock);
387 pkc->delete_blk_timer = 1;
388 spin_unlock(&rb_queue->lock);
389
390 prb_del_retire_blk_timer(pkc);
391}
392
393static void prb_init_blk_timer(struct packet_sock *po,
394 struct tpacket_kbdq_core *pkc,
395 void (*func) (unsigned long))
396{
397 init_timer(&pkc->retire_blk_timer);
398 pkc->retire_blk_timer.data = (long)po;
399 pkc->retire_blk_timer.function = func;
400 pkc->retire_blk_timer.expires = jiffies;
401}
402
403static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
404{
405 struct tpacket_kbdq_core *pkc;
406
407 if (tx_ring)
408 BUG();
409
410 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
411 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
412}
413
414static int prb_calc_retire_blk_tmo(struct packet_sock *po,
415 int blk_size_in_bytes)
416{
417 struct net_device *dev;
418 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
419 struct ethtool_cmd ecmd;
420 int err;
421 u32 speed;
422
423 rtnl_lock();
424 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
425 if (unlikely(!dev)) {
426 rtnl_unlock();
427 return DEFAULT_PRB_RETIRE_TOV;
428 }
429 err = __ethtool_get_settings(dev, &ecmd);
430 speed = ethtool_cmd_speed(&ecmd);
431 rtnl_unlock();
432 if (!err) {
433 /*
434 * If the link speed is so slow you don't really
435 * need to worry about perf anyways
436 */
437 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
438 return DEFAULT_PRB_RETIRE_TOV;
439 } else {
440 msec = 1;
441 div = speed / 1000;
442 }
443 }
444
445 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
446
447 if (div)
448 mbits /= div;
449
450 tmo = mbits * msec;
451
452 if (div)
453 return tmo+1;
454 return tmo;
455}
456
457static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
458 union tpacket_req_u *req_u)
459{
460 p1->feature_req_word = req_u->req3.tp_feature_req_word;
461}
462
463static void init_prb_bdqc(struct packet_sock *po,
464 struct packet_ring_buffer *rb,
465 struct pgv *pg_vec,
466 union tpacket_req_u *req_u, int tx_ring)
467{
468 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
469 struct tpacket_block_desc *pbd;
470
471 memset(p1, 0x0, sizeof(*p1));
472
473 p1->knxt_seq_num = 1;
474 p1->pkbdq = pg_vec;
475 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
476 p1->pkblk_start = pg_vec[0].buffer;
477 p1->kblk_size = req_u->req3.tp_block_size;
478 p1->knum_blocks = req_u->req3.tp_block_nr;
479 p1->hdrlen = po->tp_hdrlen;
480 p1->version = po->tp_version;
481 p1->last_kactive_blk_num = 0;
482 po->stats_u.stats3.tp_freeze_q_cnt = 0;
483 if (req_u->req3.tp_retire_blk_tov)
484 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
485 else
486 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
487 req_u->req3.tp_block_size);
488 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
489 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
490
491 prb_init_ft_ops(p1, req_u);
492 prb_setup_retire_blk_timer(po, tx_ring);
493 prb_open_block(p1, pbd);
494}
495
496/* Do NOT update the last_blk_num first.
497 * Assumes sk_buff_head lock is held.
498 */
499static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
500{
501 mod_timer(&pkc->retire_blk_timer,
502 jiffies + pkc->tov_in_jiffies);
503 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
504}
505
506/*
507 * Timer logic:
508 * 1) We refresh the timer only when we open a block.
509 * By doing this we don't waste cycles refreshing the timer
510 * on packet-by-packet basis.
511 *
512 * With a 1MB block-size, on a 1Gbps line, it will take
513 * i) ~8 ms to fill a block + ii) memcpy etc.
514 * In this cut we are not accounting for the memcpy time.
515 *
516 * So, if the user sets the 'tmo' to 10ms then the timer
517 * will never fire while the block is still getting filled
518 * (which is what we want). However, the user could choose
519 * to close a block early and that's fine.
520 *
521 * But when the timer does fire, we check whether or not to refresh it.
522 * Since the tmo granularity is in msecs, it is not too expensive
523 * to refresh the timer, lets say every '8' msecs.
524 * Either the user can set the 'tmo' or we can derive it based on
525 * a) line-speed and b) block-size.
526 * prb_calc_retire_blk_tmo() calculates the tmo.
527 *
528 */
529static void prb_retire_rx_blk_timer_expired(unsigned long data)
530{
531 struct packet_sock *po = (struct packet_sock *)data;
532 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
533 unsigned int frozen;
534 struct tpacket_block_desc *pbd;
535
536 spin_lock(&po->sk.sk_receive_queue.lock);
537
538 frozen = prb_queue_frozen(pkc);
539 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
540
541 if (unlikely(pkc->delete_blk_timer))
542 goto out;
543
544 /* We only need to plug the race when the block is partially filled.
545 * tpacket_rcv:
546 * lock(); increment BLOCK_NUM_PKTS; unlock()
547 * copy_bits() is in progress ...
548 * timer fires on other cpu:
549 * we can't retire the current block because copy_bits
550 * is in progress.
551 *
552 */
553 if (BLOCK_NUM_PKTS(pbd)) {
554 while (atomic_read(&pkc->blk_fill_in_prog)) {
555 /* Waiting for skb_copy_bits to finish... */
556 cpu_relax();
557 }
558 }
559
560 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
561 if (!frozen) {
562 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
563 if (!prb_dispatch_next_block(pkc, po))
564 goto refresh_timer;
565 else
566 goto out;
567 } else {
568 /* Case 1. Queue was frozen because user-space was
569 * lagging behind.
570 */
571 if (prb_curr_blk_in_use(pkc, pbd)) {
572 /*
573 * Ok, user-space is still behind.
574 * So just refresh the timer.
575 */
576 goto refresh_timer;
577 } else {
578 /* Case 2. queue was frozen,user-space caught up,
579 * now the link went idle && the timer fired.
580 * We don't have a block to close.So we open this
581 * block and restart the timer.
582 * opening a block thaws the queue,restarts timer
583 * Thawing/timer-refresh is a side effect.
584 */
585 prb_open_block(pkc, pbd);
586 goto out;
587 }
588 }
589 }
590
591refresh_timer:
592 _prb_refresh_rx_retire_blk_timer(pkc);
593
594out:
595 spin_unlock(&po->sk.sk_receive_queue.lock);
596}
597
598static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
599 struct tpacket_block_desc *pbd1, __u32 status)
600{
601 /* Flush everything minus the block header */
602
603#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
604 u8 *start, *end;
605
606 start = (u8 *)pbd1;
607
608 /* Skip the block header(we know header WILL fit in 4K) */
609 start += PAGE_SIZE;
610
611 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
612 for (; start < end; start += PAGE_SIZE)
613 flush_dcache_page(pgv_to_page(start));
614
615 smp_wmb();
616#endif
617
618 /* Now update the block status. */
619
620 BLOCK_STATUS(pbd1) = status;
621
622 /* Flush the block header */
623
624#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
625 start = (u8 *)pbd1;
626 flush_dcache_page(pgv_to_page(start));
627
628 smp_wmb();
629#endif
630}
631
632/*
633 * Side effect:
634 *
635 * 1) flush the block
636 * 2) Increment active_blk_num
637 *
638 * Note:We DONT refresh the timer on purpose.
639 * Because almost always the next block will be opened.
640 */
641static void prb_close_block(struct tpacket_kbdq_core *pkc1,
642 struct tpacket_block_desc *pbd1,
643 struct packet_sock *po, unsigned int stat)
644{
645 __u32 status = TP_STATUS_USER | stat;
646
647 struct tpacket3_hdr *last_pkt;
648 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
649
650 if (po->stats.tp_drops)
651 status |= TP_STATUS_LOSING;
652
653 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
654 last_pkt->tp_next_offset = 0;
655
656 /* Get the ts of the last pkt */
657 if (BLOCK_NUM_PKTS(pbd1)) {
658 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
659 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
660 } else {
661 /* Ok, we tmo'd - so get the current time */
662 struct timespec ts;
663 getnstimeofday(&ts);
664 h1->ts_last_pkt.ts_sec = ts.tv_sec;
665 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
666 }
667
668 smp_wmb();
669
670 /* Flush the block */
671 prb_flush_block(pkc1, pbd1, status);
672
673 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
674}
675
676static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
677{
678 pkc->reset_pending_on_curr_blk = 0;
679}
680
681/*
682 * Side effect of opening a block:
683 *
684 * 1) prb_queue is thawed.
685 * 2) retire_blk_timer is refreshed.
686 *
687 */
688static void prb_open_block(struct tpacket_kbdq_core *pkc1,
689 struct tpacket_block_desc *pbd1)
690{
691 struct timespec ts;
692 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
693
694 smp_rmb();
695
696 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
697
698 /* We could have just memset this but we will lose the
699 * flexibility of making the priv area sticky
700 */
701 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
702 BLOCK_NUM_PKTS(pbd1) = 0;
703 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
704 getnstimeofday(&ts);
705 h1->ts_first_pkt.ts_sec = ts.tv_sec;
706 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
707 pkc1->pkblk_start = (char *)pbd1;
708 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
709 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
710 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
711 pbd1->version = pkc1->version;
712 pkc1->prev = pkc1->nxt_offset;
713 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
714 prb_thaw_queue(pkc1);
715 _prb_refresh_rx_retire_blk_timer(pkc1);
716
717 smp_wmb();
718
719 return;
720 }
721
722 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
723 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
724 dump_stack();
725 BUG();
726}
727
728/*
729 * Queue freeze logic:
730 * 1) Assume tp_block_nr = 8 blocks.
731 * 2) At time 't0', user opens Rx ring.
732 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
733 * 4) user-space is either sleeping or processing block '0'.
734 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
735 * it will close block-7,loop around and try to fill block '0'.
736 * call-flow:
737 * __packet_lookup_frame_in_block
738 * prb_retire_current_block()
739 * prb_dispatch_next_block()
740 * |->(BLOCK_STATUS == USER) evaluates to true
741 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
742 * 6) Now there are two cases:
743 * 6.1) Link goes idle right after the queue is frozen.
744 * But remember, the last open_block() refreshed the timer.
745 * When this timer expires,it will refresh itself so that we can
746 * re-open block-0 in near future.
747 * 6.2) Link is busy and keeps on receiving packets. This is a simple
748 * case and __packet_lookup_frame_in_block will check if block-0
749 * is free and can now be re-used.
750 */
751static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
752 struct packet_sock *po)
753{
754 pkc->reset_pending_on_curr_blk = 1;
755 po->stats_u.stats3.tp_freeze_q_cnt++;
756}
757
758#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
759
760/*
761 * If the next block is free then we will dispatch it
762 * and return a good offset.
763 * Else, we will freeze the queue.
764 * So, caller must check the return value.
765 */
766static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
767 struct packet_sock *po)
768{
769 struct tpacket_block_desc *pbd;
770
771 smp_rmb();
772
773 /* 1. Get current block num */
774 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
775
776 /* 2. If this block is currently in_use then freeze the queue */
777 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
778 prb_freeze_queue(pkc, po);
779 return NULL;
780 }
781
782 /*
783 * 3.
784 * open this block and return the offset where the first packet
785 * needs to get stored.
786 */
787 prb_open_block(pkc, pbd);
788 return (void *)pkc->nxt_offset;
789}
790
791static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
792 struct packet_sock *po, unsigned int status)
793{
794 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
795
796 /* retire/close the current block */
797 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
798 /*
799 * Plug the case where copy_bits() is in progress on
800 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
801 * have space to copy the pkt in the current block and
802 * called prb_retire_current_block()
803 *
804 * We don't need to worry about the TMO case because
805 * the timer-handler already handled this case.
806 */
807 if (!(status & TP_STATUS_BLK_TMO)) {
808 while (atomic_read(&pkc->blk_fill_in_prog)) {
809 /* Waiting for skb_copy_bits to finish... */
810 cpu_relax();
811 }
812 }
813 prb_close_block(pkc, pbd, po, status);
814 return;
815 }
816
817 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
818 dump_stack();
819 BUG();
820}
821
822static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
823 struct tpacket_block_desc *pbd)
824{
825 return TP_STATUS_USER & BLOCK_STATUS(pbd);
826}
827
828static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
829{
830 return pkc->reset_pending_on_curr_blk;
831}
832
833static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
834{
835 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
836 atomic_dec(&pkc->blk_fill_in_prog);
837}
838
839static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
840 struct tpacket3_hdr *ppd)
841{
842 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
843}
844
845static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
846 struct tpacket3_hdr *ppd)
847{
848 ppd->hv1.tp_rxhash = 0;
849}
850
851static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
852 struct tpacket3_hdr *ppd)
853{
854 if (vlan_tx_tag_present(pkc->skb)) {
855 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
856 ppd->tp_status = TP_STATUS_VLAN_VALID;
857 } else {
858 ppd->hv1.tp_vlan_tci = 0;
859 ppd->tp_status = TP_STATUS_AVAILABLE;
860 }
861}
862
863static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
864 struct tpacket3_hdr *ppd)
865{
866 prb_fill_vlan_info(pkc, ppd);
867
868 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
869 prb_fill_rxhash(pkc, ppd);
870 else
871 prb_clear_rxhash(pkc, ppd);
872}
873
874static void prb_fill_curr_block(char *curr,
875 struct tpacket_kbdq_core *pkc,
876 struct tpacket_block_desc *pbd,
877 unsigned int len)
878{
879 struct tpacket3_hdr *ppd;
880
881 ppd = (struct tpacket3_hdr *)curr;
882 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
883 pkc->prev = curr;
884 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
885 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
886 BLOCK_NUM_PKTS(pbd) += 1;
887 atomic_inc(&pkc->blk_fill_in_prog);
888 prb_run_all_ft_ops(pkc, ppd);
889}
890
891/* Assumes caller has the sk->rx_queue.lock */
892static void *__packet_lookup_frame_in_block(struct packet_sock *po,
893 struct sk_buff *skb,
894 int status,
895 unsigned int len
896 )
897{
898 struct tpacket_kbdq_core *pkc;
899 struct tpacket_block_desc *pbd;
900 char *curr, *end;
901
902 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
903 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
904
905 /* Queue is frozen when user space is lagging behind */
906 if (prb_queue_frozen(pkc)) {
907 /*
908 * Check if that last block which caused the queue to freeze,
909 * is still in_use by user-space.
910 */
911 if (prb_curr_blk_in_use(pkc, pbd)) {
912 /* Can't record this packet */
913 return NULL;
914 } else {
915 /*
916 * Ok, the block was released by user-space.
917 * Now let's open that block.
918 * opening a block also thaws the queue.
919 * Thawing is a side effect.
920 */
921 prb_open_block(pkc, pbd);
922 }
923 }
924
925 smp_mb();
926 curr = pkc->nxt_offset;
927 pkc->skb = skb;
928 end = (char *)pbd + pkc->kblk_size;
929
930 /* first try the current block */
931 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
932 prb_fill_curr_block(curr, pkc, pbd, len);
933 return (void *)curr;
934 }
935
936 /* Ok, close the current block */
937 prb_retire_current_block(pkc, po, 0);
938
939 /* Now, try to dispatch the next block */
940 curr = (char *)prb_dispatch_next_block(pkc, po);
941 if (curr) {
942 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
943 prb_fill_curr_block(curr, pkc, pbd, len);
944 return (void *)curr;
945 }
946
947 /*
948 * No free blocks are available.user_space hasn't caught up yet.
949 * Queue was just frozen and now this packet will get dropped.
950 */
951 return NULL;
952}
953
954static void *packet_current_rx_frame(struct packet_sock *po,
955 struct sk_buff *skb,
956 int status, unsigned int len)
957{
958 char *curr = NULL;
959 switch (po->tp_version) {
960 case TPACKET_V1:
961 case TPACKET_V2:
962 curr = packet_lookup_frame(po, &po->rx_ring,
963 po->rx_ring.head, status);
964 return curr;
965 case TPACKET_V3:
966 return __packet_lookup_frame_in_block(po, skb, status, len);
967 default:
968 WARN(1, "TPACKET version not supported\n");
969 BUG();
970 return NULL;
971 }
972}
973
974static void *prb_lookup_block(struct packet_sock *po,
975 struct packet_ring_buffer *rb,
976 unsigned int previous,
977 int status)
978{
979 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
980 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
981
982 if (status != BLOCK_STATUS(pbd))
983 return NULL;
984 return pbd;
985}
986
987static int prb_previous_blk_num(struct packet_ring_buffer *rb)
988{
989 unsigned int prev;
990 if (rb->prb_bdqc.kactive_blk_num)
991 prev = rb->prb_bdqc.kactive_blk_num-1;
992 else
993 prev = rb->prb_bdqc.knum_blocks-1;
994 return prev;
995}
996
997/* Assumes caller has held the rx_queue.lock */
998static void *__prb_previous_block(struct packet_sock *po,
999 struct packet_ring_buffer *rb,
1000 int status)
1001{
1002 unsigned int previous = prb_previous_blk_num(rb);
1003 return prb_lookup_block(po, rb, previous, status);
1004}
1005
1006static void *packet_previous_rx_frame(struct packet_sock *po,
1007 struct packet_ring_buffer *rb,
1008 int status)
1009{
1010 if (po->tp_version <= TPACKET_V2)
1011 return packet_previous_frame(po, rb, status);
1012
1013 return __prb_previous_block(po, rb, status);
1014}
1015
1016static void packet_increment_rx_head(struct packet_sock *po,
1017 struct packet_ring_buffer *rb)
1018{
1019 switch (po->tp_version) {
1020 case TPACKET_V1:
1021 case TPACKET_V2:
1022 return packet_increment_head(rb);
1023 case TPACKET_V3:
1024 default:
1025 WARN(1, "TPACKET version not supported.\n");
1026 BUG();
1027 return;
1028 }
1029}
1030
1031static void *packet_previous_frame(struct packet_sock *po,
1032 struct packet_ring_buffer *rb, 393 struct packet_ring_buffer *rb,
1033 int status) 394 int status)
1034{ 395{
@@ -1036,7 +397,7 @@ static void *packet_previous_frame(struct packet_sock *po,
1036 return packet_lookup_frame(po, rb, previous, status); 397 return packet_lookup_frame(po, rb, previous, status);
1037} 398}
1038 399
1039static void packet_increment_head(struct packet_ring_buffer *buff) 400static inline void packet_increment_head(struct packet_ring_buffer *buff)
1040{ 401{
1041 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; 402 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1042} 403}
@@ -1093,6 +454,43 @@ static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *sk
1093 return f->arr[cpu % num]; 454 return f->arr[cpu % num];
1094} 455}
1095 456
457static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
458{
459#ifdef CONFIG_INET
460 const struct iphdr *iph;
461 u32 len;
462
463 if (skb->protocol != htons(ETH_P_IP))
464 return skb;
465
466 if (!pskb_may_pull(skb, sizeof(struct iphdr)))
467 return skb;
468
469 iph = ip_hdr(skb);
470 if (iph->ihl < 5 || iph->version != 4)
471 return skb;
472 if (!pskb_may_pull(skb, iph->ihl*4))
473 return skb;
474 iph = ip_hdr(skb);
475 len = ntohs(iph->tot_len);
476 if (skb->len < len || len < (iph->ihl * 4))
477 return skb;
478
479 if (ip_is_fragment(ip_hdr(skb))) {
480 skb = skb_share_check(skb, GFP_ATOMIC);
481 if (skb) {
482 if (pskb_trim_rcsum(skb, len))
483 return skb;
484 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
485 if (ip_defrag(skb, IP_DEFRAG_AF_PACKET))
486 return NULL;
487 skb->rxhash = 0;
488 }
489 }
490#endif
491 return skb;
492}
493
1096static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, 494static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1097 struct packet_type *pt, struct net_device *orig_dev) 495 struct packet_type *pt, struct net_device *orig_dev)
1098{ 496{
@@ -1111,7 +509,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1111 case PACKET_FANOUT_HASH: 509 case PACKET_FANOUT_HASH:
1112 default: 510 default:
1113 if (f->defrag) { 511 if (f->defrag) {
1114 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); 512 skb = fanout_check_defrag(skb);
1115 if (!skb) 513 if (!skb)
1116 return 0; 514 return 0;
1117 } 515 }
@@ -1131,8 +529,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1131 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); 529 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1132} 530}
1133 531
1134DEFINE_MUTEX(fanout_mutex); 532static DEFINE_MUTEX(fanout_mutex);
1135EXPORT_SYMBOL_GPL(fanout_mutex);
1136static LIST_HEAD(fanout_list); 533static LIST_HEAD(fanout_list);
1137 534
1138static void __fanout_link(struct sock *sk, struct packet_sock *po) 535static void __fanout_link(struct sock *sk, struct packet_sock *po)
@@ -1162,14 +559,6 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1162 spin_unlock(&f->lock); 559 spin_unlock(&f->lock);
1163} 560}
1164 561
1165static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
1166{
1167 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1168 return true;
1169
1170 return false;
1171}
1172
1173static int fanout_add(struct sock *sk, u16 id, u16 type_flags) 562static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1174{ 563{
1175 struct packet_sock *po = pkt_sk(sk); 564 struct packet_sock *po = pkt_sk(sk);
@@ -1222,7 +611,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1222 match->prot_hook.dev = po->prot_hook.dev; 611 match->prot_hook.dev = po->prot_hook.dev;
1223 match->prot_hook.func = packet_rcv_fanout; 612 match->prot_hook.func = packet_rcv_fanout;
1224 match->prot_hook.af_packet_priv = match; 613 match->prot_hook.af_packet_priv = match;
1225 match->prot_hook.id_match = match_fanout_group;
1226 dev_add_pack(&match->prot_hook); 614 dev_add_pack(&match->prot_hook);
1227 list_add(&match->list, &fanout_list); 615 list_add(&match->list, &fanout_list);
1228 } 616 }
@@ -1253,9 +641,9 @@ static void fanout_release(struct sock *sk)
1253 if (!f) 641 if (!f)
1254 return; 642 return;
1255 643
1256 mutex_lock(&fanout_mutex);
1257 po->fanout = NULL; 644 po->fanout = NULL;
1258 645
646 mutex_lock(&fanout_mutex);
1259 if (atomic_dec_and_test(&f->sk_ref)) { 647 if (atomic_dec_and_test(&f->sk_ref)) {
1260 list_del(&f->list); 648 list_del(&f->list);
1261 dev_remove_pack(&f->prot_hook); 649 dev_remove_pack(&f->prot_hook);
@@ -1349,7 +737,6 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1349 struct net_device *dev; 737 struct net_device *dev;
1350 __be16 proto = 0; 738 __be16 proto = 0;
1351 int err; 739 int err;
1352 int extra_len = 0;
1353 740
1354 /* 741 /*
1355 * Get and verify the address. 742 * Get and verify the address.
@@ -1367,7 +754,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
1367 * Find the device first to size check it 754 * Find the device first to size check it
1368 */ 755 */
1369 756
1370 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; 757 saddr->spkt_device[13] = 0;
1371retry: 758retry:
1372 rcu_read_lock(); 759 rcu_read_lock();
1373 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); 760 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
@@ -1384,25 +771,16 @@ retry:
1384 * raw protocol and you must do your own fragmentation at this level. 771 * raw protocol and you must do your own fragmentation at this level.
1385 */ 772 */
1386 773
1387 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1388 if (!netif_supports_nofcs(dev)) {
1389 err = -EPROTONOSUPPORT;
1390 goto out_unlock;
1391 }
1392 extra_len = 4; /* We're doing our own CRC */
1393 }
1394
1395 err = -EMSGSIZE; 774 err = -EMSGSIZE;
1396 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) 775 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN)
1397 goto out_unlock; 776 goto out_unlock;
1398 777
1399 if (!skb) { 778 if (!skb) {
1400 size_t reserved = LL_RESERVED_SPACE(dev); 779 size_t reserved = LL_RESERVED_SPACE(dev);
1401 int tlen = dev->needed_tailroom;
1402 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; 780 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1403 781
1404 rcu_read_unlock(); 782 rcu_read_unlock();
1405 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); 783 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
1406 if (skb == NULL) 784 if (skb == NULL)
1407 return -ENOBUFS; 785 return -ENOBUFS;
1408 /* FIXME: Save some space for broken drivers that write a hard 786 /* FIXME: Save some space for broken drivers that write a hard
@@ -1425,7 +803,7 @@ retry:
1425 goto retry; 803 goto retry;
1426 } 804 }
1427 805
1428 if (len > (dev->mtu + dev->hard_header_len + extra_len)) { 806 if (len > (dev->mtu + dev->hard_header_len)) {
1429 /* Earlier code assumed this would be a VLAN pkt, 807 /* Earlier code assumed this would be a VLAN pkt,
1430 * double-check this now that we have the actual 808 * double-check this now that we have the actual
1431 * packet in hand. 809 * packet in hand.
@@ -1447,9 +825,6 @@ retry:
1447 if (err < 0) 825 if (err < 0)
1448 goto out_unlock; 826 goto out_unlock;
1449 827
1450 if (unlikely(extra_len == 4))
1451 skb->no_fcs = 1;
1452
1453 dev_queue_xmit(skb); 828 dev_queue_xmit(skb);
1454 rcu_read_unlock(); 829 rcu_read_unlock();
1455 return len; 830 return len;
@@ -1461,7 +836,7 @@ out_free:
1461 return err; 836 return err;
1462} 837}
1463 838
1464static unsigned int run_filter(const struct sk_buff *skb, 839static inline unsigned int run_filter(const struct sk_buff *skb,
1465 const struct sock *sk, 840 const struct sock *sk,
1466 unsigned int res) 841 unsigned int res)
1467{ 842{
@@ -1533,7 +908,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1533 if (snaplen > res) 908 if (snaplen > res)
1534 snaplen = res; 909 snaplen = res;
1535 910
1536 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 911 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
912 (unsigned)sk->sk_rcvbuf)
1537 goto drop_n_acct; 913 goto drop_n_acct;
1538 914
1539 if (skb_shared(skb)) { 915 if (skb_shared(skb)) {
@@ -1545,7 +921,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1545 skb->data = skb_head; 921 skb->data = skb_head;
1546 skb->len = skb_len; 922 skb->len = skb_len;
1547 } 923 }
1548 consume_skb(skb); 924 kfree_skb(skb);
1549 skb = nskb; 925 skb = nskb;
1550 } 926 }
1551 927
@@ -1609,13 +985,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1609 union { 985 union {
1610 struct tpacket_hdr *h1; 986 struct tpacket_hdr *h1;
1611 struct tpacket2_hdr *h2; 987 struct tpacket2_hdr *h2;
1612 struct tpacket3_hdr *h3;
1613 void *raw; 988 void *raw;
1614 } h; 989 } h;
1615 u8 *skb_head = skb->data; 990 u8 *skb_head = skb->data;
1616 int skb_len = skb->len; 991 int skb_len = skb->len;
1617 unsigned int snaplen, res; 992 unsigned int snaplen, res;
1618 unsigned long status = TP_STATUS_USER; 993 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
1619 unsigned short macoff, netoff, hdrlen; 994 unsigned short macoff, netoff, hdrlen;
1620 struct sk_buff *copy_skb = NULL; 995 struct sk_buff *copy_skb = NULL;
1621 struct timeval tv; 996 struct timeval tv;
@@ -1655,51 +1030,43 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1655 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + 1030 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
1656 po->tp_reserve; 1031 po->tp_reserve;
1657 } else { 1032 } else {
1658 unsigned int maclen = skb_network_offset(skb); 1033 unsigned maclen = skb_network_offset(skb);
1659 netoff = TPACKET_ALIGN(po->tp_hdrlen + 1034 netoff = TPACKET_ALIGN(po->tp_hdrlen +
1660 (maclen < 16 ? 16 : maclen)) + 1035 (maclen < 16 ? 16 : maclen)) +
1661 po->tp_reserve; 1036 po->tp_reserve;
1662 macoff = netoff - maclen; 1037 macoff = netoff - maclen;
1663 } 1038 }
1664 if (po->tp_version <= TPACKET_V2) { 1039
1665 if (macoff + snaplen > po->rx_ring.frame_size) { 1040 if (macoff + snaplen > po->rx_ring.frame_size) {
1666 if (po->copy_thresh && 1041 if (po->copy_thresh &&
1667 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { 1042 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
1668 if (skb_shared(skb)) { 1043 (unsigned)sk->sk_rcvbuf) {
1669 copy_skb = skb_clone(skb, GFP_ATOMIC); 1044 if (skb_shared(skb)) {
1670 } else { 1045 copy_skb = skb_clone(skb, GFP_ATOMIC);
1671 copy_skb = skb_get(skb); 1046 } else {
1672 skb_head = skb->data; 1047 copy_skb = skb_get(skb);
1673 } 1048 skb_head = skb->data;
1674 if (copy_skb)
1675 skb_set_owner_r(copy_skb, sk);
1676 } 1049 }
1677 snaplen = po->rx_ring.frame_size - macoff; 1050 if (copy_skb)
1678 if ((int)snaplen < 0) 1051 skb_set_owner_r(copy_skb, sk);
1679 snaplen = 0;
1680 } 1052 }
1053 snaplen = po->rx_ring.frame_size - macoff;
1054 if ((int)snaplen < 0)
1055 snaplen = 0;
1681 } 1056 }
1057
1682 spin_lock(&sk->sk_receive_queue.lock); 1058 spin_lock(&sk->sk_receive_queue.lock);
1683 h.raw = packet_current_rx_frame(po, skb, 1059 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
1684 TP_STATUS_KERNEL, (macoff+snaplen));
1685 if (!h.raw) 1060 if (!h.raw)
1686 goto ring_is_full; 1061 goto ring_is_full;
1687 if (po->tp_version <= TPACKET_V2) { 1062 packet_increment_head(&po->rx_ring);
1688 packet_increment_rx_head(po, &po->rx_ring);
1689 /*
1690 * LOSING will be reported till you read the stats,
1691 * because it's COR - Clear On Read.
1692 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1693 * at packet level.
1694 */
1695 if (po->stats.tp_drops)
1696 status |= TP_STATUS_LOSING;
1697 }
1698 po->stats.tp_packets++; 1063 po->stats.tp_packets++;
1699 if (copy_skb) { 1064 if (copy_skb) {
1700 status |= TP_STATUS_COPY; 1065 status |= TP_STATUS_COPY;
1701 __skb_queue_tail(&sk->sk_receive_queue, copy_skb); 1066 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1702 } 1067 }
1068 if (!po->stats.tp_drops)
1069 status &= ~TP_STATUS_LOSING;
1703 spin_unlock(&sk->sk_receive_queue.lock); 1070 spin_unlock(&sk->sk_receive_queue.lock);
1704 1071
1705 skb_copy_bits(skb, 0, h.raw + macoff, snaplen); 1072 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
@@ -1750,29 +1117,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1750 h.h2->tp_padding = 0; 1117 h.h2->tp_padding = 0;
1751 hdrlen = sizeof(*h.h2); 1118 hdrlen = sizeof(*h.h2);
1752 break; 1119 break;
1753 case TPACKET_V3:
1754 /* tp_nxt_offset,vlan are already populated above.
1755 * So DONT clear those fields here
1756 */
1757 h.h3->tp_status |= status;
1758 h.h3->tp_len = skb->len;
1759 h.h3->tp_snaplen = snaplen;
1760 h.h3->tp_mac = macoff;
1761 h.h3->tp_net = netoff;
1762 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1763 && shhwtstamps->syststamp.tv64)
1764 ts = ktime_to_timespec(shhwtstamps->syststamp);
1765 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1766 && shhwtstamps->hwtstamp.tv64)
1767 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1768 else if (skb->tstamp.tv64)
1769 ts = ktime_to_timespec(skb->tstamp);
1770 else
1771 getnstimeofday(&ts);
1772 h.h3->tp_sec = ts.tv_sec;
1773 h.h3->tp_nsec = ts.tv_nsec;
1774 hdrlen = sizeof(*h.h3);
1775 break;
1776 default: 1120 default:
1777 BUG(); 1121 BUG();
1778 } 1122 }
@@ -1793,19 +1137,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1793 { 1137 {
1794 u8 *start, *end; 1138 u8 *start, *end;
1795 1139
1796 if (po->tp_version <= TPACKET_V2) { 1140 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
1797 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw 1141 for (start = h.raw; start < end; start += PAGE_SIZE)
1798 + macoff + snaplen); 1142 flush_dcache_page(pgv_to_page(start));
1799 for (start = h.raw; start < end; start += PAGE_SIZE)
1800 flush_dcache_page(pgv_to_page(start));
1801 }
1802 smp_wmb(); 1143 smp_wmb();
1803 } 1144 }
1804#endif 1145#endif
1805 if (po->tp_version <= TPACKET_V2) 1146 __packet_set_status(po, h.raw, status);
1806 __packet_set_status(po, h.raw, status);
1807 else
1808 prb_clear_blk_fill_status(&po->rx_ring);
1809 1147
1810 sk->sk_data_ready(sk, 0); 1148 sk->sk_data_ready(sk, 0);
1811 1149
@@ -1832,8 +1170,11 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
1832 struct packet_sock *po = pkt_sk(skb->sk); 1170 struct packet_sock *po = pkt_sk(skb->sk);
1833 void *ph; 1171 void *ph;
1834 1172
1173 BUG_ON(skb == NULL);
1174
1835 if (likely(po->tx_ring.pg_vec)) { 1175 if (likely(po->tx_ring.pg_vec)) {
1836 ph = skb_shinfo(skb)->destructor_arg; 1176 ph = skb_shinfo(skb)->destructor_arg;
1177 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1837 BUG_ON(atomic_read(&po->tx_ring.pending) == 0); 1178 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1838 atomic_dec(&po->tx_ring.pending); 1179 atomic_dec(&po->tx_ring.pending);
1839 __packet_set_status(po, ph, TP_STATUS_AVAILABLE); 1180 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
@@ -1844,7 +1185,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
1844 1185
1845static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, 1186static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1846 void *frame, struct net_device *dev, int size_max, 1187 void *frame, struct net_device *dev, int size_max,
1847 __be16 proto, unsigned char *addr, int hlen) 1188 __be16 proto, unsigned char *addr)
1848{ 1189{
1849 union { 1190 union {
1850 struct tpacket_hdr *h1; 1191 struct tpacket_hdr *h1;
@@ -1878,38 +1219,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1878 return -EMSGSIZE; 1219 return -EMSGSIZE;
1879 } 1220 }
1880 1221
1881 skb_reserve(skb, hlen); 1222 skb_reserve(skb, LL_RESERVED_SPACE(dev));
1882 skb_reset_network_header(skb); 1223 skb_reset_network_header(skb);
1883 1224
1884 if (po->tp_tx_has_off) { 1225 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1885 int off_min, off_max, off;
1886 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
1887 off_max = po->tx_ring.frame_size - tp_len;
1888 if (sock->type == SOCK_DGRAM) {
1889 switch (po->tp_version) {
1890 case TPACKET_V2:
1891 off = ph.h2->tp_net;
1892 break;
1893 default:
1894 off = ph.h1->tp_net;
1895 break;
1896 }
1897 } else {
1898 switch (po->tp_version) {
1899 case TPACKET_V2:
1900 off = ph.h2->tp_mac;
1901 break;
1902 default:
1903 off = ph.h1->tp_mac;
1904 break;
1905 }
1906 }
1907 if (unlikely((off < off_min) || (off_max < off)))
1908 return -EINVAL;
1909 data = ph.raw + off;
1910 } else {
1911 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
1912 }
1913 to_write = tp_len; 1226 to_write = tp_len;
1914 1227
1915 if (sock->type == SOCK_DGRAM) { 1228 if (sock->type == SOCK_DGRAM) {
@@ -1935,6 +1248,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
1935 to_write -= dev->hard_header_len; 1248 to_write -= dev->hard_header_len;
1936 } 1249 }
1937 1250
1251 err = -EFAULT;
1938 offset = offset_in_page(data); 1252 offset = offset_in_page(data);
1939 len_max = PAGE_SIZE - offset; 1253 len_max = PAGE_SIZE - offset;
1940 len = ((to_write > len_max) ? len_max : to_write); 1254 len = ((to_write > len_max) ? len_max : to_write);
@@ -1979,11 +1293,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
1979 int tp_len, size_max; 1293 int tp_len, size_max;
1980 unsigned char *addr; 1294 unsigned char *addr;
1981 int len_sum = 0; 1295 int len_sum = 0;
1982 int status = TP_STATUS_AVAILABLE; 1296 int status = 0;
1983 int hlen, tlen;
1984 1297
1985 mutex_lock(&po->pg_vec_lock); 1298 mutex_lock(&po->pg_vec_lock);
1986 1299
1300 err = -EBUSY;
1987 if (saddr == NULL) { 1301 if (saddr == NULL) {
1988 dev = po->prot_hook.dev; 1302 dev = po->prot_hook.dev;
1989 proto = po->num; 1303 proto = po->num;
@@ -2028,17 +1342,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2028 } 1342 }
2029 1343
2030 status = TP_STATUS_SEND_REQUEST; 1344 status = TP_STATUS_SEND_REQUEST;
2031 hlen = LL_RESERVED_SPACE(dev);
2032 tlen = dev->needed_tailroom;
2033 skb = sock_alloc_send_skb(&po->sk, 1345 skb = sock_alloc_send_skb(&po->sk,
2034 hlen + tlen + sizeof(struct sockaddr_ll), 1346 LL_ALLOCATED_SPACE(dev)
1347 + sizeof(struct sockaddr_ll),
2035 0, &err); 1348 0, &err);
2036 1349
2037 if (unlikely(skb == NULL)) 1350 if (unlikely(skb == NULL))
2038 goto out_status; 1351 goto out_status;
2039 1352
2040 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, 1353 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2041 addr, hlen); 1354 addr);
2042 1355
2043 if (unlikely(tp_len < 0)) { 1356 if (unlikely(tp_len < 0)) {
2044 if (po->tp_loss) { 1357 if (po->tp_loss) {
@@ -2095,10 +1408,10 @@ out:
2095 return err; 1408 return err;
2096} 1409}
2097 1410
2098static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, 1411static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2099 size_t reserve, size_t len, 1412 size_t reserve, size_t len,
2100 size_t linear, int noblock, 1413 size_t linear, int noblock,
2101 int *err) 1414 int *err)
2102{ 1415{
2103 struct sk_buff *skb; 1416 struct sk_buff *skb;
2104 1417
@@ -2135,8 +1448,6 @@ static int packet_snd(struct socket *sock,
2135 int vnet_hdr_len; 1448 int vnet_hdr_len;
2136 struct packet_sock *po = pkt_sk(sk); 1449 struct packet_sock *po = pkt_sk(sk);
2137 unsigned short gso_type = 0; 1450 unsigned short gso_type = 0;
2138 int hlen, tlen;
2139 int extra_len = 0;
2140 1451
2141 /* 1452 /*
2142 * Get and verify the address. 1453 * Get and verify the address.
@@ -2216,22 +1527,13 @@ static int packet_snd(struct socket *sock,
2216 } 1527 }
2217 } 1528 }
2218 1529
2219 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2220 if (!netif_supports_nofcs(dev)) {
2221 err = -EPROTONOSUPPORT;
2222 goto out_unlock;
2223 }
2224 extra_len = 4; /* We're doing our own CRC */
2225 }
2226
2227 err = -EMSGSIZE; 1530 err = -EMSGSIZE;
2228 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) 1531 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN))
2229 goto out_unlock; 1532 goto out_unlock;
2230 1533
2231 err = -ENOBUFS; 1534 err = -ENOBUFS;
2232 hlen = LL_RESERVED_SPACE(dev); 1535 skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
2233 tlen = dev->needed_tailroom; 1536 LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
2234 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len,
2235 msg->msg_flags & MSG_DONTWAIT, &err); 1537 msg->msg_flags & MSG_DONTWAIT, &err);
2236 if (skb == NULL) 1538 if (skb == NULL)
2237 goto out_unlock; 1539 goto out_unlock;
@@ -2251,7 +1553,7 @@ static int packet_snd(struct socket *sock,
2251 if (err < 0) 1553 if (err < 0)
2252 goto out_free; 1554 goto out_free;
2253 1555
2254 if (!gso_type && (len > dev->mtu + reserve + extra_len)) { 1556 if (!gso_type && (len > dev->mtu + reserve)) {
2255 /* Earlier code assumed this would be a VLAN pkt, 1557 /* Earlier code assumed this would be a VLAN pkt,
2256 * double-check this now that we have the actual 1558 * double-check this now that we have the actual
2257 * packet in hand. 1559 * packet in hand.
@@ -2289,9 +1591,6 @@ static int packet_snd(struct socket *sock,
2289 len += vnet_hdr_len; 1591 len += vnet_hdr_len;
2290 } 1592 }
2291 1593
2292 if (unlikely(extra_len == 4))
2293 skb->no_fcs = 1;
2294
2295 /* 1594 /*
2296 * Now send it 1595 * Now send it
2297 */ 1596 */
@@ -2335,7 +1634,7 @@ static int packet_release(struct socket *sock)
2335 struct sock *sk = sock->sk; 1634 struct sock *sk = sock->sk;
2336 struct packet_sock *po; 1635 struct packet_sock *po;
2337 struct net *net; 1636 struct net *net;
2338 union tpacket_req_u req_u; 1637 struct tpacket_req req;
2339 1638
2340 if (!sk) 1639 if (!sk)
2341 return 0; 1640 return 0;
@@ -2343,13 +1642,10 @@ static int packet_release(struct socket *sock)
2343 net = sock_net(sk); 1642 net = sock_net(sk);
2344 po = pkt_sk(sk); 1643 po = pkt_sk(sk);
2345 1644
2346 mutex_lock(&net->packet.sklist_lock); 1645 spin_lock_bh(&net->packet.sklist_lock);
2347 sk_del_node_init_rcu(sk); 1646 sk_del_node_init_rcu(sk);
2348 mutex_unlock(&net->packet.sklist_lock);
2349
2350 preempt_disable();
2351 sock_prot_inuse_add(net, sk->sk_prot, -1); 1647 sock_prot_inuse_add(net, sk->sk_prot, -1);
2352 preempt_enable(); 1648 spin_unlock_bh(&net->packet.sklist_lock);
2353 1649
2354 spin_lock(&po->bind_lock); 1650 spin_lock(&po->bind_lock);
2355 unregister_prot_hook(sk, false); 1651 unregister_prot_hook(sk, false);
@@ -2361,13 +1657,13 @@ static int packet_release(struct socket *sock)
2361 1657
2362 packet_flush_mclist(sk); 1658 packet_flush_mclist(sk);
2363 1659
2364 memset(&req_u, 0, sizeof(req_u)); 1660 memset(&req, 0, sizeof(req));
2365 1661
2366 if (po->rx_ring.pg_vec) 1662 if (po->rx_ring.pg_vec)
2367 packet_set_ring(sk, &req_u, 1, 0); 1663 packet_set_ring(sk, &req, 1, 0);
2368 1664
2369 if (po->tx_ring.pg_vec) 1665 if (po->tx_ring.pg_vec)
2370 packet_set_ring(sk, &req_u, 1, 1); 1666 packet_set_ring(sk, &req, 1, 1);
2371 1667
2372 fanout_release(sk); 1668 fanout_release(sk);
2373 1669
@@ -2395,12 +1691,8 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
2395{ 1691{
2396 struct packet_sock *po = pkt_sk(sk); 1692 struct packet_sock *po = pkt_sk(sk);
2397 1693
2398 if (po->fanout) { 1694 if (po->fanout)
2399 if (dev)
2400 dev_put(dev);
2401
2402 return -EINVAL; 1695 return -EINVAL;
2403 }
2404 1696
2405 lock_sock(sk); 1697 lock_sock(sk);
2406 1698
@@ -2504,7 +1796,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
2504 __be16 proto = (__force __be16)protocol; /* weird, but documented */ 1796 __be16 proto = (__force __be16)protocol; /* weird, but documented */
2505 int err; 1797 int err;
2506 1798
2507 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 1799 if (!capable(CAP_NET_RAW))
2508 return -EPERM; 1800 return -EPERM;
2509 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && 1801 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2510 sock->type != SOCK_PACKET) 1802 sock->type != SOCK_PACKET)
@@ -2548,13 +1840,10 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
2548 register_prot_hook(sk); 1840 register_prot_hook(sk);
2549 } 1841 }
2550 1842
2551 mutex_lock(&net->packet.sklist_lock); 1843 spin_lock_bh(&net->packet.sklist_lock);
2552 sk_add_node_rcu(sk, &net->packet.sklist); 1844 sk_add_node_rcu(sk, &net->packet.sklist);
2553 mutex_unlock(&net->packet.sklist_lock);
2554
2555 preempt_disable();
2556 sock_prot_inuse_add(net, &packet_proto, 1); 1845 sock_prot_inuse_add(net, &packet_proto, 1);
2557 preempt_enable(); 1846 spin_unlock_bh(&net->packet.sklist_lock);
2558 1847
2559 return 0; 1848 return 0;
2560out: 1849out:
@@ -2994,27 +2283,15 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
2994 case PACKET_RX_RING: 2283 case PACKET_RX_RING:
2995 case PACKET_TX_RING: 2284 case PACKET_TX_RING:
2996 { 2285 {
2997 union tpacket_req_u req_u; 2286 struct tpacket_req req;
2998 int len;
2999 2287
3000 switch (po->tp_version) { 2288 if (optlen < sizeof(req))
3001 case TPACKET_V1:
3002 case TPACKET_V2:
3003 len = sizeof(req_u.req);
3004 break;
3005 case TPACKET_V3:
3006 default:
3007 len = sizeof(req_u.req3);
3008 break;
3009 }
3010 if (optlen < len)
3011 return -EINVAL; 2289 return -EINVAL;
3012 if (pkt_sk(sk)->has_vnet_hdr) 2290 if (pkt_sk(sk)->has_vnet_hdr)
3013 return -EINVAL; 2291 return -EINVAL;
3014 if (copy_from_user(&req_u.req, optval, len)) 2292 if (copy_from_user(&req, optval, sizeof(req)))
3015 return -EFAULT; 2293 return -EFAULT;
3016 return packet_set_ring(sk, &req_u, 0, 2294 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
3017 optname == PACKET_TX_RING);
3018 } 2295 }
3019 case PACKET_COPY_THRESH: 2296 case PACKET_COPY_THRESH:
3020 { 2297 {
@@ -3041,7 +2318,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
3041 switch (val) { 2318 switch (val) {
3042 case TPACKET_V1: 2319 case TPACKET_V1:
3043 case TPACKET_V2: 2320 case TPACKET_V2:
3044 case TPACKET_V3:
3045 po->tp_version = val; 2321 po->tp_version = val;
3046 return 0; 2322 return 0;
3047 default: 2323 default:
@@ -3137,19 +2413,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
3137 2413
3138 return fanout_add(sk, val & 0xffff, val >> 16); 2414 return fanout_add(sk, val & 0xffff, val >> 16);
3139 } 2415 }
3140 case PACKET_TX_HAS_OFF:
3141 {
3142 unsigned int val;
3143
3144 if (optlen != sizeof(val))
3145 return -EINVAL;
3146 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3147 return -EBUSY;
3148 if (copy_from_user(&val, optval, sizeof(val)))
3149 return -EFAULT;
3150 po->tp_tx_has_off = !!val;
3151 return 0;
3152 }
3153 default: 2416 default:
3154 return -ENOPROTOOPT; 2417 return -ENOPROTOOPT;
3155 } 2418 }
@@ -3159,12 +2422,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3159 char __user *optval, int __user *optlen) 2422 char __user *optval, int __user *optlen)
3160{ 2423{
3161 int len; 2424 int len;
3162 int val, lv = sizeof(val); 2425 int val;
3163 struct sock *sk = sock->sk; 2426 struct sock *sk = sock->sk;
3164 struct packet_sock *po = pkt_sk(sk); 2427 struct packet_sock *po = pkt_sk(sk);
3165 void *data = &val; 2428 void *data;
3166 struct tpacket_stats st; 2429 struct tpacket_stats st;
3167 union tpacket_stats_u st_u;
3168 2430
3169 if (level != SOL_PACKET) 2431 if (level != SOL_PACKET)
3170 return -ENOPROTOOPT; 2432 return -ENOPROTOOPT;
@@ -3177,35 +2439,42 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3177 2439
3178 switch (optname) { 2440 switch (optname) {
3179 case PACKET_STATISTICS: 2441 case PACKET_STATISTICS:
2442 if (len > sizeof(struct tpacket_stats))
2443 len = sizeof(struct tpacket_stats);
3180 spin_lock_bh(&sk->sk_receive_queue.lock); 2444 spin_lock_bh(&sk->sk_receive_queue.lock);
3181 if (po->tp_version == TPACKET_V3) { 2445 st = po->stats;
3182 lv = sizeof(struct tpacket_stats_v3);
3183 memcpy(&st_u.stats3, &po->stats,
3184 sizeof(struct tpacket_stats));
3185 st_u.stats3.tp_freeze_q_cnt =
3186 po->stats_u.stats3.tp_freeze_q_cnt;
3187 st_u.stats3.tp_packets += po->stats.tp_drops;
3188 data = &st_u.stats3;
3189 } else {
3190 lv = sizeof(struct tpacket_stats);
3191 st = po->stats;
3192 st.tp_packets += st.tp_drops;
3193 data = &st;
3194 }
3195 memset(&po->stats, 0, sizeof(st)); 2446 memset(&po->stats, 0, sizeof(st));
3196 spin_unlock_bh(&sk->sk_receive_queue.lock); 2447 spin_unlock_bh(&sk->sk_receive_queue.lock);
2448 st.tp_packets += st.tp_drops;
2449
2450 data = &st;
3197 break; 2451 break;
3198 case PACKET_AUXDATA: 2452 case PACKET_AUXDATA:
2453 if (len > sizeof(int))
2454 len = sizeof(int);
3199 val = po->auxdata; 2455 val = po->auxdata;
2456
2457 data = &val;
3200 break; 2458 break;
3201 case PACKET_ORIGDEV: 2459 case PACKET_ORIGDEV:
2460 if (len > sizeof(int))
2461 len = sizeof(int);
3202 val = po->origdev; 2462 val = po->origdev;
2463
2464 data = &val;
3203 break; 2465 break;
3204 case PACKET_VNET_HDR: 2466 case PACKET_VNET_HDR:
2467 if (len > sizeof(int))
2468 len = sizeof(int);
3205 val = po->has_vnet_hdr; 2469 val = po->has_vnet_hdr;
2470
2471 data = &val;
3206 break; 2472 break;
3207 case PACKET_VERSION: 2473 case PACKET_VERSION:
2474 if (len > sizeof(int))
2475 len = sizeof(int);
3208 val = po->tp_version; 2476 val = po->tp_version;
2477 data = &val;
3209 break; 2478 break;
3210 case PACKET_HDRLEN: 2479 case PACKET_HDRLEN:
3211 if (len > sizeof(int)) 2480 if (len > sizeof(int))
@@ -3219,37 +2488,42 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3219 case TPACKET_V2: 2488 case TPACKET_V2:
3220 val = sizeof(struct tpacket2_hdr); 2489 val = sizeof(struct tpacket2_hdr);
3221 break; 2490 break;
3222 case TPACKET_V3:
3223 val = sizeof(struct tpacket3_hdr);
3224 break;
3225 default: 2491 default:
3226 return -EINVAL; 2492 return -EINVAL;
3227 } 2493 }
2494 data = &val;
3228 break; 2495 break;
3229 case PACKET_RESERVE: 2496 case PACKET_RESERVE:
2497 if (len > sizeof(unsigned int))
2498 len = sizeof(unsigned int);
3230 val = po->tp_reserve; 2499 val = po->tp_reserve;
2500 data = &val;
3231 break; 2501 break;
3232 case PACKET_LOSS: 2502 case PACKET_LOSS:
2503 if (len > sizeof(unsigned int))
2504 len = sizeof(unsigned int);
3233 val = po->tp_loss; 2505 val = po->tp_loss;
2506 data = &val;
3234 break; 2507 break;
3235 case PACKET_TIMESTAMP: 2508 case PACKET_TIMESTAMP:
2509 if (len > sizeof(int))
2510 len = sizeof(int);
3236 val = po->tp_tstamp; 2511 val = po->tp_tstamp;
2512 data = &val;
3237 break; 2513 break;
3238 case PACKET_FANOUT: 2514 case PACKET_FANOUT:
2515 if (len > sizeof(int))
2516 len = sizeof(int);
3239 val = (po->fanout ? 2517 val = (po->fanout ?
3240 ((u32)po->fanout->id | 2518 ((u32)po->fanout->id |
3241 ((u32)po->fanout->type << 16)) : 2519 ((u32)po->fanout->type << 16)) :
3242 0); 2520 0);
3243 break; 2521 data = &val;
3244 case PACKET_TX_HAS_OFF:
3245 val = po->tp_tx_has_off;
3246 break; 2522 break;
3247 default: 2523 default:
3248 return -ENOPROTOOPT; 2524 return -ENOPROTOOPT;
3249 } 2525 }
3250 2526
3251 if (len > lv)
3252 len = lv;
3253 if (put_user(len, optlen)) 2527 if (put_user(len, optlen))
3254 return -EFAULT; 2528 return -EFAULT;
3255 if (copy_to_user(optval, data, len)) 2529 if (copy_to_user(optval, data, len))
@@ -3370,8 +2644,7 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
3370 2644
3371 spin_lock_bh(&sk->sk_receive_queue.lock); 2645 spin_lock_bh(&sk->sk_receive_queue.lock);
3372 if (po->rx_ring.pg_vec) { 2646 if (po->rx_ring.pg_vec) {
3373 if (!packet_previous_rx_frame(po, &po->rx_ring, 2647 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
3374 TP_STATUS_KERNEL))
3375 mask |= POLLIN | POLLRDNORM; 2648 mask |= POLLIN | POLLRDNORM;
3376 } 2649 }
3377 spin_unlock_bh(&sk->sk_receive_queue.lock); 2650 spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -3432,7 +2705,7 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3432 kfree(pg_vec); 2705 kfree(pg_vec);
3433} 2706}
3434 2707
3435static char *alloc_one_pg_vec_page(unsigned long order) 2708static inline char *alloc_one_pg_vec_page(unsigned long order)
3436{ 2709{
3437 char *buffer = NULL; 2710 char *buffer = NULL;
3438 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | 2711 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
@@ -3490,7 +2763,7 @@ out_free_pgvec:
3490 goto out; 2763 goto out;
3491} 2764}
3492 2765
3493static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, 2766static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
3494 int closing, int tx_ring) 2767 int closing, int tx_ring)
3495{ 2768{
3496 struct pgv *pg_vec = NULL; 2769 struct pgv *pg_vec = NULL;
@@ -3499,15 +2772,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3499 struct packet_ring_buffer *rb; 2772 struct packet_ring_buffer *rb;
3500 struct sk_buff_head *rb_queue; 2773 struct sk_buff_head *rb_queue;
3501 __be16 num; 2774 __be16 num;
3502 int err = -EINVAL; 2775 int err;
3503 /* Added to avoid minimal code churn */
3504 struct tpacket_req *req = &req_u->req;
3505
3506 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3507 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3508 WARN(1, "Tx-ring is not supported.\n");
3509 goto out;
3510 }
3511 2776
3512 rb = tx_ring ? &po->tx_ring : &po->rx_ring; 2777 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3513 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; 2778 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -3533,9 +2798,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3533 case TPACKET_V2: 2798 case TPACKET_V2:
3534 po->tp_hdrlen = TPACKET2_HDRLEN; 2799 po->tp_hdrlen = TPACKET2_HDRLEN;
3535 break; 2800 break;
3536 case TPACKET_V3:
3537 po->tp_hdrlen = TPACKET3_HDRLEN;
3538 break;
3539 } 2801 }
3540 2802
3541 err = -EINVAL; 2803 err = -EINVAL;
@@ -3561,17 +2823,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3561 pg_vec = alloc_pg_vec(req, order); 2823 pg_vec = alloc_pg_vec(req, order);
3562 if (unlikely(!pg_vec)) 2824 if (unlikely(!pg_vec))
3563 goto out; 2825 goto out;
3564 switch (po->tp_version) {
3565 case TPACKET_V3:
3566 /* Transmit path is not supported. We checked
3567 * it above but just being paranoid
3568 */
3569 if (!tx_ring)
3570 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3571 break;
3572 default:
3573 break;
3574 }
3575 } 2826 }
3576 /* Done */ 2827 /* Done */
3577 else { 2828 else {
@@ -3624,11 +2875,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3624 register_prot_hook(sk); 2875 register_prot_hook(sk);
3625 } 2876 }
3626 spin_unlock(&po->bind_lock); 2877 spin_unlock(&po->bind_lock);
3627 if (closing && (po->tp_version > TPACKET_V2)) { 2878
3628 /* Because we don't support block-based V3 on tx-ring */
3629 if (!tx_ring)
3630 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3631 }
3632 release_sock(sk); 2879 release_sock(sk);
3633 2880
3634 if (pg_vec) 2881 if (pg_vec)
@@ -3791,7 +3038,7 @@ static int packet_seq_show(struct seq_file *seq, void *v)
3791 po->ifindex, 3038 po->ifindex,
3792 po->running, 3039 po->running,
3793 atomic_read(&s->sk_rmem_alloc), 3040 atomic_read(&s->sk_rmem_alloc),
3794 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), 3041 sock_i_uid(s),
3795 sock_i_ino(s)); 3042 sock_i_ino(s));
3796 } 3043 }
3797 3044
@@ -3823,7 +3070,7 @@ static const struct file_operations packet_seq_fops = {
3823 3070
3824static int __net_init packet_net_init(struct net *net) 3071static int __net_init packet_net_init(struct net *net)
3825{ 3072{
3826 mutex_init(&net->packet.sklist_lock); 3073 spin_lock_init(&net->packet.sklist_lock);
3827 INIT_HLIST_HEAD(&net->packet.sklist); 3074 INIT_HLIST_HEAD(&net->packet.sklist);
3828 3075
3829 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) 3076 if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
diff --git a/net/packet/diag.c b/net/packet/diag.c
deleted file mode 100644
index 8db6e21c46b..00000000000
--- a/net/packet/diag.c
+++ /dev/null
@@ -1,242 +0,0 @@
1#include <linux/module.h>
2#include <linux/sock_diag.h>
3#include <linux/net.h>
4#include <linux/netdevice.h>
5#include <linux/packet_diag.h>
6#include <net/net_namespace.h>
7#include <net/sock.h>
8
9#include "internal.h"
10
11static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
12{
13 struct packet_diag_info pinfo;
14
15 pinfo.pdi_index = po->ifindex;
16 pinfo.pdi_version = po->tp_version;
17 pinfo.pdi_reserve = po->tp_reserve;
18 pinfo.pdi_copy_thresh = po->copy_thresh;
19 pinfo.pdi_tstamp = po->tp_tstamp;
20
21 pinfo.pdi_flags = 0;
22 if (po->running)
23 pinfo.pdi_flags |= PDI_RUNNING;
24 if (po->auxdata)
25 pinfo.pdi_flags |= PDI_AUXDATA;
26 if (po->origdev)
27 pinfo.pdi_flags |= PDI_ORIGDEV;
28 if (po->has_vnet_hdr)
29 pinfo.pdi_flags |= PDI_VNETHDR;
30 if (po->tp_loss)
31 pinfo.pdi_flags |= PDI_LOSS;
32
33 return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo);
34}
35
36static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb)
37{
38 struct nlattr *mca;
39 struct packet_mclist *ml;
40
41 mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST);
42 if (!mca)
43 return -EMSGSIZE;
44
45 rtnl_lock();
46 for (ml = po->mclist; ml; ml = ml->next) {
47 struct packet_diag_mclist *dml;
48
49 dml = nla_reserve_nohdr(nlskb, sizeof(*dml));
50 if (!dml) {
51 rtnl_unlock();
52 nla_nest_cancel(nlskb, mca);
53 return -EMSGSIZE;
54 }
55
56 dml->pdmc_index = ml->ifindex;
57 dml->pdmc_type = ml->type;
58 dml->pdmc_alen = ml->alen;
59 dml->pdmc_count = ml->count;
60 BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr));
61 memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr));
62 }
63
64 rtnl_unlock();
65 nla_nest_end(nlskb, mca);
66
67 return 0;
68}
69
70static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
71 struct sk_buff *nlskb)
72{
73 struct packet_diag_ring pdr;
74
75 if (!ring->pg_vec || ((ver > TPACKET_V2) &&
76 (nl_type == PACKET_DIAG_TX_RING)))
77 return 0;
78
79 pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
80 pdr.pdr_block_nr = ring->pg_vec_len;
81 pdr.pdr_frame_size = ring->frame_size;
82 pdr.pdr_frame_nr = ring->frame_max + 1;
83
84 if (ver > TPACKET_V2) {
85 pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
86 pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
87 pdr.pdr_features = ring->prb_bdqc.feature_req_word;
88 } else {
89 pdr.pdr_retire_tmo = 0;
90 pdr.pdr_sizeof_priv = 0;
91 pdr.pdr_features = 0;
92 }
93
94 return nla_put(nlskb, nl_type, sizeof(pdr), &pdr);
95}
96
97static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb)
98{
99 int ret;
100
101 mutex_lock(&po->pg_vec_lock);
102 ret = pdiag_put_ring(&po->rx_ring, po->tp_version,
103 PACKET_DIAG_RX_RING, skb);
104 if (!ret)
105 ret = pdiag_put_ring(&po->tx_ring, po->tp_version,
106 PACKET_DIAG_TX_RING, skb);
107 mutex_unlock(&po->pg_vec_lock);
108
109 return ret;
110}
111
112static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb)
113{
114 int ret = 0;
115
116 mutex_lock(&fanout_mutex);
117 if (po->fanout) {
118 u32 val;
119
120 val = (u32)po->fanout->id | ((u32)po->fanout->type << 16);
121 ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val);
122 }
123 mutex_unlock(&fanout_mutex);
124
125 return ret;
126}
127
128static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag_req *req,
129 u32 portid, u32 seq, u32 flags, int sk_ino)
130{
131 struct nlmsghdr *nlh;
132 struct packet_diag_msg *rp;
133 struct packet_sock *po = pkt_sk(sk);
134
135 nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags);
136 if (!nlh)
137 return -EMSGSIZE;
138
139 rp = nlmsg_data(nlh);
140 rp->pdiag_family = AF_PACKET;
141 rp->pdiag_type = sk->sk_type;
142 rp->pdiag_num = ntohs(po->num);
143 rp->pdiag_ino = sk_ino;
144 sock_diag_save_cookie(sk, rp->pdiag_cookie);
145
146 if ((req->pdiag_show & PACKET_SHOW_INFO) &&
147 pdiag_put_info(po, skb))
148 goto out_nlmsg_trim;
149
150 if ((req->pdiag_show & PACKET_SHOW_MCLIST) &&
151 pdiag_put_mclist(po, skb))
152 goto out_nlmsg_trim;
153
154 if ((req->pdiag_show & PACKET_SHOW_RING_CFG) &&
155 pdiag_put_rings_cfg(po, skb))
156 goto out_nlmsg_trim;
157
158 if ((req->pdiag_show & PACKET_SHOW_FANOUT) &&
159 pdiag_put_fanout(po, skb))
160 goto out_nlmsg_trim;
161
162 return nlmsg_end(skb, nlh);
163
164out_nlmsg_trim:
165 nlmsg_cancel(skb, nlh);
166 return -EMSGSIZE;
167}
168
169static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
170{
171 int num = 0, s_num = cb->args[0];
172 struct packet_diag_req *req;
173 struct net *net;
174 struct sock *sk;
175 struct hlist_node *node;
176
177 net = sock_net(skb->sk);
178 req = nlmsg_data(cb->nlh);
179
180 mutex_lock(&net->packet.sklist_lock);
181 sk_for_each(sk, node, &net->packet.sklist) {
182 if (!net_eq(sock_net(sk), net))
183 continue;
184 if (num < s_num)
185 goto next;
186
187 if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid,
188 cb->nlh->nlmsg_seq, NLM_F_MULTI,
189 sock_i_ino(sk)) < 0)
190 goto done;
191next:
192 num++;
193 }
194done:
195 mutex_unlock(&net->packet.sklist_lock);
196 cb->args[0] = num;
197
198 return skb->len;
199}
200
201static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
202{
203 int hdrlen = sizeof(struct packet_diag_req);
204 struct net *net = sock_net(skb->sk);
205 struct packet_diag_req *req;
206
207 if (nlmsg_len(h) < hdrlen)
208 return -EINVAL;
209
210 req = nlmsg_data(h);
211 /* Make it possible to support protocol filtering later */
212 if (req->sdiag_protocol)
213 return -EINVAL;
214
215 if (h->nlmsg_flags & NLM_F_DUMP) {
216 struct netlink_dump_control c = {
217 .dump = packet_diag_dump,
218 };
219 return netlink_dump_start(net->diag_nlsk, skb, h, &c);
220 } else
221 return -EOPNOTSUPP;
222}
223
224static const struct sock_diag_handler packet_diag_handler = {
225 .family = AF_PACKET,
226 .dump = packet_diag_handler_dump,
227};
228
229static int __init packet_diag_init(void)
230{
231 return sock_diag_register(&packet_diag_handler);
232}
233
234static void __exit packet_diag_exit(void)
235{
236 sock_diag_unregister(&packet_diag_handler);
237}
238
239module_init(packet_diag_init);
240module_exit(packet_diag_exit);
241MODULE_LICENSE("GPL");
242MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */);
diff --git a/net/packet/internal.h b/net/packet/internal.h
deleted file mode 100644
index e84cab8cb7a..00000000000
--- a/net/packet/internal.h
+++ /dev/null
@@ -1,122 +0,0 @@
1#ifndef __PACKET_INTERNAL_H__
2#define __PACKET_INTERNAL_H__
3
4struct packet_mclist {
5 struct packet_mclist *next;
6 int ifindex;
7 int count;
8 unsigned short type;
9 unsigned short alen;
10 unsigned char addr[MAX_ADDR_LEN];
11};
12
13/* kbdq - kernel block descriptor queue */
14struct tpacket_kbdq_core {
15 struct pgv *pkbdq;
16 unsigned int feature_req_word;
17 unsigned int hdrlen;
18 unsigned char reset_pending_on_curr_blk;
19 unsigned char delete_blk_timer;
20 unsigned short kactive_blk_num;
21 unsigned short blk_sizeof_priv;
22
23 /* last_kactive_blk_num:
24 * trick to see if user-space has caught up
25 * in order to avoid refreshing timer when every single pkt arrives.
26 */
27 unsigned short last_kactive_blk_num;
28
29 char *pkblk_start;
30 char *pkblk_end;
31 int kblk_size;
32 unsigned int knum_blocks;
33 uint64_t knxt_seq_num;
34 char *prev;
35 char *nxt_offset;
36 struct sk_buff *skb;
37
38 atomic_t blk_fill_in_prog;
39
40 /* Default is set to 8ms */
41#define DEFAULT_PRB_RETIRE_TOV (8)
42
43 unsigned short retire_blk_tov;
44 unsigned short version;
45 unsigned long tov_in_jiffies;
46
47 /* timer to retire an outstanding block */
48 struct timer_list retire_blk_timer;
49};
50
/* One element of a ring's page vector: a pointer to a buffer. */
struct pgv {
	char	*buffer;
};
54
55struct packet_ring_buffer {
56 struct pgv *pg_vec;
57 unsigned int head;
58 unsigned int frames_per_block;
59 unsigned int frame_size;
60 unsigned int frame_max;
61
62 unsigned int pg_vec_order;
63 unsigned int pg_vec_pages;
64 unsigned int pg_vec_len;
65
66 struct tpacket_kbdq_core prb_bdqc;
67 atomic_t pending;
68};
69
70extern struct mutex fanout_mutex;
71#define PACKET_FANOUT_MAX 256
72
73struct packet_fanout {
74#ifdef CONFIG_NET_NS
75 struct net *net;
76#endif
77 unsigned int num_members;
78 u16 id;
79 u8 type;
80 u8 defrag;
81 atomic_t rr_cur;
82 struct list_head list;
83 struct sock *arr[PACKET_FANOUT_MAX];
84 spinlock_t lock;
85 atomic_t sk_ref;
86 struct packet_type prot_hook ____cacheline_aligned_in_smp;
87};
88
89struct packet_sock {
90 /* struct sock has to be the first member of packet_sock */
91 struct sock sk;
92 struct packet_fanout *fanout;
93 struct tpacket_stats stats;
94 union tpacket_stats_u stats_u;
95 struct packet_ring_buffer rx_ring;
96 struct packet_ring_buffer tx_ring;
97 int copy_thresh;
98 spinlock_t bind_lock;
99 struct mutex pg_vec_lock;
100 unsigned int running:1, /* prot_hook is attached*/
101 auxdata:1,
102 origdev:1,
103 has_vnet_hdr:1;
104 int ifindex; /* bound device */
105 __be16 num;
106 struct packet_mclist *mclist;
107 atomic_t mapped;
108 enum tpacket_versions tp_version;
109 unsigned int tp_hdrlen;
110 unsigned int tp_reserve;
111 unsigned int tp_loss:1;
112 unsigned int tp_tx_has_off:1;
113 unsigned int tp_tstamp;
114 struct packet_type prot_hook ____cacheline_aligned_in_smp;
115};
116
/*
 * Convert a generic socket pointer to the packet socket that contains it.
 *
 * This is a plain pointer cast; it is valid only because struct sock is
 * the first member of struct packet_sock. @sk must point at a socket
 * that is embedded in a struct packet_sock.
 *
 * Declared inline because it is defined in a header: without it, every
 * translation unit that includes the header gets its own out-of-line
 * copy, and units that never call it get an unused-function warning.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
121
122#endif