diff options
author | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-01-17 16:15:55 -0500 |
---|---|---|
committer | Jonathan Herman <hermanjl@cs.unc.edu> | 2013-01-17 16:15:55 -0500 |
commit | 8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch) | |
tree | a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /net/packet | |
parent | 406089d01562f1e2bf9f089fd7637009ebaad589 (diff) |
Patched in Tegra support.
Diffstat (limited to 'net/packet')
-rw-r--r-- | net/packet/Kconfig | 8 | ||||
-rw-r--r-- | net/packet/Makefile | 2 | ||||
-rw-r--r-- | net/packet/af_packet.c | 1211 | ||||
-rw-r--r-- | net/packet/diag.c | 242 | ||||
-rw-r--r-- | net/packet/internal.h | 122 |
5 files changed, 229 insertions, 1356 deletions
diff --git a/net/packet/Kconfig b/net/packet/Kconfig index cc55b35f80e..0060e3b396b 100644 --- a/net/packet/Kconfig +++ b/net/packet/Kconfig | |||
@@ -14,11 +14,3 @@ config PACKET | |||
14 | be called af_packet. | 14 | be called af_packet. |
15 | 15 | ||
16 | If unsure, say Y. | 16 | If unsure, say Y. |
17 | |||
18 | config PACKET_DIAG | ||
19 | tristate "Packet: sockets monitoring interface" | ||
20 | depends on PACKET | ||
21 | default n | ||
22 | ---help--- | ||
23 | Support for PF_PACKET sockets monitoring interface used by the ss tool. | ||
24 | If unsure, say Y. | ||
diff --git a/net/packet/Makefile b/net/packet/Makefile index 9df61347a3c..81183eabfde 100644 --- a/net/packet/Makefile +++ b/net/packet/Makefile | |||
@@ -3,5 +3,3 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_PACKET) += af_packet.o | 5 | obj-$(CONFIG_PACKET) += af_packet.o |
6 | obj-$(CONFIG_PACKET_DIAG) += af_packet_diag.o | ||
7 | af_packet_diag-y += diag.o | ||
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e639645e8fe..fabb4fafa28 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
@@ -40,10 +40,6 @@ | |||
40 | * byte arrays at the end of sockaddr_ll | 40 | * byte arrays at the end of sockaddr_ll |
41 | * and packet_mreq. | 41 | * and packet_mreq. |
42 | * Johann Baudy : Added TX RING. | 42 | * Johann Baudy : Added TX RING. |
43 | * Chetan Loke : Implemented TPACKET_V3 block abstraction | ||
44 | * layer. | ||
45 | * Copyright (C) 2011, <lokec@ccs.neu.edu> | ||
46 | * | ||
47 | * | 43 | * |
48 | * This program is free software; you can redistribute it and/or | 44 | * This program is free software; you can redistribute it and/or |
49 | * modify it under the terms of the GNU General Public License | 45 | * modify it under the terms of the GNU General Public License |
@@ -73,6 +69,7 @@ | |||
73 | #include <net/sock.h> | 69 | #include <net/sock.h> |
74 | #include <linux/errno.h> | 70 | #include <linux/errno.h> |
75 | #include <linux/timer.h> | 71 | #include <linux/timer.h> |
72 | #include <asm/system.h> | ||
76 | #include <asm/uaccess.h> | 73 | #include <asm/uaccess.h> |
77 | #include <asm/ioctls.h> | 74 | #include <asm/ioctls.h> |
78 | #include <asm/page.h> | 75 | #include <asm/page.h> |
@@ -93,8 +90,6 @@ | |||
93 | #include <net/inet_common.h> | 90 | #include <net/inet_common.h> |
94 | #endif | 91 | #endif |
95 | 92 | ||
96 | #include "internal.h" | ||
97 | |||
98 | /* | 93 | /* |
99 | Assumptions: | 94 | Assumptions: |
100 | - if device has no dev->hard_header routine, it adds and removes ll header | 95 | - if device has no dev->hard_header routine, it adds and removes ll header |
@@ -148,6 +143,14 @@ dev->hard_header == NULL (ll header is added by device, we cannot control it) | |||
148 | 143 | ||
149 | /* Private packet socket structures. */ | 144 | /* Private packet socket structures. */ |
150 | 145 | ||
146 | struct packet_mclist { | ||
147 | struct packet_mclist *next; | ||
148 | int ifindex; | ||
149 | int count; | ||
150 | unsigned short type; | ||
151 | unsigned short alen; | ||
152 | unsigned char addr[MAX_ADDR_LEN]; | ||
153 | }; | ||
151 | /* identical to struct packet_mreq except it has | 154 | /* identical to struct packet_mreq except it has |
152 | * a longer address field. | 155 | * a longer address field. |
153 | */ | 156 | */ |
@@ -158,55 +161,77 @@ struct packet_mreq_max { | |||
158 | unsigned char mr_address[MAX_ADDR_LEN]; | 161 | unsigned char mr_address[MAX_ADDR_LEN]; |
159 | }; | 162 | }; |
160 | 163 | ||
161 | static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, | 164 | static int packet_set_ring(struct sock *sk, struct tpacket_req *req, |
162 | int closing, int tx_ring); | 165 | int closing, int tx_ring); |
163 | 166 | ||
167 | struct pgv { | ||
168 | char *buffer; | ||
169 | }; | ||
164 | 170 | ||
165 | #define V3_ALIGNMENT (8) | 171 | struct packet_ring_buffer { |
166 | 172 | struct pgv *pg_vec; | |
167 | #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) | 173 | unsigned int head; |
168 | 174 | unsigned int frames_per_block; | |
169 | #define BLK_PLUS_PRIV(sz_of_priv) \ | 175 | unsigned int frame_size; |
170 | (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) | 176 | unsigned int frame_max; |
171 | 177 | ||
172 | #define PGV_FROM_VMALLOC 1 | 178 | unsigned int pg_vec_order; |
179 | unsigned int pg_vec_pages; | ||
180 | unsigned int pg_vec_len; | ||
173 | 181 | ||
174 | #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) | 182 | atomic_t pending; |
175 | #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) | 183 | }; |
176 | #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) | ||
177 | #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) | ||
178 | #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) | ||
179 | #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) | ||
180 | #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x))) | ||
181 | 184 | ||
182 | struct packet_sock; | 185 | struct packet_sock; |
183 | static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); | 186 | static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); |
184 | 187 | ||
185 | static void *packet_previous_frame(struct packet_sock *po, | ||
186 | struct packet_ring_buffer *rb, | ||
187 | int status); | ||
188 | static void packet_increment_head(struct packet_ring_buffer *buff); | ||
189 | static int prb_curr_blk_in_use(struct tpacket_kbdq_core *, | ||
190 | struct tpacket_block_desc *); | ||
191 | static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, | ||
192 | struct packet_sock *); | ||
193 | static void prb_retire_current_block(struct tpacket_kbdq_core *, | ||
194 | struct packet_sock *, unsigned int status); | ||
195 | static int prb_queue_frozen(struct tpacket_kbdq_core *); | ||
196 | static void prb_open_block(struct tpacket_kbdq_core *, | ||
197 | struct tpacket_block_desc *); | ||
198 | static void prb_retire_rx_blk_timer_expired(unsigned long); | ||
199 | static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); | ||
200 | static void prb_init_blk_timer(struct packet_sock *, | ||
201 | struct tpacket_kbdq_core *, | ||
202 | void (*func) (unsigned long)); | ||
203 | static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); | ||
204 | static void prb_clear_rxhash(struct tpacket_kbdq_core *, | ||
205 | struct tpacket3_hdr *); | ||
206 | static void prb_fill_vlan_info(struct tpacket_kbdq_core *, | ||
207 | struct tpacket3_hdr *); | ||
208 | static void packet_flush_mclist(struct sock *sk); | 188 | static void packet_flush_mclist(struct sock *sk); |
209 | 189 | ||
190 | struct packet_fanout; | ||
191 | struct packet_sock { | ||
192 | /* struct sock has to be the first member of packet_sock */ | ||
193 | struct sock sk; | ||
194 | struct packet_fanout *fanout; | ||
195 | struct tpacket_stats stats; | ||
196 | struct packet_ring_buffer rx_ring; | ||
197 | struct packet_ring_buffer tx_ring; | ||
198 | int copy_thresh; | ||
199 | spinlock_t bind_lock; | ||
200 | struct mutex pg_vec_lock; | ||
201 | unsigned int running:1, /* prot_hook is attached*/ | ||
202 | auxdata:1, | ||
203 | origdev:1, | ||
204 | has_vnet_hdr:1; | ||
205 | int ifindex; /* bound device */ | ||
206 | __be16 num; | ||
207 | struct packet_mclist *mclist; | ||
208 | atomic_t mapped; | ||
209 | enum tpacket_versions tp_version; | ||
210 | unsigned int tp_hdrlen; | ||
211 | unsigned int tp_reserve; | ||
212 | unsigned int tp_loss:1; | ||
213 | unsigned int tp_tstamp; | ||
214 | struct packet_type prot_hook ____cacheline_aligned_in_smp; | ||
215 | }; | ||
216 | |||
217 | #define PACKET_FANOUT_MAX 256 | ||
218 | |||
219 | struct packet_fanout { | ||
220 | #ifdef CONFIG_NET_NS | ||
221 | struct net *net; | ||
222 | #endif | ||
223 | unsigned int num_members; | ||
224 | u16 id; | ||
225 | u8 type; | ||
226 | u8 defrag; | ||
227 | atomic_t rr_cur; | ||
228 | struct list_head list; | ||
229 | struct sock *arr[PACKET_FANOUT_MAX]; | ||
230 | spinlock_t lock; | ||
231 | atomic_t sk_ref; | ||
232 | struct packet_type prot_hook ____cacheline_aligned_in_smp; | ||
233 | }; | ||
234 | |||
210 | struct packet_skb_cb { | 235 | struct packet_skb_cb { |
211 | unsigned int origlen; | 236 | unsigned int origlen; |
212 | union { | 237 | union { |
@@ -217,14 +242,10 @@ struct packet_skb_cb { | |||
217 | 242 | ||
218 | #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) | 243 | #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) |
219 | 244 | ||
220 | #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) | 245 | static inline struct packet_sock *pkt_sk(struct sock *sk) |
221 | #define GET_PBLOCK_DESC(x, bid) \ | 246 | { |
222 | ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) | 247 | return (struct packet_sock *)sk; |
223 | #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ | 248 | } |
224 | ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) | ||
225 | #define GET_NEXT_PRB_BLK_NUM(x) \ | ||
226 | (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ | ||
227 | ((x)->kactive_blk_num+1) : 0) | ||
228 | 249 | ||
229 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po); | 250 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po); |
230 | static void __fanout_link(struct sock *sk, struct packet_sock *po); | 251 | static void __fanout_link(struct sock *sk, struct packet_sock *po); |
@@ -304,9 +325,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) | |||
304 | h.h2->tp_status = status; | 325 | h.h2->tp_status = status; |
305 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); | 326 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); |
306 | break; | 327 | break; |
307 | case TPACKET_V3: | ||
308 | default: | 328 | default: |
309 | WARN(1, "TPACKET version not supported.\n"); | 329 | pr_err("TPACKET version not supported\n"); |
310 | BUG(); | 330 | BUG(); |
311 | } | 331 | } |
312 | 332 | ||
@@ -331,9 +351,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame) | |||
331 | case TPACKET_V2: | 351 | case TPACKET_V2: |
332 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); | 352 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); |
333 | return h.h2->tp_status; | 353 | return h.h2->tp_status; |
334 | case TPACKET_V3: | ||
335 | default: | 354 | default: |
336 | WARN(1, "TPACKET version not supported.\n"); | 355 | pr_err("TPACKET version not supported\n"); |
337 | BUG(); | 356 | BUG(); |
338 | return 0; | 357 | return 0; |
339 | } | 358 | } |
@@ -363,672 +382,14 @@ static void *packet_lookup_frame(struct packet_sock *po, | |||
363 | return h.raw; | 382 | return h.raw; |
364 | } | 383 | } |
365 | 384 | ||
366 | static void *packet_current_frame(struct packet_sock *po, | 385 | static inline void *packet_current_frame(struct packet_sock *po, |
367 | struct packet_ring_buffer *rb, | 386 | struct packet_ring_buffer *rb, |
368 | int status) | 387 | int status) |
369 | { | 388 | { |
370 | return packet_lookup_frame(po, rb, rb->head, status); | 389 | return packet_lookup_frame(po, rb, rb->head, status); |
371 | } | 390 | } |
372 | 391 | ||
373 | static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) | 392 | static inline void *packet_previous_frame(struct packet_sock *po, |
374 | { | ||
375 | del_timer_sync(&pkc->retire_blk_timer); | ||
376 | } | ||
377 | |||
378 | static void prb_shutdown_retire_blk_timer(struct packet_sock *po, | ||
379 | int tx_ring, | ||
380 | struct sk_buff_head *rb_queue) | ||
381 | { | ||
382 | struct tpacket_kbdq_core *pkc; | ||
383 | |||
384 | pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; | ||
385 | |||
386 | spin_lock(&rb_queue->lock); | ||
387 | pkc->delete_blk_timer = 1; | ||
388 | spin_unlock(&rb_queue->lock); | ||
389 | |||
390 | prb_del_retire_blk_timer(pkc); | ||
391 | } | ||
392 | |||
393 | static void prb_init_blk_timer(struct packet_sock *po, | ||
394 | struct tpacket_kbdq_core *pkc, | ||
395 | void (*func) (unsigned long)) | ||
396 | { | ||
397 | init_timer(&pkc->retire_blk_timer); | ||
398 | pkc->retire_blk_timer.data = (long)po; | ||
399 | pkc->retire_blk_timer.function = func; | ||
400 | pkc->retire_blk_timer.expires = jiffies; | ||
401 | } | ||
402 | |||
403 | static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) | ||
404 | { | ||
405 | struct tpacket_kbdq_core *pkc; | ||
406 | |||
407 | if (tx_ring) | ||
408 | BUG(); | ||
409 | |||
410 | pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; | ||
411 | prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); | ||
412 | } | ||
413 | |||
414 | static int prb_calc_retire_blk_tmo(struct packet_sock *po, | ||
415 | int blk_size_in_bytes) | ||
416 | { | ||
417 | struct net_device *dev; | ||
418 | unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; | ||
419 | struct ethtool_cmd ecmd; | ||
420 | int err; | ||
421 | u32 speed; | ||
422 | |||
423 | rtnl_lock(); | ||
424 | dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); | ||
425 | if (unlikely(!dev)) { | ||
426 | rtnl_unlock(); | ||
427 | return DEFAULT_PRB_RETIRE_TOV; | ||
428 | } | ||
429 | err = __ethtool_get_settings(dev, &ecmd); | ||
430 | speed = ethtool_cmd_speed(&ecmd); | ||
431 | rtnl_unlock(); | ||
432 | if (!err) { | ||
433 | /* | ||
434 | * If the link speed is so slow you don't really | ||
435 | * need to worry about perf anyways | ||
436 | */ | ||
437 | if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) { | ||
438 | return DEFAULT_PRB_RETIRE_TOV; | ||
439 | } else { | ||
440 | msec = 1; | ||
441 | div = speed / 1000; | ||
442 | } | ||
443 | } | ||
444 | |||
445 | mbits = (blk_size_in_bytes * 8) / (1024 * 1024); | ||
446 | |||
447 | if (div) | ||
448 | mbits /= div; | ||
449 | |||
450 | tmo = mbits * msec; | ||
451 | |||
452 | if (div) | ||
453 | return tmo+1; | ||
454 | return tmo; | ||
455 | } | ||
456 | |||
457 | static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, | ||
458 | union tpacket_req_u *req_u) | ||
459 | { | ||
460 | p1->feature_req_word = req_u->req3.tp_feature_req_word; | ||
461 | } | ||
462 | |||
463 | static void init_prb_bdqc(struct packet_sock *po, | ||
464 | struct packet_ring_buffer *rb, | ||
465 | struct pgv *pg_vec, | ||
466 | union tpacket_req_u *req_u, int tx_ring) | ||
467 | { | ||
468 | struct tpacket_kbdq_core *p1 = &rb->prb_bdqc; | ||
469 | struct tpacket_block_desc *pbd; | ||
470 | |||
471 | memset(p1, 0x0, sizeof(*p1)); | ||
472 | |||
473 | p1->knxt_seq_num = 1; | ||
474 | p1->pkbdq = pg_vec; | ||
475 | pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; | ||
476 | p1->pkblk_start = pg_vec[0].buffer; | ||
477 | p1->kblk_size = req_u->req3.tp_block_size; | ||
478 | p1->knum_blocks = req_u->req3.tp_block_nr; | ||
479 | p1->hdrlen = po->tp_hdrlen; | ||
480 | p1->version = po->tp_version; | ||
481 | p1->last_kactive_blk_num = 0; | ||
482 | po->stats_u.stats3.tp_freeze_q_cnt = 0; | ||
483 | if (req_u->req3.tp_retire_blk_tov) | ||
484 | p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; | ||
485 | else | ||
486 | p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, | ||
487 | req_u->req3.tp_block_size); | ||
488 | p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); | ||
489 | p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; | ||
490 | |||
491 | prb_init_ft_ops(p1, req_u); | ||
492 | prb_setup_retire_blk_timer(po, tx_ring); | ||
493 | prb_open_block(p1, pbd); | ||
494 | } | ||
495 | |||
496 | /* Do NOT update the last_blk_num first. | ||
497 | * Assumes sk_buff_head lock is held. | ||
498 | */ | ||
499 | static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) | ||
500 | { | ||
501 | mod_timer(&pkc->retire_blk_timer, | ||
502 | jiffies + pkc->tov_in_jiffies); | ||
503 | pkc->last_kactive_blk_num = pkc->kactive_blk_num; | ||
504 | } | ||
505 | |||
506 | /* | ||
507 | * Timer logic: | ||
508 | * 1) We refresh the timer only when we open a block. | ||
509 | * By doing this we don't waste cycles refreshing the timer | ||
510 | * on packet-by-packet basis. | ||
511 | * | ||
512 | * With a 1MB block-size, on a 1Gbps line, it will take | ||
513 | * i) ~8 ms to fill a block + ii) memcpy etc. | ||
514 | * In this cut we are not accounting for the memcpy time. | ||
515 | * | ||
516 | * So, if the user sets the 'tmo' to 10ms then the timer | ||
517 | * will never fire while the block is still getting filled | ||
518 | * (which is what we want). However, the user could choose | ||
519 | * to close a block early and that's fine. | ||
520 | * | ||
521 | * But when the timer does fire, we check whether or not to refresh it. | ||
522 | * Since the tmo granularity is in msecs, it is not too expensive | ||
523 | * to refresh the timer, lets say every '8' msecs. | ||
524 | * Either the user can set the 'tmo' or we can derive it based on | ||
525 | * a) line-speed and b) block-size. | ||
526 | * prb_calc_retire_blk_tmo() calculates the tmo. | ||
527 | * | ||
528 | */ | ||
529 | static void prb_retire_rx_blk_timer_expired(unsigned long data) | ||
530 | { | ||
531 | struct packet_sock *po = (struct packet_sock *)data; | ||
532 | struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc; | ||
533 | unsigned int frozen; | ||
534 | struct tpacket_block_desc *pbd; | ||
535 | |||
536 | spin_lock(&po->sk.sk_receive_queue.lock); | ||
537 | |||
538 | frozen = prb_queue_frozen(pkc); | ||
539 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
540 | |||
541 | if (unlikely(pkc->delete_blk_timer)) | ||
542 | goto out; | ||
543 | |||
544 | /* We only need to plug the race when the block is partially filled. | ||
545 | * tpacket_rcv: | ||
546 | * lock(); increment BLOCK_NUM_PKTS; unlock() | ||
547 | * copy_bits() is in progress ... | ||
548 | * timer fires on other cpu: | ||
549 | * we can't retire the current block because copy_bits | ||
550 | * is in progress. | ||
551 | * | ||
552 | */ | ||
553 | if (BLOCK_NUM_PKTS(pbd)) { | ||
554 | while (atomic_read(&pkc->blk_fill_in_prog)) { | ||
555 | /* Waiting for skb_copy_bits to finish... */ | ||
556 | cpu_relax(); | ||
557 | } | ||
558 | } | ||
559 | |||
560 | if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { | ||
561 | if (!frozen) { | ||
562 | prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); | ||
563 | if (!prb_dispatch_next_block(pkc, po)) | ||
564 | goto refresh_timer; | ||
565 | else | ||
566 | goto out; | ||
567 | } else { | ||
568 | /* Case 1. Queue was frozen because user-space was | ||
569 | * lagging behind. | ||
570 | */ | ||
571 | if (prb_curr_blk_in_use(pkc, pbd)) { | ||
572 | /* | ||
573 | * Ok, user-space is still behind. | ||
574 | * So just refresh the timer. | ||
575 | */ | ||
576 | goto refresh_timer; | ||
577 | } else { | ||
578 | /* Case 2. queue was frozen,user-space caught up, | ||
579 | * now the link went idle && the timer fired. | ||
580 | * We don't have a block to close.So we open this | ||
581 | * block and restart the timer. | ||
582 | * opening a block thaws the queue,restarts timer | ||
583 | * Thawing/timer-refresh is a side effect. | ||
584 | */ | ||
585 | prb_open_block(pkc, pbd); | ||
586 | goto out; | ||
587 | } | ||
588 | } | ||
589 | } | ||
590 | |||
591 | refresh_timer: | ||
592 | _prb_refresh_rx_retire_blk_timer(pkc); | ||
593 | |||
594 | out: | ||
595 | spin_unlock(&po->sk.sk_receive_queue.lock); | ||
596 | } | ||
597 | |||
598 | static void prb_flush_block(struct tpacket_kbdq_core *pkc1, | ||
599 | struct tpacket_block_desc *pbd1, __u32 status) | ||
600 | { | ||
601 | /* Flush everything minus the block header */ | ||
602 | |||
603 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 | ||
604 | u8 *start, *end; | ||
605 | |||
606 | start = (u8 *)pbd1; | ||
607 | |||
608 | /* Skip the block header(we know header WILL fit in 4K) */ | ||
609 | start += PAGE_SIZE; | ||
610 | |||
611 | end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); | ||
612 | for (; start < end; start += PAGE_SIZE) | ||
613 | flush_dcache_page(pgv_to_page(start)); | ||
614 | |||
615 | smp_wmb(); | ||
616 | #endif | ||
617 | |||
618 | /* Now update the block status. */ | ||
619 | |||
620 | BLOCK_STATUS(pbd1) = status; | ||
621 | |||
622 | /* Flush the block header */ | ||
623 | |||
624 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 | ||
625 | start = (u8 *)pbd1; | ||
626 | flush_dcache_page(pgv_to_page(start)); | ||
627 | |||
628 | smp_wmb(); | ||
629 | #endif | ||
630 | } | ||
631 | |||
632 | /* | ||
633 | * Side effect: | ||
634 | * | ||
635 | * 1) flush the block | ||
636 | * 2) Increment active_blk_num | ||
637 | * | ||
638 | * Note:We DONT refresh the timer on purpose. | ||
639 | * Because almost always the next block will be opened. | ||
640 | */ | ||
641 | static void prb_close_block(struct tpacket_kbdq_core *pkc1, | ||
642 | struct tpacket_block_desc *pbd1, | ||
643 | struct packet_sock *po, unsigned int stat) | ||
644 | { | ||
645 | __u32 status = TP_STATUS_USER | stat; | ||
646 | |||
647 | struct tpacket3_hdr *last_pkt; | ||
648 | struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; | ||
649 | |||
650 | if (po->stats.tp_drops) | ||
651 | status |= TP_STATUS_LOSING; | ||
652 | |||
653 | last_pkt = (struct tpacket3_hdr *)pkc1->prev; | ||
654 | last_pkt->tp_next_offset = 0; | ||
655 | |||
656 | /* Get the ts of the last pkt */ | ||
657 | if (BLOCK_NUM_PKTS(pbd1)) { | ||
658 | h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; | ||
659 | h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; | ||
660 | } else { | ||
661 | /* Ok, we tmo'd - so get the current time */ | ||
662 | struct timespec ts; | ||
663 | getnstimeofday(&ts); | ||
664 | h1->ts_last_pkt.ts_sec = ts.tv_sec; | ||
665 | h1->ts_last_pkt.ts_nsec = ts.tv_nsec; | ||
666 | } | ||
667 | |||
668 | smp_wmb(); | ||
669 | |||
670 | /* Flush the block */ | ||
671 | prb_flush_block(pkc1, pbd1, status); | ||
672 | |||
673 | pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); | ||
674 | } | ||
675 | |||
676 | static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) | ||
677 | { | ||
678 | pkc->reset_pending_on_curr_blk = 0; | ||
679 | } | ||
680 | |||
681 | /* | ||
682 | * Side effect of opening a block: | ||
683 | * | ||
684 | * 1) prb_queue is thawed. | ||
685 | * 2) retire_blk_timer is refreshed. | ||
686 | * | ||
687 | */ | ||
688 | static void prb_open_block(struct tpacket_kbdq_core *pkc1, | ||
689 | struct tpacket_block_desc *pbd1) | ||
690 | { | ||
691 | struct timespec ts; | ||
692 | struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; | ||
693 | |||
694 | smp_rmb(); | ||
695 | |||
696 | if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) { | ||
697 | |||
698 | /* We could have just memset this but we will lose the | ||
699 | * flexibility of making the priv area sticky | ||
700 | */ | ||
701 | BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; | ||
702 | BLOCK_NUM_PKTS(pbd1) = 0; | ||
703 | BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); | ||
704 | getnstimeofday(&ts); | ||
705 | h1->ts_first_pkt.ts_sec = ts.tv_sec; | ||
706 | h1->ts_first_pkt.ts_nsec = ts.tv_nsec; | ||
707 | pkc1->pkblk_start = (char *)pbd1; | ||
708 | pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); | ||
709 | BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); | ||
710 | BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; | ||
711 | pbd1->version = pkc1->version; | ||
712 | pkc1->prev = pkc1->nxt_offset; | ||
713 | pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; | ||
714 | prb_thaw_queue(pkc1); | ||
715 | _prb_refresh_rx_retire_blk_timer(pkc1); | ||
716 | |||
717 | smp_wmb(); | ||
718 | |||
719 | return; | ||
720 | } | ||
721 | |||
722 | WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n", | ||
723 | pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num); | ||
724 | dump_stack(); | ||
725 | BUG(); | ||
726 | } | ||
727 | |||
728 | /* | ||
729 | * Queue freeze logic: | ||
730 | * 1) Assume tp_block_nr = 8 blocks. | ||
731 | * 2) At time 't0', user opens Rx ring. | ||
732 | * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 | ||
733 | * 4) user-space is either sleeping or processing block '0'. | ||
734 | * 5) tpacket_rcv is currently filling block '7', since there is no space left, | ||
735 | * it will close block-7,loop around and try to fill block '0'. | ||
736 | * call-flow: | ||
737 | * __packet_lookup_frame_in_block | ||
738 | * prb_retire_current_block() | ||
739 | * prb_dispatch_next_block() | ||
740 | * |->(BLOCK_STATUS == USER) evaluates to true | ||
741 | * 5.1) Since block-0 is currently in-use, we just freeze the queue. | ||
742 | * 6) Now there are two cases: | ||
743 | * 6.1) Link goes idle right after the queue is frozen. | ||
744 | * But remember, the last open_block() refreshed the timer. | ||
745 | * When this timer expires,it will refresh itself so that we can | ||
746 | * re-open block-0 in near future. | ||
747 | * 6.2) Link is busy and keeps on receiving packets. This is a simple | ||
748 | * case and __packet_lookup_frame_in_block will check if block-0 | ||
749 | * is free and can now be re-used. | ||
750 | */ | ||
751 | static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, | ||
752 | struct packet_sock *po) | ||
753 | { | ||
754 | pkc->reset_pending_on_curr_blk = 1; | ||
755 | po->stats_u.stats3.tp_freeze_q_cnt++; | ||
756 | } | ||
757 | |||
758 | #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) | ||
759 | |||
760 | /* | ||
761 | * If the next block is free then we will dispatch it | ||
762 | * and return a good offset. | ||
763 | * Else, we will freeze the queue. | ||
764 | * So, caller must check the return value. | ||
765 | */ | ||
766 | static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, | ||
767 | struct packet_sock *po) | ||
768 | { | ||
769 | struct tpacket_block_desc *pbd; | ||
770 | |||
771 | smp_rmb(); | ||
772 | |||
773 | /* 1. Get current block num */ | ||
774 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
775 | |||
776 | /* 2. If this block is currently in_use then freeze the queue */ | ||
777 | if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { | ||
778 | prb_freeze_queue(pkc, po); | ||
779 | return NULL; | ||
780 | } | ||
781 | |||
782 | /* | ||
783 | * 3. | ||
784 | * open this block and return the offset where the first packet | ||
785 | * needs to get stored. | ||
786 | */ | ||
787 | prb_open_block(pkc, pbd); | ||
788 | return (void *)pkc->nxt_offset; | ||
789 | } | ||
790 | |||
791 | static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, | ||
792 | struct packet_sock *po, unsigned int status) | ||
793 | { | ||
794 | struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
795 | |||
796 | /* retire/close the current block */ | ||
797 | if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { | ||
798 | /* | ||
799 | * Plug the case where copy_bits() is in progress on | ||
800 | * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't | ||
801 | * have space to copy the pkt in the current block and | ||
802 | * called prb_retire_current_block() | ||
803 | * | ||
804 | * We don't need to worry about the TMO case because | ||
805 | * the timer-handler already handled this case. | ||
806 | */ | ||
807 | if (!(status & TP_STATUS_BLK_TMO)) { | ||
808 | while (atomic_read(&pkc->blk_fill_in_prog)) { | ||
809 | /* Waiting for skb_copy_bits to finish... */ | ||
810 | cpu_relax(); | ||
811 | } | ||
812 | } | ||
813 | prb_close_block(pkc, pbd, po, status); | ||
814 | return; | ||
815 | } | ||
816 | |||
817 | WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd); | ||
818 | dump_stack(); | ||
819 | BUG(); | ||
820 | } | ||
821 | |||
822 | static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, | ||
823 | struct tpacket_block_desc *pbd) | ||
824 | { | ||
825 | return TP_STATUS_USER & BLOCK_STATUS(pbd); | ||
826 | } | ||
827 | |||
828 | static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) | ||
829 | { | ||
830 | return pkc->reset_pending_on_curr_blk; | ||
831 | } | ||
832 | |||
833 | static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) | ||
834 | { | ||
835 | struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); | ||
836 | atomic_dec(&pkc->blk_fill_in_prog); | ||
837 | } | ||
838 | |||
839 | static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, | ||
840 | struct tpacket3_hdr *ppd) | ||
841 | { | ||
842 | ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb); | ||
843 | } | ||
844 | |||
845 | static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, | ||
846 | struct tpacket3_hdr *ppd) | ||
847 | { | ||
848 | ppd->hv1.tp_rxhash = 0; | ||
849 | } | ||
850 | |||
851 | static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, | ||
852 | struct tpacket3_hdr *ppd) | ||
853 | { | ||
854 | if (vlan_tx_tag_present(pkc->skb)) { | ||
855 | ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); | ||
856 | ppd->tp_status = TP_STATUS_VLAN_VALID; | ||
857 | } else { | ||
858 | ppd->hv1.tp_vlan_tci = 0; | ||
859 | ppd->tp_status = TP_STATUS_AVAILABLE; | ||
860 | } | ||
861 | } | ||
862 | |||
863 | static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, | ||
864 | struct tpacket3_hdr *ppd) | ||
865 | { | ||
866 | prb_fill_vlan_info(pkc, ppd); | ||
867 | |||
868 | if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) | ||
869 | prb_fill_rxhash(pkc, ppd); | ||
870 | else | ||
871 | prb_clear_rxhash(pkc, ppd); | ||
872 | } | ||
873 | |||
874 | static void prb_fill_curr_block(char *curr, | ||
875 | struct tpacket_kbdq_core *pkc, | ||
876 | struct tpacket_block_desc *pbd, | ||
877 | unsigned int len) | ||
878 | { | ||
879 | struct tpacket3_hdr *ppd; | ||
880 | |||
881 | ppd = (struct tpacket3_hdr *)curr; | ||
882 | ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); | ||
883 | pkc->prev = curr; | ||
884 | pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); | ||
885 | BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); | ||
886 | BLOCK_NUM_PKTS(pbd) += 1; | ||
887 | atomic_inc(&pkc->blk_fill_in_prog); | ||
888 | prb_run_all_ft_ops(pkc, ppd); | ||
889 | } | ||
890 | |||
891 | /* Assumes caller has the sk->rx_queue.lock */ | ||
892 | static void *__packet_lookup_frame_in_block(struct packet_sock *po, | ||
893 | struct sk_buff *skb, | ||
894 | int status, | ||
895 | unsigned int len | ||
896 | ) | ||
897 | { | ||
898 | struct tpacket_kbdq_core *pkc; | ||
899 | struct tpacket_block_desc *pbd; | ||
900 | char *curr, *end; | ||
901 | |||
902 | pkc = GET_PBDQC_FROM_RB(&po->rx_ring); | ||
903 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
904 | |||
905 | /* Queue is frozen when user space is lagging behind */ | ||
906 | if (prb_queue_frozen(pkc)) { | ||
907 | /* | ||
908 | * Check if that last block which caused the queue to freeze, | ||
909 | * is still in_use by user-space. | ||
910 | */ | ||
911 | if (prb_curr_blk_in_use(pkc, pbd)) { | ||
912 | /* Can't record this packet */ | ||
913 | return NULL; | ||
914 | } else { | ||
915 | /* | ||
916 | * Ok, the block was released by user-space. | ||
917 | * Now let's open that block. | ||
918 | * opening a block also thaws the queue. | ||
919 | * Thawing is a side effect. | ||
920 | */ | ||
921 | prb_open_block(pkc, pbd); | ||
922 | } | ||
923 | } | ||
924 | |||
925 | smp_mb(); | ||
926 | curr = pkc->nxt_offset; | ||
927 | pkc->skb = skb; | ||
928 | end = (char *)pbd + pkc->kblk_size; | ||
929 | |||
930 | /* first try the current block */ | ||
931 | if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { | ||
932 | prb_fill_curr_block(curr, pkc, pbd, len); | ||
933 | return (void *)curr; | ||
934 | } | ||
935 | |||
936 | /* Ok, close the current block */ | ||
937 | prb_retire_current_block(pkc, po, 0); | ||
938 | |||
939 | /* Now, try to dispatch the next block */ | ||
940 | curr = (char *)prb_dispatch_next_block(pkc, po); | ||
941 | if (curr) { | ||
942 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
943 | prb_fill_curr_block(curr, pkc, pbd, len); | ||
944 | return (void *)curr; | ||
945 | } | ||
946 | |||
947 | /* | ||
948 | * No free blocks are available.user_space hasn't caught up yet. | ||
949 | * Queue was just frozen and now this packet will get dropped. | ||
950 | */ | ||
951 | return NULL; | ||
952 | } | ||
953 | |||
954 | static void *packet_current_rx_frame(struct packet_sock *po, | ||
955 | struct sk_buff *skb, | ||
956 | int status, unsigned int len) | ||
957 | { | ||
958 | char *curr = NULL; | ||
959 | switch (po->tp_version) { | ||
960 | case TPACKET_V1: | ||
961 | case TPACKET_V2: | ||
962 | curr = packet_lookup_frame(po, &po->rx_ring, | ||
963 | po->rx_ring.head, status); | ||
964 | return curr; | ||
965 | case TPACKET_V3: | ||
966 | return __packet_lookup_frame_in_block(po, skb, status, len); | ||
967 | default: | ||
968 | WARN(1, "TPACKET version not supported\n"); | ||
969 | BUG(); | ||
970 | return NULL; | ||
971 | } | ||
972 | } | ||
973 | |||
974 | static void *prb_lookup_block(struct packet_sock *po, | ||
975 | struct packet_ring_buffer *rb, | ||
976 | unsigned int previous, | ||
977 | int status) | ||
978 | { | ||
979 | struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); | ||
980 | struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous); | ||
981 | |||
982 | if (status != BLOCK_STATUS(pbd)) | ||
983 | return NULL; | ||
984 | return pbd; | ||
985 | } | ||
986 | |||
987 | static int prb_previous_blk_num(struct packet_ring_buffer *rb) | ||
988 | { | ||
989 | unsigned int prev; | ||
990 | if (rb->prb_bdqc.kactive_blk_num) | ||
991 | prev = rb->prb_bdqc.kactive_blk_num-1; | ||
992 | else | ||
993 | prev = rb->prb_bdqc.knum_blocks-1; | ||
994 | return prev; | ||
995 | } | ||
996 | |||
997 | /* Assumes caller has held the rx_queue.lock */ | ||
998 | static void *__prb_previous_block(struct packet_sock *po, | ||
999 | struct packet_ring_buffer *rb, | ||
1000 | int status) | ||
1001 | { | ||
1002 | unsigned int previous = prb_previous_blk_num(rb); | ||
1003 | return prb_lookup_block(po, rb, previous, status); | ||
1004 | } | ||
1005 | |||
1006 | static void *packet_previous_rx_frame(struct packet_sock *po, | ||
1007 | struct packet_ring_buffer *rb, | ||
1008 | int status) | ||
1009 | { | ||
1010 | if (po->tp_version <= TPACKET_V2) | ||
1011 | return packet_previous_frame(po, rb, status); | ||
1012 | |||
1013 | return __prb_previous_block(po, rb, status); | ||
1014 | } | ||
1015 | |||
1016 | static void packet_increment_rx_head(struct packet_sock *po, | ||
1017 | struct packet_ring_buffer *rb) | ||
1018 | { | ||
1019 | switch (po->tp_version) { | ||
1020 | case TPACKET_V1: | ||
1021 | case TPACKET_V2: | ||
1022 | return packet_increment_head(rb); | ||
1023 | case TPACKET_V3: | ||
1024 | default: | ||
1025 | WARN(1, "TPACKET version not supported.\n"); | ||
1026 | BUG(); | ||
1027 | return; | ||
1028 | } | ||
1029 | } | ||
1030 | |||
1031 | static void *packet_previous_frame(struct packet_sock *po, | ||
1032 | struct packet_ring_buffer *rb, | 393 | struct packet_ring_buffer *rb, |
1033 | int status) | 394 | int status) |
1034 | { | 395 | { |
@@ -1036,7 +397,7 @@ static void *packet_previous_frame(struct packet_sock *po, | |||
1036 | return packet_lookup_frame(po, rb, previous, status); | 397 | return packet_lookup_frame(po, rb, previous, status); |
1037 | } | 398 | } |
1038 | 399 | ||
1039 | static void packet_increment_head(struct packet_ring_buffer *buff) | 400 | static inline void packet_increment_head(struct packet_ring_buffer *buff) |
1040 | { | 401 | { |
1041 | buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; | 402 | buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; |
1042 | } | 403 | } |
@@ -1093,6 +454,43 @@ static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *sk | |||
1093 | return f->arr[cpu % num]; | 454 | return f->arr[cpu % num]; |
1094 | } | 455 | } |
1095 | 456 | ||
457 | static struct sk_buff *fanout_check_defrag(struct sk_buff *skb) | ||
458 | { | ||
459 | #ifdef CONFIG_INET | ||
460 | const struct iphdr *iph; | ||
461 | u32 len; | ||
462 | |||
463 | if (skb->protocol != htons(ETH_P_IP)) | ||
464 | return skb; | ||
465 | |||
466 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | ||
467 | return skb; | ||
468 | |||
469 | iph = ip_hdr(skb); | ||
470 | if (iph->ihl < 5 || iph->version != 4) | ||
471 | return skb; | ||
472 | if (!pskb_may_pull(skb, iph->ihl*4)) | ||
473 | return skb; | ||
474 | iph = ip_hdr(skb); | ||
475 | len = ntohs(iph->tot_len); | ||
476 | if (skb->len < len || len < (iph->ihl * 4)) | ||
477 | return skb; | ||
478 | |||
479 | if (ip_is_fragment(ip_hdr(skb))) { | ||
480 | skb = skb_share_check(skb, GFP_ATOMIC); | ||
481 | if (skb) { | ||
482 | if (pskb_trim_rcsum(skb, len)) | ||
483 | return skb; | ||
484 | memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); | ||
485 | if (ip_defrag(skb, IP_DEFRAG_AF_PACKET)) | ||
486 | return NULL; | ||
487 | skb->rxhash = 0; | ||
488 | } | ||
489 | } | ||
490 | #endif | ||
491 | return skb; | ||
492 | } | ||
493 | |||
1096 | static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | 494 | static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, |
1097 | struct packet_type *pt, struct net_device *orig_dev) | 495 | struct packet_type *pt, struct net_device *orig_dev) |
1098 | { | 496 | { |
@@ -1111,7 +509,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | |||
1111 | case PACKET_FANOUT_HASH: | 509 | case PACKET_FANOUT_HASH: |
1112 | default: | 510 | default: |
1113 | if (f->defrag) { | 511 | if (f->defrag) { |
1114 | skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); | 512 | skb = fanout_check_defrag(skb); |
1115 | if (!skb) | 513 | if (!skb) |
1116 | return 0; | 514 | return 0; |
1117 | } | 515 | } |
@@ -1131,8 +529,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | |||
1131 | return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); | 529 | return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); |
1132 | } | 530 | } |
1133 | 531 | ||
1134 | DEFINE_MUTEX(fanout_mutex); | 532 | static DEFINE_MUTEX(fanout_mutex); |
1135 | EXPORT_SYMBOL_GPL(fanout_mutex); | ||
1136 | static LIST_HEAD(fanout_list); | 533 | static LIST_HEAD(fanout_list); |
1137 | 534 | ||
1138 | static void __fanout_link(struct sock *sk, struct packet_sock *po) | 535 | static void __fanout_link(struct sock *sk, struct packet_sock *po) |
@@ -1162,14 +559,6 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) | |||
1162 | spin_unlock(&f->lock); | 559 | spin_unlock(&f->lock); |
1163 | } | 560 | } |
1164 | 561 | ||
1165 | static bool match_fanout_group(struct packet_type *ptype, struct sock * sk) | ||
1166 | { | ||
1167 | if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) | ||
1168 | return true; | ||
1169 | |||
1170 | return false; | ||
1171 | } | ||
1172 | |||
1173 | static int fanout_add(struct sock *sk, u16 id, u16 type_flags) | 562 | static int fanout_add(struct sock *sk, u16 id, u16 type_flags) |
1174 | { | 563 | { |
1175 | struct packet_sock *po = pkt_sk(sk); | 564 | struct packet_sock *po = pkt_sk(sk); |
@@ -1222,7 +611,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) | |||
1222 | match->prot_hook.dev = po->prot_hook.dev; | 611 | match->prot_hook.dev = po->prot_hook.dev; |
1223 | match->prot_hook.func = packet_rcv_fanout; | 612 | match->prot_hook.func = packet_rcv_fanout; |
1224 | match->prot_hook.af_packet_priv = match; | 613 | match->prot_hook.af_packet_priv = match; |
1225 | match->prot_hook.id_match = match_fanout_group; | ||
1226 | dev_add_pack(&match->prot_hook); | 614 | dev_add_pack(&match->prot_hook); |
1227 | list_add(&match->list, &fanout_list); | 615 | list_add(&match->list, &fanout_list); |
1228 | } | 616 | } |
@@ -1253,9 +641,9 @@ static void fanout_release(struct sock *sk) | |||
1253 | if (!f) | 641 | if (!f) |
1254 | return; | 642 | return; |
1255 | 643 | ||
1256 | mutex_lock(&fanout_mutex); | ||
1257 | po->fanout = NULL; | 644 | po->fanout = NULL; |
1258 | 645 | ||
646 | mutex_lock(&fanout_mutex); | ||
1259 | if (atomic_dec_and_test(&f->sk_ref)) { | 647 | if (atomic_dec_and_test(&f->sk_ref)) { |
1260 | list_del(&f->list); | 648 | list_del(&f->list); |
1261 | dev_remove_pack(&f->prot_hook); | 649 | dev_remove_pack(&f->prot_hook); |
@@ -1349,7 +737,6 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, | |||
1349 | struct net_device *dev; | 737 | struct net_device *dev; |
1350 | __be16 proto = 0; | 738 | __be16 proto = 0; |
1351 | int err; | 739 | int err; |
1352 | int extra_len = 0; | ||
1353 | 740 | ||
1354 | /* | 741 | /* |
1355 | * Get and verify the address. | 742 | * Get and verify the address. |
@@ -1367,7 +754,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, | |||
1367 | * Find the device first to size check it | 754 | * Find the device first to size check it |
1368 | */ | 755 | */ |
1369 | 756 | ||
1370 | saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; | 757 | saddr->spkt_device[13] = 0; |
1371 | retry: | 758 | retry: |
1372 | rcu_read_lock(); | 759 | rcu_read_lock(); |
1373 | dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); | 760 | dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); |
@@ -1384,25 +771,16 @@ retry: | |||
1384 | * raw protocol and you must do your own fragmentation at this level. | 771 | * raw protocol and you must do your own fragmentation at this level. |
1385 | */ | 772 | */ |
1386 | 773 | ||
1387 | if (unlikely(sock_flag(sk, SOCK_NOFCS))) { | ||
1388 | if (!netif_supports_nofcs(dev)) { | ||
1389 | err = -EPROTONOSUPPORT; | ||
1390 | goto out_unlock; | ||
1391 | } | ||
1392 | extra_len = 4; /* We're doing our own CRC */ | ||
1393 | } | ||
1394 | |||
1395 | err = -EMSGSIZE; | 774 | err = -EMSGSIZE; |
1396 | if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) | 775 | if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN) |
1397 | goto out_unlock; | 776 | goto out_unlock; |
1398 | 777 | ||
1399 | if (!skb) { | 778 | if (!skb) { |
1400 | size_t reserved = LL_RESERVED_SPACE(dev); | 779 | size_t reserved = LL_RESERVED_SPACE(dev); |
1401 | int tlen = dev->needed_tailroom; | ||
1402 | unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; | 780 | unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; |
1403 | 781 | ||
1404 | rcu_read_unlock(); | 782 | rcu_read_unlock(); |
1405 | skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); | 783 | skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL); |
1406 | if (skb == NULL) | 784 | if (skb == NULL) |
1407 | return -ENOBUFS; | 785 | return -ENOBUFS; |
1408 | /* FIXME: Save some space for broken drivers that write a hard | 786 | /* FIXME: Save some space for broken drivers that write a hard |
@@ -1425,7 +803,7 @@ retry: | |||
1425 | goto retry; | 803 | goto retry; |
1426 | } | 804 | } |
1427 | 805 | ||
1428 | if (len > (dev->mtu + dev->hard_header_len + extra_len)) { | 806 | if (len > (dev->mtu + dev->hard_header_len)) { |
1429 | /* Earlier code assumed this would be a VLAN pkt, | 807 | /* Earlier code assumed this would be a VLAN pkt, |
1430 | * double-check this now that we have the actual | 808 | * double-check this now that we have the actual |
1431 | * packet in hand. | 809 | * packet in hand. |
@@ -1447,9 +825,6 @@ retry: | |||
1447 | if (err < 0) | 825 | if (err < 0) |
1448 | goto out_unlock; | 826 | goto out_unlock; |
1449 | 827 | ||
1450 | if (unlikely(extra_len == 4)) | ||
1451 | skb->no_fcs = 1; | ||
1452 | |||
1453 | dev_queue_xmit(skb); | 828 | dev_queue_xmit(skb); |
1454 | rcu_read_unlock(); | 829 | rcu_read_unlock(); |
1455 | return len; | 830 | return len; |
@@ -1461,7 +836,7 @@ out_free: | |||
1461 | return err; | 836 | return err; |
1462 | } | 837 | } |
1463 | 838 | ||
1464 | static unsigned int run_filter(const struct sk_buff *skb, | 839 | static inline unsigned int run_filter(const struct sk_buff *skb, |
1465 | const struct sock *sk, | 840 | const struct sock *sk, |
1466 | unsigned int res) | 841 | unsigned int res) |
1467 | { | 842 | { |
@@ -1533,7 +908,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1533 | if (snaplen > res) | 908 | if (snaplen > res) |
1534 | snaplen = res; | 909 | snaplen = res; |
1535 | 910 | ||
1536 | if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) | 911 | if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= |
912 | (unsigned)sk->sk_rcvbuf) | ||
1537 | goto drop_n_acct; | 913 | goto drop_n_acct; |
1538 | 914 | ||
1539 | if (skb_shared(skb)) { | 915 | if (skb_shared(skb)) { |
@@ -1545,7 +921,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1545 | skb->data = skb_head; | 921 | skb->data = skb_head; |
1546 | skb->len = skb_len; | 922 | skb->len = skb_len; |
1547 | } | 923 | } |
1548 | consume_skb(skb); | 924 | kfree_skb(skb); |
1549 | skb = nskb; | 925 | skb = nskb; |
1550 | } | 926 | } |
1551 | 927 | ||
@@ -1609,13 +985,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1609 | union { | 985 | union { |
1610 | struct tpacket_hdr *h1; | 986 | struct tpacket_hdr *h1; |
1611 | struct tpacket2_hdr *h2; | 987 | struct tpacket2_hdr *h2; |
1612 | struct tpacket3_hdr *h3; | ||
1613 | void *raw; | 988 | void *raw; |
1614 | } h; | 989 | } h; |
1615 | u8 *skb_head = skb->data; | 990 | u8 *skb_head = skb->data; |
1616 | int skb_len = skb->len; | 991 | int skb_len = skb->len; |
1617 | unsigned int snaplen, res; | 992 | unsigned int snaplen, res; |
1618 | unsigned long status = TP_STATUS_USER; | 993 | unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; |
1619 | unsigned short macoff, netoff, hdrlen; | 994 | unsigned short macoff, netoff, hdrlen; |
1620 | struct sk_buff *copy_skb = NULL; | 995 | struct sk_buff *copy_skb = NULL; |
1621 | struct timeval tv; | 996 | struct timeval tv; |
@@ -1655,51 +1030,43 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1655 | macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + | 1030 | macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + |
1656 | po->tp_reserve; | 1031 | po->tp_reserve; |
1657 | } else { | 1032 | } else { |
1658 | unsigned int maclen = skb_network_offset(skb); | 1033 | unsigned maclen = skb_network_offset(skb); |
1659 | netoff = TPACKET_ALIGN(po->tp_hdrlen + | 1034 | netoff = TPACKET_ALIGN(po->tp_hdrlen + |
1660 | (maclen < 16 ? 16 : maclen)) + | 1035 | (maclen < 16 ? 16 : maclen)) + |
1661 | po->tp_reserve; | 1036 | po->tp_reserve; |
1662 | macoff = netoff - maclen; | 1037 | macoff = netoff - maclen; |
1663 | } | 1038 | } |
1664 | if (po->tp_version <= TPACKET_V2) { | 1039 | |
1665 | if (macoff + snaplen > po->rx_ring.frame_size) { | 1040 | if (macoff + snaplen > po->rx_ring.frame_size) { |
1666 | if (po->copy_thresh && | 1041 | if (po->copy_thresh && |
1667 | atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { | 1042 | atomic_read(&sk->sk_rmem_alloc) + skb->truesize < |
1668 | if (skb_shared(skb)) { | 1043 | (unsigned)sk->sk_rcvbuf) { |
1669 | copy_skb = skb_clone(skb, GFP_ATOMIC); | 1044 | if (skb_shared(skb)) { |
1670 | } else { | 1045 | copy_skb = skb_clone(skb, GFP_ATOMIC); |
1671 | copy_skb = skb_get(skb); | 1046 | } else { |
1672 | skb_head = skb->data; | 1047 | copy_skb = skb_get(skb); |
1673 | } | 1048 | skb_head = skb->data; |
1674 | if (copy_skb) | ||
1675 | skb_set_owner_r(copy_skb, sk); | ||
1676 | } | 1049 | } |
1677 | snaplen = po->rx_ring.frame_size - macoff; | 1050 | if (copy_skb) |
1678 | if ((int)snaplen < 0) | 1051 | skb_set_owner_r(copy_skb, sk); |
1679 | snaplen = 0; | ||
1680 | } | 1052 | } |
1053 | snaplen = po->rx_ring.frame_size - macoff; | ||
1054 | if ((int)snaplen < 0) | ||
1055 | snaplen = 0; | ||
1681 | } | 1056 | } |
1057 | |||
1682 | spin_lock(&sk->sk_receive_queue.lock); | 1058 | spin_lock(&sk->sk_receive_queue.lock); |
1683 | h.raw = packet_current_rx_frame(po, skb, | 1059 | h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL); |
1684 | TP_STATUS_KERNEL, (macoff+snaplen)); | ||
1685 | if (!h.raw) | 1060 | if (!h.raw) |
1686 | goto ring_is_full; | 1061 | goto ring_is_full; |
1687 | if (po->tp_version <= TPACKET_V2) { | 1062 | packet_increment_head(&po->rx_ring); |
1688 | packet_increment_rx_head(po, &po->rx_ring); | ||
1689 | /* | ||
1690 | * LOSING will be reported till you read the stats, | ||
1691 | * because it's COR - Clear On Read. | ||
1692 | * Anyways, moving it for V1/V2 only as V3 doesn't need this | ||
1693 | * at packet level. | ||
1694 | */ | ||
1695 | if (po->stats.tp_drops) | ||
1696 | status |= TP_STATUS_LOSING; | ||
1697 | } | ||
1698 | po->stats.tp_packets++; | 1063 | po->stats.tp_packets++; |
1699 | if (copy_skb) { | 1064 | if (copy_skb) { |
1700 | status |= TP_STATUS_COPY; | 1065 | status |= TP_STATUS_COPY; |
1701 | __skb_queue_tail(&sk->sk_receive_queue, copy_skb); | 1066 | __skb_queue_tail(&sk->sk_receive_queue, copy_skb); |
1702 | } | 1067 | } |
1068 | if (!po->stats.tp_drops) | ||
1069 | status &= ~TP_STATUS_LOSING; | ||
1703 | spin_unlock(&sk->sk_receive_queue.lock); | 1070 | spin_unlock(&sk->sk_receive_queue.lock); |
1704 | 1071 | ||
1705 | skb_copy_bits(skb, 0, h.raw + macoff, snaplen); | 1072 | skb_copy_bits(skb, 0, h.raw + macoff, snaplen); |
@@ -1750,29 +1117,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1750 | h.h2->tp_padding = 0; | 1117 | h.h2->tp_padding = 0; |
1751 | hdrlen = sizeof(*h.h2); | 1118 | hdrlen = sizeof(*h.h2); |
1752 | break; | 1119 | break; |
1753 | case TPACKET_V3: | ||
1754 | /* tp_nxt_offset,vlan are already populated above. | ||
1755 | * So DONT clear those fields here | ||
1756 | */ | ||
1757 | h.h3->tp_status |= status; | ||
1758 | h.h3->tp_len = skb->len; | ||
1759 | h.h3->tp_snaplen = snaplen; | ||
1760 | h.h3->tp_mac = macoff; | ||
1761 | h.h3->tp_net = netoff; | ||
1762 | if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) | ||
1763 | && shhwtstamps->syststamp.tv64) | ||
1764 | ts = ktime_to_timespec(shhwtstamps->syststamp); | ||
1765 | else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) | ||
1766 | && shhwtstamps->hwtstamp.tv64) | ||
1767 | ts = ktime_to_timespec(shhwtstamps->hwtstamp); | ||
1768 | else if (skb->tstamp.tv64) | ||
1769 | ts = ktime_to_timespec(skb->tstamp); | ||
1770 | else | ||
1771 | getnstimeofday(&ts); | ||
1772 | h.h3->tp_sec = ts.tv_sec; | ||
1773 | h.h3->tp_nsec = ts.tv_nsec; | ||
1774 | hdrlen = sizeof(*h.h3); | ||
1775 | break; | ||
1776 | default: | 1120 | default: |
1777 | BUG(); | 1121 | BUG(); |
1778 | } | 1122 | } |
@@ -1793,19 +1137,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1793 | { | 1137 | { |
1794 | u8 *start, *end; | 1138 | u8 *start, *end; |
1795 | 1139 | ||
1796 | if (po->tp_version <= TPACKET_V2) { | 1140 | end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen); |
1797 | end = (u8 *)PAGE_ALIGN((unsigned long)h.raw | 1141 | for (start = h.raw; start < end; start += PAGE_SIZE) |
1798 | + macoff + snaplen); | 1142 | flush_dcache_page(pgv_to_page(start)); |
1799 | for (start = h.raw; start < end; start += PAGE_SIZE) | ||
1800 | flush_dcache_page(pgv_to_page(start)); | ||
1801 | } | ||
1802 | smp_wmb(); | 1143 | smp_wmb(); |
1803 | } | 1144 | } |
1804 | #endif | 1145 | #endif |
1805 | if (po->tp_version <= TPACKET_V2) | 1146 | __packet_set_status(po, h.raw, status); |
1806 | __packet_set_status(po, h.raw, status); | ||
1807 | else | ||
1808 | prb_clear_blk_fill_status(&po->rx_ring); | ||
1809 | 1147 | ||
1810 | sk->sk_data_ready(sk, 0); | 1148 | sk->sk_data_ready(sk, 0); |
1811 | 1149 | ||
@@ -1832,8 +1170,11 @@ static void tpacket_destruct_skb(struct sk_buff *skb) | |||
1832 | struct packet_sock *po = pkt_sk(skb->sk); | 1170 | struct packet_sock *po = pkt_sk(skb->sk); |
1833 | void *ph; | 1171 | void *ph; |
1834 | 1172 | ||
1173 | BUG_ON(skb == NULL); | ||
1174 | |||
1835 | if (likely(po->tx_ring.pg_vec)) { | 1175 | if (likely(po->tx_ring.pg_vec)) { |
1836 | ph = skb_shinfo(skb)->destructor_arg; | 1176 | ph = skb_shinfo(skb)->destructor_arg; |
1177 | BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); | ||
1837 | BUG_ON(atomic_read(&po->tx_ring.pending) == 0); | 1178 | BUG_ON(atomic_read(&po->tx_ring.pending) == 0); |
1838 | atomic_dec(&po->tx_ring.pending); | 1179 | atomic_dec(&po->tx_ring.pending); |
1839 | __packet_set_status(po, ph, TP_STATUS_AVAILABLE); | 1180 | __packet_set_status(po, ph, TP_STATUS_AVAILABLE); |
@@ -1844,7 +1185,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb) | |||
1844 | 1185 | ||
1845 | static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, | 1186 | static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, |
1846 | void *frame, struct net_device *dev, int size_max, | 1187 | void *frame, struct net_device *dev, int size_max, |
1847 | __be16 proto, unsigned char *addr, int hlen) | 1188 | __be16 proto, unsigned char *addr) |
1848 | { | 1189 | { |
1849 | union { | 1190 | union { |
1850 | struct tpacket_hdr *h1; | 1191 | struct tpacket_hdr *h1; |
@@ -1878,38 +1219,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, | |||
1878 | return -EMSGSIZE; | 1219 | return -EMSGSIZE; |
1879 | } | 1220 | } |
1880 | 1221 | ||
1881 | skb_reserve(skb, hlen); | 1222 | skb_reserve(skb, LL_RESERVED_SPACE(dev)); |
1882 | skb_reset_network_header(skb); | 1223 | skb_reset_network_header(skb); |
1883 | 1224 | ||
1884 | if (po->tp_tx_has_off) { | 1225 | data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); |
1885 | int off_min, off_max, off; | ||
1886 | off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); | ||
1887 | off_max = po->tx_ring.frame_size - tp_len; | ||
1888 | if (sock->type == SOCK_DGRAM) { | ||
1889 | switch (po->tp_version) { | ||
1890 | case TPACKET_V2: | ||
1891 | off = ph.h2->tp_net; | ||
1892 | break; | ||
1893 | default: | ||
1894 | off = ph.h1->tp_net; | ||
1895 | break; | ||
1896 | } | ||
1897 | } else { | ||
1898 | switch (po->tp_version) { | ||
1899 | case TPACKET_V2: | ||
1900 | off = ph.h2->tp_mac; | ||
1901 | break; | ||
1902 | default: | ||
1903 | off = ph.h1->tp_mac; | ||
1904 | break; | ||
1905 | } | ||
1906 | } | ||
1907 | if (unlikely((off < off_min) || (off_max < off))) | ||
1908 | return -EINVAL; | ||
1909 | data = ph.raw + off; | ||
1910 | } else { | ||
1911 | data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll); | ||
1912 | } | ||
1913 | to_write = tp_len; | 1226 | to_write = tp_len; |
1914 | 1227 | ||
1915 | if (sock->type == SOCK_DGRAM) { | 1228 | if (sock->type == SOCK_DGRAM) { |
@@ -1935,6 +1248,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, | |||
1935 | to_write -= dev->hard_header_len; | 1248 | to_write -= dev->hard_header_len; |
1936 | } | 1249 | } |
1937 | 1250 | ||
1251 | err = -EFAULT; | ||
1938 | offset = offset_in_page(data); | 1252 | offset = offset_in_page(data); |
1939 | len_max = PAGE_SIZE - offset; | 1253 | len_max = PAGE_SIZE - offset; |
1940 | len = ((to_write > len_max) ? len_max : to_write); | 1254 | len = ((to_write > len_max) ? len_max : to_write); |
@@ -1979,11 +1293,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) | |||
1979 | int tp_len, size_max; | 1293 | int tp_len, size_max; |
1980 | unsigned char *addr; | 1294 | unsigned char *addr; |
1981 | int len_sum = 0; | 1295 | int len_sum = 0; |
1982 | int status = TP_STATUS_AVAILABLE; | 1296 | int status = 0; |
1983 | int hlen, tlen; | ||
1984 | 1297 | ||
1985 | mutex_lock(&po->pg_vec_lock); | 1298 | mutex_lock(&po->pg_vec_lock); |
1986 | 1299 | ||
1300 | err = -EBUSY; | ||
1987 | if (saddr == NULL) { | 1301 | if (saddr == NULL) { |
1988 | dev = po->prot_hook.dev; | 1302 | dev = po->prot_hook.dev; |
1989 | proto = po->num; | 1303 | proto = po->num; |
@@ -2028,17 +1342,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) | |||
2028 | } | 1342 | } |
2029 | 1343 | ||
2030 | status = TP_STATUS_SEND_REQUEST; | 1344 | status = TP_STATUS_SEND_REQUEST; |
2031 | hlen = LL_RESERVED_SPACE(dev); | ||
2032 | tlen = dev->needed_tailroom; | ||
2033 | skb = sock_alloc_send_skb(&po->sk, | 1345 | skb = sock_alloc_send_skb(&po->sk, |
2034 | hlen + tlen + sizeof(struct sockaddr_ll), | 1346 | LL_ALLOCATED_SPACE(dev) |
1347 | + sizeof(struct sockaddr_ll), | ||
2035 | 0, &err); | 1348 | 0, &err); |
2036 | 1349 | ||
2037 | if (unlikely(skb == NULL)) | 1350 | if (unlikely(skb == NULL)) |
2038 | goto out_status; | 1351 | goto out_status; |
2039 | 1352 | ||
2040 | tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, | 1353 | tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto, |
2041 | addr, hlen); | 1354 | addr); |
2042 | 1355 | ||
2043 | if (unlikely(tp_len < 0)) { | 1356 | if (unlikely(tp_len < 0)) { |
2044 | if (po->tp_loss) { | 1357 | if (po->tp_loss) { |
@@ -2095,10 +1408,10 @@ out: | |||
2095 | return err; | 1408 | return err; |
2096 | } | 1409 | } |
2097 | 1410 | ||
2098 | static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, | 1411 | static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, |
2099 | size_t reserve, size_t len, | 1412 | size_t reserve, size_t len, |
2100 | size_t linear, int noblock, | 1413 | size_t linear, int noblock, |
2101 | int *err) | 1414 | int *err) |
2102 | { | 1415 | { |
2103 | struct sk_buff *skb; | 1416 | struct sk_buff *skb; |
2104 | 1417 | ||
@@ -2135,8 +1448,6 @@ static int packet_snd(struct socket *sock, | |||
2135 | int vnet_hdr_len; | 1448 | int vnet_hdr_len; |
2136 | struct packet_sock *po = pkt_sk(sk); | 1449 | struct packet_sock *po = pkt_sk(sk); |
2137 | unsigned short gso_type = 0; | 1450 | unsigned short gso_type = 0; |
2138 | int hlen, tlen; | ||
2139 | int extra_len = 0; | ||
2140 | 1451 | ||
2141 | /* | 1452 | /* |
2142 | * Get and verify the address. | 1453 | * Get and verify the address. |
@@ -2216,22 +1527,13 @@ static int packet_snd(struct socket *sock, | |||
2216 | } | 1527 | } |
2217 | } | 1528 | } |
2218 | 1529 | ||
2219 | if (unlikely(sock_flag(sk, SOCK_NOFCS))) { | ||
2220 | if (!netif_supports_nofcs(dev)) { | ||
2221 | err = -EPROTONOSUPPORT; | ||
2222 | goto out_unlock; | ||
2223 | } | ||
2224 | extra_len = 4; /* We're doing our own CRC */ | ||
2225 | } | ||
2226 | |||
2227 | err = -EMSGSIZE; | 1530 | err = -EMSGSIZE; |
2228 | if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) | 1531 | if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN)) |
2229 | goto out_unlock; | 1532 | goto out_unlock; |
2230 | 1533 | ||
2231 | err = -ENOBUFS; | 1534 | err = -ENOBUFS; |
2232 | hlen = LL_RESERVED_SPACE(dev); | 1535 | skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), |
2233 | tlen = dev->needed_tailroom; | 1536 | LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len, |
2234 | skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, vnet_hdr.hdr_len, | ||
2235 | msg->msg_flags & MSG_DONTWAIT, &err); | 1537 | msg->msg_flags & MSG_DONTWAIT, &err); |
2236 | if (skb == NULL) | 1538 | if (skb == NULL) |
2237 | goto out_unlock; | 1539 | goto out_unlock; |
@@ -2251,7 +1553,7 @@ static int packet_snd(struct socket *sock, | |||
2251 | if (err < 0) | 1553 | if (err < 0) |
2252 | goto out_free; | 1554 | goto out_free; |
2253 | 1555 | ||
2254 | if (!gso_type && (len > dev->mtu + reserve + extra_len)) { | 1556 | if (!gso_type && (len > dev->mtu + reserve)) { |
2255 | /* Earlier code assumed this would be a VLAN pkt, | 1557 | /* Earlier code assumed this would be a VLAN pkt, |
2256 | * double-check this now that we have the actual | 1558 | * double-check this now that we have the actual |
2257 | * packet in hand. | 1559 | * packet in hand. |
@@ -2289,9 +1591,6 @@ static int packet_snd(struct socket *sock, | |||
2289 | len += vnet_hdr_len; | 1591 | len += vnet_hdr_len; |
2290 | } | 1592 | } |
2291 | 1593 | ||
2292 | if (unlikely(extra_len == 4)) | ||
2293 | skb->no_fcs = 1; | ||
2294 | |||
2295 | /* | 1594 | /* |
2296 | * Now send it | 1595 | * Now send it |
2297 | */ | 1596 | */ |
@@ -2335,7 +1634,7 @@ static int packet_release(struct socket *sock) | |||
2335 | struct sock *sk = sock->sk; | 1634 | struct sock *sk = sock->sk; |
2336 | struct packet_sock *po; | 1635 | struct packet_sock *po; |
2337 | struct net *net; | 1636 | struct net *net; |
2338 | union tpacket_req_u req_u; | 1637 | struct tpacket_req req; |
2339 | 1638 | ||
2340 | if (!sk) | 1639 | if (!sk) |
2341 | return 0; | 1640 | return 0; |
@@ -2343,13 +1642,10 @@ static int packet_release(struct socket *sock) | |||
2343 | net = sock_net(sk); | 1642 | net = sock_net(sk); |
2344 | po = pkt_sk(sk); | 1643 | po = pkt_sk(sk); |
2345 | 1644 | ||
2346 | mutex_lock(&net->packet.sklist_lock); | 1645 | spin_lock_bh(&net->packet.sklist_lock); |
2347 | sk_del_node_init_rcu(sk); | 1646 | sk_del_node_init_rcu(sk); |
2348 | mutex_unlock(&net->packet.sklist_lock); | ||
2349 | |||
2350 | preempt_disable(); | ||
2351 | sock_prot_inuse_add(net, sk->sk_prot, -1); | 1647 | sock_prot_inuse_add(net, sk->sk_prot, -1); |
2352 | preempt_enable(); | 1648 | spin_unlock_bh(&net->packet.sklist_lock); |
2353 | 1649 | ||
2354 | spin_lock(&po->bind_lock); | 1650 | spin_lock(&po->bind_lock); |
2355 | unregister_prot_hook(sk, false); | 1651 | unregister_prot_hook(sk, false); |
@@ -2361,13 +1657,13 @@ static int packet_release(struct socket *sock) | |||
2361 | 1657 | ||
2362 | packet_flush_mclist(sk); | 1658 | packet_flush_mclist(sk); |
2363 | 1659 | ||
2364 | memset(&req_u, 0, sizeof(req_u)); | 1660 | memset(&req, 0, sizeof(req)); |
2365 | 1661 | ||
2366 | if (po->rx_ring.pg_vec) | 1662 | if (po->rx_ring.pg_vec) |
2367 | packet_set_ring(sk, &req_u, 1, 0); | 1663 | packet_set_ring(sk, &req, 1, 0); |
2368 | 1664 | ||
2369 | if (po->tx_ring.pg_vec) | 1665 | if (po->tx_ring.pg_vec) |
2370 | packet_set_ring(sk, &req_u, 1, 1); | 1666 | packet_set_ring(sk, &req, 1, 1); |
2371 | 1667 | ||
2372 | fanout_release(sk); | 1668 | fanout_release(sk); |
2373 | 1669 | ||
@@ -2395,12 +1691,8 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc | |||
2395 | { | 1691 | { |
2396 | struct packet_sock *po = pkt_sk(sk); | 1692 | struct packet_sock *po = pkt_sk(sk); |
2397 | 1693 | ||
2398 | if (po->fanout) { | 1694 | if (po->fanout) |
2399 | if (dev) | ||
2400 | dev_put(dev); | ||
2401 | |||
2402 | return -EINVAL; | 1695 | return -EINVAL; |
2403 | } | ||
2404 | 1696 | ||
2405 | lock_sock(sk); | 1697 | lock_sock(sk); |
2406 | 1698 | ||
@@ -2504,7 +1796,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, | |||
2504 | __be16 proto = (__force __be16)protocol; /* weird, but documented */ | 1796 | __be16 proto = (__force __be16)protocol; /* weird, but documented */ |
2505 | int err; | 1797 | int err; |
2506 | 1798 | ||
2507 | if (!ns_capable(net->user_ns, CAP_NET_RAW)) | 1799 | if (!capable(CAP_NET_RAW)) |
2508 | return -EPERM; | 1800 | return -EPERM; |
2509 | if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && | 1801 | if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && |
2510 | sock->type != SOCK_PACKET) | 1802 | sock->type != SOCK_PACKET) |
@@ -2548,13 +1840,10 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, | |||
2548 | register_prot_hook(sk); | 1840 | register_prot_hook(sk); |
2549 | } | 1841 | } |
2550 | 1842 | ||
2551 | mutex_lock(&net->packet.sklist_lock); | 1843 | spin_lock_bh(&net->packet.sklist_lock); |
2552 | sk_add_node_rcu(sk, &net->packet.sklist); | 1844 | sk_add_node_rcu(sk, &net->packet.sklist); |
2553 | mutex_unlock(&net->packet.sklist_lock); | ||
2554 | |||
2555 | preempt_disable(); | ||
2556 | sock_prot_inuse_add(net, &packet_proto, 1); | 1845 | sock_prot_inuse_add(net, &packet_proto, 1); |
2557 | preempt_enable(); | 1846 | spin_unlock_bh(&net->packet.sklist_lock); |
2558 | 1847 | ||
2559 | return 0; | 1848 | return 0; |
2560 | out: | 1849 | out: |
@@ -2994,27 +2283,15 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
2994 | case PACKET_RX_RING: | 2283 | case PACKET_RX_RING: |
2995 | case PACKET_TX_RING: | 2284 | case PACKET_TX_RING: |
2996 | { | 2285 | { |
2997 | union tpacket_req_u req_u; | 2286 | struct tpacket_req req; |
2998 | int len; | ||
2999 | 2287 | ||
3000 | switch (po->tp_version) { | 2288 | if (optlen < sizeof(req)) |
3001 | case TPACKET_V1: | ||
3002 | case TPACKET_V2: | ||
3003 | len = sizeof(req_u.req); | ||
3004 | break; | ||
3005 | case TPACKET_V3: | ||
3006 | default: | ||
3007 | len = sizeof(req_u.req3); | ||
3008 | break; | ||
3009 | } | ||
3010 | if (optlen < len) | ||
3011 | return -EINVAL; | 2289 | return -EINVAL; |
3012 | if (pkt_sk(sk)->has_vnet_hdr) | 2290 | if (pkt_sk(sk)->has_vnet_hdr) |
3013 | return -EINVAL; | 2291 | return -EINVAL; |
3014 | if (copy_from_user(&req_u.req, optval, len)) | 2292 | if (copy_from_user(&req, optval, sizeof(req))) |
3015 | return -EFAULT; | 2293 | return -EFAULT; |
3016 | return packet_set_ring(sk, &req_u, 0, | 2294 | return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING); |
3017 | optname == PACKET_TX_RING); | ||
3018 | } | 2295 | } |
3019 | case PACKET_COPY_THRESH: | 2296 | case PACKET_COPY_THRESH: |
3020 | { | 2297 | { |
@@ -3041,7 +2318,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
3041 | switch (val) { | 2318 | switch (val) { |
3042 | case TPACKET_V1: | 2319 | case TPACKET_V1: |
3043 | case TPACKET_V2: | 2320 | case TPACKET_V2: |
3044 | case TPACKET_V3: | ||
3045 | po->tp_version = val; | 2321 | po->tp_version = val; |
3046 | return 0; | 2322 | return 0; |
3047 | default: | 2323 | default: |
@@ -3137,19 +2413,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
3137 | 2413 | ||
3138 | return fanout_add(sk, val & 0xffff, val >> 16); | 2414 | return fanout_add(sk, val & 0xffff, val >> 16); |
3139 | } | 2415 | } |
3140 | case PACKET_TX_HAS_OFF: | ||
3141 | { | ||
3142 | unsigned int val; | ||
3143 | |||
3144 | if (optlen != sizeof(val)) | ||
3145 | return -EINVAL; | ||
3146 | if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) | ||
3147 | return -EBUSY; | ||
3148 | if (copy_from_user(&val, optval, sizeof(val))) | ||
3149 | return -EFAULT; | ||
3150 | po->tp_tx_has_off = !!val; | ||
3151 | return 0; | ||
3152 | } | ||
3153 | default: | 2416 | default: |
3154 | return -ENOPROTOOPT; | 2417 | return -ENOPROTOOPT; |
3155 | } | 2418 | } |
@@ -3159,12 +2422,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
3159 | char __user *optval, int __user *optlen) | 2422 | char __user *optval, int __user *optlen) |
3160 | { | 2423 | { |
3161 | int len; | 2424 | int len; |
3162 | int val, lv = sizeof(val); | 2425 | int val; |
3163 | struct sock *sk = sock->sk; | 2426 | struct sock *sk = sock->sk; |
3164 | struct packet_sock *po = pkt_sk(sk); | 2427 | struct packet_sock *po = pkt_sk(sk); |
3165 | void *data = &val; | 2428 | void *data; |
3166 | struct tpacket_stats st; | 2429 | struct tpacket_stats st; |
3167 | union tpacket_stats_u st_u; | ||
3168 | 2430 | ||
3169 | if (level != SOL_PACKET) | 2431 | if (level != SOL_PACKET) |
3170 | return -ENOPROTOOPT; | 2432 | return -ENOPROTOOPT; |
@@ -3177,35 +2439,42 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
3177 | 2439 | ||
3178 | switch (optname) { | 2440 | switch (optname) { |
3179 | case PACKET_STATISTICS: | 2441 | case PACKET_STATISTICS: |
2442 | if (len > sizeof(struct tpacket_stats)) | ||
2443 | len = sizeof(struct tpacket_stats); | ||
3180 | spin_lock_bh(&sk->sk_receive_queue.lock); | 2444 | spin_lock_bh(&sk->sk_receive_queue.lock); |
3181 | if (po->tp_version == TPACKET_V3) { | 2445 | st = po->stats; |
3182 | lv = sizeof(struct tpacket_stats_v3); | ||
3183 | memcpy(&st_u.stats3, &po->stats, | ||
3184 | sizeof(struct tpacket_stats)); | ||
3185 | st_u.stats3.tp_freeze_q_cnt = | ||
3186 | po->stats_u.stats3.tp_freeze_q_cnt; | ||
3187 | st_u.stats3.tp_packets += po->stats.tp_drops; | ||
3188 | data = &st_u.stats3; | ||
3189 | } else { | ||
3190 | lv = sizeof(struct tpacket_stats); | ||
3191 | st = po->stats; | ||
3192 | st.tp_packets += st.tp_drops; | ||
3193 | data = &st; | ||
3194 | } | ||
3195 | memset(&po->stats, 0, sizeof(st)); | 2446 | memset(&po->stats, 0, sizeof(st)); |
3196 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 2447 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
2448 | st.tp_packets += st.tp_drops; | ||
2449 | |||
2450 | data = &st; | ||
3197 | break; | 2451 | break; |
3198 | case PACKET_AUXDATA: | 2452 | case PACKET_AUXDATA: |
2453 | if (len > sizeof(int)) | ||
2454 | len = sizeof(int); | ||
3199 | val = po->auxdata; | 2455 | val = po->auxdata; |
2456 | |||
2457 | data = &val; | ||
3200 | break; | 2458 | break; |
3201 | case PACKET_ORIGDEV: | 2459 | case PACKET_ORIGDEV: |
2460 | if (len > sizeof(int)) | ||
2461 | len = sizeof(int); | ||
3202 | val = po->origdev; | 2462 | val = po->origdev; |
2463 | |||
2464 | data = &val; | ||
3203 | break; | 2465 | break; |
3204 | case PACKET_VNET_HDR: | 2466 | case PACKET_VNET_HDR: |
2467 | if (len > sizeof(int)) | ||
2468 | len = sizeof(int); | ||
3205 | val = po->has_vnet_hdr; | 2469 | val = po->has_vnet_hdr; |
2470 | |||
2471 | data = &val; | ||
3206 | break; | 2472 | break; |
3207 | case PACKET_VERSION: | 2473 | case PACKET_VERSION: |
2474 | if (len > sizeof(int)) | ||
2475 | len = sizeof(int); | ||
3208 | val = po->tp_version; | 2476 | val = po->tp_version; |
2477 | data = &val; | ||
3209 | break; | 2478 | break; |
3210 | case PACKET_HDRLEN: | 2479 | case PACKET_HDRLEN: |
3211 | if (len > sizeof(int)) | 2480 | if (len > sizeof(int)) |
@@ -3219,37 +2488,42 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
3219 | case TPACKET_V2: | 2488 | case TPACKET_V2: |
3220 | val = sizeof(struct tpacket2_hdr); | 2489 | val = sizeof(struct tpacket2_hdr); |
3221 | break; | 2490 | break; |
3222 | case TPACKET_V3: | ||
3223 | val = sizeof(struct tpacket3_hdr); | ||
3224 | break; | ||
3225 | default: | 2491 | default: |
3226 | return -EINVAL; | 2492 | return -EINVAL; |
3227 | } | 2493 | } |
2494 | data = &val; | ||
3228 | break; | 2495 | break; |
3229 | case PACKET_RESERVE: | 2496 | case PACKET_RESERVE: |
2497 | if (len > sizeof(unsigned int)) | ||
2498 | len = sizeof(unsigned int); | ||
3230 | val = po->tp_reserve; | 2499 | val = po->tp_reserve; |
2500 | data = &val; | ||
3231 | break; | 2501 | break; |
3232 | case PACKET_LOSS: | 2502 | case PACKET_LOSS: |
2503 | if (len > sizeof(unsigned int)) | ||
2504 | len = sizeof(unsigned int); | ||
3233 | val = po->tp_loss; | 2505 | val = po->tp_loss; |
2506 | data = &val; | ||
3234 | break; | 2507 | break; |
3235 | case PACKET_TIMESTAMP: | 2508 | case PACKET_TIMESTAMP: |
2509 | if (len > sizeof(int)) | ||
2510 | len = sizeof(int); | ||
3236 | val = po->tp_tstamp; | 2511 | val = po->tp_tstamp; |
2512 | data = &val; | ||
3237 | break; | 2513 | break; |
3238 | case PACKET_FANOUT: | 2514 | case PACKET_FANOUT: |
2515 | if (len > sizeof(int)) | ||
2516 | len = sizeof(int); | ||
3239 | val = (po->fanout ? | 2517 | val = (po->fanout ? |
3240 | ((u32)po->fanout->id | | 2518 | ((u32)po->fanout->id | |
3241 | ((u32)po->fanout->type << 16)) : | 2519 | ((u32)po->fanout->type << 16)) : |
3242 | 0); | 2520 | 0); |
3243 | break; | 2521 | data = &val; |
3244 | case PACKET_TX_HAS_OFF: | ||
3245 | val = po->tp_tx_has_off; | ||
3246 | break; | 2522 | break; |
3247 | default: | 2523 | default: |
3248 | return -ENOPROTOOPT; | 2524 | return -ENOPROTOOPT; |
3249 | } | 2525 | } |
3250 | 2526 | ||
3251 | if (len > lv) | ||
3252 | len = lv; | ||
3253 | if (put_user(len, optlen)) | 2527 | if (put_user(len, optlen)) |
3254 | return -EFAULT; | 2528 | return -EFAULT; |
3255 | if (copy_to_user(optval, data, len)) | 2529 | if (copy_to_user(optval, data, len)) |
@@ -3370,8 +2644,7 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, | |||
3370 | 2644 | ||
3371 | spin_lock_bh(&sk->sk_receive_queue.lock); | 2645 | spin_lock_bh(&sk->sk_receive_queue.lock); |
3372 | if (po->rx_ring.pg_vec) { | 2646 | if (po->rx_ring.pg_vec) { |
3373 | if (!packet_previous_rx_frame(po, &po->rx_ring, | 2647 | if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) |
3374 | TP_STATUS_KERNEL)) | ||
3375 | mask |= POLLIN | POLLRDNORM; | 2648 | mask |= POLLIN | POLLRDNORM; |
3376 | } | 2649 | } |
3377 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 2650 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
@@ -3432,7 +2705,7 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order, | |||
3432 | kfree(pg_vec); | 2705 | kfree(pg_vec); |
3433 | } | 2706 | } |
3434 | 2707 | ||
3435 | static char *alloc_one_pg_vec_page(unsigned long order) | 2708 | static inline char *alloc_one_pg_vec_page(unsigned long order) |
3436 | { | 2709 | { |
3437 | char *buffer = NULL; | 2710 | char *buffer = NULL; |
3438 | gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | | 2711 | gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | |
@@ -3490,7 +2763,7 @@ out_free_pgvec: | |||
3490 | goto out; | 2763 | goto out; |
3491 | } | 2764 | } |
3492 | 2765 | ||
3493 | static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, | 2766 | static int packet_set_ring(struct sock *sk, struct tpacket_req *req, |
3494 | int closing, int tx_ring) | 2767 | int closing, int tx_ring) |
3495 | { | 2768 | { |
3496 | struct pgv *pg_vec = NULL; | 2769 | struct pgv *pg_vec = NULL; |
@@ -3499,15 +2772,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, | |||
3499 | struct packet_ring_buffer *rb; | 2772 | struct packet_ring_buffer *rb; |
3500 | struct sk_buff_head *rb_queue; | 2773 | struct sk_buff_head *rb_queue; |
3501 | __be16 num; | 2774 | __be16 num; |
3502 | int err = -EINVAL; | 2775 | int err; |
3503 | /* Added to avoid minimal code churn */ | ||
3504 | struct tpacket_req *req = &req_u->req; | ||
3505 | |||
3506 | /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ | ||
3507 | if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { | ||
3508 | WARN(1, "Tx-ring is not supported.\n"); | ||
3509 | goto out; | ||
3510 | } | ||
3511 | 2776 | ||
3512 | rb = tx_ring ? &po->tx_ring : &po->rx_ring; | 2777 | rb = tx_ring ? &po->tx_ring : &po->rx_ring; |
3513 | rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; | 2778 | rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; |
@@ -3533,9 +2798,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, | |||
3533 | case TPACKET_V2: | 2798 | case TPACKET_V2: |
3534 | po->tp_hdrlen = TPACKET2_HDRLEN; | 2799 | po->tp_hdrlen = TPACKET2_HDRLEN; |
3535 | break; | 2800 | break; |
3536 | case TPACKET_V3: | ||
3537 | po->tp_hdrlen = TPACKET3_HDRLEN; | ||
3538 | break; | ||
3539 | } | 2801 | } |
3540 | 2802 | ||
3541 | err = -EINVAL; | 2803 | err = -EINVAL; |
@@ -3561,17 +2823,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, | |||
3561 | pg_vec = alloc_pg_vec(req, order); | 2823 | pg_vec = alloc_pg_vec(req, order); |
3562 | if (unlikely(!pg_vec)) | 2824 | if (unlikely(!pg_vec)) |
3563 | goto out; | 2825 | goto out; |
3564 | switch (po->tp_version) { | ||
3565 | case TPACKET_V3: | ||
3566 | /* Transmit path is not supported. We checked | ||
3567 | * it above but just being paranoid | ||
3568 | */ | ||
3569 | if (!tx_ring) | ||
3570 | init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); | ||
3571 | break; | ||
3572 | default: | ||
3573 | break; | ||
3574 | } | ||
3575 | } | 2826 | } |
3576 | /* Done */ | 2827 | /* Done */ |
3577 | else { | 2828 | else { |
@@ -3624,11 +2875,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, | |||
3624 | register_prot_hook(sk); | 2875 | register_prot_hook(sk); |
3625 | } | 2876 | } |
3626 | spin_unlock(&po->bind_lock); | 2877 | spin_unlock(&po->bind_lock); |
3627 | if (closing && (po->tp_version > TPACKET_V2)) { | 2878 | |
3628 | /* Because we don't support block-based V3 on tx-ring */ | ||
3629 | if (!tx_ring) | ||
3630 | prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); | ||
3631 | } | ||
3632 | release_sock(sk); | 2879 | release_sock(sk); |
3633 | 2880 | ||
3634 | if (pg_vec) | 2881 | if (pg_vec) |
@@ -3791,7 +3038,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) | |||
3791 | po->ifindex, | 3038 | po->ifindex, |
3792 | po->running, | 3039 | po->running, |
3793 | atomic_read(&s->sk_rmem_alloc), | 3040 | atomic_read(&s->sk_rmem_alloc), |
3794 | from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), | 3041 | sock_i_uid(s), |
3795 | sock_i_ino(s)); | 3042 | sock_i_ino(s)); |
3796 | } | 3043 | } |
3797 | 3044 | ||
@@ -3823,7 +3070,7 @@ static const struct file_operations packet_seq_fops = { | |||
3823 | 3070 | ||
3824 | static int __net_init packet_net_init(struct net *net) | 3071 | static int __net_init packet_net_init(struct net *net) |
3825 | { | 3072 | { |
3826 | mutex_init(&net->packet.sklist_lock); | 3073 | spin_lock_init(&net->packet.sklist_lock); |
3827 | INIT_HLIST_HEAD(&net->packet.sklist); | 3074 | INIT_HLIST_HEAD(&net->packet.sklist); |
3828 | 3075 | ||
3829 | if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) | 3076 | if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) |
diff --git a/net/packet/diag.c b/net/packet/diag.c deleted file mode 100644 index 8db6e21c46b..00000000000 --- a/net/packet/diag.c +++ /dev/null | |||
@@ -1,242 +0,0 @@ | |||
1 | #include <linux/module.h> | ||
2 | #include <linux/sock_diag.h> | ||
3 | #include <linux/net.h> | ||
4 | #include <linux/netdevice.h> | ||
5 | #include <linux/packet_diag.h> | ||
6 | #include <net/net_namespace.h> | ||
7 | #include <net/sock.h> | ||
8 | |||
9 | #include "internal.h" | ||
10 | |||
11 | static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) | ||
12 | { | ||
13 | struct packet_diag_info pinfo; | ||
14 | |||
15 | pinfo.pdi_index = po->ifindex; | ||
16 | pinfo.pdi_version = po->tp_version; | ||
17 | pinfo.pdi_reserve = po->tp_reserve; | ||
18 | pinfo.pdi_copy_thresh = po->copy_thresh; | ||
19 | pinfo.pdi_tstamp = po->tp_tstamp; | ||
20 | |||
21 | pinfo.pdi_flags = 0; | ||
22 | if (po->running) | ||
23 | pinfo.pdi_flags |= PDI_RUNNING; | ||
24 | if (po->auxdata) | ||
25 | pinfo.pdi_flags |= PDI_AUXDATA; | ||
26 | if (po->origdev) | ||
27 | pinfo.pdi_flags |= PDI_ORIGDEV; | ||
28 | if (po->has_vnet_hdr) | ||
29 | pinfo.pdi_flags |= PDI_VNETHDR; | ||
30 | if (po->tp_loss) | ||
31 | pinfo.pdi_flags |= PDI_LOSS; | ||
32 | |||
33 | return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo); | ||
34 | } | ||
35 | |||
36 | static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb) | ||
37 | { | ||
38 | struct nlattr *mca; | ||
39 | struct packet_mclist *ml; | ||
40 | |||
41 | mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST); | ||
42 | if (!mca) | ||
43 | return -EMSGSIZE; | ||
44 | |||
45 | rtnl_lock(); | ||
46 | for (ml = po->mclist; ml; ml = ml->next) { | ||
47 | struct packet_diag_mclist *dml; | ||
48 | |||
49 | dml = nla_reserve_nohdr(nlskb, sizeof(*dml)); | ||
50 | if (!dml) { | ||
51 | rtnl_unlock(); | ||
52 | nla_nest_cancel(nlskb, mca); | ||
53 | return -EMSGSIZE; | ||
54 | } | ||
55 | |||
56 | dml->pdmc_index = ml->ifindex; | ||
57 | dml->pdmc_type = ml->type; | ||
58 | dml->pdmc_alen = ml->alen; | ||
59 | dml->pdmc_count = ml->count; | ||
60 | BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr)); | ||
61 | memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr)); | ||
62 | } | ||
63 | |||
64 | rtnl_unlock(); | ||
65 | nla_nest_end(nlskb, mca); | ||
66 | |||
67 | return 0; | ||
68 | } | ||
69 | |||
70 | static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, | ||
71 | struct sk_buff *nlskb) | ||
72 | { | ||
73 | struct packet_diag_ring pdr; | ||
74 | |||
75 | if (!ring->pg_vec || ((ver > TPACKET_V2) && | ||
76 | (nl_type == PACKET_DIAG_TX_RING))) | ||
77 | return 0; | ||
78 | |||
79 | pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT; | ||
80 | pdr.pdr_block_nr = ring->pg_vec_len; | ||
81 | pdr.pdr_frame_size = ring->frame_size; | ||
82 | pdr.pdr_frame_nr = ring->frame_max + 1; | ||
83 | |||
84 | if (ver > TPACKET_V2) { | ||
85 | pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; | ||
86 | pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; | ||
87 | pdr.pdr_features = ring->prb_bdqc.feature_req_word; | ||
88 | } else { | ||
89 | pdr.pdr_retire_tmo = 0; | ||
90 | pdr.pdr_sizeof_priv = 0; | ||
91 | pdr.pdr_features = 0; | ||
92 | } | ||
93 | |||
94 | return nla_put(nlskb, nl_type, sizeof(pdr), &pdr); | ||
95 | } | ||
96 | |||
97 | static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb) | ||
98 | { | ||
99 | int ret; | ||
100 | |||
101 | mutex_lock(&po->pg_vec_lock); | ||
102 | ret = pdiag_put_ring(&po->rx_ring, po->tp_version, | ||
103 | PACKET_DIAG_RX_RING, skb); | ||
104 | if (!ret) | ||
105 | ret = pdiag_put_ring(&po->tx_ring, po->tp_version, | ||
106 | PACKET_DIAG_TX_RING, skb); | ||
107 | mutex_unlock(&po->pg_vec_lock); | ||
108 | |||
109 | return ret; | ||
110 | } | ||
111 | |||
112 | static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb) | ||
113 | { | ||
114 | int ret = 0; | ||
115 | |||
116 | mutex_lock(&fanout_mutex); | ||
117 | if (po->fanout) { | ||
118 | u32 val; | ||
119 | |||
120 | val = (u32)po->fanout->id | ((u32)po->fanout->type << 16); | ||
121 | ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val); | ||
122 | } | ||
123 | mutex_unlock(&fanout_mutex); | ||
124 | |||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct packet_diag_req *req, | ||
129 | u32 portid, u32 seq, u32 flags, int sk_ino) | ||
130 | { | ||
131 | struct nlmsghdr *nlh; | ||
132 | struct packet_diag_msg *rp; | ||
133 | struct packet_sock *po = pkt_sk(sk); | ||
134 | |||
135 | nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags); | ||
136 | if (!nlh) | ||
137 | return -EMSGSIZE; | ||
138 | |||
139 | rp = nlmsg_data(nlh); | ||
140 | rp->pdiag_family = AF_PACKET; | ||
141 | rp->pdiag_type = sk->sk_type; | ||
142 | rp->pdiag_num = ntohs(po->num); | ||
143 | rp->pdiag_ino = sk_ino; | ||
144 | sock_diag_save_cookie(sk, rp->pdiag_cookie); | ||
145 | |||
146 | if ((req->pdiag_show & PACKET_SHOW_INFO) && | ||
147 | pdiag_put_info(po, skb)) | ||
148 | goto out_nlmsg_trim; | ||
149 | |||
150 | if ((req->pdiag_show & PACKET_SHOW_MCLIST) && | ||
151 | pdiag_put_mclist(po, skb)) | ||
152 | goto out_nlmsg_trim; | ||
153 | |||
154 | if ((req->pdiag_show & PACKET_SHOW_RING_CFG) && | ||
155 | pdiag_put_rings_cfg(po, skb)) | ||
156 | goto out_nlmsg_trim; | ||
157 | |||
158 | if ((req->pdiag_show & PACKET_SHOW_FANOUT) && | ||
159 | pdiag_put_fanout(po, skb)) | ||
160 | goto out_nlmsg_trim; | ||
161 | |||
162 | return nlmsg_end(skb, nlh); | ||
163 | |||
164 | out_nlmsg_trim: | ||
165 | nlmsg_cancel(skb, nlh); | ||
166 | return -EMSGSIZE; | ||
167 | } | ||
168 | |||
169 | static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) | ||
170 | { | ||
171 | int num = 0, s_num = cb->args[0]; | ||
172 | struct packet_diag_req *req; | ||
173 | struct net *net; | ||
174 | struct sock *sk; | ||
175 | struct hlist_node *node; | ||
176 | |||
177 | net = sock_net(skb->sk); | ||
178 | req = nlmsg_data(cb->nlh); | ||
179 | |||
180 | mutex_lock(&net->packet.sklist_lock); | ||
181 | sk_for_each(sk, node, &net->packet.sklist) { | ||
182 | if (!net_eq(sock_net(sk), net)) | ||
183 | continue; | ||
184 | if (num < s_num) | ||
185 | goto next; | ||
186 | |||
187 | if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, | ||
188 | cb->nlh->nlmsg_seq, NLM_F_MULTI, | ||
189 | sock_i_ino(sk)) < 0) | ||
190 | goto done; | ||
191 | next: | ||
192 | num++; | ||
193 | } | ||
194 | done: | ||
195 | mutex_unlock(&net->packet.sklist_lock); | ||
196 | cb->args[0] = num; | ||
197 | |||
198 | return skb->len; | ||
199 | } | ||
200 | |||
201 | static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) | ||
202 | { | ||
203 | int hdrlen = sizeof(struct packet_diag_req); | ||
204 | struct net *net = sock_net(skb->sk); | ||
205 | struct packet_diag_req *req; | ||
206 | |||
207 | if (nlmsg_len(h) < hdrlen) | ||
208 | return -EINVAL; | ||
209 | |||
210 | req = nlmsg_data(h); | ||
211 | /* Make it possible to support protocol filtering later */ | ||
212 | if (req->sdiag_protocol) | ||
213 | return -EINVAL; | ||
214 | |||
215 | if (h->nlmsg_flags & NLM_F_DUMP) { | ||
216 | struct netlink_dump_control c = { | ||
217 | .dump = packet_diag_dump, | ||
218 | }; | ||
219 | return netlink_dump_start(net->diag_nlsk, skb, h, &c); | ||
220 | } else | ||
221 | return -EOPNOTSUPP; | ||
222 | } | ||
223 | |||
224 | static const struct sock_diag_handler packet_diag_handler = { | ||
225 | .family = AF_PACKET, | ||
226 | .dump = packet_diag_handler_dump, | ||
227 | }; | ||
228 | |||
229 | static int __init packet_diag_init(void) | ||
230 | { | ||
231 | return sock_diag_register(&packet_diag_handler); | ||
232 | } | ||
233 | |||
234 | static void __exit packet_diag_exit(void) | ||
235 | { | ||
236 | sock_diag_unregister(&packet_diag_handler); | ||
237 | } | ||
238 | |||
239 | module_init(packet_diag_init); | ||
240 | module_exit(packet_diag_exit); | ||
241 | MODULE_LICENSE("GPL"); | ||
242 | MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */); | ||
diff --git a/net/packet/internal.h b/net/packet/internal.h deleted file mode 100644 index e84cab8cb7a..00000000000 --- a/net/packet/internal.h +++ /dev/null | |||
@@ -1,122 +0,0 @@ | |||
1 | #ifndef __PACKET_INTERNAL_H__ | ||
2 | #define __PACKET_INTERNAL_H__ | ||
3 | |||
4 | struct packet_mclist { | ||
5 | struct packet_mclist *next; | ||
6 | int ifindex; | ||
7 | int count; | ||
8 | unsigned short type; | ||
9 | unsigned short alen; | ||
10 | unsigned char addr[MAX_ADDR_LEN]; | ||
11 | }; | ||
12 | |||
13 | /* kbdq - kernel block descriptor queue */ | ||
14 | struct tpacket_kbdq_core { | ||
15 | struct pgv *pkbdq; | ||
16 | unsigned int feature_req_word; | ||
17 | unsigned int hdrlen; | ||
18 | unsigned char reset_pending_on_curr_blk; | ||
19 | unsigned char delete_blk_timer; | ||
20 | unsigned short kactive_blk_num; | ||
21 | unsigned short blk_sizeof_priv; | ||
22 | |||
23 | /* last_kactive_blk_num: | ||
24 | * trick to see if user-space has caught up | ||
25 | * in order to avoid refreshing timer when every single pkt arrives. | ||
26 | */ | ||
27 | unsigned short last_kactive_blk_num; | ||
28 | |||
29 | char *pkblk_start; | ||
30 | char *pkblk_end; | ||
31 | int kblk_size; | ||
32 | unsigned int knum_blocks; | ||
33 | uint64_t knxt_seq_num; | ||
34 | char *prev; | ||
35 | char *nxt_offset; | ||
36 | struct sk_buff *skb; | ||
37 | |||
38 | atomic_t blk_fill_in_prog; | ||
39 | |||
40 | /* Default is set to 8ms */ | ||
41 | #define DEFAULT_PRB_RETIRE_TOV (8) | ||
42 | |||
43 | unsigned short retire_blk_tov; | ||
44 | unsigned short version; | ||
45 | unsigned long tov_in_jiffies; | ||
46 | |||
47 | /* timer to retire an outstanding block */ | ||
48 | struct timer_list retire_blk_timer; | ||
49 | }; | ||
50 | |||
51 | struct pgv { | ||
52 | char *buffer; | ||
53 | }; | ||
54 | |||
55 | struct packet_ring_buffer { | ||
56 | struct pgv *pg_vec; | ||
57 | unsigned int head; | ||
58 | unsigned int frames_per_block; | ||
59 | unsigned int frame_size; | ||
60 | unsigned int frame_max; | ||
61 | |||
62 | unsigned int pg_vec_order; | ||
63 | unsigned int pg_vec_pages; | ||
64 | unsigned int pg_vec_len; | ||
65 | |||
66 | struct tpacket_kbdq_core prb_bdqc; | ||
67 | atomic_t pending; | ||
68 | }; | ||
69 | |||
70 | extern struct mutex fanout_mutex; | ||
71 | #define PACKET_FANOUT_MAX 256 | ||
72 | |||
73 | struct packet_fanout { | ||
74 | #ifdef CONFIG_NET_NS | ||
75 | struct net *net; | ||
76 | #endif | ||
77 | unsigned int num_members; | ||
78 | u16 id; | ||
79 | u8 type; | ||
80 | u8 defrag; | ||
81 | atomic_t rr_cur; | ||
82 | struct list_head list; | ||
83 | struct sock *arr[PACKET_FANOUT_MAX]; | ||
84 | spinlock_t lock; | ||
85 | atomic_t sk_ref; | ||
86 | struct packet_type prot_hook ____cacheline_aligned_in_smp; | ||
87 | }; | ||
88 | |||
89 | struct packet_sock { | ||
90 | /* struct sock has to be the first member of packet_sock */ | ||
91 | struct sock sk; | ||
92 | struct packet_fanout *fanout; | ||
93 | struct tpacket_stats stats; | ||
94 | union tpacket_stats_u stats_u; | ||
95 | struct packet_ring_buffer rx_ring; | ||
96 | struct packet_ring_buffer tx_ring; | ||
97 | int copy_thresh; | ||
98 | spinlock_t bind_lock; | ||
99 | struct mutex pg_vec_lock; | ||
100 | unsigned int running:1, /* prot_hook is attached*/ | ||
101 | auxdata:1, | ||
102 | origdev:1, | ||
103 | has_vnet_hdr:1; | ||
104 | int ifindex; /* bound device */ | ||
105 | __be16 num; | ||
106 | struct packet_mclist *mclist; | ||
107 | atomic_t mapped; | ||
108 | enum tpacket_versions tp_version; | ||
109 | unsigned int tp_hdrlen; | ||
110 | unsigned int tp_reserve; | ||
111 | unsigned int tp_loss:1; | ||
112 | unsigned int tp_tx_has_off:1; | ||
113 | unsigned int tp_tstamp; | ||
114 | struct packet_type prot_hook ____cacheline_aligned_in_smp; | ||
115 | }; | ||
116 | |||
117 | static struct packet_sock *pkt_sk(struct sock *sk) | ||
118 | { | ||
119 | return (struct packet_sock *)sk; | ||
120 | } | ||
121 | |||
122 | #endif | ||