diff options
author | David S. Miller <davem@davemloft.net> | 2011-07-05 04:45:05 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2011-07-06 01:34:52 -0400 |
commit | dc99f600698dcac69b8f56dda9a8a00d645c5ffc (patch) | |
tree | 81599e4397761610d5020c03e2571eeceaa859b6 /net/packet/af_packet.c | |
parent | ce06b03e60fc19c680d1bf873e779bf11c2fc518 (diff) |
packet: Add fanout support.
Fanouts allow packet capturing to be demuxed to a set of AF_PACKET
sockets. Two fanout policies are implemented:
1) Hashing based upon skb->rxhash
2) Pure round-robin
An AF_PACKET socket must be fully bound before it tries to add itself
to a fanout. All AF_PACKET sockets trying to join the same fanout
must have the same bind settings.
Fanouts are identified (within a network namespace) by a 16-bit ID.
The first socket to try to add itself to a fanout with a particular
ID creates that fanout. When the last socket leaves the fanout
(which happens only when the socket is closed), that fanout is
destroyed.
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r-- | net/packet/af_packet.c | 256 |
1 files changed, 251 insertions, 5 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index bb281bf330aa..3350f1d3c9aa 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); | |||
187 | 187 | ||
188 | static void packet_flush_mclist(struct sock *sk); | 188 | static void packet_flush_mclist(struct sock *sk); |
189 | 189 | ||
190 | struct packet_fanout; | ||
190 | struct packet_sock { | 191 | struct packet_sock { |
191 | /* struct sock has to be the first member of packet_sock */ | 192 | /* struct sock has to be the first member of packet_sock */ |
192 | struct sock sk; | 193 | struct sock sk; |
194 | struct packet_fanout *fanout; | ||
193 | struct tpacket_stats stats; | 195 | struct tpacket_stats stats; |
194 | struct packet_ring_buffer rx_ring; | 196 | struct packet_ring_buffer rx_ring; |
195 | struct packet_ring_buffer tx_ring; | 197 | struct packet_ring_buffer tx_ring; |
@@ -212,6 +214,24 @@ struct packet_sock { | |||
212 | struct packet_type prot_hook ____cacheline_aligned_in_smp; | 214 | struct packet_type prot_hook ____cacheline_aligned_in_smp; |
213 | }; | 215 | }; |
214 | 216 | ||
217 | #define PACKET_FANOUT_MAX 256 | ||
218 | |||
219 | struct packet_fanout { | ||
220 | #ifdef CONFIG_NET_NS | ||
221 | struct net *net; | ||
222 | #endif | ||
223 | unsigned int num_members; | ||
224 | u16 id; | ||
225 | u8 type; | ||
226 | u8 pad; | ||
227 | atomic_t rr_cur; | ||
228 | struct list_head list; | ||
229 | struct sock *arr[PACKET_FANOUT_MAX]; | ||
230 | spinlock_t lock; | ||
231 | atomic_t sk_ref; | ||
232 | struct packet_type prot_hook ____cacheline_aligned_in_smp; | ||
233 | }; | ||
234 | |||
215 | struct packet_skb_cb { | 235 | struct packet_skb_cb { |
216 | unsigned int origlen; | 236 | unsigned int origlen; |
217 | union { | 237 | union { |
@@ -227,6 +247,9 @@ static inline struct packet_sock *pkt_sk(struct sock *sk) | |||
227 | return (struct packet_sock *)sk; | 247 | return (struct packet_sock *)sk; |
228 | } | 248 | } |
229 | 249 | ||
250 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po); | ||
251 | static void __fanout_link(struct sock *sk, struct packet_sock *po); | ||
252 | |||
230 | /* register_prot_hook must be invoked with the po->bind_lock held, | 253 | /* register_prot_hook must be invoked with the po->bind_lock held, |
231 | * or from a context in which asynchronous accesses to the packet | 254 | * or from a context in which asynchronous accesses to the packet |
232 | * socket is not possible (packet_create()). | 255 | * socket is not possible (packet_create()). |
@@ -235,7 +258,10 @@ static void register_prot_hook(struct sock *sk) | |||
235 | { | 258 | { |
236 | struct packet_sock *po = pkt_sk(sk); | 259 | struct packet_sock *po = pkt_sk(sk); |
237 | if (!po->running) { | 260 | if (!po->running) { |
238 | dev_add_pack(&po->prot_hook); | 261 | if (po->fanout) |
262 | __fanout_link(sk, po); | ||
263 | else | ||
264 | dev_add_pack(&po->prot_hook); | ||
239 | sock_hold(sk); | 265 | sock_hold(sk); |
240 | po->running = 1; | 266 | po->running = 1; |
241 | } | 267 | } |
@@ -253,7 +279,10 @@ static void __unregister_prot_hook(struct sock *sk, bool sync) | |||
253 | struct packet_sock *po = pkt_sk(sk); | 279 | struct packet_sock *po = pkt_sk(sk); |
254 | 280 | ||
255 | po->running = 0; | 281 | po->running = 0; |
256 | __dev_remove_pack(&po->prot_hook); | 282 | if (po->fanout) |
283 | __fanout_unlink(sk, po); | ||
284 | else | ||
285 | __dev_remove_pack(&po->prot_hook); | ||
257 | __sock_put(sk); | 286 | __sock_put(sk); |
258 | 287 | ||
259 | if (sync) { | 288 | if (sync) { |
@@ -388,6 +417,201 @@ static void packet_sock_destruct(struct sock *sk) | |||
388 | sk_refcnt_debug_dec(sk); | 417 | sk_refcnt_debug_dec(sk); |
389 | } | 418 | } |
390 | 419 | ||
420 | static int fanout_rr_next(struct packet_fanout *f, unsigned int num) | ||
421 | { | ||
422 | int x = atomic_read(&f->rr_cur) + 1; | ||
423 | |||
424 | if (x >= num) | ||
425 | x = 0; | ||
426 | |||
427 | return x; | ||
428 | } | ||
429 | |||
430 | static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) | ||
431 | { | ||
432 | u32 idx, hash = skb->rxhash; | ||
433 | |||
434 | idx = ((u64)hash * num) >> 32; | ||
435 | |||
436 | return f->arr[idx]; | ||
437 | } | ||
438 | |||
439 | static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) | ||
440 | { | ||
441 | int cur, old; | ||
442 | |||
443 | cur = atomic_read(&f->rr_cur); | ||
444 | while ((old = atomic_cmpxchg(&f->rr_cur, cur, | ||
445 | fanout_rr_next(f, num))) != cur) | ||
446 | cur = old; | ||
447 | return f->arr[cur]; | ||
448 | } | ||
449 | |||
450 | static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev, | ||
451 | struct packet_type *pt, struct net_device *orig_dev) | ||
452 | { | ||
453 | struct packet_fanout *f = pt->af_packet_priv; | ||
454 | unsigned int num = f->num_members; | ||
455 | struct packet_sock *po; | ||
456 | struct sock *sk; | ||
457 | |||
458 | if (!net_eq(dev_net(dev), read_pnet(&f->net)) || | ||
459 | !num) { | ||
460 | kfree_skb(skb); | ||
461 | return 0; | ||
462 | } | ||
463 | |||
464 | skb_get_rxhash(skb); | ||
465 | |||
466 | sk = fanout_demux_hash(f, skb, num); | ||
467 | po = pkt_sk(sk); | ||
468 | |||
469 | return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); | ||
470 | } | ||
471 | |||
472 | static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev, | ||
473 | struct packet_type *pt, struct net_device *orig_dev) | ||
474 | { | ||
475 | struct packet_fanout *f = pt->af_packet_priv; | ||
476 | unsigned int num = f->num_members; | ||
477 | struct packet_sock *po; | ||
478 | struct sock *sk; | ||
479 | |||
480 | if (!net_eq(dev_net(dev), read_pnet(&f->net)) || | ||
481 | !num) { | ||
482 | kfree_skb(skb); | ||
483 | return 0; | ||
484 | } | ||
485 | |||
486 | sk = fanout_demux_lb(f, skb, num); | ||
487 | po = pkt_sk(sk); | ||
488 | |||
489 | return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); | ||
490 | } | ||
491 | |||
/* Held by fanout_add() and fanout_release() while they use fanout_list. */
static DEFINE_MUTEX(fanout_mutex);
/* Every live fanout group; fanout_add() filters by id and namespace. */
static LIST_HEAD(fanout_list);
494 | |||
495 | static void __fanout_link(struct sock *sk, struct packet_sock *po) | ||
496 | { | ||
497 | struct packet_fanout *f = po->fanout; | ||
498 | |||
499 | spin_lock(&f->lock); | ||
500 | f->arr[f->num_members] = sk; | ||
501 | smp_wmb(); | ||
502 | f->num_members++; | ||
503 | spin_unlock(&f->lock); | ||
504 | } | ||
505 | |||
506 | static void __fanout_unlink(struct sock *sk, struct packet_sock *po) | ||
507 | { | ||
508 | struct packet_fanout *f = po->fanout; | ||
509 | int i; | ||
510 | |||
511 | spin_lock(&f->lock); | ||
512 | for (i = 0; i < f->num_members; i++) { | ||
513 | if (f->arr[i] == sk) | ||
514 | break; | ||
515 | } | ||
516 | BUG_ON(i >= f->num_members); | ||
517 | f->arr[i] = f->arr[f->num_members - 1]; | ||
518 | f->num_members--; | ||
519 | spin_unlock(&f->lock); | ||
520 | } | ||
521 | |||
522 | static int fanout_add(struct sock *sk, u16 id, u8 type) | ||
523 | { | ||
524 | struct packet_sock *po = pkt_sk(sk); | ||
525 | struct packet_fanout *f, *match; | ||
526 | int err; | ||
527 | |||
528 | switch (type) { | ||
529 | case PACKET_FANOUT_HASH: | ||
530 | case PACKET_FANOUT_LB: | ||
531 | break; | ||
532 | default: | ||
533 | return -EINVAL; | ||
534 | } | ||
535 | |||
536 | if (!po->running) | ||
537 | return -EINVAL; | ||
538 | |||
539 | if (po->fanout) | ||
540 | return -EALREADY; | ||
541 | |||
542 | mutex_lock(&fanout_mutex); | ||
543 | match = NULL; | ||
544 | list_for_each_entry(f, &fanout_list, list) { | ||
545 | if (f->id == id && | ||
546 | read_pnet(&f->net) == sock_net(sk)) { | ||
547 | match = f; | ||
548 | break; | ||
549 | } | ||
550 | } | ||
551 | if (!match) { | ||
552 | match = kzalloc(sizeof(*match), GFP_KERNEL); | ||
553 | if (match) { | ||
554 | write_pnet(&match->net, sock_net(sk)); | ||
555 | match->id = id; | ||
556 | match->type = type; | ||
557 | atomic_set(&match->rr_cur, 0); | ||
558 | INIT_LIST_HEAD(&match->list); | ||
559 | spin_lock_init(&match->lock); | ||
560 | atomic_set(&match->sk_ref, 0); | ||
561 | match->prot_hook.type = po->prot_hook.type; | ||
562 | match->prot_hook.dev = po->prot_hook.dev; | ||
563 | switch (type) { | ||
564 | case PACKET_FANOUT_HASH: | ||
565 | match->prot_hook.func = packet_rcv_fanout_hash; | ||
566 | break; | ||
567 | case PACKET_FANOUT_LB: | ||
568 | match->prot_hook.func = packet_rcv_fanout_lb; | ||
569 | break; | ||
570 | } | ||
571 | match->prot_hook.af_packet_priv = match; | ||
572 | dev_add_pack(&match->prot_hook); | ||
573 | list_add(&match->list, &fanout_list); | ||
574 | } | ||
575 | } | ||
576 | err = -ENOMEM; | ||
577 | if (match) { | ||
578 | err = -EINVAL; | ||
579 | if (match->type == type && | ||
580 | match->prot_hook.type == po->prot_hook.type && | ||
581 | match->prot_hook.dev == po->prot_hook.dev) { | ||
582 | err = -ENOSPC; | ||
583 | if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) { | ||
584 | __dev_remove_pack(&po->prot_hook); | ||
585 | po->fanout = match; | ||
586 | atomic_inc(&match->sk_ref); | ||
587 | __fanout_link(sk, po); | ||
588 | err = 0; | ||
589 | } | ||
590 | } | ||
591 | } | ||
592 | mutex_unlock(&fanout_mutex); | ||
593 | return err; | ||
594 | } | ||
595 | |||
596 | static void fanout_release(struct sock *sk) | ||
597 | { | ||
598 | struct packet_sock *po = pkt_sk(sk); | ||
599 | struct packet_fanout *f; | ||
600 | |||
601 | f = po->fanout; | ||
602 | if (!f) | ||
603 | return; | ||
604 | |||
605 | po->fanout = NULL; | ||
606 | |||
607 | mutex_lock(&fanout_mutex); | ||
608 | if (atomic_dec_and_test(&f->sk_ref)) { | ||
609 | list_del(&f->list); | ||
610 | dev_remove_pack(&f->prot_hook); | ||
611 | kfree(f); | ||
612 | } | ||
613 | mutex_unlock(&fanout_mutex); | ||
614 | } | ||
391 | 615 | ||
392 | static const struct proto_ops packet_ops; | 616 | static const struct proto_ops packet_ops; |
393 | 617 | ||
@@ -1398,6 +1622,8 @@ static int packet_release(struct socket *sock) | |||
1398 | if (po->tx_ring.pg_vec) | 1622 | if (po->tx_ring.pg_vec) |
1399 | packet_set_ring(sk, &req, 1, 1); | 1623 | packet_set_ring(sk, &req, 1, 1); |
1400 | 1624 | ||
1625 | fanout_release(sk); | ||
1626 | |||
1401 | synchronize_net(); | 1627 | synchronize_net(); |
1402 | /* | 1628 | /* |
1403 | * Now the socket is dead. No more input will appear. | 1629 | * Now the socket is dead. No more input will appear. |
@@ -1421,9 +1647,9 @@ static int packet_release(struct socket *sock) | |||
1421 | static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) | 1647 | static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) |
1422 | { | 1648 | { |
1423 | struct packet_sock *po = pkt_sk(sk); | 1649 | struct packet_sock *po = pkt_sk(sk); |
1424 | /* | 1650 | |
1425 | * Detach an existing hook if present. | 1651 | if (po->fanout) |
1426 | */ | 1652 | return -EINVAL; |
1427 | 1653 | ||
1428 | lock_sock(sk); | 1654 | lock_sock(sk); |
1429 | 1655 | ||
@@ -2133,6 +2359,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
2133 | po->tp_tstamp = val; | 2359 | po->tp_tstamp = val; |
2134 | return 0; | 2360 | return 0; |
2135 | } | 2361 | } |
2362 | case PACKET_FANOUT: | ||
2363 | { | ||
2364 | int val; | ||
2365 | |||
2366 | if (optlen != sizeof(val)) | ||
2367 | return -EINVAL; | ||
2368 | if (copy_from_user(&val, optval, sizeof(val))) | ||
2369 | return -EFAULT; | ||
2370 | |||
2371 | return fanout_add(sk, val & 0xffff, val >> 16); | ||
2372 | } | ||
2136 | default: | 2373 | default: |
2137 | return -ENOPROTOOPT; | 2374 | return -ENOPROTOOPT; |
2138 | } | 2375 | } |
@@ -2231,6 +2468,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
2231 | val = po->tp_tstamp; | 2468 | val = po->tp_tstamp; |
2232 | data = &val; | 2469 | data = &val; |
2233 | break; | 2470 | break; |
2471 | case PACKET_FANOUT: | ||
2472 | if (len > sizeof(int)) | ||
2473 | len = sizeof(int); | ||
2474 | val = (po->fanout ? | ||
2475 | ((u32)po->fanout->id | | ||
2476 | ((u32)po->fanout->type << 16)) : | ||
2477 | 0); | ||
2478 | data = &val; | ||
2479 | break; | ||
2234 | default: | 2480 | default: |
2235 | return -ENOPROTOOPT; | 2481 | return -ENOPROTOOPT; |
2236 | } | 2482 | } |