aboutsummaryrefslogtreecommitdiffstats
path: root/net/packet/af_packet.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2011-07-05 04:45:05 -0400
committerDavid S. Miller <davem@davemloft.net>2011-07-06 01:34:52 -0400
commitdc99f600698dcac69b8f56dda9a8a00d645c5ffc (patch)
tree81599e4397761610d5020c03e2571eeceaa859b6 /net/packet/af_packet.c
parentce06b03e60fc19c680d1bf873e779bf11c2fc518 (diff)
packet: Add fanout support.
Fanouts allow packet capturing to be demuxed to a set of AF_PACKET sockets.

Two fanout policies are implemented:
1) Hashing based upon skb->rxhash
2) Pure round-robin

An AF_PACKET socket must be fully bound before it tries to add itself to a fanout. All AF_PACKET sockets trying to join the same fanout must have the same bind settings.

Fanouts are identified (within a network namespace) by a 16-bit ID. The first socket to try to add itself to a fanout with a particular ID creates that fanout. When the last socket leaves the fanout (which happens only when the socket is closed), that fanout is destroyed.

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--net/packet/af_packet.c256
1 files changed, 251 insertions, 5 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bb281bf330aa..3350f1d3c9aa 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -187,9 +187,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
187 187
188static void packet_flush_mclist(struct sock *sk); 188static void packet_flush_mclist(struct sock *sk);
189 189
190struct packet_fanout;
190struct packet_sock { 191struct packet_sock {
191 /* struct sock has to be the first member of packet_sock */ 192 /* struct sock has to be the first member of packet_sock */
192 struct sock sk; 193 struct sock sk;
194 struct packet_fanout *fanout;
193 struct tpacket_stats stats; 195 struct tpacket_stats stats;
194 struct packet_ring_buffer rx_ring; 196 struct packet_ring_buffer rx_ring;
195 struct packet_ring_buffer tx_ring; 197 struct packet_ring_buffer tx_ring;
@@ -212,6 +214,24 @@ struct packet_sock {
212 struct packet_type prot_hook ____cacheline_aligned_in_smp; 214 struct packet_type prot_hook ____cacheline_aligned_in_smp;
213}; 215};
214 216
/* Capacity of packet_fanout.arr; fanout_add() refuses to let sk_ref reach
 * beyond this many sockets in one group.
 */
217#define PACKET_FANOUT_MAX 256
218
/* One fanout group: a set of AF_PACKET sockets that share a single
 * packet_type hook. The hook's receive function picks one member per
 * packet and forwards the skb to that member's own prot_hook.func.
 */
219struct packet_fanout {
220#ifdef CONFIG_NET_NS
221 struct net *net; /* namespace of the creating socket; compared on lookup and on receive */
222#endif
223 unsigned int num_members; /* number of valid slots in arr[]; written under lock */
224 u16 id; /* group id supplied by the first joiner */
225 u8 type; /* PACKET_FANOUT_HASH or PACKET_FANOUT_LB */
226 u8 pad; /* not referenced by the code visible here */
227 atomic_t rr_cur; /* round-robin cursor used by fanout_demux_lb() */
228 struct list_head list; /* linkage on fanout_list */
229 struct sock *arr[PACKET_FANOUT_MAX]; /* member sockets, slots [0, num_members) */
230 spinlock_t lock; /* taken by __fanout_link()/__fanout_unlink() around arr/num_members updates */
231 atomic_t sk_ref; /* sockets that joined; the group is freed when this drops to zero */
232 struct packet_type prot_hook ____cacheline_aligned_in_smp; /* hook registered via dev_add_pack() for the whole group */
233};
234
215struct packet_skb_cb { 235struct packet_skb_cb {
216 unsigned int origlen; 236 unsigned int origlen;
217 union { 237 union {
@@ -227,6 +247,9 @@ static inline struct packet_sock *pkt_sk(struct sock *sk)
227 return (struct packet_sock *)sk; 247 return (struct packet_sock *)sk;
228} 248}
229 249
/* Forward declarations: register_prot_hook() and __unregister_prot_hook()
 * call these before their definitions appear in the file.
 */
250static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
251static void __fanout_link(struct sock *sk, struct packet_sock *po);
252
230/* register_prot_hook must be invoked with the po->bind_lock held, 253/* register_prot_hook must be invoked with the po->bind_lock held,
231 * or from a context in which asynchronous accesses to the packet 254 * or from a context in which asynchronous accesses to the packet
232 * socket is not possible (packet_create()). 255 * socket is not possible (packet_create()).
@@ -235,7 +258,10 @@ static void register_prot_hook(struct sock *sk)
235{ 258{
236 struct packet_sock *po = pkt_sk(sk); 259 struct packet_sock *po = pkt_sk(sk);
237 if (!po->running) { 260 if (!po->running) {
238 dev_add_pack(&po->prot_hook); 261 if (po->fanout)
262 __fanout_link(sk, po);
263 else
264 dev_add_pack(&po->prot_hook);
239 sock_hold(sk); 265 sock_hold(sk);
240 po->running = 1; 266 po->running = 1;
241 } 267 }
@@ -253,7 +279,10 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
253 struct packet_sock *po = pkt_sk(sk); 279 struct packet_sock *po = pkt_sk(sk);
254 280
255 po->running = 0; 281 po->running = 0;
256 __dev_remove_pack(&po->prot_hook); 282 if (po->fanout)
283 __fanout_unlink(sk, po);
284 else
285 __dev_remove_pack(&po->prot_hook);
257 __sock_put(sk); 286 __sock_put(sk);
258 287
259 if (sync) { 288 if (sync) {
@@ -388,6 +417,201 @@ static void packet_sock_destruct(struct sock *sk)
388 sk_refcnt_debug_dec(sk); 417 sk_refcnt_debug_dec(sk);
389} 418}
390 419
420static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
421{
422 int x = atomic_read(&f->rr_cur) + 1;
423
424 if (x >= num)
425 x = 0;
426
427 return x;
428}
429
430static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
431{
432 u32 idx, hash = skb->rxhash;
433
434 idx = ((u64)hash * num) >> 32;
435
436 return f->arr[idx];
437}
438
439static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
440{
441 int cur, old;
442
443 cur = atomic_read(&f->rr_cur);
444 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
445 fanout_rr_next(f, num))) != cur)
446 cur = old;
447 return f->arr[cur];
448}
449
450static int packet_rcv_fanout_hash(struct sk_buff *skb, struct net_device *dev,
451 struct packet_type *pt, struct net_device *orig_dev)
452{
453 struct packet_fanout *f = pt->af_packet_priv;
454 unsigned int num = f->num_members;
455 struct packet_sock *po;
456 struct sock *sk;
457
458 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
459 !num) {
460 kfree_skb(skb);
461 return 0;
462 }
463
464 skb_get_rxhash(skb);
465
466 sk = fanout_demux_hash(f, skb, num);
467 po = pkt_sk(sk);
468
469 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
470}
471
472static int packet_rcv_fanout_lb(struct sk_buff *skb, struct net_device *dev,
473 struct packet_type *pt, struct net_device *orig_dev)
474{
475 struct packet_fanout *f = pt->af_packet_priv;
476 unsigned int num = f->num_members;
477 struct packet_sock *po;
478 struct sock *sk;
479
480 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
481 !num) {
482 kfree_skb(skb);
483 return 0;
484 }
485
486 sk = fanout_demux_lb(f, skb, num);
487 po = pkt_sk(sk);
488
489 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
490}
491
/* Held by fanout_add() and fanout_release() while they search or modify
 * fanout_list.
 */
492static DEFINE_MUTEX(fanout_mutex);
/* Every live packet_fanout group, linked through packet_fanout.list. */
493static LIST_HEAD(fanout_list);
494
/* Append @sk to the member array of the fanout group po->fanout.
 *
 * No capacity check is done here; fanout_add() bounds the group size via
 * sk_ref before the first link.
 */
495static void __fanout_link(struct sock *sk, struct packet_sock *po)
496{
497 struct packet_fanout *f = po->fanout;
498
499 spin_lock(&f->lock);
500 f->arr[f->num_members] = sk; /* fill the first unused slot */
501 smp_wmb(); /* order the slot store before the count increment that exposes it; the receive hooks read num_members without f->lock */
502 f->num_members++;
503 spin_unlock(&f->lock);
504}
505
506static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
507{
508 struct packet_fanout *f = po->fanout;
509 int i;
510
511 spin_lock(&f->lock);
512 for (i = 0; i < f->num_members; i++) {
513 if (f->arr[i] == sk)
514 break;
515 }
516 BUG_ON(i >= f->num_members);
517 f->arr[i] = f->arr[f->num_members - 1];
518 f->num_members--;
519 spin_unlock(&f->lock);
520}
521
522static int fanout_add(struct sock *sk, u16 id, u8 type)
523{
524 struct packet_sock *po = pkt_sk(sk);
525 struct packet_fanout *f, *match;
526 int err;
527
528 switch (type) {
529 case PACKET_FANOUT_HASH:
530 case PACKET_FANOUT_LB:
531 break;
532 default:
533 return -EINVAL;
534 }
535
536 if (!po->running)
537 return -EINVAL;
538
539 if (po->fanout)
540 return -EALREADY;
541
542 mutex_lock(&fanout_mutex);
543 match = NULL;
544 list_for_each_entry(f, &fanout_list, list) {
545 if (f->id == id &&
546 read_pnet(&f->net) == sock_net(sk)) {
547 match = f;
548 break;
549 }
550 }
551 if (!match) {
552 match = kzalloc(sizeof(*match), GFP_KERNEL);
553 if (match) {
554 write_pnet(&match->net, sock_net(sk));
555 match->id = id;
556 match->type = type;
557 atomic_set(&match->rr_cur, 0);
558 INIT_LIST_HEAD(&match->list);
559 spin_lock_init(&match->lock);
560 atomic_set(&match->sk_ref, 0);
561 match->prot_hook.type = po->prot_hook.type;
562 match->prot_hook.dev = po->prot_hook.dev;
563 switch (type) {
564 case PACKET_FANOUT_HASH:
565 match->prot_hook.func = packet_rcv_fanout_hash;
566 break;
567 case PACKET_FANOUT_LB:
568 match->prot_hook.func = packet_rcv_fanout_lb;
569 break;
570 }
571 match->prot_hook.af_packet_priv = match;
572 dev_add_pack(&match->prot_hook);
573 list_add(&match->list, &fanout_list);
574 }
575 }
576 err = -ENOMEM;
577 if (match) {
578 err = -EINVAL;
579 if (match->type == type &&
580 match->prot_hook.type == po->prot_hook.type &&
581 match->prot_hook.dev == po->prot_hook.dev) {
582 err = -ENOSPC;
583 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
584 __dev_remove_pack(&po->prot_hook);
585 po->fanout = match;
586 atomic_inc(&match->sk_ref);
587 __fanout_link(sk, po);
588 err = 0;
589 }
590 }
591 }
592 mutex_unlock(&fanout_mutex);
593 return err;
594}
595
596static void fanout_release(struct sock *sk)
597{
598 struct packet_sock *po = pkt_sk(sk);
599 struct packet_fanout *f;
600
601 f = po->fanout;
602 if (!f)
603 return;
604
605 po->fanout = NULL;
606
607 mutex_lock(&fanout_mutex);
608 if (atomic_dec_and_test(&f->sk_ref)) {
609 list_del(&f->list);
610 dev_remove_pack(&f->prot_hook);
611 kfree(f);
612 }
613 mutex_unlock(&fanout_mutex);
614}
391 615
392static const struct proto_ops packet_ops; 616static const struct proto_ops packet_ops;
393 617
@@ -1398,6 +1622,8 @@ static int packet_release(struct socket *sock)
1398 if (po->tx_ring.pg_vec) 1622 if (po->tx_ring.pg_vec)
1399 packet_set_ring(sk, &req, 1, 1); 1623 packet_set_ring(sk, &req, 1, 1);
1400 1624
1625 fanout_release(sk);
1626
1401 synchronize_net(); 1627 synchronize_net();
1402 /* 1628 /*
1403 * Now the socket is dead. No more input will appear. 1629 * Now the socket is dead. No more input will appear.
@@ -1421,9 +1647,9 @@ static int packet_release(struct socket *sock)
1421static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol) 1647static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1422{ 1648{
1423 struct packet_sock *po = pkt_sk(sk); 1649 struct packet_sock *po = pkt_sk(sk);
1424 /* 1650
1425 * Detach an existing hook if present. 1651 if (po->fanout)
1426 */ 1652 return -EINVAL;
1427 1653
1428 lock_sock(sk); 1654 lock_sock(sk);
1429 1655
@@ -2133,6 +2359,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
2133 po->tp_tstamp = val; 2359 po->tp_tstamp = val;
2134 return 0; 2360 return 0;
2135 } 2361 }
2362 case PACKET_FANOUT:
2363 {
2364 int val;
2365
2366 if (optlen != sizeof(val))
2367 return -EINVAL;
2368 if (copy_from_user(&val, optval, sizeof(val)))
2369 return -EFAULT;
2370
2371 return fanout_add(sk, val & 0xffff, val >> 16);
2372 }
2136 default: 2373 default:
2137 return -ENOPROTOOPT; 2374 return -ENOPROTOOPT;
2138 } 2375 }
@@ -2231,6 +2468,15 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
2231 val = po->tp_tstamp; 2468 val = po->tp_tstamp;
2232 data = &val; 2469 data = &val;
2233 break; 2470 break;
2471 case PACKET_FANOUT:
2472 if (len > sizeof(int))
2473 len = sizeof(int);
2474 val = (po->fanout ?
2475 ((u32)po->fanout->id |
2476 ((u32)po->fanout->type << 16)) :
2477 0);
2478 data = &val;
2479 break;
2234 default: 2480 default:
2235 return -ENOPROTOOPT; 2481 return -ENOPROTOOPT;
2236 } 2482 }