diff options
author | Willem de Bruijn <willemb@google.com> | 2015-08-14 22:31:34 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2015-08-17 17:22:47 -0400 |
commit | 47dceb8ecdc1c3ad1818dfea3d659a05b74c3fc2 (patch) | |
tree | 6329539935535b2b5ed438612464e168b02f03ce | |
parent | a1c234f95cae2d293047bb6c36e7a4840dbac815 (diff) |
packet: add classic BPF fanout mode
Add fanout mode PACKET_FANOUT_CBPF that accepts a classic BPF program
to select a socket.
This avoids having to keep adding special case fanout modes. One
example use case is application layer load balancing. The QUIC
protocol, for instance, encodes a connection ID in UDP payload.
Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data
associated with the socket group. Fanout mode PACKET_FANOUT_CBPF is the
only user so far.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/uapi/linux/if_packet.h | 2 | ||||
-rw-r--r-- | net/packet/af_packet.c | 99 | ||||
-rw-r--r-- | net/packet/internal.h | 5 |
3 files changed, 104 insertions, 2 deletions
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index d3d715f8c88f..a4bb16fa822e 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h | |||
@@ -55,6 +55,7 @@ struct sockaddr_ll { | |||
55 | #define PACKET_TX_HAS_OFF 19 | 55 | #define PACKET_TX_HAS_OFF 19 |
56 | #define PACKET_QDISC_BYPASS 20 | 56 | #define PACKET_QDISC_BYPASS 20 |
57 | #define PACKET_ROLLOVER_STATS 21 | 57 | #define PACKET_ROLLOVER_STATS 21 |
58 | #define PACKET_FANOUT_DATA 22 | ||
58 | 59 | ||
59 | #define PACKET_FANOUT_HASH 0 | 60 | #define PACKET_FANOUT_HASH 0 |
60 | #define PACKET_FANOUT_LB 1 | 61 | #define PACKET_FANOUT_LB 1 |
@@ -62,6 +63,7 @@ struct sockaddr_ll { | |||
62 | #define PACKET_FANOUT_ROLLOVER 3 | 63 | #define PACKET_FANOUT_ROLLOVER 3 |
63 | #define PACKET_FANOUT_RND 4 | 64 | #define PACKET_FANOUT_RND 4 |
64 | #define PACKET_FANOUT_QM 5 | 65 | #define PACKET_FANOUT_QM 5 |
66 | #define PACKET_FANOUT_CBPF 6 | ||
65 | #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 | 67 | #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 |
66 | #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 | 68 | #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 |
67 | 69 | ||
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5afe538bb88..8869d07013e6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c | |||
@@ -92,6 +92,7 @@ | |||
92 | #ifdef CONFIG_INET | 92 | #ifdef CONFIG_INET |
93 | #include <net/inet_common.h> | 93 | #include <net/inet_common.h> |
94 | #endif | 94 | #endif |
95 | #include <linux/bpf.h> | ||
95 | 96 | ||
96 | #include "internal.h" | 97 | #include "internal.h" |
97 | 98 | ||
@@ -1410,6 +1411,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f, | |||
1410 | return skb_get_queue_mapping(skb) % num; | 1411 | return skb_get_queue_mapping(skb) % num; |
1411 | } | 1412 | } |
1412 | 1413 | ||
1414 | static unsigned int fanout_demux_bpf(struct packet_fanout *f, | ||
1415 | struct sk_buff *skb, | ||
1416 | unsigned int num) | ||
1417 | { | ||
1418 | struct bpf_prog *prog; | ||
1419 | unsigned int ret = 0; | ||
1420 | |||
1421 | rcu_read_lock(); | ||
1422 | prog = rcu_dereference(f->bpf_prog); | ||
1423 | if (prog) | ||
1424 | ret = BPF_PROG_RUN(prog, skb) % num; | ||
1425 | rcu_read_unlock(); | ||
1426 | |||
1427 | return ret; | ||
1428 | } | ||
1429 | |||
1413 | static bool fanout_has_flag(struct packet_fanout *f, u16 flag) | 1430 | static bool fanout_has_flag(struct packet_fanout *f, u16 flag) |
1414 | { | 1431 | { |
1415 | return f->flags & (flag >> 8); | 1432 | return f->flags & (flag >> 8); |
@@ -1454,6 +1471,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | |||
1454 | case PACKET_FANOUT_ROLLOVER: | 1471 | case PACKET_FANOUT_ROLLOVER: |
1455 | idx = fanout_demux_rollover(f, skb, 0, false, num); | 1472 | idx = fanout_demux_rollover(f, skb, 0, false, num); |
1456 | break; | 1473 | break; |
1474 | case PACKET_FANOUT_CBPF: | ||
1475 | idx = fanout_demux_bpf(f, skb, num); | ||
1476 | break; | ||
1457 | } | 1477 | } |
1458 | 1478 | ||
1459 | if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) | 1479 | if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) |
@@ -1502,6 +1522,74 @@ static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) | |||
1502 | return false; | 1522 | return false; |
1503 | } | 1523 | } |
1504 | 1524 | ||
1525 | static void fanout_init_data(struct packet_fanout *f) | ||
1526 | { | ||
1527 | switch (f->type) { | ||
1528 | case PACKET_FANOUT_LB: | ||
1529 | atomic_set(&f->rr_cur, 0); | ||
1530 | break; | ||
1531 | case PACKET_FANOUT_CBPF: | ||
1532 | RCU_INIT_POINTER(f->bpf_prog, NULL); | ||
1533 | break; | ||
1534 | } | ||
1535 | } | ||
1536 | |||
1537 | static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) | ||
1538 | { | ||
1539 | struct bpf_prog *old; | ||
1540 | |||
1541 | spin_lock(&f->lock); | ||
1542 | old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); | ||
1543 | rcu_assign_pointer(f->bpf_prog, new); | ||
1544 | spin_unlock(&f->lock); | ||
1545 | |||
1546 | if (old) { | ||
1547 | synchronize_net(); | ||
1548 | bpf_prog_destroy(old); | ||
1549 | } | ||
1550 | } | ||
1551 | |||
1552 | static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, | ||
1553 | unsigned int len) | ||
1554 | { | ||
1555 | struct bpf_prog *new; | ||
1556 | struct sock_fprog fprog; | ||
1557 | int ret; | ||
1558 | |||
1559 | if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) | ||
1560 | return -EPERM; | ||
1561 | if (len != sizeof(fprog)) | ||
1562 | return -EINVAL; | ||
1563 | if (copy_from_user(&fprog, data, len)) | ||
1564 | return -EFAULT; | ||
1565 | |||
1566 | ret = bpf_prog_create_from_user(&new, &fprog, NULL); | ||
1567 | if (ret) | ||
1568 | return ret; | ||
1569 | |||
1570 | __fanout_set_data_bpf(po->fanout, new); | ||
1571 | return 0; | ||
1572 | } | ||
1573 | |||
1574 | static int fanout_set_data(struct packet_sock *po, char __user *data, | ||
1575 | unsigned int len) | ||
1576 | { | ||
1577 | switch (po->fanout->type) { | ||
1578 | case PACKET_FANOUT_CBPF: | ||
1579 | return fanout_set_data_cbpf(po, data, len); | ||
1580 | default: | ||
1581 | return -EINVAL; | ||
1582 | }; | ||
1583 | } | ||
1584 | |||
1585 | static void fanout_release_data(struct packet_fanout *f) | ||
1586 | { | ||
1587 | switch (f->type) { | ||
1588 | case PACKET_FANOUT_CBPF: | ||
1589 | __fanout_set_data_bpf(f, NULL); | ||
1590 | }; | ||
1591 | } | ||
1592 | |||
1505 | static int fanout_add(struct sock *sk, u16 id, u16 type_flags) | 1593 | static int fanout_add(struct sock *sk, u16 id, u16 type_flags) |
1506 | { | 1594 | { |
1507 | struct packet_sock *po = pkt_sk(sk); | 1595 | struct packet_sock *po = pkt_sk(sk); |
@@ -1519,6 +1607,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) | |||
1519 | case PACKET_FANOUT_CPU: | 1607 | case PACKET_FANOUT_CPU: |
1520 | case PACKET_FANOUT_RND: | 1608 | case PACKET_FANOUT_RND: |
1521 | case PACKET_FANOUT_QM: | 1609 | case PACKET_FANOUT_QM: |
1610 | case PACKET_FANOUT_CBPF: | ||
1522 | break; | 1611 | break; |
1523 | default: | 1612 | default: |
1524 | return -EINVAL; | 1613 | return -EINVAL; |
@@ -1561,10 +1650,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) | |||
1561 | match->id = id; | 1650 | match->id = id; |
1562 | match->type = type; | 1651 | match->type = type; |
1563 | match->flags = flags; | 1652 | match->flags = flags; |
1564 | atomic_set(&match->rr_cur, 0); | ||
1565 | INIT_LIST_HEAD(&match->list); | 1653 | INIT_LIST_HEAD(&match->list); |
1566 | spin_lock_init(&match->lock); | 1654 | spin_lock_init(&match->lock); |
1567 | atomic_set(&match->sk_ref, 0); | 1655 | atomic_set(&match->sk_ref, 0); |
1656 | fanout_init_data(match); | ||
1568 | match->prot_hook.type = po->prot_hook.type; | 1657 | match->prot_hook.type = po->prot_hook.type; |
1569 | match->prot_hook.dev = po->prot_hook.dev; | 1658 | match->prot_hook.dev = po->prot_hook.dev; |
1570 | match->prot_hook.func = packet_rcv_fanout; | 1659 | match->prot_hook.func = packet_rcv_fanout; |
@@ -1610,6 +1699,7 @@ static void fanout_release(struct sock *sk) | |||
1610 | if (atomic_dec_and_test(&f->sk_ref)) { | 1699 | if (atomic_dec_and_test(&f->sk_ref)) { |
1611 | list_del(&f->list); | 1700 | list_del(&f->list); |
1612 | dev_remove_pack(&f->prot_hook); | 1701 | dev_remove_pack(&f->prot_hook); |
1702 | fanout_release_data(f); | ||
1613 | kfree(f); | 1703 | kfree(f); |
1614 | } | 1704 | } |
1615 | mutex_unlock(&fanout_mutex); | 1705 | mutex_unlock(&fanout_mutex); |
@@ -3529,6 +3619,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
3529 | 3619 | ||
3530 | return fanout_add(sk, val & 0xffff, val >> 16); | 3620 | return fanout_add(sk, val & 0xffff, val >> 16); |
3531 | } | 3621 | } |
3622 | case PACKET_FANOUT_DATA: | ||
3623 | { | ||
3624 | if (!po->fanout) | ||
3625 | return -EINVAL; | ||
3626 | |||
3627 | return fanout_set_data(po, optval, optlen); | ||
3628 | } | ||
3532 | case PACKET_TX_HAS_OFF: | 3629 | case PACKET_TX_HAS_OFF: |
3533 | { | 3630 | { |
3534 | unsigned int val; | 3631 | unsigned int val; |
diff --git a/net/packet/internal.h b/net/packet/internal.h index e20b3e8829b8..9ee46314b7d7 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h | |||
@@ -79,7 +79,10 @@ struct packet_fanout { | |||
79 | u16 id; | 79 | u16 id; |
80 | u8 type; | 80 | u8 type; |
81 | u8 flags; | 81 | u8 flags; |
82 | atomic_t rr_cur; | 82 | union { |
83 | atomic_t rr_cur; | ||
84 | struct bpf_prog __rcu *bpf_prog; | ||
85 | }; | ||
83 | struct list_head list; | 86 | struct list_head list; |
84 | struct sock *arr[PACKET_FANOUT_MAX]; | 87 | struct sock *arr[PACKET_FANOUT_MAX]; |
85 | spinlock_t lock; | 88 | spinlock_t lock; |