path: root/net
author	Craig Gallek <kraig@google.com>	2016-01-04 17:41:45 -0500
committer	David S. Miller <davem@davemloft.net>	2016-01-04 22:49:58 -0500
commit	ef456144da8ef507c8cf504284b6042e9201a05c (patch)
tree	cadc1049482ed0c976bb436c854fec15be5053c2 /net
parent	ebb3cf41c1a49e334e3fe540ee24a1afb7ed30c4 (diff)
soreuseport: define reuseport groups
struct sock_reuseport is an optional shared structure referenced by each
socket belonging to a reuseport group.  When a socket is bound to an
address/port not yet in use and the reuseport flag has been set, the
structure will be allocated and attached to the newly bound socket.  When
subsequent calls to bind are made for the same address/port, the shared
structure will be updated to include the new socket and the newly bound
socket will reference the group structure.

Usually, when an incoming packet was destined for a reuseport group, all
sockets in the same group needed to be considered before a dispatching
decision was made.  With this structure, an appropriate socket can be
found after looking up just one socket in the group.

This shared structure will also allow for more complicated decisions to
be made when selecting a socket (eg a BPF filter).

This work is based off a similar implementation written by
Ying Cai <ycai@google.com> for implementing policy-based reuseport
selection.

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
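For context, a reuseport group is created entirely from userspace: every
member socket sets SO_REUSEPORT before bind()ing to the same address/port.
The sketch below is an illustrative userspace example only, not part of this
patch; the port number 7777 is arbitrary, and the comments about which kernel
helper runs on each bind describe the intended wiring, which is added by the
protocol-specific follow-up changes rather than by this commit.

/* Illustrative userspace sketch (not part of this patch): two sockets bound
 * to the same address/port with SO_REUSEPORT form one reuseport group, and
 * the kernel attaches a shared struct sock_reuseport to them.
 */
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int bound_reuseport_socket(uint16_t port)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	/* Must be set before bind() so the bind creates or joins the group. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0) {
		close(fd);
		return -1;
	}
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

int main(void)
{
	/* Intended kernel behaviour once wired up: the first bind allocates
	 * the group (reuseport_alloc), the second joins it
	 * (reuseport_add_sock).
	 */
	int a = bound_reuseport_socket(7777);
	int b = bound_reuseport_socket(7777);

	if (a < 0 || b < 0) {
		perror("reuseport socket");
		return EXIT_FAILURE;
	}
	printf("two sockets sharing port 7777 in one reuseport group\n");
	close(a);
	close(b);
	return 0;
}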
Diffstat (limited to 'net')
-rw-r--r--	net/core/Makefile	2
-rw-r--r--	net/core/sock_reuseport.c	173
2 files changed, 174 insertions(+), 1 deletion(-)
diff --git a/net/core/Makefile b/net/core/Makefile
index 086b01fbe1bd..0b835de04de3 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
 obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 	 neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
-	 sock_diag.o dev_ioctl.o tso.o
+	 sock_diag.o dev_ioctl.o tso.o sock_reuseport.o
 
 obj-$(CONFIG_XFRM) += flow.o
 obj-y += net-sysfs.o
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
new file mode 100644
index 000000000000..963c8d5f3027
--- /dev/null
+++ b/net/core/sock_reuseport.c
@@ -0,0 +1,173 @@
+/*
+ * To speed up listener socket lookup, create an array to store all sockets
+ * listening on the same port.  This allows a decision to be made after finding
+ * the first socket.
+ */
+
+#include <net/sock_reuseport.h>
+#include <linux/rcupdate.h>
+
+#define INIT_SOCKS 128
+
+static DEFINE_SPINLOCK(reuseport_lock);
+
+static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
+{
+	size_t size = sizeof(struct sock_reuseport) +
+		      sizeof(struct sock *) * max_socks;
+	struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+
+	if (!reuse)
+		return NULL;
+
+	reuse->max_socks = max_socks;
+
+	return reuse;
+}
+
+int reuseport_alloc(struct sock *sk)
+{
+	struct sock_reuseport *reuse;
+
+	/* bh lock used since this function call may precede hlist lock in
+	 * soft irq of receive path or setsockopt from process context
+	 */
+	spin_lock_bh(&reuseport_lock);
+	WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+					    lockdep_is_held(&reuseport_lock)),
+		  "multiple allocations for the same socket");
+	reuse = __reuseport_alloc(INIT_SOCKS);
+	if (!reuse) {
+		spin_unlock_bh(&reuseport_lock);
+		return -ENOMEM;
+	}
+
+	reuse->socks[0] = sk;
+	reuse->num_socks = 1;
+	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+	spin_unlock_bh(&reuseport_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(reuseport_alloc);
+
+static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
+{
+	struct sock_reuseport *more_reuse;
+	u32 more_socks_size, i;
+
+	more_socks_size = reuse->max_socks * 2U;
+	if (more_socks_size > U16_MAX)
+		return NULL;
+
+	more_reuse = __reuseport_alloc(more_socks_size);
+	if (!more_reuse)
+		return NULL;
+
+	more_reuse->max_socks = more_socks_size;
+	more_reuse->num_socks = reuse->num_socks;
+
+	memcpy(more_reuse->socks, reuse->socks,
+	       reuse->num_socks * sizeof(struct sock *));
+
+	for (i = 0; i < reuse->num_socks; ++i)
+		rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
+				   more_reuse);
+
+	kfree_rcu(reuse, rcu);
+	return more_reuse;
+}
+
+/**
+ * reuseport_add_sock - Add a socket to the reuseport group of another.
+ * @sk:  New socket to add to the group.
+ * @sk2: Socket belonging to the existing reuseport group.
+ * May return ENOMEM and not add socket to group under memory pressure.
+ */
+int reuseport_add_sock(struct sock *sk, const struct sock *sk2)
+{
+	struct sock_reuseport *reuse;
+
+	spin_lock_bh(&reuseport_lock);
+	reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+					    lockdep_is_held(&reuseport_lock)),
+		  "socket already in reuseport group");
+
+	if (reuse->num_socks == reuse->max_socks) {
+		reuse = reuseport_grow(reuse);
+		if (!reuse) {
+			spin_unlock_bh(&reuseport_lock);
+			return -ENOMEM;
+		}
+	}
+
+	reuse->socks[reuse->num_socks] = sk;
+	/* paired with smp_rmb() in reuseport_select_sock() */
+	smp_wmb();
+	reuse->num_socks++;
+	rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+	spin_unlock_bh(&reuseport_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(reuseport_add_sock);
+
+void reuseport_detach_sock(struct sock *sk)
+{
+	struct sock_reuseport *reuse;
+	int i;
+
+	spin_lock_bh(&reuseport_lock);
+	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+
+	for (i = 0; i < reuse->num_socks; i++) {
+		if (reuse->socks[i] == sk) {
+			reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+			reuse->num_socks--;
+			if (reuse->num_socks == 0)
+				kfree_rcu(reuse, rcu);
+			break;
+		}
+	}
+	spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_detach_sock);
+
+/**
+ * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk:   First socket in the group.
+ * @hash: Use this hash to select.
+ * Returns a socket that should receive the packet (or NULL on error).
+ */
+struct sock *reuseport_select_sock(struct sock *sk, u32 hash)
+{
+	struct sock_reuseport *reuse;
+	struct sock *sk2 = NULL;
+	u16 socks;
+
+	rcu_read_lock();
+	reuse = rcu_dereference(sk->sk_reuseport_cb);
+
+	/* if memory allocation failed or add call is not yet complete */
+	if (!reuse)
+		goto out;
+
+	socks = READ_ONCE(reuse->num_socks);
+	if (likely(socks)) {
+		/* paired with smp_wmb() in reuseport_add_sock() */
+		smp_rmb();
+
+		sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+	}
+
+out:
+	rcu_read_unlock();
+	return sk2;
+}
+EXPORT_SYMBOL(reuseport_select_sock);
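
Not introduced by this patch, but as a rough sketch of how a protocol's
receive-path lookup could consume the new API once it is wired up: the helper
name pick_reuseport_rcv_sock() below is hypothetical, and the hash is assumed
to be whatever flow hash the protocol already computes for the packet.

/* Hypothetical caller sketch (not part of this patch).  Once the normal
 * lookup has found one socket of a reuseport group, a single call to
 * reuseport_select_sock() picks the destination for the whole group.
 */
#include <net/sock_reuseport.h>

static struct sock *pick_reuseport_rcv_sock(struct sock *first_sk, u32 hash)
{
	struct sock *sk;

	/* NULL means the group is not (yet) set up, e.g. the allocation
	 * failed or the add is still in progress; fall back to the socket
	 * the lookup already returned.
	 */
	sk = reuseport_select_sock(first_sk, hash);
	return sk ? sk : first_sk;
}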