author		Tom Herbert <therbert@google.com>	2013-01-22 04:50:24 -0500
committer	David S. Miller <davem@davemloft.net>	2013-01-23 13:44:01 -0500
commit		da5e36308d9f7151845018369148201a5d28b46d
tree		fff243a12ae5a1d16c2827b3ac41ac23ea2043c4	/net/ipv4/inet_connection_sock.c
parent		055dc21a1d1d219608cd4baac7d0683fb2cbbe8a
soreuseport: TCP/IPv4 implementation
Allow multiple listener sockets to bind to the same port.
The motivation for soreuseport is something like a web server
binding to port 80 and running with multiple threads, where each
thread might have its own listener socket. This can be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads. In case #1 the listener thread
can easily become the bottleneck with a high connection turnover rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming a simple event loop:
while (1) { accept(); process(); }), since wakeup does not promote
fairness among the sockets. We have seen the disproportion to be as
high as a 3:1 ratio between the thread accepting the most connections
and the one accepting the fewest. With SO_REUSEPORT the distribution
is uniform.
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
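
[Editor's note: the following user-space sketch is not part of the patch.
It illustrates the per-thread listener model the message describes: each
thread creates its own socket, sets SO_REUSEPORT before bind(), and the
kernel spreads incoming connections across the listeners. It assumes a
kernel and libc that expose SO_REUSEPORT; make_listener is an illustrative
name and error handling is trimmed for brevity.]

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_listener(unsigned short port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in addr;

	if (fd < 0)
		return -1;

	/* Must be set before bind() so the bind-conflict check passes. */
	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 128) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

/* Each worker thread then runs the simple event loop from the message,
 * while (1) { accept(); process(); }, on its own listener fd. */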
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r--	net/ipv4/inet_connection_sock.c	48
1 file changed, 37 insertions(+), 11 deletions(-)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f00d524..8bb623d357ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
 	struct sock *sk2;
 	struct hlist_node *node;
 	int reuse = sk->sk_reuse;
+	int reuseport = sk->sk_reuseport;
+	kuid_t uid = sock_i_uid((struct sock *)sk);
 
 	/*
 	 * Unlike other sk lookup places we do not check
@@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
 		    (!sk->sk_bound_dev_if ||
 		     !sk2->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
-			if (!reuse || !sk2->sk_reuse ||
-			    sk2->sk_state == TCP_LISTEN) {
+			if ((!reuse || !sk2->sk_reuse ||
+			    sk2->sk_state == TCP_LISTEN) &&
+			    (!reuseport || !sk2->sk_reuseport ||
+			    (sk2->sk_state != TCP_TIME_WAIT &&
+			     !uid_eq(uid, sock_i_uid(sk2))))) {
 				const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
 				if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
 				    sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
 	int smallest_size = -1, smallest_rover;
+	kuid_t uid = sock_i_uid(sk);
 
 	local_bh_disable();
 	if (!snum) {
@@ -125,9 +131,12 @@ again:
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
 				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (tb->fastreuse > 0 &&
-					    sk->sk_reuse &&
-					    sk->sk_state != TCP_LISTEN &&
+					if (((tb->fastreuse > 0 &&
+					      sk->sk_reuse &&
+					      sk->sk_state != TCP_LISTEN) ||
+					     (tb->fastreuseport > 0 &&
+					      sk->sk_reuseport &&
+					      uid_eq(tb->fastuid, uid))) &&
 					    (tb->num_owners < smallest_size || smallest_size == -1)) {
 						smallest_size = tb->num_owners;
 						smallest_rover = rover;
@@ -185,14 +194,17 @@ tb_found:
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (tb->fastreuse > 0 &&
-		    sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+		if (((tb->fastreuse > 0 &&
+		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+		     (tb->fastreuseport > 0 &&
+		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 		    smallest_size == -1) {
 			goto success;
 		} else {
 			ret = 1;
 			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
-				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+				     (sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
 				    smallest_size != -1 && --attempts >= 0) {
 					spin_unlock(&head->lock);
 					goto again;
@@ -212,9 +224,23 @@ tb_not_found:
 			tb->fastreuse = 1;
 		else
 			tb->fastreuse = 0;
-	} else if (tb->fastreuse &&
-		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-		tb->fastreuse = 0;
+		if (sk->sk_reuseport) {
+			tb->fastreuseport = 1;
+			tb->fastuid = uid;
+		} else {
+			tb->fastreuseport = 0;
+			tb->fastuid = 0;
+		}
+	} else {
+		if (tb->fastreuse &&
+		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+			tb->fastreuse = 0;
+		if (tb->fastreuseport &&
+		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
+			tb->fastreuseport = 0;
+			tb->fastuid = 0;
+		}
+	}
 success:
 	if (!inet_csk(sk)->icsk_bind_hash)
 		inet_bind_hash(sk, tb, snum);
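
[Editor's note: a minimal user-space demo, not part of the patch, showing
the bind-conflict rule above as seen from user space: two sockets may share
a port only if both set SO_REUSEPORT before binding (the kernel check also
requires the same effective uid; both sockets here belong to one process,
so that holds). bind_port and the port numbers are illustrative only, and
the demo assumes a kernel with this patch applied.]

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int bind_port(unsigned short port, int reuseport)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in a;

	if (fd < 0)
		return -1;
	if (reuseport) {
		int one = 1;
		setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	}
	memset(&a, 0, sizeof(a));
	a.sin_family = AF_INET;
	a.sin_addr.s_addr = htonl(INADDR_ANY);
	a.sin_port = htons(port);
	return bind(fd, (struct sockaddr *)&a, sizeof(a));
}

int main(void)
{
	/* Both sockets opt in: the second bind() also returns 0. */
	printf("both reuseport: %d %d\n",
	       bind_port(8080, 1), bind_port(8080, 1));
	/* Second socket does not opt in: bind() fails with EADDRINUSE. */
	printf("mixed:          %d %d\n",
	       bind_port(8081, 1), bind_port(8081, 0));
	return 0;
}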