aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/inet_connection_sock.c
diff options
context:
space:
mode:
authorTom Herbert <therbert@google.com>2013-01-22 04:50:24 -0500
committerDavid S. Miller <davem@davemloft.net>2013-01-23 13:44:01 -0500
commitda5e36308d9f7151845018369148201a5d28b46d (patch)
treefff243a12ae5a1d16c2827b3ac41ac23ea2043c4 /net/ipv4/inet_connection_sock.c
parent055dc21a1d1d219608cd4baac7d0683fb2cbbe8a (diff)
soreuseport: TCP/IPv4 implementation
Allow multiple listener sockets to bind to the same port. Motivation for soreuseport would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket. This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads. In case #1 the listener thread can easily become the bottleneck with high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming a simple event loop: while (1) { accept(); process() }); wakeup does not promote fairness among the sockets. We have seen the disproportion to be as high as a 3:1 ratio between the thread accepting the most connections and the one accepting the fewest. With SO_REUSEPORT the distribution is uniform. Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r--net/ipv4/inet_connection_sock.c48
1 files changed, 37 insertions, 11 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f00d524..8bb623d357ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
59 struct sock *sk2; 59 struct sock *sk2;
60 struct hlist_node *node; 60 struct hlist_node *node;
61 int reuse = sk->sk_reuse; 61 int reuse = sk->sk_reuse;
62 int reuseport = sk->sk_reuseport;
63 kuid_t uid = sock_i_uid((struct sock *)sk);
62 64
63 /* 65 /*
64 * Unlike other sk lookup places we do not check 66 * Unlike other sk lookup places we do not check
@@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
73 (!sk->sk_bound_dev_if || 75 (!sk->sk_bound_dev_if ||
74 !sk2->sk_bound_dev_if || 76 !sk2->sk_bound_dev_if ||
75 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 77 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
76 if (!reuse || !sk2->sk_reuse || 78 if ((!reuse || !sk2->sk_reuse ||
77 sk2->sk_state == TCP_LISTEN) { 79 sk2->sk_state == TCP_LISTEN) &&
80 (!reuseport || !sk2->sk_reuseport ||
81 (sk2->sk_state != TCP_TIME_WAIT &&
82 !uid_eq(uid, sock_i_uid(sk2))))) {
78 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); 83 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
79 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || 84 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
80 sk2_rcv_saddr == sk_rcv_saddr(sk)) 85 sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
106 int ret, attempts = 5; 111 int ret, attempts = 5;
107 struct net *net = sock_net(sk); 112 struct net *net = sock_net(sk);
108 int smallest_size = -1, smallest_rover; 113 int smallest_size = -1, smallest_rover;
114 kuid_t uid = sock_i_uid(sk);
109 115
110 local_bh_disable(); 116 local_bh_disable();
111 if (!snum) { 117 if (!snum) {
@@ -125,9 +131,12 @@ again:
125 spin_lock(&head->lock); 131 spin_lock(&head->lock);
126 inet_bind_bucket_for_each(tb, node, &head->chain) 132 inet_bind_bucket_for_each(tb, node, &head->chain)
127 if (net_eq(ib_net(tb), net) && tb->port == rover) { 133 if (net_eq(ib_net(tb), net) && tb->port == rover) {
128 if (tb->fastreuse > 0 && 134 if (((tb->fastreuse > 0 &&
129 sk->sk_reuse && 135 sk->sk_reuse &&
130 sk->sk_state != TCP_LISTEN && 136 sk->sk_state != TCP_LISTEN) ||
137 (tb->fastreuseport > 0 &&
138 sk->sk_reuseport &&
139 uid_eq(tb->fastuid, uid))) &&
131 (tb->num_owners < smallest_size || smallest_size == -1)) { 140 (tb->num_owners < smallest_size || smallest_size == -1)) {
132 smallest_size = tb->num_owners; 141 smallest_size = tb->num_owners;
133 smallest_rover = rover; 142 smallest_rover = rover;
@@ -185,14 +194,17 @@ tb_found:
185 if (sk->sk_reuse == SK_FORCE_REUSE) 194 if (sk->sk_reuse == SK_FORCE_REUSE)
186 goto success; 195 goto success;
187 196
188 if (tb->fastreuse > 0 && 197 if (((tb->fastreuse > 0 &&
189 sk->sk_reuse && sk->sk_state != TCP_LISTEN && 198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
199 (tb->fastreuseport > 0 &&
200 sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
190 smallest_size == -1) { 201 smallest_size == -1) {
191 goto success; 202 goto success;
192 } else { 203 } else {
193 ret = 1; 204 ret = 1;
194 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { 205 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
195 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && 206 if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
207 (sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
196 smallest_size != -1 && --attempts >= 0) { 208 smallest_size != -1 && --attempts >= 0) {
197 spin_unlock(&head->lock); 209 spin_unlock(&head->lock);
198 goto again; 210 goto again;
@@ -212,9 +224,23 @@ tb_not_found:
212 tb->fastreuse = 1; 224 tb->fastreuse = 1;
213 else 225 else
214 tb->fastreuse = 0; 226 tb->fastreuse = 0;
215 } else if (tb->fastreuse && 227 if (sk->sk_reuseport) {
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) 228 tb->fastreuseport = 1;
217 tb->fastreuse = 0; 229 tb->fastuid = uid;
230 } else {
231 tb->fastreuseport = 0;
232 tb->fastuid = 0;
233 }
234 } else {
235 if (tb->fastreuse &&
236 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
237 tb->fastreuse = 0;
238 if (tb->fastreuseport &&
239 (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
240 tb->fastreuseport = 0;
241 tb->fastuid = 0;
242 }
243 }
218success: 244success:
219 if (!inet_csk(sk)->icsk_bind_hash) 245 if (!inet_csk(sk)->icsk_bind_hash)
220 inet_bind_hash(sk, tb, snum); 246 inet_bind_hash(sk, tb, snum);