diff options
author | Tom Herbert <therbert@google.com> | 2013-01-22 04:50:24 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-01-23 13:44:01 -0500 |
commit | da5e36308d9f7151845018369148201a5d28b46d (patch) | |
tree | fff243a12ae5a1d16c2827b3ac41ac23ea2043c4 | |
parent | 055dc21a1d1d219608cd4baac7d0683fb2cbbe8a (diff) |
soreuseport: TCP/IPv4 implementation
Allow multiple listener sockets to bind to the same port.
Motivation for soreuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have its own listener socket. This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads. In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming simple event loop:
while (1) { accept(); process() }), wakeup does not promote fairness
among the sockets. We have seen the disproportion to be as high
as 3:1 ratio between thread accepting most connections and the one
accepting the fewest. With so_reuseport the distribution is
uniform.
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/inet_hashtables.h | 13 | ||||
-rw-r--r-- | include/net/netfilter/nf_tproxy_core.h | 1 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 48 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 28 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 4 |
5 files changed, 73 insertions, 21 deletions
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 67a8fa098e3a..7b2ae9d37076 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h | |||
@@ -81,7 +81,9 @@ struct inet_bind_bucket { | |||
81 | struct net *ib_net; | 81 | struct net *ib_net; |
82 | #endif | 82 | #endif |
83 | unsigned short port; | 83 | unsigned short port; |
84 | signed short fastreuse; | 84 | signed char fastreuse; |
85 | signed char fastreuseport; | ||
86 | kuid_t fastuid; | ||
85 | int num_owners; | 87 | int num_owners; |
86 | struct hlist_node node; | 88 | struct hlist_node node; |
87 | struct hlist_head owners; | 89 | struct hlist_head owners; |
@@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk); | |||
257 | 259 | ||
258 | extern struct sock *__inet_lookup_listener(struct net *net, | 260 | extern struct sock *__inet_lookup_listener(struct net *net, |
259 | struct inet_hashinfo *hashinfo, | 261 | struct inet_hashinfo *hashinfo, |
262 | const __be32 saddr, | ||
263 | const __be16 sport, | ||
260 | const __be32 daddr, | 264 | const __be32 daddr, |
261 | const unsigned short hnum, | 265 | const unsigned short hnum, |
262 | const int dif); | 266 | const int dif); |
263 | 267 | ||
264 | static inline struct sock *inet_lookup_listener(struct net *net, | 268 | static inline struct sock *inet_lookup_listener(struct net *net, |
265 | struct inet_hashinfo *hashinfo, | 269 | struct inet_hashinfo *hashinfo, |
270 | __be32 saddr, __be16 sport, | ||
266 | __be32 daddr, __be16 dport, int dif) | 271 | __be32 daddr, __be16 dport, int dif) |
267 | { | 272 | { |
268 | return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif); | 273 | return __inet_lookup_listener(net, hashinfo, saddr, sport, |
274 | daddr, ntohs(dport), dif); | ||
269 | } | 275 | } |
270 | 276 | ||
271 | /* Socket demux engine toys. */ | 277 | /* Socket demux engine toys. */ |
@@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net, | |||
358 | struct sock *sk = __inet_lookup_established(net, hashinfo, | 364 | struct sock *sk = __inet_lookup_established(net, hashinfo, |
359 | saddr, sport, daddr, hnum, dif); | 365 | saddr, sport, daddr, hnum, dif); |
360 | 366 | ||
361 | return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif); | 367 | return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport, |
368 | daddr, hnum, dif); | ||
362 | } | 369 | } |
363 | 370 | ||
364 | static inline struct sock *inet_lookup(struct net *net, | 371 | static inline struct sock *inet_lookup(struct net *net, |
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h index 75ca9291cf2c..193796445642 100644 --- a/include/net/netfilter/nf_tproxy_core.h +++ b/include/net/netfilter/nf_tproxy_core.h | |||
@@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, | |||
82 | break; | 82 | break; |
83 | case NFT_LOOKUP_LISTENER: | 83 | case NFT_LOOKUP_LISTENER: |
84 | sk = inet_lookup_listener(net, &tcp_hashinfo, | 84 | sk = inet_lookup_listener(net, &tcp_hashinfo, |
85 | saddr, sport, | ||
85 | daddr, dport, | 86 | daddr, dport, |
86 | in->ifindex); | 87 | in->ifindex); |
87 | 88 | ||
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d0670f00d524..8bb623d357ad 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk, | |||
59 | struct sock *sk2; | 59 | struct sock *sk2; |
60 | struct hlist_node *node; | 60 | struct hlist_node *node; |
61 | int reuse = sk->sk_reuse; | 61 | int reuse = sk->sk_reuse; |
62 | int reuseport = sk->sk_reuseport; | ||
63 | kuid_t uid = sock_i_uid((struct sock *)sk); | ||
62 | 64 | ||
63 | /* | 65 | /* |
64 | * Unlike other sk lookup places we do not check | 66 | * Unlike other sk lookup places we do not check |
@@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk, | |||
73 | (!sk->sk_bound_dev_if || | 75 | (!sk->sk_bound_dev_if || |
74 | !sk2->sk_bound_dev_if || | 76 | !sk2->sk_bound_dev_if || |
75 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { | 77 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { |
76 | if (!reuse || !sk2->sk_reuse || | 78 | if ((!reuse || !sk2->sk_reuse || |
77 | sk2->sk_state == TCP_LISTEN) { | 79 | sk2->sk_state == TCP_LISTEN) && |
80 | (!reuseport || !sk2->sk_reuseport || | ||
81 | (sk2->sk_state != TCP_TIME_WAIT && | ||
82 | !uid_eq(uid, sock_i_uid(sk2))))) { | ||
78 | const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); | 83 | const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); |
79 | if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || | 84 | if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || |
80 | sk2_rcv_saddr == sk_rcv_saddr(sk)) | 85 | sk2_rcv_saddr == sk_rcv_saddr(sk)) |
@@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) | |||
106 | int ret, attempts = 5; | 111 | int ret, attempts = 5; |
107 | struct net *net = sock_net(sk); | 112 | struct net *net = sock_net(sk); |
108 | int smallest_size = -1, smallest_rover; | 113 | int smallest_size = -1, smallest_rover; |
114 | kuid_t uid = sock_i_uid(sk); | ||
109 | 115 | ||
110 | local_bh_disable(); | 116 | local_bh_disable(); |
111 | if (!snum) { | 117 | if (!snum) { |
@@ -125,9 +131,12 @@ again: | |||
125 | spin_lock(&head->lock); | 131 | spin_lock(&head->lock); |
126 | inet_bind_bucket_for_each(tb, node, &head->chain) | 132 | inet_bind_bucket_for_each(tb, node, &head->chain) |
127 | if (net_eq(ib_net(tb), net) && tb->port == rover) { | 133 | if (net_eq(ib_net(tb), net) && tb->port == rover) { |
128 | if (tb->fastreuse > 0 && | 134 | if (((tb->fastreuse > 0 && |
129 | sk->sk_reuse && | 135 | sk->sk_reuse && |
130 | sk->sk_state != TCP_LISTEN && | 136 | sk->sk_state != TCP_LISTEN) || |
137 | (tb->fastreuseport > 0 && | ||
138 | sk->sk_reuseport && | ||
139 | uid_eq(tb->fastuid, uid))) && | ||
131 | (tb->num_owners < smallest_size || smallest_size == -1)) { | 140 | (tb->num_owners < smallest_size || smallest_size == -1)) { |
132 | smallest_size = tb->num_owners; | 141 | smallest_size = tb->num_owners; |
133 | smallest_rover = rover; | 142 | smallest_rover = rover; |
@@ -185,14 +194,17 @@ tb_found: | |||
185 | if (sk->sk_reuse == SK_FORCE_REUSE) | 194 | if (sk->sk_reuse == SK_FORCE_REUSE) |
186 | goto success; | 195 | goto success; |
187 | 196 | ||
188 | if (tb->fastreuse > 0 && | 197 | if (((tb->fastreuse > 0 && |
189 | sk->sk_reuse && sk->sk_state != TCP_LISTEN && | 198 | sk->sk_reuse && sk->sk_state != TCP_LISTEN) || |
199 | (tb->fastreuseport > 0 && | ||
200 | sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && | ||
190 | smallest_size == -1) { | 201 | smallest_size == -1) { |
191 | goto success; | 202 | goto success; |
192 | } else { | 203 | } else { |
193 | ret = 1; | 204 | ret = 1; |
194 | if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { | 205 | if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { |
195 | if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && | 206 | if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || |
207 | (sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && | ||
196 | smallest_size != -1 && --attempts >= 0) { | 208 | smallest_size != -1 && --attempts >= 0) { |
197 | spin_unlock(&head->lock); | 209 | spin_unlock(&head->lock); |
198 | goto again; | 210 | goto again; |
@@ -212,9 +224,23 @@ tb_not_found: | |||
212 | tb->fastreuse = 1; | 224 | tb->fastreuse = 1; |
213 | else | 225 | else |
214 | tb->fastreuse = 0; | 226 | tb->fastreuse = 0; |
215 | } else if (tb->fastreuse && | 227 | if (sk->sk_reuseport) { |
216 | (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) | 228 | tb->fastreuseport = 1; |
217 | tb->fastreuse = 0; | 229 | tb->fastuid = uid; |
230 | } else { | ||
231 | tb->fastreuseport = 0; | ||
232 | tb->fastuid = 0; | ||
233 | } | ||
234 | } else { | ||
235 | if (tb->fastreuse && | ||
236 | (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) | ||
237 | tb->fastreuse = 0; | ||
238 | if (tb->fastreuseport && | ||
239 | (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) { | ||
240 | tb->fastreuseport = 0; | ||
241 | tb->fastuid = 0; | ||
242 | } | ||
243 | } | ||
218 | success: | 244 | success: |
219 | if (!inet_csk(sk)->icsk_bind_hash) | 245 | if (!inet_csk(sk)->icsk_bind_hash) |
220 | inet_bind_hash(sk, tb, snum); | 246 | inet_bind_hash(sk, tb, snum); |
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fa3ae8148710..0ce0595d9861 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, | |||
39 | write_pnet(&tb->ib_net, hold_net(net)); | 39 | write_pnet(&tb->ib_net, hold_net(net)); |
40 | tb->port = snum; | 40 | tb->port = snum; |
41 | tb->fastreuse = 0; | 41 | tb->fastreuse = 0; |
42 | tb->fastreuseport = 0; | ||
42 | tb->num_owners = 0; | 43 | tb->num_owners = 0; |
43 | INIT_HLIST_HEAD(&tb->owners); | 44 | INIT_HLIST_HEAD(&tb->owners); |
44 | hlist_add_head(&tb->node, &head->chain); | 45 | hlist_add_head(&tb->node, &head->chain); |
@@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net, | |||
151 | if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && | 152 | if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && |
152 | !ipv6_only_sock(sk)) { | 153 | !ipv6_only_sock(sk)) { |
153 | __be32 rcv_saddr = inet->inet_rcv_saddr; | 154 | __be32 rcv_saddr = inet->inet_rcv_saddr; |
154 | score = sk->sk_family == PF_INET ? 1 : 0; | 155 | score = sk->sk_family == PF_INET ? 2 : 1; |
155 | if (rcv_saddr) { | 156 | if (rcv_saddr) { |
156 | if (rcv_saddr != daddr) | 157 | if (rcv_saddr != daddr) |
157 | return -1; | 158 | return -1; |
158 | score += 2; | 159 | score += 4; |
159 | } | 160 | } |
160 | if (sk->sk_bound_dev_if) { | 161 | if (sk->sk_bound_dev_if) { |
161 | if (sk->sk_bound_dev_if != dif) | 162 | if (sk->sk_bound_dev_if != dif) |
162 | return -1; | 163 | return -1; |
163 | score += 2; | 164 | score += 4; |
164 | } | 165 | } |
165 | } | 166 | } |
166 | return score; | 167 | return score; |
@@ -176,6 +177,7 @@ static inline int compute_score(struct sock *sk, struct net *net, | |||
176 | 177 | ||
177 | struct sock *__inet_lookup_listener(struct net *net, | 178 | struct sock *__inet_lookup_listener(struct net *net, |
178 | struct inet_hashinfo *hashinfo, | 179 | struct inet_hashinfo *hashinfo, |
180 | const __be32 saddr, __be16 sport, | ||
179 | const __be32 daddr, const unsigned short hnum, | 181 | const __be32 daddr, const unsigned short hnum, |
180 | const int dif) | 182 | const int dif) |
181 | { | 183 | { |
@@ -183,17 +185,29 @@ struct sock *__inet_lookup_listener(struct net *net, | |||
183 | struct hlist_nulls_node *node; | 185 | struct hlist_nulls_node *node; |
184 | unsigned int hash = inet_lhashfn(net, hnum); | 186 | unsigned int hash = inet_lhashfn(net, hnum); |
185 | struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; | 187 | struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; |
186 | int score, hiscore; | 188 | int score, hiscore, matches = 0, reuseport = 0; |
189 | u32 phash = 0; | ||
187 | 190 | ||
188 | rcu_read_lock(); | 191 | rcu_read_lock(); |
189 | begin: | 192 | begin: |
190 | result = NULL; | 193 | result = NULL; |
191 | hiscore = -1; | 194 | hiscore = 0; |
192 | sk_nulls_for_each_rcu(sk, node, &ilb->head) { | 195 | sk_nulls_for_each_rcu(sk, node, &ilb->head) { |
193 | score = compute_score(sk, net, hnum, daddr, dif); | 196 | score = compute_score(sk, net, hnum, daddr, dif); |
194 | if (score > hiscore) { | 197 | if (score > hiscore) { |
195 | result = sk; | 198 | result = sk; |
196 | hiscore = score; | 199 | hiscore = score; |
200 | reuseport = sk->sk_reuseport; | ||
201 | if (reuseport) { | ||
202 | phash = inet_ehashfn(net, daddr, hnum, | ||
203 | saddr, sport); | ||
204 | matches = 1; | ||
205 | } | ||
206 | } else if (score == hiscore && reuseport) { | ||
207 | matches++; | ||
208 | if (((u64)phash * matches) >> 32 == 0) | ||
209 | result = sk; | ||
210 | phash = next_pseudo_random32(phash); | ||
197 | } | 211 | } |
198 | } | 212 | } |
199 | /* | 213 | /* |
@@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, | |||
501 | inet_bind_bucket_for_each(tb, node, &head->chain) { | 515 | inet_bind_bucket_for_each(tb, node, &head->chain) { |
502 | if (net_eq(ib_net(tb), net) && | 516 | if (net_eq(ib_net(tb), net) && |
503 | tb->port == port) { | 517 | tb->port == port) { |
504 | if (tb->fastreuse >= 0) | 518 | if (tb->fastreuse >= 0 || |
519 | tb->fastreuseport >= 0) | ||
505 | goto next_port; | 520 | goto next_port; |
506 | WARN_ON(hlist_empty(&tb->owners)); | 521 | WARN_ON(hlist_empty(&tb->owners)); |
507 | if (!check_established(death_row, sk, | 522 | if (!check_established(death_row, sk, |
@@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, | |||
518 | break; | 533 | break; |
519 | } | 534 | } |
520 | tb->fastreuse = -1; | 535 | tb->fastreuse = -1; |
536 | tb->fastreuseport = -1; | ||
521 | goto ok; | 537 | goto ok; |
522 | 538 | ||
523 | next_port: | 539 | next_port: |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c6ce9ca98d23..bbbdcc5c1973 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
657 | * no RST generated if md5 hash doesn't match. | 657 | * no RST generated if md5 hash doesn't match. |
658 | */ | 658 | */ |
659 | sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), | 659 | sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), |
660 | &tcp_hashinfo, ip_hdr(skb)->daddr, | 660 | &tcp_hashinfo, ip_hdr(skb)->saddr, |
661 | th->source, ip_hdr(skb)->daddr, | ||
661 | ntohs(th->source), inet_iif(skb)); | 662 | ntohs(th->source), inet_iif(skb)); |
662 | /* don't send rst if it can't find key */ | 663 | /* don't send rst if it can't find key */ |
663 | if (!sk1) | 664 | if (!sk1) |
@@ -2074,6 +2075,7 @@ do_time_wait: | |||
2074 | case TCP_TW_SYN: { | 2075 | case TCP_TW_SYN: { |
2075 | struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), | 2076 | struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), |
2076 | &tcp_hashinfo, | 2077 | &tcp_hashinfo, |
2078 | iph->saddr, th->source, | ||
2077 | iph->daddr, th->dest, | 2079 | iph->daddr, th->dest, |
2078 | inet_iif(skb)); | 2080 | inet_iif(skb)); |
2079 | if (sk2) { | 2081 | if (sk2) { |