aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTom Herbert <therbert@google.com>2013-01-22 04:50:24 -0500
committerDavid S. Miller <davem@davemloft.net>2013-01-23 13:44:01 -0500
commitda5e36308d9f7151845018369148201a5d28b46d (patch)
treefff243a12ae5a1d16c2827b3ac41ac23ea2043c4
parent055dc21a1d1d219608cd4baac7d0683fb2cbbe8a (diff)
soreuseport: TCP/IPv4 implementation
Allow multiple listener sockets to bind to the same port. Motivation for soreuseport would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket. This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads. In case #1 the listener thread can easily become the bottleneck with high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming simple event loop: while (1) { accept(); process() }; wakeup does not promote fairness among the sockets). We have seen the disproportion to be as high as 3:1 ratio between the thread accepting the most connections and the one accepting the fewest. With so_reuseport the distribution is uniform. Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/inet_hashtables.h13
-rw-r--r--include/net/netfilter/nf_tproxy_core.h1
-rw-r--r--net/ipv4/inet_connection_sock.c48
-rw-r--r--net/ipv4/inet_hashtables.c28
-rw-r--r--net/ipv4/tcp_ipv4.c4
5 files changed, 73 insertions, 21 deletions
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 67a8fa098e3a..7b2ae9d37076 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -81,7 +81,9 @@ struct inet_bind_bucket {
81 struct net *ib_net; 81 struct net *ib_net;
82#endif 82#endif
83 unsigned short port; 83 unsigned short port;
84 signed short fastreuse; 84 signed char fastreuse;
85 signed char fastreuseport;
86 kuid_t fastuid;
85 int num_owners; 87 int num_owners;
86 struct hlist_node node; 88 struct hlist_node node;
87 struct hlist_head owners; 89 struct hlist_head owners;
@@ -257,15 +259,19 @@ extern void inet_unhash(struct sock *sk);
257 259
258extern struct sock *__inet_lookup_listener(struct net *net, 260extern struct sock *__inet_lookup_listener(struct net *net,
259 struct inet_hashinfo *hashinfo, 261 struct inet_hashinfo *hashinfo,
262 const __be32 saddr,
263 const __be16 sport,
260 const __be32 daddr, 264 const __be32 daddr,
261 const unsigned short hnum, 265 const unsigned short hnum,
262 const int dif); 266 const int dif);
263 267
264static inline struct sock *inet_lookup_listener(struct net *net, 268static inline struct sock *inet_lookup_listener(struct net *net,
265 struct inet_hashinfo *hashinfo, 269 struct inet_hashinfo *hashinfo,
270 __be32 saddr, __be16 sport,
266 __be32 daddr, __be16 dport, int dif) 271 __be32 daddr, __be16 dport, int dif)
267{ 272{
268 return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif); 273 return __inet_lookup_listener(net, hashinfo, saddr, sport,
274 daddr, ntohs(dport), dif);
269} 275}
270 276
271/* Socket demux engine toys. */ 277/* Socket demux engine toys. */
@@ -358,7 +364,8 @@ static inline struct sock *__inet_lookup(struct net *net,
358 struct sock *sk = __inet_lookup_established(net, hashinfo, 364 struct sock *sk = __inet_lookup_established(net, hashinfo,
359 saddr, sport, daddr, hnum, dif); 365 saddr, sport, daddr, hnum, dif);
360 366
361 return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif); 367 return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport,
368 daddr, hnum, dif);
362} 369}
363 370
364static inline struct sock *inet_lookup(struct net *net, 371static inline struct sock *inet_lookup(struct net *net,
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 75ca9291cf2c..193796445642 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -82,6 +82,7 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
82 break; 82 break;
83 case NFT_LOOKUP_LISTENER: 83 case NFT_LOOKUP_LISTENER:
84 sk = inet_lookup_listener(net, &tcp_hashinfo, 84 sk = inet_lookup_listener(net, &tcp_hashinfo,
85 saddr, sport,
85 daddr, dport, 86 daddr, dport,
86 in->ifindex); 87 in->ifindex);
87 88
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d0670f00d524..8bb623d357ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -59,6 +59,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
59 struct sock *sk2; 59 struct sock *sk2;
60 struct hlist_node *node; 60 struct hlist_node *node;
61 int reuse = sk->sk_reuse; 61 int reuse = sk->sk_reuse;
62 int reuseport = sk->sk_reuseport;
63 kuid_t uid = sock_i_uid((struct sock *)sk);
62 64
63 /* 65 /*
64 * Unlike other sk lookup places we do not check 66 * Unlike other sk lookup places we do not check
@@ -73,8 +75,11 @@ int inet_csk_bind_conflict(const struct sock *sk,
73 (!sk->sk_bound_dev_if || 75 (!sk->sk_bound_dev_if ||
74 !sk2->sk_bound_dev_if || 76 !sk2->sk_bound_dev_if ||
75 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { 77 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
76 if (!reuse || !sk2->sk_reuse || 78 if ((!reuse || !sk2->sk_reuse ||
77 sk2->sk_state == TCP_LISTEN) { 79 sk2->sk_state == TCP_LISTEN) &&
80 (!reuseport || !sk2->sk_reuseport ||
81 (sk2->sk_state != TCP_TIME_WAIT &&
82 !uid_eq(uid, sock_i_uid(sk2))))) {
78 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); 83 const __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2);
79 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) || 84 if (!sk2_rcv_saddr || !sk_rcv_saddr(sk) ||
80 sk2_rcv_saddr == sk_rcv_saddr(sk)) 85 sk2_rcv_saddr == sk_rcv_saddr(sk))
@@ -106,6 +111,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
106 int ret, attempts = 5; 111 int ret, attempts = 5;
107 struct net *net = sock_net(sk); 112 struct net *net = sock_net(sk);
108 int smallest_size = -1, smallest_rover; 113 int smallest_size = -1, smallest_rover;
114 kuid_t uid = sock_i_uid(sk);
109 115
110 local_bh_disable(); 116 local_bh_disable();
111 if (!snum) { 117 if (!snum) {
@@ -125,9 +131,12 @@ again:
125 spin_lock(&head->lock); 131 spin_lock(&head->lock);
126 inet_bind_bucket_for_each(tb, node, &head->chain) 132 inet_bind_bucket_for_each(tb, node, &head->chain)
127 if (net_eq(ib_net(tb), net) && tb->port == rover) { 133 if (net_eq(ib_net(tb), net) && tb->port == rover) {
128 if (tb->fastreuse > 0 && 134 if (((tb->fastreuse > 0 &&
129 sk->sk_reuse && 135 sk->sk_reuse &&
130 sk->sk_state != TCP_LISTEN && 136 sk->sk_state != TCP_LISTEN) ||
137 (tb->fastreuseport > 0 &&
138 sk->sk_reuseport &&
139 uid_eq(tb->fastuid, uid))) &&
131 (tb->num_owners < smallest_size || smallest_size == -1)) { 140 (tb->num_owners < smallest_size || smallest_size == -1)) {
132 smallest_size = tb->num_owners; 141 smallest_size = tb->num_owners;
133 smallest_rover = rover; 142 smallest_rover = rover;
@@ -185,14 +194,17 @@ tb_found:
185 if (sk->sk_reuse == SK_FORCE_REUSE) 194 if (sk->sk_reuse == SK_FORCE_REUSE)
186 goto success; 195 goto success;
187 196
188 if (tb->fastreuse > 0 && 197 if (((tb->fastreuse > 0 &&
189 sk->sk_reuse && sk->sk_state != TCP_LISTEN && 198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
199 (tb->fastreuseport > 0 &&
200 sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
190 smallest_size == -1) { 201 smallest_size == -1) {
191 goto success; 202 goto success;
192 } else { 203 } else {
193 ret = 1; 204 ret = 1;
194 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { 205 if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
195 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && 206 if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
207 (sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
196 smallest_size != -1 && --attempts >= 0) { 208 smallest_size != -1 && --attempts >= 0) {
197 spin_unlock(&head->lock); 209 spin_unlock(&head->lock);
198 goto again; 210 goto again;
@@ -212,9 +224,23 @@ tb_not_found:
212 tb->fastreuse = 1; 224 tb->fastreuse = 1;
213 else 225 else
214 tb->fastreuse = 0; 226 tb->fastreuse = 0;
215 } else if (tb->fastreuse && 227 if (sk->sk_reuseport) {
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) 228 tb->fastreuseport = 1;
217 tb->fastreuse = 0; 229 tb->fastuid = uid;
230 } else {
231 tb->fastreuseport = 0;
232 tb->fastuid = 0;
233 }
234 } else {
235 if (tb->fastreuse &&
236 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
237 tb->fastreuse = 0;
238 if (tb->fastreuseport &&
239 (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) {
240 tb->fastreuseport = 0;
241 tb->fastuid = 0;
242 }
243 }
218success: 244success:
219 if (!inet_csk(sk)->icsk_bind_hash) 245 if (!inet_csk(sk)->icsk_bind_hash)
220 inet_bind_hash(sk, tb, snum); 246 inet_bind_hash(sk, tb, snum);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fa3ae8148710..0ce0595d9861 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -39,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
39 write_pnet(&tb->ib_net, hold_net(net)); 39 write_pnet(&tb->ib_net, hold_net(net));
40 tb->port = snum; 40 tb->port = snum;
41 tb->fastreuse = 0; 41 tb->fastreuse = 0;
42 tb->fastreuseport = 0;
42 tb->num_owners = 0; 43 tb->num_owners = 0;
43 INIT_HLIST_HEAD(&tb->owners); 44 INIT_HLIST_HEAD(&tb->owners);
44 hlist_add_head(&tb->node, &head->chain); 45 hlist_add_head(&tb->node, &head->chain);
@@ -151,16 +152,16 @@ static inline int compute_score(struct sock *sk, struct net *net,
151 if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && 152 if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
152 !ipv6_only_sock(sk)) { 153 !ipv6_only_sock(sk)) {
153 __be32 rcv_saddr = inet->inet_rcv_saddr; 154 __be32 rcv_saddr = inet->inet_rcv_saddr;
154 score = sk->sk_family == PF_INET ? 1 : 0; 155 score = sk->sk_family == PF_INET ? 2 : 1;
155 if (rcv_saddr) { 156 if (rcv_saddr) {
156 if (rcv_saddr != daddr) 157 if (rcv_saddr != daddr)
157 return -1; 158 return -1;
158 score += 2; 159 score += 4;
159 } 160 }
160 if (sk->sk_bound_dev_if) { 161 if (sk->sk_bound_dev_if) {
161 if (sk->sk_bound_dev_if != dif) 162 if (sk->sk_bound_dev_if != dif)
162 return -1; 163 return -1;
163 score += 2; 164 score += 4;
164 } 165 }
165 } 166 }
166 return score; 167 return score;
@@ -176,6 +177,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
176 177
177struct sock *__inet_lookup_listener(struct net *net, 178struct sock *__inet_lookup_listener(struct net *net,
178 struct inet_hashinfo *hashinfo, 179 struct inet_hashinfo *hashinfo,
180 const __be32 saddr, __be16 sport,
179 const __be32 daddr, const unsigned short hnum, 181 const __be32 daddr, const unsigned short hnum,
180 const int dif) 182 const int dif)
181{ 183{
@@ -183,17 +185,29 @@ struct sock *__inet_lookup_listener(struct net *net,
183 struct hlist_nulls_node *node; 185 struct hlist_nulls_node *node;
184 unsigned int hash = inet_lhashfn(net, hnum); 186 unsigned int hash = inet_lhashfn(net, hnum);
185 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; 187 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
186 int score, hiscore; 188 int score, hiscore, matches = 0, reuseport = 0;
189 u32 phash = 0;
187 190
188 rcu_read_lock(); 191 rcu_read_lock();
189begin: 192begin:
190 result = NULL; 193 result = NULL;
191 hiscore = -1; 194 hiscore = 0;
192 sk_nulls_for_each_rcu(sk, node, &ilb->head) { 195 sk_nulls_for_each_rcu(sk, node, &ilb->head) {
193 score = compute_score(sk, net, hnum, daddr, dif); 196 score = compute_score(sk, net, hnum, daddr, dif);
194 if (score > hiscore) { 197 if (score > hiscore) {
195 result = sk; 198 result = sk;
196 hiscore = score; 199 hiscore = score;
200 reuseport = sk->sk_reuseport;
201 if (reuseport) {
202 phash = inet_ehashfn(net, daddr, hnum,
203 saddr, sport);
204 matches = 1;
205 }
206 } else if (score == hiscore && reuseport) {
207 matches++;
208 if (((u64)phash * matches) >> 32 == 0)
209 result = sk;
210 phash = next_pseudo_random32(phash);
197 } 211 }
198 } 212 }
199 /* 213 /*
@@ -501,7 +515,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
501 inet_bind_bucket_for_each(tb, node, &head->chain) { 515 inet_bind_bucket_for_each(tb, node, &head->chain) {
502 if (net_eq(ib_net(tb), net) && 516 if (net_eq(ib_net(tb), net) &&
503 tb->port == port) { 517 tb->port == port) {
504 if (tb->fastreuse >= 0) 518 if (tb->fastreuse >= 0 ||
519 tb->fastreuseport >= 0)
505 goto next_port; 520 goto next_port;
506 WARN_ON(hlist_empty(&tb->owners)); 521 WARN_ON(hlist_empty(&tb->owners));
507 if (!check_established(death_row, sk, 522 if (!check_established(death_row, sk,
@@ -518,6 +533,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
518 break; 533 break;
519 } 534 }
520 tb->fastreuse = -1; 535 tb->fastreuse = -1;
536 tb->fastreuseport = -1;
521 goto ok; 537 goto ok;
522 538
523 next_port: 539 next_port:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c6ce9ca98d23..bbbdcc5c1973 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -657,7 +657,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
657 * no RST generated if md5 hash doesn't match. 657 * no RST generated if md5 hash doesn't match.
658 */ 658 */
659 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev), 659 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
660 &tcp_hashinfo, ip_hdr(skb)->daddr, 660 &tcp_hashinfo, ip_hdr(skb)->saddr,
661 th->source, ip_hdr(skb)->daddr,
661 ntohs(th->source), inet_iif(skb)); 662 ntohs(th->source), inet_iif(skb));
662 /* don't send rst if it can't find key */ 663 /* don't send rst if it can't find key */
663 if (!sk1) 664 if (!sk1)
@@ -2074,6 +2075,7 @@ do_time_wait:
2074 case TCP_TW_SYN: { 2075 case TCP_TW_SYN: {
2075 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), 2076 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2076 &tcp_hashinfo, 2077 &tcp_hashinfo,
2078 iph->saddr, th->source,
2077 iph->daddr, th->dest, 2079 iph->daddr, th->dest,
2078 inet_iif(skb)); 2080 inet_iif(skb));
2079 if (sk2) { 2081 if (sk2) {