diff options
author | Tom Herbert <therbert@google.com> | 2013-01-22 04:50:39 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-01-23 13:44:01 -0500 |
commit | 5ba24953e9707387cce87b07f0d5fbdd03c5c11b (patch) | |
tree | c98e56f8a06f07ff585f85cbe6af8cd9c19f2ca6 | |
parent | ba418fa357a7b3c9d477f4706c6c7c96ddbd1360 (diff) |
soreuseport: TCP/IPv6 implementation
Motivation for soreuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have its own listener socket. This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads. In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming simple event loop:
while (1) { accept(); process() }); wakeup does not promote fairness
among the sockets. We have seen the disproportion to be as high
as 3:1 ratio between thread accepting most connections and the one
accepting the fewest. With so_reuseport the distribution is
uniform.
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/inet6_hashtables.h | 5 | ||||
-rw-r--r-- | include/net/netfilter/nf_tproxy_core.h | 1 | ||||
-rw-r--r-- | net/ipv6/inet6_connection_sock.c | 19 | ||||
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 19 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 4 |
5 files changed, 38 insertions, 10 deletions
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 9e34c877a770..7ca75cbbf75e 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h | |||
@@ -71,6 +71,8 @@ extern struct sock *__inet6_lookup_established(struct net *net, | |||
71 | 71 | ||
72 | extern struct sock *inet6_lookup_listener(struct net *net, | 72 | extern struct sock *inet6_lookup_listener(struct net *net, |
73 | struct inet_hashinfo *hashinfo, | 73 | struct inet_hashinfo *hashinfo, |
74 | const struct in6_addr *saddr, | ||
75 | const __be16 sport, | ||
74 | const struct in6_addr *daddr, | 76 | const struct in6_addr *daddr, |
75 | const unsigned short hnum, | 77 | const unsigned short hnum, |
76 | const int dif); | 78 | const int dif); |
@@ -88,7 +90,8 @@ static inline struct sock *__inet6_lookup(struct net *net, | |||
88 | if (sk) | 90 | if (sk) |
89 | return sk; | 91 | return sk; |
90 | 92 | ||
91 | return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif); | 93 | return inet6_lookup_listener(net, hashinfo, saddr, sport, |
94 | daddr, hnum, dif); | ||
92 | } | 95 | } |
93 | 96 | ||
94 | static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, | 97 | static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, |
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h index 193796445642..36d9379d4c4b 100644 --- a/include/net/netfilter/nf_tproxy_core.h +++ b/include/net/netfilter/nf_tproxy_core.h | |||
@@ -152,6 +152,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol, | |||
152 | break; | 152 | break; |
153 | case NFT_LOOKUP_LISTENER: | 153 | case NFT_LOOKUP_LISTENER: |
154 | sk = inet6_lookup_listener(net, &tcp_hashinfo, | 154 | sk = inet6_lookup_listener(net, &tcp_hashinfo, |
155 | saddr, sport, | ||
155 | daddr, ntohs(dport), | 156 | daddr, ntohs(dport), |
156 | in->ifindex); | 157 | in->ifindex); |
157 | 158 | ||
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 30647857a375..e4297a393678 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c | |||
@@ -32,6 +32,9 @@ int inet6_csk_bind_conflict(const struct sock *sk, | |||
32 | { | 32 | { |
33 | const struct sock *sk2; | 33 | const struct sock *sk2; |
34 | const struct hlist_node *node; | 34 | const struct hlist_node *node; |
35 | int reuse = sk->sk_reuse; | ||
36 | int reuseport = sk->sk_reuseport; | ||
37 | int uid = sock_i_uid((struct sock *)sk); | ||
35 | 38 | ||
36 | /* We must walk the whole port owner list in this case. -DaveM */ | 39 | /* We must walk the whole port owner list in this case. -DaveM */ |
37 | /* | 40 | /* |
@@ -42,11 +45,17 @@ int inet6_csk_bind_conflict(const struct sock *sk, | |||
42 | if (sk != sk2 && | 45 | if (sk != sk2 && |
43 | (!sk->sk_bound_dev_if || | 46 | (!sk->sk_bound_dev_if || |
44 | !sk2->sk_bound_dev_if || | 47 | !sk2->sk_bound_dev_if || |
45 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && | 48 | sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { |
46 | (!sk->sk_reuse || !sk2->sk_reuse || | 49 | if ((!reuse || !sk2->sk_reuse || |
47 | sk2->sk_state == TCP_LISTEN) && | 50 | sk2->sk_state == TCP_LISTEN) && |
48 | ipv6_rcv_saddr_equal(sk, sk2)) | 51 | (!reuseport || !sk2->sk_reuseport || |
49 | break; | 52 | (sk2->sk_state != TCP_TIME_WAIT && |
53 | !uid_eq(uid, | ||
54 | sock_i_uid((struct sock *)sk2))))) { | ||
55 | if (ipv6_rcv_saddr_equal(sk, sk2)) | ||
56 | break; | ||
57 | } | ||
58 | } | ||
50 | } | 59 | } |
51 | 60 | ||
52 | return node != NULL; | 61 | return node != NULL; |
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index dea17fd28e50..32b4a1675d82 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c | |||
@@ -158,25 +158,38 @@ static inline int compute_score(struct sock *sk, struct net *net, | |||
158 | } | 158 | } |
159 | 159 | ||
160 | struct sock *inet6_lookup_listener(struct net *net, | 160 | struct sock *inet6_lookup_listener(struct net *net, |
161 | struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, | 161 | struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, |
162 | const __be16 sport, const struct in6_addr *daddr, | ||
162 | const unsigned short hnum, const int dif) | 163 | const unsigned short hnum, const int dif) |
163 | { | 164 | { |
164 | struct sock *sk; | 165 | struct sock *sk; |
165 | const struct hlist_nulls_node *node; | 166 | const struct hlist_nulls_node *node; |
166 | struct sock *result; | 167 | struct sock *result; |
167 | int score, hiscore; | 168 | int score, hiscore, matches = 0, reuseport = 0; |
169 | u32 phash = 0; | ||
168 | unsigned int hash = inet_lhashfn(net, hnum); | 170 | unsigned int hash = inet_lhashfn(net, hnum); |
169 | struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; | 171 | struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; |
170 | 172 | ||
171 | rcu_read_lock(); | 173 | rcu_read_lock(); |
172 | begin: | 174 | begin: |
173 | result = NULL; | 175 | result = NULL; |
174 | hiscore = -1; | 176 | hiscore = 0; |
175 | sk_nulls_for_each(sk, node, &ilb->head) { | 177 | sk_nulls_for_each(sk, node, &ilb->head) { |
176 | score = compute_score(sk, net, hnum, daddr, dif); | 178 | score = compute_score(sk, net, hnum, daddr, dif); |
177 | if (score > hiscore) { | 179 | if (score > hiscore) { |
178 | hiscore = score; | 180 | hiscore = score; |
179 | result = sk; | 181 | result = sk; |
182 | reuseport = sk->sk_reuseport; | ||
183 | if (reuseport) { | ||
184 | phash = inet6_ehashfn(net, daddr, hnum, | ||
185 | saddr, sport); | ||
186 | matches = 1; | ||
187 | } | ||
188 | } else if (score == hiscore && reuseport) { | ||
189 | matches++; | ||
190 | if (((u64)phash * matches) >> 32 == 0) | ||
191 | result = sk; | ||
192 | phash = next_pseudo_random32(phash); | ||
180 | } | 193 | } |
181 | } | 194 | } |
182 | /* | 195 | /* |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3701c3c6e2eb..06087e58738a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -834,7 +834,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) | |||
834 | * no RST generated if md5 hash doesn't match. | 834 | * no RST generated if md5 hash doesn't match. |
835 | */ | 835 | */ |
836 | sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), | 836 | sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), |
837 | &tcp_hashinfo, &ipv6h->daddr, | 837 | &tcp_hashinfo, &ipv6h->saddr, |
838 | th->source, &ipv6h->daddr, | ||
838 | ntohs(th->source), inet6_iif(skb)); | 839 | ntohs(th->source), inet6_iif(skb)); |
839 | if (!sk1) | 840 | if (!sk1) |
840 | return; | 841 | return; |
@@ -1598,6 +1599,7 @@ do_time_wait: | |||
1598 | struct sock *sk2; | 1599 | struct sock *sk2; |
1599 | 1600 | ||
1600 | sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, | 1601 | sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, |
1602 | &ipv6_hdr(skb)->saddr, th->source, | ||
1601 | &ipv6_hdr(skb)->daddr, | 1603 | &ipv6_hdr(skb)->daddr, |
1602 | ntohs(th->dest), inet6_iif(skb)); | 1604 | ntohs(th->dest), inet6_iif(skb)); |
1603 | if (sk2 != NULL) { | 1605 | if (sk2 != NULL) { |