aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Tom Herbert <therbert@google.com> 2013-01-22 04:50:39 -0500
committer: David S. Miller <davem@davemloft.net> 2013-01-23 13:44:01 -0500
commit: 5ba24953e9707387cce87b07f0d5fbdd03c5c11b (patch)
tree: c98e56f8a06f07ff585f85cbe6af8cd9c19f2ca6
parent: ba418fa357a7b3c9d477f4706c6c7c96ddbd1360 (diff)
soreuseport: TCP/IPv6 implementation
Motivation for soreuseport would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket. This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads. In case #1 the listener thread can easily become the bottleneck with a high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming a simple event loop: while (1) { accept(); process() }), because wakeup does not promote fairness among the sockets. We have seen the disproportion to be as high as a 3:1 ratio between the thread accepting the most connections and the one accepting the fewest. With SO_REUSEPORT the distribution is uniform. Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/inet6_hashtables.h 5
-rw-r--r-- include/net/netfilter/nf_tproxy_core.h 1
-rw-r--r-- net/ipv6/inet6_connection_sock.c 19
-rw-r--r-- net/ipv6/inet6_hashtables.c 19
-rw-r--r-- net/ipv6/tcp_ipv6.c 4
5 files changed, 38 insertions, 10 deletions
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 9e34c877a770..7ca75cbbf75e 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -71,6 +71,8 @@ extern struct sock *__inet6_lookup_established(struct net *net,
71 71
72extern struct sock *inet6_lookup_listener(struct net *net, 72extern struct sock *inet6_lookup_listener(struct net *net,
73 struct inet_hashinfo *hashinfo, 73 struct inet_hashinfo *hashinfo,
74 const struct in6_addr *saddr,
75 const __be16 sport,
74 const struct in6_addr *daddr, 76 const struct in6_addr *daddr,
75 const unsigned short hnum, 77 const unsigned short hnum,
76 const int dif); 78 const int dif);
@@ -88,7 +90,8 @@ static inline struct sock *__inet6_lookup(struct net *net,
88 if (sk) 90 if (sk)
89 return sk; 91 return sk;
90 92
91 return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif); 93 return inet6_lookup_listener(net, hashinfo, saddr, sport,
94 daddr, hnum, dif);
92} 95}
93 96
94static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, 97static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 193796445642..36d9379d4c4b 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -152,6 +152,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
152 break; 152 break;
153 case NFT_LOOKUP_LISTENER: 153 case NFT_LOOKUP_LISTENER:
154 sk = inet6_lookup_listener(net, &tcp_hashinfo, 154 sk = inet6_lookup_listener(net, &tcp_hashinfo,
155 saddr, sport,
155 daddr, ntohs(dport), 156 daddr, ntohs(dport),
156 in->ifindex); 157 in->ifindex);
157 158
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 30647857a375..e4297a393678 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -32,6 +32,9 @@ int inet6_csk_bind_conflict(const struct sock *sk,
32{ 32{
33 const struct sock *sk2; 33 const struct sock *sk2;
34 const struct hlist_node *node; 34 const struct hlist_node *node;
35 int reuse = sk->sk_reuse;
36 int reuseport = sk->sk_reuseport;
37 int uid = sock_i_uid((struct sock *)sk);
35 38
36 /* We must walk the whole port owner list in this case. -DaveM */ 39 /* We must walk the whole port owner list in this case. -DaveM */
37 /* 40 /*
@@ -42,11 +45,17 @@ int inet6_csk_bind_conflict(const struct sock *sk,
42 if (sk != sk2 && 45 if (sk != sk2 &&
43 (!sk->sk_bound_dev_if || 46 (!sk->sk_bound_dev_if ||
44 !sk2->sk_bound_dev_if || 47 !sk2->sk_bound_dev_if ||
45 sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && 48 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
46 (!sk->sk_reuse || !sk2->sk_reuse || 49 if ((!reuse || !sk2->sk_reuse ||
47 sk2->sk_state == TCP_LISTEN) && 50 sk2->sk_state == TCP_LISTEN) &&
48 ipv6_rcv_saddr_equal(sk, sk2)) 51 (!reuseport || !sk2->sk_reuseport ||
49 break; 52 (sk2->sk_state != TCP_TIME_WAIT &&
53 !uid_eq(uid,
54 sock_i_uid((struct sock *)sk2))))) {
55 if (ipv6_rcv_saddr_equal(sk, sk2))
56 break;
57 }
58 }
50 } 59 }
51 60
52 return node != NULL; 61 return node != NULL;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index dea17fd28e50..32b4a1675d82 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -158,25 +158,38 @@ static inline int compute_score(struct sock *sk, struct net *net,
158} 158}
159 159
160struct sock *inet6_lookup_listener(struct net *net, 160struct sock *inet6_lookup_listener(struct net *net,
161 struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, 161 struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
162 const __be16 sport, const struct in6_addr *daddr,
162 const unsigned short hnum, const int dif) 163 const unsigned short hnum, const int dif)
163{ 164{
164 struct sock *sk; 165 struct sock *sk;
165 const struct hlist_nulls_node *node; 166 const struct hlist_nulls_node *node;
166 struct sock *result; 167 struct sock *result;
167 int score, hiscore; 168 int score, hiscore, matches = 0, reuseport = 0;
169 u32 phash = 0;
168 unsigned int hash = inet_lhashfn(net, hnum); 170 unsigned int hash = inet_lhashfn(net, hnum);
169 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; 171 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
170 172
171 rcu_read_lock(); 173 rcu_read_lock();
172begin: 174begin:
173 result = NULL; 175 result = NULL;
174 hiscore = -1; 176 hiscore = 0;
175 sk_nulls_for_each(sk, node, &ilb->head) { 177 sk_nulls_for_each(sk, node, &ilb->head) {
176 score = compute_score(sk, net, hnum, daddr, dif); 178 score = compute_score(sk, net, hnum, daddr, dif);
177 if (score > hiscore) { 179 if (score > hiscore) {
178 hiscore = score; 180 hiscore = score;
179 result = sk; 181 result = sk;
182 reuseport = sk->sk_reuseport;
183 if (reuseport) {
184 phash = inet6_ehashfn(net, daddr, hnum,
185 saddr, sport);
186 matches = 1;
187 }
188 } else if (score == hiscore && reuseport) {
189 matches++;
190 if (((u64)phash * matches) >> 32 == 0)
191 result = sk;
192 phash = next_pseudo_random32(phash);
180 } 193 }
181 } 194 }
182 /* 195 /*
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3701c3c6e2eb..06087e58738a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -834,7 +834,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
834 * no RST generated if md5 hash doesn't match. 834 * no RST generated if md5 hash doesn't match.
835 */ 835 */
836 sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), 836 sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
837 &tcp_hashinfo, &ipv6h->daddr, 837 &tcp_hashinfo, &ipv6h->saddr,
838 th->source, &ipv6h->daddr,
838 ntohs(th->source), inet6_iif(skb)); 839 ntohs(th->source), inet6_iif(skb));
839 if (!sk1) 840 if (!sk1)
840 return; 841 return;
@@ -1598,6 +1599,7 @@ do_time_wait:
1598 struct sock *sk2; 1599 struct sock *sk2;
1599 1600
1600 sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, 1601 sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
1602 &ipv6_hdr(skb)->saddr, th->source,
1601 &ipv6_hdr(skb)->daddr, 1603 &ipv6_hdr(skb)->daddr,
1602 ntohs(th->dest), inet6_iif(skb)); 1604 ntohs(th->dest), inet6_iif(skb));
1603 if (sk2 != NULL) { 1605 if (sk2 != NULL) {