aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorTom Herbert <therbert@google.com>2013-01-22 04:50:32 -0500
committerDavid S. Miller <davem@davemloft.net>2013-01-23 13:44:01 -0500
commitba418fa357a7b3c9d477f4706c6c7c96ddbd1360 (patch)
tree127ab1b2e773a2c50d217565b6413dd8be9e49a5 /net/ipv4
parentda5e36308d9f7151845018369148201a5d28b46d (diff)
soreuseport: UDP/IPv4 implementation
Allow multiple UDP sockets to bind to the same port. The motivation for soreuseport would be something like a DNS server.  An alternative would be to recv on the same socket from multiple threads. As in the case of TCP, the load across these threads tends to be disproportionate and we also see a lot of contention on the socket lock. Note that SO_REUSEADDR already allows multiple UDP sockets to bind to the same port, however there is no provision to prevent hijacking and nothing to distribute packets across all the sockets sharing the same bound port.  This patch does not change the semantics of SO_REUSEADDR, but provides usable functionality of it for unicast. Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/udp.c61
1 files changed, 43 insertions, 18 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf6158f1f46b..e0610e4b5158 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -139,6 +139,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
139{ 139{
140 struct sock *sk2; 140 struct sock *sk2;
141 struct hlist_nulls_node *node; 141 struct hlist_nulls_node *node;
142 kuid_t uid = sock_i_uid(sk);
142 143
143 sk_nulls_for_each(sk2, node, &hslot->head) 144 sk_nulls_for_each(sk2, node, &hslot->head)
144 if (net_eq(sock_net(sk2), net) && 145 if (net_eq(sock_net(sk2), net) &&
@@ -147,6 +148,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
147 (!sk2->sk_reuse || !sk->sk_reuse) && 148 (!sk2->sk_reuse || !sk->sk_reuse) &&
148 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || 149 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
149 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 150 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
151 (!sk2->sk_reuseport || !sk->sk_reuseport ||
152 !uid_eq(uid, sock_i_uid(sk2))) &&
150 (*saddr_comp)(sk, sk2)) { 153 (*saddr_comp)(sk, sk2)) {
151 if (bitmap) 154 if (bitmap)
152 __set_bit(udp_sk(sk2)->udp_port_hash >> log, 155 __set_bit(udp_sk(sk2)->udp_port_hash >> log,
@@ -169,6 +172,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
169{ 172{
170 struct sock *sk2; 173 struct sock *sk2;
171 struct hlist_nulls_node *node; 174 struct hlist_nulls_node *node;
175 kuid_t uid = sock_i_uid(sk);
172 int res = 0; 176 int res = 0;
173 177
174 spin_lock(&hslot2->lock); 178 spin_lock(&hslot2->lock);
@@ -179,6 +183,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
179 (!sk2->sk_reuse || !sk->sk_reuse) && 183 (!sk2->sk_reuse || !sk->sk_reuse) &&
180 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || 184 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
181 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && 185 sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
186 (!sk2->sk_reuseport || !sk->sk_reuseport ||
187 !uid_eq(uid, sock_i_uid(sk2))) &&
182 (*saddr_comp)(sk, sk2)) { 188 (*saddr_comp)(sk, sk2)) {
183 res = 1; 189 res = 1;
184 break; 190 break;
@@ -337,26 +343,26 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
337 !ipv6_only_sock(sk)) { 343 !ipv6_only_sock(sk)) {
338 struct inet_sock *inet = inet_sk(sk); 344 struct inet_sock *inet = inet_sk(sk);
339 345
340 score = (sk->sk_family == PF_INET ? 1 : 0); 346 score = (sk->sk_family == PF_INET ? 2 : 1);
341 if (inet->inet_rcv_saddr) { 347 if (inet->inet_rcv_saddr) {
342 if (inet->inet_rcv_saddr != daddr) 348 if (inet->inet_rcv_saddr != daddr)
343 return -1; 349 return -1;
344 score += 2; 350 score += 4;
345 } 351 }
346 if (inet->inet_daddr) { 352 if (inet->inet_daddr) {
347 if (inet->inet_daddr != saddr) 353 if (inet->inet_daddr != saddr)
348 return -1; 354 return -1;
349 score += 2; 355 score += 4;
350 } 356 }
351 if (inet->inet_dport) { 357 if (inet->inet_dport) {
352 if (inet->inet_dport != sport) 358 if (inet->inet_dport != sport)
353 return -1; 359 return -1;
354 score += 2; 360 score += 4;
355 } 361 }
356 if (sk->sk_bound_dev_if) { 362 if (sk->sk_bound_dev_if) {
357 if (sk->sk_bound_dev_if != dif) 363 if (sk->sk_bound_dev_if != dif)
358 return -1; 364 return -1;
359 score += 2; 365 score += 4;
360 } 366 }
361 } 367 }
362 return score; 368 return score;
@@ -365,7 +371,6 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
365/* 371/*
366 * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) 372 * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
367 */ 373 */
368#define SCORE2_MAX (1 + 2 + 2 + 2)
369static inline int compute_score2(struct sock *sk, struct net *net, 374static inline int compute_score2(struct sock *sk, struct net *net,
370 __be32 saddr, __be16 sport, 375 __be32 saddr, __be16 sport,
371 __be32 daddr, unsigned int hnum, int dif) 376 __be32 daddr, unsigned int hnum, int dif)
@@ -380,21 +385,21 @@ static inline int compute_score2(struct sock *sk, struct net *net,
380 if (inet->inet_num != hnum) 385 if (inet->inet_num != hnum)
381 return -1; 386 return -1;
382 387
383 score = (sk->sk_family == PF_INET ? 1 : 0); 388 score = (sk->sk_family == PF_INET ? 2 : 1);
384 if (inet->inet_daddr) { 389 if (inet->inet_daddr) {
385 if (inet->inet_daddr != saddr) 390 if (inet->inet_daddr != saddr)
386 return -1; 391 return -1;
387 score += 2; 392 score += 4;
388 } 393 }
389 if (inet->inet_dport) { 394 if (inet->inet_dport) {
390 if (inet->inet_dport != sport) 395 if (inet->inet_dport != sport)
391 return -1; 396 return -1;
392 score += 2; 397 score += 4;
393 } 398 }
394 if (sk->sk_bound_dev_if) { 399 if (sk->sk_bound_dev_if) {
395 if (sk->sk_bound_dev_if != dif) 400 if (sk->sk_bound_dev_if != dif)
396 return -1; 401 return -1;
397 score += 2; 402 score += 4;
398 } 403 }
399 } 404 }
400 return score; 405 return score;
@@ -409,19 +414,29 @@ static struct sock *udp4_lib_lookup2(struct net *net,
409{ 414{
410 struct sock *sk, *result; 415 struct sock *sk, *result;
411 struct hlist_nulls_node *node; 416 struct hlist_nulls_node *node;
412 int score, badness; 417 int score, badness, matches = 0, reuseport = 0;
418 u32 hash = 0;
413 419
414begin: 420begin:
415 result = NULL; 421 result = NULL;
416 badness = -1; 422 badness = 0;
417 udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { 423 udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
418 score = compute_score2(sk, net, saddr, sport, 424 score = compute_score2(sk, net, saddr, sport,
419 daddr, hnum, dif); 425 daddr, hnum, dif);
420 if (score > badness) { 426 if (score > badness) {
421 result = sk; 427 result = sk;
422 badness = score; 428 badness = score;
423 if (score == SCORE2_MAX) 429 reuseport = sk->sk_reuseport;
424 goto exact_match; 430 if (reuseport) {
431 hash = inet_ehashfn(net, daddr, hnum,
432 saddr, htons(sport));
433 matches = 1;
434 }
435 } else if (score == badness && reuseport) {
436 matches++;
437 if (((u64)hash * matches) >> 32 == 0)
438 result = sk;
439 hash = next_pseudo_random32(hash);
425 } 440 }
426 } 441 }
427 /* 442 /*
@@ -431,9 +446,7 @@ begin:
431 */ 446 */
432 if (get_nulls_value(node) != slot2) 447 if (get_nulls_value(node) != slot2)
433 goto begin; 448 goto begin;
434
435 if (result) { 449 if (result) {
436exact_match:
437 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) 450 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
438 result = NULL; 451 result = NULL;
439 else if (unlikely(compute_score2(result, net, saddr, sport, 452 else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -457,7 +470,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
457 unsigned short hnum = ntohs(dport); 470 unsigned short hnum = ntohs(dport);
458 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); 471 unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
459 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; 472 struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
460 int score, badness; 473 int score, badness, matches = 0, reuseport = 0;
474 u32 hash = 0;
461 475
462 rcu_read_lock(); 476 rcu_read_lock();
463 if (hslot->count > 10) { 477 if (hslot->count > 10) {
@@ -486,13 +500,24 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
486 } 500 }
487begin: 501begin:
488 result = NULL; 502 result = NULL;
489 badness = -1; 503 badness = 0;
490 sk_nulls_for_each_rcu(sk, node, &hslot->head) { 504 sk_nulls_for_each_rcu(sk, node, &hslot->head) {
491 score = compute_score(sk, net, saddr, hnum, sport, 505 score = compute_score(sk, net, saddr, hnum, sport,
492 daddr, dport, dif); 506 daddr, dport, dif);
493 if (score > badness) { 507 if (score > badness) {
494 result = sk; 508 result = sk;
495 badness = score; 509 badness = score;
510 reuseport = sk->sk_reuseport;
511 if (reuseport) {
512 hash = inet_ehashfn(net, daddr, hnum,
513 saddr, htons(sport));
514 matches = 1;
515 }
516 } else if (score == badness && reuseport) {
517 matches++;
518 if (((u64)hash * matches) >> 32 == 0)
519 result = sk;
520 hash = next_pseudo_random32(hash);
496 } 521 }
497 } 522 }
498 /* 523 /*