diff options
author | Craig Gallek <kraig@google.com> | 2016-04-12 13:11:25 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-04-14 21:14:03 -0400 |
commit | d894ba18d4e449b3a7f6eb491f16c9e02933736e (patch) | |
tree | 9a0987a506c9caa46daf10a58abd2bc141b06745 | |
parent | c5b5343cfbc9f46af65033fa4f407d7b7d98371d (diff) |
soreuseport: fix ordering for mixed v4/v6 sockets
With the SO_REUSEPORT socket option, it is possible to create sockets
in the AF_INET and AF_INET6 domains which are bound to the same IPv4 address.
This is only possible with SO_REUSEPORT and when not using IPV6_V6ONLY on
the AF_INET6 sockets.
Prior to the commits referenced below, an incoming IPv4 packet would
always be routed to a socket of type AF_INET when this mixed mode was used.
After those changes, the same packet would be routed to the most recently
bound socket (if this happened to be an AF_INET6 socket, it would
have an IPv4-mapped IPv6 address).
The change in behavior occurred because the recent SO_REUSEPORT optimizations
short-circuit the socket scoring logic as soon as they find a match. They
did not take into account the scoring logic that favors AF_INET sockets
over AF_INET6 sockets in the event of a tie.
To fix this problem, this patch changes the insertion order of AF_INET
and AF_INET6 sockets in the TCP and UDP socket lists when the sockets
have SO_REUSEPORT set. AF_INET sockets will be inserted at the head of the
list and AF_INET6 sockets with SO_REUSEPORT set will always be inserted at
the tail of the list. This will force AF_INET sockets to always be
considered first.
Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
Fixes: 125e80b88687 ("soreuseport: fast reuseport TCP socket selection")
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/linux/rculist_nulls.h | 39 | ||||
-rw-r--r-- | include/net/sock.h | 6 | ||||
-rw-r--r-- | net/ipv4/udp.c | 9 |
3 files changed, 51 insertions, 3 deletions
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 1c33dd7da4a7..4ae95f7e8597 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h | |||
@@ -98,6 +98,45 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, | |||
98 | if (!is_a_nulls(first)) | 98 | if (!is_a_nulls(first)) |
99 | first->pprev = &n->next; | 99 | first->pprev = &n->next; |
100 | } | 100 | } |
101 | |||
102 | /** | ||
103 | * hlist_nulls_add_tail_rcu | ||
104 | * @n: the element to add to the hash list. | ||
105 | * @h: the list to add to. | ||
106 | * | ||
107 | * Description: | ||
108 | * Adds the specified element to the end of the specified hlist_nulls, | ||
109 | * while permitting racing traversals. NOTE: tail insertion requires | ||
110 | * list traversal. | ||
111 | * | ||
112 | * The caller must take whatever precautions are necessary | ||
113 | * (such as holding appropriate locks) to avoid racing | ||
114 | * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() | ||
115 | * or hlist_nulls_del_rcu(), running on this same list. | ||
116 | * However, it is perfectly legal to run concurrently with | ||
117 | * the _rcu list-traversal primitives, such as | ||
118 | * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency | ||
119 | * problems on Alpha CPUs. Regardless of the type of CPU, the | ||
120 | * list-traversal primitive must be guarded by rcu_read_lock(). | ||
121 | */ | ||
122 | static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, | ||
123 | struct hlist_nulls_head *h) | ||
124 | { | ||
125 | struct hlist_nulls_node *i, *last = NULL; | ||
126 | |||
127 | for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i); | ||
128 | i = hlist_nulls_next_rcu(i)) | ||
129 | last = i; | ||
130 | |||
131 | if (last) { | ||
132 | n->next = last->next; | ||
133 | n->pprev = &last->next; | ||
134 | rcu_assign_pointer(hlist_nulls_next_rcu(last), n); | ||
135 | } else { | ||
136 | hlist_nulls_add_head_rcu(n, h); | ||
137 | } | ||
138 | } | ||
139 | |||
101 | /** | 140 | /** |
102 | * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type | 141 | * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type |
103 | * @tpos: the type * to use as a loop cursor. | 142 | * @tpos: the type * to use as a loop cursor. |
diff --git a/include/net/sock.h b/include/net/sock.h index 255d3e03727b..121ffc115c4f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -630,7 +630,11 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) | |||
630 | 630 | ||
631 | static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) | 631 | static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) |
632 | { | 632 | { |
633 | hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); | 633 | if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && |
634 | sk->sk_family == AF_INET6) | ||
635 | hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); | ||
636 | else | ||
637 | hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); | ||
634 | } | 638 | } |
635 | 639 | ||
636 | static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) | 640 | static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 08eed5e16df0..a2e7f55a1f61 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -339,8 +339,13 @@ found: | |||
339 | 339 | ||
340 | hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); | 340 | hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); |
341 | spin_lock(&hslot2->lock); | 341 | spin_lock(&hslot2->lock); |
342 | hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, | 342 | if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && |
343 | &hslot2->head); | 343 | sk->sk_family == AF_INET6) |
344 | hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, | ||
345 | &hslot2->head); | ||
346 | else | ||
347 | hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, | ||
348 | &hslot2->head); | ||
344 | hslot2->count++; | 349 | hslot2->count++; |
345 | spin_unlock(&hslot2->lock); | 350 | spin_unlock(&hslot2->lock); |
346 | } | 351 | } |