aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Craig Gallek <kraig@google.com> 2016-04-12 13:11:25 -0400
committer David S. Miller <davem@davemloft.net> 2016-04-14 21:14:03 -0400
commit d894ba18d4e449b3a7f6eb491f16c9e02933736e (patch)
tree 9a0987a506c9caa46daf10a58abd2bc141b06745
parent c5b5343cfbc9f46af65033fa4f407d7b7d98371d (diff)
soreuseport: fix ordering for mixed v4/v6 sockets
With the SO_REUSEPORT socket option, it is possible to create sockets in the AF_INET and AF_INET6 domains which are bound to the same IPv4 address. This is only possible with SO_REUSEPORT and when not using IPV6_V6ONLY on the AF_INET6 sockets.

Prior to the commits referenced below, an incoming IPv4 packet would always be routed to a socket of type AF_INET when this mixed-mode was used. After those changes, the same packet would be routed to the most recently bound socket (if this happened to be an AF_INET6 socket, it would have an IPv4 mapped IPv6 address).

The change in behavior occurred because the recent SO_REUSEPORT optimizations short-circuit the socket scoring logic as soon as they find a match. They did not take into account the scoring logic that favors AF_INET sockets over AF_INET6 sockets in the event of a tie.

To fix this problem, this patch changes the insertion order of AF_INET and AF_INET6 addresses in the TCP and UDP socket lists when the sockets have SO_REUSEPORT set. AF_INET sockets will be inserted at the head of the list and AF_INET6 sockets with SO_REUSEPORT set will always be inserted at the tail of the list. This will force AF_INET sockets to always be considered first.

Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
Fixes: 125e80b88687 ("soreuseport: fast reuseport TCP socket selection")
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/linux/rculist_nulls.h 39
-rw-r--r-- include/net/sock.h 6
-rw-r--r-- net/ipv4/udp.c 9
3 files changed, 51 insertions, 3 deletions
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 1c33dd7da4a7..4ae95f7e8597 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -98,6 +98,45 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
98 if (!is_a_nulls(first)) 98 if (!is_a_nulls(first))
99 first->pprev = &n->next; 99 first->pprev = &n->next;
100} 100}
101
102/**
103 * hlist_nulls_add_tail_rcu
104 * @n: the element to add to the hash list.
105 * @h: the list to add to.
106 *
107 * Description:
108 * Adds the specified element to the end of the specified hlist_nulls,
109 * while permitting racing traversals. NOTE: tail insertion requires
110 * list traversal.
111 *
112 * The caller must take whatever precautions are necessary
113 * (such as holding appropriate locks) to avoid racing
114 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
115 * or hlist_nulls_del_rcu(), running on this same list.
116 * However, it is perfectly legal to run concurrently with
117 * the _rcu list-traversal primitives, such as
118 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
119 * problems on Alpha CPUs. Regardless of the type of CPU, the
120 * list-traversal primitive must be guarded by rcu_read_lock().
121 */
122static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
123 struct hlist_nulls_head *h)
124{
125 struct hlist_nulls_node *i, *last = NULL;
126
127 for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i);
128 i = hlist_nulls_next_rcu(i))
129 last = i;
130
131 if (last) {
132 n->next = last->next;
133 n->pprev = &last->next;
134 rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
135 } else {
136 hlist_nulls_add_head_rcu(n, h);
137 }
138}
139
101/** 140/**
102 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type 141 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
103 * @tpos: the type * to use as a loop cursor. 142 * @tpos: the type * to use as a loop cursor.
diff --git a/include/net/sock.h b/include/net/sock.h
index 255d3e03727b..121ffc115c4f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -630,7 +630,11 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
630 630
631static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 631static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
632{ 632{
633 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); 633 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
634 sk->sk_family == AF_INET6)
635 hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
636 else
637 hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
634} 638}
635 639
636static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) 640static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 08eed5e16df0..a2e7f55a1f61 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -339,8 +339,13 @@ found:
339 339
340 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); 340 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
341 spin_lock(&hslot2->lock); 341 spin_lock(&hslot2->lock);
342 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, 342 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
343 &hslot2->head); 343 sk->sk_family == AF_INET6)
344 hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
345 &hslot2->head);
346 else
347 hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
348 &hslot2->head);
344 hslot2->count++; 349 hslot2->count++;
345 spin_unlock(&hslot2->lock); 350 spin_unlock(&hslot2->lock);
346 } 351 }