diff options
author | David S. Miller <davem@davemloft.net> | 2018-03-22 15:12:57 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-03-22 15:12:57 -0400 |
commit | 1a2e10a24039147047cc21759ec57314c333c1ac (patch) | |
tree | 0b09f0de0ff410fef91ad0224a241b322bd0eb05 | |
parent | f2d254fac13cc7c86871ea607c4ab1afa7f13e2e (diff) | |
parent | d9ff3049739e349b5380b96226f9ad766741773d (diff) |
Merge branch 'Rework-ip_ra_chain-protection'
Kirill Tkhai says:
====================
Rework ip_ra_chain protection
Commit 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control"
made raw_close() take rtnl_lock(). This function is called
on every RAW socket destruction, so rtnl_mutex is taken
every time. This scales very badly. I observe cleanup_net()
spending a lot of time in rtnl_lock(), and raw_close() is one
of the biggest rtnl users (since we have percpu net->ipv4.icmp_sk).
This patchset reworks the locking: reverts the problem commit
and its descendant, and introduces rtnl-independent locking.
This may have a continuation, and someone may work on killing
rtnl_lock() in mrtsock_destruct() in the future.
v3: Change patches order: [2/5] and [3/5].
v2: Fix sparse warning [4/5], as reported by kbuild test robot.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/ip.h | 13 | ||||
-rw-r--r-- | include/net/netns/ipv4.h | 2 | ||||
-rw-r--r-- | net/core/net_namespace.c | 1 | ||||
-rw-r--r-- | net/ipv4/ip_input.c | 5 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 34 | ||||
-rw-r--r-- | net/ipv4/ipmr.c | 11 | ||||
-rw-r--r-- | net/ipv4/raw.c | 2 |
7 files changed, 38 insertions, 30 deletions
diff --git a/include/net/ip.h b/include/net/ip.h index fe63ba95d12b..d53b5a9eae34 100644 --- a/include/net/ip.h +++ b/include/net/ip.h | |||
@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb) | |||
91 | return 0; | 91 | return 0; |
92 | } | 92 | } |
93 | 93 | ||
94 | /* Special input handler for packets caught by router alert option. | ||
95 | They are selected only by protocol field, and then processed likely | ||
96 | local ones; but only if someone wants them! Otherwise, router | ||
97 | not running rsvpd will kill RSVP. | ||
98 | |||
99 | It is user level problem, what it will make with them. | ||
100 | I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), | ||
101 | but receiver should be enough clever f.e. to forward mtrace requests, | ||
102 | sent to multicast group to reach destination designated router. | ||
103 | */ | ||
104 | |||
94 | struct ip_ra_chain { | 105 | struct ip_ra_chain { |
95 | struct ip_ra_chain __rcu *next; | 106 | struct ip_ra_chain __rcu *next; |
96 | struct sock *sk; | 107 | struct sock *sk; |
@@ -101,8 +112,6 @@ struct ip_ra_chain { | |||
101 | struct rcu_head rcu; | 112 | struct rcu_head rcu; |
102 | }; | 113 | }; |
103 | 114 | ||
104 | extern struct ip_ra_chain __rcu *ip_ra_chain; | ||
105 | |||
106 | /* IP flags. */ | 115 | /* IP flags. */ |
107 | #define IP_CE 0x8000 /* Flag: "Congestion" */ | 116 | #define IP_CE 0x8000 /* Flag: "Congestion" */ |
108 | #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ | 117 | #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ |
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 382bfd7583cf..8491bc9c86b1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h | |||
@@ -49,6 +49,8 @@ struct netns_ipv4 { | |||
49 | #endif | 49 | #endif |
50 | struct ipv4_devconf *devconf_all; | 50 | struct ipv4_devconf *devconf_all; |
51 | struct ipv4_devconf *devconf_dflt; | 51 | struct ipv4_devconf *devconf_dflt; |
52 | struct ip_ra_chain __rcu *ra_chain; | ||
53 | struct mutex ra_mutex; | ||
52 | #ifdef CONFIG_IP_MULTIPLE_TABLES | 54 | #ifdef CONFIG_IP_MULTIPLE_TABLES |
53 | struct fib_rules_ops *rules_ops; | 55 | struct fib_rules_ops *rules_ops; |
54 | bool fib_has_custom_rules; | 56 | bool fib_has_custom_rules; |
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c340d5cfbdec..95ba2c53bd9a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c | |||
@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) | |||
301 | net->user_ns = user_ns; | 301 | net->user_ns = user_ns; |
302 | idr_init(&net->netns_ids); | 302 | idr_init(&net->netns_ids); |
303 | spin_lock_init(&net->nsid_lock); | 303 | spin_lock_init(&net->nsid_lock); |
304 | mutex_init(&net->ipv4.ra_mutex); | ||
304 | 305 | ||
305 | list_for_each_entry(ops, &pernet_list, list) { | 306 | list_for_each_entry(ops, &pernet_list, list) { |
306 | error = ops_init(ops, net); | 307 | error = ops_init(ops, net); |
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 57fc13c6ab2b..7582713dd18f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c | |||
@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) | |||
159 | struct net_device *dev = skb->dev; | 159 | struct net_device *dev = skb->dev; |
160 | struct net *net = dev_net(dev); | 160 | struct net *net = dev_net(dev); |
161 | 161 | ||
162 | for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { | 162 | for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { |
163 | struct sock *sk = ra->sk; | 163 | struct sock *sk = ra->sk; |
164 | 164 | ||
165 | /* If socket is bound to an interface, only report | 165 | /* If socket is bound to an interface, only report |
@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) | |||
167 | */ | 167 | */ |
168 | if (sk && inet_sk(sk)->inet_num == protocol && | 168 | if (sk && inet_sk(sk)->inet_num == protocol && |
169 | (!sk->sk_bound_dev_if || | 169 | (!sk->sk_bound_dev_if || |
170 | sk->sk_bound_dev_if == dev->ifindex) && | 170 | sk->sk_bound_dev_if == dev->ifindex)) { |
171 | net_eq(sock_net(sk), net)) { | ||
172 | if (ip_is_fragment(ip_hdr(skb))) { | 171 | if (ip_is_fragment(ip_hdr(skb))) { |
173 | if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) | 172 | if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) |
174 | return true; | 173 | return true; |
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 74c962b9b09c..5ad2d8ed3a3f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c | |||
@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, | |||
322 | return 0; | 322 | return 0; |
323 | } | 323 | } |
324 | 324 | ||
325 | |||
326 | /* Special input handler for packets caught by router alert option. | ||
327 | They are selected only by protocol field, and then processed likely | ||
328 | local ones; but only if someone wants them! Otherwise, router | ||
329 | not running rsvpd will kill RSVP. | ||
330 | |||
331 | It is user level problem, what it will make with them. | ||
332 | I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), | ||
333 | but receiver should be enough clever f.e. to forward mtrace requests, | ||
334 | sent to multicast group to reach destination designated router. | ||
335 | */ | ||
336 | struct ip_ra_chain __rcu *ip_ra_chain; | ||
337 | |||
338 | |||
339 | static void ip_ra_destroy_rcu(struct rcu_head *head) | 325 | static void ip_ra_destroy_rcu(struct rcu_head *head) |
340 | { | 326 | { |
341 | struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); | 327 | struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); |
@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on, | |||
349 | { | 335 | { |
350 | struct ip_ra_chain *ra, *new_ra; | 336 | struct ip_ra_chain *ra, *new_ra; |
351 | struct ip_ra_chain __rcu **rap; | 337 | struct ip_ra_chain __rcu **rap; |
338 | struct net *net = sock_net(sk); | ||
352 | 339 | ||
353 | if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) | 340 | if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) |
354 | return -EINVAL; | 341 | return -EINVAL; |
355 | 342 | ||
356 | new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; | 343 | new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; |
357 | 344 | ||
358 | for (rap = &ip_ra_chain; | 345 | mutex_lock(&net->ipv4.ra_mutex); |
359 | (ra = rtnl_dereference(*rap)) != NULL; | 346 | for (rap = &net->ipv4.ra_chain; |
347 | (ra = rcu_dereference_protected(*rap, | ||
348 | lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; | ||
360 | rap = &ra->next) { | 349 | rap = &ra->next) { |
361 | if (ra->sk == sk) { | 350 | if (ra->sk == sk) { |
362 | if (on) { | 351 | if (on) { |
352 | mutex_unlock(&net->ipv4.ra_mutex); | ||
363 | kfree(new_ra); | 353 | kfree(new_ra); |
364 | return -EADDRINUSE; | 354 | return -EADDRINUSE; |
365 | } | 355 | } |
366 | /* dont let ip_call_ra_chain() use sk again */ | 356 | /* dont let ip_call_ra_chain() use sk again */ |
367 | ra->sk = NULL; | 357 | ra->sk = NULL; |
368 | RCU_INIT_POINTER(*rap, ra->next); | 358 | RCU_INIT_POINTER(*rap, ra->next); |
359 | mutex_unlock(&net->ipv4.ra_mutex); | ||
369 | 360 | ||
370 | if (ra->destructor) | 361 | if (ra->destructor) |
371 | ra->destructor(sk); | 362 | ra->destructor(sk); |
@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, | |||
379 | return 0; | 370 | return 0; |
380 | } | 371 | } |
381 | } | 372 | } |
382 | if (!new_ra) | 373 | if (!new_ra) { |
374 | mutex_unlock(&net->ipv4.ra_mutex); | ||
383 | return -ENOBUFS; | 375 | return -ENOBUFS; |
376 | } | ||
384 | new_ra->sk = sk; | 377 | new_ra->sk = sk; |
385 | new_ra->destructor = destructor; | 378 | new_ra->destructor = destructor; |
386 | 379 | ||
387 | RCU_INIT_POINTER(new_ra->next, ra); | 380 | RCU_INIT_POINTER(new_ra->next, ra); |
388 | rcu_assign_pointer(*rap, new_ra); | 381 | rcu_assign_pointer(*rap, new_ra); |
389 | sock_hold(sk); | 382 | sock_hold(sk); |
383 | mutex_unlock(&net->ipv4.ra_mutex); | ||
390 | 384 | ||
391 | return 0; | 385 | return 0; |
392 | } | 386 | } |
@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname) | |||
586 | case MCAST_LEAVE_GROUP: | 580 | case MCAST_LEAVE_GROUP: |
587 | case MCAST_LEAVE_SOURCE_GROUP: | 581 | case MCAST_LEAVE_SOURCE_GROUP: |
588 | case MCAST_UNBLOCK_SOURCE: | 582 | case MCAST_UNBLOCK_SOURCE: |
589 | case IP_ROUTER_ALERT: | ||
590 | return true; | 583 | return true; |
591 | } | 584 | } |
592 | return false; | 585 | return false; |
@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, | |||
639 | 632 | ||
640 | /* If optlen==0, it is equivalent to val == 0 */ | 633 | /* If optlen==0, it is equivalent to val == 0 */ |
641 | 634 | ||
635 | if (optname == IP_ROUTER_ALERT) | ||
636 | return ip_ra_control(sk, val ? 1 : 0, NULL); | ||
642 | if (ip_mroute_opt(optname)) | 637 | if (ip_mroute_opt(optname)) |
643 | return ip_mroute_setsockopt(sk, optname, optval, optlen); | 638 | return ip_mroute_setsockopt(sk, optname, optval, optlen); |
644 | 639 | ||
@@ -1149,9 +1144,6 @@ mc_msf_out: | |||
1149 | goto e_inval; | 1144 | goto e_inval; |
1150 | inet->mc_all = val; | 1145 | inet->mc_all = val; |
1151 | break; | 1146 | break; |
1152 | case IP_ROUTER_ALERT: | ||
1153 | err = ip_ra_control(sk, val ? 1 : 0, NULL); | ||
1154 | break; | ||
1155 | 1147 | ||
1156 | case IP_FREEBIND: | 1148 | case IP_FREEBIND: |
1157 | if (optlen < 1) | 1149 | if (optlen < 1) |
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d752a70855d8..f6be5db16da2 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c | |||
@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk) | |||
1399 | struct net *net = sock_net(sk); | 1399 | struct net *net = sock_net(sk); |
1400 | struct mr_table *mrt; | 1400 | struct mr_table *mrt; |
1401 | 1401 | ||
1402 | ASSERT_RTNL(); | 1402 | rtnl_lock(); |
1403 | ipmr_for_each_table(mrt, net) { | 1403 | ipmr_for_each_table(mrt, net) { |
1404 | if (sk == rtnl_dereference(mrt->mroute_sk)) { | 1404 | if (sk == rtnl_dereference(mrt->mroute_sk)) { |
1405 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; | 1405 | IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; |
@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk) | |||
1411 | mroute_clean_tables(mrt, false); | 1411 | mroute_clean_tables(mrt, false); |
1412 | } | 1412 | } |
1413 | } | 1413 | } |
1414 | rtnl_unlock(); | ||
1414 | } | 1415 | } |
1415 | 1416 | ||
1416 | /* Socket options and virtual interface manipulation. The whole | 1417 | /* Socket options and virtual interface manipulation. The whole |
@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, | |||
1475 | if (sk != rcu_access_pointer(mrt->mroute_sk)) { | 1476 | if (sk != rcu_access_pointer(mrt->mroute_sk)) { |
1476 | ret = -EACCES; | 1477 | ret = -EACCES; |
1477 | } else { | 1478 | } else { |
1479 | /* We need to unlock here because mrtsock_destruct takes | ||
1480 | * care of rtnl itself and we can't change that due to | ||
1481 | * the IP_ROUTER_ALERT setsockopt which runs without it. | ||
1482 | */ | ||
1483 | rtnl_unlock(); | ||
1478 | ret = ip_ra_control(sk, 0, NULL); | 1484 | ret = ip_ra_control(sk, 0, NULL); |
1479 | goto out_unlock; | 1485 | goto out; |
1480 | } | 1486 | } |
1481 | break; | 1487 | break; |
1482 | case MRT_ADD_VIF: | 1488 | case MRT_ADD_VIF: |
@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, | |||
1588 | } | 1594 | } |
1589 | out_unlock: | 1595 | out_unlock: |
1590 | rtnl_unlock(); | 1596 | rtnl_unlock(); |
1597 | out: | ||
1591 | return ret; | 1598 | return ret; |
1592 | } | 1599 | } |
1593 | 1600 | ||
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 54648d20bf0f..720bef7da2f6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout) | |||
711 | /* | 711 | /* |
712 | * Raw sockets may have direct kernel references. Kill them. | 712 | * Raw sockets may have direct kernel references. Kill them. |
713 | */ | 713 | */ |
714 | rtnl_lock(); | ||
715 | ip_ra_control(sk, 0, NULL); | 714 | ip_ra_control(sk, 0, NULL); |
716 | rtnl_unlock(); | ||
717 | 715 | ||
718 | sk_common_release(sk); | 716 | sk_common_release(sk); |
719 | } | 717 | } |