aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-03-22 15:12:57 -0400
committerDavid S. Miller <davem@davemloft.net>2018-03-22 15:12:57 -0400
commit1a2e10a24039147047cc21759ec57314c333c1ac (patch)
tree0b09f0de0ff410fef91ad0224a241b322bd0eb05
parentf2d254fac13cc7c86871ea607c4ab1afa7f13e2e (diff)
parentd9ff3049739e349b5380b96226f9ad766741773d (diff)
Merge branch 'Rework-ip_ra_chain-protection'
Kirill Tkhai says: ==================== Rework ip_ra_chain protection Commit 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control" made rtnl_lock() be used in raw_close(). This function is called on every RAW socket destruction, so that rtnl_mutex is taken every time. This scales very sadly. I observe cleanup_net() spending a lot of time in rtnl_lock() and raw_close() is one of the biggest rtnl user (since we have percpu net->ipv4.icmp_sk). This patchset reworks the locking: reverts the problem commit and its descendant, and introduces rtnl-independent locking. This may have a continuation, and someone may work on killing rtnl_lock() in mrtsock_destruct() in the future. v3: Change patches order: [2/5] and [3/5]. v2: Fix sparse warning [4/5], as reported by kbuild test robot. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/ip.h13
-rw-r--r--include/net/netns/ipv4.h2
-rw-r--r--net/core/net_namespace.c1
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_sockglue.c34
-rw-r--r--net/ipv4/ipmr.c11
-rw-r--r--net/ipv4/raw.c2
7 files changed, 38 insertions, 30 deletions
diff --git a/include/net/ip.h b/include/net/ip.h
index fe63ba95d12b..d53b5a9eae34 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb)
91 return 0; 91 return 0;
92} 92}
93 93
94/* Special input handler for packets caught by router alert option.
95 They are selected only by protocol field, and then processed likely
96 local ones; but only if someone wants them! Otherwise, router
97 not running rsvpd will kill RSVP.
98
99 It is user level problem, what it will make with them.
100 I have no idea, how it will masquerade or NAT them (it is joke, joke :-)),
101 but receiver should be enough clever f.e. to forward mtrace requests,
102 sent to multicast group to reach destination designated router.
103 */
104
94struct ip_ra_chain { 105struct ip_ra_chain {
95 struct ip_ra_chain __rcu *next; 106 struct ip_ra_chain __rcu *next;
96 struct sock *sk; 107 struct sock *sk;
@@ -101,8 +112,6 @@ struct ip_ra_chain {
101 struct rcu_head rcu; 112 struct rcu_head rcu;
102}; 113};
103 114
104extern struct ip_ra_chain __rcu *ip_ra_chain;
105
106/* IP flags. */ 115/* IP flags. */
107#define IP_CE 0x8000 /* Flag: "Congestion" */ 116#define IP_CE 0x8000 /* Flag: "Congestion" */
108#define IP_DF 0x4000 /* Flag: "Don't Fragment" */ 117#define IP_DF 0x4000 /* Flag: "Don't Fragment" */
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 382bfd7583cf..8491bc9c86b1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -49,6 +49,8 @@ struct netns_ipv4 {
49#endif 49#endif
50 struct ipv4_devconf *devconf_all; 50 struct ipv4_devconf *devconf_all;
51 struct ipv4_devconf *devconf_dflt; 51 struct ipv4_devconf *devconf_dflt;
52 struct ip_ra_chain __rcu *ra_chain;
53 struct mutex ra_mutex;
52#ifdef CONFIG_IP_MULTIPLE_TABLES 54#ifdef CONFIG_IP_MULTIPLE_TABLES
53 struct fib_rules_ops *rules_ops; 55 struct fib_rules_ops *rules_ops;
54 bool fib_has_custom_rules; 56 bool fib_has_custom_rules;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c340d5cfbdec..95ba2c53bd9a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
301 net->user_ns = user_ns; 301 net->user_ns = user_ns;
302 idr_init(&net->netns_ids); 302 idr_init(&net->netns_ids);
303 spin_lock_init(&net->nsid_lock); 303 spin_lock_init(&net->nsid_lock);
304 mutex_init(&net->ipv4.ra_mutex);
304 305
305 list_for_each_entry(ops, &pernet_list, list) { 306 list_for_each_entry(ops, &pernet_list, list) {
306 error = ops_init(ops, net); 307 error = ops_init(ops, net);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 57fc13c6ab2b..7582713dd18f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
159 struct net_device *dev = skb->dev; 159 struct net_device *dev = skb->dev;
160 struct net *net = dev_net(dev); 160 struct net *net = dev_net(dev);
161 161
162 for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { 162 for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
163 struct sock *sk = ra->sk; 163 struct sock *sk = ra->sk;
164 164
165 /* If socket is bound to an interface, only report 165 /* If socket is bound to an interface, only report
@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
167 */ 167 */
168 if (sk && inet_sk(sk)->inet_num == protocol && 168 if (sk && inet_sk(sk)->inet_num == protocol &&
169 (!sk->sk_bound_dev_if || 169 (!sk->sk_bound_dev_if ||
170 sk->sk_bound_dev_if == dev->ifindex) && 170 sk->sk_bound_dev_if == dev->ifindex)) {
171 net_eq(sock_net(sk), net)) {
172 if (ip_is_fragment(ip_hdr(skb))) { 171 if (ip_is_fragment(ip_hdr(skb))) {
173 if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) 172 if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
174 return true; 173 return true;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 74c962b9b09c..5ad2d8ed3a3f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
322 return 0; 322 return 0;
323} 323}
324 324
325
326/* Special input handler for packets caught by router alert option.
327 They are selected only by protocol field, and then processed likely
328 local ones; but only if someone wants them! Otherwise, router
329 not running rsvpd will kill RSVP.
330
331 It is user level problem, what it will make with them.
332 I have no idea, how it will masquerade or NAT them (it is joke, joke :-)),
333 but receiver should be enough clever f.e. to forward mtrace requests,
334 sent to multicast group to reach destination designated router.
335 */
336struct ip_ra_chain __rcu *ip_ra_chain;
337
338
339static void ip_ra_destroy_rcu(struct rcu_head *head) 325static void ip_ra_destroy_rcu(struct rcu_head *head)
340{ 326{
341 struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); 327 struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on,
349{ 335{
350 struct ip_ra_chain *ra, *new_ra; 336 struct ip_ra_chain *ra, *new_ra;
351 struct ip_ra_chain __rcu **rap; 337 struct ip_ra_chain __rcu **rap;
338 struct net *net = sock_net(sk);
352 339
353 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) 340 if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
354 return -EINVAL; 341 return -EINVAL;
355 342
356 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; 343 new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
357 344
358 for (rap = &ip_ra_chain; 345 mutex_lock(&net->ipv4.ra_mutex);
359 (ra = rtnl_dereference(*rap)) != NULL; 346 for (rap = &net->ipv4.ra_chain;
347 (ra = rcu_dereference_protected(*rap,
348 lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
360 rap = &ra->next) { 349 rap = &ra->next) {
361 if (ra->sk == sk) { 350 if (ra->sk == sk) {
362 if (on) { 351 if (on) {
352 mutex_unlock(&net->ipv4.ra_mutex);
363 kfree(new_ra); 353 kfree(new_ra);
364 return -EADDRINUSE; 354 return -EADDRINUSE;
365 } 355 }
366 /* dont let ip_call_ra_chain() use sk again */ 356 /* dont let ip_call_ra_chain() use sk again */
367 ra->sk = NULL; 357 ra->sk = NULL;
368 RCU_INIT_POINTER(*rap, ra->next); 358 RCU_INIT_POINTER(*rap, ra->next);
359 mutex_unlock(&net->ipv4.ra_mutex);
369 360
370 if (ra->destructor) 361 if (ra->destructor)
371 ra->destructor(sk); 362 ra->destructor(sk);
@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
379 return 0; 370 return 0;
380 } 371 }
381 } 372 }
382 if (!new_ra) 373 if (!new_ra) {
374 mutex_unlock(&net->ipv4.ra_mutex);
383 return -ENOBUFS; 375 return -ENOBUFS;
376 }
384 new_ra->sk = sk; 377 new_ra->sk = sk;
385 new_ra->destructor = destructor; 378 new_ra->destructor = destructor;
386 379
387 RCU_INIT_POINTER(new_ra->next, ra); 380 RCU_INIT_POINTER(new_ra->next, ra);
388 rcu_assign_pointer(*rap, new_ra); 381 rcu_assign_pointer(*rap, new_ra);
389 sock_hold(sk); 382 sock_hold(sk);
383 mutex_unlock(&net->ipv4.ra_mutex);
390 384
391 return 0; 385 return 0;
392} 386}
@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname)
586 case MCAST_LEAVE_GROUP: 580 case MCAST_LEAVE_GROUP:
587 case MCAST_LEAVE_SOURCE_GROUP: 581 case MCAST_LEAVE_SOURCE_GROUP:
588 case MCAST_UNBLOCK_SOURCE: 582 case MCAST_UNBLOCK_SOURCE:
589 case IP_ROUTER_ALERT:
590 return true; 583 return true;
591 } 584 }
592 return false; 585 return false;
@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
639 632
640 /* If optlen==0, it is equivalent to val == 0 */ 633 /* If optlen==0, it is equivalent to val == 0 */
641 634
635 if (optname == IP_ROUTER_ALERT)
636 return ip_ra_control(sk, val ? 1 : 0, NULL);
642 if (ip_mroute_opt(optname)) 637 if (ip_mroute_opt(optname))
643 return ip_mroute_setsockopt(sk, optname, optval, optlen); 638 return ip_mroute_setsockopt(sk, optname, optval, optlen);
644 639
@@ -1149,9 +1144,6 @@ mc_msf_out:
1149 goto e_inval; 1144 goto e_inval;
1150 inet->mc_all = val; 1145 inet->mc_all = val;
1151 break; 1146 break;
1152 case IP_ROUTER_ALERT:
1153 err = ip_ra_control(sk, val ? 1 : 0, NULL);
1154 break;
1155 1147
1156 case IP_FREEBIND: 1148 case IP_FREEBIND:
1157 if (optlen < 1) 1149 if (optlen < 1)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d752a70855d8..f6be5db16da2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
1399 struct net *net = sock_net(sk); 1399 struct net *net = sock_net(sk);
1400 struct mr_table *mrt; 1400 struct mr_table *mrt;
1401 1401
1402 ASSERT_RTNL(); 1402 rtnl_lock();
1403 ipmr_for_each_table(mrt, net) { 1403 ipmr_for_each_table(mrt, net) {
1404 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1404 if (sk == rtnl_dereference(mrt->mroute_sk)) {
1405 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; 1405 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
1411 mroute_clean_tables(mrt, false); 1411 mroute_clean_tables(mrt, false);
1412 } 1412 }
1413 } 1413 }
1414 rtnl_unlock();
1414} 1415}
1415 1416
1416/* Socket options and virtual interface manipulation. The whole 1417/* Socket options and virtual interface manipulation. The whole
@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1475 if (sk != rcu_access_pointer(mrt->mroute_sk)) { 1476 if (sk != rcu_access_pointer(mrt->mroute_sk)) {
1476 ret = -EACCES; 1477 ret = -EACCES;
1477 } else { 1478 } else {
1479 /* We need to unlock here because mrtsock_destruct takes
1480 * care of rtnl itself and we can't change that due to
1481 * the IP_ROUTER_ALERT setsockopt which runs without it.
1482 */
1483 rtnl_unlock();
1478 ret = ip_ra_control(sk, 0, NULL); 1484 ret = ip_ra_control(sk, 0, NULL);
1479 goto out_unlock; 1485 goto out;
1480 } 1486 }
1481 break; 1487 break;
1482 case MRT_ADD_VIF: 1488 case MRT_ADD_VIF:
@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
1588 } 1594 }
1589out_unlock: 1595out_unlock:
1590 rtnl_unlock(); 1596 rtnl_unlock();
1597out:
1591 return ret; 1598 return ret;
1592} 1599}
1593 1600
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 54648d20bf0f..720bef7da2f6 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
711 /* 711 /*
712 * Raw sockets may have direct kernel references. Kill them. 712 * Raw sockets may have direct kernel references. Kill them.
713 */ 713 */
714 rtnl_lock();
715 ip_ra_control(sk, 0, NULL); 714 ip_ra_control(sk, 0, NULL);
716 rtnl_unlock();
717 715
718 sk_common_release(sk); 716 sk_common_release(sk);
719} 717}