diff options
author | Kirill Tkhai <ktkhai@virtuozzo.com> | 2018-03-29 12:20:32 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-03-29 13:47:53 -0400 |
commit | f0b07bb151b098d291fd1fd71ef7a2df56fb124a (patch) | |
tree | 24f28ec5ec61e4b0950fef35da79853357a34afb | |
parent | 906edee91e79af5a348f1ad1b3f9b4b948db3db7 (diff) |
net: Introduce net_rwsem to protect net_namespace_list
rtnl_lock() is used everywhere, and contention is very high.
When someone wants to iterate over alive net namespaces,
there is no way to do that without taking an exclusive lock.
But the exclusive rtnl_lock() in such places is overkill,
and it just increases the contention. Yes, there is already
for_each_net_rcu() in the kernel, but it requires rcu_read_lock(),
so the iteration cannot sleep. Also, sometimes it is necessary
to really prevent net_namespace_list from growing, so
for_each_net_rcu() does not fit there.
This patch introduces a new rw_semaphore, net_rwsem, which will be
used instead of rtnl_mutex to protect net_namespace_list. It is
sleepable and allows non-exclusive iteration over the net
namespace list. It makes it possible to stop using rtnl_lock()
in several places (which is done in the next patches) and reduces
the time we hold rtnl_mutex. Here we just add the new lock;
the explanations of why rtnl_lock() can be removed in each place
are in the next patches.
Fine-grained locks are generally better than one big lock,
so let's do that with net_namespace_list, while the situation
allows it.
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/infiniband/core/roce_gid_mgmt.c | 2 | ||||
-rw-r--r-- | include/linux/rtnetlink.h | 1 | ||||
-rw-r--r-- | include/net/net_namespace.h | 1 | ||||
-rw-r--r-- | net/core/dev.c | 5 | ||||
-rw-r--r-- | net/core/fib_notifier.c | 2 | ||||
-rw-r--r-- | net/core/net_namespace.c | 18 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 5 | ||||
-rw-r--r-- | net/netfilter/nf_conntrack_core.c | 2 | ||||
-rw-r--r-- | net/openvswitch/datapath.c | 2 | ||||
-rw-r--r-- | net/wireless/wext-core.c | 2 | ||||
-rw-r--r-- | security/selinux/include/xfrm.h | 2 |
11 files changed, 37 insertions, 5 deletions
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 5a52ec77940a..cc2966380c0c 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c | |||
@@ -403,10 +403,12 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, | |||
403 | * our feet | 403 | * our feet |
404 | */ | 404 | */ |
405 | rtnl_lock(); | 405 | rtnl_lock(); |
406 | down_read(&net_rwsem); | ||
406 | for_each_net(net) | 407 | for_each_net(net) |
407 | for_each_netdev(net, ndev) | 408 | for_each_netdev(net, ndev) |
408 | if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) | 409 | if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) |
409 | add_netdev_ips(ib_dev, port, rdma_ndev, ndev); | 410 | add_netdev_ips(ib_dev, port, rdma_ndev, ndev); |
411 | up_read(&net_rwsem); | ||
410 | rtnl_unlock(); | 412 | rtnl_unlock(); |
411 | } | 413 | } |
412 | 414 | ||
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c7d1e4689325..5225832bd6ff 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h | |||
@@ -37,6 +37,7 @@ extern int rtnl_lock_killable(void); | |||
37 | 37 | ||
38 | extern wait_queue_head_t netdev_unregistering_wq; | 38 | extern wait_queue_head_t netdev_unregistering_wq; |
39 | extern struct rw_semaphore pernet_ops_rwsem; | 39 | extern struct rw_semaphore pernet_ops_rwsem; |
40 | extern struct rw_semaphore net_rwsem; | ||
40 | 41 | ||
41 | #ifdef CONFIG_PROVE_LOCKING | 42 | #ifdef CONFIG_PROVE_LOCKING |
42 | extern bool lockdep_rtnl_is_held(void); | 43 | extern bool lockdep_rtnl_is_held(void); |
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1ab4f920f109..47e35cce3b64 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h | |||
@@ -291,6 +291,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) | |||
291 | #endif | 291 | #endif |
292 | } | 292 | } |
293 | 293 | ||
294 | /* Protected by net_rwsem */ | ||
294 | #define for_each_net(VAR) \ | 295 | #define for_each_net(VAR) \ |
295 | list_for_each_entry(VAR, &net_namespace_list, list) | 296 | list_for_each_entry(VAR, &net_namespace_list, list) |
296 | 297 | ||
diff --git a/net/core/dev.c b/net/core/dev.c index e13807b5c84d..eca5458b2753 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -1629,6 +1629,7 @@ int register_netdevice_notifier(struct notifier_block *nb) | |||
1629 | goto unlock; | 1629 | goto unlock; |
1630 | if (dev_boot_phase) | 1630 | if (dev_boot_phase) |
1631 | goto unlock; | 1631 | goto unlock; |
1632 | down_read(&net_rwsem); | ||
1632 | for_each_net(net) { | 1633 | for_each_net(net) { |
1633 | for_each_netdev(net, dev) { | 1634 | for_each_netdev(net, dev) { |
1634 | err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); | 1635 | err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); |
@@ -1642,6 +1643,7 @@ int register_netdevice_notifier(struct notifier_block *nb) | |||
1642 | call_netdevice_notifier(nb, NETDEV_UP, dev); | 1643 | call_netdevice_notifier(nb, NETDEV_UP, dev); |
1643 | } | 1644 | } |
1644 | } | 1645 | } |
1646 | up_read(&net_rwsem); | ||
1645 | 1647 | ||
1646 | unlock: | 1648 | unlock: |
1647 | rtnl_unlock(); | 1649 | rtnl_unlock(); |
@@ -1664,6 +1666,7 @@ rollback: | |||
1664 | } | 1666 | } |
1665 | 1667 | ||
1666 | outroll: | 1668 | outroll: |
1669 | up_read(&net_rwsem); | ||
1667 | raw_notifier_chain_unregister(&netdev_chain, nb); | 1670 | raw_notifier_chain_unregister(&netdev_chain, nb); |
1668 | goto unlock; | 1671 | goto unlock; |
1669 | } | 1672 | } |
@@ -1694,6 +1697,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) | |||
1694 | if (err) | 1697 | if (err) |
1695 | goto unlock; | 1698 | goto unlock; |
1696 | 1699 | ||
1700 | down_read(&net_rwsem); | ||
1697 | for_each_net(net) { | 1701 | for_each_net(net) { |
1698 | for_each_netdev(net, dev) { | 1702 | for_each_netdev(net, dev) { |
1699 | if (dev->flags & IFF_UP) { | 1703 | if (dev->flags & IFF_UP) { |
@@ -1704,6 +1708,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) | |||
1704 | call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); | 1708 | call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); |
1705 | } | 1709 | } |
1706 | } | 1710 | } |
1711 | up_read(&net_rwsem); | ||
1707 | unlock: | 1712 | unlock: |
1708 | rtnl_unlock(); | 1713 | rtnl_unlock(); |
1709 | return err; | 1714 | return err; |
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 0c048bdeb016..614b985c92a4 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c | |||
@@ -33,6 +33,7 @@ static unsigned int fib_seq_sum(void) | |||
33 | struct net *net; | 33 | struct net *net; |
34 | 34 | ||
35 | rtnl_lock(); | 35 | rtnl_lock(); |
36 | down_read(&net_rwsem); | ||
36 | for_each_net(net) { | 37 | for_each_net(net) { |
37 | rcu_read_lock(); | 38 | rcu_read_lock(); |
38 | list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { | 39 | list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { |
@@ -43,6 +44,7 @@ static unsigned int fib_seq_sum(void) | |||
43 | } | 44 | } |
44 | rcu_read_unlock(); | 45 | rcu_read_unlock(); |
45 | } | 46 | } |
47 | up_read(&net_rwsem); | ||
46 | rtnl_unlock(); | 48 | rtnl_unlock(); |
47 | 49 | ||
48 | return fib_seq; | 50 | return fib_seq; |
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b5796d17a302..7fdf321d4997 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c | |||
@@ -33,6 +33,10 @@ static struct list_head *first_device = &pernet_list; | |||
33 | LIST_HEAD(net_namespace_list); | 33 | LIST_HEAD(net_namespace_list); |
34 | EXPORT_SYMBOL_GPL(net_namespace_list); | 34 | EXPORT_SYMBOL_GPL(net_namespace_list); |
35 | 35 | ||
36 | /* Protects net_namespace_list. Nests iside rtnl_lock() */ | ||
37 | DECLARE_RWSEM(net_rwsem); | ||
38 | EXPORT_SYMBOL_GPL(net_rwsem); | ||
39 | |||
36 | struct net init_net = { | 40 | struct net init_net = { |
37 | .count = REFCOUNT_INIT(1), | 41 | .count = REFCOUNT_INIT(1), |
38 | .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), | 42 | .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), |
@@ -309,9 +313,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) | |||
309 | if (error < 0) | 313 | if (error < 0) |
310 | goto out_undo; | 314 | goto out_undo; |
311 | } | 315 | } |
312 | rtnl_lock(); | 316 | down_write(&net_rwsem); |
313 | list_add_tail_rcu(&net->list, &net_namespace_list); | 317 | list_add_tail_rcu(&net->list, &net_namespace_list); |
314 | rtnl_unlock(); | 318 | up_write(&net_rwsem); |
315 | out: | 319 | out: |
316 | return error; | 320 | return error; |
317 | 321 | ||
@@ -450,7 +454,7 @@ static void unhash_nsid(struct net *net, struct net *last) | |||
450 | * and this work is the only process, that may delete | 454 | * and this work is the only process, that may delete |
451 | * a net from net_namespace_list. So, when the below | 455 | * a net from net_namespace_list. So, when the below |
452 | * is executing, the list may only grow. Thus, we do not | 456 | * is executing, the list may only grow. Thus, we do not |
453 | * use for_each_net_rcu() or rtnl_lock(). | 457 | * use for_each_net_rcu() or net_rwsem. |
454 | */ | 458 | */ |
455 | for_each_net(tmp) { | 459 | for_each_net(tmp) { |
456 | int id; | 460 | int id; |
@@ -485,7 +489,7 @@ static void cleanup_net(struct work_struct *work) | |||
485 | down_read(&pernet_ops_rwsem); | 489 | down_read(&pernet_ops_rwsem); |
486 | 490 | ||
487 | /* Don't let anyone else find us. */ | 491 | /* Don't let anyone else find us. */ |
488 | rtnl_lock(); | 492 | down_write(&net_rwsem); |
489 | llist_for_each_entry(net, net_kill_list, cleanup_list) | 493 | llist_for_each_entry(net, net_kill_list, cleanup_list) |
490 | list_del_rcu(&net->list); | 494 | list_del_rcu(&net->list); |
491 | /* Cache last net. After we unlock rtnl, no one new net | 495 | /* Cache last net. After we unlock rtnl, no one new net |
@@ -499,7 +503,7 @@ static void cleanup_net(struct work_struct *work) | |||
499 | * useless anyway, as netns_ids are destroyed there. | 503 | * useless anyway, as netns_ids are destroyed there. |
500 | */ | 504 | */ |
501 | last = list_last_entry(&net_namespace_list, struct net, list); | 505 | last = list_last_entry(&net_namespace_list, struct net, list); |
502 | rtnl_unlock(); | 506 | up_write(&net_rwsem); |
503 | 507 | ||
504 | llist_for_each_entry(net, net_kill_list, cleanup_list) { | 508 | llist_for_each_entry(net, net_kill_list, cleanup_list) { |
505 | unhash_nsid(net, last); | 509 | unhash_nsid(net, last); |
@@ -900,6 +904,9 @@ static int __register_pernet_operations(struct list_head *list, | |||
900 | 904 | ||
901 | list_add_tail(&ops->list, list); | 905 | list_add_tail(&ops->list, list); |
902 | if (ops->init || (ops->id && ops->size)) { | 906 | if (ops->init || (ops->id && ops->size)) { |
907 | /* We held write locked pernet_ops_rwsem, and parallel | ||
908 | * setup_net() and cleanup_net() are not possible. | ||
909 | */ | ||
903 | for_each_net(net) { | 910 | for_each_net(net) { |
904 | error = ops_init(ops, net); | 911 | error = ops_init(ops, net); |
905 | if (error) | 912 | if (error) |
@@ -923,6 +930,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) | |||
923 | LIST_HEAD(net_exit_list); | 930 | LIST_HEAD(net_exit_list); |
924 | 931 | ||
925 | list_del(&ops->list); | 932 | list_del(&ops->list); |
933 | /* See comment in __register_pernet_operations() */ | ||
926 | for_each_net(net) | 934 | for_each_net(net) |
927 | list_add_tail(&net->exit_list, &net_exit_list); | 935 | list_add_tail(&net->exit_list, &net_exit_list); |
928 | ops_exit_list(ops, &net_exit_list); | 936 | ops_exit_list(ops, &net_exit_list); |
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2d3949789cef..e86b28482ca7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c | |||
@@ -418,9 +418,11 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) | |||
418 | { | 418 | { |
419 | struct net *net; | 419 | struct net *net; |
420 | 420 | ||
421 | down_read(&net_rwsem); | ||
421 | for_each_net(net) { | 422 | for_each_net(net) { |
422 | __rtnl_kill_links(net, ops); | 423 | __rtnl_kill_links(net, ops); |
423 | } | 424 | } |
425 | up_read(&net_rwsem); | ||
424 | list_del(&ops->list); | 426 | list_del(&ops->list); |
425 | } | 427 | } |
426 | EXPORT_SYMBOL_GPL(__rtnl_link_unregister); | 428 | EXPORT_SYMBOL_GPL(__rtnl_link_unregister); |
@@ -438,6 +440,9 @@ static void rtnl_lock_unregistering_all(void) | |||
438 | for (;;) { | 440 | for (;;) { |
439 | unregistering = false; | 441 | unregistering = false; |
440 | rtnl_lock(); | 442 | rtnl_lock(); |
443 | /* We held write locked pernet_ops_rwsem, and parallel | ||
444 | * setup_net() and cleanup_net() are not possible. | ||
445 | */ | ||
441 | for_each_net(net) { | 446 | for_each_net(net) { |
442 | if (net->dev_unreg_count > 0) { | 447 | if (net->dev_unreg_count > 0) { |
443 | unregistering = true; | 448 | unregistering = true; |
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 705198de671d..370f9b7f051b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c | |||
@@ -1764,12 +1764,14 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) | |||
1764 | struct net *net; | 1764 | struct net *net; |
1765 | 1765 | ||
1766 | rtnl_lock(); | 1766 | rtnl_lock(); |
1767 | down_read(&net_rwsem); | ||
1767 | for_each_net(net) { | 1768 | for_each_net(net) { |
1768 | if (atomic_read(&net->ct.count) == 0) | 1769 | if (atomic_read(&net->ct.count) == 0) |
1769 | continue; | 1770 | continue; |
1770 | __nf_ct_unconfirmed_destroy(net); | 1771 | __nf_ct_unconfirmed_destroy(net); |
1771 | nf_queue_nf_hook_drop(net); | 1772 | nf_queue_nf_hook_drop(net); |
1772 | } | 1773 | } |
1774 | up_read(&net_rwsem); | ||
1773 | rtnl_unlock(); | 1775 | rtnl_unlock(); |
1774 | 1776 | ||
1775 | /* Need to wait for netns cleanup worker to finish, if its | 1777 | /* Need to wait for netns cleanup worker to finish, if its |
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ef38e5aecd28..9746ee30a99b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c | |||
@@ -2364,8 +2364,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) | |||
2364 | __dp_destroy(dp); | 2364 | __dp_destroy(dp); |
2365 | 2365 | ||
2366 | rtnl_lock(); | 2366 | rtnl_lock(); |
2367 | down_read(&net_rwsem); | ||
2367 | for_each_net(net) | 2368 | for_each_net(net) |
2368 | list_vports_from_net(net, dnet, &head); | 2369 | list_vports_from_net(net, dnet, &head); |
2370 | up_read(&net_rwsem); | ||
2369 | rtnl_unlock(); | 2371 | rtnl_unlock(); |
2370 | 2372 | ||
2371 | /* Detach all vports from given namespace. */ | 2373 | /* Detach all vports from given namespace. */ |
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 9efbfc753347..544d7b62d7ca 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c | |||
@@ -349,11 +349,13 @@ void wireless_nlevent_flush(void) | |||
349 | 349 | ||
350 | ASSERT_RTNL(); | 350 | ASSERT_RTNL(); |
351 | 351 | ||
352 | down_read(&net_rwsem); | ||
352 | for_each_net(net) { | 353 | for_each_net(net) { |
353 | while ((skb = skb_dequeue(&net->wext_nlevents))) | 354 | while ((skb = skb_dequeue(&net->wext_nlevents))) |
354 | rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, | 355 | rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, |
355 | GFP_KERNEL); | 356 | GFP_KERNEL); |
356 | } | 357 | } |
358 | up_read(&net_rwsem); | ||
357 | } | 359 | } |
358 | EXPORT_SYMBOL_GPL(wireless_nlevent_flush); | 360 | EXPORT_SYMBOL_GPL(wireless_nlevent_flush); |
359 | 361 | ||
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 1f173a7a4daa..31d66431be1e 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h | |||
@@ -48,8 +48,10 @@ static inline void selinux_xfrm_notify_policyload(void) | |||
48 | struct net *net; | 48 | struct net *net; |
49 | 49 | ||
50 | rtnl_lock(); | 50 | rtnl_lock(); |
51 | down_read(&net_rwsem); | ||
51 | for_each_net(net) | 52 | for_each_net(net) |
52 | rt_genid_bump_all(net); | 53 | rt_genid_bump_all(net); |
54 | up_read(&net_rwsem); | ||
53 | rtnl_unlock(); | 55 | rtnl_unlock(); |
54 | } | 56 | } |
55 | #else | 57 | #else |