 fs/kernfs/mount.c      | 30 ++++++++++++++++++++++++++++++
 include/linux/kernfs.h |  1 +
 kernel/cgroup.c        | 58 ++++++++++++++++++++++++++++++++++++++++--------
 kernel/cpuset.c        | 20 ++++++++++++++++++--
 mm/mempolicy.c         |  2 --
 5 files changed, 100 insertions(+), 11 deletions(-)
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d171b98a6cdd..f973ae9b05f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb)
         kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. This can be used to block ->kill_sb() which may be useful
+ * for kernfs users which dynamically manage superblocks.
+ *
+ * Returns NULL if there's no superblock associated to this kernfs_root, or
+ * -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+        struct kernfs_super_info *info;
+        struct super_block *sb = NULL;
+
+        mutex_lock(&kernfs_mutex);
+        list_for_each_entry(info, &root->supers, node) {
+                if (info->ns == ns) {
+                        sb = info->sb;
+                        if (!atomic_inc_not_zero(&info->sb->s_active))
+                                sb = ERR_PTR(-EINVAL);
+                        break;
+                }
+        }
+        mutex_unlock(&kernfs_mutex);
+        return sb;
+}
+
 void __init kernfs_init(void)
 {
         kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
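As a usage illustration (not part of the patch): a kernfs user that needs ->kill_sb() blocked across some operation pairs kernfs_pin_sb() with deactivate_super(), which drops the s_active reference the helper took. A minimal sketch; frob_existing_hierarchy() and my_root are made-up names for the example.

/* Sketch only: pin the sb for my_root (no namespace tag), work on the
 * hierarchy while ->kill_sb() is blocked, then drop the pin.
 */
static int frob_existing_hierarchy(struct kernfs_root *my_root)
{
        struct super_block *sb;

        sb = kernfs_pin_sb(my_root, NULL);
        if (IS_ERR(sb))
                return PTR_ERR(sb);     /* sb is being killed; caller may retry */
        if (!sb)
                return -ENOENT;         /* nothing is currently mounted */

        /* ... operate on the hierarchy; kernfs_kill_sb() cannot run ... */

        deactivate_super(sb);           /* drop the ref taken by kernfs_pin_sb() */
        return 0;
}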
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 145375ea0bd9..30faf797c2c3 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -305,6 +305,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
                                struct kernfs_root *root, unsigned long magic,
                                bool *new_sb_created, const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                          int flags, const char *unused_dev_name,
                          void *data)
 {
+        struct super_block *pinned_sb = NULL;
+        struct cgroup_subsys *ss;
         struct cgroup_root *root;
         struct cgroup_sb_opts opts;
         struct dentry *dentry;
         int ret;
+        int i;
         bool new_sb;
 
         /*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 goto out_unlock;
         }
 
+        /*
+         * Destruction of cgroup root is asynchronous, so subsystems may
+         * still be dying after the previous unmount. Let's drain the
+         * dying subsystems. We just need to ensure that the ones
+         * unmounted previously finish dying and don't care about new ones
+         * starting. Testing ref liveliness is good enough.
+         */
+        for_each_subsys(ss, i) {
+                if (!(opts.subsys_mask & (1 << i)) ||
+                    ss->root == &cgrp_dfl_root)
+                        continue;
+
+                if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+                        mutex_unlock(&cgroup_mutex);
+                        msleep(10);
+                        ret = restart_syscall();
+                        goto out_free;
+                }
+                cgroup_put(&ss->root->cgrp);
+        }
+
         for_each_root(root) {
                 bool name_match = false;
 
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 }
 
                 /*
-                 * A root's lifetime is governed by its root cgroup.
-                 * tryget_live failure indicate that the root is being
-                 * destroyed.  Wait for destruction to complete so that the
-                 * subsystems are free.  We can use wait_queue for the wait
-                 * but this path is super cold.  Let's just sleep for a bit
-                 * and retry.
+                 * We want to reuse @root whose lifetime is governed by its
+                 * ->cgrp. Let's check whether @root is alive and keep it
+                 * that way. As cgroup_kill_sb() can happen anytime, we
+                 * want to block it by pinning the sb so that @root doesn't
+                 * get killed before mount is complete.
+                 *
+                 * With the sb pinned, tryget_live can reliably indicate
+                 * whether @root can be reused. If it's being killed,
+                 * drain it. We can use wait_queue for the wait but this
+                 * path is super cold. Let's just sleep a bit and retry.
                  */
-                if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+                pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+                if (IS_ERR(pinned_sb) ||
+                    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                         mutex_unlock(&cgroup_mutex);
+                        if (!IS_ERR_OR_NULL(pinned_sb))
+                                deactivate_super(pinned_sb);
                         msleep(10);
                         ret = restart_syscall();
                         goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
                                 CGROUP_SUPER_MAGIC, &new_sb);
         if (IS_ERR(dentry) || !new_sb)
                 cgroup_put(&root->cgrp);
+
+        /*
+         * If @pinned_sb, we're reusing an existing root and holding an
+         * extra ref on its sb. Mount is complete. Put the extra ref.
+         */
+        if (pinned_sb) {
+                WARN_ON(new_sb);
+                deactivate_super(pinned_sb);
+        }
+
         return dentry;
 }
 
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 
         rcu_read_lock();
         css_for_each_child(child, css) {
-                if (css->flags & CSS_ONLINE) {
+                if (child->flags & CSS_ONLINE) {
                         ret = true;
                         break;
                 }
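Read together, the cgroup_mount() hunks implement one sequence: pin the candidate root's superblock, only then test whether the root is still alive, and drop the pin once the mount has either been retried or completed. A condensed, illustration-only restatement of that sequence follows; it is not compilable on its own and flattens the patch's control flow (the real code exits through out_free).

        /* block cgroup_kill_sb() on the root we intend to reuse */
        pinned_sb = kernfs_pin_sb(root->kf_root, NULL);

        /* only with the sb pinned is tryget_live a reliable "can we reuse it?" test */
        if (IS_ERR(pinned_sb) ||
            !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                mutex_unlock(&cgroup_mutex);
                if (!IS_ERR_OR_NULL(pinned_sb))
                        deactivate_super(pinned_sb);    /* undo the pin before retrying */
                msleep(10);
                ret = restart_syscall();
                goto out_free;
        }

        /* reusing @root: kernfs_mount() finds the existing sb, new_sb stays false */
        dentry = kernfs_mount(fs_type, flags, root->kf_root,
                              CGROUP_SUPER_MAGIC, &new_sb);

        /* mount finished (or failed); the extra s_active ref is no longer needed */
        if (pinned_sb) {
                WARN_ON(new_sb);
                deactivate_super(pinned_sb);
        }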
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
 
 int current_cpuset_is_being_rebound(void)
 {
-        return task_cs(current) == cpuset_being_rebound;
+        int ret;
+
+        rcu_read_lock();
+        ret = task_cs(current) == cpuset_being_rebound;
+        rcu_read_unlock();
+
+        return ret;
 }
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
          * resources, wait for the previously scheduled operations before
          * proceeding, so that we don't end up keep removing tasks added
          * after execution capability is restored.
+         *
+         * cpuset_hotplug_work calls back into cgroup core via
+         * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+         * operation like this one can lead to a deadlock through kernfs
+         * active_ref protection. Let's break the protection. Losing the
+         * protection is okay as we check whether @cs is online after
+         * grabbing cpuset_mutex anyway. This only happens on the legacy
+         * hierarchies.
          */
+        css_get(&cs->css);
+        kernfs_break_active_protection(of->kn);
         flush_work(&cpuset_hotplug_work);
 
         mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
         free_trial_cpuset(trialcs);
 out_unlock:
         mutex_unlock(&cpuset_mutex);
+        kernfs_unbreak_active_protection(of->kn);
+        css_put(&cs->css);
         return retval ?: nbytes;
 }
 
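The cpuset change follows the usual pairing for kernfs_break_active_protection(): take your own reference on the object backing the file, break protection, do the work that may itself wait on kernfs users, then unbreak and drop the reference. A bare-bones sketch of that shape for a generic kernfs write handler; my_write(), my_obj_get()/my_obj_put() and some_work are placeholders, not cpuset code (the css_get()/css_put() in the patch plays the role of the extra reference).

static ssize_t my_write(struct kernfs_open_file *of, char *buf,
                        size_t nbytes, loff_t off)
{
        /* active protection no longer keeps the backing object alive, so pin it */
        my_obj_get(of->kn->priv);
        kernfs_break_active_protection(of->kn);

        /* now safe to wait on work that takes kernfs active references itself */
        flush_work(&some_work);

        /* ... re-check that the object is still online, then do the update ... */

        kernfs_unbreak_active_protection(of->kn);
        my_obj_put(of->kn->priv);
        return nbytes;
}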
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eb58de19f815..8f5330d74f47 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2139,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
         } else
                 *new = *old;
 
-        rcu_read_lock();
         if (current_cpuset_is_being_rebound()) {
                 nodemask_t mems = cpuset_mems_allowed(current);
                 if (new->flags & MPOL_F_REBINDING)
@@ -2147,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
                 else
                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
         }
-        rcu_read_unlock();
         atomic_set(&new->refcnt, 1);
         return new;
 }
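The mempolicy hunks only drop caller-side RCU locking; it moved into current_cpuset_is_being_rebound() in the cpuset hunk above, since task_cs() dereferences RCU-protected css pointers. The general pattern is to make such a predicate self-locking, roughly as sketched below; shared_marker and current_marker are placeholder names, not kernel symbols.

static bool peek_is_current(void)
{
        bool ret;

        rcu_read_lock();                /* required around the RCU dereference */
        ret = rcu_dereference(shared_marker) == current_marker;
        rcu_read_unlock();

        return ret;                     /* answer may be stale once the lock is dropped */
}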