5 files changed, 100 insertions, 11 deletions
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d171b98a6cdd..f973ae9b05f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb)
        kernfs_put(root_kn);
 }
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations.  This can be used to block ->kill_sb() which may be useful
+ * for kernfs users which dynamically manage superblocks.
+ *
+ * Returns NULL if there's no superblock associated with this kernfs_root, or
+ * ERR_PTR(-EINVAL) if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+        struct kernfs_super_info *info;
+        struct super_block *sb = NULL;
+        mutex_lock(&kernfs_mutex);
+        list_for_each_entry(info, &root->supers, node) {
+                if (info->ns == ns) {
+                        sb = info->sb;
+                        if (!atomic_inc_not_zero(&info->sb->s_active))
+                                sb = ERR_PTR(-EINVAL);
+                        break;
+                }
+        }
+        mutex_unlock(&kernfs_mutex);
+        return sb;
+}
 void __init kernfs_init(void)
 {
        kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 145375ea0bd9..30faf797c2c3 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -305,6 +305,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
                               struct kernfs_root *root, unsigned long magic,
                               bool *new_sb_created, const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 void kernfs_init(void);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7868fc3c0bc5..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data)
 {
+        struct super_block *pinned_sb = NULL;
+        struct cgroup_subsys *ss;
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
        struct dentry *dentry;
        int ret;
+        int i;
        bool new_sb;
        /*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                goto out_unlock;
        }
+        /*
+         * Destruction of cgroup root is asynchronous, so subsystems may
+         * still be dying after the previous unmount.  Let's drain the
+         * dying subsystems.  We just need to ensure that the ones
+         * unmounted previously finish dying and don't care about new ones
+         * starting.  Testing ref liveness is good enough.
+         */
+        for_each_subsys(ss, i) {
+                if (!(opts.subsys_mask & (1 << i)) ||
+                    ss->root == &cgrp_dfl_root)
+                        continue;
+                if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+                        mutex_unlock(&cgroup_mutex);
+                        msleep(10);
+                        ret = restart_syscall();
+                        goto out_free;
+                }
+                cgroup_put(&ss->root->cgrp);
+        }
        for_each_root(root) {
                bool name_match = false;
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                }
                /*
-                 * A root's lifetime is governed by its root cgroup.
+                 * We want to reuse @root whose lifetime is governed by its
-                 * tryget_live failure indicate that the root is being
+                 * ->cgrp.  Let's check whether @root is alive and keep it
-                 * destroyed.  Wait for destruction to complete so that the
+                 * that way.  As cgroup_kill_sb() can happen anytime, we
-                 * subsystems are free.  We can use wait_queue for the wait
+                 * want to block it by pinning the sb so that @root doesn't
-                 * but this path is super cold.  Let's just sleep for a bit
+                 * get killed before mount is complete.
-                 * and retry.
+                 *
+                 * With the sb pinned, tryget_live can reliably indicate
+                 * whether @root can be reused.  If it's being killed,
+                 * drain it.  We can use wait_queue for the wait but this
+                 * path is super cold.  Let's just sleep a bit and retry.
                 */
-                if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+                pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+                if (IS_ERR(pinned_sb) ||
+                    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                        mutex_unlock(&cgroup_mutex);
+                        if (!IS_ERR_OR_NULL(pinned_sb))
+                                deactivate_super(pinned_sb);
                        msleep(10);
                        ret = restart_syscall();
                        goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
                                CGROUP_SUPER_MAGIC, &new_sb);
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
+        /*
+         * If @pinned_sb, we're reusing an existing root and holding an
+         * extra ref on its sb.  Mount is complete.  Put the extra ref.
+         */
+        if (pinned_sb) {
+                WARN_ON(new_sb);
+                deactivate_super(pinned_sb);
+        }
        return dentry;
 }
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
        rcu_read_lock();
        css_for_each_child(child, css) {
-                if (css->flags & CSS_ONLINE) {
+                if (child->flags & CSS_ONLINE) {
                        ret = true;
                        break;
                }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f6b33c696224..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
 int current_cpuset_is_being_rebound(void)
 {
-        return task_cs(current) == cpuset_being_rebound;
+        int ret;
+        rcu_read_lock();
+        ret = task_cs(current) == cpuset_being_rebound;
+        rcu_read_unlock();
+        return ret;
 }
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
         * resources, wait for the previously scheduled operations before
         * proceeding, so that we don't end up keep removing tasks added
         * after execution capability is restored.
+         *
+         * cpuset_hotplug_work calls back into cgroup core via
+         * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+         * operation like this one can lead to a deadlock through kernfs
+         * active_ref protection.  Let's break the protection.  Losing the
+         * protection is okay as we check whether @cs is online after
+         * grabbing cpuset_mutex anyway.  This only happens on the legacy
+         * hierarchies.
         */
+        css_get(&cs->css);
+        kernfs_break_active_protection(of->kn);
        flush_work(&cpuset_hotplug_work);
        mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
        free_trial_cpuset(trialcs);
 out_unlock:
        mutex_unlock(&cpuset_mutex);
+        kernfs_unbreak_active_protection(of->kn);
+        css_put(&cs->css);
        return retval ?: nbytes;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eb58de19f815..8f5330d74f47 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2139,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
        } else
                *new = *old;
-        rcu_read_lock();
        if (current_cpuset_is_being_rebound()) {
                nodemask_t mems = cpuset_mems_allowed(current);
                if (new->flags & MPOL_F_REBINDING)
@@ -2147,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
                else
                        mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
        }
-        rcu_read_unlock();
        atomic_set(&new->refcnt, 1);
        return new;
 }

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d171b98a6cdd..f973ae9b05f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c
@@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb)
211	kernfs_put(root_kn);	211	kernfs_put(root_kn);
212	}	212	}
213		213
		214	/**
		215	* kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
		216	* @root: the kernfs_root in question
		217	* @ns: the namespace tag
		218	*
		219	* Pin the superblock so the superblock won't be destroyed in subsequent
		220	* operations. This can be used to block ->kill_sb() which may be useful
		221	* for kernfs users which dynamically manage superblocks.
		222	*
		223	* Returns NULL if there's no superblock associated with this kernfs_root, or
		224	* ERR_PTR(-EINVAL) if the superblock is being freed.
		225	*/
		226	struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
		227	{
		228	struct kernfs_super_info *info;
		229	struct super_block *sb = NULL;
		230
		231	mutex_lock(&kernfs_mutex);
		232	list_for_each_entry(info, &root->supers, node) {
		233	if (info->ns == ns) {
		234	sb = info->sb;
		235	if (!atomic_inc_not_zero(&info->sb->s_active))
		236	sb = ERR_PTR(-EINVAL);
		237	break;
		238	}
		239	}
		240	mutex_unlock(&kernfs_mutex);
		241	return sb;
		242	}
		243
214	void __init kernfs_init(void)	244	void __init kernfs_init(void)
215	{	245	{
216	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",	246	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",


diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 145375ea0bd9..30faf797c2c3 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h
@@ -305,6 +305,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
305	struct kernfs_root *root, unsigned long magic,	305	struct kernfs_root *root, unsigned long magic,
306	bool *new_sb_created, const void *ns);	306	bool *new_sb_created, const void *ns);
307	void kernfs_kill_sb(struct super_block *sb);	307	void kernfs_kill_sb(struct super_block *sb);
		308	struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
308		309
309	void kernfs_init(void);	310	void kernfs_init(void);
310		311


diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7868fc3c0bc5..70776aec2562 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1648	int flags, const char *unused_dev_name,	1648	int flags, const char *unused_dev_name,
1649	void *data)	1649	void *data)
1650	{	1650	{
		1651	struct super_block *pinned_sb = NULL;
		1652	struct cgroup_subsys *ss;
1651	struct cgroup_root *root;	1653	struct cgroup_root *root;
1652	struct cgroup_sb_opts opts;	1654	struct cgroup_sb_opts opts;
1653	struct dentry *dentry;	1655	struct dentry *dentry;
1654	int ret;	1656	int ret;
		1657	int i;
1655	bool new_sb;	1658	bool new_sb;
1656		1659
1657	/*	1660	/*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1677	goto out_unlock;	1680	goto out_unlock;
1678	}	1681	}
1679		1682
		1683	/*
		1684	* Destruction of cgroup root is asynchronous, so subsystems may
		1685	* still be dying after the previous unmount. Let's drain the
		1686	* dying subsystems. We just need to ensure that the ones
		1687	* unmounted previously finish dying and don't care about new ones
		1688	* starting. Testing ref liveness is good enough.
		1689	*/
		1690	for_each_subsys(ss, i) {
		1691	if (!(opts.subsys_mask & (1 << i)) ||
		1692	ss->root == &cgrp_dfl_root)
		1693	continue;
		1694
		1695	if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
		1696	mutex_unlock(&cgroup_mutex);
		1697	msleep(10);
		1698	ret = restart_syscall();
		1699	goto out_free;
		1700	}
		1701	cgroup_put(&ss->root->cgrp);
		1702	}
		1703
1680	for_each_root(root) {	1704	for_each_root(root) {
1681	bool name_match = false;	1705	bool name_match = false;
1682		1706
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1717	}	1741	}
1718		1742
1719	/*	1743	/*
1720	* A root's lifetime is governed by its root cgroup.	1744	* We want to reuse @root whose lifetime is governed by its
1721	* tryget_live failure indicate that the root is being	1745	* ->cgrp. Let's check whether @root is alive and keep it
1722	* destroyed. Wait for destruction to complete so that the	1746	* that way. As cgroup_kill_sb() can happen anytime, we
1723	* subsystems are free. We can use wait_queue for the wait	1747	* want to block it by pinning the sb so that @root doesn't
1724	* but this path is super cold. Let's just sleep for a bit	1748	* get killed before mount is complete.
1725	* and retry.	1749	*
		1750	* With the sb pinned, tryget_live can reliably indicate
		1751	* whether @root can be reused. If it's being killed,
		1752	* drain it. We can use wait_queue for the wait but this
		1753	* path is super cold. Let's just sleep a bit and retry.
1726	*/	1754	*/
1727	if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {	1755	pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		1756	if (IS_ERR(pinned_sb) ||
		1757	!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1728	mutex_unlock(&cgroup_mutex);	1758	mutex_unlock(&cgroup_mutex);
		1759	if (!IS_ERR_OR_NULL(pinned_sb))
		1760	deactivate_super(pinned_sb);
1729	msleep(10);	1761	msleep(10);
1730	ret = restart_syscall();	1762	ret = restart_syscall();
1731	goto out_free;	1763	goto out_free;
@@ -1770,6 +1802,16 @@ out_free:
1770	CGROUP_SUPER_MAGIC, &new_sb);	1802	CGROUP_SUPER_MAGIC, &new_sb);
1771	if (IS_ERR(dentry) || !new_sb)	1803	if (IS_ERR(dentry) || !new_sb)
1772	cgroup_put(&root->cgrp);	1804	cgroup_put(&root->cgrp);
		1805
		1806	/*
		1807	* If @pinned_sb, we're reusing an existing root and holding an
		1808	* extra ref on its sb. Mount is complete. Put the extra ref.
		1809	*/
		1810	if (pinned_sb) {
		1811	WARN_ON(new_sb);
		1812	deactivate_super(pinned_sb);
		1813	}
		1814
1773	return dentry;	1815	return dentry;
1774	}	1816	}
1775		1817
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3328		3370
3329	rcu_read_lock();	3371	rcu_read_lock();
3330	css_for_each_child(child, css) {	3372	css_for_each_child(child, css) {
3331	if (css->flags & CSS_ONLINE) {	3373	if (child->flags & CSS_ONLINE) {
3332	ret = true;	3374	ret = true;
3333	break;	3375	break;
3334	}	3376	}


diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f6b33c696224..116a4164720a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c
@@ -1181,7 +1181,13 @@ done:
1181		1181
1182	int current_cpuset_is_being_rebound(void)	1182	int current_cpuset_is_being_rebound(void)
1183	{	1183	{
1184	return task_cs(current) == cpuset_being_rebound;	1184	int ret;
		1185
		1186	rcu_read_lock();
		1187	ret = task_cs(current) == cpuset_being_rebound;
		1188	rcu_read_unlock();
		1189
		1190	return ret;
1185	}	1191	}
1186		1192
1187	static int update_relax_domain_level(struct cpuset *cs, s64 val)	1193	static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1617	* resources, wait for the previously scheduled operations before	1623	* resources, wait for the previously scheduled operations before
1618	* proceeding, so that we don't end up keep removing tasks added	1624	* proceeding, so that we don't end up keep removing tasks added
1619	* after execution capability is restored.	1625	* after execution capability is restored.
		1626	*
		1627	* cpuset_hotplug_work calls back into cgroup core via
		1628	* cgroup_transfer_tasks() and waiting for it from a cgroupfs
		1629	* operation like this one can lead to a deadlock through kernfs
		1630	* active_ref protection. Let's break the protection. Losing the
		1631	* protection is okay as we check whether @cs is online after
		1632	* grabbing cpuset_mutex anyway. This only happens on the legacy
		1633	* hierarchies.
1620	*/	1634	*/
		1635	css_get(&cs->css);
		1636	kernfs_break_active_protection(of->kn);
1621	flush_work(&cpuset_hotplug_work);	1637	flush_work(&cpuset_hotplug_work);
1622		1638
1623	mutex_lock(&cpuset_mutex);	1639	mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1645	free_trial_cpuset(trialcs);	1661	free_trial_cpuset(trialcs);
1646	out_unlock:	1662	out_unlock:
1647	mutex_unlock(&cpuset_mutex);	1663	mutex_unlock(&cpuset_mutex);
		1664	kernfs_unbreak_active_protection(of->kn);
		1665	css_put(&cs->css);
1648	return retval ?: nbytes;	1666	return retval ?: nbytes;
1649	}	1667	}
1650		1668


diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb58de19f815..8f5330d74f47 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c
@@ -2139,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
2139	} else	2139	} else
2140	*new = *old;	2140	*new = *old;
2141		2141
2142	rcu_read_lock();
2143	if (current_cpuset_is_being_rebound()) {	2142	if (current_cpuset_is_being_rebound()) {
2144	nodemask_t mems = cpuset_mems_allowed(current);	2143	nodemask_t mems = cpuset_mems_allowed(current);
2145	if (new->flags & MPOL_F_REBINDING)	2144	if (new->flags & MPOL_F_REBINDING)
@@ -2147,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
2147	else	2146	else
2148	mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);	2147	mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2149	}	2148	}
2150	rcu_read_unlock();
2151	atomic_set(&new->refcnt, 1);	2149	atomic_set(&new->refcnt, 1);
2152	return new;	2150	return new;
2153	}	2151	}