delayed mntput

On final mntput() we want fs shutdown to happen before return to userland; however, the only case where we want it to happen right there (i.e. where task_work_add won't do) is an MNT_INTERNAL victim. Those have to be fully synchronous - failure halfway through module init might count on having the vfsmount killed right there. Fortunately, final mntput on MNT_INTERNAL vfsmounts happens on a shallow stack. So we handle those synchronously and do an analog of the delayed fput logic for everything else.

As a result, we are guaranteed that fs shutdown will always happen on a shallow stack.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
author: Al Viro <viro@zeniv.linux.org.uk> 2014-08-08 13:08:20 -0400
committer: Al Viro <viro@zeniv.linux.org.uk> 2014-10-09 02:38:53 -0400
commit: 9ea459e110df32e60a762f311f7939eaa879601d (patch)
tree: 3c25c8c4dbe1c21d92ab8e5a52b02c95453490d4 /fs/namespace.c
parent: b3ca406f2755c20cea1cc1169672c56dd03c266c (diff)
1 files changed, 53 insertions, 18 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index ef42d9bee212..044134315f93 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,6 +23,7 @@
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
 #include <linux/bootmem.h>
+#include <linux/task_work.h>
 #include "pnode.h"
 #include "internal.h"
@@ -957,6 +958,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
        return ERR_PTR(err);
 }
+static void cleanup_mnt(struct mount *mnt)
+{
+        /*
+         * This probably indicates that somebody messed
+         * up a mnt_want/drop_write() pair.  If this
+         * happens, the filesystem was probably unable
+         * to make r/w->r/o transitions.
+         */
+        /*
+         * The locking used to deal with mnt_count decrement provides barriers,
+         * so mnt_get_writers() below is safe.
+         */
+        WARN_ON(mnt_get_writers(mnt));
+        if (unlikely(mnt->mnt_pins.first))
+                mnt_pin_kill(mnt);
+        fsnotify_vfsmount_delete(&mnt->mnt);
+        dput(mnt->mnt.mnt_root);
+        deactivate_super(mnt->mnt.mnt_sb);
+        mnt_free_id(mnt);
+        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
+}
+static void __cleanup_mnt(struct rcu_head *head)
+{
+        cleanup_mnt(container_of(head, struct mount, mnt_rcu));
+}
+static LLIST_HEAD(delayed_mntput_list);
+static void delayed_mntput(struct work_struct *unused)
+{
+        struct llist_node *node = llist_del_all(&delayed_mntput_list);
+        struct llist_node *next;
+        for (; node; node = next) {
+                next = llist_next(node);
+                cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
+        }
+}
+static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
 static void mntput_no_expire(struct mount *mnt)
 {
        rcu_read_lock();
@@ -982,24 +1023,18 @@ static void mntput_no_expire(struct mount *mnt)
        list_del(&mnt->mnt_instance);
        unlock_mount_hash();
-        /*
+        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
-         * This probably indicates that somebody messed
+                struct task_struct *task = current;
-         * up a mnt_want/drop_write() pair.  If this
+                if (likely(!(task->flags & PF_KTHREAD))) {
-         * happens, the filesystem was probably unable
+                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
-         * to make r/w->r/o transitions.
+                        if (!task_work_add(task, &mnt->mnt_rcu, true))
-         */
+                                return;
-        /*
+                }
-         * The locking used to deal with mnt_count decrement provides barriers,
+                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
-         * so mnt_get_writers() below is safe.
+                        schedule_delayed_work(&delayed_mntput_work, 1);
-         */
+                return;
-        WARN_ON(mnt_get_writers(mnt));
+        }
-        if (unlikely(mnt->mnt_pins.first))
+        cleanup_mnt(mnt);
-                mnt_pin_kill(mnt);
-        fsnotify_vfsmount_delete(&mnt->mnt);
-        dput(mnt->mnt.mnt_root);
-        deactivate_super(mnt->mnt.mnt_sb);
-        mnt_free_id(mnt);
-        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
 }
 void mntput(struct vfsmount *mnt)
author	Al Viro <viro@zeniv.linux.org.uk>	2014-08-08 13:08:20 -0400
committer	Al Viro <viro@zeniv.linux.org.uk>	2014-10-09 02:38:53 -0400
commit	9ea459e110df32e60a762f311f7939eaa879601d (patch)
tree	3c25c8c4dbe1c21d92ab8e5a52b02c95453490d4 /fs/namespace.c
parent	b3ca406f2755c20cea1cc1169672c56dd03c266c (diff)

diff --git a/fs/namespace.c b/fs/namespace.c
index ef42d9bee212..044134315f93 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,6 +23,7 @@
23	#include <linux/proc_ns.h>	23	#include <linux/proc_ns.h>
24	#include <linux/magic.h>	24	#include <linux/magic.h>
25	#include <linux/bootmem.h>	25	#include <linux/bootmem.h>
		26	#include <linux/task_work.h>
26	#include "pnode.h"	27	#include "pnode.h"
27	#include "internal.h"	28	#include "internal.h"
28		29
@@ -957,6 +958,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
957	return ERR_PTR(err);	958	return ERR_PTR(err);
958	}	959	}
959		960
		961	static void cleanup_mnt(struct mount *mnt)
		962	{
		963	/*
		964	* This probably indicates that somebody messed
		965	* up a mnt_want/drop_write() pair. If this
		966	* happens, the filesystem was probably unable
		967	* to make r/w->r/o transitions.
		968	*/
		969	/*
		970	* The locking used to deal with mnt_count decrement provides barriers,
		971	* so mnt_get_writers() below is safe.
		972	*/
		973	WARN_ON(mnt_get_writers(mnt));
		974	if (unlikely(mnt->mnt_pins.first))
		975	mnt_pin_kill(mnt);
		976	fsnotify_vfsmount_delete(&mnt->mnt);
		977	dput(mnt->mnt.mnt_root);
		978	deactivate_super(mnt->mnt.mnt_sb);
		979	mnt_free_id(mnt);
		980	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
		981	}
		982
		983	static void __cleanup_mnt(struct rcu_head *head)
		984	{
		985	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
		986	}
		987
		988	static LLIST_HEAD(delayed_mntput_list);
		989	static void delayed_mntput(struct work_struct *unused)
		990	{
		991	struct llist_node *node = llist_del_all(&delayed_mntput_list);
		992	struct llist_node *next;
		993
		994	for (; node; node = next) {
		995	next = llist_next(node);
		996	cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
		997	}
		998	}
		999	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
		1000
960	static void mntput_no_expire(struct mount *mnt)	1001	static void mntput_no_expire(struct mount *mnt)
961	{	1002	{
962	rcu_read_lock();	1003	rcu_read_lock();
@@ -982,24 +1023,18 @@ static void mntput_no_expire(struct mount *mnt)
982	list_del(&mnt->mnt_instance);	1023	list_del(&mnt->mnt_instance);
983	unlock_mount_hash();	1024	unlock_mount_hash();
984		1025
985	/*	1026	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
986	* This probably indicates that somebody messed	1027	struct task_struct *task = current;
987	* up a mnt_want/drop_write() pair. If this	1028	if (likely(!(task->flags & PF_KTHREAD))) {
988	* happens, the filesystem was probably unable	1029	init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
989	* to make r/w->r/o transitions.	1030	if (!task_work_add(task, &mnt->mnt_rcu, true))
990	*/	1031	return;
991	/*	1032	}
992	* The locking used to deal with mnt_count decrement provides barriers,	1033	if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
993	* so mnt_get_writers() below is safe.	1034	schedule_delayed_work(&delayed_mntput_work, 1);
994	*/	1035	return;
995	WARN_ON(mnt_get_writers(mnt));	1036	}
996	if (unlikely(mnt->mnt_pins.first))	1037	cleanup_mnt(mnt);
997	mnt_pin_kill(mnt);
998	fsnotify_vfsmount_delete(&mnt->mnt);
999	dput(mnt->mnt.mnt_root);
1000	deactivate_super(mnt->mnt.mnt_sb);
1001	mnt_free_id(mnt);
1002	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1003	}	1038	}
1004		1039
1005	void mntput(struct vfsmount *mnt)	1040	void mntput(struct vfsmount *mnt)