author		Jan Kara <jack@suse.cz>			2012-06-12 10:20:34 -0400
committer	Al Viro <viro@zeniv.linux.org.uk>	2012-07-31 01:30:13 -0400
commit		5accdf82ba25cacefd6c1867f1704beb4d244cdd (patch)
tree		7125b01d9bf0f23d5c5eaed0cbafa9a1cbe544d5 /fs/super.c
parent		d87aae2f3c8e90bd0fe03f5309b4d066b712b8ec (diff)
fs: Improve filesystem freezing handling
vfs_check_frozen() tests are racy since the filesystem can be frozen just after the test is performed. Thus in write paths we can end up marking some pages or inodes dirty even though the filesystem is already frozen. This creates problems with the flusher thread hanging on a frozen filesystem.

Another problem is that exclusion between ->page_mkwrite() and filesystem freezing has been handled by setting the page dirty and then verifying s_frozen. This guaranteed that either the freezing code sees the faulted page, writes it, and writeprotects it again, or we see s_frozen set and bail out of the page fault. This protects against a page being marked writeable while filesystem freezing is running, but it has the unpleasant artefact of leaving dirty (although unmodified and writeprotected) pages on the frozen filesystem, resulting in similar problems with the flusher thread as the first issue.

This patch aims at providing exclusion between write paths and filesystem freezing. We implement a writer-freeze read-write semaphore in the superblock. Actually, there are three such semaphores for lock-ranking reasons - one for page fault handlers (->page_mkwrite), one for all other writers, and one for internal filesystem purposes (used e.g. to track running transactions). Write paths which should block freezing (e.g. directory operations, ->aio_write(), ->page_mkwrite) hold the reader side of the semaphore. Code freezing the filesystem takes the writer side.

However, we don't really want to bounce cachelines of the semaphores between CPUs for each write happening. So we implement the reader side of the semaphore as a per-cpu counter, and the writer side is implemented using the s_writers.frozen superblock field.

[AV: microoptimize sb_start_write(); we want it fast in normal case]

BugLink: https://bugs.launchpad.net/bugs/897421
Tested-by: Kamal Mostafa <kamal@canonical.com>
Tested-by: Peter M. Petrakis <peter.petrakis@canonical.com>
Tested-by: Dann Frazier <dann.frazier@canonical.com>
Tested-by: Massimo Morana <massimo.morana@canonical.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
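[Editor's note: for orientation, a minimal sketch of how a write path is expected to take the new freeze protection. It assumes the sb_start_write()/sb_end_write() wrappers around __sb_start_write()/__sb_end_write() that this series adds in include/linux/fs.h; my_fs_write() and my_fs_do_write() are placeholder names, not functions from this patch.]

/*
 * Illustrative sketch only: bracket a write with SB_FREEZE_WRITE-level
 * protection so freeze_super() can wait for it to finish.
 */
static ssize_t my_fs_write(struct file *file, const char __user *buf,
			   size_t len, loff_t *ppos)
{
	struct super_block *sb = file->f_mapping->host->i_sb;
	ssize_t ret;

	sb_start_write(sb);	/* reader side: bumps the per-cpu counter */
	ret = my_fs_do_write(file, buf, len, ppos);	/* placeholder helper */
	sb_end_write(sb);	/* wakes freeze_super() if it is waiting */
	return ret;
}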
Diffstat (limited to 'fs/super.c')
-rw-r--r--	fs/super.c	251
1 file changed, 230 insertions(+), 21 deletions(-)
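[Editor's note: similarly, a hedged sketch of the page-fault side. A ->page_mkwrite() handler takes SB_FREEZE_PAGEFAULT-level protection via the sb_start_pagefault()/sb_end_pagefault() wrappers referenced in the diff below; my_fs_page_mkwrite() and __my_fs_do_page_mkwrite() are placeholders for illustration.]

/*
 * Illustrative sketch only: exclude ->page_mkwrite() from freezing without
 * leaving dirty pages behind on an already frozen filesystem.
 */
static int my_fs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = vma->vm_file->f_mapping->host->i_sb;
	int ret;

	sb_start_pagefault(sb);	/* blocks once frozen >= SB_FREEZE_PAGEFAULT */
	ret = __my_fs_do_page_mkwrite(vma, vmf);	/* placeholder helper */
	sb_end_pagefault(sb);
	return ret;
}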
diff --git a/fs/super.c b/fs/super.c
index c743fb3be4b8..0f64ecb7b1bf 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
 #include <linux/rculist_bl.h>
 #include <linux/cleancache.h>
 #include <linux/fsnotify.h>
+#include <linux/lockdep.h>
 #include "internal.h"
 
 
 LIST_HEAD(super_blocks);
 DEFINE_SPINLOCK(sb_lock);
 
+static char *sb_writers_name[SB_FREEZE_LEVELS] = {
+	"sb_writers",
+	"sb_pagefaults",
+	"sb_internal",
+};
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 	return total_objects;
 }
 
+static int init_sb_writers(struct super_block *s, struct file_system_type *type)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+		err = percpu_counter_init(&s->s_writers.counter[i], 0);
+		if (err < 0)
+			goto err_out;
+		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+				 &type->s_writers_key[i], 0);
+	}
+	init_waitqueue_head(&s->s_writers.wait);
+	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	return 0;
+err_out:
+	while (--i >= 0)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+	return err;
+}
+
+static void destroy_sb_writers(struct super_block *s)
+{
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+}
+
 /**
  * alloc_super - create new superblock
  * @type: filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 
 	if (s) {
 		if (security_sb_alloc(s)) {
+			/*
+			 * We cannot call security_sb_free() without
+			 * security_sb_alloc() succeeding. So bail out manually
+			 */
 			kfree(s);
 			s = NULL;
 			goto out;
 		}
 #ifdef CONFIG_SMP
 		s->s_files = alloc_percpu(struct list_head);
-		if (!s->s_files) {
-			security_sb_free(s);
-			kfree(s);
-			s = NULL;
-			goto out;
-		} else {
+		if (!s->s_files)
+			goto err_out;
+		else {
 			int i;
 
 			for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 #else
 		INIT_LIST_HEAD(&s->s_files);
 #endif
+		if (init_sb_writers(s, type))
+			goto err_out;
 		s->s_flags = flags;
 		s->s_bdi = &default_backing_dev_info;
 		INIT_HLIST_NODE(&s->s_instances);
@@ -190,6 +229,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	}
 out:
 	return s;
+err_out:
+	security_sb_free(s);
+#ifdef CONFIG_SMP
+	if (s->s_files)
+		free_percpu(s->s_files);
+#endif
+	destroy_sb_writers(s);
+	kfree(s);
+	s = NULL;
+	goto out;
 }
 
 /**
@@ -203,6 +252,7 @@ static inline void destroy_super(struct super_block *s)
 #ifdef CONFIG_SMP
 	free_percpu(s->s_files);
 #endif
+	destroy_sb_writers(s);
 	security_sb_free(s);
 	WARN_ON(!list_empty(&s->s_mounts));
 	kfree(s->s_subtype);
@@ -651,10 +701,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
 {
 	while (1) {
 		struct super_block *s = get_super(bdev);
-		if (!s || s->s_frozen == SB_UNFROZEN)
+		if (!s || s->s_writers.frozen == SB_UNFROZEN)
 			return s;
 		up_read(&s->s_umount);
-		vfs_check_frozen(s, SB_FREEZE_WRITE);
+		wait_event(s->s_writers.wait_unfrozen,
+			   s->s_writers.frozen == SB_UNFROZEN);
 		put_super(s);
 	}
 }
@@ -732,7 +783,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	int retval;
 	int remount_ro;
 
-	if (sb->s_frozen != SB_UNFROZEN)
+	if (sb->s_writers.frozen != SB_UNFROZEN)
 		return -EBUSY;
 
 #ifdef CONFIG_BLOCK
@@ -1163,6 +1214,120 @@ out:
 	return ERR_PTR(error);
 }
 
+/*
+ * This is an internal function, please use sb_end_{write,pagefault,intwrite}
+ * instead.
+ */
+void __sb_end_write(struct super_block *sb, int level)
+{
+	percpu_counter_dec(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure s_writers are updated before we wake up waiters in
+	 * freeze_super().
+	 */
+	smp_mb();
+	if (waitqueue_active(&sb->s_writers.wait))
+		wake_up(&sb->s_writers.wait);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+}
+EXPORT_SYMBOL(__sb_end_write);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * We want lockdep to tell us about possible deadlocks with freezing but
+ * it's it bit tricky to properly instrument it. Getting a freeze protection
+ * works as getting a read lock but there are subtle problems. XFS for example
+ * gets freeze protection on internal level twice in some cases, which is OK
+ * only because we already hold a freeze protection also on higher level. Due
+ * to these cases we have to tell lockdep we are doing trylock when we
+ * already hold a freeze protection for a higher freeze level.
+ */
+static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
+				unsigned long ip)
+{
+	int i;
+
+	if (!trylock) {
+		for (i = 0; i < level - 1; i++)
+			if (lock_is_held(&sb->s_writers.lock_map[i])) {
+				trylock = true;
+				break;
+			}
+	}
+	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
+}
+#endif
+
+/*
+ * This is an internal function, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+int __sb_start_write(struct super_block *sb, int level, bool wait)
+{
+retry:
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		if (!wait)
+			return 0;
+		wait_event(sb->s_writers.wait_unfrozen,
+			   sb->s_writers.frozen < level);
+	}
+
+#ifdef CONFIG_LOCKDEP
+	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
+#endif
+	percpu_counter_inc(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure counter is updated before we check for frozen.
+	 * freeze_super() first sets frozen and then checks the counter.
+	 */
+	smp_mb();
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		__sb_end_write(sb, level);
+		goto retry;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(__sb_start_write);
+
+/**
+ * sb_wait_write - wait until all writers to given file system finish
+ * @sb: the super for which we wait
+ * @level: type of writers we wait for (normal vs page fault)
+ *
+ * This function waits until there are no writers of given type to given file
+ * system. Caller of this function should make sure there can be no new writers
+ * of type @level before calling this function. Otherwise this function can
+ * livelock.
+ */
+static void sb_wait_write(struct super_block *sb, int level)
+{
+	s64 writers;
+
+	/*
+	 * We just cycle-through lockdep here so that it does not complain
+	 * about returning with lock to userspace
+	 */
+	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
+
+	do {
+		DEFINE_WAIT(wait);
+
+		/*
+		 * We use a barrier in prepare_to_wait() to separate setting
+		 * of frozen and checking of the counter
+		 */
+		prepare_to_wait(&sb->s_writers.wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+
+		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
+		if (writers)
+			schedule();
+
+		finish_wait(&sb->s_writers.wait, &wait);
+	} while (writers);
+}
+
 /**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
@@ -1170,6 +1335,31 @@ out:
  * Syncs the super to make sure the filesystem is consistent and calls the fs's
  * freeze_fs. Subsequent calls to this without first thawing the fs will return
  * -EBUSY.
+ *
+ * During this function, sb->s_writers.frozen goes through these values:
+ *
+ * SB_UNFROZEN: File system is normal, all writes progress as usual.
+ *
+ * SB_FREEZE_WRITE: The file system is in the process of being frozen. New
+ * writes should be blocked, though page faults are still allowed. We wait for
+ * all writes to complete and then proceed to the next stage.
+ *
+ * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
+ * but internal fs threads can still modify the filesystem (although they
+ * should not dirty new pages or inodes), writeback can run etc. After waiting
+ * for all running page faults we sync the filesystem which will clean all
+ * dirty pages and inodes (no new dirty pages or inodes can be created when
+ * sync is running).
+ *
+ * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
+ * modification are blocked (e.g. XFS preallocation truncation on inode
+ * reclaim). This is usually implemented by blocking new transactions for
+ * filesystems that have them and need this additional guard. After all
+ * internal writers are finished we call ->freeze_fs() to finish filesystem
+ * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
+ * mostly auxiliary for filesystems to verify they do not modify frozen fs.
+ *
+ * sb->s_writers.frozen is protected by sb->s_umount.
  */
 int freeze_super(struct super_block *sb)
 {
@@ -1177,7 +1367,7 @@ int freeze_super(struct super_block *sb)
 
 	atomic_inc(&sb->s_active);
 	down_write(&sb->s_umount);
-	if (sb->s_frozen) {
+	if (sb->s_writers.frozen != SB_UNFROZEN) {
 		deactivate_locked_super(sb);
 		return -EBUSY;
 	}
@@ -1188,33 +1378,53 @@ int freeze_super(struct super_block *sb)
 	}
 
 	if (sb->s_flags & MS_RDONLY) {
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
+		/* Nothing to do really... */
+		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 		up_write(&sb->s_umount);
 		return 0;
 	}
 
-	sb->s_frozen = SB_FREEZE_WRITE;
+	/* From now on, no new normal writers can start */
+	sb->s_writers.frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
+	up_write(&sb->s_umount);
+
+	sb_wait_write(sb, SB_FREEZE_WRITE);
+
+	/* Now we go and block page faults... */
+	down_write(&sb->s_umount);
+	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
 	smp_wmb();
 
+	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
+
+	/* All writers are done so after syncing there won't be dirty data */
 	sync_filesystem(sb);
 
-	sb->s_frozen = SB_FREEZE_TRANS;
+	/* Now wait for internal filesystem counter */
+	sb->s_writers.frozen = SB_FREEZE_FS;
 	smp_wmb();
+	sb_wait_write(sb, SB_FREEZE_FS);
 
-	sync_blockdev(sb->s_bdev);
 	if (sb->s_op->freeze_fs) {
 		ret = sb->s_op->freeze_fs(sb);
 		if (ret) {
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
-			sb->s_frozen = SB_UNFROZEN;
+			sb->s_writers.frozen = SB_UNFROZEN;
 			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
+			wake_up(&sb->s_writers.wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
 		}
 	}
+	/*
+	 * This is just for debugging purposes so that fs can warn if it
+	 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
+	 */
+	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 	up_write(&sb->s_umount);
 	return 0;
 }
@@ -1231,7 +1441,7 @@ int thaw_super(struct super_block *sb)
 	int error;
 
 	down_write(&sb->s_umount);
-	if (sb->s_frozen == SB_UNFROZEN) {
+	if (sb->s_writers.frozen == SB_UNFROZEN) {
 		up_write(&sb->s_umount);
 		return -EINVAL;
 	}
@@ -1244,16 +1454,15 @@ int thaw_super(struct super_block *sb)
 		if (error) {
 			printk(KERN_ERR
 				"VFS:Filesystem thaw failed\n");
-			sb->s_frozen = SB_FREEZE_TRANS;
 			up_write(&sb->s_umount);
 			return error;
 		}
 	}
 
 out:
-	sb->s_frozen = SB_UNFROZEN;
+	sb->s_writers.frozen = SB_UNFROZEN;
 	smp_wmb();
-	wake_up(&sb->s_wait_unfrozen);
+	wake_up(&sb->s_writers.wait_unfrozen);
 	deactivate_locked_super(sb);
 
 	return 0;