author     Dave Hansen <haveblue@us.ibm.com>        2008-02-15 17:37:59 -0500
committer  Al Viro <viro@zeniv.linux.org.uk>        2008-04-19 00:29:27 -0400
commit     3d733633a633065729c9e4e254b2e5442c00ef7e
tree       8b52ba468f275f86221ddb77c29306a2405844fc
parent     2c463e95480829a2fe8f386589516e13b1289db6
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scaling
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long as that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
different cpus.
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
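
As a rough illustration of the scheme (not the patch itself -- the real
implementation is in the diff below, and every name here is made up for
the example), a userspace model with one mutex-protected counter per
"cpu" looks like this: want/drop only touch the local counter, and the
remount,ro path takes every lock so the sum is stable:

	#include <pthread.h>

	#define NR_CPUS	4

	struct cpu_writer_count {
		pthread_mutex_t lock;
		long count;		/* may go negative on this "cpu" */
	};

	static struct cpu_writer_count writers[NR_CPUS] = {
		[0 ... NR_CPUS - 1] = { PTHREAD_MUTEX_INITIALIZER, 0 }
	};

	/* analogue of mnt_want_write(): bump only the local counter */
	void want_write(int cpu)
	{
		pthread_mutex_lock(&writers[cpu].lock);
		writers[cpu].count++;
		pthread_mutex_unlock(&writers[cpu].lock);
	}

	/* analogue of mnt_drop_write(): may run on a different "cpu" */
	void drop_write(int cpu)
	{
		pthread_mutex_lock(&writers[cpu].lock);
		writers[cpu].count--;	/* local imbalance is fine; only the sum matters */
		pthread_mutex_unlock(&writers[cpu].lock);
	}

	/* analogue of remount,ro: hold every lock, then the sum is stable */
	int make_readonly(void)
	{
		long total = 0;
		int cpu;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			pthread_mutex_lock(&writers[cpu].lock);
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			total += writers[cpu].count;
		for (cpu = NR_CPUS - 1; cpu >= 0; cpu--)
			pthread_mutex_unlock(&writers[cpu].lock);

		return total > 0 ? -1 : 0;	/* -1 standing in for -EBUSY */
	}

The model leaves out the read-only check in want_write() and the
underflow clamping that the real patch needs; it is only meant to show
why the expensive all-locks summation is confined to the rare
remount,ro path.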
I've written a little benchmark to sit in a loop for a couple of
seconds on several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
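
The actual openbench.c at that URL may differ; a minimal sketch of the
kind of loop described (the file name and the 3-byte payload are
assumptions), run as one instance per cpu, would be:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		const char *path = argc > 1 ? argv[1] : "openbench.dat";
		time_t end = time(NULL) + 2;	/* loop for a couple of seconds */
		unsigned long iterations = 0;

		while (time(NULL) < end) {
			int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);

			if (fd < 0) {
				perror("open");
				exit(1);
			}
			if (write(fd, "foo", 3) != 3)	/* tiny write, as described */
				perror("write");
			close(fd);
			iterations++;
		}
		printf("%lu open/write/close iterations\n", iterations);
		return 0;
	}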
The code in here is a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degradation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O, to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU being able to operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
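
A hypothetical sketch of that idea -- not part of this patch, and every
name below is invented for illustration -- would give each cpu a small
array of (mount, count) slots and fall back to the shared atomic only
when all N slots are in use:

	#define MNT_WRITERS_PER_CPU 4	/* the "#defined number of mounts" */

	struct mnt_writer_slot {
		struct vfsmount *mnt;
		unsigned long count;
	};

	struct mnt_writer {
		spinlock_t lock;
		struct mnt_writer_slot slots[MNT_WRITERS_PER_CPU];
	} ____cacheline_aligned_in_smp;

	/*
	 * Find (or claim) the slot for @mnt.  A NULL return means all N
	 * slots are busy and the caller must fall back to the shared
	 * mnt->__mnt_writers atomic.  Caller must hold cpu_writer->lock.
	 */
	static struct mnt_writer_slot *
	find_writer_slot(struct mnt_writer *cpu_writer, struct vfsmount *mnt)
	{
		int i;

		for (i = 0; i < MNT_WRITERS_PER_CPU; i++)
			if (cpu_writer->slots[i].mnt == mnt)
				return &cpu_writer->slots[i];
		for (i = 0; i < MNT_WRITERS_PER_CPU; i++)
			if (!cpu_writer->slots[i].mnt) {
				cpu_writer->slots[i].mnt = mnt;
				return &cpu_writer->slots[i];
			}
		return NULL;
	}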
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--  fs/namespace.c          252
-rw-r--r--  include/linux/mount.h     7
2 files changed, 244 insertions(+), 15 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 066b393578c1..e3ce18d91aad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 	return tmp & (HASH_SIZE - 1);
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	INIT_LIST_HEAD(&mnt->mnt_share);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	INIT_LIST_HEAD(&mnt->mnt_slave);
+	atomic_set(&mnt->__mnt_writers, 0);
 	if (name) {
 		int size = strlen(name) + 1;
 		char *newname = kmalloc(size, GFP_KERNEL);
@@ -88,6 +92,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
  * we can determine when writes are able to occur to
  * a filesystem.
  */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	struct lock_class_key lock_class; /* compiles out with !lockdep */
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		lockdep_set_class(&writer->lock, &writer->lock_class);
+		writer->count = 0;
+	}
+	return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
+	}
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	/*
+	 * This is in case anyone ever leaves an invalid,
+	 * old ->mnt and a count of 0.
+	 */
+	if (!cpu_writer->count)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+/*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					  struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt == mnt)
+		return;
+	__clear_mnt_count(cpu_writer);
+	cpu_writer->mnt = mnt;
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
 /**
  * mnt_want_write - get write access to a mount
  * @mnt: the mount on which to take a write
@@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
  */
 int mnt_want_write(struct vfsmount *mnt)
 {
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	return 0;
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	if (atomic_read(&mnt->__mnt_writers) >=
+	    MNT_WRITER_UNDERFLOW_LIMIT)
+		return;
+	/*
+	 * It isn't necessary to hold all of the locks
+	 * at the same time, but doing it this way makes
+	 * us share a lot more code.
+	 */
+	lock_mnt_writers();
+	/*
+	 * vfsmount_lock is for mnt_flags.
+	 */
+	spin_lock(&vfsmount_lock);
+	/*
+	 * If coalescing the per-cpu writer counts did not
+	 * get us back to a positive writer count, we have
+	 * a bug.
+	 */
+	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+		printk(KERN_DEBUG "leak detected on mount(%p) writers "
+				"count: %d\n",
+			mnt, atomic_read(&mnt->__mnt_writers));
+		WARN_ON(1);
+		/* use the flag to keep the dmesg spam down */
+		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+	}
+	spin_unlock(&vfsmount_lock);
+	unlock_mnt_writers();
+}
+
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
+	int must_check_underflow = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	if (cpu_writer->count > 0) {
+		cpu_writer->count--;
+	} else {
+		must_check_underflow = 1;
+		atomic_dec(&mnt->__mnt_writers);
+	}
+
+	spin_unlock(&cpu_writer->lock);
+	/*
+	 * Logically, we could call this each time,
+	 * but the __mnt_writers cacheline tends to
+	 * be cold, and makes this expensive.
+	 */
+	if (must_check_underflow)
+		handle_write_count_underflow(mnt);
+	/*
+	 * This could be done right after the spinlock
+	 * is taken because the spinlock keeps us on
+	 * the cpu, and disables preemption.  However,
+	 * putting it here bounds the amount that
+	 * __mnt_writers can underflow.  Without it,
+	 * we could theoretically wrap __mnt_writers.
+	 */
+	put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-/*
- * __mnt_is_readonly: check whether a mount is read-only
- * @mnt: the mount to check for its write status
- *
- * This shouldn't be used directly ouside of the VFS.
- * It does not guarantee that the filesystem will stay
- * r/w, just that it is right *now*.  This can not and
- * should not be used in place of IS_RDONLY(inode).
- */
-int __mnt_is_readonly(struct vfsmount *mnt)
+int mnt_make_readonly(struct vfsmount *mnt)
 {
-	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+	int ret = 0;
+
+	lock_mnt_writers();
+	/*
+	 * With all the locks held, this value is stable
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * actually set mount's r/o flag here to make
+	 * __mnt_is_readonly() true, which keeps anyone
+	 * from doing a successful mnt_want_write().
+	 */
+out:
+	unlock_mnt_writers();
+	return ret;
 }
-EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
@@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
+	/*
+	 * We don't have to hold all of the locks at the
+	 * same time here because we know that we're the
+	 * last reference to mnt and that no new writers
+	 * can come in.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+		if (cpu_writer->mnt != mnt)
+			continue;
+		spin_lock(&cpu_writer->lock);
+		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+		cpu_writer->count = 0;
+		/*
+		 * Might as well do this so that no one
+		 * ever sees the pointer and expects
+		 * it to be valid.
+		 */
+		cpu_writer->mnt = NULL;
+		spin_unlock(&cpu_writer->lock);
+	}
+	/*
+	 * This probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this
+	 * happens, the filesystem was probably unable
+	 * to make r/w->r/o transitions.
+	 */
+	WARN_ON(atomic_read(&mnt->__mnt_writers));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 2eecd2c8c760..8c8e94369ac8 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
@@ -30,6 +31,7 @@ struct mnt_namespace;
 #define MNT_RELATIME	0x20
 
 #define MNT_SHRINKABLE	0x100
+#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -62,6 +64,11 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
+	/*
+	 * This value is not stable unless all of the mnt_writers[] spinlocks
+	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
+	 */
+	atomic_t __mnt_writers;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)