fs: mnt_want_write speedup

This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap is basically set up to mmap a 64MB file on tmpfs, fault in its pages, then unmap it. It is a microbenchmark, yes, but it exercises some important paths in the mm.

Before: avg = 501.9   std = 14.7773
After:  avg = 462.286 std = 5.46106

(50 runs of each; the stddev gives reasonable confidence, but there is still quite a bit of variation.)

The speedup comes from removing the complex per-cpu locking and counter cache and replacing them with a per-cpu counter in struct vfsmount. This makes the code much simpler and avoids spinlocks (although the msync is still pretty costly, unfortunately). It also results in about 900 bytes less code. It does increase the size of a vfsmount, however.

It should also give a speedup on large systems if CPUs are frequently operating on different mounts (because the existing scheme has to operate on an atomic in the struct vfsmount when switching between mounts). But I'm most interested in single-threaded path performance for the moment.

[AV: minor cleanup]

Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
author: npiggin@suse.de <npiggin@suse.de> 2009-04-26 06:25:54 -0400
committer: Al Viro <viro@zeniv.linux.org.uk> 2009-06-11 21:36:02 -0400
commit: d3ef3d7351ccfbef3e5d926efc5ee332136f40d4 (patch)
tree: bd875a2b267ae03b350e259675ccb1a04453b9b9
parent: 3174c21b74b56c6a53fddd41a30fd6f757a32bd0 (diff)
2 files changed, 106 insertions, 183 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff..22ae06ad751d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
                INIT_LIST_HEAD(&mnt->mnt_share);
                INIT_LIST_HEAD(&mnt->mnt_slave_list);
                INIT_LIST_HEAD(&mnt->mnt_slave);
-                atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+                mnt->mnt_writers = alloc_percpu(int);
+                if (!mnt->mnt_writers)
+                        goto out_free_devname;
+#else
+                mnt->mnt_writers = 0;
+#endif
        }
        return mnt;
+#ifdef CONFIG_SMP
+out_free_devname:
+        kfree(mnt->mnt_devname);
+#endif
 out_free_id:
        mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-struct mnt_writer {
+static inline void inc_mnt_writers(struct vfsmount *mnt)
-        /*
+{
-         * If holding multiple instances of this lock, they
+#ifdef CONFIG_SMP
-         * must be ordered by cpu number.
+        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
-         */
+#else
-        spinlock_t lock;
+        mnt->mnt_writers++;
-        struct lock_class_key lock_class; /* compiles out with !lockdep */
+#endif
-        unsigned long count;
+}
-        struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-        int cpu;
+#ifdef CONFIG_SMP
-        for_each_possible_cpu(cpu) {
+        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
-                struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+#else
-                spin_lock_init(&writer->lock);
+        mnt->mnt_writers--;
-                lockdep_set_class(&writer->lock, &writer->lock_class);
+#endif
-                writer->count = 0;
-        }
-        return 0;
 }
-fs_initcall(init_mnt_writers);
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+        unsigned int count = 0;
        int cpu;
-        struct mnt_writer *cpu_writer;
        for_each_possible_cpu(cpu) {
-                cpu_writer = &per_cpu(mnt_writers, cpu);
+                count += *per_cpu_ptr(mnt->mnt_writers, cpu);
-                spin_unlock(&cpu_writer->lock);
        }
-}
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+        return count;
-{
+#else
-        if (!cpu_writer->mnt)
+        return mnt->mnt_writers;
-                return;
+#endif
-        /*
-         * This is in case anyone ever leaves an invalid,
-         * old ->mnt and a count of 0.
-         */
-        if (!cpu_writer->count)
-                return;
-        atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-        cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-                                          struct vfsmount *mnt)
-{
-        if (cpu_writer->mnt == mnt)
-                return;
-        __clear_mnt_count(cpu_writer);
-        cpu_writer->mnt = mnt;
 }
 /*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
        int ret = 0;
-        struct mnt_writer *cpu_writer;
-        cpu_writer = &get_cpu_var(mnt_writers);
+        preempt_disable();
-        spin_lock(&cpu_writer->lock);
+        inc_mnt_writers(mnt);
+        /*
+         * The store to inc_mnt_writers must be visible before we pass
+         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+         * incremented count after it has set MNT_WRITE_HOLD.
+         */
+        smp_mb();
+        while (mnt->mnt_flags & MNT_WRITE_HOLD)
+                cpu_relax();
+        /*
+         * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+         * be set to match its requirements. So we must not load that until
+         * MNT_WRITE_HOLD is cleared.
+         */
+        smp_rmb();
        if (__mnt_is_readonly(mnt)) {
+                dec_mnt_writers(mnt);
                ret = -EROFS;
                goto out;
        }
-        use_cpu_writer_for_mount(cpu_writer, mnt);
-        cpu_writer->count++;
 out:
-        spin_unlock(&cpu_writer->lock);
+        preempt_enable();
-        put_cpu_var(mnt_writers);
        return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
-static void lock_mnt_writers(void)
-{
-        int cpu;
-        struct mnt_writer *cpu_writer;
-        for_each_possible_cpu(cpu) {
-                cpu_writer = &per_cpu(mnt_writers, cpu);
-                spin_lock(&cpu_writer->lock);
-                __clear_mnt_count(cpu_writer);
-                cpu_writer->mnt = NULL;
-        }
-}
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count.  Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-        if (atomic_read(&mnt->__mnt_writers) >=
-            MNT_WRITER_UNDERFLOW_LIMIT)
-                return;
-        /*
-         * It isn't necessary to hold all of the locks
-         * at the same time, but doing it this way makes
-         * us share a lot more code.
-         */
-        lock_mnt_writers();
-        /*
-         * vfsmount_lock is for mnt_flags.
-         */
-        spin_lock(&vfsmount_lock);
-        /*
-         * If coalescing the per-cpu writer counts did not
-         * get us back to a positive writer count, we have
-         * a bug.
-         */
-        if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-            !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-                WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-                                "count: %d\n",
-                        mnt, atomic_read(&mnt->__mnt_writers));
-                /* use the flag to keep the dmesg spam down */
-                mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-        }
-        spin_unlock(&vfsmount_lock);
-        unlock_mnt_writers();
-}
 /**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
 */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-        int must_check_underflow = 0;
+        preempt_disable();
-        struct mnt_writer *cpu_writer;
+        dec_mnt_writers(mnt);
+        preempt_enable();
-        cpu_writer = &get_cpu_var(mnt_writers);
-        spin_lock(&cpu_writer->lock);
-        use_cpu_writer_for_mount(cpu_writer, mnt);
-        if (cpu_writer->count > 0) {
-                cpu_writer->count--;
-        } else {
-                must_check_underflow = 1;
-                atomic_dec(&mnt->__mnt_writers);
-        }
-        spin_unlock(&cpu_writer->lock);
-        /*
-         * Logically, we could call this each time,
-         * but the __mnt_writers cacheline tends to
-         * be cold, and makes this expensive.
-         */
-        if (must_check_underflow)
-                handle_write_count_underflow(mnt);
-        /*
-         * This could be done right after the spinlock
-         * is taken because the spinlock keeps us on
-         * the cpu, and disables preemption.  However,
-         * putting it here bounds the amount that
-         * __mnt_writers can underflow.  Without it,
-         * we could theoretically wrap __mnt_writers.
-         */
-        put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
        int ret = 0;
-        lock_mnt_writers();
+        spin_lock(&vfsmount_lock);
+        mnt->mnt_flags |= MNT_WRITE_HOLD;
        /*
-         * With all the locks held, this value is stable
+         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+         * should be visible before we do.
         */
-        if (atomic_read(&mnt->__mnt_writers) > 0) {
+        smp_mb();
-                ret = -EBUSY;
-                goto out;
-        }
        /*
-         * nobody can do a successful mnt_want_write() with all
+         * With writers on hold, if this value is zero, then there are
-         * of the counts in MNT_DENIED_WRITE and the locks held.
+         * definitely no active writers (although held writers may subsequently
+         * increment the count, they'll have to wait, and decrement it after
+         * seeing MNT_READONLY).
+         *
+         * It is OK to have counter incremented on one CPU and decremented on
+         * another: the sum will add up correctly. The danger would be when we
+         * sum up each counter, if we read a counter before it is incremented,
+         * but then read another CPU's count which it has been subsequently
+         * decremented from -- we would see more decrements than we should.
+         * MNT_WRITE_HOLD protects against this scenario, because
+         * mnt_want_write first increments count, then smp_mb, then spins on
+         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+         * we're counting up here.
         */
-        spin_lock(&vfsmount_lock);
+        if (count_mnt_writers(mnt) > 0)
-        if (!ret)
+                ret = -EBUSY;
+        else
                mnt->mnt_flags |= MNT_READONLY;
+        /*
+         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+         * that become unheld will see MNT_READONLY.
+         */
+        smp_wmb();
+        mnt->mnt_flags &= ~MNT_WRITE_HOLD;
        spin_unlock(&vfsmount_lock);
-out:
-        unlock_mnt_writers();
        return ret;
 }
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
        kfree(mnt->mnt_devname);
        mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+        free_percpu(mnt->mnt_writers);
+#endif
        kmem_cache_free(mnt_cache, mnt);
 }
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 static inline void __mntput(struct vfsmount *mnt)
 {
-        int cpu;
        struct super_block *sb = mnt->mnt_sb;
        /*
-         * We don't have to hold all of the locks at the
-         * same time here because we know that we're the
-         * last reference to mnt and that no new writers
-         * can come in.
-         */
-        for_each_possible_cpu(cpu) {
-                struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-                spin_lock(&cpu_writer->lock);
-                if (cpu_writer->mnt != mnt) {
-                        spin_unlock(&cpu_writer->lock);
-                        continue;
-                }
-                atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-                cpu_writer->count = 0;
-                /*
-                 * Might as well do this so that no one
-                 * ever sees the pointer and expects
-                 * it to be valid.
-                 */
-                cpu_writer->mnt = NULL;
-                spin_unlock(&cpu_writer->lock);
-        }
-        /*
         * This probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this
         * happens, the filesystem was probably unable
         * to make r/w->r/o transitions.
         */
-        WARN_ON(atomic_read(&mnt->__mnt_writers));
+        /*
+         * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+         * provides barriers, so count_mnt_writers() below is safe.  AV
+         */
+        WARN_ON(count_mnt_writers(mnt));
        dput(mnt->mnt_root);
        free_vfsmnt(mnt);
        deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff..ac49c1f8e5c0 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 #define MNT_STRICTATIME 0x80
 #define MNT_SHRINKABLE  0x100
-#define MNT_IMBALANCED_WRITE_COUNT      0x200 /* just for debugging */
+#define MNT_WRITE_HOLD  0x200
 #define MNT_SHARED      0x1000  /* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE  0x2000  /* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
        int mnt_expiry_mark;            /* true if marked for expiry */
        int mnt_pinned;
        int mnt_ghosts;
-        /*
+#ifdef CONFIG_SMP
-         * This value is not stable unless all of the mnt_writers[] spinlocks
+        int *mnt_writers;
-         * are held, and all mnt_writer[]s on this mount have 0 as their ->count
+#else
-         */
+        int mnt_writers;
-        atomic_t __mnt_writers;
+#endif
 };
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+        return mnt->mnt_writers;
+#else
+        return &mnt->mnt_writers;
+#endif
+}
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
        if (mnt)
author	npiggin@suse.de <npiggin@suse.de>	2009-04-26 06:25:54 -0400
committer	Al Viro <viro@zeniv.linux.org.uk>	2009-06-11 21:36:02 -0400
commit	d3ef3d7351ccfbef3e5d926efc5ee332136f40d4 (patch)
tree	bd875a2b267ae03b350e259675ccb1a04453b9b9
parent	3174c21b74b56c6a53fddd41a30fd6f757a32bd0 (diff)

diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff..22ae06ad751d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
131	INIT_LIST_HEAD(&mnt->mnt_share);	131	INIT_LIST_HEAD(&mnt->mnt_share);
132	INIT_LIST_HEAD(&mnt->mnt_slave_list);	132	INIT_LIST_HEAD(&mnt->mnt_slave_list);
133	INIT_LIST_HEAD(&mnt->mnt_slave);	133	INIT_LIST_HEAD(&mnt->mnt_slave);
134	atomic_set(&mnt->__mnt_writers, 0);	134	#ifdef CONFIG_SMP
		135	mnt->mnt_writers = alloc_percpu(int);
		136	if (!mnt->mnt_writers)
		137	goto out_free_devname;
		138	#else
		139	mnt->mnt_writers = 0;
		140	#endif
135	}	141	}
136	return mnt;	142	return mnt;
137		143
		144	#ifdef CONFIG_SMP
		145	out_free_devname:
		146	kfree(mnt->mnt_devname);
		147	#endif
138	out_free_id:	148	out_free_id:
139	mnt_free_id(mnt);	149	mnt_free_id(mnt);
140	out_free_cache:	150	out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
171	}	181	}
172	EXPORT_SYMBOL_GPL(__mnt_is_readonly);	182	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
173		183
174	struct mnt_writer {	184	static inline void inc_mnt_writers(struct vfsmount *mnt)
175	/*	185	{
176	* If holding multiple instances of this lock, they	186	#ifdef CONFIG_SMP
177	* must be ordered by cpu number.	187	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
178	*/	188	#else
179	spinlock_t lock;	189	mnt->mnt_writers++;
180	struct lock_class_key lock_class; /* compiles out with !lockdep */	190	#endif
181	unsigned long count;	191	}
182	struct vfsmount *mnt;
183	} ____cacheline_aligned_in_smp;
184	static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
185		192
186	static int __init init_mnt_writers(void)	193	static inline void dec_mnt_writers(struct vfsmount *mnt)
187	{	194	{
188	int cpu;	195	#ifdef CONFIG_SMP
189	for_each_possible_cpu(cpu) {	196	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
190	struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);	197	#else
191	spin_lock_init(&writer->lock);	198	mnt->mnt_writers--;
192	lockdep_set_class(&writer->lock, &writer->lock_class);	199	#endif
193	writer->count = 0;
194	}
195	return 0;
196	}	200	}
197	fs_initcall(init_mnt_writers);
198		201
199	static void unlock_mnt_writers(void)	202	static unsigned int count_mnt_writers(struct vfsmount *mnt)
200	{	203	{
		204	#ifdef CONFIG_SMP
		205	unsigned int count = 0;
201	int cpu;	206	int cpu;
202	struct mnt_writer *cpu_writer;
203		207
204	for_each_possible_cpu(cpu) {	208	for_each_possible_cpu(cpu) {
205	cpu_writer = &per_cpu(mnt_writers, cpu);	209	count += *per_cpu_ptr(mnt->mnt_writers, cpu);
206	spin_unlock(&cpu_writer->lock);
207	}	210	}
208	}
209		211
210	static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)	212	return count;
211	{	213	#else
212	if (!cpu_writer->mnt)	214	return mnt->mnt_writers;
213	return;	215	#endif
214	/*
215	* This is in case anyone ever leaves an invalid,
216	* old ->mnt and a count of 0.
217	*/
218	if (!cpu_writer->count)
219	return;
220	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
221	cpu_writer->count = 0;
222	}
223	/*
224	* must hold cpu_writer->lock
225	*/
226	static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
227	struct vfsmount *mnt)
228	{
229	if (cpu_writer->mnt == mnt)
230	return;
231	__clear_mnt_count(cpu_writer);
232	cpu_writer->mnt = mnt;
233	}	216	}
234		217
235	/*	218	/*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
253	int mnt_want_write(struct vfsmount *mnt)	236	int mnt_want_write(struct vfsmount *mnt)
254	{	237	{
255	int ret = 0;	238	int ret = 0;
256	struct mnt_writer *cpu_writer;
257		239
258	cpu_writer = &get_cpu_var(mnt_writers);	240	preempt_disable();
259	spin_lock(&cpu_writer->lock);	241	inc_mnt_writers(mnt);
		242	/*
		243	* The store to inc_mnt_writers must be visible before we pass
		244	* MNT_WRITE_HOLD loop below, so that the slowpath can see our
		245	* incremented count after it has set MNT_WRITE_HOLD.
		246	*/
		247	smp_mb();
		248	while (mnt->mnt_flags & MNT_WRITE_HOLD)
		249	cpu_relax();
		250	/*
		251	* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
		252	* be set to match its requirements. So we must not load that until
		253	* MNT_WRITE_HOLD is cleared.
		254	*/
		255	smp_rmb();
260	if (__mnt_is_readonly(mnt)) {	256	if (__mnt_is_readonly(mnt)) {
		257	dec_mnt_writers(mnt);
261	ret = -EROFS;	258	ret = -EROFS;
262	goto out;	259	goto out;
263	}	260	}
264	use_cpu_writer_for_mount(cpu_writer, mnt);
265	cpu_writer->count++;
266	out:	261	out:
267	spin_unlock(&cpu_writer->lock);	262	preempt_enable();
268	put_cpu_var(mnt_writers);
269	return ret;	263	return ret;
270	}	264	}
271	EXPORT_SYMBOL_GPL(mnt_want_write);	265	EXPORT_SYMBOL_GPL(mnt_want_write);
272		266
273	static void lock_mnt_writers(void)
274	{
275	int cpu;
276	struct mnt_writer *cpu_writer;
277
278	for_each_possible_cpu(cpu) {
279	cpu_writer = &per_cpu(mnt_writers, cpu);
280	spin_lock(&cpu_writer->lock);
281	__clear_mnt_count(cpu_writer);
282	cpu_writer->mnt = NULL;
283	}
284	}
285
286	/*
287	* These per-cpu write counts are not guaranteed to have
288	* matched increments and decrements on any given cpu.
289	* A file open()ed for write on one cpu and close()d on
290	* another cpu will imbalance this count. Make sure it
291	* does not get too far out of whack.
292	*/
293	static void handle_write_count_underflow(struct vfsmount *mnt)
294	{
295	if (atomic_read(&mnt->__mnt_writers) >=
296	MNT_WRITER_UNDERFLOW_LIMIT)
297	return;
298	/*
299	* It isn't necessary to hold all of the locks
300	* at the same time, but doing it this way makes
301	* us share a lot more code.
302	*/
303	lock_mnt_writers();
304	/*
305	* vfsmount_lock is for mnt_flags.
306	*/
307	spin_lock(&vfsmount_lock);
308	/*
309	* If coalescing the per-cpu writer counts did not
310	* get us back to a positive writer count, we have
311	* a bug.
312	*/
313	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
314	!(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
315	WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
316	"count: %d\n",
317	mnt, atomic_read(&mnt->__mnt_writers));
318	/* use the flag to keep the dmesg spam down */
319	mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
320	}
321	spin_unlock(&vfsmount_lock);
322	unlock_mnt_writers();
323	}
324
325	/**	267	/**
326	* mnt_drop_write - give up write access to a mount	268	* mnt_drop_write - give up write access to a mount
327	* @mnt: the mount on which to give up write access	269	* @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
332	*/	274	*/
333	void mnt_drop_write(struct vfsmount *mnt)	275	void mnt_drop_write(struct vfsmount *mnt)
334	{	276	{
335	int must_check_underflow = 0;	277	preempt_disable();
336	struct mnt_writer *cpu_writer;	278	dec_mnt_writers(mnt);
337		279	preempt_enable();
338	cpu_writer = &get_cpu_var(mnt_writers);
339	spin_lock(&cpu_writer->lock);
340
341	use_cpu_writer_for_mount(cpu_writer, mnt);
342	if (cpu_writer->count > 0) {
343	cpu_writer->count--;
344	} else {
345	must_check_underflow = 1;
346	atomic_dec(&mnt->__mnt_writers);
347	}
348
349	spin_unlock(&cpu_writer->lock);
350	/*
351	* Logically, we could call this each time,
352	* but the __mnt_writers cacheline tends to
353	* be cold, and makes this expensive.
354	*/
355	if (must_check_underflow)
356	handle_write_count_underflow(mnt);
357	/*
358	* This could be done right after the spinlock
359	* is taken because the spinlock keeps us on
360	* the cpu, and disables preemption. However,
361	* putting it here bounds the amount that
362	* __mnt_writers can underflow. Without it,
363	* we could theoretically wrap __mnt_writers.
364	*/
365	put_cpu_var(mnt_writers);
366	}	280	}
367	EXPORT_SYMBOL_GPL(mnt_drop_write);	281	EXPORT_SYMBOL_GPL(mnt_drop_write);
368		282
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
370	{	284	{
371	int ret = 0;	285	int ret = 0;
372		286
373	lock_mnt_writers();	287	spin_lock(&vfsmount_lock);
		288	mnt->mnt_flags |= MNT_WRITE_HOLD;
374	/*	289	/*
375	* With all the locks held, this value is stable	290	* After storing MNT_WRITE_HOLD, we'll read the counters. This store
		291	* should be visible before we do.
376	*/	292	*/
377	if (atomic_read(&mnt->__mnt_writers) > 0) {	293	smp_mb();
378	ret = -EBUSY;	294
379	goto out;
380	}
381	/*	295	/*
382	* nobody can do a successful mnt_want_write() with all	296	* With writers on hold, if this value is zero, then there are
383	* of the counts in MNT_DENIED_WRITE and the locks held.	297	* definitely no active writers (although held writers may subsequently
		298	* increment the count, they'll have to wait, and decrement it after
		299	* seeing MNT_READONLY).
		300	*
		301	* It is OK to have counter incremented on one CPU and decremented on
		302	* another: the sum will add up correctly. The danger would be when we
		303	* sum up each counter, if we read a counter before it is incremented,
		304	* but then read another CPU's count which it has been subsequently
		305	* decremented from -- we would see more decrements than we should.
		306	* MNT_WRITE_HOLD protects against this scenario, because
		307	* mnt_want_write first increments count, then smp_mb, then spins on
		308	* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
		309	* we're counting up here.
384	*/	310	*/
385	spin_lock(&vfsmount_lock);	311	if (count_mnt_writers(mnt) > 0)
386	if (!ret)	312	ret = -EBUSY;
		313	else
387	mnt->mnt_flags |= MNT_READONLY;	314	mnt->mnt_flags |= MNT_READONLY;
		315	/*
		316	* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
		317	* that become unheld will see MNT_READONLY.
		318	*/
		319	smp_wmb();
		320	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
388	spin_unlock(&vfsmount_lock);	321	spin_unlock(&vfsmount_lock);
389	out:
390	unlock_mnt_writers();
391	return ret;	322	return ret;
392	}	323	}
393		324
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
410	{	341	{
411	kfree(mnt->mnt_devname);	342	kfree(mnt->mnt_devname);
412	mnt_free_id(mnt);	343	mnt_free_id(mnt);
		344	#ifdef CONFIG_SMP
		345	free_percpu(mnt->mnt_writers);
		346	#endif
413	kmem_cache_free(mnt_cache, mnt);	347	kmem_cache_free(mnt_cache, mnt);
414	}	348	}
415		349
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
604		538
605	static inline void __mntput(struct vfsmount *mnt)	539	static inline void __mntput(struct vfsmount *mnt)
606	{	540	{
607	int cpu;
608	struct super_block *sb = mnt->mnt_sb;	541	struct super_block *sb = mnt->mnt_sb;
609	/*	542	/*
610	* We don't have to hold all of the locks at the
611	* same time here because we know that we're the
612	* last reference to mnt and that no new writers
613	* can come in.
614	*/
615	for_each_possible_cpu(cpu) {
616	struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617	spin_lock(&cpu_writer->lock);
618	if (cpu_writer->mnt != mnt) {
619	spin_unlock(&cpu_writer->lock);
620	continue;
621	}
622	atomic_add(cpu_writer->count, &mnt->__mnt_writers);
623	cpu_writer->count = 0;
624	/*
625	* Might as well do this so that no one
626	* ever sees the pointer and expects
627	* it to be valid.
628	*/
629	cpu_writer->mnt = NULL;
630	spin_unlock(&cpu_writer->lock);
631	}
632	/*
633	* This probably indicates that somebody messed	543	* This probably indicates that somebody messed
634	* up a mnt_want/drop_write() pair. If this	544	* up a mnt_want/drop_write() pair. If this
635	* happens, the filesystem was probably unable	545	* happens, the filesystem was probably unable
636	* to make r/w->r/o transitions.	546	* to make r/w->r/o transitions.
637	*/	547	*/
638	WARN_ON(atomic_read(&mnt->__mnt_writers));	548	/*
		549	* atomic_dec_and_lock() used to deal with ->mnt_count decrements
		550	* provides barriers, so count_mnt_writers() below is safe. AV
		551	*/
		552	WARN_ON(count_mnt_writers(mnt));
639	dput(mnt->mnt_root);	553	dput(mnt->mnt_root);
640	free_vfsmnt(mnt);	554	free_vfsmnt(mnt);
641	deactivate_super(sb);	555	deactivate_super(sb);


diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff..ac49c1f8e5c0 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
30	#define MNT_STRICTATIME 0x80	30	#define MNT_STRICTATIME 0x80
31		31
32	#define MNT_SHRINKABLE 0x100	32	#define MNT_SHRINKABLE 0x100
33	#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */	33	#define MNT_WRITE_HOLD 0x200
34		34
35	#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */	35	#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
36	#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */	36	#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
65	int mnt_expiry_mark; /* true if marked for expiry */	65	int mnt_expiry_mark; /* true if marked for expiry */
66	int mnt_pinned;	66	int mnt_pinned;
67	int mnt_ghosts;	67	int mnt_ghosts;
68	/*	68	#ifdef CONFIG_SMP
69	* This value is not stable unless all of the mnt_writers[] spinlocks	69	int *mnt_writers;
70	* are held, and all mnt_writer[]s on this mount have 0 as their ->count	70	#else
71	*/	71	int mnt_writers;
72	atomic_t __mnt_writers;	72	#endif
73	};	73	};
74		74
		75	static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
		76	{
		77	#ifdef CONFIG_SMP
		78	return mnt->mnt_writers;
		79	#else
		80	return &mnt->mnt_writers;
		81	#endif
		82	}
		83
75	static inline struct vfsmount *mntget(struct vfsmount *mnt)	84	static inline struct vfsmount *mntget(struct vfsmount *mnt)
76	{	85	{
77	if (mnt)	86	if (mnt)