aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authornpiggin@suse.de <npiggin@suse.de>2009-04-26 06:25:54 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2009-06-11 21:36:02 -0400
commitd3ef3d7351ccfbef3e5d926efc5ee332136f40d4 (patch)
treebd875a2b267ae03b350e259675ccb1a04453b9b9
parent3174c21b74b56c6a53fddd41a30fd6f757a32bd0 (diff)
fs: mnt_want_write speedup
This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it. A microbenchmark yes, but it exercises some important paths in the mm. Before: avg = 501.9 std = 14.7773 After: avg = 462.286 std = 5.46106 (50 runs of each, stddev gives a reasonable confidence, but there is quite a bit of variation there still) It does this by removing the complex per-cpu locking and counter-cache and replaces it with a percpu counter in struct vfsmount. This makes the code much simpler, and avoids spinlocks (although the msync is still pretty costly, unfortunately). It results in about 900 bytes smaller code too. It does increase the size of a vfsmount, however. It should also give a speedup on large systems if CPUs are frequently operating on different mounts (because the existing scheme has to operate on an atomic in the struct vfsmount when switching between mounts). But I'm most interested in the single threaded path performance for the moment. [AV: minor cleanup] Cc: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--fs/namespace.c268
-rw-r--r--include/linux/mount.h21
2 files changed, 106 insertions, 183 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff..22ae06ad751d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
131 INIT_LIST_HEAD(&mnt->mnt_share); 131 INIT_LIST_HEAD(&mnt->mnt_share);
132 INIT_LIST_HEAD(&mnt->mnt_slave_list); 132 INIT_LIST_HEAD(&mnt->mnt_slave_list);
133 INIT_LIST_HEAD(&mnt->mnt_slave); 133 INIT_LIST_HEAD(&mnt->mnt_slave);
134 atomic_set(&mnt->__mnt_writers, 0); 134#ifdef CONFIG_SMP
135 mnt->mnt_writers = alloc_percpu(int);
136 if (!mnt->mnt_writers)
137 goto out_free_devname;
138#else
139 mnt->mnt_writers = 0;
140#endif
135 } 141 }
136 return mnt; 142 return mnt;
137 143
144#ifdef CONFIG_SMP
145out_free_devname:
146 kfree(mnt->mnt_devname);
147#endif
138out_free_id: 148out_free_id:
139 mnt_free_id(mnt); 149 mnt_free_id(mnt);
140out_free_cache: 150out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
171} 181}
172EXPORT_SYMBOL_GPL(__mnt_is_readonly); 182EXPORT_SYMBOL_GPL(__mnt_is_readonly);
173 183
174struct mnt_writer { 184static inline void inc_mnt_writers(struct vfsmount *mnt)
175 /* 185{
176 * If holding multiple instances of this lock, they 186#ifdef CONFIG_SMP
177 * must be ordered by cpu number. 187 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
178 */ 188#else
179 spinlock_t lock; 189 mnt->mnt_writers++;
180 struct lock_class_key lock_class; /* compiles out with !lockdep */ 190#endif
181 unsigned long count; 191}
182 struct vfsmount *mnt;
183} ____cacheline_aligned_in_smp;
184static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
185 192
186static int __init init_mnt_writers(void) 193static inline void dec_mnt_writers(struct vfsmount *mnt)
187{ 194{
188 int cpu; 195#ifdef CONFIG_SMP
189 for_each_possible_cpu(cpu) { 196 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
190 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); 197#else
191 spin_lock_init(&writer->lock); 198 mnt->mnt_writers--;
192 lockdep_set_class(&writer->lock, &writer->lock_class); 199#endif
193 writer->count = 0;
194 }
195 return 0;
196} 200}
197fs_initcall(init_mnt_writers);
198 201
199static void unlock_mnt_writers(void) 202static unsigned int count_mnt_writers(struct vfsmount *mnt)
200{ 203{
204#ifdef CONFIG_SMP
205 unsigned int count = 0;
201 int cpu; 206 int cpu;
202 struct mnt_writer *cpu_writer;
203 207
204 for_each_possible_cpu(cpu) { 208 for_each_possible_cpu(cpu) {
205 cpu_writer = &per_cpu(mnt_writers, cpu); 209 count += *per_cpu_ptr(mnt->mnt_writers, cpu);
206 spin_unlock(&cpu_writer->lock);
207 } 210 }
208}
209 211
210static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) 212 return count;
211{ 213#else
212 if (!cpu_writer->mnt) 214 return mnt->mnt_writers;
213 return; 215#endif
214 /*
215 * This is in case anyone ever leaves an invalid,
216 * old ->mnt and a count of 0.
217 */
218 if (!cpu_writer->count)
219 return;
220 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
221 cpu_writer->count = 0;
222}
223 /*
224 * must hold cpu_writer->lock
225 */
226static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
227 struct vfsmount *mnt)
228{
229 if (cpu_writer->mnt == mnt)
230 return;
231 __clear_mnt_count(cpu_writer);
232 cpu_writer->mnt = mnt;
233} 216}
234 217
235/* 218/*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
253int mnt_want_write(struct vfsmount *mnt) 236int mnt_want_write(struct vfsmount *mnt)
254{ 237{
255 int ret = 0; 238 int ret = 0;
256 struct mnt_writer *cpu_writer;
257 239
258 cpu_writer = &get_cpu_var(mnt_writers); 240 preempt_disable();
259 spin_lock(&cpu_writer->lock); 241 inc_mnt_writers(mnt);
242 /*
243 * The store to inc_mnt_writers must be visible before we pass
244 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
245 * incremented count after it has set MNT_WRITE_HOLD.
246 */
247 smp_mb();
248 while (mnt->mnt_flags & MNT_WRITE_HOLD)
249 cpu_relax();
250 /*
251 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
252 * be set to match its requirements. So we must not load that until
253 * MNT_WRITE_HOLD is cleared.
254 */
255 smp_rmb();
260 if (__mnt_is_readonly(mnt)) { 256 if (__mnt_is_readonly(mnt)) {
257 dec_mnt_writers(mnt);
261 ret = -EROFS; 258 ret = -EROFS;
262 goto out; 259 goto out;
263 } 260 }
264 use_cpu_writer_for_mount(cpu_writer, mnt);
265 cpu_writer->count++;
266out: 261out:
267 spin_unlock(&cpu_writer->lock); 262 preempt_enable();
268 put_cpu_var(mnt_writers);
269 return ret; 263 return ret;
270} 264}
271EXPORT_SYMBOL_GPL(mnt_want_write); 265EXPORT_SYMBOL_GPL(mnt_want_write);
272 266
273static void lock_mnt_writers(void)
274{
275 int cpu;
276 struct mnt_writer *cpu_writer;
277
278 for_each_possible_cpu(cpu) {
279 cpu_writer = &per_cpu(mnt_writers, cpu);
280 spin_lock(&cpu_writer->lock);
281 __clear_mnt_count(cpu_writer);
282 cpu_writer->mnt = NULL;
283 }
284}
285
286/*
287 * These per-cpu write counts are not guaranteed to have
288 * matched increments and decrements on any given cpu.
289 * A file open()ed for write on one cpu and close()d on
290 * another cpu will imbalance this count. Make sure it
291 * does not get too far out of whack.
292 */
293static void handle_write_count_underflow(struct vfsmount *mnt)
294{
295 if (atomic_read(&mnt->__mnt_writers) >=
296 MNT_WRITER_UNDERFLOW_LIMIT)
297 return;
298 /*
299 * It isn't necessary to hold all of the locks
300 * at the same time, but doing it this way makes
301 * us share a lot more code.
302 */
303 lock_mnt_writers();
304 /*
305 * vfsmount_lock is for mnt_flags.
306 */
307 spin_lock(&vfsmount_lock);
308 /*
309 * If coalescing the per-cpu writer counts did not
310 * get us back to a positive writer count, we have
311 * a bug.
312 */
313 if ((atomic_read(&mnt->__mnt_writers) < 0) &&
314 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
315 WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
316 "count: %d\n",
317 mnt, atomic_read(&mnt->__mnt_writers));
318 /* use the flag to keep the dmesg spam down */
319 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
320 }
321 spin_unlock(&vfsmount_lock);
322 unlock_mnt_writers();
323}
324
325/** 267/**
326 * mnt_drop_write - give up write access to a mount 268 * mnt_drop_write - give up write access to a mount
327 * @mnt: the mount on which to give up write access 269 * @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
332 */ 274 */
333void mnt_drop_write(struct vfsmount *mnt) 275void mnt_drop_write(struct vfsmount *mnt)
334{ 276{
335 int must_check_underflow = 0; 277 preempt_disable();
336 struct mnt_writer *cpu_writer; 278 dec_mnt_writers(mnt);
337 279 preempt_enable();
338 cpu_writer = &get_cpu_var(mnt_writers);
339 spin_lock(&cpu_writer->lock);
340
341 use_cpu_writer_for_mount(cpu_writer, mnt);
342 if (cpu_writer->count > 0) {
343 cpu_writer->count--;
344 } else {
345 must_check_underflow = 1;
346 atomic_dec(&mnt->__mnt_writers);
347 }
348
349 spin_unlock(&cpu_writer->lock);
350 /*
351 * Logically, we could call this each time,
352 * but the __mnt_writers cacheline tends to
353 * be cold, and makes this expensive.
354 */
355 if (must_check_underflow)
356 handle_write_count_underflow(mnt);
357 /*
358 * This could be done right after the spinlock
359 * is taken because the spinlock keeps us on
360 * the cpu, and disables preemption. However,
361 * putting it here bounds the amount that
362 * __mnt_writers can underflow. Without it,
363 * we could theoretically wrap __mnt_writers.
364 */
365 put_cpu_var(mnt_writers);
366} 280}
367EXPORT_SYMBOL_GPL(mnt_drop_write); 281EXPORT_SYMBOL_GPL(mnt_drop_write);
368 282
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
370{ 284{
371 int ret = 0; 285 int ret = 0;
372 286
373 lock_mnt_writers(); 287 spin_lock(&vfsmount_lock);
288 mnt->mnt_flags |= MNT_WRITE_HOLD;
374 /* 289 /*
375 * With all the locks held, this value is stable 290 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
291 * should be visible before we do.
376 */ 292 */
377 if (atomic_read(&mnt->__mnt_writers) > 0) { 293 smp_mb();
378 ret = -EBUSY; 294
379 goto out;
380 }
381 /* 295 /*
382 * nobody can do a successful mnt_want_write() with all 296 * With writers on hold, if this value is zero, then there are
383 * of the counts in MNT_DENIED_WRITE and the locks held. 297 * definitely no active writers (although held writers may subsequently
298 * increment the count, they'll have to wait, and decrement it after
299 * seeing MNT_READONLY).
300 *
301 * It is OK to have counter incremented on one CPU and decremented on
302 * another: the sum will add up correctly. The danger would be when we
303 * sum up each counter, if we read a counter before it is incremented,
304 * but then read another CPU's count which it has been subsequently
305 * decremented from -- we would see more decrements than we should.
306 * MNT_WRITE_HOLD protects against this scenario, because
307 * mnt_want_write first increments count, then smp_mb, then spins on
308 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
309 * we're counting up here.
384 */ 310 */
385 spin_lock(&vfsmount_lock); 311 if (count_mnt_writers(mnt) > 0)
386 if (!ret) 312 ret = -EBUSY;
313 else
387 mnt->mnt_flags |= MNT_READONLY; 314 mnt->mnt_flags |= MNT_READONLY;
315 /*
316 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
317 * that become unheld will see MNT_READONLY.
318 */
319 smp_wmb();
320 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
388 spin_unlock(&vfsmount_lock); 321 spin_unlock(&vfsmount_lock);
389out:
390 unlock_mnt_writers();
391 return ret; 322 return ret;
392} 323}
393 324
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
410{ 341{
411 kfree(mnt->mnt_devname); 342 kfree(mnt->mnt_devname);
412 mnt_free_id(mnt); 343 mnt_free_id(mnt);
344#ifdef CONFIG_SMP
345 free_percpu(mnt->mnt_writers);
346#endif
413 kmem_cache_free(mnt_cache, mnt); 347 kmem_cache_free(mnt_cache, mnt);
414} 348}
415 349
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
604 538
605static inline void __mntput(struct vfsmount *mnt) 539static inline void __mntput(struct vfsmount *mnt)
606{ 540{
607 int cpu;
608 struct super_block *sb = mnt->mnt_sb; 541 struct super_block *sb = mnt->mnt_sb;
609 /* 542 /*
610 * We don't have to hold all of the locks at the
611 * same time here because we know that we're the
612 * last reference to mnt and that no new writers
613 * can come in.
614 */
615 for_each_possible_cpu(cpu) {
616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617 spin_lock(&cpu_writer->lock);
618 if (cpu_writer->mnt != mnt) {
619 spin_unlock(&cpu_writer->lock);
620 continue;
621 }
622 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
623 cpu_writer->count = 0;
624 /*
625 * Might as well do this so that no one
626 * ever sees the pointer and expects
627 * it to be valid.
628 */
629 cpu_writer->mnt = NULL;
630 spin_unlock(&cpu_writer->lock);
631 }
632 /*
633 * This probably indicates that somebody messed 543 * This probably indicates that somebody messed
634 * up a mnt_want/drop_write() pair. If this 544 * up a mnt_want/drop_write() pair. If this
635 * happens, the filesystem was probably unable 545 * happens, the filesystem was probably unable
636 * to make r/w->r/o transitions. 546 * to make r/w->r/o transitions.
637 */ 547 */
638 WARN_ON(atomic_read(&mnt->__mnt_writers)); 548 /*
549 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
550 * provides barriers, so count_mnt_writers() below is safe. AV
551 */
552 WARN_ON(count_mnt_writers(mnt));
639 dput(mnt->mnt_root); 553 dput(mnt->mnt_root);
640 free_vfsmnt(mnt); 554 free_vfsmnt(mnt);
641 deactivate_super(sb); 555 deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff..ac49c1f8e5c0 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
30#define MNT_STRICTATIME 0x80 30#define MNT_STRICTATIME 0x80
31 31
32#define MNT_SHRINKABLE 0x100 32#define MNT_SHRINKABLE 0x100
33#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ 33#define MNT_WRITE_HOLD 0x200
34 34
35#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ 35#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
36#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ 36#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
65 int mnt_expiry_mark; /* true if marked for expiry */ 65 int mnt_expiry_mark; /* true if marked for expiry */
66 int mnt_pinned; 66 int mnt_pinned;
67 int mnt_ghosts; 67 int mnt_ghosts;
68 /* 68#ifdef CONFIG_SMP
69 * This value is not stable unless all of the mnt_writers[] spinlocks 69 int *mnt_writers;
70 * are held, and all mnt_writer[]s on this mount have 0 as their ->count 70#else
71 */ 71 int mnt_writers;
72 atomic_t __mnt_writers; 72#endif
73}; 73};
74 74
75static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
76{
77#ifdef CONFIG_SMP
78 return mnt->mnt_writers;
79#else
80 return &mnt->mnt_writers;
81#endif
82}
83
75static inline struct vfsmount *mntget(struct vfsmount *mnt) 84static inline struct vfsmount *mntget(struct vfsmount *mnt)
76{ 85{
77 if (mnt) 86 if (mnt)