author     npiggin@suse.de <npiggin@suse.de>        2009-04-26 06:25:54 -0400
committer  Al Viro <viro@zeniv.linux.org.uk>        2009-06-11 21:36:02 -0400
commit     d3ef3d7351ccfbef3e5d926efc5ee332136f40d4
tree       bd875a2b267ae03b350e259675ccb1a04453b9b9
parent     3174c21b74b56c6a53fddd41a30fd6f757a32bd0
fs: mnt_want_write speedup
This patch speeds up the lmbench lat_mmap test by about 8%. lat_mmap is set up
basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
A microbenchmark, yes, but it exercises some important paths in the mm.
Before:
avg = 501.9
std = 14.7773
After:
avg = 462.286
std = 5.46106
(50 runs of each; the stddev gives reasonable confidence, but there is still
quite a bit of variation.)
It does this by removing the complex per-cpu locking and counter cache and
replacing it with a percpu counter in struct vfsmount. This makes the code
much simpler and avoids spinlocks (although the msync is still pretty
costly, unfortunately). It also makes the code about 900 bytes smaller. It
does increase the size of a vfsmount, however.
It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single-threaded path performance for the moment.
[AV: minor cleanup]
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
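The scheme described above (each CPU keeps its own writer count, and the
read-only transition sets MNT_WRITE_HOLD so it can take a stable sum) can be
modelled with plain C11 atomics. The following sketch is an illustration only,
not the kernel code: numbered slots stand in for CPUs, C11 fences stand in for
smp_mb()/smp_rmb()/smp_wmb(), and it assumes only one read-only transition runs
at a time (the real code serialises that with vfsmount_lock). The actual
implementation is in the diff that follows.

/*
 * Simplified userspace model of the percpu writer count + MNT_WRITE_HOLD
 * scheme.  A sketch for illustration, not the kernel implementation.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

#define NR_SLOTS        16      /* stands in for the number of CPUs */
#define MNT_READONLY    0x1
#define MNT_WRITE_HOLD  0x2

struct mnt_model {
	atomic_int writers[NR_SLOTS];   /* models the percpu mnt_writers counters */
	atomic_int flags;               /* models mnt->mnt_flags */
};

/* Fast path: bump this slot's counter, then wait out any r/o transition. */
static int model_want_write(struct mnt_model *m, int slot)
{
	atomic_fetch_add_explicit(&m->writers[slot], 1, memory_order_relaxed);
	/* Make the increment visible before we sample MNT_WRITE_HOLD. */
	atomic_thread_fence(memory_order_seq_cst);
	while (atomic_load_explicit(&m->flags, memory_order_relaxed) & MNT_WRITE_HOLD)
		;	/* the kernel spins here with cpu_relax() */
	/* Don't read MNT_READONLY until MNT_WRITE_HOLD is seen clear. */
	atomic_thread_fence(memory_order_acquire);
	if (atomic_load_explicit(&m->flags, memory_order_relaxed) & MNT_READONLY) {
		atomic_fetch_sub_explicit(&m->writers[slot], 1, memory_order_relaxed);
		return -EROFS;
	}
	return 0;
}

static void model_drop_write(struct mnt_model *m, int slot)
{
	atomic_fetch_sub_explicit(&m->writers[slot], 1, memory_order_relaxed);
}

/* Slow path: hold new writers, sum the per-slot counts, flip to read-only. */
static int model_make_readonly(struct mnt_model *m)
{
	int sum = 0, ret = 0;

	atomic_fetch_or_explicit(&m->flags, MNT_WRITE_HOLD, memory_order_relaxed);
	/* The flag must be visible before we start summing the counters. */
	atomic_thread_fence(memory_order_seq_cst);
	for (int i = 0; i < NR_SLOTS; i++)
		sum += atomic_load_explicit(&m->writers[i], memory_order_relaxed);
	if (sum > 0)
		ret = -EBUSY;
	else	/* READONLY must be visible before WRITE_HOLD clears */
		atomic_fetch_or_explicit(&m->flags, MNT_READONLY, memory_order_release);
	atomic_fetch_and_explicit(&m->flags, ~MNT_WRITE_HOLD, memory_order_release);
	return ret;
}

int main(void)
{
	struct mnt_model m = { 0 };

	model_want_write(&m, 0);                                    /* one active writer */
	printf("r/o with writer: %d\n", model_make_readonly(&m));   /* -EBUSY */
	model_drop_write(&m, 0);
	printf("r/o when idle:   %d\n", model_make_readonly(&m));   /* 0 */
	printf("write after r/o: %d\n", model_want_write(&m, 1));   /* -EROFS */
	return 0;
}

Run, the model reports a busy result while a writer is active, success once the
writer has dropped, and a read-only failure for any later write attempt.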
-rw-r--r--  fs/namespace.c        | 268
-rw-r--r--  include/linux/mount.h |  21
2 files changed, 106 insertions(+), 183 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff..22ae06ad751d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
-/*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-				struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-			mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
-}
-
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption. However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow. Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
 	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
-	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair. If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe. AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff..ac49c1f8e5c0 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 #define MNT_STRICTATIME 0x80
 
 #define MNT_SHRINKABLE	0x100
-#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD	0x200
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
-	/*
-	 * This value is not stable unless all of the mnt_writers[] spinlocks
-	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
-	 */
-	atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+	int *mnt_writers;
+#else
+	int mnt_writers;
+#endif
 };
 
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	return mnt->mnt_writers;
+#else
+	return &mnt->mnt_writers;
+#endif
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
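For context, a caller of the API touched by this patch brackets a write-side
operation roughly as follows. This is a hypothetical illustration
(example_modify and its struct path argument are stand-ins, not part of the
patch), showing where the mnt_want_write() fast path sits in a filesystem
operation.

#include <linux/mount.h>
#include <linux/path.h>

/* Hypothetical caller: take write access to the mount, modify, drop it. */
static int example_modify(struct path *path)
{
	int err;

	err = mnt_want_write(path->mnt);	/* the fast path sped up here */
	if (err)
		return err;			/* e.g. -EROFS on a read-only mount */

	/* ... perform the actual modification ... */

	mnt_drop_write(path->mnt);
	return 0;
}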