-rw-r--r--  fs/namespace.c        | 268
-rw-r--r--  include/linux/mount.h |  21
2 files changed, 106 insertions, 183 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff..22ae06ad751d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
-/*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-					    struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-				mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
-}
-
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption. However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow. Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
 	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
-	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair. If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe. AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff..ac49c1f8e5c0 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 #define MNT_STRICTATIME 0x80
 
 #define MNT_SHRINKABLE	0x100
-#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
+#define MNT_WRITE_HOLD	0x200
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
-	/*
-	 * This value is not stable unless all of the mnt_writers[] spinlocks
-	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
-	 */
-	atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+	int *mnt_writers;
+#else
+	int mnt_writers;
+#endif
 };
 
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	return mnt->mnt_writers;
+#else
+	return &mnt->mnt_writers;
+#endif
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
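
Illustrative note (not part of the patch): the hunks above replace a spinlock-protected global writer count with distributed per-CPU counters plus the MNT_WRITE_HOLD flag. Below is a minimal userspace sketch of that idea using C11 atomics, with per-slot counters standing in for the per-CPU counters. Every name in it (mount_model, model_want_write, MODEL_WRITE_HOLD, and so on) is invented for the example and is not kernel API, and the kernel's use of preempt_disable() and vfsmount_lock has no direct analogue here.

/*
 * Standalone model of the per-CPU writer count + write-hold scheme.
 * Single-threaded demo of the state transitions; build with:
 *   cc -std=c11 -o writers_model writers_model.c
 */
#include <stdatomic.h>
#include <stdio.h>

#define NSLOTS 4			/* stands in for the per-CPU counters */

enum { MODEL_WRITE_HOLD = 1u << 0, MODEL_READONLY = 1u << 1 };

struct mount_model {
	_Atomic int writers[NSLOTS];	/* one counter per "CPU" */
	_Atomic unsigned int flags;	/* MODEL_WRITE_HOLD / MODEL_READONLY */
};

/*
 * Fast path, analogous to mnt_want_write(): bump a local counter,
 * then wait out the hold flag before trusting the read-only bit.
 */
static int model_want_write(struct mount_model *m, int slot)
{
	atomic_fetch_add_explicit(&m->writers[slot], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* like smp_mb() */
	while (atomic_load_explicit(&m->flags, memory_order_acquire) &
	       MODEL_WRITE_HOLD)
		;					/* like the cpu_relax() loop */
	if (atomic_load_explicit(&m->flags, memory_order_acquire) &
	    MODEL_READONLY) {
		atomic_fetch_sub_explicit(&m->writers[slot], 1,
					  memory_order_relaxed);
		return -1;				/* -EROFS */
	}
	return 0;
}

static void model_drop_write(struct mount_model *m, int slot)
{
	atomic_fetch_sub_explicit(&m->writers[slot], 1, memory_order_relaxed);
}

/*
 * Slow path, analogous to mnt_make_readonly(): hold off new writers,
 * sum every slot, and only flip to read-only if the total is zero.
 */
static int model_make_readonly(struct mount_model *m)
{
	int count = 0, ret = 0;

	atomic_fetch_or_explicit(&m->flags, MODEL_WRITE_HOLD,
				 memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* like smp_mb() */
	for (int i = 0; i < NSLOTS; i++)
		count += atomic_load_explicit(&m->writers[i],
					      memory_order_relaxed);
	if (count > 0)
		ret = -1;				/* -EBUSY */
	else
		atomic_fetch_or_explicit(&m->flags, MODEL_READONLY,
					 memory_order_release);
	/* Clearing the hold bit releases any spinning would-be writers. */
	atomic_fetch_and_explicit(&m->flags, ~MODEL_WRITE_HOLD,
				  memory_order_release);
	return ret;
}

int main(void)
{
	struct mount_model m = { 0 };

	if (model_want_write(&m, 0) == 0) {
		printf("remount r/o with writer active: %d (expect -1)\n",
		       model_make_readonly(&m));
		model_drop_write(&m, 0);
	}
	printf("remount r/o with no writers:    %d (expect 0)\n",
	       model_make_readonly(&m));
	printf("want_write on read-only mount:  %d (expect -1)\n",
	       model_want_write(&m, 1));
	return 0;
}

The sketch only trusts the summed count while the hold bit is set, mirroring the comment added in mnt_make_readonly(): individual counters may be incremented on one CPU and decremented on another, so only the total is meaningful, and only once new writers are held off.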