 fs/namespace.c        | 268 ++++++++++++++++------------------
 include/linux/mount.h |  21 +++++--
 2 files changed, 106 insertions(+), 183 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff..22ae06ad751d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
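
The hunk above swaps the old atomic writer count for per-CPU storage: on CONFIG_SMP the counter is allocated with alloc_percpu() and unwound through a new out_free_devname label on failure, while !SMP builds fall back to a plain int. A rough userspace model of that allocate-and-unwind shape (not kernel code; mnt_model, alloc_mnt_model, free_mnt_model and NR_CPUS_MODEL are invented for the sketch, and a heap array stands in for real per-CPU data):

/* Userspace model of the alloc/free pattern above (not kernel code):
 * per-"CPU" counter storage is allocated up front and unwound on failure,
 * mirroring the out_free_devname error path added to alloc_vfsmnt().
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS_MODEL 4		/* stand-in for the possible-CPU count */

struct mnt_model {
	char *devname;
	int *writers;		/* one slot per "CPU", like alloc_percpu(int) */
};

static struct mnt_model *alloc_mnt_model(const char *name)
{
	struct mnt_model *mnt = calloc(1, sizeof(*mnt));

	if (!mnt)
		return NULL;
	mnt->devname = malloc(strlen(name) + 1);
	if (!mnt->devname)
		goto out_free_mnt;
	strcpy(mnt->devname, name);
	mnt->writers = calloc(NR_CPUS_MODEL, sizeof(int));
	if (!mnt->writers)
		goto out_free_devname;	/* unwind in reverse order */
	return mnt;

out_free_devname:
	free(mnt->devname);
out_free_mnt:
	free(mnt);
	return NULL;
}

static void free_mnt_model(struct mnt_model *mnt)
{
	free(mnt->writers);	/* pairs with the allocation above */
	free(mnt->devname);
	free(mnt);
}

int main(void)
{
	struct mnt_model *mnt = alloc_mnt_model("modelfs");

	if (!mnt)
		return 1;
	printf("allocated %s with %d counter slots\n", mnt->devname, NR_CPUS_MODEL);
	free_mnt_model(mnt);
	return 0;
}

free_mnt_model() mirrors the free_percpu() call added to free_vfsmnt() further down in the patch.
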
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
-/*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-					  struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
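
These three helpers replace the old struct mnt_writer machinery: each CPU gets its own slot, inc/dec touch only the local slot (callers disable preemption), and only count_mnt_writers() ever walks all slots to form the real total. A small single-threaded userspace model of that bookkeeping (not kernel code; writers[], inc_writers(), dec_writers(), count_writers() and NR_CPUS_MODEL are invented names):

/* Userspace model of inc/dec/count_mnt_writers (not kernel code): the
 * per-"CPU" slots form one logical counter, so an increment on one slot
 * and a decrement on another still sum to the right value.
 */
#include <stdio.h>

#define NR_CPUS_MODEL 4

static int writers[NR_CPUS_MODEL];	/* one slot per "CPU" */

static void inc_writers(int cpu) { writers[cpu]++; }
static void dec_writers(int cpu) { writers[cpu]--; }

static int count_writers(void)
{
	int cpu, count = 0;

	for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
		count += writers[cpu];	/* only the sum is meaningful */
	return count;
}

int main(void)
{
	inc_writers(0);		/* "open for write" runs on CPU 0 */
	inc_writers(1);
	dec_writers(3);		/* the matching "close" migrated to CPU 3 */

	/* Slots now read 1, 1, 0, -1; the logical writer count is 1. */
	printf("writer count = %d\n", count_writers());
	return 0;
}

The point of the model is the last line: individual slots may go negative when an open() and its close() land on different CPUs, but the sum stays correct, which is all the read-only transition needs.
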
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-				mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
-}
-
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
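
The rewritten mnt_want_write() fast path takes no lock at all: it publishes its increment, spins while the remount slow path holds MNT_WRITE_HOLD, and only then tests the read-only flag, backing the increment out on -EROFS. A userspace model of that ordering using C11 atomics (not kernel code; want_write(), FLAG_READONLY, FLAG_WRITE_HOLD and a single global counter are simplifications, and seq_cst atomics stand in for the explicit smp_mb()/smp_rmb()):

/* Userspace model of the mnt_want_write() fast path (not kernel code).
 * C11 seq_cst atomics stand in for smp_mb()/smp_rmb(); preempt_disable()
 * has no analogue here and is omitted.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

#define FLAG_READONLY   0x1
#define FLAG_WRITE_HOLD 0x2

static atomic_int mnt_flags;
static atomic_int writers;	/* model: one global counter, not per CPU */

static int want_write(void)
{
	/* Publish our intent to write before looking at the hold flag. */
	atomic_fetch_add(&writers, 1);

	/* If the remount path is mid-transition, wait until it finishes. */
	while (atomic_load(&mnt_flags) & FLAG_WRITE_HOLD)
		;	/* spin; the kernel uses cpu_relax() here */

	/* Only now is the read-only flag stable enough to test. */
	if (atomic_load(&mnt_flags) & FLAG_READONLY) {
		atomic_fetch_sub(&writers, 1);	/* back out our count */
		return -EROFS;
	}
	return 0;
}

int main(void)
{
	printf("want_write on rw mount: %d\n", want_write());

	atomic_fetch_or(&mnt_flags, FLAG_READONLY);
	printf("want_write on ro mount: %d\n", want_write());
	return 0;
}

With no remount in flight this boils down to one local increment and a flag test, with no shared lock or cache line to bounce, which is the point of the patch.
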
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption. However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow. Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
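
mnt_drop_write() is now just the matching decrement under preempt_disable(); the old underflow bookkeeping disappears because individual per-CPU slots are allowed to go negative. A sketch of the calling convention filesystems are expected to follow (not kernel code; want_write(), drop_write() and touch_file() are invented stand-ins for mnt_want_write(), mnt_drop_write() and a write path):

/* Userspace model of the mnt_want_write()/mnt_drop_write() calling
 * convention (not kernel code): every modification of the mount is
 * bracketed by a want/drop pair, and -EROFS aborts the operation early.
 */
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

static bool mnt_readonly;
static int  mnt_writers;	/* model: single counter, single thread */

static int want_write(void)
{
	mnt_writers++;
	if (mnt_readonly) {
		mnt_writers--;	/* back out, exactly as mnt_want_write() does */
		return -EROFS;
	}
	return 0;
}

static void drop_write(void)
{
	mnt_writers--;
}

/* A caller in the style of a filesystem write path. */
static int touch_file(const char *name)
{
	int err = want_write();

	if (err)
		return err;
	printf("updating %s\n", name);	/* the actual modification */
	drop_write();
	return 0;
}

int main(void)
{
	printf("rw: %d\n", touch_file("a"));
	mnt_readonly = true;
	printf("ro: %d\n", touch_file("b"));
	return 0;
}
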
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
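
mnt_make_readonly() is the slow path that makes the lockless fast path safe: it sets MNT_WRITE_HOLD so new writers spin, sums the per-CPU slots while they are frozen, flips MNT_READONLY only on a zero sum, and finally releases the held writers. A single-threaded userspace model of that sequence (not kernel code; make_readonly(), writers[], write_hold and NR_CPUS_MODEL are invented, and the comments only mark where the kernel's smp_mb()/smp_wmb() sit, since a toy like this needs no barriers):

/* Userspace model of mnt_make_readonly() (not kernel code): hold off new
 * writers, sum the per-"CPU" slots, and flip to read-only only if the sum
 * is zero.  Comments mark where the kernel needs smp_mb()/smp_wmb().
 */
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

#define NR_CPUS_MODEL 4

static int  writers[NR_CPUS_MODEL];
static bool readonly;
static bool write_hold;

static int make_readonly(void)
{
	int cpu, count = 0, ret = 0;

	write_hold = true;	/* kernel: MNT_WRITE_HOLD, then smp_mb() */

	for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
		count += writers[cpu];	/* stable: new writers are spinning */

	if (count > 0)
		ret = -EBUSY;
	else
		readonly = true;	/* kernel: smp_wmb() before releasing */

	write_hold = false;	/* held writers resume and see 'readonly' */
	return ret;
}

int main(void)
{
	writers[2] = 1;		/* one active writer somewhere */
	printf("busy mount: %d\n", make_readonly());

	writers[2] = 0;
	printf("idle mount: %d\n", make_readonly());
	printf("readonly is now %s\n", readonly ? "set" : "clear");
	return 0;
}
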
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
 	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
-	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair. If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe. AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff..ac49c1f8e5c0 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 #define MNT_STRICTATIME 0x80
 
 #define MNT_SHRINKABLE 0x100
-#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_WRITE_HOLD 0x200
 
 #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
-	/*
-	 * This value is not stable unless all of the mnt_writers[] spinlocks
-	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
-	 */
-	atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+	int *mnt_writers;
+#else
+	int mnt_writers;
+#endif
 };
 
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	return mnt->mnt_writers;
+#else
+	return &mnt->mnt_writers;
+#endif
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
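
The header change keeps one field name for both configurations and hides the representation difference: a pointer to per-CPU counters under CONFIG_SMP, a plain int otherwise, with get_mnt_writers_ptr() giving callers a uniform int *. A userspace model of that #ifdef-switched accessor pattern (not kernel code; mnt_model, writers_ptr() and the SMP_MODEL macro are invented; build with -DSMP_MODEL to get the pointer-based layout):

/* Userspace model of the mnt_writers representation switch (not kernel
 * code): one accessor hides whether the counter lives behind a pointer
 * (the "SMP" layout) or directly in the structure (the "UP" layout).
 */
#include <stdio.h>
#include <stdlib.h>

struct mnt_model {
#ifdef SMP_MODEL
	int *writers;		/* like the per-CPU pointer on CONFIG_SMP */
#else
	int writers;		/* like the plain int on !CONFIG_SMP */
#endif
};

static int *writers_ptr(struct mnt_model *mnt)
{
#ifdef SMP_MODEL
	return mnt->writers;
#else
	return &mnt->writers;
#endif
}

int main(void)
{
	struct mnt_model mnt = { 0 };

#ifdef SMP_MODEL
	mnt.writers = calloc(1, sizeof(int));	/* one slot stands in for per-CPU storage */
	if (!mnt.writers)
		return 1;
#endif
	(*writers_ptr(&mnt))++;		/* callers never see the #ifdef */
	printf("writers = %d\n", *writers_ptr(&mnt));
#ifdef SMP_MODEL
	free(mnt.writers);
#endif
	return 0;
}

Keeping the #ifdef inside the accessor means call sites compile unchanged in either configuration, which is the same trade the patch makes in mount.h.
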