Diffstat (limited to 'fs/namespace.c')
-rw-r--r--	fs/namespace.c	327
1 file changed, 139 insertions, 188 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494158d9..2dd333b0fe7f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
-/*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-					  struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
@@ -253,74 +236,73 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
+/**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * After finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+int mnt_clone_write(struct vfsmount *mnt)
+{
+	/* superblock may be r/o */
+	if (__mnt_is_readonly(mnt))
+		return -EROFS;
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	preempt_enable();
+	return 0;
 }
+EXPORT_SYMBOL_GPL(mnt_clone_write);
 
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count. Make sure it
- * does not get too far out of whack.
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
  */
-static void handle_write_count_underflow(struct vfsmount *mnt)
+int mnt_want_write_file(struct file *file)
 {
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-			mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
+	if (!(file->f_mode & FMODE_WRITE))
+		return mnt_want_write(file->f_path.mnt);
+	else
+		return mnt_clone_write(file->f_path.mnt);
 }
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
  * mnt_drop_write - give up write access to a mount
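All three entry points above pair with mnt_drop_write() below. As a rough illustration only (this is not code from the patch, and do_file_update() is a made-up placeholder), an ioctl-style caller that already holds an open file would now do something like:

	/* hypothetical caller sketch: file already open, possibly for write */
	err = mnt_want_write_file(filp);
	if (err)
		return err;
	err = do_file_update(filp);		/* the actual write goes here */
	mnt_drop_write(filp->f_path.mnt);
	return err;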
@@ -332,37 +314,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption. However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow. Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -370,24 +324,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
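The ordering argument spelled out in the comments above can be exercised outside the kernel. The following is a self-contained user-space sketch of the same protocol (my illustration, not part of this patch; every name in it is invented for the demo): per-thread counters stand in for the per-cpu mnt_writers, an atomic flags word stands in for mnt->mnt_flags, a mutex stands in for vfsmount_lock, and C11 fences stand in for smp_mb()/smp_rmb()/smp_wmb().

/* build: cc -O2 -pthread writehold_demo.c -o writehold_demo */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS	4
#define WRITE_HOLD	1		/* stands in for MNT_WRITE_HOLD */
#define READONLY	2		/* stands in for MNT_READONLY */

static atomic_int flags;
static struct { atomic_int count; char pad[60]; } writers[NTHREADS];	/* per-thread, like per-cpu */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;		/* stands in for vfsmount_lock */

static int want_write(int id)
{
	atomic_fetch_add_explicit(&writers[id].count, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);			/* smp_mb() */
	while (atomic_load_explicit(&flags, memory_order_relaxed) & WRITE_HOLD)
		;							/* cpu_relax() */
	atomic_thread_fence(memory_order_acquire);			/* smp_rmb() */
	if (atomic_load_explicit(&flags, memory_order_relaxed) & READONLY) {
		atomic_fetch_sub_explicit(&writers[id].count, 1, memory_order_relaxed);
		return -1;						/* -EROFS */
	}
	return 0;
}

static void drop_write(int id)
{
	atomic_fetch_sub_explicit(&writers[id].count, 1, memory_order_relaxed);
}

static int make_readonly(void)
{
	int i, sum = 0, ret = 0;

	pthread_mutex_lock(&lock);
	atomic_fetch_or_explicit(&flags, WRITE_HOLD, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);			/* smp_mb() */
	for (i = 0; i < NTHREADS; i++)					/* count_mnt_writers() */
		sum += atomic_load_explicit(&writers[i].count, memory_order_relaxed);
	if (sum > 0)
		ret = -1;						/* -EBUSY */
	else
		atomic_fetch_or_explicit(&flags, READONLY, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);			/* smp_wmb() */
	atomic_fetch_and_explicit(&flags, ~WRITE_HOLD, memory_order_relaxed);
	pthread_mutex_unlock(&lock);
	return ret;
}

static void *writer(void *arg)
{
	int id = (int)(long)arg;
	long ok = 0, tries;

	for (tries = 0; tries < 200000; tries++) {
		if (want_write(id))
			break;			/* the "mount" went read-only */
		ok++;				/* the protected write would happen here */
		drop_write(id);
	}
	return (void *)ok;
}

int main(void)
{
	pthread_t th[NTHREADS];
	void *ok;
	long i, total = 0;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&th[i], NULL, writer, (void *)i);
	while (make_readonly())			/* retry until no writer holds a count */
		;
	for (i = 0; i < NTHREADS; i++) {
		pthread_join(th[i], &ok);
		total += (long)ok;
	}
	printf("went read-only after %ld completed writes; none raced past it\n", total);
	return 0;
}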
@@ -410,6 +381,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -442,11 +416,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
  * lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct vfsmount *child_mnt;
 	spin_lock(&vfsmount_lock);
-	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
+	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
 		mntget(child_mnt);
 	spin_unlock(&vfsmount_lock);
 	return child_mnt;
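Callers change in step with the new signature; a hypothetical before/after at a call site (variable names assumed, not taken from this hunk):

	/* before */
	mounted = lookup_mnt(path->mnt, path->dentry);
	/* after */
	mounted = lookup_mnt(path);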
@@ -604,38 +578,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
 	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
-	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair. If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe. AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
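The "AV" note above leans on the barrier provided by the caller's atomic_dec_and_lock(); for orientation, a simplified sketch of that caller (the real mntput_no_expire() also handles mnt_pinned, which is omitted here):

void mntput_no_expire(struct vfsmount *mnt)
{
	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
		/* last reference gone: the atomic RMW plus the lock order
		 * the later count_mnt_writers() sum inside __mntput() */
		spin_unlock(&vfsmount_lock);
		__mntput(mnt);
	}
}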
@@ -1106,11 +1060,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 * we just try to remount it readonly.
 		 */
 		down_write(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			lock_kernel();
+		if (!(sb->s_flags & MS_RDONLY))
 			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
-			unlock_kernel();
-		}
 		up_write(&sb->s_umount);
 		return retval;
 	}
@@ -1253,11 +1204,11 @@ Enomem:
 	return NULL;
 }
 
-struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *collect_mounts(struct path *path)
 {
 	struct vfsmount *tree;
 	down_write(&namespace_sem);
-	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
+	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
 	up_write(&namespace_sem);
 	return tree;
 }
@@ -1430,7 +1381,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 		goto out_unlock;
 
 	err = -ENOENT;
-	if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+	if (!d_unlinked(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
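d_unlinked() replaces the open-coded test in this and the following hunks; its definition is not part of this diff, but presumably it is the obvious wrapper:

/* assumed helper, in include/linux/dcache.h (not shown in this diff) */
static inline int d_unlinked(struct dentry *dentry)
{
	return d_unhashed(dentry) && !IS_ROOT(dentry);
}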
@@ -1601,7 +1552,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	down_write(&namespace_sem);
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1612,7 +1563,7 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
+	if (d_unlinked(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
@@ -1676,7 +1627,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
+	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1695,10 +1648,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
-	if (!check_mnt(path->mnt))
+	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
 		goto unlock;
 
 	/* Refuse the same filesystem on the same mount point */
@@ -2092,10 +2045,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (retval < 0)
 		goto out3;
 
-	lock_kernel();
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
-	unlock_kernel();
 	free_page(data_page);
 
 out3:
@@ -2175,9 +2126,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = -ENOENT;
 	if (IS_DEADDIR(new.dentry->d_inode))
 		goto out2;
-	if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
+	if (d_unlinked(new.dentry))
 		goto out2;
-	if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
+	if (d_unlinked(old.dentry))
 		goto out2;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||