aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorNick Piggin <npiggin@kernel.dk>2010-08-17 14:37:39 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2010-08-18 08:35:48 -0400
commit99b7db7b8ffd6bb755eb0a175596421a0b581cb2 (patch)
treecbaf57d252f0852f967d3fd5a5f87472964a01fe /fs
parent6416ccb7899960868f5016751fb81bf25213d24f (diff)
fs: brlock vfsmount_lock
fs: brlock vfsmount_lock Use a brlock for the vfsmount lock. It must be taken for write whenever modifying the mount hash or associated fields, and may be taken for read when performing mount hash lookups. A new lock is added for the mnt-id allocator, so it doesn't need to take the heavy vfsmount write-lock. The number of atomics should remain the same for fastpath rlock cases, though code would be slightly slower due to per-cpu access. Scalability is not not be much improved in common cases yet, due to other locks (ie. dcache_lock) getting in the way. However path lookups crossing mountpoints should be one case where scalability is improved (currently requiring the global lock). The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node Altix system (high latency to remote nodes), a simple umount microbenchmark (mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it took 6.8s, afterwards took 7.1s, about 5% slower. Cc: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Nick Piggin <npiggin@kernel.dk> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs')
-rw-r--r--fs/dcache.c11
-rw-r--r--fs/internal.h5
-rw-r--r--fs/namei.c7
-rw-r--r--fs/namespace.c177
-rw-r--r--fs/pnode.c11
5 files changed, 134 insertions, 77 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index d56a40b5a577..83293be48149 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1935,7 +1935,7 @@ static int prepend_path(const struct path *path, struct path *root,
1935 bool slash = false; 1935 bool slash = false;
1936 int error = 0; 1936 int error = 0;
1937 1937
1938 spin_lock(&vfsmount_lock); 1938 br_read_lock(vfsmount_lock);
1939 while (dentry != root->dentry || vfsmnt != root->mnt) { 1939 while (dentry != root->dentry || vfsmnt != root->mnt) {
1940 struct dentry * parent; 1940 struct dentry * parent;
1941 1941
@@ -1964,7 +1964,7 @@ out:
1964 if (!error && !slash) 1964 if (!error && !slash)
1965 error = prepend(buffer, buflen, "/", 1); 1965 error = prepend(buffer, buflen, "/", 1);
1966 1966
1967 spin_unlock(&vfsmount_lock); 1967 br_read_unlock(vfsmount_lock);
1968 return error; 1968 return error;
1969 1969
1970global_root: 1970global_root:
@@ -2302,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
2302 struct vfsmount *mnt = path1->mnt; 2302 struct vfsmount *mnt = path1->mnt;
2303 struct dentry *dentry = path1->dentry; 2303 struct dentry *dentry = path1->dentry;
2304 int res; 2304 int res;
2305 spin_lock(&vfsmount_lock); 2305
2306 br_read_lock(vfsmount_lock);
2306 if (mnt != path2->mnt) { 2307 if (mnt != path2->mnt) {
2307 for (;;) { 2308 for (;;) {
2308 if (mnt->mnt_parent == mnt) { 2309 if (mnt->mnt_parent == mnt) {
2309 spin_unlock(&vfsmount_lock); 2310 br_read_unlock(vfsmount_lock);
2310 return 0; 2311 return 0;
2311 } 2312 }
2312 if (mnt->mnt_parent == path2->mnt) 2313 if (mnt->mnt_parent == path2->mnt)
@@ -2316,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
2316 dentry = mnt->mnt_mountpoint; 2317 dentry = mnt->mnt_mountpoint;
2317 } 2318 }
2318 res = is_subdir(dentry, path2->dentry); 2319 res = is_subdir(dentry, path2->dentry);
2319 spin_unlock(&vfsmount_lock); 2320 br_read_unlock(vfsmount_lock);
2320 return res; 2321 return res;
2321} 2322}
2322EXPORT_SYMBOL(path_is_under); 2323EXPORT_SYMBOL(path_is_under);
diff --git a/fs/internal.h b/fs/internal.h
index 6a5c13a80660..a6910e91cee8 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/lglock.h>
13
12struct super_block; 14struct super_block;
13struct linux_binprm; 15struct linux_binprm;
14struct path; 16struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
70 72
71extern void __init mnt_init(void); 73extern void __init mnt_init(void);
72 74
73extern spinlock_t vfsmount_lock; 75DECLARE_BRLOCK(vfsmount_lock);
76
74 77
75/* 78/*
76 * fs_struct.c 79 * fs_struct.c
diff --git a/fs/namei.c b/fs/namei.c
index 11de7c39ff76..24896e833565 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -595,15 +595,16 @@ int follow_up(struct path *path)
595{ 595{
596 struct vfsmount *parent; 596 struct vfsmount *parent;
597 struct dentry *mountpoint; 597 struct dentry *mountpoint;
598 spin_lock(&vfsmount_lock); 598
599 br_read_lock(vfsmount_lock);
599 parent = path->mnt->mnt_parent; 600 parent = path->mnt->mnt_parent;
600 if (parent == path->mnt) { 601 if (parent == path->mnt) {
601 spin_unlock(&vfsmount_lock); 602 br_read_unlock(vfsmount_lock);
602 return 0; 603 return 0;
603 } 604 }
604 mntget(parent); 605 mntget(parent);
605 mountpoint = dget(path->mnt->mnt_mountpoint); 606 mountpoint = dget(path->mnt->mnt_mountpoint);
606 spin_unlock(&vfsmount_lock); 607 br_read_unlock(vfsmount_lock);
607 dput(path->dentry); 608 dput(path->dentry);
608 path->dentry = mountpoint; 609 path->dentry = mountpoint;
609 mntput(path->mnt); 610 mntput(path->mnt);
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb19c5b0..de402eb6eafb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
11#include <linux/syscalls.h> 11#include <linux/syscalls.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/spinlock.h>
15#include <linux/percpu.h>
14#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
15#include <linux/init.h> 17#include <linux/init.h>
16#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -38,12 +40,10 @@
38#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) 40#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
39#define HASH_SIZE (1UL << HASH_SHIFT) 41#define HASH_SIZE (1UL << HASH_SHIFT)
40 42
41/* spinlock for vfsmount related operations, inplace of dcache_lock */
42__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
43
44static int event; 43static int event;
45static DEFINE_IDA(mnt_id_ida); 44static DEFINE_IDA(mnt_id_ida);
46static DEFINE_IDA(mnt_group_ida); 45static DEFINE_IDA(mnt_group_ida);
46static DEFINE_SPINLOCK(mnt_id_lock);
47static int mnt_id_start = 0; 47static int mnt_id_start = 0;
48static int mnt_group_start = 1; 48static int mnt_group_start = 1;
49 49
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
55struct kobject *fs_kobj; 55struct kobject *fs_kobj;
56EXPORT_SYMBOL_GPL(fs_kobj); 56EXPORT_SYMBOL_GPL(fs_kobj);
57 57
58/*
59 * vfsmount lock may be taken for read to prevent changes to the
60 * vfsmount hash, ie. during mountpoint lookups or walking back
61 * up the tree.
62 *
63 * It should be taken for write in all cases where the vfsmount
64 * tree or hash is modified or when a vfsmount structure is modified.
65 */
66DEFINE_BRLOCK(vfsmount_lock);
67
58static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 68static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
59{ 69{
60 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); 70 unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
65 75
66#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) 76#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
67 77
68/* allocation is serialized by namespace_sem */ 78/*
79 * allocation is serialized by namespace_sem, but we need the spinlock to
80 * serialize with freeing.
81 */
69static int mnt_alloc_id(struct vfsmount *mnt) 82static int mnt_alloc_id(struct vfsmount *mnt)
70{ 83{
71 int res; 84 int res;
72 85
73retry: 86retry:
74 ida_pre_get(&mnt_id_ida, GFP_KERNEL); 87 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
75 spin_lock(&vfsmount_lock); 88 spin_lock(&mnt_id_lock);
76 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); 89 res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
77 if (!res) 90 if (!res)
78 mnt_id_start = mnt->mnt_id + 1; 91 mnt_id_start = mnt->mnt_id + 1;
79 spin_unlock(&vfsmount_lock); 92 spin_unlock(&mnt_id_lock);
80 if (res == -EAGAIN) 93 if (res == -EAGAIN)
81 goto retry; 94 goto retry;
82 95
@@ -86,11 +99,11 @@ retry:
86static void mnt_free_id(struct vfsmount *mnt) 99static void mnt_free_id(struct vfsmount *mnt)
87{ 100{
88 int id = mnt->mnt_id; 101 int id = mnt->mnt_id;
89 spin_lock(&vfsmount_lock); 102 spin_lock(&mnt_id_lock);
90 ida_remove(&mnt_id_ida, id); 103 ida_remove(&mnt_id_ida, id);
91 if (mnt_id_start > id) 104 if (mnt_id_start > id)
92 mnt_id_start = id; 105 mnt_id_start = id;
93 spin_unlock(&vfsmount_lock); 106 spin_unlock(&mnt_id_lock);
94} 107}
95 108
96/* 109/*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
348{ 361{
349 int ret = 0; 362 int ret = 0;
350 363
351 spin_lock(&vfsmount_lock); 364 br_write_lock(vfsmount_lock);
352 mnt->mnt_flags |= MNT_WRITE_HOLD; 365 mnt->mnt_flags |= MNT_WRITE_HOLD;
353 /* 366 /*
354 * After storing MNT_WRITE_HOLD, we'll read the counters. This store 367 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
382 */ 395 */
383 smp_wmb(); 396 smp_wmb();
384 mnt->mnt_flags &= ~MNT_WRITE_HOLD; 397 mnt->mnt_flags &= ~MNT_WRITE_HOLD;
385 spin_unlock(&vfsmount_lock); 398 br_write_unlock(vfsmount_lock);
386 return ret; 399 return ret;
387} 400}
388 401
389static void __mnt_unmake_readonly(struct vfsmount *mnt) 402static void __mnt_unmake_readonly(struct vfsmount *mnt)
390{ 403{
391 spin_lock(&vfsmount_lock); 404 br_write_lock(vfsmount_lock);
392 mnt->mnt_flags &= ~MNT_READONLY; 405 mnt->mnt_flags &= ~MNT_READONLY;
393 spin_unlock(&vfsmount_lock); 406 br_write_unlock(vfsmount_lock);
394} 407}
395 408
396void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 409void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
414/* 427/*
415 * find the first or last mount at @dentry on vfsmount @mnt depending on 428 * find the first or last mount at @dentry on vfsmount @mnt depending on
416 * @dir. If @dir is set return the first mount else return the last mount. 429 * @dir. If @dir is set return the first mount else return the last mount.
430 * vfsmount_lock must be held for read or write.
417 */ 431 */
418struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, 432struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
419 int dir) 433 int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
443struct vfsmount *lookup_mnt(struct path *path) 457struct vfsmount *lookup_mnt(struct path *path)
444{ 458{
445 struct vfsmount *child_mnt; 459 struct vfsmount *child_mnt;
446 spin_lock(&vfsmount_lock); 460
461 br_read_lock(vfsmount_lock);
447 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) 462 if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
448 mntget(child_mnt); 463 mntget(child_mnt);
449 spin_unlock(&vfsmount_lock); 464 br_read_unlock(vfsmount_lock);
450 return child_mnt; 465 return child_mnt;
451} 466}
452 467
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
455 return mnt->mnt_ns == current->nsproxy->mnt_ns; 470 return mnt->mnt_ns == current->nsproxy->mnt_ns;
456} 471}
457 472
473/*
474 * vfsmount lock must be held for write
475 */
458static void touch_mnt_namespace(struct mnt_namespace *ns) 476static void touch_mnt_namespace(struct mnt_namespace *ns)
459{ 477{
460 if (ns) { 478 if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
463 } 481 }
464} 482}
465 483
484/*
485 * vfsmount lock must be held for write
486 */
466static void __touch_mnt_namespace(struct mnt_namespace *ns) 487static void __touch_mnt_namespace(struct mnt_namespace *ns)
467{ 488{
468 if (ns && ns->event != event) { 489 if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
471 } 492 }
472} 493}
473 494
495/*
496 * vfsmount lock must be held for write
497 */
474static void detach_mnt(struct vfsmount *mnt, struct path *old_path) 498static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
475{ 499{
476 old_path->dentry = mnt->mnt_mountpoint; 500 old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
482 old_path->dentry->d_mounted--; 506 old_path->dentry->d_mounted--;
483} 507}
484 508
509/*
510 * vfsmount lock must be held for write
511 */
485void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, 512void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
486 struct vfsmount *child_mnt) 513 struct vfsmount *child_mnt)
487{ 514{
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
490 dentry->d_mounted++; 517 dentry->d_mounted++;
491} 518}
492 519
520/*
521 * vfsmount lock must be held for write
522 */
493static void attach_mnt(struct vfsmount *mnt, struct path *path) 523static void attach_mnt(struct vfsmount *mnt, struct path *path)
494{ 524{
495 mnt_set_mountpoint(path->mnt, path->dentry, mnt); 525 mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
499} 529}
500 530
501/* 531/*
502 * the caller must hold vfsmount_lock 532 * vfsmount lock must be held for write
503 */ 533 */
504static void commit_tree(struct vfsmount *mnt) 534static void commit_tree(struct vfsmount *mnt)
505{ 535{
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
623void mntput_no_expire(struct vfsmount *mnt) 653void mntput_no_expire(struct vfsmount *mnt)
624{ 654{
625repeat: 655repeat:
626 if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { 656 if (atomic_add_unless(&mnt->mnt_count, -1, 1))
627 if (likely(!mnt->mnt_pinned)) { 657 return;
628 spin_unlock(&vfsmount_lock); 658 br_write_lock(vfsmount_lock);
629 __mntput(mnt); 659 if (!atomic_dec_and_test(&mnt->mnt_count)) {
630 return; 660 br_write_unlock(vfsmount_lock);
631 } 661 return;
632 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); 662 }
633 mnt->mnt_pinned = 0; 663 if (likely(!mnt->mnt_pinned)) {
634 spin_unlock(&vfsmount_lock); 664 br_write_unlock(vfsmount_lock);
635 acct_auto_close_mnt(mnt); 665 __mntput(mnt);
636 goto repeat; 666 return;
637 } 667 }
668 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
669 mnt->mnt_pinned = 0;
670 br_write_unlock(vfsmount_lock);
671 acct_auto_close_mnt(mnt);
672 goto repeat;
638} 673}
639
640EXPORT_SYMBOL(mntput_no_expire); 674EXPORT_SYMBOL(mntput_no_expire);
641 675
642void mnt_pin(struct vfsmount *mnt) 676void mnt_pin(struct vfsmount *mnt)
643{ 677{
644 spin_lock(&vfsmount_lock); 678 br_write_lock(vfsmount_lock);
645 mnt->mnt_pinned++; 679 mnt->mnt_pinned++;
646 spin_unlock(&vfsmount_lock); 680 br_write_unlock(vfsmount_lock);
647} 681}
648 682
649EXPORT_SYMBOL(mnt_pin); 683EXPORT_SYMBOL(mnt_pin);
650 684
651void mnt_unpin(struct vfsmount *mnt) 685void mnt_unpin(struct vfsmount *mnt)
652{ 686{
653 spin_lock(&vfsmount_lock); 687 br_write_lock(vfsmount_lock);
654 if (mnt->mnt_pinned) { 688 if (mnt->mnt_pinned) {
655 atomic_inc(&mnt->mnt_count); 689 atomic_inc(&mnt->mnt_count);
656 mnt->mnt_pinned--; 690 mnt->mnt_pinned--;
657 } 691 }
658 spin_unlock(&vfsmount_lock); 692 br_write_unlock(vfsmount_lock);
659} 693}
660 694
661EXPORT_SYMBOL(mnt_unpin); 695EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
746 struct mnt_namespace *ns = p->ns; 780 struct mnt_namespace *ns = p->ns;
747 int res = 0; 781 int res = 0;
748 782
749 spin_lock(&vfsmount_lock); 783 br_read_lock(vfsmount_lock);
750 if (p->event != ns->event) { 784 if (p->event != ns->event) {
751 p->event = ns->event; 785 p->event = ns->event;
752 res = 1; 786 res = 1;
753 } 787 }
754 spin_unlock(&vfsmount_lock); 788 br_read_unlock(vfsmount_lock);
755 789
756 return res; 790 return res;
757} 791}
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
952 int minimum_refs = 0; 986 int minimum_refs = 0;
953 struct vfsmount *p; 987 struct vfsmount *p;
954 988
955 spin_lock(&vfsmount_lock); 989 br_read_lock(vfsmount_lock);
956 for (p = mnt; p; p = next_mnt(p, mnt)) { 990 for (p = mnt; p; p = next_mnt(p, mnt)) {
957 actual_refs += atomic_read(&p->mnt_count); 991 actual_refs += atomic_read(&p->mnt_count);
958 minimum_refs += 2; 992 minimum_refs += 2;
959 } 993 }
960 spin_unlock(&vfsmount_lock); 994 br_read_unlock(vfsmount_lock);
961 995
962 if (actual_refs > minimum_refs) 996 if (actual_refs > minimum_refs)
963 return 0; 997 return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
984{ 1018{
985 int ret = 1; 1019 int ret = 1;
986 down_read(&namespace_sem); 1020 down_read(&namespace_sem);
987 spin_lock(&vfsmount_lock); 1021 br_read_lock(vfsmount_lock);
988 if (propagate_mount_busy(mnt, 2)) 1022 if (propagate_mount_busy(mnt, 2))
989 ret = 0; 1023 ret = 0;
990 spin_unlock(&vfsmount_lock); 1024 br_read_unlock(vfsmount_lock);
991 up_read(&namespace_sem); 1025 up_read(&namespace_sem);
992 return ret; 1026 return ret;
993} 1027}
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
1003 if (mnt->mnt_parent != mnt) { 1037 if (mnt->mnt_parent != mnt) {
1004 struct dentry *dentry; 1038 struct dentry *dentry;
1005 struct vfsmount *m; 1039 struct vfsmount *m;
1006 spin_lock(&vfsmount_lock); 1040
1041 br_write_lock(vfsmount_lock);
1007 dentry = mnt->mnt_mountpoint; 1042 dentry = mnt->mnt_mountpoint;
1008 m = mnt->mnt_parent; 1043 m = mnt->mnt_parent;
1009 mnt->mnt_mountpoint = mnt->mnt_root; 1044 mnt->mnt_mountpoint = mnt->mnt_root;
1010 mnt->mnt_parent = mnt; 1045 mnt->mnt_parent = mnt;
1011 m->mnt_ghosts--; 1046 m->mnt_ghosts--;
1012 spin_unlock(&vfsmount_lock); 1047 br_write_unlock(vfsmount_lock);
1013 dput(dentry); 1048 dput(dentry);
1014 mntput(m); 1049 mntput(m);
1015 } 1050 }
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
1017 } 1052 }
1018} 1053}
1019 1054
1055/*
1056 * vfsmount lock must be held for write
1057 * namespace_sem must be held for write
1058 */
1020void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) 1059void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
1021{ 1060{
1022 struct vfsmount *p; 1061 struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1107 } 1146 }
1108 1147
1109 down_write(&namespace_sem); 1148 down_write(&namespace_sem);
1110 spin_lock(&vfsmount_lock); 1149 br_write_lock(vfsmount_lock);
1111 event++; 1150 event++;
1112 1151
1113 if (!(flags & MNT_DETACH)) 1152 if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1119 umount_tree(mnt, 1, &umount_list); 1158 umount_tree(mnt, 1, &umount_list);
1120 retval = 0; 1159 retval = 0;
1121 } 1160 }
1122 spin_unlock(&vfsmount_lock); 1161 br_write_unlock(vfsmount_lock);
1123 up_write(&namespace_sem); 1162 up_write(&namespace_sem);
1124 release_mounts(&umount_list); 1163 release_mounts(&umount_list);
1125 return retval; 1164 return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
1231 q = clone_mnt(p, p->mnt_root, flag); 1270 q = clone_mnt(p, p->mnt_root, flag);
1232 if (!q) 1271 if (!q)
1233 goto Enomem; 1272 goto Enomem;
1234 spin_lock(&vfsmount_lock); 1273 br_write_lock(vfsmount_lock);
1235 list_add_tail(&q->mnt_list, &res->mnt_list); 1274 list_add_tail(&q->mnt_list, &res->mnt_list);
1236 attach_mnt(q, &path); 1275 attach_mnt(q, &path);
1237 spin_unlock(&vfsmount_lock); 1276 br_write_unlock(vfsmount_lock);
1238 } 1277 }
1239 } 1278 }
1240 return res; 1279 return res;
1241Enomem: 1280Enomem:
1242 if (res) { 1281 if (res) {
1243 LIST_HEAD(umount_list); 1282 LIST_HEAD(umount_list);
1244 spin_lock(&vfsmount_lock); 1283 br_write_lock(vfsmount_lock);
1245 umount_tree(res, 0, &umount_list); 1284 umount_tree(res, 0, &umount_list);
1246 spin_unlock(&vfsmount_lock); 1285 br_write_unlock(vfsmount_lock);
1247 release_mounts(&umount_list); 1286 release_mounts(&umount_list);
1248 } 1287 }
1249 return NULL; 1288 return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
1262{ 1301{
1263 LIST_HEAD(umount_list); 1302 LIST_HEAD(umount_list);
1264 down_write(&namespace_sem); 1303 down_write(&namespace_sem);
1265 spin_lock(&vfsmount_lock); 1304 br_write_lock(vfsmount_lock);
1266 umount_tree(mnt, 0, &umount_list); 1305 umount_tree(mnt, 0, &umount_list);
1267 spin_unlock(&vfsmount_lock); 1306 br_write_unlock(vfsmount_lock);
1268 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1269 release_mounts(&umount_list); 1308 release_mounts(&umount_list);
1270} 1309}
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1392 if (err) 1431 if (err)
1393 goto out_cleanup_ids; 1432 goto out_cleanup_ids;
1394 1433
1395 spin_lock(&vfsmount_lock); 1434 br_write_lock(vfsmount_lock);
1396 1435
1397 if (IS_MNT_SHARED(dest_mnt)) { 1436 if (IS_MNT_SHARED(dest_mnt)) {
1398 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1437 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1411 list_del_init(&child->mnt_hash); 1450 list_del_init(&child->mnt_hash);
1412 commit_tree(child); 1451 commit_tree(child);
1413 } 1452 }
1414 spin_unlock(&vfsmount_lock); 1453 br_write_unlock(vfsmount_lock);
1454
1415 return 0; 1455 return 0;
1416 1456
1417 out_cleanup_ids: 1457 out_cleanup_ids:
@@ -1466,10 +1506,10 @@ static int do_change_type(struct path *path, int flag)
1466 goto out_unlock; 1506 goto out_unlock;
1467 } 1507 }
1468 1508
1469 spin_lock(&vfsmount_lock); 1509 br_write_lock(vfsmount_lock);
1470 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1510 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1471 change_mnt_propagation(m, type); 1511 change_mnt_propagation(m, type);
1472 spin_unlock(&vfsmount_lock); 1512 br_write_unlock(vfsmount_lock);
1473 1513
1474 out_unlock: 1514 out_unlock:
1475 up_write(&namespace_sem); 1515 up_write(&namespace_sem);
@@ -1513,9 +1553,10 @@ static int do_loopback(struct path *path, char *old_name,
1513 err = graft_tree(mnt, path); 1553 err = graft_tree(mnt, path);
1514 if (err) { 1554 if (err) {
1515 LIST_HEAD(umount_list); 1555 LIST_HEAD(umount_list);
1516 spin_lock(&vfsmount_lock); 1556
1557 br_write_lock(vfsmount_lock);
1517 umount_tree(mnt, 0, &umount_list); 1558 umount_tree(mnt, 0, &umount_list);
1518 spin_unlock(&vfsmount_lock); 1559 br_write_unlock(vfsmount_lock);
1519 release_mounts(&umount_list); 1560 release_mounts(&umount_list);
1520 } 1561 }
1521 1562
@@ -1568,16 +1609,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1568 else 1609 else
1569 err = do_remount_sb(sb, flags, data, 0); 1610 err = do_remount_sb(sb, flags, data, 0);
1570 if (!err) { 1611 if (!err) {
1571 spin_lock(&vfsmount_lock); 1612 br_write_lock(vfsmount_lock);
1572 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; 1613 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
1573 path->mnt->mnt_flags = mnt_flags; 1614 path->mnt->mnt_flags = mnt_flags;
1574 spin_unlock(&vfsmount_lock); 1615 br_write_unlock(vfsmount_lock);
1575 } 1616 }
1576 up_write(&sb->s_umount); 1617 up_write(&sb->s_umount);
1577 if (!err) { 1618 if (!err) {
1578 spin_lock(&vfsmount_lock); 1619 br_write_lock(vfsmount_lock);
1579 touch_mnt_namespace(path->mnt->mnt_ns); 1620 touch_mnt_namespace(path->mnt->mnt_ns);
1580 spin_unlock(&vfsmount_lock); 1621 br_write_unlock(vfsmount_lock);
1581 } 1622 }
1582 return err; 1623 return err;
1583} 1624}
@@ -1754,7 +1795,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1754 return; 1795 return;
1755 1796
1756 down_write(&namespace_sem); 1797 down_write(&namespace_sem);
1757 spin_lock(&vfsmount_lock); 1798 br_write_lock(vfsmount_lock);
1758 1799
1759 /* extract from the expiration list every vfsmount that matches the 1800 /* extract from the expiration list every vfsmount that matches the
1760 * following criteria: 1801 * following criteria:
@@ -1773,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
1773 touch_mnt_namespace(mnt->mnt_ns); 1814 touch_mnt_namespace(mnt->mnt_ns);
1774 umount_tree(mnt, 1, &umounts); 1815 umount_tree(mnt, 1, &umounts);
1775 } 1816 }
1776 spin_unlock(&vfsmount_lock); 1817 br_write_unlock(vfsmount_lock);
1777 up_write(&namespace_sem); 1818 up_write(&namespace_sem);
1778 1819
1779 release_mounts(&umounts); 1820 release_mounts(&umounts);
@@ -1830,6 +1871,8 @@ resume:
1830/* 1871/*
1831 * process a list of expirable mountpoints with the intent of discarding any 1872 * process a list of expirable mountpoints with the intent of discarding any
1832 * submounts of a specific parent mountpoint 1873 * submounts of a specific parent mountpoint
1874 *
1875 * vfsmount_lock must be held for write
1833 */ 1876 */
1834static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) 1877static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
1835{ 1878{
@@ -2048,9 +2091,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2048 kfree(new_ns); 2091 kfree(new_ns);
2049 return ERR_PTR(-ENOMEM); 2092 return ERR_PTR(-ENOMEM);
2050 } 2093 }
2051 spin_lock(&vfsmount_lock); 2094 br_write_lock(vfsmount_lock);
2052 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 2095 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
2053 spin_unlock(&vfsmount_lock); 2096 br_write_unlock(vfsmount_lock);
2054 2097
2055 /* 2098 /*
2056 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts 2099 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2287,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2244 goto out2; /* not attached */ 2287 goto out2; /* not attached */
2245 /* make sure we can reach put_old from new_root */ 2288 /* make sure we can reach put_old from new_root */
2246 tmp = old.mnt; 2289 tmp = old.mnt;
2247 spin_lock(&vfsmount_lock); 2290 br_write_lock(vfsmount_lock);
2248 if (tmp != new.mnt) { 2291 if (tmp != new.mnt) {
2249 for (;;) { 2292 for (;;) {
2250 if (tmp->mnt_parent == tmp) 2293 if (tmp->mnt_parent == tmp)
@@ -2264,7 +2307,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2264 /* mount new_root on / */ 2307 /* mount new_root on / */
2265 attach_mnt(new.mnt, &root_parent); 2308 attach_mnt(new.mnt, &root_parent);
2266 touch_mnt_namespace(current->nsproxy->mnt_ns); 2309 touch_mnt_namespace(current->nsproxy->mnt_ns);
2267 spin_unlock(&vfsmount_lock); 2310 br_write_unlock(vfsmount_lock);
2268 chroot_fs_refs(&root, &new); 2311 chroot_fs_refs(&root, &new);
2269 error = 0; 2312 error = 0;
2270 path_put(&root_parent); 2313 path_put(&root_parent);
@@ -2279,7 +2322,7 @@ out1:
2279out0: 2322out0:
2280 return error; 2323 return error;
2281out3: 2324out3:
2282 spin_unlock(&vfsmount_lock); 2325 br_write_unlock(vfsmount_lock);
2283 goto out2; 2326 goto out2;
2284} 2327}
2285 2328
@@ -2326,6 +2369,8 @@ void __init mnt_init(void)
2326 for (u = 0; u < HASH_SIZE; u++) 2369 for (u = 0; u < HASH_SIZE; u++)
2327 INIT_LIST_HEAD(&mount_hashtable[u]); 2370 INIT_LIST_HEAD(&mount_hashtable[u]);
2328 2371
2372 br_lock_init(vfsmount_lock);
2373
2329 err = sysfs_init(); 2374 err = sysfs_init();
2330 if (err) 2375 if (err)
2331 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 2376 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2389,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2344 if (!atomic_dec_and_test(&ns->count)) 2389 if (!atomic_dec_and_test(&ns->count))
2345 return; 2390 return;
2346 down_write(&namespace_sem); 2391 down_write(&namespace_sem);
2347 spin_lock(&vfsmount_lock); 2392 br_write_lock(vfsmount_lock);
2348 umount_tree(ns->root, 0, &umount_list); 2393 umount_tree(ns->root, 0, &umount_list);
2349 spin_unlock(&vfsmount_lock); 2394 br_write_unlock(vfsmount_lock);
2350 up_write(&namespace_sem); 2395 up_write(&namespace_sem);
2351 release_mounts(&umount_list); 2396 release_mounts(&umount_list);
2352 kfree(ns); 2397 kfree(ns);
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a83149..8066b8dd748f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
126 return 0; 126 return 0;
127} 127}
128 128
129/*
130 * vfsmount lock must be held for write
131 */
129void change_mnt_propagation(struct vfsmount *mnt, int type) 132void change_mnt_propagation(struct vfsmount *mnt, int type)
130{ 133{
131 if (type == MS_SHARED) { 134 if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
270 prev_src_mnt = child; 273 prev_src_mnt = child;
271 } 274 }
272out: 275out:
273 spin_lock(&vfsmount_lock); 276 br_write_lock(vfsmount_lock);
274 while (!list_empty(&tmp_list)) { 277 while (!list_empty(&tmp_list)) {
275 child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); 278 child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
276 umount_tree(child, 0, &umount_list); 279 umount_tree(child, 0, &umount_list);
277 } 280 }
278 spin_unlock(&vfsmount_lock); 281 br_write_unlock(vfsmount_lock);
279 release_mounts(&umount_list); 282 release_mounts(&umount_list);
280 return ret; 283 return ret;
281} 284}
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
296 * other mounts its parent propagates to. 299 * other mounts its parent propagates to.
297 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
298 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 *
303 * vfsmount lock must be held for read or write
299 */ 304 */
300int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
301{ 306{
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
353 * collect all mounts that receive propagation from the mount in @list, 358 * collect all mounts that receive propagation from the mount in @list,
354 * and return these additional mounts in the same list. 359 * and return these additional mounts in the same list.
355 * @list: the list of mounts to be unmounted. 360 * @list: the list of mounts to be unmounted.
361 *
362 * vfsmount lock must be held for write
356 */ 363 */
357int propagate_umount(struct list_head *list) 364int propagate_umount(struct list_head *list)
358{ 365{