aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Al Viro <viro@zeniv.linux.org.uk> 2013-09-29 22:06:07 -0400
committer Al Viro <viro@zeniv.linux.org.uk> 2013-11-09 00:16:19 -0500
commit 48a066e72d970a3e225a9c18690d570c736fc455 (patch)
tree 9a1861c1be4309cf69964742d238eadd5d0b1832
parent 42c326082d8a2c91506f951ace638deae1faf083 (diff)
RCU'd vfsmounts
* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock)
* sequence number from mount_lock is stored in nameidata->m_seq and used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT. Set by umount_tree() when its caller knows that vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock() and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq). Checks the mount_lock sequence number against seq, then grabs reference to mnt. Then it rechecks mount_lock again to close the race and either returns success or drops the reference it has acquired. The subtle point is that in case of MNT_SYNC_UMOUNT we can simply decrement the refcount and sod off - aforementioned synchronize_rcu() makes sure that final mntput() won't come until we leave RCU mode. We need that, since we don't want to end up with some lazy pathwalk racing with umount() and stealing the final mntput() from it - caller of umount() may expect it to return only once the fs is shut down and we don't want to break that. In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do full-blown mntput() in case of mount_lock sequence number mismatch happening just as we'd grabbed the reference, but in those cases we won't be stealing the final mntput() from anything that would care.
* mntput_no_expire() doesn't lock anything on the fast path now. Incidentally, SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock. It does, of course, bump the refcounts of vfsmount and dentry in the very end, but that's it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r-- fs/dcache.c 20
-rw-r--r-- fs/mount.h 10
-rw-r--r-- fs/namei.c 50
-rw-r--r-- fs/namespace.c 135
-rw-r--r-- include/linux/mount.h 2
-rw-r--r-- include/linux/namei.h 2
6 files changed, 136 insertions, 83 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index eb0978da1bd4..aafa2a146434 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2887,24 +2887,28 @@ static int prepend_path(const struct path *path,
2887 struct vfsmount *vfsmnt = path->mnt; 2887 struct vfsmount *vfsmnt = path->mnt;
2888 struct mount *mnt = real_mount(vfsmnt); 2888 struct mount *mnt = real_mount(vfsmnt);
2889 int error = 0; 2889 int error = 0;
2890 unsigned seq = 0; 2890 unsigned seq, m_seq = 0;
2891 char *bptr; 2891 char *bptr;
2892 int blen; 2892 int blen;
2893 2893
2894 br_read_lock(&vfsmount_lock);
2895 rcu_read_lock(); 2894 rcu_read_lock();
2895restart_mnt:
2896 read_seqbegin_or_lock(&mount_lock, &m_seq);
2897 seq = 0;
2896restart: 2898restart:
2897 bptr = *buffer; 2899 bptr = *buffer;
2898 blen = *buflen; 2900 blen = *buflen;
2901 error = 0;
2899 read_seqbegin_or_lock(&rename_lock, &seq); 2902 read_seqbegin_or_lock(&rename_lock, &seq);
2900 while (dentry != root->dentry || vfsmnt != root->mnt) { 2903 while (dentry != root->dentry || vfsmnt != root->mnt) {
2901 struct dentry * parent; 2904 struct dentry * parent;
2902 2905
2903 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { 2906 if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
2907 struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
2904 /* Global root? */ 2908 /* Global root? */
2905 if (mnt_has_parent(mnt)) { 2909 if (mnt != parent) {
2906 dentry = mnt->mnt_mountpoint; 2910 dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
2907 mnt = mnt->mnt_parent; 2911 mnt = parent;
2908 vfsmnt = &mnt->mnt; 2912 vfsmnt = &mnt->mnt;
2909 continue; 2913 continue;
2910 } 2914 }
@@ -2938,7 +2942,11 @@ restart:
2938 goto restart; 2942 goto restart;
2939 } 2943 }
2940 done_seqretry(&rename_lock, seq); 2944 done_seqretry(&rename_lock, seq);
2941 br_read_unlock(&vfsmount_lock); 2945 if (need_seqretry(&mount_lock, m_seq)) {
2946 m_seq = 1;
2947 goto restart_mnt;
2948 }
2949 done_seqretry(&mount_lock, m_seq);
2942 2950
2943 if (error >= 0 && bptr == *buffer) { 2951 if (error >= 0 && bptr == *buffer) {
2944 if (--blen < 0) 2952 if (--blen < 0)
diff --git a/fs/mount.h b/fs/mount.h
index f0866076de6e..d64c594be6c4 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -1,7 +1,6 @@
1#include <linux/mount.h> 1#include <linux/mount.h>
2#include <linux/seq_file.h> 2#include <linux/seq_file.h>
3#include <linux/poll.h> 3#include <linux/poll.h>
4#include <linux/lglock.h>
5 4
6struct mnt_namespace { 5struct mnt_namespace {
7 atomic_t count; 6 atomic_t count;
@@ -30,6 +29,7 @@ struct mount {
30 struct mount *mnt_parent; 29 struct mount *mnt_parent;
31 struct dentry *mnt_mountpoint; 30 struct dentry *mnt_mountpoint;
32 struct vfsmount mnt; 31 struct vfsmount mnt;
32 struct rcu_head mnt_rcu;
33#ifdef CONFIG_SMP 33#ifdef CONFIG_SMP
34 struct mnt_pcp __percpu *mnt_pcp; 34 struct mnt_pcp __percpu *mnt_pcp;
35#else 35#else
@@ -80,21 +80,23 @@ static inline int is_mounted(struct vfsmount *mnt)
80extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); 80extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
81extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); 81extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
82 82
83extern bool legitimize_mnt(struct vfsmount *, unsigned);
84
83static inline void get_mnt_ns(struct mnt_namespace *ns) 85static inline void get_mnt_ns(struct mnt_namespace *ns)
84{ 86{
85 atomic_inc(&ns->count); 87 atomic_inc(&ns->count);
86} 88}
87 89
88extern struct lglock vfsmount_lock; 90extern seqlock_t mount_lock;
89 91
90static inline void lock_mount_hash(void) 92static inline void lock_mount_hash(void)
91{ 93{
92 br_write_lock(&vfsmount_lock); 94 write_seqlock(&mount_lock);
93} 95}
94 96
95static inline void unlock_mount_hash(void) 97static inline void unlock_mount_hash(void)
96{ 98{
97 br_write_unlock(&vfsmount_lock); 99 write_sequnlock(&mount_lock);
98} 100}
99 101
100struct proc_mounts { 102struct proc_mounts {
diff --git a/fs/namei.c b/fs/namei.c
index 1f844fbfce72..cb0ebae07e52 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -484,14 +484,12 @@ EXPORT_SYMBOL(path_put);
484 484
485static inline void lock_rcu_walk(void) 485static inline void lock_rcu_walk(void)
486{ 486{
487 br_read_lock(&vfsmount_lock);
488 rcu_read_lock(); 487 rcu_read_lock();
489} 488}
490 489
491static inline void unlock_rcu_walk(void) 490static inline void unlock_rcu_walk(void)
492{ 491{
493 rcu_read_unlock(); 492 rcu_read_unlock();
494 br_read_unlock(&vfsmount_lock);
495} 493}
496 494
497/** 495/**
@@ -512,26 +510,23 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
512 BUG_ON(!(nd->flags & LOOKUP_RCU)); 510 BUG_ON(!(nd->flags & LOOKUP_RCU));
513 511
514 /* 512 /*
515 * Get a reference to the parent first: we're 513 * After legitimizing the bastards, terminate_walk()
516 * going to make "path_put(nd->path)" valid in 514 * will do the right thing for non-RCU mode, and all our
517 * non-RCU context for "terminate_walk()". 515 * subsequent exit cases should rcu_read_unlock()
518 * 516 * before returning. Do vfsmount first; if dentry
519 * If this doesn't work, return immediately with 517 * can't be legitimized, just set nd->path.dentry to NULL
520 * RCU walking still active (and then we will do 518 * and rely on dput(NULL) being a no-op.
521 * the RCU walk cleanup in terminate_walk()).
522 */ 519 */
523 if (!lockref_get_not_dead(&parent->d_lockref)) 520 if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
524 return -ECHILD; 521 return -ECHILD;
525
526 /*
527 * After the mntget(), we terminate_walk() will do
528 * the right thing for non-RCU mode, and all our
529 * subsequent exit cases should unlock_rcu_walk()
530 * before returning.
531 */
532 mntget(nd->path.mnt);
533 nd->flags &= ~LOOKUP_RCU; 522 nd->flags &= ~LOOKUP_RCU;
534 523
524 if (!lockref_get_not_dead(&parent->d_lockref)) {
525 nd->path.dentry = NULL;
526 unlock_rcu_walk();
527 return -ECHILD;
528 }
529
535 /* 530 /*
536 * For a negative lookup, the lookup sequence point is the parents 531 * For a negative lookup, the lookup sequence point is the parents
537 * sequence point, and it only needs to revalidate the parent dentry. 532 * sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@ static int complete_walk(struct nameidata *nd)
608 if (!(nd->flags & LOOKUP_ROOT)) 603 if (!(nd->flags & LOOKUP_ROOT))
609 nd->root.mnt = NULL; 604 nd->root.mnt = NULL;
610 605
606 if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
607 unlock_rcu_walk();
608 return -ECHILD;
609 }
611 if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { 610 if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
612 unlock_rcu_walk(); 611 unlock_rcu_walk();
612 mntput(nd->path.mnt);
613 return -ECHILD; 613 return -ECHILD;
614 } 614 }
615 if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { 615 if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
616 unlock_rcu_walk(); 616 unlock_rcu_walk();
617 dput(dentry); 617 dput(dentry);
618 mntput(nd->path.mnt);
618 return -ECHILD; 619 return -ECHILD;
619 } 620 }
620 mntget(nd->path.mnt);
621 unlock_rcu_walk(); 621 unlock_rcu_walk();
622 } 622 }
623 623
@@ -909,15 +909,15 @@ int follow_up(struct path *path)
909 struct mount *parent; 909 struct mount *parent;
910 struct dentry *mountpoint; 910 struct dentry *mountpoint;
911 911
912 br_read_lock(&vfsmount_lock); 912 read_seqlock_excl(&mount_lock);
913 parent = mnt->mnt_parent; 913 parent = mnt->mnt_parent;
914 if (parent == mnt) { 914 if (parent == mnt) {
915 br_read_unlock(&vfsmount_lock); 915 read_sequnlock_excl(&mount_lock);
916 return 0; 916 return 0;
917 } 917 }
918 mntget(&parent->mnt); 918 mntget(&parent->mnt);
919 mountpoint = dget(mnt->mnt_mountpoint); 919 mountpoint = dget(mnt->mnt_mountpoint);
920 br_read_unlock(&vfsmount_lock); 920 read_sequnlock_excl(&mount_lock);
921 dput(path->dentry); 921 dput(path->dentry);
922 path->dentry = mountpoint; 922 path->dentry = mountpoint;
923 mntput(path->mnt); 923 mntput(path->mnt);
@@ -1048,8 +1048,8 @@ static int follow_managed(struct path *path, unsigned flags)
1048 1048
1049 /* Something is mounted on this dentry in another 1049 /* Something is mounted on this dentry in another
1050 * namespace and/or whatever was mounted there in this 1050 * namespace and/or whatever was mounted there in this
1051 * namespace got unmounted before we managed to get the 1051 * namespace got unmounted before lookup_mnt() could
1052 * vfsmount_lock */ 1052 * get it */
1053 } 1053 }
1054 1054
1055 /* Handle an automount point */ 1055 /* Handle an automount point */
@@ -1864,6 +1864,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1864 if (flags & LOOKUP_RCU) { 1864 if (flags & LOOKUP_RCU) {
1865 lock_rcu_walk(); 1865 lock_rcu_walk();
1866 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1866 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1867 nd->m_seq = read_seqbegin(&mount_lock);
1867 } else { 1868 } else {
1868 path_get(&nd->path); 1869 path_get(&nd->path);
1869 } 1870 }
@@ -1872,6 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1872 1873
1873 nd->root.mnt = NULL; 1874 nd->root.mnt = NULL;
1874 1875
1876 nd->m_seq = read_seqbegin(&mount_lock);
1875 if (*name=='/') { 1877 if (*name=='/') {
1876 if (flags & LOOKUP_RCU) { 1878 if (flags & LOOKUP_RCU) {
1877 lock_rcu_walk(); 1879 lock_rcu_walk();
diff --git a/fs/namespace.c b/fs/namespace.c
index 500202ce10db..ac2ce8a766e1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(fs_kobj);
53 * It should be taken for write in all cases where the vfsmount 53 * It should be taken for write in all cases where the vfsmount
54 * tree or hash is modified or when a vfsmount structure is modified. 54 * tree or hash is modified or when a vfsmount structure is modified.
55 */ 55 */
56DEFINE_BRLOCK(vfsmount_lock); 56__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
57 57
58static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) 58static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
59{ 59{
@@ -547,16 +547,38 @@ static void free_vfsmnt(struct mount *mnt)
547 kmem_cache_free(mnt_cache, mnt); 547 kmem_cache_free(mnt_cache, mnt);
548} 548}
549 549
550/* call under rcu_read_lock */
551bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
552{
553 struct mount *mnt;
554 if (read_seqretry(&mount_lock, seq))
555 return false;
556 if (bastard == NULL)
557 return true;
558 mnt = real_mount(bastard);
559 mnt_add_count(mnt, 1);
560 if (likely(!read_seqretry(&mount_lock, seq)))
561 return true;
562 if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
563 mnt_add_count(mnt, -1);
564 return false;
565 }
566 rcu_read_unlock();
567 mntput(bastard);
568 rcu_read_lock();
569 return false;
570}
571
550/* 572/*
551 * find the first mount at @dentry on vfsmount @mnt. 573 * find the first mount at @dentry on vfsmount @mnt.
552 * vfsmount_lock must be held for read or write. 574 * call under rcu_read_lock()
553 */ 575 */
554struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) 576struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
555{ 577{
556 struct list_head *head = mount_hashtable + hash(mnt, dentry); 578 struct list_head *head = mount_hashtable + hash(mnt, dentry);
557 struct mount *p; 579 struct mount *p;
558 580
559 list_for_each_entry(p, head, mnt_hash) 581 list_for_each_entry_rcu(p, head, mnt_hash)
560 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) 582 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
561 return p; 583 return p;
562 return NULL; 584 return NULL;
@@ -564,7 +586,7 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
564 586
565/* 587/*
566 * find the last mount at @dentry on vfsmount @mnt. 588 * find the last mount at @dentry on vfsmount @mnt.
567 * vfsmount_lock must be held for read or write. 589 * mount_lock must be held.
568 */ 590 */
569struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) 591struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
570{ 592{
@@ -596,17 +618,17 @@ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
596struct vfsmount *lookup_mnt(struct path *path) 618struct vfsmount *lookup_mnt(struct path *path)
597{ 619{
598 struct mount *child_mnt; 620 struct mount *child_mnt;
621 struct vfsmount *m;
622 unsigned seq;
599 623
600 br_read_lock(&vfsmount_lock); 624 rcu_read_lock();
601 child_mnt = __lookup_mnt(path->mnt, path->dentry); 625 do {
602 if (child_mnt) { 626 seq = read_seqbegin(&mount_lock);
603 mnt_add_count(child_mnt, 1); 627 child_mnt = __lookup_mnt(path->mnt, path->dentry);
604 br_read_unlock(&vfsmount_lock); 628 m = child_mnt ? &child_mnt->mnt : NULL;
605 return &child_mnt->mnt; 629 } while (!legitimize_mnt(m, seq));
606 } else { 630 rcu_read_unlock();
607 br_read_unlock(&vfsmount_lock); 631 return m;
608 return NULL;
609 }
610} 632}
611 633
612static struct mountpoint *new_mountpoint(struct dentry *dentry) 634static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -874,38 +896,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
874 return ERR_PTR(err); 896 return ERR_PTR(err);
875} 897}
876 898
899static void delayed_free(struct rcu_head *head)
900{
901 struct mount *mnt = container_of(head, struct mount, mnt_rcu);
902 kfree(mnt->mnt_devname);
903#ifdef CONFIG_SMP
904 free_percpu(mnt->mnt_pcp);
905#endif
906 kmem_cache_free(mnt_cache, mnt);
907}
908
877static void mntput_no_expire(struct mount *mnt) 909static void mntput_no_expire(struct mount *mnt)
878{ 910{
879put_again: 911put_again:
880#ifdef CONFIG_SMP 912 rcu_read_lock();
881 br_read_lock(&vfsmount_lock); 913 mnt_add_count(mnt, -1);
882 if (likely(mnt->mnt_ns)) { 914 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
883 /* shouldn't be the last one */ 915 rcu_read_unlock();
884 mnt_add_count(mnt, -1);
885 br_read_unlock(&vfsmount_lock);
886 return; 916 return;
887 } 917 }
888 br_read_unlock(&vfsmount_lock);
889
890 lock_mount_hash(); 918 lock_mount_hash();
891 mnt_add_count(mnt, -1);
892 if (mnt_get_count(mnt)) { 919 if (mnt_get_count(mnt)) {
920 rcu_read_unlock();
893 unlock_mount_hash(); 921 unlock_mount_hash();
894 return; 922 return;
895 } 923 }
896#else
897 mnt_add_count(mnt, -1);
898 if (likely(mnt_get_count(mnt)))
899 return;
900 lock_mount_hash();
901#endif
902 if (unlikely(mnt->mnt_pinned)) { 924 if (unlikely(mnt->mnt_pinned)) {
903 mnt_add_count(mnt, mnt->mnt_pinned + 1); 925 mnt_add_count(mnt, mnt->mnt_pinned + 1);
904 mnt->mnt_pinned = 0; 926 mnt->mnt_pinned = 0;
927 rcu_read_unlock();
905 unlock_mount_hash(); 928 unlock_mount_hash();
906 acct_auto_close_mnt(&mnt->mnt); 929 acct_auto_close_mnt(&mnt->mnt);
907 goto put_again; 930 goto put_again;
908 } 931 }
932 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
933 rcu_read_unlock();
934 unlock_mount_hash();
935 return;
936 }
937 mnt->mnt.mnt_flags |= MNT_DOOMED;
938 rcu_read_unlock();
909 939
910 list_del(&mnt->mnt_instance); 940 list_del(&mnt->mnt_instance);
911 unlock_mount_hash(); 941 unlock_mount_hash();
@@ -924,7 +954,8 @@ put_again:
924 fsnotify_vfsmount_delete(&mnt->mnt); 954 fsnotify_vfsmount_delete(&mnt->mnt);
925 dput(mnt->mnt.mnt_root); 955 dput(mnt->mnt.mnt_root);
926 deactivate_super(mnt->mnt.mnt_sb); 956 deactivate_super(mnt->mnt.mnt_sb);
927 free_vfsmnt(mnt); 957 mnt_free_id(mnt);
958 call_rcu(&mnt->mnt_rcu, delayed_free);
928} 959}
929 960
930void mntput(struct vfsmount *mnt) 961void mntput(struct vfsmount *mnt)
@@ -1137,6 +1168,8 @@ static void namespace_unlock(void)
1137 list_splice_init(&unmounted, &head); 1168 list_splice_init(&unmounted, &head);
1138 up_write(&namespace_sem); 1169 up_write(&namespace_sem);
1139 1170
1171 synchronize_rcu();
1172
1140 while (!list_empty(&head)) { 1173 while (!list_empty(&head)) {
1141 mnt = list_first_entry(&head, struct mount, mnt_hash); 1174 mnt = list_first_entry(&head, struct mount, mnt_hash);
1142 list_del_init(&mnt->mnt_hash); 1175 list_del_init(&mnt->mnt_hash);
@@ -1152,10 +1185,13 @@ static inline void namespace_lock(void)
1152} 1185}
1153 1186
1154/* 1187/*
1155 * vfsmount lock must be held for write 1188 * mount_lock must be held
1156 * namespace_sem must be held for write 1189 * namespace_sem must be held for write
1190 * how = 0 => just this tree, don't propagate
1191 * how = 1 => propagate; we know that nobody else has reference to any victims
1192 * how = 2 => lazy umount
1157 */ 1193 */
1158void umount_tree(struct mount *mnt, int propagate) 1194void umount_tree(struct mount *mnt, int how)
1159{ 1195{
1160 LIST_HEAD(tmp_list); 1196 LIST_HEAD(tmp_list);
1161 struct mount *p; 1197 struct mount *p;
@@ -1163,7 +1199,7 @@ void umount_tree(struct mount *mnt, int propagate)
1163 for (p = mnt; p; p = next_mnt(p, mnt)) 1199 for (p = mnt; p; p = next_mnt(p, mnt))
1164 list_move(&p->mnt_hash, &tmp_list); 1200 list_move(&p->mnt_hash, &tmp_list);
1165 1201
1166 if (propagate) 1202 if (how)
1167 propagate_umount(&tmp_list); 1203 propagate_umount(&tmp_list);
1168 1204
1169 list_for_each_entry(p, &tmp_list, mnt_hash) { 1205 list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1171,6 +1207,8 @@ void umount_tree(struct mount *mnt, int propagate)
1171 list_del_init(&p->mnt_list); 1207 list_del_init(&p->mnt_list);
1172 __touch_mnt_namespace(p->mnt_ns); 1208 __touch_mnt_namespace(p->mnt_ns);
1173 p->mnt_ns = NULL; 1209 p->mnt_ns = NULL;
1210 if (how < 2)
1211 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1174 list_del_init(&p->mnt_child); 1212 list_del_init(&p->mnt_child);
1175 if (mnt_has_parent(p)) { 1213 if (mnt_has_parent(p)) {
1176 put_mountpoint(p->mnt_mp); 1214 put_mountpoint(p->mnt_mp);
@@ -1262,14 +1300,18 @@ static int do_umount(struct mount *mnt, int flags)
1262 lock_mount_hash(); 1300 lock_mount_hash();
1263 event++; 1301 event++;
1264 1302
1265 if (!(flags & MNT_DETACH)) 1303 if (flags & MNT_DETACH) {
1266 shrink_submounts(mnt);
1267
1268 retval = -EBUSY;
1269 if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
1270 if (!list_empty(&mnt->mnt_list)) 1304 if (!list_empty(&mnt->mnt_list))
1271 umount_tree(mnt, 1); 1305 umount_tree(mnt, 2);
1272 retval = 0; 1306 retval = 0;
1307 } else {
1308 shrink_submounts(mnt);
1309 retval = -EBUSY;
1310 if (!propagate_mount_busy(mnt, 2)) {
1311 if (!list_empty(&mnt->mnt_list))
1312 umount_tree(mnt, 1);
1313 retval = 0;
1314 }
1273 } 1315 }
1274 unlock_mount_hash(); 1316 unlock_mount_hash();
1275 namespace_unlock(); 1317 namespace_unlock();
@@ -1955,7 +1997,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
1955 struct mount *parent; 1997 struct mount *parent;
1956 int err; 1998 int err;
1957 1999
1958 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL); 2000 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
1959 2001
1960 mp = lock_mount(path); 2002 mp = lock_mount(path);
1961 if (IS_ERR(mp)) 2003 if (IS_ERR(mp))
@@ -2172,7 +2214,7 @@ resume:
2172 * process a list of expirable mountpoints with the intent of discarding any 2214 * process a list of expirable mountpoints with the intent of discarding any
2173 * submounts of a specific parent mountpoint 2215 * submounts of a specific parent mountpoint
2174 * 2216 *
2175 * vfsmount_lock must be held for write 2217 * mount_lock must be held for write
2176 */ 2218 */
2177static void shrink_submounts(struct mount *mnt) 2219static void shrink_submounts(struct mount *mnt)
2178{ 2220{
@@ -2558,7 +2600,7 @@ out_type:
2558/* 2600/*
2559 * Return true if path is reachable from root 2601 * Return true if path is reachable from root
2560 * 2602 *
2561 * namespace_sem or vfsmount_lock is held 2603 * namespace_sem or mount_lock is held
2562 */ 2604 */
2563bool is_path_reachable(struct mount *mnt, struct dentry *dentry, 2605bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
2564 const struct path *root) 2606 const struct path *root)
@@ -2573,9 +2615,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
2573int path_is_under(struct path *path1, struct path *path2) 2615int path_is_under(struct path *path1, struct path *path2)
2574{ 2616{
2575 int res; 2617 int res;
2576 br_read_lock(&vfsmount_lock); 2618 read_seqlock_excl(&mount_lock);
2577 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); 2619 res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
2578 br_read_unlock(&vfsmount_lock); 2620 read_sequnlock_excl(&mount_lock);
2579 return res; 2621 return res;
2580} 2622}
2581EXPORT_SYMBOL(path_is_under); 2623EXPORT_SYMBOL(path_is_under);
@@ -2748,8 +2790,6 @@ void __init mnt_init(void)
2748 for (u = 0; u < HASH_SIZE; u++) 2790 for (u = 0; u < HASH_SIZE; u++)
2749 INIT_LIST_HEAD(&mountpoint_hashtable[u]); 2791 INIT_LIST_HEAD(&mountpoint_hashtable[u]);
2750 2792
2751 br_lock_init(&vfsmount_lock);
2752
2753 err = sysfs_init(); 2793 err = sysfs_init();
2754 if (err) 2794 if (err)
2755 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 2795 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2788,9 +2828,8 @@ void kern_unmount(struct vfsmount *mnt)
2788{ 2828{
2789 /* release long term mount so mount point can be released */ 2829 /* release long term mount so mount point can be released */
2790 if (!IS_ERR_OR_NULL(mnt)) { 2830 if (!IS_ERR_OR_NULL(mnt)) {
2791 lock_mount_hash();
2792 real_mount(mnt)->mnt_ns = NULL; 2831 real_mount(mnt)->mnt_ns = NULL;
2793 unlock_mount_hash(); 2832 synchronize_rcu(); /* yecchhh... */
2794 mntput(mnt); 2833 mntput(mnt);
2795 } 2834 }
2796} 2835}
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 38cd98f112a0..371d346fa270 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -49,6 +49,8 @@ struct mnt_namespace;
49 49
50#define MNT_LOCK_READONLY 0x400000 50#define MNT_LOCK_READONLY 0x400000
51#define MNT_LOCKED 0x800000 51#define MNT_LOCKED 0x800000
52#define MNT_DOOMED 0x1000000
53#define MNT_SYNC_UMOUNT 0x2000000
52 54
53struct vfsmount { 55struct vfsmount {
54 struct dentry *mnt_root; /* root of the mounted tree */ 56 struct dentry *mnt_root; /* root of the mounted tree */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 8e47bc7a1665..492de72560fa 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -16,7 +16,7 @@ struct nameidata {
16 struct path root; 16 struct path root;
17 struct inode *inode; /* path.dentry.d_inode */ 17 struct inode *inode; /* path.dentry.d_inode */
18 unsigned int flags; 18 unsigned int flags;
19 unsigned seq; 19 unsigned seq, m_seq;
20 int last_type; 20 int last_type;
21 unsigned depth; 21 unsigned depth;
22 char *saved_names[MAX_NESTED_LINKS + 1]; 22 char *saved_names[MAX_NESTED_LINKS + 1];