Diffstat (limited to 'fs/namespace.c')
-rw-r--r--	fs/namespace.c	390
1 files changed, 194 insertions, 196 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index da5c49483430..ac2ce8a766e1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -39,7 +39,7 @@ static int mnt_group_start = 1;
 static struct list_head *mount_hashtable __read_mostly;
 static struct list_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
-static struct rw_semaphore namespace_sem;
+static DECLARE_RWSEM(namespace_sem);
 
 /* /sys/fs */
 struct kobject *fs_kobj;
@@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  * It should be taken for write in all cases where the vfsmount
  * tree or hash is modified or when a vfsmount structure is modified.
  */
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
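Note: the write side of the new mount_lock is taken through lock_mount_hash()
and unlock_mount_hash() below. Those helpers are not part of this file;
presumably they live in fs/mount.h as thin wrappers over the seqlock, along
these lines:

	/* assumed definitions, not shown in this diff */
	static inline void lock_mount_hash(void)
	{
		write_seqlock(&mount_lock);	/* spinlock + sequence bump */
	}

	static inline void unlock_mount_hash(void)
	{
		write_sequnlock(&mount_lock);
	}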
@@ -63,8 +63,6 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 	return tmp & (HASH_SIZE - 1);
 }
 
-#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
-
 /*
  * allocation is serialized by namespace_sem, but we need the spinlock to
  * serialize with freeing.
@@ -458,7 +456,7 @@ static int mnt_make_readonly(struct mount *mnt)
 {
 	int ret = 0;
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -492,15 +490,15 @@ static int mnt_make_readonly(struct mount *mnt)
 	 */
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct mount *mnt)
 {
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -512,7 +510,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 	if (atomic_long_read(&sb->s_remove_count))
 		return -EBUSY;
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
 		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
 			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -534,7 +532,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
 			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	return err;
 }
@@ -549,30 +547,56 @@ static void free_vfsmnt(struct mount *mnt)
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+	struct mount *mnt;
+	if (read_seqretry(&mount_lock, seq))
+		return false;
+	if (bastard == NULL)
+		return true;
+	mnt = real_mount(bastard);
+	mnt_add_count(mnt, 1);
+	if (likely(!read_seqretry(&mount_lock, seq)))
+		return true;
+	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+		mnt_add_count(mnt, -1);
+		return false;
+	}
+	rcu_read_unlock();
+	mntput(bastard);
+	rcu_read_lock();
+	return false;
+}
+
 /*
- * find the first or last mount at @dentry on vfsmount @mnt depending on
- * @dir. If @dir is set return the first mount else return the last mount.
- * vfsmount_lock must be held for read or write.
+ * find the first mount at @dentry on vfsmount @mnt.
+ * call under rcu_read_lock()
  */
-struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
-			   int dir)
+struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct list_head *head = mount_hashtable + hash(mnt, dentry);
-	struct list_head *tmp = head;
-	struct mount *p, *found = NULL;
+	struct mount *p;
 
-	for (;;) {
-		tmp = dir ? tmp->next : tmp->prev;
-		p = NULL;
-		if (tmp == head)
-			break;
-		p = list_entry(tmp, struct mount, mnt_hash);
-		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) {
-			found = p;
-			break;
-		}
-	}
-	return found;
+	list_for_each_entry_rcu(p, head, mnt_hash)
+		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
+			return p;
+	return NULL;
+}
+
+/*
+ * find the last mount at @dentry on vfsmount @mnt.
+ * mount_lock must be held.
+ */
+struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
+{
+	struct list_head *head = mount_hashtable + hash(mnt, dentry);
+	struct mount *p;
+
+	list_for_each_entry_reverse(p, head, mnt_hash)
+		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
+			return p;
+	return NULL;
 }
 
 /*
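Note: legitimize_mnt() is the reader-side handshake for the lockless hash walk
above: sample the sequence count, find a candidate mount, optimistically bump
its refcount, then re-check the count. If a writer ran in between, the
reference is either dropped quietly (MNT_SYNC_UMOUNT: the unmounting side will
wait out our RCU grace period before freeing) or released with a full
mntput(), and the caller must retry. The underlying seqlock read pattern, as a
minimal sketch with a hypothetical helper:

	/* hypothetical example of the plain seqcount retry loop used here */
	static bool mnt_is_readonly_sampled(struct mount *mnt)
	{
		unsigned seq;
		bool ro;

		do {
			seq = read_seqbegin(&mount_lock);	/* sample count */
			ro = mnt->mnt.mnt_flags & MNT_READONLY;	/* lockless read */
		} while (read_seqretry(&mount_lock, seq));	/* writer ran: retry */
		return ro;
	}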
@@ -594,17 +618,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct mount *child_mnt;
+	struct vfsmount *m;
+	unsigned seq;
 
-	br_read_lock(&vfsmount_lock);
-	child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
-	if (child_mnt) {
-		mnt_add_count(child_mnt, 1);
-		br_read_unlock(&vfsmount_lock);
-		return &child_mnt->mnt;
-	} else {
-		br_read_unlock(&vfsmount_lock);
-		return NULL;
-	}
+	rcu_read_lock();
+	do {
+		seq = read_seqbegin(&mount_lock);
+		child_mnt = __lookup_mnt(path->mnt, path->dentry);
+		m = child_mnt ? &child_mnt->mnt : NULL;
+	} while (!legitimize_mnt(m, seq));
+	rcu_read_unlock();
+	return m;
 }
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
@@ -796,9 +820,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	mnt->mnt.mnt_sb = root->d_sb;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	return &mnt->mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -839,9 +863,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	mnt->mnt.mnt_root = dget(root);
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	mnt->mnt_parent = mnt;
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	if ((flag & CL_SLAVE) ||
 	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
@@ -872,64 +896,66 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return ERR_PTR(err);
 }
 
-static inline void mntfree(struct mount *mnt)
+static void delayed_free(struct rcu_head *head)
 {
-	struct vfsmount *m = &mnt->mnt;
-	struct super_block *sb = m->mnt_sb;
-
-	/*
-	 * This probably indicates that somebody messed
-	 * up a mnt_want/drop_write() pair. If this
-	 * happens, the filesystem was probably unable
-	 * to make r/w->r/o transitions.
-	 */
-	/*
-	 * The locking used to deal with mnt_count decrement provides barriers,
-	 * so mnt_get_writers() below is safe.
-	 */
-	WARN_ON(mnt_get_writers(mnt));
-	fsnotify_vfsmount_delete(m);
-	dput(m->mnt_root);
-	free_vfsmnt(mnt);
-	deactivate_super(sb);
+	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
+	kfree(mnt->mnt_devname);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_pcp);
+#endif
+	kmem_cache_free(mnt_cache, mnt);
 }
 
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
-#ifdef CONFIG_SMP
-	br_read_lock(&vfsmount_lock);
-	if (likely(mnt->mnt_ns)) {
-		/* shouldn't be the last one */
-		mnt_add_count(mnt, -1);
-		br_read_unlock(&vfsmount_lock);
+	rcu_read_lock();
+	mnt_add_count(mnt, -1);
+	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+		rcu_read_unlock();
 		return;
 	}
-	br_read_unlock(&vfsmount_lock);
-
-	br_write_lock(&vfsmount_lock);
-	mnt_add_count(mnt, -1);
+	lock_mount_hash();
 	if (mnt_get_count(mnt)) {
-		br_write_unlock(&vfsmount_lock);
+		rcu_read_unlock();
+		unlock_mount_hash();
 		return;
 	}
-#else
-	mnt_add_count(mnt, -1);
-	if (likely(mnt_get_count(mnt)))
-		return;
-	br_write_lock(&vfsmount_lock);
-#endif
 	if (unlikely(mnt->mnt_pinned)) {
 		mnt_add_count(mnt, mnt->mnt_pinned + 1);
 		mnt->mnt_pinned = 0;
-		br_write_unlock(&vfsmount_lock);
+		rcu_read_unlock();
+		unlock_mount_hash();
 		acct_auto_close_mnt(&mnt->mnt);
 		goto put_again;
 	}
+	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+		rcu_read_unlock();
+		unlock_mount_hash();
+		return;
+	}
+	mnt->mnt.mnt_flags |= MNT_DOOMED;
+	rcu_read_unlock();
 
 	list_del(&mnt->mnt_instance);
-	br_write_unlock(&vfsmount_lock);
-	mntfree(mnt);
+	unlock_mount_hash();
+
+	/*
+	 * This probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair. If this
+	 * happens, the filesystem was probably unable
+	 * to make r/w->r/o transitions.
+	 */
+	/*
+	 * The locking used to deal with mnt_count decrement provides barriers,
+	 * so mnt_get_writers() below is safe.
+	 */
+	WARN_ON(mnt_get_writers(mnt));
+	fsnotify_vfsmount_delete(&mnt->mnt);
+	dput(mnt->mnt.mnt_root);
+	deactivate_super(mnt->mnt.mnt_sb);
+	mnt_free_id(mnt);
+	call_rcu(&mnt->mnt_rcu, delayed_free);
 }
 
 void mntput(struct vfsmount *mnt)
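Note: mount teardown is now split in two. The final mntput() marks the mount
MNT_DOOMED under the hash lock (so a concurrent legitimize_mnt() backs off),
does the synchronous cleanup, and defers the actual freeing to call_rcu(), so
a reader still walking the hash under rcu_read_lock() can never touch freed
memory. The generic shape of that pattern, with a hypothetical struct:

	struct obj {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	static void obj_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct obj, rcu));	/* after grace period */
	}

	static void obj_release(struct obj *o)
	{
		/* unlink o from every RCU-visible structure first, then: */
		call_rcu(&o->rcu, obj_free_rcu);
	}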
@@ -954,21 +980,21 @@ EXPORT_SYMBOL(mntget);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	real_mount(mnt)->mnt_pinned++;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 }
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	if (mnt->mnt_pinned) {
 		mnt_add_count(mnt, 1);
 		mnt->mnt_pinned--;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 }
 EXPORT_SYMBOL(mnt_unpin);
 
@@ -1085,12 +1111,12 @@ int may_umount_tree(struct vfsmount *m)
 	BUG_ON(!m);
 
 	/* write lock needed for mnt_get_count */
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += mnt_get_count(p);
 		minimum_refs += 2;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -1117,10 +1143,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	if (propagate_mount_busy(real_mount(mnt), 2))
 		ret = 0;
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1142,23 +1168,13 @@ static void namespace_unlock(void)
 	list_splice_init(&unmounted, &head);
 	up_write(&namespace_sem);
 
+	synchronize_rcu();
+
 	while (!list_empty(&head)) {
 		mnt = list_first_entry(&head, struct mount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
-		if (mnt_has_parent(mnt)) {
-			struct dentry *dentry;
-			struct mount *m;
-
-			br_write_lock(&vfsmount_lock);
-			dentry = mnt->mnt_mountpoint;
-			m = mnt->mnt_parent;
-			mnt->mnt_mountpoint = mnt->mnt.mnt_root;
-			mnt->mnt_parent = mnt;
-			m->mnt_ghosts--;
-			br_write_unlock(&vfsmount_lock);
-			dput(dentry);
-			mntput(&m->mnt);
-		}
+		if (mnt->mnt_ex_mountpoint.mnt)
+			path_put(&mnt->mnt_ex_mountpoint);
 		mntput(&mnt->mnt);
 	}
 }
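Note: the synchronize_rcu() here is what lets umount_tree() run without taking
extra per-mount references. In outline:

	/* ordering namespace_unlock() now relies on (sketch):
	 * 1. victims are unhashed under lock_mount_hash()
	 * 2. synchronize_rcu() -- every lockless reader that could still
	 *    reach them through the hash has finished
	 * 3. mntput() -- dropping the last reference may free immediately
	 */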
@@ -1169,10 +1185,13 @@ static inline void namespace_lock(void)
 }
 
 /*
- * vfsmount lock must be held for write
+ * mount_lock must be held
  * namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
  */
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
 {
 	LIST_HEAD(tmp_list);
 	struct mount *p;
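Note: the old boolean 'propagate' argument becomes a three-way mode. A
hypothetical dispatch helper, only to make the call shapes concrete (the real
call sites pass the constant directly):

	static void umount_tree_how(struct mount *mnt, bool lazy, bool propagate)
	{
		if (lazy)
			umount_tree(mnt, 2);	/* MNT_DETACH: skip MNT_SYNC_UMOUNT */
		else if (propagate)
			umount_tree(mnt, 1);	/* normal umount, propagate to peers */
		else
			umount_tree(mnt, 0);	/* dissolve a disconnected copy */
	}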
@@ -1180,7 +1199,7 @@ void umount_tree(struct mount *mnt, int propagate)
 	for (p = mnt; p; p = next_mnt(p, mnt))
 		list_move(&p->mnt_hash, &tmp_list);
 
-	if (propagate)
+	if (how)
 		propagate_umount(&tmp_list);
 
 	list_for_each_entry(p, &tmp_list, mnt_hash) {
@@ -1188,10 +1207,16 @@ void umount_tree(struct mount *mnt, int propagate)
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
+		if (how < 2)
+			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
 		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
-			p->mnt_parent->mnt_ghosts++;
 			put_mountpoint(p->mnt_mp);
+			/* move the reference to mountpoint into ->mnt_ex_mountpoint */
+			p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
+			p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
+			p->mnt_mountpoint = p->mnt.mnt_root;
+			p->mnt_parent = p;
 			p->mnt_mp = NULL;
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
@@ -1225,12 +1250,12 @@ static int do_umount(struct mount *mnt, int flags)
 	 * probably don't strictly need the lock here if we examined
 	 * all race cases, but it's a slowpath.
 	 */
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	if (mnt_get_count(mnt) != 2) {
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 		return -EBUSY;
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	if (!xchg(&mnt->mnt_expiry_mark, 1))
 		return -EAGAIN;
@@ -1272,19 +1297,23 @@ static int do_umount(struct mount *mnt, int flags)
 	}
 
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	event++;
 
-	if (!(flags & MNT_DETACH))
-		shrink_submounts(mnt);
-
-	retval = -EBUSY;
-	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+	if (flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, 1);
+			umount_tree(mnt, 2);
 		retval = 0;
+	} else {
+		shrink_submounts(mnt);
+		retval = -EBUSY;
+		if (!propagate_mount_busy(mnt, 2)) {
+			if (!list_empty(&mnt->mnt_list))
+				umount_tree(mnt, 1);
+			retval = 0;
+		}
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	namespace_unlock();
 	return retval;
 }
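Note: the restructured branch maps directly onto the two userspace entry
points; a runnable sketch (the paths are placeholders):

	#include <sys/mount.h>

	int main(void)
	{
		umount("/mnt/a");		/* busy check, then umount_tree(..., 1) */
		umount2("/mnt/b", MNT_DETACH);	/* lazy: umount_tree(..., 2) */
		return 0;
	}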
@@ -1427,18 +1456,18 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			q = clone_mnt(p, p->mnt.mnt_root, flag);
 			if (IS_ERR(q))
 				goto out;
-			br_write_lock(&vfsmount_lock);
+			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, parent, p->mnt_mp);
-			br_write_unlock(&vfsmount_lock);
+			unlock_mount_hash();
 		}
 	}
 	return res;
 out:
 	if (res) {
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		umount_tree(res, 0);
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 	}
 	return q;
 }
@@ -1460,9 +1489,9 @@ struct vfsmount *collect_mounts(struct path *path)
 void drop_collected_mounts(struct vfsmount *mnt)
 {
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	umount_tree(real_mount(mnt), 0);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -1589,7 +1618,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	if (err)
 		goto out_cleanup_ids;
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1608,7 +1637,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
 	return 0;
 
@@ -1710,10 +1739,10 @@ static int do_change_type(struct path *path, int flag)
 		goto out_unlock;
 	}
 
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 
  out_unlock:
 	namespace_unlock();
@@ -1785,9 +1814,9 @@ static int do_loopback(struct path *path, const char *old_name,
 
 	err = graft_tree(mnt, parent, mp);
 	if (err) {
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		umount_tree(mnt, 0);
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 	}
 out2:
 	unlock_mount(mp);
@@ -1846,17 +1875,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		br_write_lock(&vfsmount_lock);
+		lock_mount_hash();
 		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
-		br_write_unlock(&vfsmount_lock);
-	}
-	up_write(&sb->s_umount);
-	if (!err) {
-		br_write_lock(&vfsmount_lock);
 		touch_mnt_namespace(mnt->mnt_ns);
-		br_write_unlock(&vfsmount_lock);
+		unlock_mount_hash();
 	}
+	up_write(&sb->s_umount);
 	return err;
 }
 
@@ -1972,7 +1997,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
@@ -2077,9 +2102,7 @@ fail:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
 		namespace_lock();
-		br_write_lock(&vfsmount_lock);
 		list_del_init(&mnt->mnt_expire);
-		br_write_unlock(&vfsmount_lock);
 		namespace_unlock();
 	}
 	mntput(m);
@@ -2095,11 +2118,9 @@ fail:
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
 
 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
 
-	br_write_unlock(&vfsmount_lock);
 	namespace_unlock();
 }
 EXPORT_SYMBOL(mnt_set_expiry);
@@ -2118,7 +2139,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		return;
 
 	namespace_lock();
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -2137,7 +2158,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1);
 	}
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	namespace_unlock();
 }
2143 2164
@@ -2193,7 +2214,7 @@ resume:
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
  *
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
  */
 static void shrink_submounts(struct mount *mnt)
 {
@@ -2414,20 +2435,25 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 	return new_ns;
 }
 
-/*
- * Allocate a new namespace structure and populate it with contents
- * copied from the namespace of the passed in task structure.
- */
-static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
-		struct user_namespace *user_ns, struct fs_struct *fs)
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
 	struct mount *p, *q;
-	struct mount *old = mnt_ns->root;
+	struct mount *old;
 	struct mount *new;
 	int copy_flags;
 
+	BUG_ON(!ns);
+
+	if (likely(!(flags & CLONE_NEWNS))) {
+		get_mnt_ns(ns);
+		return ns;
+	}
+
+	old = ns->root;
+
 	new_ns = alloc_mnt_ns(user_ns);
 	if (IS_ERR(new_ns))
 		return new_ns;
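Note: with the CLONE_NEWNS fast path folded in, copy_mnt_ns() is a plain
get_mnt_ns() for ordinary fork()/clone(); only an explicit request copies the
tree. From userspace (runnable sketch):

	#define _GNU_SOURCE
	#include <sched.h>

	int main(void)
	{
		/* takes the slow branch above: alloc_mnt_ns() + copy_tree() */
		return unshare(CLONE_NEWNS);
	}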
@@ -2435,7 +2461,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	namespace_lock();
 	/* First pass: copy the tree topology */
 	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
-	if (user_ns != mnt_ns->user_ns)
+	if (user_ns != ns->user_ns)
 		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
@@ -2444,9 +2470,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		return ERR_CAST(new);
 	}
 	new_ns->root = new;
-	br_write_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new->mnt_list);
-	br_write_unlock(&vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2457,13 +2481,13 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	q = new;
 	while (p) {
 		q->mnt_ns = new_ns;
-		if (fs) {
-			if (&p->mnt == fs->root.mnt) {
-				fs->root.mnt = mntget(&q->mnt);
+		if (new_fs) {
+			if (&p->mnt == new_fs->root.mnt) {
+				new_fs->root.mnt = mntget(&q->mnt);
 				rootmnt = &p->mnt;
 			}
-			if (&p->mnt == fs->pwd.mnt) {
-				fs->pwd.mnt = mntget(&q->mnt);
+			if (&p->mnt == new_fs->pwd.mnt) {
+				new_fs->pwd.mnt = mntget(&q->mnt);
 				pwdmnt = &p->mnt;
 			}
 		}
@@ -2484,23 +2508,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	return new_ns;
 }
 
-struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
-		struct user_namespace *user_ns, struct fs_struct *new_fs)
-{
-	struct mnt_namespace *new_ns;
-
-	BUG_ON(!ns);
-	get_mnt_ns(ns);
-
-	if (!(flags & CLONE_NEWNS))
-		return ns;
-
-	new_ns = dup_mnt_ns(ns, user_ns, new_fs);
-
-	put_mnt_ns(ns);
-	return new_ns;
-}
-
 /**
  * create_mnt_ns - creates a private namespace and adds a root filesystem
  * @mnt: pointer to the new root filesystem mountpoint
@@ -2593,7 +2600,7 @@ out_type:
 /*
  * Return true if path is reachable from root
  *
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
  */
 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 		       const struct path *root)
@@ -2608,9 +2615,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 int path_is_under(struct path *path1, struct path *path2)
 {
 	int res;
-	br_read_lock(&vfsmount_lock);
+	read_seqlock_excl(&mount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-	br_read_unlock(&vfsmount_lock);
+	read_sequnlock_excl(&mount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
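Note: read_seqlock_excl()/read_sequnlock_excl() take the seqlock's underlying
spinlock without bumping the sequence count: writers are excluded for the
duration, but concurrent lockless readers are not forced to retry. A
hypothetical excl-mode reader, assuming the standard linux/seqlock.h
semantics:

	static int count_children_excl(struct mount *mnt)
	{
		struct mount *child;
		int n = 0;

		read_seqlock_excl(&mount_lock);	/* blocks writers, no seq bump */
		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child)
			n++;
		read_sequnlock_excl(&mount_lock);
		return n;
	}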
@@ -2701,7 +2708,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!is_path_reachable(old_mnt, old.dentry, &new))
 		goto out4;
 	root_mp->m_count++; /* pin it so it won't go away */
-	br_write_lock(&vfsmount_lock);
+	lock_mount_hash();
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
 	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
@@ -2713,7 +2720,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* mount new_root on / */
 	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	br_write_unlock(&vfsmount_lock);
+	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
 	put_mountpoint(root_mp);
 	error = 0;
@@ -2767,8 +2774,6 @@ void __init mnt_init(void)
 	unsigned u;
 	int err;
 
-	init_rwsem(&namespace_sem);
-
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
@@ -2785,8 +2790,6 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
-	br_lock_init(&vfsmount_lock);
-
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2802,11 +2805,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
 {
 	if (!atomic_dec_and_test(&ns->count))
 		return;
-	namespace_lock();
-	br_write_lock(&vfsmount_lock);
-	umount_tree(ns->root, 0);
-	br_write_unlock(&vfsmount_lock);
-	namespace_unlock();
+	drop_collected_mounts(&ns->root->mnt);
 	free_mnt_ns(ns);
 }
 
@@ -2829,9 +2828,8 @@ void kern_unmount(struct vfsmount *mnt)
 {
 	/* release long term mount so mount point can be released */
 	if (!IS_ERR_OR_NULL(mnt)) {
-		br_write_lock(&vfsmount_lock);
 		real_mount(mnt)->mnt_ns = NULL;
-		br_write_unlock(&vfsmount_lock);
+		synchronize_rcu();	/* yecchhh... */
 		mntput(mnt);
 	}
 }
@@ -2875,7 +2873,7 @@ bool fs_fully_visible(struct file_system_type *type)
 	if (unlikely(!ns))
 		return false;
 
-	namespace_lock();
+	down_read(&namespace_sem);
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
 		struct mount *child;
 		if (mnt->mnt.mnt_sb->s_type != type)
@@ -2896,7 +2894,7 @@ bool fs_fully_visible(struct file_system_type *type)
 	next:	;
 	}
 found:
-	namespace_unlock();
+	up_read(&namespace_sem);
 	return visible;
 }
 