aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <npiggin@kernel.dk>2011-01-07 01:50:11 -0500
committerNick Piggin <npiggin@kernel.dk>2011-01-07 01:50:33 -0500
commitb3e19d924b6eaf2ca7d22cba99a517c5171007b6 (patch)
tree8c1fa4074114a883a4e2de2f7d12eb29ed91bdf1
parentc6653a838b1b2738561aff0b8c0f62a9b714bdd9 (diff)
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability. We need to take a reference on the vfsmount for every successful path lookup, and those lookups often go to the same mount point.

The fundamental difficulty is that a "simple" reference count can never be made scalable, because any time a reference is dropped, we must check whether that was the last reference. Doing that requires communication with all other CPUs that may have taken a reference count.

We can make refcounts more scalable in a couple of ways, involving keeping distributed counters and checking for the global-zero condition less frequently:

- check the global sum once every interval (this will delay zero detection for some interval, so it's probably a showstopper for vfsmounts).

- keep a local count and only take the global sum when the local count reaches 0 (this is difficult for vfsmounts, because we can't hold preempt off for the life of a reference, so a counter would need to be per-thread or tied strongly to a particular CPU, which requires more locking).

- keep a local difference of increments and decrements, which allows us to sum the total difference and hence find the refcount when summing all CPUs. Then, keep a single integer "long" refcount for slow and long-lasting references, and only take the global sum of local counters when the long refcount is 0.

This last scheme is what I implemented here. Attached mounts and process root and working directory references are "long" references, and everything else is a short reference.

This allows scalable vfsmount references during path walking over mounted subtrees and unattached (lazy umounted) mounts with processes still running in them.

This results in one fewer atomic op in the fastpath: mntget is now just a per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock and non-atomic decrement in the common case. However, the code is otherwise bigger and heavier, so single-threaded performance is basically a wash.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
-rw-r--r--arch/ia64/kernel/perfmon.c2
-rw-r--r--drivers/mtd/mtdchar.c2
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/fs_struct.c26
-rw-r--r--fs/internal.h1
-rw-r--r--fs/namei.c24
-rw-r--r--fs/namespace.c242
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/pnode.c4
-rw-r--r--fs/super.c2
-rw-r--r--include/linux/mount.h53
-rw-r--r--include/linux/path.h2
-rw-r--r--net/socket.c19
13 files changed, 283 insertions, 98 deletions
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 5a24f40bb48e..f099b82703d8 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -1542,7 +1542,7 @@ pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
1542 * any operations on the root directory. However, we need a non-trivial 1542 * any operations on the root directory. However, we need a non-trivial
1543 * d_name - pfm: will go nicely and kill the special-casing in procfs. 1543 * d_name - pfm: will go nicely and kill the special-casing in procfs.
1544 */ 1544 */
1545static struct vfsmount *pfmfs_mnt; 1545static struct vfsmount *pfmfs_mnt __read_mostly;
1546 1546
1547static int __init 1547static int __init
1548init_pfm_fs(void) 1548init_pfm_fs(void)
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 4759d827e8c7..f511dd15fd31 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -1201,7 +1201,7 @@ err_unregister_chdev:
1201static void __exit cleanup_mtdchar(void) 1201static void __exit cleanup_mtdchar(void)
1202{ 1202{
1203 unregister_mtd_user(&mtdchar_notifier); 1203 unregister_mtd_user(&mtdchar_notifier);
1204 mntput(mtd_inode_mnt); 1204 mntput_long(mtd_inode_mnt);
1205 unregister_filesystem(&mtd_inodefs_type); 1205 unregister_filesystem(&mtd_inodefs_type);
1206 __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); 1206 __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
1207} 1207}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9d92b33da8a0..5fd38112a6ca 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -232,7 +232,7 @@ static int __init anon_inode_init(void)
232 return 0; 232 return 0;
233 233
234err_mntput: 234err_mntput:
235 mntput(anon_inode_mnt); 235 mntput_long(anon_inode_mnt);
236err_unregister_filesystem: 236err_unregister_filesystem:
237 unregister_filesystem(&anon_inode_fs_type); 237 unregister_filesystem(&anon_inode_fs_type);
238err_exit: 238err_exit:
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 60b8531f41c5..68ca487bedb1 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,11 +17,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
17 write_seqcount_begin(&fs->seq); 17 write_seqcount_begin(&fs->seq);
18 old_root = fs->root; 18 old_root = fs->root;
19 fs->root = *path; 19 fs->root = *path;
20 path_get(path); 20 path_get_long(path);
21 write_seqcount_end(&fs->seq); 21 write_seqcount_end(&fs->seq);
22 spin_unlock(&fs->lock); 22 spin_unlock(&fs->lock);
23 if (old_root.dentry) 23 if (old_root.dentry)
24 path_put(&old_root); 24 path_put_long(&old_root);
25} 25}
26 26
27/* 27/*
@@ -36,12 +36,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
36 write_seqcount_begin(&fs->seq); 36 write_seqcount_begin(&fs->seq);
37 old_pwd = fs->pwd; 37 old_pwd = fs->pwd;
38 fs->pwd = *path; 38 fs->pwd = *path;
39 path_get(path); 39 path_get_long(path);
40 write_seqcount_end(&fs->seq); 40 write_seqcount_end(&fs->seq);
41 spin_unlock(&fs->lock); 41 spin_unlock(&fs->lock);
42 42
43 if (old_pwd.dentry) 43 if (old_pwd.dentry)
44 path_put(&old_pwd); 44 path_put_long(&old_pwd);
45} 45}
46 46
47void chroot_fs_refs(struct path *old_root, struct path *new_root) 47void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +59,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
59 write_seqcount_begin(&fs->seq); 59 write_seqcount_begin(&fs->seq);
60 if (fs->root.dentry == old_root->dentry 60 if (fs->root.dentry == old_root->dentry
61 && fs->root.mnt == old_root->mnt) { 61 && fs->root.mnt == old_root->mnt) {
62 path_get(new_root); 62 path_get_long(new_root);
63 fs->root = *new_root; 63 fs->root = *new_root;
64 count++; 64 count++;
65 } 65 }
66 if (fs->pwd.dentry == old_root->dentry 66 if (fs->pwd.dentry == old_root->dentry
67 && fs->pwd.mnt == old_root->mnt) { 67 && fs->pwd.mnt == old_root->mnt) {
68 path_get(new_root); 68 path_get_long(new_root);
69 fs->pwd = *new_root; 69 fs->pwd = *new_root;
70 count++; 70 count++;
71 } 71 }
@@ -76,13 +76,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
76 } while_each_thread(g, p); 76 } while_each_thread(g, p);
77 read_unlock(&tasklist_lock); 77 read_unlock(&tasklist_lock);
78 while (count--) 78 while (count--)
79 path_put(old_root); 79 path_put_long(old_root);
80} 80}
81 81
82void free_fs_struct(struct fs_struct *fs) 82void free_fs_struct(struct fs_struct *fs)
83{ 83{
84 path_put(&fs->root); 84 path_put_long(&fs->root);
85 path_put(&fs->pwd); 85 path_put_long(&fs->pwd);
86 kmem_cache_free(fs_cachep, fs); 86 kmem_cache_free(fs_cachep, fs);
87} 87}
88 88
@@ -115,7 +115,13 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
115 spin_lock_init(&fs->lock); 115 spin_lock_init(&fs->lock);
116 seqcount_init(&fs->seq); 116 seqcount_init(&fs->seq);
117 fs->umask = old->umask; 117 fs->umask = old->umask;
118 get_fs_root_and_pwd(old, &fs->root, &fs->pwd); 118
119 spin_lock(&old->lock);
120 fs->root = old->root;
121 path_get_long(&fs->root);
122 fs->pwd = old->pwd;
123 path_get_long(&fs->pwd);
124 spin_unlock(&old->lock);
119 } 125 }
120 return fs; 126 return fs;
121} 127}
diff --git a/fs/internal.h b/fs/internal.h
index e43b9a4dbf4e..9687c2ee2735 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -63,6 +63,7 @@ extern int copy_mount_string(const void __user *, char **);
63 63
64extern void free_vfsmnt(struct vfsmount *); 64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *); 65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt);
66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
68 struct vfsmount *); 69 struct vfsmount *);
diff --git a/fs/namei.c b/fs/namei.c
index 4e957bf744ae..19433cdba011 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -368,6 +368,18 @@ void path_get(struct path *path)
368EXPORT_SYMBOL(path_get); 368EXPORT_SYMBOL(path_get);
369 369
370/** 370/**
371 * path_get_long - get a long reference to a path
372 * @path: path to get the reference to
373 *
374 * Given a path increment the reference count to the dentry and the vfsmount.
375 */
376void path_get_long(struct path *path)
377{
378 mntget_long(path->mnt);
379 dget(path->dentry);
380}
381
382/**
371 * path_put - put a reference to a path 383 * path_put - put a reference to a path
372 * @path: path to put the reference to 384 * @path: path to put the reference to
373 * 385 *
@@ -381,6 +393,18 @@ void path_put(struct path *path)
381EXPORT_SYMBOL(path_put); 393EXPORT_SYMBOL(path_put);
382 394
383/** 395/**
396 * path_put_long - put a long reference to a path
397 * @path: path to put the reference to
398 *
399 * Given a path decrement the reference count to the dentry and the vfsmount.
400 */
401void path_put_long(struct path *path)
402{
403 dput(path->dentry);
404 mntput_long(path->mnt);
405}
406
407/**
384 * nameidata_drop_rcu - drop this nameidata out of rcu-walk 408 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
385 * @nd: nameidata pathwalk data to drop 409 * @nd: nameidata pathwalk data to drop
386 * @Returns: 0 on success, -ECHILD on failure 410 * @Returns: 0 on success, -ECHILD on failure
diff --git a/fs/namespace.c b/fs/namespace.c
index 03b82350f020..3ddfd9046c44 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -138,6 +138,64 @@ void mnt_release_group_id(struct vfsmount *mnt)
138 mnt->mnt_group_id = 0; 138 mnt->mnt_group_id = 0;
139} 139}
140 140
141/*
142 * vfsmount lock must be held for read
143 */
144static inline void mnt_add_count(struct vfsmount *mnt, int n)
145{
146#ifdef CONFIG_SMP
147 this_cpu_add(mnt->mnt_pcp->mnt_count, n);
148#else
149 preempt_disable();
150 mnt->mnt_count += n;
151 preempt_enable();
152#endif
153}
154
155static inline void mnt_set_count(struct vfsmount *mnt, int n)
156{
157#ifdef CONFIG_SMP
158 this_cpu_write(mnt->mnt_pcp->mnt_count, n);
159#else
160 mnt->mnt_count = n;
161#endif
162}
163
164/*
165 * vfsmount lock must be held for read
166 */
167static inline void mnt_inc_count(struct vfsmount *mnt)
168{
169 mnt_add_count(mnt, 1);
170}
171
172/*
173 * vfsmount lock must be held for read
174 */
175static inline void mnt_dec_count(struct vfsmount *mnt)
176{
177 mnt_add_count(mnt, -1);
178}
179
180/*
181 * vfsmount lock must be held for write
182 */
183unsigned int mnt_get_count(struct vfsmount *mnt)
184{
185#ifdef CONFIG_SMP
186 unsigned int count = atomic_read(&mnt->mnt_longrefs);
187 int cpu;
188
189 for_each_possible_cpu(cpu) {
190 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
191 }
192
193 return count;
194#else
195 return mnt->mnt_count;
196#endif
197}
198
141struct vfsmount *alloc_vfsmnt(const char *name) 199struct vfsmount *alloc_vfsmnt(const char *name)
142{ 200{
143 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -154,7 +212,17 @@ struct vfsmount *alloc_vfsmnt(const char *name)
154 goto out_free_id; 212 goto out_free_id;
155 } 213 }
156 214
157 atomic_set(&mnt->mnt_count, 1); 215#ifdef CONFIG_SMP
216 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
217 if (!mnt->mnt_pcp)
218 goto out_free_devname;
219
220 atomic_set(&mnt->mnt_longrefs, 1);
221#else
222 mnt->mnt_count = 1;
223 mnt->mnt_writers = 0;
224#endif
225
158 INIT_LIST_HEAD(&mnt->mnt_hash); 226 INIT_LIST_HEAD(&mnt->mnt_hash);
159 INIT_LIST_HEAD(&mnt->mnt_child); 227 INIT_LIST_HEAD(&mnt->mnt_child);
160 INIT_LIST_HEAD(&mnt->mnt_mounts); 228 INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -166,13 +234,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)
166#ifdef CONFIG_FSNOTIFY 234#ifdef CONFIG_FSNOTIFY
167 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 235 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
168#endif 236#endif
169#ifdef CONFIG_SMP
170 mnt->mnt_writers = alloc_percpu(int);
171 if (!mnt->mnt_writers)
172 goto out_free_devname;
173#else
174 mnt->mnt_writers = 0;
175#endif
176 } 237 }
177 return mnt; 238 return mnt;
178 239
@@ -219,7 +280,7 @@ EXPORT_SYMBOL_GPL(__mnt_is_readonly);
219static inline void mnt_inc_writers(struct vfsmount *mnt) 280static inline void mnt_inc_writers(struct vfsmount *mnt)
220{ 281{
221#ifdef CONFIG_SMP 282#ifdef CONFIG_SMP
222 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; 283 this_cpu_inc(mnt->mnt_pcp->mnt_writers);
223#else 284#else
224 mnt->mnt_writers++; 285 mnt->mnt_writers++;
225#endif 286#endif
@@ -228,7 +289,7 @@ static inline void mnt_inc_writers(struct vfsmount *mnt)
228static inline void mnt_dec_writers(struct vfsmount *mnt) 289static inline void mnt_dec_writers(struct vfsmount *mnt)
229{ 290{
230#ifdef CONFIG_SMP 291#ifdef CONFIG_SMP
231 (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; 292 this_cpu_dec(mnt->mnt_pcp->mnt_writers);
232#else 293#else
233 mnt->mnt_writers--; 294 mnt->mnt_writers--;
234#endif 295#endif
@@ -241,7 +302,7 @@ static unsigned int mnt_get_writers(struct vfsmount *mnt)
241 int cpu; 302 int cpu;
242 303
243 for_each_possible_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
244 count += *per_cpu_ptr(mnt->mnt_writers, cpu); 305 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
245 } 306 }
246 307
247 return count; 308 return count;
@@ -418,7 +479,7 @@ void free_vfsmnt(struct vfsmount *mnt)
418 kfree(mnt->mnt_devname); 479 kfree(mnt->mnt_devname);
419 mnt_free_id(mnt); 480 mnt_free_id(mnt);
420#ifdef CONFIG_SMP 481#ifdef CONFIG_SMP
421 free_percpu(mnt->mnt_writers); 482 free_percpu(mnt->mnt_pcp);
422#endif 483#endif
423 kmem_cache_free(mnt_cache, mnt); 484 kmem_cache_free(mnt_cache, mnt);
424} 485}
@@ -652,9 +713,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
652 return NULL; 713 return NULL;
653} 714}
654 715
655static inline void __mntput(struct vfsmount *mnt) 716static inline void mntfree(struct vfsmount *mnt)
656{ 717{
657 struct super_block *sb = mnt->mnt_sb; 718 struct super_block *sb = mnt->mnt_sb;
719
658 /* 720 /*
659 * This probably indicates that somebody messed 721 * This probably indicates that somebody messed
660 * up a mnt_want/drop_write() pair. If this 722 * up a mnt_want/drop_write() pair. If this
@@ -662,8 +724,8 @@ static inline void __mntput(struct vfsmount *mnt)
662 * to make r/w->r/o transitions. 724 * to make r/w->r/o transitions.
663 */ 725 */
664 /* 726 /*
665 * atomic_dec_and_lock() used to deal with ->mnt_count decrements 727 * The locking used to deal with mnt_count decrement provides barriers,
666 * provides barriers, so mnt_get_writers() below is safe. AV 728 * so mnt_get_writers() below is safe.
667 */ 729 */
668 WARN_ON(mnt_get_writers(mnt)); 730 WARN_ON(mnt_get_writers(mnt));
669 fsnotify_vfsmount_delete(mnt); 731 fsnotify_vfsmount_delete(mnt);
@@ -672,28 +734,113 @@ static inline void __mntput(struct vfsmount *mnt)
672 deactivate_super(sb); 734 deactivate_super(sb);
673} 735}
674 736
675void mntput_no_expire(struct vfsmount *mnt) 737#ifdef CONFIG_SMP
676{ 738static inline void __mntput(struct vfsmount *mnt, int longrefs)
677repeat: 739{
678 if (atomic_add_unless(&mnt->mnt_count, -1, 1)) 740 if (!longrefs) {
679 return; 741put_again:
742 br_read_lock(vfsmount_lock);
743 if (likely(atomic_read(&mnt->mnt_longrefs))) {
744 mnt_dec_count(mnt);
745 br_read_unlock(vfsmount_lock);
746 return;
747 }
748 br_read_unlock(vfsmount_lock);
749 } else {
750 BUG_ON(!atomic_read(&mnt->mnt_longrefs));
751 if (atomic_add_unless(&mnt->mnt_longrefs, -1, 1))
752 return;
753 }
754
680 br_write_lock(vfsmount_lock); 755 br_write_lock(vfsmount_lock);
681 if (!atomic_dec_and_test(&mnt->mnt_count)) { 756 if (!longrefs)
757 mnt_dec_count(mnt);
758 else
759 atomic_dec(&mnt->mnt_longrefs);
760 if (mnt_get_count(mnt)) {
682 br_write_unlock(vfsmount_lock); 761 br_write_unlock(vfsmount_lock);
683 return; 762 return;
684 } 763 }
685 if (likely(!mnt->mnt_pinned)) { 764 if (unlikely(mnt->mnt_pinned)) {
765 mnt_add_count(mnt, mnt->mnt_pinned + 1);
766 mnt->mnt_pinned = 0;
686 br_write_unlock(vfsmount_lock); 767 br_write_unlock(vfsmount_lock);
687 __mntput(mnt); 768 acct_auto_close_mnt(mnt);
769 goto put_again;
770 }
771 br_write_unlock(vfsmount_lock);
772 mntfree(mnt);
773}
774#else
775static inline void __mntput(struct vfsmount *mnt, int longrefs)
776{
777put_again:
778 mnt_dec_count(mnt);
779 if (likely(mnt_get_count(mnt)))
688 return; 780 return;
781 br_write_lock(vfsmount_lock);
782 if (unlikely(mnt->mnt_pinned)) {
783 mnt_add_count(mnt, mnt->mnt_pinned + 1);
784 mnt->mnt_pinned = 0;
785 br_write_unlock(vfsmount_lock);
786 acct_auto_close_mnt(mnt);
787 goto put_again;
689 } 788 }
690 atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
691 mnt->mnt_pinned = 0;
692 br_write_unlock(vfsmount_lock); 789 br_write_unlock(vfsmount_lock);
693 acct_auto_close_mnt(mnt); 790 mntfree(mnt);
694 goto repeat; 791}
792#endif
793
794static void mntput_no_expire(struct vfsmount *mnt)
795{
796 __mntput(mnt, 0);
797}
798
799void mntput(struct vfsmount *mnt)
800{
801 if (mnt) {
802 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
803 if (unlikely(mnt->mnt_expiry_mark))
804 mnt->mnt_expiry_mark = 0;
805 __mntput(mnt, 0);
806 }
807}
808EXPORT_SYMBOL(mntput);
809
810struct vfsmount *mntget(struct vfsmount *mnt)
811{
812 if (mnt)
813 mnt_inc_count(mnt);
814 return mnt;
815}
816EXPORT_SYMBOL(mntget);
817
818void mntput_long(struct vfsmount *mnt)
819{
820#ifdef CONFIG_SMP
821 if (mnt) {
822 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
823 if (unlikely(mnt->mnt_expiry_mark))
824 mnt->mnt_expiry_mark = 0;
825 __mntput(mnt, 1);
826 }
827#else
828 mntput(mnt);
829#endif
695} 830}
696EXPORT_SYMBOL(mntput_no_expire); 831EXPORT_SYMBOL(mntput_long);
832
833struct vfsmount *mntget_long(struct vfsmount *mnt)
834{
835#ifdef CONFIG_SMP
836 if (mnt)
837 atomic_inc(&mnt->mnt_longrefs);
838 return mnt;
839#else
840 return mntget(mnt);
841#endif
842}
843EXPORT_SYMBOL(mntget_long);
697 844
698void mnt_pin(struct vfsmount *mnt) 845void mnt_pin(struct vfsmount *mnt)
699{ 846{
@@ -701,19 +848,17 @@ void mnt_pin(struct vfsmount *mnt)
701 mnt->mnt_pinned++; 848 mnt->mnt_pinned++;
702 br_write_unlock(vfsmount_lock); 849 br_write_unlock(vfsmount_lock);
703} 850}
704
705EXPORT_SYMBOL(mnt_pin); 851EXPORT_SYMBOL(mnt_pin);
706 852
707void mnt_unpin(struct vfsmount *mnt) 853void mnt_unpin(struct vfsmount *mnt)
708{ 854{
709 br_write_lock(vfsmount_lock); 855 br_write_lock(vfsmount_lock);
710 if (mnt->mnt_pinned) { 856 if (mnt->mnt_pinned) {
711 atomic_inc(&mnt->mnt_count); 857 mnt_inc_count(mnt);
712 mnt->mnt_pinned--; 858 mnt->mnt_pinned--;
713 } 859 }
714 br_write_unlock(vfsmount_lock); 860 br_write_unlock(vfsmount_lock);
715} 861}
716
717EXPORT_SYMBOL(mnt_unpin); 862EXPORT_SYMBOL(mnt_unpin);
718 863
719static inline void mangle(struct seq_file *m, const char *s) 864static inline void mangle(struct seq_file *m, const char *s)
@@ -1008,12 +1153,13 @@ int may_umount_tree(struct vfsmount *mnt)
1008 int minimum_refs = 0; 1153 int minimum_refs = 0;
1009 struct vfsmount *p; 1154 struct vfsmount *p;
1010 1155
1011 br_read_lock(vfsmount_lock); 1156 /* write lock needed for mnt_get_count */
1157 br_write_lock(vfsmount_lock);
1012 for (p = mnt; p; p = next_mnt(p, mnt)) { 1158 for (p = mnt; p; p = next_mnt(p, mnt)) {
1013 actual_refs += atomic_read(&p->mnt_count); 1159 actual_refs += mnt_get_count(p);
1014 minimum_refs += 2; 1160 minimum_refs += 2;
1015 } 1161 }
1016 br_read_unlock(vfsmount_lock); 1162 br_write_unlock(vfsmount_lock);
1017 1163
1018 if (actual_refs > minimum_refs) 1164 if (actual_refs > minimum_refs)
1019 return 0; 1165 return 0;
@@ -1040,10 +1186,10 @@ int may_umount(struct vfsmount *mnt)
1040{ 1186{
1041 int ret = 1; 1187 int ret = 1;
1042 down_read(&namespace_sem); 1188 down_read(&namespace_sem);
1043 br_read_lock(vfsmount_lock); 1189 br_write_lock(vfsmount_lock);
1044 if (propagate_mount_busy(mnt, 2)) 1190 if (propagate_mount_busy(mnt, 2))
1045 ret = 0; 1191 ret = 0;
1046 br_read_unlock(vfsmount_lock); 1192 br_write_unlock(vfsmount_lock);
1047 up_read(&namespace_sem); 1193 up_read(&namespace_sem);
1048 return ret; 1194 return ret;
1049} 1195}
@@ -1070,7 +1216,7 @@ void release_mounts(struct list_head *head)
1070 dput(dentry); 1216 dput(dentry);
1071 mntput(m); 1217 mntput(m);
1072 } 1218 }
1073 mntput(mnt); 1219 mntput_long(mnt);
1074 } 1220 }
1075} 1221}
1076 1222
@@ -1125,8 +1271,16 @@ static int do_umount(struct vfsmount *mnt, int flags)
1125 flags & (MNT_FORCE | MNT_DETACH)) 1271 flags & (MNT_FORCE | MNT_DETACH))
1126 return -EINVAL; 1272 return -EINVAL;
1127 1273
1128 if (atomic_read(&mnt->mnt_count) != 2) 1274 /*
1275 * probably don't strictly need the lock here if we examined
1276 * all race cases, but it's a slowpath.
1277 */
1278 br_write_lock(vfsmount_lock);
1279 if (mnt_get_count(mnt) != 2) {
1280 br_write_unlock(vfsmount_lock); /* release, not re-take, the lock before bailing out */
1129 return -EBUSY; 1281 return -EBUSY;
1282 }
1283 br_write_unlock(vfsmount_lock);
1130 1284
1131 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1285 if (!xchg(&mnt->mnt_expiry_mark, 1))
1132 return -EAGAIN; 1286 return -EAGAIN;
@@ -1815,7 +1969,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
1815 1969
1816unlock: 1970unlock:
1817 up_write(&namespace_sem); 1971 up_write(&namespace_sem);
1818 mntput(newmnt); 1972 mntput_long(newmnt);
1819 return err; 1973 return err;
1820} 1974}
1821 1975
@@ -2148,11 +2302,11 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2148 if (fs) { 2302 if (fs) {
2149 if (p == fs->root.mnt) { 2303 if (p == fs->root.mnt) {
2150 rootmnt = p; 2304 rootmnt = p;
2151 fs->root.mnt = mntget(q); 2305 fs->root.mnt = mntget_long(q);
2152 } 2306 }
2153 if (p == fs->pwd.mnt) { 2307 if (p == fs->pwd.mnt) {
2154 pwdmnt = p; 2308 pwdmnt = p;
2155 fs->pwd.mnt = mntget(q); 2309 fs->pwd.mnt = mntget_long(q);
2156 } 2310 }
2157 } 2311 }
2158 p = next_mnt(p, mnt_ns->root); 2312 p = next_mnt(p, mnt_ns->root);
@@ -2161,9 +2315,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2161 up_write(&namespace_sem); 2315 up_write(&namespace_sem);
2162 2316
2163 if (rootmnt) 2317 if (rootmnt)
2164 mntput(rootmnt); 2318 mntput_long(rootmnt);
2165 if (pwdmnt) 2319 if (pwdmnt)
2166 mntput(pwdmnt); 2320 mntput_long(pwdmnt);
2167 2321
2168 return new_ns; 2322 return new_ns;
2169} 2323}
@@ -2350,6 +2504,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2350 touch_mnt_namespace(current->nsproxy->mnt_ns); 2504 touch_mnt_namespace(current->nsproxy->mnt_ns);
2351 br_write_unlock(vfsmount_lock); 2505 br_write_unlock(vfsmount_lock);
2352 chroot_fs_refs(&root, &new); 2506 chroot_fs_refs(&root, &new);
2507
2353 error = 0; 2508 error = 0;
2354 path_put(&root_parent); 2509 path_put(&root_parent);
2355 path_put(&parent_path); 2510 path_put(&parent_path);
@@ -2376,6 +2531,7 @@ static void __init init_mount_tree(void)
2376 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2531 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
2377 if (IS_ERR(mnt)) 2532 if (IS_ERR(mnt))
2378 panic("Can't create rootfs"); 2533 panic("Can't create rootfs");
2534
2379 ns = create_mnt_ns(mnt); 2535 ns = create_mnt_ns(mnt);
2380 if (IS_ERR(ns)) 2536 if (IS_ERR(ns))
2381 panic("Can't allocate initial namespace"); 2537 panic("Can't allocate initial namespace");
diff --git a/fs/pipe.c b/fs/pipe.c
index cfe3a7f2ee21..68f1f8e4e23b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1292,7 +1292,7 @@ static int __init init_pipe_fs(void)
1292static void __exit exit_pipe_fs(void) 1292static void __exit exit_pipe_fs(void)
1293{ 1293{
1294 unregister_filesystem(&pipe_fs_type); 1294 unregister_filesystem(&pipe_fs_type);
1295 mntput(pipe_mnt); 1295 mntput_long(pipe_mnt);
1296} 1296}
1297 1297
1298fs_initcall(init_pipe_fs); 1298fs_initcall(init_pipe_fs);
diff --git a/fs/pnode.c b/fs/pnode.c
index 8066b8dd748f..d42514e32380 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -288,7 +288,7 @@ out:
288 */ 288 */
289static inline int do_refcount_check(struct vfsmount *mnt, int count) 289static inline int do_refcount_check(struct vfsmount *mnt, int count)
290{ 290{
291 int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts; 291 int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
292 return (mycount > count); 292 return (mycount > count);
293} 293}
294 294
@@ -300,7 +300,7 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
300 * Check if any of these mounts that **do not have submounts** 300 * Check if any of these mounts that **do not have submounts**
301 * have more references than 'refcnt'. If so return busy. 301 * have more references than 'refcnt'. If so return busy.
302 * 302 *
303 * vfsmount lock must be held for read or write 303 * vfsmount lock must be held for write
304 */ 304 */
305int propagate_mount_busy(struct vfsmount *mnt, int refcnt) 305int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
306{ 306{
diff --git a/fs/super.c b/fs/super.c
index 968ba013011a..823e061faa87 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1140,7 +1140,7 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1140 return mnt; 1140 return mnt;
1141 1141
1142 err: 1142 err:
1143 mntput(mnt); 1143 mntput_long(mnt);
1144 return ERR_PTR(err); 1144 return ERR_PTR(err);
1145} 1145}
1146 1146
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5e7a59408dd4..1869ea24a739 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -13,6 +13,7 @@
13#include <linux/list.h> 13#include <linux/list.h>
14#include <linux/nodemask.h> 14#include <linux/nodemask.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/seqlock.h>
16#include <asm/atomic.h> 17#include <asm/atomic.h>
17 18
18struct super_block; 19struct super_block;
@@ -46,12 +47,24 @@ struct mnt_namespace;
46 47
47#define MNT_INTERNAL 0x4000 48#define MNT_INTERNAL 0x4000
48 49
50struct mnt_pcp {
51 int mnt_count;
52 int mnt_writers;
53};
54
49struct vfsmount { 55struct vfsmount {
50 struct list_head mnt_hash; 56 struct list_head mnt_hash;
51 struct vfsmount *mnt_parent; /* fs we are mounted on */ 57 struct vfsmount *mnt_parent; /* fs we are mounted on */
52 struct dentry *mnt_mountpoint; /* dentry of mountpoint */ 58 struct dentry *mnt_mountpoint; /* dentry of mountpoint */
53 struct dentry *mnt_root; /* root of the mounted tree */ 59 struct dentry *mnt_root; /* root of the mounted tree */
54 struct super_block *mnt_sb; /* pointer to superblock */ 60 struct super_block *mnt_sb; /* pointer to superblock */
61#ifdef CONFIG_SMP
62 struct mnt_pcp __percpu *mnt_pcp;
63 atomic_t mnt_longrefs;
64#else
65 int mnt_count;
66 int mnt_writers;
67#endif
55 struct list_head mnt_mounts; /* list of children, anchored here */ 68 struct list_head mnt_mounts; /* list of children, anchored here */
56 struct list_head mnt_child; /* and going through their mnt_child */ 69 struct list_head mnt_child; /* and going through their mnt_child */
57 int mnt_flags; 70 int mnt_flags;
@@ -70,57 +83,25 @@ struct vfsmount {
70 struct mnt_namespace *mnt_ns; /* containing namespace */ 83 struct mnt_namespace *mnt_ns; /* containing namespace */
71 int mnt_id; /* mount identifier */ 84 int mnt_id; /* mount identifier */
72 int mnt_group_id; /* peer group identifier */ 85 int mnt_group_id; /* peer group identifier */
73 /*
74 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
75 * to let these frequently modified fields in a separate cache line
76 * (so that reads of mnt_flags wont ping-pong on SMP machines)
77 */
78 atomic_t mnt_count;
79 int mnt_expiry_mark; /* true if marked for expiry */ 86 int mnt_expiry_mark; /* true if marked for expiry */
80 int mnt_pinned; 87 int mnt_pinned;
81 int mnt_ghosts; 88 int mnt_ghosts;
82#ifdef CONFIG_SMP
83 int __percpu *mnt_writers;
84#else
85 int mnt_writers;
86#endif
87}; 89};
88 90
89static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
90{
91#ifdef CONFIG_SMP
92 return mnt->mnt_writers;
93#else
94 return &mnt->mnt_writers;
95#endif
96}
97
98static inline struct vfsmount *mntget(struct vfsmount *mnt)
99{
100 if (mnt)
101 atomic_inc(&mnt->mnt_count);
102 return mnt;
103}
104
105struct file; /* forward dec */ 91struct file; /* forward dec */
106 92
107extern int mnt_want_write(struct vfsmount *mnt); 93extern int mnt_want_write(struct vfsmount *mnt);
108extern int mnt_want_write_file(struct file *file); 94extern int mnt_want_write_file(struct file *file);
109extern int mnt_clone_write(struct vfsmount *mnt); 95extern int mnt_clone_write(struct vfsmount *mnt);
110extern void mnt_drop_write(struct vfsmount *mnt); 96extern void mnt_drop_write(struct vfsmount *mnt);
111extern void mntput_no_expire(struct vfsmount *mnt); 97extern void mntput(struct vfsmount *mnt);
98extern struct vfsmount *mntget(struct vfsmount *mnt);
99extern void mntput_long(struct vfsmount *mnt);
100extern struct vfsmount *mntget_long(struct vfsmount *mnt);
112extern void mnt_pin(struct vfsmount *mnt); 101extern void mnt_pin(struct vfsmount *mnt);
113extern void mnt_unpin(struct vfsmount *mnt); 102extern void mnt_unpin(struct vfsmount *mnt);
114extern int __mnt_is_readonly(struct vfsmount *mnt); 103extern int __mnt_is_readonly(struct vfsmount *mnt);
115 104
116static inline void mntput(struct vfsmount *mnt)
117{
118 if (mnt) {
119 mnt->mnt_expiry_mark = 0;
120 mntput_no_expire(mnt);
121 }
122}
123
124extern struct vfsmount *do_kern_mount(const char *fstype, int flags, 105extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
125 const char *name, void *data); 106 const char *name, void *data);
126 107
diff --git a/include/linux/path.h b/include/linux/path.h
index edc98dec6266..a581e8c06533 100644
--- a/include/linux/path.h
+++ b/include/linux/path.h
@@ -10,7 +10,9 @@ struct path {
10}; 10};
11 11
12extern void path_get(struct path *); 12extern void path_get(struct path *);
13extern void path_get_long(struct path *);
13extern void path_put(struct path *); 14extern void path_put(struct path *);
15extern void path_put_long(struct path *);
14 16
15static inline int path_equal(const struct path *path1, const struct path *path2) 17static inline int path_equal(const struct path *path1, const struct path *path2)
16{ 18{
diff --git a/net/socket.c b/net/socket.c
index 0ee74c325320..815bba3d2fe0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2390,6 +2390,8 @@ EXPORT_SYMBOL(sock_unregister);
2390 2390
2391static int __init sock_init(void) 2391static int __init sock_init(void)
2392{ 2392{
2393 int err;
2394
2393 /* 2395 /*
2394 * Initialize sock SLAB cache. 2396 * Initialize sock SLAB cache.
2395 */ 2397 */
@@ -2406,8 +2408,15 @@ static int __init sock_init(void)
2406 */ 2408 */
2407 2409
2408 init_inodecache(); 2410 init_inodecache();
2409 register_filesystem(&sock_fs_type); 2411
2412 err = register_filesystem(&sock_fs_type);
2413 if (err)
2414 goto out_fs;
2410 sock_mnt = kern_mount(&sock_fs_type); 2415 sock_mnt = kern_mount(&sock_fs_type);
2416 if (IS_ERR(sock_mnt)) {
2417 err = PTR_ERR(sock_mnt);
2418 goto out_mount;
2419 }
2411 2420
2412 /* The real protocol initialization is performed in later initcalls. 2421 /* The real protocol initialization is performed in later initcalls.
2413 */ 2422 */
@@ -2420,7 +2429,13 @@ static int __init sock_init(void)
2420 skb_timestamping_init(); 2429 skb_timestamping_init();
2421#endif 2430#endif
2422 2431
2423 return 0; 2432out:
2433 return err;
2434
2435out_mount:
2436 unregister_filesystem(&sock_fs_type);
2437out_fs:
2438 goto out;
2424} 2439}
2425 2440
2426core_initcall(sock_init); /* early initcall */ 2441core_initcall(sock_init); /* early initcall */