summaryrefslogtreecommitdiffstats
path: root/fs/namespace.c
diff options
context:
space:
mode:
authorDave Hansen <haveblue@us.ibm.com>2008-02-15 17:37:59 -0500
committerAl Viro <viro@zeniv.linux.org.uk>2008-04-19 00:29:27 -0400
commit3d733633a633065729c9e4e254b2e5442c00ef7e (patch)
tree8b52ba468f275f86221ddb77c29306a2405844fc /fs/namespace.c
parent2c463e95480829a2fe8f386589516e13b1289db6 (diff)
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually implements the tracking of the number of writers to a mount. However, it causes scalability problems because there can be hundreds of cpus doing open()/close() on files on the same mnt at the same time. Even an atomic_t in the mnt has massive scaling problems because the cacheline gets so terribly contended. This uses a statically-allocated percpu variable. All want/drop operations are local to a cpu as long as that cpu operates on the same mount, and there are no writer count imbalances. Writer count imbalances happen when a write is taken on one cpu, and released on another, like when an open/close pair is performed on two different cpus. Upon a remount,ro request, all of the data from the percpu variables is collected (expensive, but very rare) and we determine if there are any outstanding writers to the mount. I've written a little benchmark to sit in a loop for a couple of seconds in several cpus in parallel doing open/write/close loops. http://sr71.net/~dave/linux/openbench.c The code in here is a worst-possible case for this patch. It does opens on a _pair_ of files in two different mounts in parallel. This should cause my code to lose its "operate on the same mount" optimization completely. This worst-case scenario causes a 3% degradation in the benchmark. I could probably get rid of even this 3%, but it would be more complex than what I have here, and I think this is getting into acceptable territory. In practice, I expect writing more than 3 bytes to a file, as well as disk I/O, to mask any effects that this has. (To get rid of that 3%, we could have a #defined number of mounts in the percpu variable. So, instead of a CPU getting to operate only on percpu data when it accesses only one mount, it could stay on percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_count() stepping on freed vfsmount. Acked-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c252
1 files changed, 237 insertions, 15 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 066b393578c1..e3ce18d91aad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/acct.h> 18#include <linux/acct.h>
19#include <linux/capability.h> 19#include <linux/capability.h>
20#include <linux/cpumask.h>
20#include <linux/module.h> 21#include <linux/module.h>
21#include <linux/sysfs.h> 22#include <linux/sysfs.h>
22#include <linux/seq_file.h> 23#include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
55 return tmp & (HASH_SIZE - 1); 56 return tmp & (HASH_SIZE - 1);
56} 57}
57 58
59#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
60
58struct vfsmount *alloc_vfsmnt(const char *name) 61struct vfsmount *alloc_vfsmnt(const char *name)
59{ 62{
60 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 63 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
68 INIT_LIST_HEAD(&mnt->mnt_share); 71 INIT_LIST_HEAD(&mnt->mnt_share);
69 INIT_LIST_HEAD(&mnt->mnt_slave_list); 72 INIT_LIST_HEAD(&mnt->mnt_slave_list);
70 INIT_LIST_HEAD(&mnt->mnt_slave); 73 INIT_LIST_HEAD(&mnt->mnt_slave);
74 atomic_set(&mnt->__mnt_writers, 0);
71 if (name) { 75 if (name) {
72 int size = strlen(name) + 1; 76 int size = strlen(name) + 1;
73 char *newname = kmalloc(size, GFP_KERNEL); 77 char *newname = kmalloc(size, GFP_KERNEL);
@@ -88,6 +92,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
88 * we can determine when writes are able to occur to 92 * we can determine when writes are able to occur to
89 * a filesystem. 93 * a filesystem.
90 */ 94 */
95/*
96 * __mnt_is_readonly: check whether a mount is read-only
97 * @mnt: the mount to check for its write status
98 *
99 * This shouldn't be used directly ouside of the VFS.
100 * It does not guarantee that the filesystem will stay
101 * r/w, just that it is right *now*. This can not and
102 * should not be used in place of IS_RDONLY(inode).
103 * mnt_want/drop_write() will _keep_ the filesystem
104 * r/w.
105 */
106int __mnt_is_readonly(struct vfsmount *mnt)
107{
108 return (mnt->mnt_sb->s_flags & MS_RDONLY);
109}
110EXPORT_SYMBOL_GPL(__mnt_is_readonly);
111
112struct mnt_writer {
113 /*
114 * If holding multiple instances of this lock, they
115 * must be ordered by cpu number.
116 */
117 spinlock_t lock;
118 struct lock_class_key lock_class; /* compiles out with !lockdep */
119 unsigned long count;
120 struct vfsmount *mnt;
121} ____cacheline_aligned_in_smp;
122static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
123
124static int __init init_mnt_writers(void)
125{
126 int cpu;
127 for_each_possible_cpu(cpu) {
128 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
129 spin_lock_init(&writer->lock);
130 lockdep_set_class(&writer->lock, &writer->lock_class);
131 writer->count = 0;
132 }
133 return 0;
134}
135fs_initcall(init_mnt_writers);
136
137static void unlock_mnt_writers(void)
138{
139 int cpu;
140 struct mnt_writer *cpu_writer;
141
142 for_each_possible_cpu(cpu) {
143 cpu_writer = &per_cpu(mnt_writers, cpu);
144 spin_unlock(&cpu_writer->lock);
145 }
146}
147
148static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
149{
150 if (!cpu_writer->mnt)
151 return;
152 /*
153 * This is in case anyone ever leaves an invalid,
154 * old ->mnt and a count of 0.
155 */
156 if (!cpu_writer->count)
157 return;
158 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
159 cpu_writer->count = 0;
160}
161 /*
162 * must hold cpu_writer->lock
163 */
164static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
165 struct vfsmount *mnt)
166{
167 if (cpu_writer->mnt == mnt)
168 return;
169 __clear_mnt_count(cpu_writer);
170 cpu_writer->mnt = mnt;
171}
172
173/*
174 * Most r/o checks on a fs are for operations that take
175 * discrete amounts of time, like a write() or unlink().
176 * We must keep track of when those operations start
177 * (for permission checks) and when they end, so that
178 * we can determine when writes are able to occur to
179 * a filesystem.
180 */
91/** 181/**
92 * mnt_want_write - get write access to a mount 182 * mnt_want_write - get write access to a mount
93 * @mnt: the mount on which to take a write 183 * @mnt: the mount on which to take a write
@@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
100 */ 190 */
101int mnt_want_write(struct vfsmount *mnt) 191int mnt_want_write(struct vfsmount *mnt)
102{ 192{
103 if (__mnt_is_readonly(mnt)) 193 int ret = 0;
104 return -EROFS; 194 struct mnt_writer *cpu_writer;
105 return 0; 195
196 cpu_writer = &get_cpu_var(mnt_writers);
197 spin_lock(&cpu_writer->lock);
198 if (__mnt_is_readonly(mnt)) {
199 ret = -EROFS;
200 goto out;
201 }
202 use_cpu_writer_for_mount(cpu_writer, mnt);
203 cpu_writer->count++;
204out:
205 spin_unlock(&cpu_writer->lock);
206 put_cpu_var(mnt_writers);
207 return ret;
106} 208}
107EXPORT_SYMBOL_GPL(mnt_want_write); 209EXPORT_SYMBOL_GPL(mnt_want_write);
108 210
211static void lock_mnt_writers(void)
212{
213 int cpu;
214 struct mnt_writer *cpu_writer;
215
216 for_each_possible_cpu(cpu) {
217 cpu_writer = &per_cpu(mnt_writers, cpu);
218 spin_lock(&cpu_writer->lock);
219 __clear_mnt_count(cpu_writer);
220 cpu_writer->mnt = NULL;
221 }
222}
223
224/*
225 * These per-cpu write counts are not guaranteed to have
226 * matched increments and decrements on any given cpu.
227 * A file open()ed for write on one cpu and close()d on
228 * another cpu will imbalance this count. Make sure it
229 * does not get too far out of whack.
230 */
231static void handle_write_count_underflow(struct vfsmount *mnt)
232{
233 if (atomic_read(&mnt->__mnt_writers) >=
234 MNT_WRITER_UNDERFLOW_LIMIT)
235 return;
236 /*
237 * It isn't necessary to hold all of the locks
238 * at the same time, but doing it this way makes
239 * us share a lot more code.
240 */
241 lock_mnt_writers();
242 /*
243 * vfsmount_lock is for mnt_flags.
244 */
245 spin_lock(&vfsmount_lock);
246 /*
247 * If coalescing the per-cpu writer counts did not
248 * get us back to a positive writer count, we have
249 * a bug.
250 */
251 if ((atomic_read(&mnt->__mnt_writers) < 0) &&
252 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
253 printk(KERN_DEBUG "leak detected on mount(%p) writers "
254 "count: %d\n",
255 mnt, atomic_read(&mnt->__mnt_writers));
256 WARN_ON(1);
257 /* use the flag to keep the dmesg spam down */
258 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
259 }
260 spin_unlock(&vfsmount_lock);
261 unlock_mnt_writers();
262}
263
109/** 264/**
110 * mnt_drop_write - give up write access to a mount 265 * mnt_drop_write - give up write access to a mount
111 * @mnt: the mount on which to give up write access 266 * @mnt: the mount on which to give up write access
@@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
116 */ 271 */
117void mnt_drop_write(struct vfsmount *mnt) 272void mnt_drop_write(struct vfsmount *mnt)
118{ 273{
274 int must_check_underflow = 0;
275 struct mnt_writer *cpu_writer;
276
277 cpu_writer = &get_cpu_var(mnt_writers);
278 spin_lock(&cpu_writer->lock);
279
280 use_cpu_writer_for_mount(cpu_writer, mnt);
281 if (cpu_writer->count > 0) {
282 cpu_writer->count--;
283 } else {
284 must_check_underflow = 1;
285 atomic_dec(&mnt->__mnt_writers);
286 }
287
288 spin_unlock(&cpu_writer->lock);
289 /*
290 * Logically, we could call this each time,
291 * but the __mnt_writers cacheline tends to
292 * be cold, and makes this expensive.
293 */
294 if (must_check_underflow)
295 handle_write_count_underflow(mnt);
296 /*
297 * This could be done right after the spinlock
298 * is taken because the spinlock keeps us on
299 * the cpu, and disables preemption. However,
300 * putting it here bounds the amount that
301 * __mnt_writers can underflow. Without it,
302 * we could theoretically wrap __mnt_writers.
303 */
304 put_cpu_var(mnt_writers);
119} 305}
120EXPORT_SYMBOL_GPL(mnt_drop_write); 306EXPORT_SYMBOL_GPL(mnt_drop_write);
121 307
122/* 308int mnt_make_readonly(struct vfsmount *mnt)
123 * __mnt_is_readonly: check whether a mount is read-only
124 * @mnt: the mount to check for its write status
125 *
126 * This shouldn't be used directly ouside of the VFS.
127 * It does not guarantee that the filesystem will stay
128 * r/w, just that it is right *now*. This can not and
129 * should not be used in place of IS_RDONLY(inode).
130 */
131int __mnt_is_readonly(struct vfsmount *mnt)
132{ 309{
133 return (mnt->mnt_sb->s_flags & MS_RDONLY); 310 int ret = 0;
311
312 lock_mnt_writers();
313 /*
314 * With all the locks held, this value is stable
315 */
316 if (atomic_read(&mnt->__mnt_writers) > 0) {
317 ret = -EBUSY;
318 goto out;
319 }
320 /*
321 * actually set mount's r/o flag here to make
322 * __mnt_is_readonly() true, which keeps anyone
323 * from doing a successful mnt_want_write().
324 */
325out:
326 unlock_mnt_writers();
327 return ret;
134} 328}
135EXPORT_SYMBOL_GPL(__mnt_is_readonly);
136 329
137int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 330int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
138{ 331{
@@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
325 518
326static inline void __mntput(struct vfsmount *mnt) 519static inline void __mntput(struct vfsmount *mnt)
327{ 520{
521 int cpu;
328 struct super_block *sb = mnt->mnt_sb; 522 struct super_block *sb = mnt->mnt_sb;
523 /*
524 * We don't have to hold all of the locks at the
525 * same time here because we know that we're the
526 * last reference to mnt and that no new writers
527 * can come in.
528 */
529 for_each_possible_cpu(cpu) {
530 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
531 if (cpu_writer->mnt != mnt)
532 continue;
533 spin_lock(&cpu_writer->lock);
534 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
535 cpu_writer->count = 0;
536 /*
537 * Might as well do this so that no one
538 * ever sees the pointer and expects
539 * it to be valid.
540 */
541 cpu_writer->mnt = NULL;
542 spin_unlock(&cpu_writer->lock);
543 }
544 /*
545 * This probably indicates that somebody messed
546 * up a mnt_want/drop_write() pair. If this
547 * happens, the filesystem was probably unable
548 * to make r/w->r/o transitions.
549 */
550 WARN_ON(atomic_read(&mnt->__mnt_writers));
329 dput(mnt->mnt_root); 551 dput(mnt->mnt_root);
330 free_vfsmnt(mnt); 552 free_vfsmnt(mnt);
331 deactivate_super(sb); 553 deactivate_super(sb);