author     Dave Hansen <haveblue@us.ibm.com>        2008-02-15 17:37:59 -0500
committer  Al Viro <viro@zeniv.linux.org.uk>        2008-04-19 00:29:27 -0400
commit     3d733633a633065729c9e4e254b2e5442c00ef7e
tree       8b52ba468f275f86221ddb77c29306a2405844fc
parent     2c463e95480829a2fe8f386589516e13b1289db6
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scaling
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long as that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two
different cpus.
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
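
As a rough illustration of the scheme (not the patch itself -- the real
implementation is in the diff below, and every name here is made up for
the example), a userspace model with one mutex-protected counter per
"cpu" looks like this: want/drop only touch the local counter, and the
remount,ro path takes every lock so the sum is stable:

	#include <pthread.h>

	#define NR_CPUS	4

	struct cpu_writer_count {
		pthread_mutex_t lock;
		long count;		/* may go negative on this "cpu" */
	};

	static struct cpu_writer_count writers[NR_CPUS] = {
		[0 ... NR_CPUS - 1] = { PTHREAD_MUTEX_INITIALIZER, 0 }
	};

	/* analogue of mnt_want_write(): bump only the local counter */
	void want_write(int cpu)
	{
		pthread_mutex_lock(&writers[cpu].lock);
		writers[cpu].count++;
		pthread_mutex_unlock(&writers[cpu].lock);
	}

	/* analogue of mnt_drop_write(): may run on a different "cpu" */
	void drop_write(int cpu)
	{
		pthread_mutex_lock(&writers[cpu].lock);
		writers[cpu].count--;	/* local imbalance is fine; only the sum matters */
		pthread_mutex_unlock(&writers[cpu].lock);
	}

	/* analogue of remount,ro: hold every lock, then the sum is stable */
	int make_readonly(void)
	{
		long total = 0;
		int cpu;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			pthread_mutex_lock(&writers[cpu].lock);
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			total += writers[cpu].count;
		for (cpu = NR_CPUS - 1; cpu >= 0; cpu--)
			pthread_mutex_unlock(&writers[cpu].lock);

		return total > 0 ? -1 : 0;	/* -1 standing in for -EBUSY */
	}

The model leaves out the read-only check in want_write() and the
underflow clamping that the real patch needs; it is only meant to show
why the expensive all-locks summation is confined to the rare
remount,ro path.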
I've written a little benchmark to sit in a loop for a couple of
seconds on several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
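
The actual openbench.c at that URL may differ; a minimal sketch of the
kind of loop described (the file name and the 3-byte payload are
assumptions), run as one instance per cpu, would be:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		const char *path = argc > 1 ? argv[1] : "openbench.dat";
		time_t end = time(NULL) + 2;	/* loop for a couple of seconds */
		unsigned long iterations = 0;

		while (time(NULL) < end) {
			int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);

			if (fd < 0) {
				perror("open");
				exit(1);
			}
			if (write(fd, "foo", 3) != 3)	/* tiny write, as described */
				perror("write");
			close(fd);
			iterations++;
		}
		printf("%lu open/write/close iterations\n", iterations);
		return 0;
	}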
The code in here is a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degradation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O, to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU being able to operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
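
A hypothetical sketch of that idea -- not part of this patch, and every
name below is invented for illustration -- would give each cpu a small
array of (mount, count) slots and fall back to the shared atomic only
when all N slots are in use:

	#define MNT_WRITERS_PER_CPU 4	/* the "#defined number of mounts" */

	struct mnt_writer_slot {
		struct vfsmount *mnt;
		unsigned long count;
	};

	struct mnt_writer {
		spinlock_t lock;
		struct mnt_writer_slot slots[MNT_WRITERS_PER_CPU];
	} ____cacheline_aligned_in_smp;

	/*
	 * Find (or claim) the slot for @mnt.  A NULL return means all N
	 * slots are busy and the caller must fall back to the shared
	 * mnt->__mnt_writers atomic.  Caller must hold cpu_writer->lock.
	 */
	static struct mnt_writer_slot *
	find_writer_slot(struct mnt_writer *cpu_writer, struct vfsmount *mnt)
	{
		int i;

		for (i = 0; i < MNT_WRITERS_PER_CPU; i++)
			if (cpu_writer->slots[i].mnt == mnt)
				return &cpu_writer->slots[i];
		for (i = 0; i < MNT_WRITERS_PER_CPU; i++)
			if (!cpu_writer->slots[i].mnt) {
				cpu_writer->slots[i].mnt = mnt;
				return &cpu_writer->slots[i];
			}
		return NULL;
	}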
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--  fs/namespace.c          252
-rw-r--r--  include/linux/mount.h     7
2 files changed, 244 insertions(+), 15 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 066b393578c1..e3ce18d91aad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/sysfs.h>
 #include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 	return tmp & (HASH_SIZE - 1);
 }
 
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	INIT_LIST_HEAD(&mnt->mnt_share);
 	INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	INIT_LIST_HEAD(&mnt->mnt_slave);
+	atomic_set(&mnt->__mnt_writers, 0);
 	if (name) {
 		int size = strlen(name) + 1;
 		char *newname = kmalloc(size, GFP_KERNEL);
@@ -88,6 +92,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
  * we can determine when writes are able to occur to
  * a filesystem.
  */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*.  This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+	/*
+	 * If holding multiple instances of this lock, they
+	 * must be ordered by cpu number.
+	 */
+	spinlock_t lock;
+	struct lock_class_key lock_class; /* compiles out with !lockdep */
+	unsigned long count;
+	struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+		spin_lock_init(&writer->lock);
+		lockdep_set_class(&writer->lock, &writer->lock_class);
+		writer->count = 0;
+	}
+	return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_unlock(&cpu_writer->lock);
+	}
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+	if (!cpu_writer->mnt)
+		return;
+	/*
+	 * This is in case anyone ever leaves an invalid,
+	 * old ->mnt and a count of 0.
+	 */
+	if (!cpu_writer->count)
+		return;
+	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+	cpu_writer->count = 0;
+}
+/*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+					  struct vfsmount *mnt)
+{
+	if (cpu_writer->mnt == mnt)
+		return;
+	__clear_mnt_count(cpu_writer);
+	cpu_writer->mnt = mnt;
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
 /**
  * mnt_want_write - get write access to a mount
  * @mnt: the mount on which to take a write
@@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
  */
 int mnt_want_write(struct vfsmount *mnt)
 {
-	if (__mnt_is_readonly(mnt))
-		return -EROFS;
-	return 0;
+	int ret = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+	if (__mnt_is_readonly(mnt)) {
+		ret = -EROFS;
+		goto out;
+	}
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	cpu_writer->count++;
+out:
+	spin_unlock(&cpu_writer->lock);
+	put_cpu_var(mnt_writers);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+static void lock_mnt_writers(void)
+{
+	int cpu;
+	struct mnt_writer *cpu_writer;
+
+	for_each_possible_cpu(cpu) {
+		cpu_writer = &per_cpu(mnt_writers, cpu);
+		spin_lock(&cpu_writer->lock);
+		__clear_mnt_count(cpu_writer);
+		cpu_writer->mnt = NULL;
+	}
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count.  Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+	if (atomic_read(&mnt->__mnt_writers) >=
+	    MNT_WRITER_UNDERFLOW_LIMIT)
+		return;
+	/*
+	 * It isn't necessary to hold all of the locks
+	 * at the same time, but doing it this way makes
+	 * us share a lot more code.
+	 */
+	lock_mnt_writers();
+	/*
+	 * vfsmount_lock is for mnt_flags.
+	 */
+	spin_lock(&vfsmount_lock);
+	/*
+	 * If coalescing the per-cpu writer counts did not
+	 * get us back to a positive writer count, we have
+	 * a bug.
+	 */
+	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+		printk(KERN_DEBUG "leak detected on mount(%p) writers "
+				"count: %d\n",
+			mnt, atomic_read(&mnt->__mnt_writers));
+		WARN_ON(1);
+		/* use the flag to keep the dmesg spam down */
+		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+	}
+	spin_unlock(&vfsmount_lock);
+	unlock_mnt_writers();
+}
+
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
+	int must_check_underflow = 0;
+	struct mnt_writer *cpu_writer;
+
+	cpu_writer = &get_cpu_var(mnt_writers);
+	spin_lock(&cpu_writer->lock);
+
+	use_cpu_writer_for_mount(cpu_writer, mnt);
+	if (cpu_writer->count > 0) {
+		cpu_writer->count--;
+	} else {
+		must_check_underflow = 1;
+		atomic_dec(&mnt->__mnt_writers);
+	}
+
+	spin_unlock(&cpu_writer->lock);
+	/*
+	 * Logically, we could call this each time,
+	 * but the __mnt_writers cacheline tends to
+	 * be cold, and makes this expensive.
+	 */
+	if (must_check_underflow)
+		handle_write_count_underflow(mnt);
+	/*
+	 * This could be done right after the spinlock
+	 * is taken because the spinlock keeps us on
+	 * the cpu, and disables preemption.  However,
+	 * putting it here bounds the amount that
+	 * __mnt_writers can underflow.  Without it,
+	 * we could theoretically wrap __mnt_writers.
+	 */
+	put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-/*
- * __mnt_is_readonly: check whether a mount is read-only
- * @mnt: the mount to check for its write status
- *
- * This shouldn't be used directly ouside of the VFS.
- * It does not guarantee that the filesystem will stay
- * r/w, just that it is right *now*.  This can not and
- * should not be used in place of IS_RDONLY(inode).
- */
-int __mnt_is_readonly(struct vfsmount *mnt)
+int mnt_make_readonly(struct vfsmount *mnt)
 {
-	return (mnt->mnt_sb->s_flags & MS_RDONLY);
+	int ret = 0;
+
+	lock_mnt_writers();
+	/*
+	 * With all the locks held, this value is stable
+	 */
+	if (atomic_read(&mnt->__mnt_writers) > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	/*
+	 * actually set mount's r/o flag here to make
+	 * __mnt_is_readonly() true, which keeps anyone
+	 * from doing a successful mnt_want_write().
+	 */
+out:
+	unlock_mnt_writers();
+	return ret;
 }
-EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
 int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
@@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
+	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
+	/*
+	 * We don't have to hold all of the locks at the
+	 * same time here because we know that we're the
+	 * last reference to mnt and that no new writers
+	 * can come in.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+		if (cpu_writer->mnt != mnt)
+			continue;
+		spin_lock(&cpu_writer->lock);
+		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+		cpu_writer->count = 0;
+		/*
+		 * Might as well do this so that no one
+		 * ever sees the pointer and expects
+		 * it to be valid.
+		 */
+		cpu_writer->mnt = NULL;
+		spin_unlock(&cpu_writer->lock);
+	}
+	/*
+	 * This probably indicates that somebody messed
+	 * up a mnt_want/drop_write() pair.  If this
+	 * happens, the filesystem was probably unable
+	 * to make r/w->r/o transitions.
+	 */
+	WARN_ON(atomic_read(&mnt->__mnt_writers));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 2eecd2c8c760..8c8e94369ac8 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -14,6 +14,7 @@
 
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/nodemask.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
@@ -30,6 +31,7 @@ struct mnt_namespace;
 #define MNT_RELATIME	0x20
 
 #define MNT_SHRINKABLE	0x100
+#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -62,6 +64,11 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
+	/*
+	 * This value is not stable unless all of the mnt_writers[] spinlocks
+	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
+	 */
+	atomic_t __mnt_writers;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)