diff options
author | Dave Hansen <haveblue@us.ibm.com> | 2008-02-15 17:37:59 -0500 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2008-04-19 00:29:27 -0400 |
commit | 3d733633a633065729c9e4e254b2e5442c00ef7e (patch) | |
tree | 8b52ba468f275f86221ddb77c29306a2405844fc /include/linux | |
parent | 2c463e95480829a2fe8f386589516e13b1289db6 (diff) |
[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually
implements the tracking of the number of writers to a mount.
However, it causes scalability problems because there can be
hundreds of cpus doing open()/close() on files on the same mnt at
the same time. Even an atomic_t in the mnt has massive scaling
problems because the cacheline gets so terribly contended.
This uses a statically-allocated percpu variable. All want/drop
operations are local to a cpu as long as that cpu operates on the same
mount, and there are no writer count imbalances. Writer count
imbalances happen when a write is taken on one cpu, and released
on another, like when an open/close pair is performed on two different cpus.
Upon a remount,ro request, all of the data from the percpu
variables is collected (expensive, but very rare) and we determine
if there are any outstanding writers to the mount.
I've written a little benchmark to sit in a loop for a couple of
seconds in several cpus in parallel doing open/write/close loops.
http://sr71.net/~dave/linux/openbench.c
The code in here is a worst-possible case for this patch. It
does opens on a _pair_ of files in two different mounts in parallel.
This should cause my code to lose its "operate on the same mount"
optimization completely. This worst-case scenario causes a 3%
degradation in the benchmark.
I could probably get rid of even this 3%, but it would be more
complex than what I have here, and I think this is getting into
acceptable territory. In practice, I expect writing more than 3
bytes to a file, as well as disk I/O to mask any effects that this
has.
(To get rid of that 3%, we could have an #defined number of mounts
in the percpu variable. So, instead of a CPU getting to operate only
on percpu data when it accesses only one mount, it could stay on
percpu data when it only accesses N or fewer mounts.)
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/mount.h | 7 |
1 files changed, 7 insertions, 0 deletions
diff --git a/include/linux/mount.h b/include/linux/mount.h index 2eecd2c8c760..8c8e94369ac8 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h | |||
@@ -14,6 +14,7 @@ | |||
14 | 14 | ||
15 | #include <linux/types.h> | 15 | #include <linux/types.h> |
16 | #include <linux/list.h> | 16 | #include <linux/list.h> |
17 | #include <linux/nodemask.h> | ||
17 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
18 | #include <asm/atomic.h> | 19 | #include <asm/atomic.h> |
19 | 20 | ||
@@ -30,6 +31,7 @@ struct mnt_namespace; | |||
30 | #define MNT_RELATIME 0x20 | 31 | #define MNT_RELATIME 0x20 |
31 | 32 | ||
32 | #define MNT_SHRINKABLE 0x100 | 33 | #define MNT_SHRINKABLE 0x100 |
34 | #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ | ||
33 | 35 | ||
34 | #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ | 36 | #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ |
35 | #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ | 37 | #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ |
@@ -62,6 +64,11 @@ struct vfsmount { | |||
62 | int mnt_expiry_mark; /* true if marked for expiry */ | 64 | int mnt_expiry_mark; /* true if marked for expiry */ |
63 | int mnt_pinned; | 65 | int mnt_pinned; |
64 | int mnt_ghosts; | 66 | int mnt_ghosts; |
67 | /* | ||
68 | * This value is not stable unless all of the mnt_writers[] spinlocks | ||
69 | * are held, and all mnt_writer[]s on this mount have 0 as their ->count | ||
70 | */ | ||
71 | atomic_t __mnt_writers; | ||
65 | }; | 72 | }; |
66 | 73 | ||
67 | static inline struct vfsmount *mntget(struct vfsmount *mnt) | 74 | static inline struct vfsmount *mntget(struct vfsmount *mnt) |