From 5eef7fa905c814826f518aca2d414ca77508ce30 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:22 +1100 Subject: fs: dcache documentation cleanup Remove redundant (and incorrect, since dcache RCU lookup) dentry locking documentation and point to the canonical document. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6a4aea30aa09..fff975576b5b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -141,22 +141,16 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); }; -/* the dentry parameter passed to d_hash and d_compare is the parent +/* + * Locking rules for dentry_operations callbacks are to be found in + * Documentation/filesystems/Locking. Keep it updated! + * + * the dentry parameter passed to d_hash and d_compare is the parent * directory of the entries to be compared. It is used in case these * functions need any directory specific information for determining * equivalency classes. Using the dentry itself might not work, as it * might be a negative dentry which has no information associated with - * it */ - -/* -locking rules: - big lock dcache_lock d_lock may block -d_revalidate: no no no yes -d_hash no no no yes -d_compare: no yes yes no -d_delete: no yes no no -d_release: no no no yes -d_iput: no no no yes + * it. */ /* d_flags entries */ -- cgit v1.2.2 From fe15ce446beb3a33583af81ffe6c9d01a75314ed Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:23 +1100 Subject: fs: change d_delete semantics Change d_delete from a dentry deletion notification to a dentry caching advice, more like ->drop_inode. Require it to be constant and idempotent, and not take d_lock. This is how all existing filesystems use the callback anyway. 
This makes fine grained dentry locking of dput and dentry lru scanning much simpler. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index fff975576b5b..cbfc9567e4e9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -133,9 +133,9 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); - int (*d_delete)(struct dentry *); + int (*d_hash)(struct dentry *, struct qstr *); + int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); -- cgit v1.2.2 From fb2d5b86aff355a27ebfc132d3c99f4a940cc3fe Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:26 +1100 Subject: fs: name case update method smbfs and ncpfs want to update a live dentry name in-place. Rather than have them open code the locking, provide a documented dcache API. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index cbfc9567e4e9..6cdf4995c90a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -290,6 +290,8 @@ static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *in return res; } +extern void dentry_update_name_case(struct dentry *, struct qstr *); + /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); -- cgit v1.2.2 From 621e155a3591962420eacdd39f6f0aa29ceb221e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:27 +1100 Subject: fs: change d_compare for rcu-walk Change d_compare so it may be called from lock-free RCU lookups. This does put significant restrictions on what may be done from the callback, however there don't seem to have been any problems with in-tree fses. If some strange use case pops up that _really_ cannot cope with the rcu-walk rules, we can just add new rcu-unaware callbacks, which would cause name lookup to drop out of rcu-walk mode. For in-tree filesystems, this is just a mechanical change. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6cdf4995c90a..75a072bf2a34 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -134,7 +134,9 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(struct dentry *, struct qstr *); - int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -145,12 +147,8 @@ struct dentry_operations { * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/Locking. Keep it updated! * - * the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining - * equivalency classes. Using the dentry itself might not work, as it - * might be a negative dentry which has no information associated with - * it. + * FUrther descriptions are found in Documentation/filesystems/vfs.txt. + * Keep it updated too! */ /* d_flags entries */ -- cgit v1.2.2 From b1e6a015a580ad145689ad1d6b4aa0e03e6c868b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:28 +1100 Subject: fs: change d_hash for rcu-walk Change d_hash so it may be called from lock-free RCU lookups. See similar patch for d_compare for details. For in-tree filesystems, this is just a mechanical change. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 75a072bf2a34..1149e706f04d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -133,7 +133,8 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash)(struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); -- cgit v1.2.2 From ec2447c278ee973d35f38e53ca16ba7f965ae33d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:29 +1100 Subject: hostfs: simplify locking Remove dcache_lock locking from hostfs filesystem, and move it into dcache helpers. All that is required is a coherent path name. Protection from concurrent modification of the namespace after path name generation is not provided in current code, because dcache_lock is dropped before the path is used. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1149e706f04d..cea27dfca532 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -311,7 +311,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *path, struct path *root, char *, int); extern char *d_path(const struct path *, char *, int); extern char *d_path_with_unreachable(const struct path *, char *, int); -extern char *__dentry_path(struct dentry *, char *, int); +extern char *dentry_path_raw(struct dentry *, char *, int); extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ -- cgit v1.2.2 From 789680d1ee9311cdf095241dc02bd9784d799cd1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:30 +1100 Subject: fs: dcache scale hash Add a new lock, dcache_hash_lock, to protect the dcache hash table from concurrent modification. d_hash is also protected by d_lock. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index cea27dfca532..2feb624b67f1 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -184,39 +184,6 @@ struct dentry_operations { extern spinlock_t dcache_lock; extern seqlock_t rename_lock; -/** - * d_drop - drop a dentry - * @dentry: dentry to drop - * - * d_drop() unhashes the entry from the parent dentry hashes, so that it won't - * be found through a VFS lookup any more. Note that this is different from - * deleting the dentry - d_delete will try to mark the dentry negative if - * possible, giving a successful _negative_ lookup, while d_drop will - * just make the cache lookup fail. 
- * - * d_drop() is used mainly for stuff that wants to invalidate a dentry for some - * reason (NFS timeouts or autofs deletes). - * - * __d_drop requires dentry->d_lock. - */ - -static inline void __d_drop(struct dentry *dentry) -{ - if (!(dentry->d_flags & DCACHE_UNHASHED)) { - dentry->d_flags |= DCACHE_UNHASHED; - hlist_del_rcu(&dentry->d_hash); - } -} - -static inline void d_drop(struct dentry *dentry) -{ - spin_lock(&dcache_lock); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); -} - static inline int dname_external(struct dentry *dentry) { return dentry->d_name.name != dentry->d_iname; @@ -228,6 +195,8 @@ static inline int dname_external(struct dentry *dentry) extern void d_instantiate(struct dentry *, struct inode *); extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); extern struct dentry * d_materialise_unique(struct dentry *, struct inode *); +extern void __d_drop(struct dentry *dentry); +extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); /* allocate/de-allocate */ -- cgit v1.2.2 From b7ab39f631f505edc2bbdb86620d5493f995c9da Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:32 +1100 Subject: fs: dcache scale dentry refcount Make d_count non-atomic and protect it with d_lock. This allows us to ensure a 0 refcount dentry remains 0 without dcache_lock. It is also fairly natural when we start protecting many other dentry members with d_lock. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2feb624b67f1..b0ade2d46805 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -87,7 +87,7 @@ full_name_hash(const unsigned char *name, unsigned int len) #endif struct dentry { - atomic_t d_count; + unsigned int d_count; /* protected by d_lock */ unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ int d_mounted; @@ -297,17 +297,28 @@ extern char *dentry_path(struct dentry *, char *, int); * needs and they take necessary precautions) you should hold dcache_lock * and call dget_locked() instead of dget(). */ - +static inline struct dentry *dget_dlock(struct dentry *dentry) +{ + if (dentry) { + BUG_ON(!dentry->d_count); + dentry->d_count++; + } + return dentry; +} static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { - BUG_ON(!atomic_read(&dentry->d_count)); - atomic_inc(&dentry->d_count); + spin_lock(&dentry->d_lock); + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); } return dentry; } extern struct dentry * dget_locked(struct dentry *); +extern struct dentry * dget_locked_dlock(struct dentry *); + +extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed @@ -338,16 +349,6 @@ static inline void dont_mount(struct dentry *dentry) spin_unlock(&dentry->d_lock); } -static inline struct dentry *dget_parent(struct dentry *dentry) -{ - struct dentry *ret; - - spin_lock(&dentry->d_lock); - ret = dget(dentry->d_parent); - spin_unlock(&dentry->d_lock); - return ret; -} - extern void dput(struct dentry *); static inline int d_mountpoint(struct dentry *dentry) -- cgit v1.2.2 From 2fd6b7f50797f2e993eea59e0a0b8c6399c811dc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:34 +1100 Subject: fs: dcache scale 
subdirs Protect d_subdirs and d_child with d_lock, except in filesystems that aren't using dcache_lock for these anyway (eg. using i_mutex). Note: if we change the locking rule in future so that ->d_child protection is provided only with ->d_parent->d_lock, it may allow us to reduce some locking. But it would be an exception to an otherwise regular locking scheme, so we'd have to see some good results. Probably not worthwhile. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b0ade2d46805..ddf4f55624f7 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -305,6 +305,7 @@ static inline struct dentry *dget_dlock(struct dentry *dentry) } return dentry; } + static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { -- cgit v1.2.2 From b23fb0a60379a95e10c671f646b259ea2558421e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:35 +1100 Subject: fs: scale inode alias list Add a new lock, dcache_inode_lock, to protect the inode's i_dentry list from concurrent modification. d_alias is also protected by d_lock. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ddf4f55624f7..bda5ec0b077d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -181,6 +181,7 @@ struct dentry_operations { #define DCACHE_CANT_MOUNT 0x0100 +extern spinlock_t dcache_inode_lock; extern spinlock_t dcache_lock; extern seqlock_t rename_lock; -- cgit v1.2.2 From 949854d02455080d20cd3e1db28a3a18daf7599d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:37 +1100 Subject: fs: Use rename lock and RCU for multi-step operations The remaining uses of dcache_lock are to allow atomic, multi-step read-side operations over the directory tree by excluding modifications to the tree. Also, to walk in the leaf->root direction in the tree where we don't have a natural d_lock ordering. This could be accomplished by taking every d_lock, but this would mean a huge number of locks and actually gets very tricky. Solve this instead by using the rename seqlock for multi-step read-side operations, retrying in case of a rename so we don't walk up the wrong parent. Concurrent dentry insertions are not serialised against. Concurrent deletes are tricky when walking up the directory: our parent might have been deleted when dropping locks so we also need to check and retry for that. We can also use the rename lock in cases where livelock is a worry (and it is introduced in a subsequent patch). 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bda5ec0b077d..c963ebada922 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -180,6 +180,7 @@ struct dentry_operations { #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ #define DCACHE_CANT_MOUNT 0x0100 +#define DCACHE_GENOCIDE 0x0200 extern spinlock_t dcache_inode_lock; extern spinlock_t dcache_lock; -- cgit v1.2.2 From b5c84bf6f6fa3a7dfdcb556023a62953574b60ee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:38 +1100 Subject: fs: dcache remove dcache_lock dcache_lock no longer protects anything. remove it. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c963ebada922..a2ceb94b0e38 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -183,7 +183,6 @@ struct dentry_operations { #define DCACHE_GENOCIDE 0x0200 extern spinlock_t dcache_inode_lock; -extern spinlock_t dcache_lock; extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) @@ -296,8 +295,8 @@ extern char *dentry_path(struct dentry *, char *, int); * destroyed when it has references. dget() should never be * called for dentries with zero reference counter. For these cases * (preferably none, functions in dcache.c are sufficient for normal - * needs and they take necessary precautions) you should hold dcache_lock - * and call dget_locked() instead of dget(). + * needs and they take necessary precautions) you should hold d_lock + * and call dget_dlock() instead of dget(). 
*/ static inline struct dentry *dget_dlock(struct dentry *dentry) { -- cgit v1.2.2 From dc0474be3e27463d4d4a2793f82366eed906f223 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:43 +1100 Subject: fs: dcache rationalise dget variants dget_locked was a shortcut to avoid the lazy lru manipulation when we already held dcache_lock (lru manipulation was relatively cheap at that point). However, now that the lru lock is an innermost one, we never hold it at any caller, so the lock cost can now be avoided. We already have well working lazy dcache LRU, so it should be fine to defer LRU manipulations to scan time. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a2ceb94b0e38..ca648685f0cc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -287,23 +287,17 @@ extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ /** - * dget, dget_locked - get a reference to a dentry + * dget, dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be - * destroyed when it has references. dget() should never be - * called for dentries with zero reference counter. For these cases - * (preferably none, functions in dcache.c are sufficient for normal - * needs and they take necessary precautions) you should hold dcache_lock - * and call dget_locked() instead of dget(). + * destroyed when it has references. 
*/ static inline struct dentry *dget_dlock(struct dentry *dentry) { - if (dentry) { - BUG_ON(!dentry->d_count); + if (dentry) dentry->d_count++; - } return dentry; } @@ -317,9 +311,6 @@ static inline struct dentry *dget(struct dentry *dentry) return dentry; } -extern struct dentry * dget_locked(struct dentry *); -extern struct dentry * dget_locked_dlock(struct dentry *); - extern struct dentry *dget_parent(struct dentry *dentry); /** -- cgit v1.2.2 From 31e6b01f4183ff419a6d1f86177cbf4662347cec Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:52 +1100 Subject: fs: rcu-walk for path lookup Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. It also avoids cacheline bouncing on common dentries, significantly improving scalability. The overall design is like this: * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. * Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence. * synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk. * Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path. * Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed. 
* Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk. * inode is also RCU protected so we can load d_inode and use the inode for limited things. * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. * i_op can be loaded. When we reach the destination dentry, we lock it, recheck lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we can not drop rcu-walk gracefully at the current point in the lookup, so instead return -ECHILD (for want of a better errno). This signals the path walking code to re-do the entire lookup with a ref-walk. Aside from the final dentry, there are other situations that may be encountered where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we cannot drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root). The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs * dentries with d_revalidate * Following links In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ca648685f0cc..c2e7390289cc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ struct dentry { unsigned int d_count; /* protected by d_lock */ unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ + seqcount_t d_seq; /* per dentry seqlock */ int d_mounted; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ @@ -266,9 +268,33 @@ extern void d_move(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ -extern struct dentry * d_lookup(struct dentry *, struct qstr *); -extern struct dentry * __d_lookup(struct dentry *, struct qstr *); -extern struct dentry * d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode); + +/** + * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok + * @dentry: dentry to take a ref on + * @seq: seqcount to verify against + * @Returns: 0 on failure, else 1. + * + * __d_rcu_to_refcount operates on a dentry,seq pair that was returned + * by __d_lookup_rcu, to get a reference on an rcu-walk dentry. 
+ */ +static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) +{ + int ret = 0; + + assert_spin_locked(&dentry->d_lock); + if (!read_seqcount_retry(&dentry->d_seq, seq)) { + ret = 1; + dentry->d_count++; + } + + return ret; +} /* validate "insecure" dentry pointer */ extern int d_validate(struct dentry *, struct dentry *); -- cgit v1.2.2 From 5f57cbcc02cf18f6b22ef4066bb10afeb8f930ff Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:54 +1100 Subject: fs: dcache remove d_mounted Rather than keep a d_mounted count in the dentry, set a dentry flag instead. The flag can be cleared by checking the hash table to see if there are any mounts left, which is not time critical because it is performed at detach time. The mounted state of a dentry is only used to speculatively take a look in the mount hash table if it is set -- before following the mount, vfsmount lock is taken and mount re-checked without races. This saves 4 bytes on 32-bit, nothing on 64-bit but it does provide a hole I might use later (and some configs have larger than 32-bit spinlocks which might make use of the hole). Autofs4 conversion and changelog by Ian Kent : In autofs4, when expiring direct (or offset) mounts we need to ensure that we block user path walks into the autofs mount, which is covered by another mount. To do this we clear the mounted status so that follows stop before walking into the mount and are essentially blocked until the expire is completed. The automount daemon still finds the correct dentry for the umount due to the follow mount logic in fs/autofs4/root.c:autofs4_follow_link(), which is set as an inode operation for direct and offset mounts only and is called following the lookup that stopped at the covered mount. At the end of the expire the covering mount probably has gone away so the mounted status need not be restored. But we need to check this and only restore the mounted status if the expire failed. 
XXX: autofs may not work right if we have other mounts go over the top of it? Signed-off-by: Nick Piggin --- include/linux/dcache.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c2e7390289cc..e4414693065e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -92,7 +92,6 @@ struct dentry { unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ seqcount_t d_seq; /* per dentry seqlock */ - int d_mounted; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ /* @@ -156,33 +155,34 @@ struct dentry_operations { /* d_flags entries */ #define DCACHE_AUTOFS_PENDING 0x0001 /* autofs: "under construction" */ -#define DCACHE_NFSFS_RENAMED 0x0002 /* this dentry has been "silly - * renamed" and has to be - * deleted on the last dput() - */ -#define DCACHE_DISCONNECTED 0x0004 - /* This dentry is possibly not currently connected to the dcache tree, - * in which case its parent will either be itself, or will have this - * flag as well. nfsd will not use a dentry with this bit set, but will - * first endeavour to clear the bit either by discovering that it is - * connected, or by performing lookup operations. Any filesystem which - * supports nfsd_operations MUST have a lookup function which, if it finds - * a directory inode with a DCACHE_DISCONNECTED dentry, will d_move - * that dentry into place and return that dentry rather than the passed one, - * typically using d_splice_alias. - */ +#define DCACHE_NFSFS_RENAMED 0x0002 + /* this dentry has been "silly renamed" and has to be deleted on the last + * dput() */ + +#define DCACHE_DISCONNECTED 0x0004 + /* This dentry is possibly not currently connected to the dcache tree, in + * which case its parent will either be itself, or will have this flag as + * well. 
nfsd will not use a dentry with this bit set, but will first + * endeavour to clear the bit either by discovering that it is connected, + * or by performing lookup operations. Any filesystem which supports + * nfsd_operations MUST have a lookup function which, if it finds a + * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that + * dentry into place and return that dentry rather than the passed one, + * typically using d_splice_alias. */ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ #define DCACHE_UNHASHED 0x0010 - -#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched by inotify */ +#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 + /* Parent inode is watched by inotify */ #define DCACHE_COOKIE 0x0040 /* For use by dcookie subsystem */ - -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ +#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 + /* Parent inode is watched by some fsnotify listener */ #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 +#define DCACHE_MOUNTED 0x0400 /* is a mountpoint */ + extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; @@ -372,7 +372,7 @@ extern void dput(struct dentry *); static inline int d_mountpoint(struct dentry *dentry) { - return dentry->d_mounted; + return dentry->d_flags & DCACHE_MOUNTED; } extern struct vfsmount *lookup_mnt(struct path *); -- cgit v1.2.2 From fb045adb99d9b7c562dc7fef834857f78249daa1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:55 +1100 Subject: fs: dcache reduce branches in lookup path Reduce some branches and memory accesses in dcache lookup by adding dentry flags to indicate common d_ops are set, rather than having to check them. This saves a pointer memory access (dentry->d_op) in common path lookup situations, and saves another pointer load and branch in cases where we have d_op but not the particular operation. 
Patched with: git grep -E '[.>]([[:space:]])*d_op([[:space:]])*=' | xargs sed -e 's/\([^\t ]*\)->d_op = \(.*\);/d_set_d_op(\1, \2);/' -e 's/\([^\t ]*\)\.d_op = \(.*\);/d_set_d_op(\&\1, \2);/' -i Signed-off-by: Nick Piggin --- include/linux/dcache.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index e4414693065e..f4b40a751f09 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -183,6 +183,11 @@ struct dentry_operations { #define DCACHE_GENOCIDE 0x0200 #define DCACHE_MOUNTED 0x0400 /* is a mountpoint */ +#define DCACHE_OP_HASH 0x1000 +#define DCACHE_OP_COMPARE 0x2000 +#define DCACHE_OP_REVALIDATE 0x4000 +#define DCACHE_OP_DELETE 0x8000 + extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; @@ -201,6 +206,7 @@ extern struct dentry * d_materialise_unique(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); +extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); -- cgit v1.2.2 From 44a7d7a878c9cbb74f236ea755b25b6b2e26a9a9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:56 +1100 Subject: fs: cache optimise dentry and inode for rcu-walk Put dentry and inode fields into top of data structure. This allows RCU path traversal to perform an RCU dentry lookup in a path walk by touching only the first 56 bytes of the dentry. We also fit in 8 bytes of inline name in the first 64 bytes, so for short names, only 64 bytes needs to be touched to perform the lookup. We should get rid of the hash->prev pointer from the first 64 bytes, and fit 16 bytes of name in there, which will take care of 81% rather than 32% of the kernel tree. 
inode is also rearranged so that RCU lookup will only touch a single cacheline in the inode, plus one in the i_ops structure. This is important for directory component lookups in RCU path walking. In the kernel source, the average directory name length is around 6 chars, so this works. When we reach the last element of the lookup, we need to lock it and take its refcount which requires another cacheline access. Align dentry and inode operations structs, so members will be at predictable offsets and we can group common operations into head of structure. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f4b40a751f09..b1aeda077258 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -82,25 +82,33 @@ full_name_hash(const unsigned char *name, unsigned int len) * large memory footprint increase). */ #ifdef CONFIG_64BIT -#define DNAME_INLINE_LEN_MIN 32 /* 192 bytes */ +# define DNAME_INLINE_LEN 32 /* 192 bytes */ #else -#define DNAME_INLINE_LEN_MIN 40 /* 128 bytes */ +# ifdef CONFIG_SMP +# define DNAME_INLINE_LEN 36 /* 128 bytes */ +# else +# define DNAME_INLINE_LEN 40 /* 128 bytes */ +# endif #endif struct dentry { - unsigned int d_count; /* protected by d_lock */ + /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ - spinlock_t d_lock; /* per dentry lock */ seqcount_t d_seq; /* per dentry seqlock */ - struct inode *d_inode; /* Where the name belongs to - NULL is - * negative */ - /* - * The next three fields are touched by __d_lookup. Place them here - * so they all fit in a cache line. 
- */ struct hlist_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; + struct inode *d_inode; /* Where the name belongs to - NULL is + * negative */ + unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + + /* Ref lookup also touches following */ + unsigned int d_count; /* protected by d_lock */ + spinlock_t d_lock; /* per dentry lock */ + const struct dentry_operations *d_op; + struct super_block *d_sb; /* The root of the dentry tree */ + unsigned long d_time; /* used by d_revalidate */ + void *d_fsdata; /* fs-specific data */ struct list_head d_lru; /* LRU list */ /* @@ -112,12 +120,6 @@ struct dentry { } d_u; struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ - unsigned long d_time; /* used by d_revalidate */ - const struct dentry_operations *d_op; - struct super_block *d_sb; /* The root of the dentry tree */ - void *d_fsdata; /* fs-specific data */ - - unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; /* @@ -143,7 +145,7 @@ struct dentry_operations { void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); -}; +} ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in -- cgit v1.2.2 From 34286d6662308d82aed891852d04c7c3a2649b16 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:57 +1100 Subject: fs: rcu-walk aware d_revalidate method Require filesystems be aware of .d_revalidate being called in rcu-walk mode (nd->flags & LOOKUP_RCU). For now do a simple push down, returning -ECHILD from all implementations. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b1aeda077258..8b2064d02928 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -190,7 +190,6 @@ struct dentry_operations { #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 - extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; -- cgit v1.2.2 From ceb5bdc2d246f6d81cf61ed70f325308a11821d2 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:05 +1100 Subject: fs: dcache per-bucket dcache hash locking We can turn the dcache hash locking from a global dcache_hash_lock into per-bucket locking. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8b2064d02928..5f0392ef759b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,7 @@ struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_t d_seq; /* per dentry seqlock */ - struct hlist_node d_hash; /* lookup hash list */ + struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is -- cgit v1.2.2 From 873feea09ebc980cbd3631b767356ce1eee65ec1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:06 +1100 Subject: fs: dcache per-inode inode alias locking dcache_inode_lock can be replaced with per-inode locking. Use existing inode->i_lock for this. This is slightly non-trivial because we sometimes need to find the inode from the dentry, which requires d_inode to be stabilised (either with refcount or d_lock). 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 5f0392ef759b..d719e4de8046 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -191,7 +191,6 @@ struct dentry_operations { #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 -extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) -- cgit v1.2.2 From 4b936885ab04dc6e0bb0ef35e0e23c1a7364d9e5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:07 +1100 Subject: fs: improve scalability of pseudo filesystems Regardless of how much we possibly try to scale dcache, there is likely always going to be some fundamental contention when adding or removing children under the same parent. Pseudo filesystems do not seem to need to have connected dentries because by definition they are disconnected. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d719e4de8046..c0a2ca97c72f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -211,6 +211,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); +extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_obtain_alias(struct inode *); -- cgit v1.2.2 From 9d55c369bb5e695e629bc35cba2ef607755b3bee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:09 +1100 Subject: fs: implement faster dentry memcmp The standard memcmp function on a Westmere system shows up hot in profiles in the `git diff` workload (both parallel and single threaded), and it is likely due to the costs associated with trapping into microcode, and little opportunity to improve memory access (dentry name is not likely to take up more than a cacheline). So replace it with an open-coded byte comparison. This increases code size by 8 bytes in the critical __d_lookup_rcu function, but the speedup is huge, averaging 10 runs of each: git diff st user sys elapsed CPU before 1.15 2.57 3.82 97.1 after 1.14 2.35 3.61 96.8 git diff mt user sys elapsed CPU before 1.27 3.85 1.46 349 after 1.26 3.54 1.43 333 Elapsed time for single threaded git diff at 95.0% confidence: -0.21 +/- 0.01 -5.45% +/- 0.24% It's -0.66% +/- 0.06% elapsed time on my Opteron, so rep cmp costs on the fam10h seem to be relatively smaller, but there is still a win. 
Signed-off-by: Nick Piggin --- include/linux/dcache.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux/dcache.h') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c0a2ca97c72f..bd07758943e0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -47,6 +47,27 @@ struct dentry_stat_t { }; extern struct dentry_stat_t dentry_stat; +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +static inline int dentry_cmp(const unsigned char *cs, size_t scount, + const unsigned char *ct, size_t tcount) +{ + int ret; + if (scount != tcount) + return 1; + do { + ret = (*cs != *ct); + if (ret) + break; + cs++; + ct++; + tcount--; + } while (tcount); + return ret; +} + /* Name hashing routines. Initial hash value */ /* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ #define init_name_hash() 0 -- cgit v1.2.2