aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorNick Piggin <npiggin@kernel.dk>2011-01-07 01:49:52 -0500
committerNick Piggin <npiggin@kernel.dk>2011-01-07 01:50:27 -0500
commit31e6b01f4183ff419a6d1f86177cbf4662347cec (patch)
treee215ec9af88352c55e024f784f3d9f8eb13fab85 /include
parent3c22cd5709e8143444a6d08682a87f4c57902df3 (diff)
fs: rcu-walk for path lookup
Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. It also avoids cacheline bouncing on common dentries, significantly improving scalability. The overall design is like this: * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. * Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence. * synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk. * Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path. * Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed. * Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk. * inode is also RCU protected so we can load d_inode and use the inode for limited things. * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. * i_op can be loaded. When we reach the destination dentry, we lock it, recheck lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we can not drop rcu-walk gracefully at the current point in the lokup, so instead return -ECHILD (for want of a better errno). This signals the path walking code to re-do the entire lookup with a ref-walk. Aside from the final dentry, there are other situations that may be encounted where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root). The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs * dentries with d_revalidate * Following links In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Diffstat (limited to 'include')
-rw-r--r--include/linux/dcache.h32
-rw-r--r--include/linux/namei.h15
-rw-r--r--include/linux/security.h8
3 files changed, 46 insertions, 9 deletions
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index ca648685f0cc..c2e7390289cc 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -5,6 +5,7 @@
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/rculist.h> 6#include <linux/rculist.h>
7#include <linux/spinlock.h> 7#include <linux/spinlock.h>
8#include <linux/seqlock.h>
8#include <linux/cache.h> 9#include <linux/cache.h>
9#include <linux/rcupdate.h> 10#include <linux/rcupdate.h>
10 11
@@ -90,6 +91,7 @@ struct dentry {
90 unsigned int d_count; /* protected by d_lock */ 91 unsigned int d_count; /* protected by d_lock */
91 unsigned int d_flags; /* protected by d_lock */ 92 unsigned int d_flags; /* protected by d_lock */
92 spinlock_t d_lock; /* per dentry lock */ 93 spinlock_t d_lock; /* per dentry lock */
94 seqcount_t d_seq; /* per dentry seqlock */
93 int d_mounted; 95 int d_mounted;
94 struct inode *d_inode; /* Where the name belongs to - NULL is 96 struct inode *d_inode; /* Where the name belongs to - NULL is
95 * negative */ 97 * negative */
@@ -266,9 +268,33 @@ extern void d_move(struct dentry *, struct dentry *);
266extern struct dentry *d_ancestor(struct dentry *, struct dentry *); 268extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
267 269
268/* appendix may either be NULL or be used for transname suffixes */ 270/* appendix may either be NULL or be used for transname suffixes */
269extern struct dentry * d_lookup(struct dentry *, struct qstr *); 271extern struct dentry *d_lookup(struct dentry *, struct qstr *);
270extern struct dentry * __d_lookup(struct dentry *, struct qstr *); 272extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
271extern struct dentry * d_hash_and_lookup(struct dentry *, struct qstr *); 273extern struct dentry *__d_lookup(struct dentry *, struct qstr *);
274extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
275 unsigned *seq, struct inode **inode);
276
277/**
278 * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok
279 * @dentry: dentry to take a ref on
280 * @seq: seqcount to verify against
281 * @Returns: 0 on failure, else 1.
282 *
283 * __d_rcu_to_refcount operates on a dentry,seq pair that was returned
284 * by __d_lookup_rcu, to get a reference on an rcu-walk dentry.
285 */
286static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
287{
288 int ret = 0;
289
290 assert_spin_locked(&dentry->d_lock);
291 if (!read_seqcount_retry(&dentry->d_seq, seq)) {
292 ret = 1;
293 dentry->d_count++;
294 }
295
296 return ret;
297}
272 298
273/* validate "insecure" dentry pointer */ 299/* validate "insecure" dentry pointer */
274extern int d_validate(struct dentry *, struct dentry *); 300extern int d_validate(struct dentry *, struct dentry *);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index aec730b53935..18d06add0a40 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -19,7 +19,10 @@ struct nameidata {
19 struct path path; 19 struct path path;
20 struct qstr last; 20 struct qstr last;
21 struct path root; 21 struct path root;
22 struct file *file;
23 struct inode *inode; /* path.dentry.d_inode */
22 unsigned int flags; 24 unsigned int flags;
25 unsigned seq;
23 int last_type; 26 int last_type;
24 unsigned depth; 27 unsigned depth;
25 char *saved_names[MAX_NESTED_LINKS + 1]; 28 char *saved_names[MAX_NESTED_LINKS + 1];
@@ -43,11 +46,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
43 * - internal "there are more path components" flag 46 * - internal "there are more path components" flag
44 * - dentry cache is untrusted; force a real lookup 47 * - dentry cache is untrusted; force a real lookup
45 */ 48 */
46#define LOOKUP_FOLLOW 1 49#define LOOKUP_FOLLOW 0x0001
47#define LOOKUP_DIRECTORY 2 50#define LOOKUP_DIRECTORY 0x0002
48#define LOOKUP_CONTINUE 4 51#define LOOKUP_CONTINUE 0x0004
49#define LOOKUP_PARENT 16 52
50#define LOOKUP_REVAL 64 53#define LOOKUP_PARENT 0x0010
54#define LOOKUP_REVAL 0x0020
55#define LOOKUP_RCU 0x0040
51/* 56/*
52 * Intent data 57 * Intent data
53 */ 58 */
diff --git a/include/linux/security.h b/include/linux/security.h
index fd4d55fb8845..ed95401970c7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -457,7 +457,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
457 * called when the actual read/write operations are performed. 457 * called when the actual read/write operations are performed.
458 * @inode contains the inode structure to check. 458 * @inode contains the inode structure to check.
459 * @mask contains the permission mask. 459 * @mask contains the permission mask.
460 * @nd contains the nameidata (may be NULL).
461 * Return 0 if permission is granted. 460 * Return 0 if permission is granted.
462 * @inode_setattr: 461 * @inode_setattr:
463 * Check permission before setting file attributes. Note that the kernel 462 * Check permission before setting file attributes. Note that the kernel
@@ -1713,6 +1712,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
1713int security_inode_readlink(struct dentry *dentry); 1712int security_inode_readlink(struct dentry *dentry);
1714int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); 1713int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd);
1715int security_inode_permission(struct inode *inode, int mask); 1714int security_inode_permission(struct inode *inode, int mask);
1715int security_inode_exec_permission(struct inode *inode, unsigned int flags);
1716int security_inode_setattr(struct dentry *dentry, struct iattr *attr); 1716int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
1717int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); 1717int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry);
1718int security_inode_setxattr(struct dentry *dentry, const char *name, 1718int security_inode_setxattr(struct dentry *dentry, const char *name,
@@ -2102,6 +2102,12 @@ static inline int security_inode_permission(struct inode *inode, int mask)
2102 return 0; 2102 return 0;
2103} 2103}
2104 2104
2105static inline int security_inode_exec_permission(struct inode *inode,
2106 unsigned int flags)
2107{
2108 return 0;
2109}
2110
2105static inline int security_inode_setattr(struct dentry *dentry, 2111static inline int security_inode_setattr(struct dentry *dentry,
2106 struct iattr *attr) 2112 struct iattr *attr)
2107{ 2113{