diff options
| author | Nick Piggin <npiggin@kernel.dk> | 2011-01-07 01:49:58 -0500 |
|---|---|---|
| committer | Nick Piggin <npiggin@kernel.dk> | 2011-01-07 01:50:29 -0500 |
| commit | b74c79e99389cd79b31fcc08f82c24e492e63c7e (patch) | |
| tree | 763c6b412517306670bc625e90035f2d16bb739f /Documentation/filesystems | |
| parent | 34286d6662308d82aed891852d04c7c3a2649b16 (diff) | |
fs: provide rcu-walk aware permission i_ops
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Diffstat (limited to 'Documentation/filesystems')
| -rw-r--r-- | Documentation/filesystems/Locking | 6 | ||||
| -rw-r--r-- | Documentation/filesystems/path-lookup.txt | 44 | ||||
| -rw-r--r-- | Documentation/filesystems/porting | 5 | ||||
| -rw-r--r-- | Documentation/filesystems/vfs.txt | 10 |
4 files changed, 58 insertions, 7 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index e90ffe61eb65..977d8919cc69 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
| @@ -47,8 +47,8 @@ ata *); | |||
| 47 | void * (*follow_link) (struct dentry *, struct nameidata *); | 47 | void * (*follow_link) (struct dentry *, struct nameidata *); |
| 48 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 48 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
| 49 | void (*truncate) (struct inode *); | 49 | void (*truncate) (struct inode *); |
| 50 | int (*permission) (struct inode *, int, struct nameidata *); | 50 | int (*permission) (struct inode *, int, unsigned int); |
| 51 | int (*check_acl)(struct inode *, int); | 51 | int (*check_acl)(struct inode *, int, unsigned int); |
| 52 | int (*setattr) (struct dentry *, struct iattr *); | 52 | int (*setattr) (struct dentry *, struct iattr *); |
| 53 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); | 53 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); |
| 54 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 54 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
| @@ -76,7 +76,7 @@ follow_link: no | |||
| 76 | put_link: no | 76 | put_link: no |
| 77 | truncate: yes (see below) | 77 | truncate: yes (see below) |
| 78 | setattr: yes | 78 | setattr: yes |
| 79 | permission: no | 79 | permission: no (may not block if called in rcu-walk mode) |
| 80 | check_acl: no | 80 | check_acl: no |
| 81 | getattr: no | 81 | getattr: no |
| 82 | setxattr: yes | 82 | setxattr: yes |
diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt index 8789d1810bed..eb59c8b44be9 100644 --- a/Documentation/filesystems/path-lookup.txt +++ b/Documentation/filesystems/path-lookup.txt | |||
| @@ -316,11 +316,9 @@ The detailed design for rcu-walk is like this: | |||
| 316 | 316 | ||
| 317 | The cases where rcu-walk cannot continue are: | 317 | The cases where rcu-walk cannot continue are: |
| 318 | * NULL dentry (ie. any uncached path element) | 318 | * NULL dentry (ie. any uncached path element) |
| 319 | * parent with d_inode->i_op->permission or ACLs | ||
| 320 | * Following links | 319 | * Following links |
| 321 | 320 | ||
| 322 | In future patches, permission checks become rcu-walk aware. It may be possible | 321 | It may be possible eventually to make following links rcu-walk aware. |
| 323 | eventually to make following links rcu-walk aware. | ||
| 324 | 322 | ||
| 325 | Uncached path elements will always require dropping to ref-walk mode, at the | 323 | Uncached path elements will always require dropping to ref-walk mode, at the |
| 326 | very least because i_mutex needs to be grabbed, and objects allocated. | 324 | very least because i_mutex needs to be grabbed, and objects allocated. |
| @@ -336,9 +334,49 @@ or stored into. The result is massive improvements in performance and | |||
| 336 | scalability of path resolution. | 334 | scalability of path resolution. |
| 337 | 335 | ||
| 338 | 336 | ||
| 337 | Interesting statistics | ||
| 338 | ====================== | ||
| 339 | |||
| 340 | The following table gives rcu lookup statistics for a few simple workloads | ||
| 341 | (2s12c24t Westmere, debian non-graphical system). Ungraceful are attempts to | ||
| 342 | drop rcu that fail due to d_seq failure and require the entire path lookup | ||
| 343 | again. Other cases are successful rcu-drops that are required before the final | ||
| 344 | element, nodentry for missing dentry, revalidate for filesystem revalidate | ||
| 345 | routine requiring rcu drop, permission for permission check requiring drop, | ||
| 346 | and link for symlink traversal requiring drop. | ||
| 347 | |||
| 348 | rcu-lookups restart nodentry link revalidate permission | ||
| 349 | bootup 47121 0 4624 1010 10283 7852 | ||
| 350 | dbench 25386793 0 6778659(26.7%) 55 549 1156 | ||
| 351 | kbuild 2696672 10 64442(2.3%) 108764(4.0%) 1 1590 | ||
| 352 | git diff 39605 0 28 2 0 106 | ||
| 353 | vfstest 24185492 4945 708725(2.9%) 1076136(4.4%) 0 2651 | ||
| 354 | |||
| 355 | What this shows is that failed rcu-walk lookups, ie. ones that are restarted | ||
| 356 | entirely with ref-walk, are quite rare. Even the "vfstest" case which | ||
| 357 | specifically has concurrent renames/mkdir/rmdir/creat/unlink/etc to exercise | ||
| 358 | such races is not showing a huge amount of restarts. | ||
| 359 | |||
| 360 | Dropping from rcu-walk to ref-walk means that we have encountered a dentry where | ||
| 361 | the reference count needs to be taken for some reason. This is either because | ||
| 362 | we have reached the target of the path walk, or because we have encountered a | ||
| 363 | condition that can't be resolved in rcu-walk mode. Ideally, we drop rcu-walk | ||
| 364 | only when we have reached the target dentry, so the other statistics show where | ||
| 365 | this does not happen. | ||
| 366 | |||
| 367 | Note that a graceful drop from rcu-walk mode due to something such as the | ||
| 368 | dentry not existing (which can be common) is not necessarily a failure of | ||
| 369 | the rcu-walk scheme, because some elements of the path may have been walked in | ||
| 370 | rcu-walk mode. The further we get from common path elements (such as cwd or | ||
| 371 | root), the less contended the dentry is likely to be. The closer we are to | ||
| 372 | common path elements, the more likely they will exist in dentry cache. | ||
| 373 | |||
| 374 | |||
| 339 | Papers and other documentation on dcache locking | 375 | Papers and other documentation on dcache locking |
| 340 | ================================================ | 376 | ================================================ |
| 341 | 377 | ||
| 342 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | 378 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). |
| 343 | 379 | ||
| 344 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | 380 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html |
| 381 | |||
| 382 | |||
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index cd9756a2709d..07a32b42cf9c 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting | |||
| @@ -380,3 +380,8 @@ the filesystem provides it), which requires dropping out of rcu-walk mode. This | |||
| 380 | may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be | 380 | may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be |
| 381 | returned if the filesystem cannot handle rcu-walk. See | 381 | returned if the filesystem cannot handle rcu-walk. See |
| 382 | Documentation/filesystems/vfs.txt for more details. | 382 | Documentation/filesystems/vfs.txt for more details. |
| 383 | |||
| 384 | permission and check_acl are inode permission checks that are called | ||
| 385 | on many or all directory inodes on the way down a path walk (to check for | ||
| 386 | exec permission). These must now be rcu-walk aware (flags & IPERM_RCU). See | ||
| 387 | Documentation/filesystems/vfs.txt for more details. | ||
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index c936b4912383..fbb324e2bd43 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
| @@ -325,7 +325,8 @@ struct inode_operations { | |||
| 325 | void * (*follow_link) (struct dentry *, struct nameidata *); | 325 | void * (*follow_link) (struct dentry *, struct nameidata *); |
| 326 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 326 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
| 327 | void (*truncate) (struct inode *); | 327 | void (*truncate) (struct inode *); |
| 328 | int (*permission) (struct inode *, int, struct nameidata *); | 328 | int (*permission) (struct inode *, int, unsigned int); |
| 329 | int (*check_acl)(struct inode *, int, unsigned int); | ||
| 329 | int (*setattr) (struct dentry *, struct iattr *); | 330 | int (*setattr) (struct dentry *, struct iattr *); |
| 330 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); | 331 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); |
| 331 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 332 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
| @@ -414,6 +415,13 @@ otherwise noted. | |||
| 414 | permission: called by the VFS to check for access rights on a POSIX-like | 415 | permission: called by the VFS to check for access rights on a POSIX-like |
| 415 | filesystem. | 416 | filesystem. |
| 416 | 417 | ||
| 418 | May be called in rcu-walk mode (flags & IPERM_RCU). If in rcu-walk | ||
| 419 | mode, the filesystem must check the permission without blocking or | ||
| 420 | storing to the inode. | ||
| 421 | |||
| 422 | If a situation is encountered that rcu-walk cannot handle, return | ||
| 423 | -ECHILD and it will be called again in ref-walk mode. | ||
| 424 | |||
| 417 | setattr: called by the VFS to set attributes for a file. This method | 425 | setattr: called by the VFS to set attributes for a file. This method |
| 418 | is called by chmod(2) and related system calls. | 426 | is called by chmod(2) and related system calls. |
| 419 | 427 | ||
