diff options
Diffstat (limited to 'Documentation/filesystems')
30 files changed, 1190 insertions, 459 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 4303614b5add..8c624a18f67d 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX | |||
| @@ -96,8 +96,6 @@ seq_file.txt | |||
| 96 | - how to use the seq_file API | 96 | - how to use the seq_file API |
| 97 | sharedsubtree.txt | 97 | sharedsubtree.txt |
| 98 | - a description of shared subtrees for namespaces. | 98 | - a description of shared subtrees for namespaces. |
| 99 | smbfs.txt | ||
| 100 | - info on using filesystems with the SMB protocol (Win 3.11 and NT). | ||
| 101 | spufs.txt | 99 | spufs.txt |
| 102 | - info and mount options for the SPU filesystem used on Cell. | 100 | - info and mount options for the SPU filesystem used on Cell. |
| 103 | sysfs-pci.txt | 101 | sysfs-pci.txt |
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index f9765e8cf086..b22abba78fed 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt | |||
| @@ -111,7 +111,7 @@ OPTIONS | |||
| 111 | This can be used to share devices/named pipes/sockets between | 111 | This can be used to share devices/named pipes/sockets between |
| 112 | hosts. This functionality will be expanded in later versions. | 112 | hosts. This functionality will be expanded in later versions. |
| 113 | 113 | ||
| 114 | access there are three access modes. | 114 | access there are four access modes. |
| 115 | user = if a user tries to access a file on v9fs | 115 | user = if a user tries to access a file on v9fs |
| 116 | filesystem for the first time, v9fs sends an | 116 | filesystem for the first time, v9fs sends an |
| 117 | attach command (Tattach) for that user. | 117 | attach command (Tattach) for that user. |
| @@ -120,6 +120,8 @@ OPTIONS | |||
| 120 | the files on the mounted filesystem | 120 | the files on the mounted filesystem |
| 121 | any = v9fs does single attach and performs all | 121 | any = v9fs does single attach and performs all |
| 122 | operations as one user | 122 | operations as one user |
| 123 | client = ACL-based access check on the 9p client | ||
| 124 | side for access validation | ||
| 123 | 125 | ||
| 124 | cachetag cache tag to use the specified persistent cache. | 126 | cachetag cache tag to use the specified persistent cache. |
| 125 | cache tags for existing cache sessions can be listed at | 127 | cache tags for existing cache sessions can be listed at |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2db4283efa8d..61b31acb9176 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
| @@ -9,24 +9,30 @@ be able to use diff(1). | |||
| 9 | 9 | ||
| 10 | --------------------------- dentry_operations -------------------------- | 10 | --------------------------- dentry_operations -------------------------- |
| 11 | prototypes: | 11 | prototypes: |
| 12 | int (*d_revalidate)(struct dentry *, int); | 12 | int (*d_revalidate)(struct dentry *, struct nameidata *); |
| 13 | int (*d_hash) (struct dentry *, struct qstr *); | 13 | int (*d_hash)(const struct dentry *, const struct inode *, |
| 14 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); | 14 | struct qstr *); |
| 15 | int (*d_compare)(const struct dentry *, const struct inode *, | ||
| 16 | const struct dentry *, const struct inode *, | ||
| 17 | unsigned int, const char *, const struct qstr *); | ||
| 15 | int (*d_delete)(struct dentry *); | 18 | int (*d_delete)(struct dentry *); |
| 16 | void (*d_release)(struct dentry *); | 19 | void (*d_release)(struct dentry *); |
| 17 | void (*d_iput)(struct dentry *, struct inode *); | 20 | void (*d_iput)(struct dentry *, struct inode *); |
| 18 | char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); | 21 | char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); |
| 22 | struct vfsmount *(*d_automount)(struct path *path); | ||
| 23 | int (*d_manage)(struct dentry *, bool); | ||
| 19 | 24 | ||
| 20 | locking rules: | 25 | locking rules: |
| 21 | none have BKL | 26 | rename_lock ->d_lock may block rcu-walk |
| 22 | dcache_lock rename_lock ->d_lock may block | 27 | d_revalidate: no no yes (ref-walk) maybe |
| 23 | d_revalidate: no no no yes | 28 | d_hash no no no maybe |
| 24 | d_hash no no no yes | 29 | d_compare: yes no no maybe |
| 25 | d_compare: no yes no no | 30 | d_delete: no yes no no |
| 26 | d_delete: yes no yes no | 31 | d_release: no no yes no |
| 27 | d_release: no no no yes | 32 | d_iput: no no yes no |
| 28 | d_iput: no no no yes | ||
| 29 | d_dname: no no no no | 33 | d_dname: no no no no |
| 34 | d_automount: no no yes no | ||
| 35 | d_manage: no no yes (ref-walk) maybe | ||
| 30 | 36 | ||
| 31 | --------------------------- inode_operations --------------------------- | 37 | --------------------------- inode_operations --------------------------- |
| 32 | prototypes: | 38 | prototypes: |
| @@ -42,18 +48,22 @@ ata *); | |||
| 42 | int (*rename) (struct inode *, struct dentry *, | 48 | int (*rename) (struct inode *, struct dentry *, |
| 43 | struct inode *, struct dentry *); | 49 | struct inode *, struct dentry *); |
| 44 | int (*readlink) (struct dentry *, char __user *,int); | 50 | int (*readlink) (struct dentry *, char __user *,int); |
| 45 | int (*follow_link) (struct dentry *, struct nameidata *); | 51 | void * (*follow_link) (struct dentry *, struct nameidata *); |
| 52 | void (*put_link) (struct dentry *, struct nameidata *, void *); | ||
| 46 | void (*truncate) (struct inode *); | 53 | void (*truncate) (struct inode *); |
| 47 | int (*permission) (struct inode *, int, struct nameidata *); | 54 | int (*permission) (struct inode *, int, unsigned int); |
| 55 | int (*check_acl)(struct inode *, int, unsigned int); | ||
| 48 | int (*setattr) (struct dentry *, struct iattr *); | 56 | int (*setattr) (struct dentry *, struct iattr *); |
| 49 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); | 57 | int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); |
| 50 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 58 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
| 51 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); | 59 | ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); |
| 52 | ssize_t (*listxattr) (struct dentry *, char *, size_t); | 60 | ssize_t (*listxattr) (struct dentry *, char *, size_t); |
| 53 | int (*removexattr) (struct dentry *, const char *); | 61 | int (*removexattr) (struct dentry *, const char *); |
| 62 | void (*truncate_range)(struct inode *, loff_t, loff_t); | ||
| 63 | int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); | ||
| 54 | 64 | ||
| 55 | locking rules: | 65 | locking rules: |
| 56 | all may block, none have BKL | 66 | all may block |
| 57 | i_mutex(inode) | 67 | i_mutex(inode) |
| 58 | lookup: yes | 68 | lookup: yes |
| 59 | create: yes | 69 | create: yes |
| @@ -66,19 +76,23 @@ rmdir: yes (both) (see below) | |||
| 66 | rename: yes (all) (see below) | 76 | rename: yes (all) (see below) |
| 67 | readlink: no | 77 | readlink: no |
| 68 | follow_link: no | 78 | follow_link: no |
| 79 | put_link: no | ||
| 69 | truncate: yes (see below) | 80 | truncate: yes (see below) |
| 70 | setattr: yes | 81 | setattr: yes |
| 71 | permission: no | 82 | permission: no (may not block if called in rcu-walk mode) |
| 83 | check_acl: no | ||
| 72 | getattr: no | 84 | getattr: no |
| 73 | setxattr: yes | 85 | setxattr: yes |
| 74 | getxattr: no | 86 | getxattr: no |
| 75 | listxattr: no | 87 | listxattr: no |
| 76 | removexattr: yes | 88 | removexattr: yes |
| 89 | truncate_range: yes | ||
| 90 | fiemap: no | ||
| 77 | Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on | 91 | Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on |
| 78 | victim. | 92 | victim. |
| 79 | cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. | 93 | cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. |
| 80 | ->truncate() is never called directly - it's a callback, not a | 94 | ->truncate() is never called directly - it's a callback, not a |
| 81 | method. It's called by vmtruncate() - library function normally used by | 95 | method. It's called by vmtruncate() - deprecated library function used by |
| 82 | ->setattr(). Locking information above applies to that call (i.e. is | 96 | ->setattr(). Locking information above applies to that call (i.e. is |
| 83 | inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been | 97 | inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been |
| 84 | passed). | 98 | passed). |
| @@ -91,7 +105,7 @@ prototypes: | |||
| 91 | struct inode *(*alloc_inode)(struct super_block *sb); | 105 | struct inode *(*alloc_inode)(struct super_block *sb); |
| 92 | void (*destroy_inode)(struct inode *); | 106 | void (*destroy_inode)(struct inode *); |
| 93 | void (*dirty_inode) (struct inode *); | 107 | void (*dirty_inode) (struct inode *); |
| 94 | int (*write_inode) (struct inode *, int); | 108 | int (*write_inode) (struct inode *, struct writeback_control *wbc); |
| 95 | int (*drop_inode) (struct inode *); | 109 | int (*drop_inode) (struct inode *); |
| 96 | void (*evict_inode) (struct inode *); | 110 | void (*evict_inode) (struct inode *); |
| 97 | void (*put_super) (struct super_block *); | 111 | void (*put_super) (struct super_block *); |
| @@ -105,16 +119,16 @@ prototypes: | |||
| 105 | int (*show_options)(struct seq_file *, struct vfsmount *); | 119 | int (*show_options)(struct seq_file *, struct vfsmount *); |
| 106 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 120 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
| 107 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 121 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
| 122 | int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); | ||
| 108 | 123 | ||
| 109 | locking rules: | 124 | locking rules: |
| 110 | All may block [not true, see below] | 125 | All may block [not true, see below] |
| 111 | None have BKL | ||
| 112 | s_umount | 126 | s_umount |
| 113 | alloc_inode: | 127 | alloc_inode: |
| 114 | destroy_inode: | 128 | destroy_inode: |
| 115 | dirty_inode: (must not sleep) | 129 | dirty_inode: (must not sleep) |
| 116 | write_inode: | 130 | write_inode: |
| 117 | drop_inode: !!!inode_lock!!! | 131 | drop_inode: !!!inode->i_lock!!! |
| 118 | evict_inode: | 132 | evict_inode: |
| 119 | put_super: write | 133 | put_super: write |
| 120 | write_super: read | 134 | write_super: read |
| @@ -127,6 +141,7 @@ umount_begin: no | |||
| 127 | show_options: no (namespace_sem) | 141 | show_options: no (namespace_sem) |
| 128 | quota_read: no (see below) | 142 | quota_read: no (see below) |
| 129 | quota_write: no (see below) | 143 | quota_write: no (see below) |
| 144 | bdev_try_to_free_page: no (see below) | ||
| 130 | 145 | ||
| 131 | ->statfs() has s_umount (shared) when called by ustat(2) (native or | 146 | ->statfs() has s_umount (shared) when called by ustat(2) (native or |
| 132 | compat), but that's an accident of bad API; s_umount is used to pin | 147 | compat), but that's an accident of bad API; s_umount is used to pin |
| @@ -139,19 +154,23 @@ be the only ones operating on the quota file by the quota code (via | |||
| 139 | dqio_sem) (unless an admin really wants to screw up something and | 154 | dqio_sem) (unless an admin really wants to screw up something and |
| 140 | writes to quota files with quotas on). For other details about locking | 155 | writes to quota files with quotas on). For other details about locking |
| 141 | see also dquot_operations section. | 156 | see also dquot_operations section. |
| 157 | ->bdev_try_to_free_page is called from the ->releasepage handler of | ||
| 158 | the block device inode. See there for more details. | ||
| 142 | 159 | ||
| 143 | --------------------------- file_system_type --------------------------- | 160 | --------------------------- file_system_type --------------------------- |
| 144 | prototypes: | 161 | prototypes: |
| 145 | int (*get_sb) (struct file_system_type *, int, | 162 | int (*get_sb) (struct file_system_type *, int, |
| 146 | const char *, void *, struct vfsmount *); | 163 | const char *, void *, struct vfsmount *); |
| 164 | struct dentry *(*mount) (struct file_system_type *, int, | ||
| 165 | const char *, void *); | ||
| 147 | void (*kill_sb) (struct super_block *); | 166 | void (*kill_sb) (struct super_block *); |
| 148 | locking rules: | 167 | locking rules: |
| 149 | may block BKL | 168 | may block |
| 150 | get_sb yes no | 169 | mount yes |
| 151 | kill_sb yes no | 170 | kill_sb yes |
| 152 | 171 | ||
| 153 | ->get_sb() returns error or 0 with locked superblock attached to the vfsmount | 172 | ->mount() returns ERR_PTR or the root dentry; its superblock should be locked |
| 154 | (exclusive on ->s_umount). | 173 | on return. |
| 155 | ->kill_sb() takes a write-locked superblock, does all shutdown work on it, | 174 | ->kill_sb() takes a write-locked superblock, does all shutdown work on it, |
| 156 | unlocks and drops the reference. | 175 | unlocks and drops the reference. |
| 157 | 176 | ||
| @@ -173,28 +192,38 @@ prototypes: | |||
| 173 | sector_t (*bmap)(struct address_space *, sector_t); | 192 | sector_t (*bmap)(struct address_space *, sector_t); |
| 174 | int (*invalidatepage) (struct page *, unsigned long); | 193 | int (*invalidatepage) (struct page *, unsigned long); |
| 175 | int (*releasepage) (struct page *, int); | 194 | int (*releasepage) (struct page *, int); |
| 195 | void (*freepage)(struct page *); | ||
| 176 | int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 196 | int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
| 177 | loff_t offset, unsigned long nr_segs); | 197 | loff_t offset, unsigned long nr_segs); |
| 178 | int (*launder_page) (struct page *); | 198 | int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, |
| 199 | unsigned long *); | ||
| 200 | int (*migratepage)(struct address_space *, struct page *, struct page *); | ||
| 201 | int (*launder_page)(struct page *); | ||
| 202 | int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); | ||
| 203 | int (*error_remove_page)(struct address_space *, struct page *); | ||
| 179 | 204 | ||
| 180 | locking rules: | 205 | locking rules: |
| 181 | All except set_page_dirty may block | 206 | All except set_page_dirty and freepage may block |
| 182 | 207 | ||
| 183 | BKL PageLocked(page) i_mutex | 208 | PageLocked(page) i_mutex |
| 184 | writepage: no yes, unlocks (see below) | 209 | writepage: yes, unlocks (see below) |
| 185 | readpage: no yes, unlocks | 210 | readpage: yes, unlocks |
| 186 | sync_page: no maybe | 211 | sync_page: maybe |
| 187 | writepages: no | 212 | writepages: |
| 188 | set_page_dirty no no | 213 | set_page_dirty no |
| 189 | readpages: no | 214 | readpages: |
| 190 | write_begin: no locks the page yes | 215 | write_begin: locks the page yes |
| 191 | write_end: no yes, unlocks yes | 216 | write_end: yes, unlocks yes |
| 192 | perform_write: no n/a yes | 217 | bmap: |
| 193 | bmap: no | 218 | invalidatepage: yes |
| 194 | invalidatepage: no yes | 219 | releasepage: yes |
| 195 | releasepage: no yes | 220 | freepage: yes |
| 196 | direct_IO: no | 221 | direct_IO: |
| 197 | launder_page: no yes | 222 | get_xip_mem: maybe |
| 223 | migratepage: yes (both) | ||
| 224 | launder_page: yes | ||
| 225 | is_partially_uptodate: yes | ||
| 226 | error_remove_page: yes | ||
| 198 | 227 | ||
| 199 | ->write_begin(), ->write_end(), ->sync_page() and ->readpage() | 228 | ->write_begin(), ->write_end(), ->sync_page() and ->readpage() |
| 200 | may be called from the request handler (/dev/loop). | 229 | may be called from the request handler (/dev/loop). |
| @@ -274,9 +303,8 @@ under spinlock (it cannot block) and is sometimes called with the page | |||
| 274 | not locked. | 303 | not locked. |
| 275 | 304 | ||
| 276 | ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some | 305 | ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some |
| 277 | filesystems and by the swapper. The latter will eventually go away. All | 306 | filesystems and by the swapper. The latter will eventually go away. Please, |
| 278 | instances do not actually need the BKL. Please, keep it that way and don't | 307 | keep it that way and don't breed new callers. |
| 279 | breed new callers. | ||
| 280 | 308 | ||
| 281 | ->invalidatepage() is called when the filesystem must attempt to drop | 309 | ->invalidatepage() is called when the filesystem must attempt to drop |
| 282 | some or all of the buffers from the page when it is being truncated. It | 310 | some or all of the buffers from the page when it is being truncated. It |
| @@ -288,55 +316,44 @@ buffers from the page in preparation for freeing it. It returns zero to | |||
| 288 | indicate that the buffers are (or may be) freeable. If ->releasepage is zero, | 316 | indicate that the buffers are (or may be) freeable. If ->releasepage is zero, |
| 289 | the kernel assumes that the fs has no private interest in the buffers. | 317 | the kernel assumes that the fs has no private interest in the buffers. |
| 290 | 318 | ||
| 319 | ->freepage() is called when the kernel is done dropping the page | ||
| 320 | from the page cache. | ||
| 321 | |||
| 291 | ->launder_page() may be called prior to releasing a page if | 322 | ->launder_page() may be called prior to releasing a page if |
| 292 | it is still found to be dirty. It returns zero if the page was successfully | 323 | it is still found to be dirty. It returns zero if the page was successfully |
| 293 | cleaned, or an error value if not. Note that in order to prevent the page | 324 | cleaned, or an error value if not. Note that in order to prevent the page |
| 294 | getting mapped back in and redirtied, it needs to be kept locked | 325 | getting mapped back in and redirtied, it needs to be kept locked |
| 295 | across the entire operation. | 326 | across the entire operation. |
| 296 | 327 | ||
| 297 | Note: currently almost all instances of address_space methods are | ||
| 298 | using BKL for internal serialization and that's one of the worst sources | ||
| 299 | of contention. Normally they are calling library functions (in fs/buffer.c) | ||
| 300 | and pass foo_get_block() as a callback (on local block-based filesystems, | ||
| 301 | indeed). BKL is not needed for library stuff and is usually taken by | ||
| 302 | foo_get_block(). It's an overkill, since block bitmaps can be protected by | ||
| 303 | internal fs locking and real critical areas are much smaller than the areas | ||
| 304 | filesystems protect now. | ||
| 305 | |||
| 306 | ----------------------- file_lock_operations ------------------------------ | 328 | ----------------------- file_lock_operations ------------------------------ |
| 307 | prototypes: | 329 | prototypes: |
| 308 | void (*fl_insert)(struct file_lock *); /* lock insertion callback */ | ||
| 309 | void (*fl_remove)(struct file_lock *); /* lock removal callback */ | ||
| 310 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 330 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); |
| 311 | void (*fl_release_private)(struct file_lock *); | 331 | void (*fl_release_private)(struct file_lock *); |
| 312 | 332 | ||
| 313 | 333 | ||
| 314 | locking rules: | 334 | locking rules: |
| 315 | BKL may block | 335 | file_lock_lock may block |
| 316 | fl_insert: yes no | 336 | fl_copy_lock: yes no |
| 317 | fl_remove: yes no | 337 | fl_release_private: maybe no |
| 318 | fl_copy_lock: yes no | ||
| 319 | fl_release_private: yes yes | ||
| 320 | 338 | ||
| 321 | ----------------------- lock_manager_operations --------------------------- | 339 | ----------------------- lock_manager_operations --------------------------- |
| 322 | prototypes: | 340 | prototypes: |
| 323 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); | 341 | int (*fl_compare_owner)(struct file_lock *, struct file_lock *); |
| 324 | void (*fl_notify)(struct file_lock *); /* unblock callback */ | 342 | void (*fl_notify)(struct file_lock *); /* unblock callback */ |
| 325 | void (*fl_copy_lock)(struct file_lock *, struct file_lock *); | 343 | int (*fl_grant)(struct file_lock *, struct file_lock *, int); |
| 326 | void (*fl_release_private)(struct file_lock *); | 344 | void (*fl_release_private)(struct file_lock *); |
| 327 | void (*fl_break)(struct file_lock *); /* break_lease callback */ | 345 | void (*fl_break)(struct file_lock *); /* break_lease callback */ |
| 346 | int (*fl_change)(struct file_lock **, int); | ||
| 328 | 347 | ||
| 329 | locking rules: | 348 | locking rules: |
| 330 | BKL may block | 349 | file_lock_lock may block |
| 331 | fl_compare_owner: yes no | 350 | fl_compare_owner: yes no |
| 332 | fl_notify: yes no | 351 | fl_notify: yes no |
| 333 | fl_copy_lock: yes no | 352 | fl_grant: no no |
| 334 | fl_release_private: yes yes | 353 | fl_release_private: maybe no |
| 335 | fl_break: yes no | 354 | fl_break: yes no |
| 336 | 355 | fl_change yes no | |
| 337 | Currently only NFSD and NLM provide instances of this class. None of | 356 | |
| 338 | them block. If you have out-of-tree instances - please, show up. Locking | ||
| 339 | in that area will change. | ||
| 340 | --------------------------- buffer_head ----------------------------------- | 357 | --------------------------- buffer_head ----------------------------------- |
| 341 | prototypes: | 358 | prototypes: |
| 342 | void (*b_end_io)(struct buffer_head *bh, int uptodate); | 359 | void (*b_end_io)(struct buffer_head *bh, int uptodate); |
| @@ -349,21 +366,36 @@ call this method upon the IO completion. | |||
| 349 | 366 | ||
| 350 | --------------------------- block_device_operations ----------------------- | 367 | --------------------------- block_device_operations ----------------------- |
| 351 | prototypes: | 368 | prototypes: |
| 352 | int (*open) (struct inode *, struct file *); | 369 | int (*open) (struct block_device *, fmode_t); |
| 353 | int (*release) (struct inode *, struct file *); | 370 | int (*release) (struct gendisk *, fmode_t); |
| 354 | int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); | 371 | int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); |
| 372 | int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); | ||
| 373 | int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); | ||
| 355 | int (*media_changed) (struct gendisk *); | 374 | int (*media_changed) (struct gendisk *); |
| 375 | void (*unlock_native_capacity) (struct gendisk *); | ||
| 356 | int (*revalidate_disk) (struct gendisk *); | 376 | int (*revalidate_disk) (struct gendisk *); |
| 377 | int (*getgeo)(struct block_device *, struct hd_geometry *); | ||
| 378 | void (*swap_slot_free_notify) (struct block_device *, unsigned long); | ||
| 357 | 379 | ||
| 358 | locking rules: | 380 | locking rules: |
| 359 | BKL bd_sem | 381 | bd_mutex |
| 360 | open: yes yes | 382 | open: yes |
| 361 | release: yes yes | 383 | release: yes |
| 362 | ioctl: yes no | 384 | ioctl: no |
| 363 | media_changed: no no | 385 | compat_ioctl: no |
| 364 | revalidate_disk: no no | 386 | direct_access: no |
| 387 | media_changed: no | ||
| 388 | unlock_native_capacity: no | ||
| 389 | revalidate_disk: no | ||
| 390 | getgeo: no | ||
| 391 | swap_slot_free_notify: no (see below) | ||
| 392 | |||
| 393 | media_changed, unlock_native_capacity and revalidate_disk are called only from | ||
| 394 | check_disk_change(). | ||
| 395 | |||
| 396 | swap_slot_free_notify is called with swap_lock and sometimes the page lock | ||
| 397 | held. | ||
| 365 | 398 | ||
| 366 | The last two are called only from check_disk_change(). | ||
| 367 | 399 | ||
| 368 | --------------------------- file_operations ------------------------------- | 400 | --------------------------- file_operations ------------------------------- |
| 369 | prototypes: | 401 | prototypes: |
| @@ -395,34 +427,22 @@ prototypes: | |||
| 395 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, | 427 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, |
| 396 | unsigned long, unsigned long, unsigned long); | 428 | unsigned long, unsigned long, unsigned long); |
| 397 | int (*check_flags)(int); | 429 | int (*check_flags)(int); |
| 430 | int (*flock) (struct file *, int, struct file_lock *); | ||
| 431 | ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, | ||
| 432 | size_t, unsigned int); | ||
| 433 | ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, | ||
| 434 | size_t, unsigned int); | ||
| 435 | int (*setlease)(struct file *, long, struct file_lock **); | ||
| 436 | long (*fallocate)(struct file *, int, loff_t, loff_t); | ||
| 398 | }; | 437 | }; |
| 399 | 438 | ||
| 400 | locking rules: | 439 | locking rules: |
| 401 | All may block. | 440 | All may block except for ->setlease. |
| 402 | BKL | 441 | No VFS locks held on entry except for ->fsync and ->setlease. |
| 403 | llseek: no (see below) | 442 | |
| 404 | read: no | 443 | ->fsync() has i_mutex on inode. |
| 405 | aio_read: no | 444 | |
| 406 | write: no | 445 | ->setlease has the file_list_lock held and must not sleep. |
| 407 | aio_write: no | ||
| 408 | readdir: no | ||
| 409 | poll: no | ||
| 410 | unlocked_ioctl: no | ||
| 411 | compat_ioctl: no | ||
| 412 | mmap: no | ||
| 413 | open: no | ||
| 414 | flush: no | ||
| 415 | release: no | ||
| 416 | fsync: no (see below) | ||
| 417 | aio_fsync: no | ||
| 418 | fasync: no | ||
| 419 | lock: yes | ||
| 420 | readv: no | ||
| 421 | writev: no | ||
| 422 | sendfile: no | ||
| 423 | sendpage: no | ||
| 424 | get_unmapped_area: no | ||
| 425 | check_flags: no | ||
| 426 | 446 | ||
| 427 | ->llseek() locking has moved from llseek to the individual llseek | 447 | ->llseek() locking has moved from llseek to the individual llseek |
| 428 | implementations. If your fs is not using generic_file_llseek, you | 448 | implementations. If your fs is not using generic_file_llseek, you |
| @@ -432,17 +452,10 @@ mutex or just to use i_size_read() instead. | |||
| 432 | Note: this does not protect the file->f_pos against concurrent modifications | 452 | Note: this does not protect the file->f_pos against concurrent modifications |
| 433 | since this is something the userspace has to take care about. | 453 | since this is something the userspace has to take care about. |
| 434 | 454 | ||
| 435 | Note: ext2_release() was *the* source of contention on fs-intensive | 455 | ->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. |
| 436 | loads and dropping BKL on ->release() helps to get rid of that (we still | 456 | Most instances call fasync_helper(), which does that maintenance, so it's |
| 437 | grab BKL for cases when we close a file that had been opened r/w, but that | 457 | not normally something one needs to worry about. Return values > 0 will be |
| 438 | can and should be done using the internal locking with smaller critical areas). | 458 | mapped to zero in the VFS layer. |
| 439 | Current worst offender is ext2_get_block()... | ||
| 440 | |||
| 441 | ->fasync() is called without BKL protection, and is responsible for | ||
| 442 | maintaining the FASYNC bit in filp->f_flags. Most instances call | ||
| 443 | fasync_helper(), which does that maintenance, so it's not normally | ||
| 444 | something one needs to worry about. Return values > 0 will be mapped to | ||
| 445 | zero in the VFS layer. | ||
| 446 | 459 | ||
| 447 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would | 460 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would |
| 448 | move ->readdir() to inode_operations and use a separate method for directory | 461 | move ->readdir() to inode_operations and use a separate method for directory |
| @@ -453,8 +466,6 @@ components. And there are other reasons why the current interface is a mess... | |||
| 453 | ->read on directories probably must go away - we should just enforce -EISDIR | 466 | ->read on directories probably must go away - we should just enforce -EISDIR |
| 454 | in sys_read() and friends. | 467 | in sys_read() and friends. |
| 455 | 468 | ||
| 456 | ->fsync() has i_mutex on inode. | ||
| 457 | |||
| 458 | --------------------------- dquot_operations ------------------------------- | 469 | --------------------------- dquot_operations ------------------------------- |
| 459 | prototypes: | 470 | prototypes: |
| 460 | int (*write_dquot) (struct dquot *); | 471 | int (*write_dquot) (struct dquot *); |
| @@ -489,12 +500,12 @@ prototypes: | |||
| 489 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); | 500 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); |
| 490 | 501 | ||
| 491 | locking rules: | 502 | locking rules: |
| 492 | BKL mmap_sem PageLocked(page) | 503 | mmap_sem PageLocked(page) |
| 493 | open: no yes | 504 | open: yes |
| 494 | close: no yes | 505 | close: yes |
| 495 | fault: no yes can return with page locked | 506 | fault: yes can return with page locked |
| 496 | page_mkwrite: no yes can return with page locked | 507 | page_mkwrite: yes can return with page locked |
| 497 | access: no yes | 508 | access: yes |
| 498 | 509 | ||
| 499 | ->fault() is called when a previously not present pte is about | 510 | ->fault() is called when a previously not present pte is about |
| 500 | to be faulted in. The filesystem must find and return the page associated | 511 | to be faulted in. The filesystem must find and return the page associated |
| @@ -521,6 +532,3 @@ VM_IO | VM_PFNMAP VMAs. | |||
| 521 | 532 | ||
| 522 | (if you break something or notice that it is broken and do not fix it yourself | 533 | (if you break something or notice that it is broken and do not fix it yourself |
| 523 | - at least put it here) | 534 | - at least put it here) |
| 524 | |||
| 525 | ipc/shm.c::shm_delete() - may need BKL. | ||
| 526 | ->read() and ->write() in many drivers are (probably) missing BKL. | ||
diff --git a/Documentation/filesystems/adfs.txt b/Documentation/filesystems/adfs.txt index 9e8811f92b84..5949766353f7 100644 --- a/Documentation/filesystems/adfs.txt +++ b/Documentation/filesystems/adfs.txt | |||
| @@ -9,6 +9,9 @@ Mount options for ADFS | |||
| 9 | will be nnn. Default 0700. | 9 | will be nnn. Default 0700. |
| 10 | othmask=nnn The permission mask for ADFS 'other' permissions | 10 | othmask=nnn The permission mask for ADFS 'other' permissions |
| 11 | will be nnn. Default 0077. | 11 | will be nnn. Default 0077. |
| 12 | ftsuffix=n When ftsuffix=0, no file type suffix will be applied. | ||
| 13 | When ftsuffix=1, a hexadecimal suffix corresponding to | ||
| 14 | the RISC OS file type will be added. Default 0. | ||
| 12 | 15 | ||
| 13 | Mapping of ADFS permissions to Linux permissions | 16 | Mapping of ADFS permissions to Linux permissions |
| 14 | ------------------------------------------------ | 17 | ------------------------------------------------ |
| @@ -55,3 +58,18 @@ Mapping of ADFS permissions to Linux permissions | |||
| 55 | 58 | ||
| 56 | You can therefore tailor the permission translation to whatever you | 59 | You can therefore tailor the permission translation to whatever you |
| 57 | desire the permissions should be under Linux. | 60 | desire the permissions should be under Linux. |
| 61 | |||
| 62 | RISC OS file type suffix | ||
| 63 | ------------------------ | ||
| 64 | |||
| 65 | RISC OS file types are stored in bits 19..8 of the file load address. | ||
| 66 | |||
| 67 | To enable non-RISC OS systems to be used to store files without losing | ||
| 68 | file type information, a file naming convention was devised (initially | ||
| 69 | for use with NFS) such that a hexadecimal suffix of the form ,xyz | ||
| 70 | denoted the file type: e.g. BasicFile,ffb is a BASIC (0xffb) file. This | ||
| 71 | naming convention is now also used by RISC OS emulators such as RPCEmu. | ||
| 72 | |||
| 73 | Mounting an ADFS disc with option ftsuffix=1 will cause appropriate file | ||
| 74 | type suffixes to be appended to file names read from a directory. If the | ||
| 75 | ftsuffix option is zero or omitted, no file type suffixes will be added. | ||
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 51986bf08a4d..4c95935cbcf4 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt | |||
| @@ -309,7 +309,7 @@ ioctlfd field set to the descriptor obtained from the open call. | |||
| 309 | AUTOFS_DEV_IOCTL_TIMEOUT_CMD | 309 | AUTOFS_DEV_IOCTL_TIMEOUT_CMD |
| 310 | ---------------------------- | 310 | ---------------------------- |
| 311 | 311 | ||
| 312 | Set the expire timeout for mounts withing an autofs mount point. | 312 | Set the expire timeout for mounts within an autofs mount point. |
| 313 | 313 | ||
| 314 | The call requires an initialized struct autofs_dev_ioctl with the | 314 | The call requires an initialized struct autofs_dev_ioctl with the |
| 315 | ioctlfd field set to the descriptor obtained from the open call. | 315 | ioctlfd field set to the descriptor obtained from the open call. |
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt index 1902c57b72ef..a167ab876c35 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.txt | |||
| @@ -95,7 +95,7 @@ restraints as possible on how an index is structured and where it is placed in | |||
| 95 | the tree. The netfs can even mix indices and data files at the same level, but | 95 | the tree. The netfs can even mix indices and data files at the same level, but |
| 96 | it's not recommended. | 96 | it's not recommended. |
| 97 | 97 | ||
| 98 | Each index entry consists of a key of indeterminate length plus some auxilliary | 98 | Each index entry consists of a key of indeterminate length plus some auxiliary |
| 99 | data, also of indeterminate length. | 99 | data, also of indeterminate length. |
| 100 | 100 | ||
| 101 | There are some limits on indices: | 101 | There are some limits on indices: |
| @@ -203,23 +203,23 @@ This has the following fields: | |||
| 203 | 203 | ||
| 204 | If the function is absent, a file size of 0 is assumed. | 204 | If the function is absent, a file size of 0 is assumed. |
| 205 | 205 | ||
| 206 | (6) A function to retrieve auxilliary data from the netfs [optional]. | 206 | (6) A function to retrieve auxiliary data from the netfs [optional]. |
| 207 | 207 | ||
| 208 | This function will be called with the netfs data that was passed to the | 208 | This function will be called with the netfs data that was passed to the |
| 209 | cookie acquisition function and the maximum length of auxilliary data that | 209 | cookie acquisition function and the maximum length of auxiliary data that |
| 210 | it may provide. It should write the auxilliary data into the given buffer | 210 | it may provide. It should write the auxiliary data into the given buffer |
| 211 | and return the quantity it wrote. | 211 | and return the quantity it wrote. |
| 212 | 212 | ||
| 213 | If this function is absent, the auxilliary data length will be set to 0. | 213 | If this function is absent, the auxiliary data length will be set to 0. |
| 214 | 214 | ||
| 215 | The length of the auxilliary data buffer may be dependent on the key | 215 | The length of the auxiliary data buffer may be dependent on the key |
| 216 | length. A netfs mustn't rely on being able to provide more than 400 bytes | 216 | length. A netfs mustn't rely on being able to provide more than 400 bytes |
| 217 | for both. | 217 | for both. |
| 218 | 218 | ||
| 219 | (7) A function to check the auxilliary data [optional]. | 219 | (7) A function to check the auxiliary data [optional]. |
| 220 | 220 | ||
| 221 | This function will be called to check that a match found in the cache for | 221 | This function will be called to check that a match found in the cache for |
| 222 | this object is valid. For instance with AFS it could check the auxilliary | 222 | this object is valid. For instance with AFS it could check the auxiliary |
| 223 | data against the data version number returned by the server to determine | 223 | data against the data version number returned by the server to determine |
| 224 | whether the index entry in a cache is still valid. | 224 | whether the index entry in a cache is still valid. |
| 225 | 225 | ||
| @@ -232,7 +232,7 @@ This has the following fields: | |||
| 232 | (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update | 232 | (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update |
| 233 | (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted | 233 | (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted |
| 234 | 234 | ||
| 235 | This function can also be used to extract data from the auxilliary data in | 235 | This function can also be used to extract data from the auxiliary data in |
| 236 | the cache and copy it into the netfs's structures. | 236 | the cache and copy it into the netfs's structures. |
| 237 | 237 | ||
| 238 | (8) A pair of functions to manage contexts for the completion callback | 238 | (8) A pair of functions to manage contexts for the completion callback |
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index fabcb0e00f25..dd57bb6bb390 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt | |||
| @@ -409,7 +409,7 @@ As a consequence of this, default_groups cannot be removed directly via | |||
| 409 | rmdir(2). They also are not considered when rmdir(2) on the parent | 409 | rmdir(2). They also are not considered when rmdir(2) on the parent |
| 410 | group is checking for children. | 410 | group is checking for children. |
| 411 | 411 | ||
| 412 | [Dependant Subsystems] | 412 | [Dependent Subsystems] |
| 413 | 413 | ||
| 414 | Sometimes other drivers depend on particular configfs items. For | 414 | Sometimes other drivers depend on particular configfs items. For |
| 415 | example, ocfs2 mounts depend on a heartbeat region item. If that | 415 | example, ocfs2 mounts depend on a heartbeat region item. If that |
diff --git a/Documentation/filesystems/configfs/configfs_example_explicit.c b/Documentation/filesystems/configfs/configfs_example_explicit.c index d428cc9f07f3..fd53869f5633 100644 --- a/Documentation/filesystems/configfs/configfs_example_explicit.c +++ b/Documentation/filesystems/configfs/configfs_example_explicit.c | |||
| @@ -89,7 +89,7 @@ static ssize_t childless_storeme_write(struct childless *childless, | |||
| 89 | char *p = (char *) page; | 89 | char *p = (char *) page; |
| 90 | 90 | ||
| 91 | tmp = simple_strtoul(p, &p, 10); | 91 | tmp = simple_strtoul(p, &p, 10); |
| 92 | if (!p || (*p && (*p != '\n'))) | 92 | if ((*p != '\0') && (*p != '\n')) |
| 93 | return -EINVAL; | 93 | return -EINVAL; |
| 94 | 94 | ||
| 95 | if (tmp > INT_MAX) | 95 | if (tmp > INT_MAX) |
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt deleted file mode 100644 index 79334ed5daa7..000000000000 --- a/Documentation/filesystems/dentry-locking.txt +++ /dev/null | |||
| @@ -1,174 +0,0 @@ | |||
| 1 | RCU-based dcache locking model | ||
| 2 | ============================== | ||
| 3 | |||
| 4 | On many workloads, the most common operation on dcache is to look up a | ||
| 5 | dentry, given a parent dentry and the name of the child. Typically, | ||
| 6 | for every open(), stat() etc., the dentry corresponding to the | ||
| 7 | pathname will be looked up by walking the tree starting with the first | ||
| 8 | component of the pathname and using that dentry along with the next | ||
| 9 | component to look up the next level and so on. Since it is a frequent | ||
| 10 | operation for workloads like multiuser environments and web servers, | ||
| 11 | it is important to optimize this path. | ||
| 12 | |||
| 13 | Prior to 2.5.10, dcache_lock was acquired in d_lookup and thus in | ||
| 14 | every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
| 15 | algorithm changed this by holding the dcache_lock at the beginning and | ||
| 16 | walking as many cached path component dentries as possible. This | ||
| 17 | significantly decreases the number of acquisition of | ||
| 18 | dcache_lock. However it also increases the lock hold time | ||
| 19 | significantly and affects performance in large SMP machines. Since | ||
| 20 | 2.5.62 kernel, dcache has been using a new locking model that uses RCU | ||
| 21 | to make dcache look-up lock-free. | ||
| 22 | |||
| 23 | The current dcache locking model is not very different from the | ||
| 24 | existing dcache locking model. Prior to 2.5.62 kernel, dcache_lock | ||
| 25 | protected the hash chain, d_child, d_alias, d_lru lists as well as | ||
| 26 | d_inode and several other things like mount look-up. RCU-based changes | ||
| 27 | affect only the way the hash chain is protected. For everything else | ||
| 28 | the dcache_lock must be taken for both traversing as well as | ||
| 29 | updating. The hash chain updates too take the dcache_lock. The | ||
| 30 | significant change is the way d_lookup traverses the hash chain, it | ||
| 31 | doesn't acquire the dcache_lock for this and rely on RCU to ensure | ||
| 32 | that the dentry has not been *freed*. | ||
| 33 | |||
| 34 | |||
| 35 | Dcache locking details | ||
| 36 | ====================== | ||
| 37 | |||
| 38 | For many multi-user workloads, open() and stat() on files are very | ||
| 39 | frequently occurring operations. Both involve walking of path names to | ||
| 40 | find the dentry corresponding to the concerned file. In 2.4 kernel, | ||
| 41 | dcache_lock was held during look-up of each path component. Contention | ||
| 42 | and cache-line bouncing of this global lock caused significant | ||
| 43 | scalability problems. With the introduction of RCU in Linux kernel, | ||
| 44 | this was worked around by making the look-up of path components during | ||
| 45 | path walking lock-free. | ||
| 46 | |||
| 47 | |||
| 48 | Safe lock-free look-up of dcache hash table | ||
| 49 | =========================================== | ||
| 50 | |||
| 51 | Dcache is a complex data structure with the hash table entries also | ||
| 52 | linked together in other lists. In 2.4 kernel, dcache_lock protected | ||
| 53 | all the lists. We applied RCU only on hash chain walking. The rest of | ||
| 54 | the lists are still protected by dcache_lock. Some of the important | ||
| 55 | changes are : | ||
| 56 | |||
| 57 | 1. The deletion from hash chain is done using hlist_del_rcu() macro | ||
| 58 | which doesn't initialize next pointer of the deleted dentry and | ||
| 59 | this allows us to walk safely lock-free while a deletion is | ||
| 60 | happening. | ||
| 61 | |||
| 62 | 2. Insertion of a dentry into the hash table is done using | ||
| 63 | hlist_add_head_rcu() which take care of ordering the writes - the | ||
| 64 | writes to the dentry must be visible before the dentry is | ||
| 65 | inserted. This works in conjunction with hlist_for_each_rcu(), | ||
| 66 | which has since been replaced by hlist_for_each_entry_rcu(), while | ||
| 67 | walking the hash chain. The only requirement is that all | ||
| 68 | initialization to the dentry must be done before | ||
| 69 | hlist_add_head_rcu() since we don't have dcache_lock protection | ||
| 70 | while traversing the hash chain. This isn't different from the | ||
| 71 | existing code. | ||
| 72 | |||
| 73 | 3. The dentry looked up without holding dcache_lock cannot be | ||
| 74 | returned for walking if it is unhashed. It then may have a NULL | ||
| 75 | d_inode or other bogosity since RCU doesn't protect the other | ||
| 76 | fields in the dentry. We therefore use a flag DCACHE_UNHASHED to | ||
| 77 | indicate unhashed dentries and use this in conjunction with a | ||
| 78 | per-dentry lock (d_lock). Once looked up without the dcache_lock, | ||
| 79 | we acquire the per-dentry lock (d_lock) and check if the dentry is | ||
| 80 | unhashed. If so, the look-up is failed. If not, the reference count | ||
| 81 | of the dentry is increased and the dentry is returned. | ||
| 82 | |||
| 83 | 4. Once a dentry is looked up, it must be ensured during the path walk | ||
| 84 | for that component it doesn't go away. In pre-2.5.10 code, this was | ||
| 85 | done holding a reference to the dentry. dcache_rcu does the same. | ||
| 86 | In some sense, dcache_rcu path walking looks like the pre-2.5.10 | ||
| 87 | version. | ||
| 88 | |||
| 89 | 5. All dentry hash chain updates must take the dcache_lock as well as | ||
| 90 | the per-dentry lock in that order. dput() does this to ensure that | ||
| 91 | a dentry that has just been looked up in another CPU doesn't get | ||
| 92 | deleted before dget() can be done on it. | ||
| 93 | |||
| 94 | 6. There are several ways to do reference counting of RCU protected | ||
| 95 | objects. One such example is in ipv4 route cache where deferred | ||
| 96 | freeing (using call_rcu()) is done as soon as the reference count | ||
| 97 | goes to zero. This cannot be done in the case of dentries because | ||
| 98 | tearing down of dentries require blocking (dentry_iput()) which | ||
| 99 | isn't supported from RCU callbacks. Instead, tearing down of | ||
| 100 | dentries happen synchronously in dput(), but actual freeing happens | ||
| 101 | later when RCU grace period is over. This allows safe lock-free | ||
| 102 | walking of the hash chains, but a matched dentry may have been | ||
| 103 | partially torn down. The checking of DCACHE_UNHASHED flag with | ||
| 104 | d_lock held detects such dentries and prevents them from being | ||
| 105 | returned from look-up. | ||
| 106 | |||
| 107 | |||
| 108 | Maintaining POSIX rename semantics | ||
| 109 | ================================== | ||
| 110 | |||
| 111 | Since look-up of dentries is lock-free, it can race against a | ||
| 112 | concurrent rename operation. For example, during rename of file A to | ||
| 113 | B, look-up of either A or B must succeed. So, if look-up of B happens | ||
| 114 | after A has been removed from the hash chain but not added to the new | ||
| 115 | hash chain, it may fail. Also, a comparison while the name is being | ||
| 116 | written concurrently by a rename may result in false positive matches | ||
| 117 | violating rename semantics. Issues related to race with rename are | ||
| 118 | handled as described below : | ||
| 119 | |||
| 120 | 1. Look-up can be done in two ways - d_lookup() which is safe from | ||
| 121 | simultaneous renames and __d_lookup() which is not. If | ||
| 122 | __d_lookup() fails, it must be followed up by a d_lookup() to | ||
| 123 | correctly determine whether a dentry is in the hash table or | ||
| 124 | not. d_lookup() protects look-ups using a sequence lock | ||
| 125 | (rename_lock). | ||
| 126 | |||
| 127 | 2. The name associated with a dentry (d_name) may be changed if a | ||
| 128 | rename is allowed to happen simultaneously. To avoid memcmp() in | ||
| 129 | __d_lookup() go out of bounds due to a rename and false positive | ||
| 130 | comparison, the name comparison is done while holding the | ||
| 131 | per-dentry lock. This prevents concurrent renames during this | ||
| 132 | operation. | ||
| 133 | |||
| 134 | 3. Hash table walking during look-up may move to a different bucket as | ||
| 135 | the current dentry is moved to a different bucket due to rename. | ||
| 136 | But we use hlists in dcache hash table and they are | ||
| 137 | null-terminated. So, even if a dentry moves to a different bucket, | ||
| 138 | hash chain walk will terminate. [with a list_head list, it may not | ||
| 139 | since termination is when the list_head in the original bucket is | ||
| 140 | reached]. Since we redo the d_parent check and compare name while | ||
| 141 | holding d_lock, lock-free look-up will not race against d_move(). | ||
| 142 | |||
| 143 | 4. There can be a theoretical race when a dentry keeps coming back to | ||
| 144 | original bucket due to double moves. Due to this look-up may | ||
| 145 | consider that it has never moved and can end up in an infinite loop. | ||
| 146 | But this is not any worse than theoretical livelocks we already | ||
| 147 | have in the kernel. | ||
| 148 | |||
| 149 | |||
| 150 | Important guidelines for filesystem developers related to dcache_rcu | ||
| 151 | ==================================================================== | ||
| 152 | |||
| 153 | 1. Existing dcache interfaces (pre-2.5.62) exported to filesystem | ||
| 154 | don't change. Only dcache internal implementation changes. However | ||
| 155 | filesystems *must not* delete from the dentry hash chains directly | ||
| 156 | using the list macros like allowed earlier. They must use dcache | ||
| 157 | APIs like d_drop() or __d_drop() depending on the situation. | ||
| 158 | |||
| 159 | 2. d_flags is now protected by a per-dentry lock (d_lock). All access | ||
| 160 | to d_flags must be protected by it. | ||
| 161 | |||
| 162 | 3. For a hashed dentry, checking of d_count needs to be protected by | ||
| 163 | d_lock. | ||
| 164 | |||
| 165 | |||
| 166 | Papers and other documentation on dcache locking | ||
| 167 | ================================================ | ||
| 168 | |||
| 169 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
| 170 | |||
| 171 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
| 172 | |||
| 173 | |||
| 174 | |||
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt index abd2a9b5b787..23583a136975 100644 --- a/Documentation/filesystems/exofs.txt +++ b/Documentation/filesystems/exofs.txt | |||
| @@ -104,7 +104,15 @@ Where: | |||
| 104 | exofs specific options: Options are separated by commas (,) | 104 | exofs specific options: Options are separated by commas (,) |
| 105 | pid=<integer> - The partition number to mount/create as | 105 | pid=<integer> - The partition number to mount/create as |
| 106 | container of the filesystem. | 106 | container of the filesystem. |
| 107 | This option is mandatory. | 107 | This option is mandatory. integer can be |
| 108 | hex by prepending 0x to the number. | ||
| 109 | osdname=<id> - Mount by a device's osdname. | ||
| 110 | osdname is usually a 36 character uuid of the | ||
| 111 | form "d2683732-c906-4ee1-9dbd-c10c27bb40df". | ||
| 112 | It is one of the device's uuid specified in the | ||
| 113 | mkfs.exofs format command. | ||
| 114 | If this option is specified then the /dev/osdX | ||
| 115 | above can be empty and is ignored. | ||
| 108 | to=<integer> - Timeout in ticks for a single command. | 116 | to=<integer> - Timeout in ticks for a single command. |
| 109 | default is (60 * HZ) [for debugging only] | 117 | default is (60 * HZ) [for debugging only] |
| 110 | 118 | ||
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index e1def1786e50..c79ec58fd7f6 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
| @@ -97,7 +97,7 @@ Note: More extensive information for getting started with ext4 can be | |||
| 97 | * Inode allocation using large virtual block groups via flex_bg | 97 | * Inode allocation using large virtual block groups via flex_bg |
| 98 | * delayed allocation | 98 | * delayed allocation |
| 99 | * large block (up to pagesize) support | 99 | * large block (up to pagesize) support |
| 100 | * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force | 100 | * efficient new ordered mode in JBD2 and ext4(avoid using buffer head to force |
| 101 | the ordering) | 101 | the ordering) |
| 102 | 102 | ||
| 103 | [1] Filesystems with a block size of 1k may see a limit imposed by the | 103 | [1] Filesystems with a block size of 1k may see a limit imposed by the |
| @@ -106,7 +106,7 @@ directory hash tree having a maximum depth of two. | |||
| 106 | 2.2 Candidate features for future inclusion | 106 | 2.2 Candidate features for future inclusion |
| 107 | 107 | ||
| 108 | * Online defrag (patches available but not well tested) | 108 | * Online defrag (patches available but not well tested) |
| 109 | * reduced mke2fs time via lazy itable initialization in conjuction with | 109 | * reduced mke2fs time via lazy itable initialization in conjunction with |
| 110 | the uninit_bg feature (capability to do this is available in e2fsprogs | 110 | the uninit_bg feature (capability to do this is available in e2fsprogs |
| 111 | but a kernel thread to do lazy zeroing of unused inode table blocks | 111 | but a kernel thread to do lazy zeroing of unused inode table blocks |
| 112 | after filesystem is first mounted is required for safety) | 112 | after filesystem is first mounted is required for safety) |
| @@ -353,12 +353,61 @@ noauto_da_alloc replacing existing files via patterns such as | |||
| 353 | system crashes before the delayed allocation | 353 | system crashes before the delayed allocation |
| 354 | blocks are forced to disk. | 354 | blocks are forced to disk. |
| 355 | 355 | ||
| 356 | discard Controls whether ext4 should issue discard/TRIM | 356 | noinit_itable Do not initialize any uninitialized inode table |
| 357 | blocks in the background. This feature may be | ||
| 358 | used by installation CD's so that the install | ||
| 359 | process can complete as quickly as possible; the | ||
| 360 | inode table initialization process would then be | ||
| 361 | deferred until the next time the file system | ||
| 362 | is unmounted. | ||
| 363 | |||
| 364 | init_itable=n The lazy itable init code will wait n times the | ||
| 365 | number of milliseconds it took to zero out the | ||
| 366 | previous block group's inode table. This | ||
| 367 | minimizes the impact on the system performance | ||
| 368 | while file system's inode table is being initialized. | ||
| 369 | |||
| 370 | discard Controls whether ext4 should issue discard/TRIM | ||
| 357 | nodiscard(*) commands to the underlying block device when | 371 | nodiscard(*) commands to the underlying block device when |
| 358 | blocks are freed. This is useful for SSD devices | 372 | blocks are freed. This is useful for SSD devices |
| 359 | and sparse/thinly-provisioned LUNs, but it is off | 373 | and sparse/thinly-provisioned LUNs, but it is off |
| 360 | by default until sufficient testing has been done. | 374 | by default until sufficient testing has been done. |
| 361 | 375 | ||
| 376 | nouid32 Disables 32-bit UIDs and GIDs. This is for | ||
| 377 | interoperability with older kernels which only | ||
| 378 | store and expect 16-bit values. | ||
| 379 | |||
| 380 | resize Allows to resize filesystem to the end of the last | ||
| 381 | existing block group, further resize has to be done | ||
| 382 | with resize2fs either online, or offline. It can be | ||
| 383 | used only in conjunction with remount. | ||
| 384 | |||
| 385 | block_validity This option enables/disables the in-kernel | ||
| 386 | noblock_validity facility for tracking filesystem metadata blocks | ||
| 387 | within internal data structures. This allows multi- | ||
| 388 | block allocator and other routines to quickly locate | ||
| 389 | extents which might overlap with filesystem metadata | ||
| 390 | blocks. This option is intended for debugging | ||
| 391 | purposes and since it negatively affects the | ||
| 392 | performance, it is off by default. | ||
| 393 | |||
| 394 | dioread_lock Controls whether or not ext4 should use the DIO read | ||
| 395 | dioread_nolock locking. If the dioread_nolock option is specified | ||
| 396 | ext4 will allocate uninitialized extent before buffer | ||
| 397 | write and convert the extent to initialized after IO | ||
| 398 | completes. This approach allows ext4 code to avoid | ||
| 399 | using inode mutex, which improves scalability on high | ||
| 400 | speed storages. However this does not work with nobh | ||
| 401 | option and the mount will fail. Nor does it work with | ||
| 402 | data journaling and dioread_nolock option will be | ||
| 403 | ignored with kernel warning. Note that dioread_nolock | ||
| 404 | code path is only used for extent-based files. | ||
| 405 | Because of the restrictions this option comprises | ||
| 406 | it is off by default (e.g. dioread_lock). | ||
| 407 | |||
| 408 | i_version Enable 64-bit inode version support. This option is | ||
| 409 | off by default. | ||
| 410 | |||
| 362 | Data Mode | 411 | Data Mode |
| 363 | ========= | 412 | ========= |
| 364 | There are 3 different data modes: | 413 | There are 3 different data modes: |
| @@ -386,6 +435,176 @@ needs to be read from and written to disk at the same time where it | |||
| 386 | outperforms all others modes. Currently ext4 does not have delayed | 435 | outperforms all others modes. Currently ext4 does not have delayed |
| 387 | allocation support if this data journalling mode is selected. | 436 | allocation support if this data journalling mode is selected. |
| 388 | 437 | ||
| 438 | /proc entries | ||
| 439 | ============= | ||
| 440 | |||
| 441 | Information about mounted ext4 file systems can be found in | ||
| 442 | /proc/fs/ext4. Each mounted filesystem will have a directory in | ||
| 443 | /proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or | ||
| 444 | /proc/fs/ext4/dm-0). The files in each per-device directory are shown | ||
| 445 | in table below. | ||
| 446 | |||
| 447 | Files in /proc/fs/ext4/<devname> | ||
| 448 | .............................................................................. | ||
| 449 | File Content | ||
| 450 | mb_groups details of multiblock allocator buddy cache of free blocks | ||
| 451 | .............................................................................. | ||
| 452 | |||
| 453 | /sys entries | ||
| 454 | ============ | ||
| 455 | |||
| 456 | Information about mounted ext4 file systems can be found in | ||
| 457 | /sys/fs/ext4. Each mounted filesystem will have a directory in | ||
| 458 | /sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or | ||
| 459 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown | ||
| 460 | in table below. | ||
| 461 | |||
| 462 | Files in /sys/fs/ext4/<devname> | ||
| 463 | (see also Documentation/ABI/testing/sysfs-fs-ext4) | ||
| 464 | .............................................................................. | ||
| 465 | File Content | ||
| 466 | |||
| 467 | delayed_allocation_blocks This file is read-only and shows the number of | ||
| 468 | blocks that are dirty in the page cache, but | ||
| 469 | which do not have their location in the | ||
| 470 | filesystem allocated yet. | ||
| 471 | |||
| 472 | inode_goal Tuning parameter which (if non-zero) controls | ||
| 473 | the goal inode used by the inode allocator in | ||
| 474 | preference to all other allocation heuristics. | ||
| 475 | This is intended for debugging use only, and | ||
| 476 | should be 0 on production systems. | ||
| 477 | |||
| 478 | inode_readahead_blks Tuning parameter which controls the maximum | ||
| 479 | number of inode table blocks that ext4's inode | ||
| 480 | table readahead algorithm will pre-read into | ||
| 481 | the buffer cache | ||
| 482 | |||
| 483 | lifetime_write_kbytes This file is read-only and shows the number of | ||
| 484 | kilobytes of data that have been written to this | ||
| 485 | filesystem since it was created. | ||
| 486 | |||
| 487 | max_writeback_mb_bump The maximum number of megabytes the writeback | ||
| 488 | code will try to write out before moving on to | ||
| 489 | another inode. | ||
| 490 | |||
| 491 | mb_group_prealloc The multiblock allocator will round up allocation | ||
| 492 | requests to a multiple of this tuning parameter if | ||
| 493 | the stripe size is not set in the ext4 superblock | ||
| 494 | |||
| 495 | mb_max_to_scan The maximum number of extents the multiblock | ||
| 496 | allocator will search to find the best extent | ||
| 497 | |||
| 498 | mb_min_to_scan The minimum number of extents the multiblock | ||
| 499 | allocator will search to find the best extent | ||
| 500 | |||
| 501 | mb_order2_req Tuning parameter which controls the minimum size | ||
| 502 | for requests (as a power of 2) where the buddy | ||
| 503 | cache is used | ||
| 504 | |||
| 505 | mb_stats Controls whether the multiblock allocator should | ||
| 506 | collect statistics, which are shown during the | ||
| 507 | unmount. 1 means to collect statistics, 0 means | ||
| 508 | not to collect statistics | ||
| 509 | |||
| 510 | mb_stream_req Files which have fewer blocks than this tunable | ||
| 511 | parameter will have their blocks allocated out | ||
| 512 | of a block group specific preallocation pool, so | ||
| 513 | that small files are packed closely together. | ||
| 514 | Each large file will have its blocks allocated | ||
| 515 | out of its own unique preallocation pool. | ||
| 516 | |||
| 517 | session_write_kbytes This file is read-only and shows the number of | ||
| 518 | kilobytes of data that have been written to this | ||
| 519 | filesystem since it was mounted. | ||
| 520 | .............................................................................. | ||
| 521 | |||
| 522 | Ioctls | ||
| 523 | ====== | ||
| 524 | |||
| 525 | There is some Ext4 specific functionality which can be accessed by applications | ||
| 526 | through the system call interfaces. The list of all Ext4 specific ioctls is | ||
| 527 | shown in the table below. | ||
| 528 | |||
| 529 | Table of Ext4 specific ioctls | ||
| 530 | .............................................................................. | ||
| 531 | Ioctl Description | ||
| 532 | EXT4_IOC_GETFLAGS Get additional attributes associated with inode. | ||
| 533 | The ioctl argument is an integer bitfield, with | ||
| 534 | bit values described in ext4.h. This ioctl is an | ||
| 535 | alias for FS_IOC_GETFLAGS. | ||
| 536 | |||
| 537 | EXT4_IOC_SETFLAGS Set additional attributes associated with inode. | ||
| 538 | The ioctl argument is an integer bitfield, with | ||
| 539 | bit values described in ext4.h. This ioctl is an | ||
| 540 | alias for FS_IOC_SETFLAGS. | ||
| 541 | |||
| 542 | EXT4_IOC_GETVERSION | ||
| 543 | EXT4_IOC_GETVERSION_OLD | ||
| 544 | Get the inode i_generation number stored for | ||
| 545 | each inode. The i_generation number is normally | ||
| 546 | changed only when new inode is created and it is | ||
| 547 | particularly useful for network filesystems. The | ||
| 548 | '_OLD' version of this ioctl is an alias for | ||
| 549 | FS_IOC_GETVERSION. | ||
| 550 | |||
| 551 | EXT4_IOC_SETVERSION | ||
| 552 | EXT4_IOC_SETVERSION_OLD | ||
| 553 | Set the inode i_generation number stored for | ||
| 554 | each inode. The '_OLD' version of this ioctl | ||
| 555 | is an alias for FS_IOC_SETVERSION. | ||
| 556 | |||
| 557 | EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize | ||
| 558 | mount option. It allows to resize filesystem | ||
| 559 | to the end of the last existing block group, | ||
| 560 | further resize has to be done with resize2fs, | ||
| 561 | either online, or offline. The argument points | ||
| 562 | to the unsigned long number representing the | ||
| 563 | filesystem new block count. | ||
| 564 | |||
| 565 | EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one | ||
| 566 | this ioctl is pointing to) to the donor_fd (the | ||
| 567 | one specified in move_extent structure passed | ||
| 568 | as an argument to this ioctl). Then, exchange | ||
| 569 | inode metadata between orig_fd and donor_fd. | ||
| 570 | This is especially useful for online | ||
| 571 | defragmentation, because the allocator has the | ||
| 572 | opportunity to allocate moved blocks better, | ||
| 573 | ideally into one contiguous extent. | ||
| 574 | |||
| 575 | EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or | ||
| 576 | new group descriptor block. The new group | ||
| 577 | descriptor is described by ext4_new_group_input | ||
| 578 | structure, which is passed as an argument to | ||
| 579 | this ioctl. This is especially useful in | ||
| 580 | conjunction with EXT4_IOC_GROUP_EXTEND, | ||
| 581 | which allows online resize of the filesystem | ||
| 582 | to the end of the last existing block group. | ||
| 583 | Those two ioctls combined are used in userspace | ||
| 584 | online resize tools (e.g. resize2fs). | ||
| 585 | |||
| 586 | EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself. | ||
| 587 | It converts (migrates) ext3 indirect block mapped | ||
| 588 | inode to ext4 extent mapped inode by walking | ||
| 589 | through indirect block mapping of the original | ||
| 590 | inode and converting contiguous block ranges | ||
| 591 | into ext4 extents of the temporary inode. Then, | ||
| 592 | inodes are swapped. This ioctl might help, when | ||
| 593 | migrating from ext3 to ext4 filesystem, however | ||
| 594 | suggestion is to create fresh ext4 filesystem | ||
| 595 | and copy data from the backup. Note, that | ||
| 596 | filesystem has to support extents for this ioctl | ||
| 597 | to work. | ||
| 598 | |||
| 599 | EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be | ||
| 600 | allocated to preserve application-expected ext3 | ||
| 601 | behaviour. Note that this will also start | ||
| 602 | triggering a write of the data blocks, but this | ||
| 603 | behaviour may change in the future as it is | ||
| 604 | not necessary and has been done this way only | ||
| 605 | for sake of simplicity. | ||
| 606 | .............................................................................. | ||
| 607 | |||
| 389 | References | 608 | References |
| 390 | ========== | 609 | ========== |
| 391 | 610 | ||
diff --git a/Documentation/filesystems/gfs2-uevents.txt b/Documentation/filesystems/gfs2-uevents.txt index fd966dc9979a..d81889669293 100644 --- a/Documentation/filesystems/gfs2-uevents.txt +++ b/Documentation/filesystems/gfs2-uevents.txt | |||
| @@ -62,7 +62,7 @@ be fixed. | |||
| 62 | 62 | ||
| 63 | The REMOVE uevent is generated at the end of an unsuccessful mount | 63 | The REMOVE uevent is generated at the end of an unsuccessful mount |
| 64 | or at the end of a umount of the filesystem. All REMOVE uevents will | 64 | or at the end of a umount of the filesystem. All REMOVE uevents will |
| 65 | have been preceeded by at least an ADD uevent for the same fileystem, | 65 | have been preceded by at least an ADD uevent for the same fileystem, |
| 66 | and unlike the other uevents is generated automatically by the kernel's | 66 | and unlike the other uevents is generated automatically by the kernel's |
| 67 | kobject subsystem. | 67 | kobject subsystem. |
| 68 | 68 | ||
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 0b59c0200912..4cda926628aa 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt | |||
| @@ -11,7 +11,7 @@ their I/O so file system consistency is maintained. One of the nifty | |||
| 11 | features of GFS is perfect consistency -- changes made to the file system | 11 | features of GFS is perfect consistency -- changes made to the file system |
| 12 | on one machine show up immediately on all other machines in the cluster. | 12 | on one machine show up immediately on all other machines in the cluster. |
| 13 | 13 | ||
| 14 | GFS uses interchangable inter-node locking mechanisms, the currently | 14 | GFS uses interchangeable inter-node locking mechanisms, the currently |
| 15 | supported mechanisms are: | 15 | supported mechanisms are: |
| 16 | 16 | ||
| 17 | lock_nolock -- allows gfs to be used as a local file system | 17 | lock_nolock -- allows gfs to be used as a local file system |
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX index 3225a5662114..a57e12411d2a 100644 --- a/Documentation/filesystems/nfs/00-INDEX +++ b/Documentation/filesystems/nfs/00-INDEX | |||
| @@ -12,6 +12,8 @@ nfs-rdma.txt | |||
| 12 | - how to install and setup the Linux NFS/RDMA client and server software | 12 | - how to install and setup the Linux NFS/RDMA client and server software |
| 13 | nfsroot.txt | 13 | nfsroot.txt |
| 14 | - short guide on setting up a diskless box with NFS root filesystem. | 14 | - short guide on setting up a diskless box with NFS root filesystem. |
| 15 | pnfs.txt | ||
| 16 | - short explanation of some of the internals of the pnfs client code | ||
| 15 | rpc-cache.txt | 17 | rpc-cache.txt |
| 16 | - introduction to the caching mechanisms in the sunrpc layer. | 18 | - introduction to the caching mechanisms in the sunrpc layer. |
| 17 | idmapper.txt | 19 | idmapper.txt |
diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt index c3852041a21f..b9b4192ea8b5 100644 --- a/Documentation/filesystems/nfs/idmapper.txt +++ b/Documentation/filesystems/nfs/idmapper.txt | |||
| @@ -6,7 +6,7 @@ Id mapper is used by NFS to translate user and group ids into names, and to | |||
| 6 | translate user and group names into ids. Part of this translation involves | 6 | translate user and group names into ids. Part of this translation involves |
| 7 | performing an upcall to userspace to request the information. Id mapper will | 7 | performing an upcall to userspace to request the information. Id mapper will |
| 8 | user request-key to perform this upcall and cache the result. The program | 8 | user request-key to perform this upcall and cache the result. The program |
| 9 | /usr/sbin/nfs.upcall should be called by request-key, and will perform the | 9 | /usr/sbin/nfs.idmap should be called by request-key, and will perform the |
| 10 | translation and initialize a key with the resulting information. | 10 | translation and initialize a key with the resulting information. |
| 11 | 11 | ||
| 12 | NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this | 12 | NFS_USE_NEW_IDMAPPER must be selected when configuring the kernel to use this |
| @@ -20,12 +20,12 @@ direct the upcall. The following line should be added: | |||
| 20 | 20 | ||
| 21 | #OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... | 21 | #OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... |
| 22 | #====== ======= =============== =============== =============================== | 22 | #====== ======= =============== =============== =============================== |
| 23 | create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 | 23 | create id_resolver * * /usr/sbin/nfs.idmap %k %d 600 |
| 24 | 24 | ||
| 25 | This will direct all id_resolver requests to the program /usr/sbin/nfs.upcall. | 25 | This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap. |
| 26 | The last parameter, 600, defines how many seconds into the future the key will | 26 | The last parameter, 600, defines how many seconds into the future the key will |
| 27 | expire. This parameter is optional for /usr/sbin/nfs.upcall. When the timeout | 27 | expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout |
| 28 | is not specified, nfs.upcall will default to 600 seconds. | 28 | is not specified, nfs.idmap will default to 600 seconds. |
| 29 | 29 | ||
| 30 | id mapper uses for key descriptions: | 30 | id mapper uses for key descriptions: |
| 31 | uid: Find the UID for the given user | 31 | uid: Find the UID for the given user |
| @@ -39,29 +39,29 @@ would edit your request-key.conf so it look similar to this: | |||
| 39 | 39 | ||
| 40 | #OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... | 40 | #OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ... |
| 41 | #====== ======= =============== =============== =============================== | 41 | #====== ======= =============== =============== =============================== |
| 42 | create id_resolver uid:* * /some/other/program %k %d 600 | 42 | create id_resolver uid:* * /some/other/program %k %d 600 |
| 43 | create id_resolver * * /usr/sbin/nfs.upcall %k %d 600 | 43 | create id_resolver * * /usr/sbin/nfs.idmap %k %d 600 |
| 44 | 44 | ||
| 45 | Notice that the new line was added above the line for the generic program. | 45 | Notice that the new line was added above the line for the generic program. |
| 46 | request-key will find the first matching line and corresponding program. In | 46 | request-key will find the first matching line and corresponding program. In |
| 47 | this case, /some/other/program will handle all uid lookups and | 47 | this case, /some/other/program will handle all uid lookups and |
| 48 | /usr/sbin/nfs.upcall will handle gid, user, and group lookups. | 48 | /usr/sbin/nfs.idmap will handle gid, user, and group lookups. |
| 49 | 49 | ||
| 50 | See <file:Documentation/keys-request-keys.txt> for more information about the | 50 | See <file:Documentation/keys-request-keys.txt> for more information about the |
| 51 | request-key function. | 51 | request-key function. |
| 52 | 52 | ||
| 53 | 53 | ||
| 54 | ========== | 54 | ========= |
| 55 | nfs.upcall | 55 | nfs.idmap |
| 56 | ========== | 56 | ========= |
| 57 | nfs.upcall is designed to be called by request-key, and should not be run "by | 57 | nfs.idmap is designed to be called by request-key, and should not be run "by |
| 58 | hand". This program takes two arguments, a serialized key and a key | 58 | hand". This program takes two arguments, a serialized key and a key |
| 59 | description. The serialized key is first converted into a key_serial_t, and | 59 | description. The serialized key is first converted into a key_serial_t, and |
| 60 | then passed as an argument to keyctl_instantiate (both are part of keyutils.h). | 60 | then passed as an argument to keyctl_instantiate (both are part of keyutils.h). |
| 61 | 61 | ||
| 62 | The actual lookups are performed by functions found in nfsidmap.h. nfs.upcall | 62 | The actual lookups are performed by functions found in nfsidmap.h. nfs.idmap |
| 63 | determines the correct function to call by looking at the first part of the | 63 | determines the correct function to call by looking at the first part of the |
| 64 | description string. For example, a uid lookup description will appear as | 64 | description string. For example, a uid lookup description will appear as |
| 65 | "uid:user@domain". | 65 | "uid:user@domain". |
| 66 | 66 | ||
| 67 | nfs.upcall will return 0 if the key was instantiated, and non-zero otherwise. | 67 | nfs.idmap will return 0 if the key was instantiated, and non-zero otherwise. |
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt new file mode 100644 index 000000000000..983e14abe7e9 --- /dev/null +++ b/Documentation/filesystems/nfs/pnfs.txt | |||
| @@ -0,0 +1,55 @@ | |||
| 1 | Reference counting in pnfs: | ||
| 2 | ========================== | ||
| 3 | |||
| 4 | There are several inter-related caches. We have layouts which can | ||
| 5 | reference multiple devices, each of which can reference multiple data servers. | ||
| 6 | Each data server can be referenced by multiple devices. Each device | ||
| 7 | can be referenced by multiple layouts. To keep all of this straight, | ||
| 8 | we need to reference count. | ||
| 9 | |||
| 10 | |||
| 11 | struct pnfs_layout_hdr | ||
| 12 | ---------------------- | ||
| 13 | The on-the-wire command LAYOUTGET corresponds to struct | ||
| 14 | pnfs_layout_segment, usually referred to by the variable name lseg. | ||
| 15 | Each nfs_inode may hold a pointer to a cache of these layout | ||
| 16 | segments in nfsi->layout, of type struct pnfs_layout_hdr. | ||
| 17 | |||
| 18 | We reference the header for the inode pointing to it, across each | ||
| 19 | outstanding RPC call that references it (LAYOUTGET, LAYOUTRETURN, | ||
| 20 | LAYOUTCOMMIT), and for each lseg held within. | ||
| 21 | |||
| 22 | Each header is also (when non-empty) put on a list associated with | ||
| 23 | struct nfs_client (cl_layouts). Being put on this list does not bump | ||
| 24 | the reference count, as the layout is kept around by the lseg that | ||
| 25 | keeps it in the list. | ||
| 26 | |||
| 27 | deviceid_cache | ||
| 28 | -------------- | ||
| 29 | lsegs reference device ids, which are resolved per nfs_client and | ||
| 30 | layout driver type. The device ids are held in an RCU cache (struct | ||
| 31 | nfs4_deviceid_cache). The cache itself is referenced across each | ||
| 32 | mount. The entries (struct nfs4_deviceid) themselves are held across | ||
| 33 | the lifetime of each lseg referencing them. | ||
| 34 | |||
| 35 | RCU is used because the deviceid is basically a write once, read many | ||
| 36 | data structure. The hlist size of 32 buckets needs better | ||
| 37 | justification, but seems reasonable given that we can have multiple | ||
| 38 | deviceid's per filesystem, and multiple filesystems per nfs_client. | ||
| 39 | |||
| 40 | The hash code is copied from the nfsd code base. A discussion of | ||
| 41 | hashing and variations of this algorithm can be found at: | ||
| 42 | http://groups.google.com/group/comp.lang.c/browse_thread/thread/9522965e2b8d3809 | ||
| 43 | |||
| 44 | data server cache | ||
| 45 | ----------------- | ||
| 46 | file driver devices refer to data servers, which are kept in a module | ||
| 47 | level cache. Its reference is held over the lifetime of the deviceid | ||
| 48 | pointing to it. | ||
| 49 | |||
| 50 | lseg | ||
| 51 | ---- | ||
| 52 | lseg maintains an extra reference corresponding to the NFS_LSEG_VALID | ||
| 53 | bit which holds it in the pnfs_layout_hdr's list. When the final lseg | ||
| 54 | is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED | ||
| 55 | bit is set, preventing any new lsegs from being added. | ||
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index ac2a261c5f7d..791af8dac065 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt | |||
| @@ -350,7 +350,7 @@ Note the "Should sync?" parameter "nosync" means that the two mirrors are | |||
| 350 | already in sync which will be the case on a clean shutdown of Windows. If the | 350 | already in sync which will be the case on a clean shutdown of Windows. If the |
| 351 | mirrors are not clean, you can specify the "sync" option instead of "nosync" | 351 | mirrors are not clean, you can specify the "sync" option instead of "nosync" |
| 352 | and the Device-Mapper driver will then copy the entirety of the "Source Device" | 352 | and the Device-Mapper driver will then copy the entirety of the "Source Device" |
| 353 | to the "Target Device" or if you specified multipled target devices to all of | 353 | to the "Target Device" or if you specified multiple target devices to all of |
| 354 | them. | 354 | them. |
| 355 | 355 | ||
| 356 | Once you have your table, save it in a file somewhere (e.g. /etc/ntfsvolume1), | 356 | Once you have your table, save it in a file somewhere (e.g. /etc/ntfsvolume1), |
| @@ -457,6 +457,11 @@ ChangeLog | |||
| 457 | 457 | ||
| 458 | Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. | 458 | Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. |
| 459 | 459 | ||
| 460 | 2.1.30: | ||
| 461 | - Fix writev() (it kept writing the first segment over and over again | ||
| 462 | instead of moving onto subsequent segments). | ||
| 463 | - Fix crash in ntfs_mft_record_alloc() when mapping the new extent mft | ||
| 464 | record failed. | ||
| 460 | 2.1.29: | 465 | 2.1.29: |
| 461 | - Fix a deadlock when mounting read-write. | 466 | - Fix a deadlock when mounting read-write. |
| 462 | 2.1.28: | 467 | 2.1.28: |
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 5393e6611691..9ed920a8cd79 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt | |||
| @@ -80,7 +80,7 @@ user_xattr (*) Enables Extended User Attributes. | |||
| 80 | nouser_xattr Disables Extended User Attributes. | 80 | nouser_xattr Disables Extended User Attributes. |
| 81 | acl Enables POSIX Access Control Lists support. | 81 | acl Enables POSIX Access Control Lists support. |
| 82 | noacl (*) Disables POSIX Access Control Lists support. | 82 | noacl (*) Disables POSIX Access Control Lists support. |
| 83 | resv_level=2 (*) Set how agressive allocation reservations will be. | 83 | resv_level=2 (*) Set how aggressive allocation reservations will be. |
| 84 | Valid values are between 0 (reservations off) to 8 | 84 | Valid values are between 0 (reservations off) to 8 |
| 85 | (maximum space for reservations). | 85 | (maximum space for reservations). |
| 86 | dir_resv_level= (*) By default, directory reservations will scale with file | 86 | dir_resv_level= (*) By default, directory reservations will scale with file |
diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt new file mode 100644 index 000000000000..3571667c7105 --- /dev/null +++ b/Documentation/filesystems/path-lookup.txt | |||
| @@ -0,0 +1,382 @@ | |||
| 1 | Path walking and name lookup locking | ||
| 2 | ==================================== | ||
| 3 | |||
| 4 | Path resolution is the finding of a dentry corresponding to a path name string, by | ||
| 5 | performing a path walk. Typically, for every open(), stat() etc., the path name | ||
| 6 | will be resolved. Paths are resolved by walking the namespace tree, starting | ||
| 7 | with the first component of the pathname (eg. root or cwd) with a known dentry, | ||
| 8 | then finding the child of that dentry, which is named the next component in the | ||
| 9 | path string. Then repeating the lookup from the child dentry and finding its | ||
| 10 | child with the next element, and so on. | ||
| 11 | |||
| 12 | Since it is a frequent operation for workloads like multiuser environments and | ||
| 13 | web servers, it is important to optimize this code. | ||
| 14 | |||
| 15 | Path walking synchronisation history: | ||
| 16 | Prior to 2.5.10, dcache_lock was acquired in d_lookup (dcache hash lookup) and | ||
| 17 | thus in every component during path look-up. Since 2.5.10 onwards, fast-walk | ||
| 18 | algorithm changed this by holding the dcache_lock at the beginning and walking | ||
| 19 | as many cached path component dentries as possible. This significantly | ||
| 20 | decreases the number of acquisitions of dcache_lock. However it also increases | ||
| 21 | the lock hold time significantly and affects performance in large SMP machines. | ||
| 22 | Since 2.5.62 kernel, dcache has been using a new locking model that uses RCU to | ||
| 23 | make dcache look-up lock-free. | ||
| 24 | |||
| 25 | All the above algorithms required taking a lock and reference count on the | ||
| 26 | dentry that was looked up, so that it may be used as the basis for walking the | ||
| 27 | next path element. This is inefficient and unscalable. It is inefficient | ||
| 28 | because the locks and atomic operations required for every dentry element | ||
| 29 | slow things down. It is not scalable because many parallel applications that | ||
| 30 | are path-walk intensive tend to do path lookups starting from a common dentry | ||
| 31 | (usually, the root "/" or current working directory). So contention on these | ||
| 32 | common path elements causes lock and cacheline queueing. | ||
| 33 | |||
| 34 | Since 2.6.38, RCU is used to make a significant part of the entire path walk | ||
| 35 | (including dcache look-up) completely "store-free" (so, no locks, atomics, or | ||
| 36 | even stores into cachelines of common dentries). This is known as "rcu-walk" | ||
| 37 | path walking. | ||
| 38 | |||
| 39 | Path walking overview | ||
| 40 | ===================== | ||
| 41 | |||
| 42 | A name string specifies a start (root directory, cwd, fd-relative) and a | ||
| 43 | sequence of elements (directory entry names), which together refer to a path in | ||
| 44 | the namespace. A path is represented as a (dentry, vfsmount) tuple. The name | ||
| 45 | elements are sub-strings, separated by '/'. | ||
| 46 | |||
| 47 | Name lookups will want to find a particular path that a name string refers to | ||
| 48 | (usually the final element, or parent of final element). This is done by taking | ||
| 49 | the path given by the name's starting point (which we know in advance -- eg. | ||
| 50 | current->fs->cwd or current->fs->root) as the first parent of the lookup. Then | ||
| 51 | iteratively for each subsequent name element, look up the child of the current | ||
| 52 | parent with the given name and if it is not the desired entry, make it the | ||
| 53 | parent for the next lookup. | ||
| 54 | |||
| 55 | A parent, of course, must be a directory, and we must have appropriate | ||
| 56 | permissions on the parent inode to be able to walk into it. | ||
| 57 | |||
| 58 | Turning the child into a parent for the next lookup requires more checks and | ||
| 59 | procedures. Symlinks essentially substitute the symlink name for the target | ||
| 60 | name in the name string, and require some recursive path walking. Mount points | ||
| 61 | must be followed into (thus changing the vfsmount that subsequent path elements | ||
| 62 | refer to), switching from the mount point path to the root of the particular | ||
| 63 | mounted vfsmount. These behaviours are variously modified depending on the | ||
| 64 | exact path walking flags. | ||
| 65 | |||
| 66 | Path walking then must, broadly, do several particular things: | ||
| 67 | - find the start point of the walk; | ||
| 68 | - perform permissions and validity checks on inodes; | ||
| 69 | - perform dcache hash name lookups on (parent, name element) tuples; | ||
| 70 | - traverse mount points; | ||
| 71 | - traverse symlinks; | ||
| 72 | - lookup and create missing parts of the path on demand. | ||
| 73 | |||
| 74 | Safe store-free look-up of dcache hash table | ||
| 75 | ============================================ | ||
| 76 | |||
| 77 | Dcache name lookup | ||
| 78 | ------------------ | ||
| 79 | In order to lookup a dcache (parent, name) tuple, we take a hash on the tuple | ||
| 80 | and use that to select a bucket in the dcache-hash table. The list of entries | ||
| 81 | in that bucket is then walked, and we do a full comparison of each entry | ||
| 82 | against our (parent, name) tuple. | ||
| 83 | |||
| 84 | The hash lists are RCU protected, so list walking is not serialised with | ||
| 85 | concurrent updates (insertion, deletion from the hash). This is a standard RCU | ||
| 86 | list application with the exception of renames, which will be covered below. | ||
| 87 | |||
| 88 | Parent and name members of a dentry, as well as its membership in the dcache | ||
| 89 | hash, and its inode are protected by the per-dentry d_lock spinlock. A | ||
| 90 | reference is taken on the dentry (while the fields are verified under d_lock), | ||
| 91 | and this stabilises its d_inode pointer and actual inode. This gives a stable | ||
| 92 | point to perform the next step of our path walk against. | ||
| 93 | |||
| 94 | These members are also protected by d_seq seqlock, although this offers | ||
| 95 | read-only protection and no durability of results, so care must be taken when | ||
| 96 | using d_seq for synchronisation (see seqcount based lookups, below). | ||
| 97 | |||
| 98 | Renames | ||
| 99 | ------- | ||
| 100 | Back to the rename case. In usual RCU protected lists, the only operations that | ||
| 101 | will happen to an object is insertion, and then eventually removal from the | ||
| 102 | list. The object will not be reused until an RCU grace period is complete. | ||
| 103 | This ensures the RCU list traversal primitives can run over the object without | ||
| 104 | problems (see RCU documentation for how this works). | ||
| 105 | |||
| 106 | However when a dentry is renamed, its hash value can change, requiring it to be | ||
| 107 | moved to a new hash list. Allocating and inserting a new alias would be | ||
| 108 | expensive and also problematic for directory dentries. Latency would be far too | ||
| 109 | high to wait for a grace period after removing the dentry and before inserting | ||
| 110 | it in the new hash bucket. So what is done is to insert the dentry into the | ||
| 111 | new list immediately. | ||
| 112 | |||
| 113 | However, when the dentry's list pointers are updated to point to objects in the | ||
| 114 | new list before waiting for a grace period, this can result in a concurrent RCU | ||
| 115 | lookup of the old list veering off into the new (incorrect) list and missing | ||
| 116 | the remaining dentries on the list. | ||
| 117 | |||
| 118 | There is no fundamental problem with walking down the wrong list, because the | ||
| 119 | dentry comparisons will never match. However it is fatal to miss a matching | ||
| 120 | dentry. So a seqlock is used to detect when a rename has occurred, and so the | ||
| 121 | lookup can be retried. | ||
| 122 | |||
| 123 | 1 2 3 | ||
| 124 | +---+ +---+ +---+ | ||
| 125 | hlist-->| N-+->| N-+->| N-+-> | ||
| 126 | head <--+-P |<-+-P |<-+-P | | ||
| 127 | +---+ +---+ +---+ | ||
| 128 | |||
| 129 | Rename of dentry 2 may require it to be deleted from the above list, and inserted | ||
| 130 | into a new list. Deleting 2 gives the following list. | ||
| 131 | |||
| 132 | 1 3 | ||
| 133 | +---+ +---+ (don't worry, the longer pointers do not | ||
| 134 | hlist-->| N-+-------->| N-+-> impose a measurable performance overhead | ||
| 135 | head <--+-P |<--------+-P | on modern CPUs) | ||
| 136 | +---+ +---+ | ||
| 137 | ^ 2 ^ | ||
| 138 | | +---+ | | ||
| 139 | | | N-+----+ | ||
| 140 | +----+-P | | ||
| 141 | +---+ | ||
| 142 | |||
| 143 | This is a standard RCU-list deletion, which leaves the deleted object's | ||
| 144 | pointers intact, so a concurrent list walker that is currently looking at | ||
| 145 | object 2 will correctly continue to object 3 when it is time to traverse the | ||
| 146 | next object. | ||
| 147 | |||
| 148 | However, when inserting object 2 onto a new list, we end up with this: | ||
| 149 | |||
| 150 | 1 3 | ||
| 151 | +---+ +---+ | ||
| 152 | hlist-->| N-+-------->| N-+-> | ||
| 153 | head <--+-P |<--------+-P | | ||
| 154 | +---+ +---+ | ||
| 155 | 2 | ||
| 156 | +---+ | ||
| 157 | | N-+----> | ||
| 158 | <----+-P | | ||
| 159 | +---+ | ||
| 160 | |||
| 161 | Because we didn't wait for a grace period, there may be a concurrent lookup | ||
| 162 | still at 2. Now when it follows 2's 'next' pointer, it will walk off into | ||
| 163 | another list without ever having checked object 3. | ||
| 164 | |||
| 165 | A related, but distinctly different, issue is that of rename atomicity versus | ||
| 166 | lookup operations. If a file is renamed from 'A' to 'B', a lookup must only | ||
| 167 | find either 'A' or 'B'. So if a lookup of 'A' returns NULL, a subsequent lookup | ||
| 168 | of 'B' must succeed (note the reverse is not true). | ||
| 169 | |||
| 170 | Between deleting the dentry from the old hash list, and inserting it on the new | ||
| 171 | hash list, a lookup may find neither 'A' nor 'B' matching the dentry. The same | ||
| 172 | rename seqlock is also used to cover this race in much the same way, by | ||
| 173 | retrying a negative lookup result if a rename was in progress. | ||
| 174 | |||
| 175 | Seqcount based lookups | ||
| 176 | ---------------------- | ||
| 177 | In refcount based dcache lookups, d_lock is used to serialise access to | ||
| 178 | the dentry, stabilising it while comparing its name and parent and then | ||
| 179 | taking a reference count (the reference count then gives a stable place to | ||
| 180 | start the next part of the path walk from). | ||
| 181 | |||
| 182 | As explained above, we would like to do path walking without taking locks or | ||
| 183 | reference counts on intermediate dentries along the path. To do this, a per | ||
| 184 | dentry seqlock (d_seq) is used to take a "coherent snapshot" of what the dentry | ||
| 185 | looks like (its name, parent, and inode). That snapshot is then used to start | ||
| 186 | the next part of the path walk. When loading the coherent snapshot under d_seq, | ||
| 187 | care must be taken to load the members up-front, and use those pointers rather | ||
| 188 | than reloading from the dentry later on (otherwise we'd have interesting things | ||
| 189 | like d_inode going NULL underneath us, if the name was unlinked). | ||
| 190 | |||
| 191 | Also important is to avoid performing any destructive operations (pretty much: | ||
| 192 | no non-atomic stores to shared data), and to recheck the seqcount when we are | ||
| 193 | "done" with the operation. Retry or abort if the seqcount does not match. | ||
| 194 | Avoiding destructive or changing operations means we can easily unwind from | ||
| 195 | failure. | ||
| 196 | |||
| 197 | What this means is that a caller, provided they are holding RCU lock to | ||
| 198 | protect the dentry object from disappearing, can perform a seqcount based | ||
| 199 | lookup which does not increment the refcount on the dentry or write to | ||
| 200 | it in any way. This returned dentry can be used for subsequent operations, | ||
| 201 | provided that d_seq is rechecked after that operation is complete. | ||
| 202 | |||
| 203 | Inodes are also rcu freed, so the seqcount lookup dentry's inode may also be | ||
| 204 | queried for permissions. | ||
| 205 | |||
| 206 | With these two parts of the puzzle, we can do path lookups without taking | ||
| 207 | locks or refcounts on dentry elements. | ||
| 208 | |||
| 209 | RCU-walk path walking design | ||
| 210 | ============================ | ||
| 211 | |||
| 212 | Path walking code now has two distinct modes, ref-walk and rcu-walk. ref-walk | ||
| 213 | is the traditional[*] way of performing dcache lookups using d_lock to | ||
| 214 | serialise concurrent modifications to the dentry and take a reference count on | ||
| 215 | it. ref-walk is simple and obvious, and may sleep, take locks, etc while path | ||
| 216 | walking is operating on each dentry. rcu-walk uses seqcount based dentry | ||
| 217 | lookups, and can perform lookup of intermediate elements without any stores to | ||
| 218 | shared data in the dentry or inode. rcu-walk can not be applied to all cases, | ||
| 219 | eg. if the filesystem must sleep or perform non trivial operations, rcu-walk | ||
| 220 | must be switched to ref-walk mode. | ||
| 221 | |||
| 222 | [*] RCU is still used for the dentry hash lookup in ref-walk, but not the full | ||
| 223 | path walk. | ||
| 224 | |||
| 225 | Where ref-walk uses a stable, refcounted ``parent'' to walk the remaining | ||
| 226 | path string, rcu-walk uses a d_seq protected snapshot. When looking up a | ||
| 227 | child of this parent snapshot, we open d_seq critical section on the child | ||
| 228 | before closing d_seq critical section on the parent. This gives an interlocking | ||
| 229 | ladder of snapshots to walk down. | ||
| 230 | |||
| 231 | |||
| 232 | proc 101 | ||
| 233 | /----------------\ | ||
| 234 | / comm: "vi" \ | ||
| 235 | / fs.root: dentry0 \ | ||
| 236 | \ fs.cwd: dentry2 / | ||
| 237 | \ / | ||
| 238 | \----------------/ | ||
| 239 | |||
| 240 | So when vi wants to open("/home/npiggin/test.c", O_RDWR), then it will | ||
| 241 | start from current->fs->root, which is a pinned dentry. Alternatively, | ||
| 242 | "./test.c" would start from cwd; both names refer to the same path in | ||
| 243 | the context of proc101. | ||
| 244 | |||
| 245 | dentry 0 | ||
| 246 | +---------------------+ rcu-walk begins here, we note d_seq, check the | ||
| 247 | | name: "/" | inode's permission, and then look up the next | ||
| 248 | | inode: 10 | path element which is "home"... | ||
| 249 | | children:"home", ...| | ||
| 250 | +---------------------+ | ||
| 251 | | | ||
| 252 | dentry 1 V | ||
| 253 | +---------------------+ ... which brings us here. We find dentry1 via | ||
| 254 | | name: "home" | hash lookup, then note d_seq and compare name | ||
| 255 | | inode: 678 | string and parent pointer. When we have a match, | ||
| 256 | | children:"npiggin" | we now recheck the d_seq of dentry0. Then we | ||
| 257 | +---------------------+ check inode and look up the next element. | ||
| 258 | | | ||
| 259 | dentry2 V | ||
| 260 | +---------------------+ Note: if dentry0 is now modified, lookup is | ||
| 261 | | name: "npiggin" | not necessarily invalid, so we need only keep a | ||
| 262 | | inode: 543 | parent for d_seq verification, and grandparents | ||
| 263 | | children:"a.c", ... | can be forgotten. | ||
| 264 | +---------------------+ | ||
| 265 | | | ||
| 266 | dentry3 V | ||
| 267 | +---------------------+ At this point we have our destination dentry. | ||
| 268 | | name: "a.c" | We now take its d_lock, verify d_seq of this | ||
| 269 | | inode: 14221 | dentry. If that checks out, we can increment | ||
| 270 | | children:NULL | its refcount because we're holding d_lock. | ||
| 271 | +---------------------+ | ||
| 272 | |||
| 273 | Taking a refcount on a dentry from rcu-walk mode, by taking its d_lock, | ||
| 274 | re-checking its d_seq, and then incrementing its refcount is called | ||
| 275 | "dropping rcu" or dropping from rcu-walk into ref-walk mode. | ||
| 276 | |||
| 277 | It is, in some sense, a bit of a house of cards. If the seqcount check of the | ||
| 278 | parent snapshot fails, the house comes down, because we had closed the d_seq | ||
| 279 | section on the grandparent, so we have nothing left to stand on. In that case, | ||
| 280 | the path walk must be fully restarted (which we do in ref-walk mode, to avoid | ||
| 281 | live locks). It is costly to have a full restart, but fortunately they are | ||
| 282 | quite rare. | ||
| 283 | |||
| 284 | When we reach a point where sleeping is required, or a filesystem callout | ||
| 285 | requires ref-walk, then instead of restarting the walk, we attempt to drop rcu | ||
| 286 | at the last known good dentry we have. Avoiding a full restart in ref-walk in | ||
| 287 | these cases is fundamental for performance and scalability because blocking | ||
| 288 | operations such as creates and unlinks are not uncommon. | ||
| 289 | |||
| 290 | The detailed design for rcu-walk is like this: | ||
| 291 | * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. | ||
| 292 | * Take the RCU lock for the entire path walk, starting with the acquiring | ||
| 293 | of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are | ||
| 294 | not required for dentry persistence. | ||
| 295 | * synchronize_rcu is called when unregistering a filesystem, so we can | ||
| 296 | access d_ops and i_ops during rcu-walk. | ||
| 297 | * Similarly take the vfsmount lock for the entire path walk. So now mnt | ||
| 298 | refcounts are not required for persistence. Also we are free to perform mount | ||
| 299 | lookups, and to assume dentry mount points and mount roots are stable up and | ||
| 300 | down the path. | ||
| 301 | * Have a per-dentry seqlock to protect the dentry name, parent, and inode, | ||
| 302 | so we can load this tuple atomically, and also check whether any of its | ||
| 303 | members have changed. | ||
| 304 | * Dentry lookups (based on parent, candidate string tuple) recheck the parent | ||
| 305 | sequence after the child is found in case anything changed in the parent | ||
| 306 | during the path walk. | ||
| 307 | * inode is also RCU protected so we can load d_inode and use the inode for | ||
| 308 | limited things. | ||
| 309 | * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. | ||
| 310 | * i_op can be loaded. | ||
| 311 | * When the destination dentry is reached, drop rcu there (ie. take d_lock, | ||
| 312 | verify d_seq, increment refcount). | ||
| 313 | * If seqlock verification fails anywhere along the path, do a full restart | ||
| 314 | of the path lookup in ref-walk mode. -ECHILD tends to be used (for want of | ||
| 315 | a better errno) to signal an rcu-walk failure. | ||
| 316 | |||
| 317 | The cases where rcu-walk cannot continue are: | ||
| 318 | * NULL dentry (ie. any uncached path element) | ||
| 319 | * Following links | ||
| 320 | |||
| 321 | It may be possible eventually to make following links rcu-walk aware. | ||
| 322 | |||
| 323 | Uncached path elements will always require dropping to ref-walk mode, at the | ||
| 324 | very least because i_mutex needs to be grabbed, and objects allocated. | ||
| 325 | |||
| 326 | Final note: | ||
| 327 | "store-free" path walking is not strictly store free. We take vfsmount lock | ||
| 328 | and refcounts (both of which can be made per-cpu), and we also store to the | ||
| 329 | stack (which is essentially CPU-local), and we also have to take locks and | ||
| 330 | refcount on final dentry. | ||
| 331 | |||
| 332 | The point is that shared data, where practically possible, is not locked | ||
| 333 | or stored into. The result is massive improvements in performance and | ||
| 334 | scalability of path resolution. | ||
| 335 | |||
| 336 | |||
| 337 | Interesting statistics | ||
| 338 | ====================== | ||
| 339 | |||
| 340 | The following table gives rcu lookup statistics for a few simple workloads | ||
| 341 | (2s12c24t Westmere, debian non-graphical system). Ungraceful are attempts to | ||
| 342 | drop rcu that fail due to d_seq failure and require the entire path lookup to | ||
| 343 | be redone. Other cases are successful rcu-drops that are required before the final | ||
| 344 | element, nodentry for missing dentry, revalidate for filesystem revalidate | ||
| 345 | routine requiring rcu drop, permission for permission check requiring drop, | ||
| 346 | and link for symlink traversal requiring drop. | ||
| 347 | |||
| 348 | rcu-lookups restart nodentry link revalidate permission | ||
| 349 | bootup 47121 0 4624 1010 10283 7852 | ||
| 350 | dbench 25386793 0 6778659(26.7%) 55 549 1156 | ||
| 351 | kbuild 2696672 10 64442(2.3%) 108764(4.0%) 1 1590 | ||
| 352 | git diff 39605 0 28 2 0 106 | ||
| 353 | vfstest 24185492 4945 708725(2.9%) 1076136(4.4%) 0 2651 | ||
| 354 | |||
| 355 | What this shows is that failed rcu-walk lookups, ie. ones that are restarted | ||
| 356 | entirely with ref-walk, are quite rare. Even the "vfstest" case which | ||
| 357 | specifically has concurrent renames/mkdir/rmdir/creat/unlink/etc to exercise | ||
| 358 | such races is not showing a huge amount of restarts. | ||
| 359 | |||
| 360 | Dropping from rcu-walk to ref-walk means that we have encountered a dentry where | ||
| 361 | the reference count needs to be taken for some reason. This is either because | ||
| 362 | we have reached the target of the path walk, or because we have encountered a | ||
| 363 | condition that can't be resolved in rcu-walk mode. Ideally, we drop rcu-walk | ||
| 364 | only when we have reached the target dentry, so the other statistics show where | ||
| 365 | this does not happen. | ||
| 366 | |||
| 367 | Note that a graceful drop from rcu-walk mode due to something such as the | ||
| 368 | dentry not existing (which can be common) is not necessarily a failure of | ||
| 369 | rcu-walk scheme, because some elements of the path may have been walked in | ||
| 370 | rcu-walk mode. The further we get from common path elements (such as cwd or | ||
| 371 | root), the less contended the dentry is likely to be. The closer we are to | ||
| 372 | common path elements, the more likely they will exist in dentry cache. | ||
| 373 | |||
| 374 | |||
| 375 | Papers and other documentation on dcache locking | ||
| 376 | ================================================ | ||
| 377 | |||
| 378 | 1. Scaling dcache with RCU (http://linuxjournal.com/article.php?sid=7124). | ||
| 379 | |||
| 380 | 2. http://lse.sourceforge.net/locking/dcache/dcache.html | ||
| 381 | |||
| 382 | |||
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt index 40ea6c295afb..65e03dd44823 100644 --- a/Documentation/filesystems/pohmelfs/network_protocol.txt +++ b/Documentation/filesystems/pohmelfs/network_protocol.txt | |||
| @@ -20,7 +20,7 @@ Commands can be embedded into transaction command (which in turn has own command | |||
| 20 | so one can extend protocol as needed without breaking backward compatibility as long | 20 | so one can extend protocol as needed without breaking backward compatibility as long |
| 21 | as old commands are supported. All string lengths include tail 0 byte. | 21 | as old commands are supported. All string lengths include tail 0 byte. |
| 22 | 22 | ||
| 23 | All commans are transfered over the network in big-endian. CPU endianess is used at the end peers. | 23 | All commands are transferred over the network in big-endian. CPU endianess is used at the end peers. |
| 24 | 24 | ||
| 25 | @cmd - command number, which specifies command to be processed. Following | 25 | @cmd - command number, which specifies command to be processed. Following |
| 26 | commands are used currently: | 26 | commands are used currently: |
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index b12c89538680..6e29954851a2 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting | |||
| @@ -216,7 +216,6 @@ had ->revalidate()) add calls in ->follow_link()/->readlink(). | |||
| 216 | ->d_parent changes are not protected by BKL anymore. Read access is safe | 216 | ->d_parent changes are not protected by BKL anymore. Read access is safe |
| 217 | if at least one of the following is true: | 217 | if at least one of the following is true: |
| 218 | * filesystem has no cross-directory rename() | 218 | * filesystem has no cross-directory rename() |
| 219 | * dcache_lock is held | ||
| 220 | * we know that parent had been locked (e.g. we are looking at | 219 | * we know that parent had been locked (e.g. we are looking at |
| 221 | ->d_parent of ->lookup() argument). | 220 | ->d_parent of ->lookup() argument). |
| 222 | * we are called from ->rename(). | 221 | * we are called from ->rename(). |
| @@ -299,11 +298,14 @@ be used instead. It gets called whenever the inode is evicted, whether it has | |||
| 299 | remaining links or not. Caller does *not* evict the pagecache or inode-associated | 298 | remaining links or not. Caller does *not* evict the pagecache or inode-associated |
| 300 | metadata buffers; getting rid of those is responsibility of method, as it had | 299 | metadata buffers; getting rid of those is responsibility of method, as it had |
| 301 | been for ->delete_inode(). | 300 | been for ->delete_inode(). |
| 302 | ->drop_inode() returns int now; it's called on final iput() with inode_lock | 301 | |
| 303 | held and it returns true if filesystems wants the inode to be dropped. As before, | 302 | ->drop_inode() returns int now; it's called on final iput() with |
| 304 | generic_drop_inode() is still the default and it's been updated appropriately. | 303 | inode->i_lock held and it returns true if filesystems wants the inode to be |
| 305 | generic_delete_inode() is also alive and it consists simply of return 1. Note that | 304 | dropped. As before, generic_drop_inode() is still the default and it's been |
| 306 | all actual eviction work is done by caller after ->drop_inode() returns. | 305 | updated appropriately. generic_delete_inode() is also alive and it consists |
| 306 | simply of return 1. Note that all actual eviction work is done by caller after | ||
| 307 | ->drop_inode() returns. | ||
| 308 | |||
| 307 | clear_inode() is gone; use end_writeback() instead. As before, it must | 309 | clear_inode() is gone; use end_writeback() instead. As before, it must |
| 308 | be called exactly once on each call of ->evict_inode() (as it used to be for | 310 | be called exactly once on each call of ->evict_inode() (as it used to be for |
| 309 | each call of ->delete_inode()). Unlike before, if you are using inode-associated | 311 | each call of ->delete_inode()). Unlike before, if you are using inode-associated |
| @@ -318,3 +320,90 @@ if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput( | |||
| 318 | may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly | 320 | may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly |
| 319 | free the on-disk inode, you may end up doing that while ->write_inode() is writing | 321 | free the on-disk inode, you may end up doing that while ->write_inode() is writing |
| 320 | to it. | 322 | to it. |
| 323 | |||
| 324 | --- | ||
| 325 | [mandatory] | ||
| 326 | |||
| 327 | .d_delete() now only advises the dcache as to whether or not to cache | ||
| 328 | unreferenced dentries, and is now only called when the dentry refcount goes to | ||
| 329 | 0. Even on 0 refcount transition, it must be able to tolerate being called 0, | ||
| 330 | 1, or more times (eg. constant, idempotent). | ||
| 331 | |||
| 332 | --- | ||
| 333 | [mandatory] | ||
| 334 | |||
| 335 | .d_compare() calling convention and locking rules are significantly | ||
| 336 | changed. Read updated documentation in Documentation/filesystems/vfs.txt (and | ||
| 337 | look at examples of other filesystems) for guidance. | ||
| 338 | |||
| 339 | --- | ||
| 340 | [mandatory] | ||
| 341 | |||
| 342 | .d_hash() calling convention and locking rules are significantly | ||
| 343 | changed. Read updated documentation in Documentation/filesystems/vfs.txt (and | ||
| 344 | look at examples of other filesystems) for guidance. | ||
| 345 | |||
| 346 | --- | ||
| 347 | [mandatory] | ||
| 348 | dcache_lock is gone, replaced by fine grained locks. See fs/dcache.c | ||
| 349 | for details of what locks to replace dcache_lock with in order to protect | ||
| 350 | particular things. Most of the time, a filesystem only needs ->d_lock, which | ||
| 351 | protects *all* the dcache state of a given dentry. | ||
| 352 | |||
| 353 | -- | ||
| 354 | [mandatory] | ||
| 355 | |||
| 356 | Filesystems must RCU-free their inodes, if they can have been accessed | ||
| 357 | via rcu-walk path walk (basically, if the file can have had a path name in the | ||
| 358 | vfs namespace). | ||
| 359 | |||
| 360 | i_dentry and i_rcu share storage in a union, and the vfs expects | ||
| 361 | i_dentry to be reinitialized before it is freed, so an: | ||
| 362 | |||
| 363 | INIT_LIST_HEAD(&inode->i_dentry); | ||
| 364 | |||
| 365 | must be done in the RCU callback. | ||
| 366 | |||
| 367 | -- | ||
| 368 | [recommended] | ||
| 369 | vfs now tries to do path walking in "rcu-walk mode", which avoids | ||
| 370 | atomic operations and scalability hazards on dentries and inodes (see | ||
| 371 | Documentation/filesystems/path-lookup.txt). d_hash and d_compare changes | ||
| 372 | (above) are examples of the changes required to support this. For more complex | ||
| 373 | filesystem callbacks, the vfs drops out of rcu-walk mode before the fs call, so | ||
| 374 | no changes are required to the filesystem. However, this is costly and loses | ||
| 375 | the benefits of rcu-walk mode. We will begin to add filesystem callbacks that | ||
| 376 | are rcu-walk aware, shown below. Filesystems should take advantage of this | ||
| 377 | where possible. | ||
| 378 | |||
| 379 | -- | ||
| 380 | [mandatory] | ||
| 381 | d_revalidate is a callback that is made on every path element (if | ||
| 382 | the filesystem provides it), which requires dropping out of rcu-walk mode. This | ||
| 383 | may now be called in rcu-walk mode (nd->flags & LOOKUP_RCU). -ECHILD should be | ||
| 384 | returned if the filesystem cannot handle rcu-walk. See | ||
| 385 | Documentation/filesystems/vfs.txt for more details. | ||
| 386 | |||
| 387 | permission and check_acl are inode permission checks that are called | ||
| 388 | on many or all directory inodes on the way down a path walk (to check for | ||
| 389 | exec permission). These must now be rcu-walk aware (flags & IPERM_FLAG_RCU). | ||
| 390 | See Documentation/filesystems/vfs.txt for more details. | ||
| 391 | |||
| 392 | -- | ||
| 393 | [mandatory] | ||
| 394 | In ->fallocate() you must check the mode option passed in. If your | ||
| 395 | filesystem does not support hole punching (deallocating space in the middle of a | ||
| 396 | file) you must return -EOPNOTSUPP if FALLOC_FL_PUNCH_HOLE is set in mode. | ||
| 397 | Currently you can only have FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE set, | ||
| 398 | so the i_size should not change when hole punching, even when punching the end of | ||
| 399 | a file off. | ||
| 400 | |||
| 401 | -- | ||
| 402 | [mandatory] | ||
| 403 | |||
| 404 | -- | ||
| 405 | [mandatory] | ||
| 406 | ->get_sb() is gone. Switch to use of ->mount(). Typically it's just | ||
| 407 | a matter of switching from calling get_sb_... to mount_... and changing the | ||
| 408 | function type. If you were doing it manually, just switch from setting ->mnt_root | ||
| 409 | to some pointer to returning that pointer. On errors return ERR_PTR(...). | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a6aca8740883..60740e8ecb37 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
| @@ -136,6 +136,7 @@ Table 1-1: Process specific entries in /proc | |||
| 136 | statm Process memory status information | 136 | statm Process memory status information |
| 137 | status Process status in human readable form | 137 | status Process status in human readable form |
| 138 | wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan | 138 | wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan |
| 139 | pagemap Page table | ||
| 139 | stack Report full stack trace, enable via CONFIG_STACKTRACE | 140 | stack Report full stack trace, enable via CONFIG_STACKTRACE |
| 140 | smaps an extension based on maps, showing the memory consumption of | 141 | smaps an extension based on maps, showing the memory consumption of |
| 141 | each mapping | 142 | each mapping |
| @@ -370,17 +371,25 @@ Shared_Dirty: 0 kB | |||
| 370 | Private_Clean: 0 kB | 371 | Private_Clean: 0 kB |
| 371 | Private_Dirty: 0 kB | 372 | Private_Dirty: 0 kB |
| 372 | Referenced: 892 kB | 373 | Referenced: 892 kB |
| 374 | Anonymous: 0 kB | ||
| 373 | Swap: 0 kB | 375 | Swap: 0 kB |
| 374 | KernelPageSize: 4 kB | 376 | KernelPageSize: 4 kB |
| 375 | MMUPageSize: 4 kB | 377 | MMUPageSize: 4 kB |
| 376 | 378 | Locked: 374 kB | |
| 377 | The first of these lines shows the same information as is displayed for the | 379 | |
| 378 | mapping in /proc/PID/maps. The remaining lines show the size of the mapping, | 380 | The first of these lines shows the same information as is displayed for the |
| 379 | the amount of the mapping that is currently resident in RAM, the "proportional | 381 | mapping in /proc/PID/maps. The remaining lines show the size of the mapping |
| 380 | set size” (divide each shared page by the number of processes sharing it), the | 382 | (size), the amount of the mapping that is currently resident in RAM (RSS), the |
| 381 | number of clean and dirty shared pages in the mapping, and the number of clean | 383 | process' proportional share of this mapping (PSS), the number of clean and |
| 382 | and dirty private pages in the mapping. The "Referenced" indicates the amount | 384 | dirty private pages in the mapping. Note that even a page which is part of a |
| 383 | of memory currently marked as referenced or accessed. | 385 | MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used |
| 386 | by only one process, is accounted as private and not as shared. "Referenced" | ||
| 387 | indicates the amount of memory currently marked as referenced or accessed. | ||
| 388 | "Anonymous" shows the amount of memory that does not belong to any file. Even | ||
| 389 | a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE | ||
| 390 | and a page is modified, the file page is replaced by a private anonymous copy. | ||
| 391 | "Swap" shows how much would-be-anonymous memory is also used, but out on | ||
| 392 | swap. | ||
| 384 | 393 | ||
| 385 | This file is only present if the CONFIG_MMU kernel configuration option is | 394 | This file is only present if the CONFIG_MMU kernel configuration option is |
| 386 | enabled. | 395 | enabled. |
| @@ -397,6 +406,9 @@ To clear the bits for the file mapped pages associated with the process | |||
| 397 | > echo 3 > /proc/PID/clear_refs | 406 | > echo 3 > /proc/PID/clear_refs |
| 398 | Any other value written to /proc/PID/clear_refs will have no effect. | 407 | Any other value written to /proc/PID/clear_refs will have no effect. |
| 399 | 408 | ||
| 409 | The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags | ||
| 410 | using /proc/kpageflags and number of times a page is mapped using | ||
| 411 | /proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt. | ||
| 400 | 412 | ||
| 401 | 1.2 Kernel data | 413 | 1.2 Kernel data |
| 402 | --------------- | 414 | --------------- |
| @@ -531,7 +543,7 @@ just those considered 'most important'. The new vectors are: | |||
| 531 | their statistics are used by kernel developers and interested users to | 543 | their statistics are used by kernel developers and interested users to |
| 532 | determine the occurrence of interrupts of the given type. | 544 | determine the occurrence of interrupts of the given type. |
| 533 | 545 | ||
| 534 | The above IRQ vectors are displayed only when relevent. For example, | 546 | The above IRQ vectors are displayed only when relevant. For example, |
| 535 | the threshold vector does not exist on x86_64 platforms. Others are | 547 | the threshold vector does not exist on x86_64 platforms. Others are |
| 536 | suppressed when the system is a uniprocessor. As of this writing, only | 548 | suppressed when the system is a uniprocessor. As of this writing, only |
| 537 | i386 and x86_64 platforms support the new IRQ vector displays. | 549 | i386 and x86_64 platforms support the new IRQ vector displays. |
| @@ -659,6 +671,8 @@ varies by architecture and compile options. The following is from a | |||
| 659 | 671 | ||
| 660 | > cat /proc/meminfo | 672 | > cat /proc/meminfo |
| 661 | 673 | ||
| 674 | The "Locked" indicates whether the mapping is locked in memory or not. | ||
| 675 | |||
| 662 | 676 | ||
| 663 | MemTotal: 16344972 kB | 677 | MemTotal: 16344972 kB |
| 664 | MemFree: 13634064 kB | 678 | MemFree: 13634064 kB |
| @@ -822,7 +836,6 @@ Provides counts of softirq handlers serviced since boot time, for each cpu. | |||
| 822 | TASKLET: 0 0 0 290 | 836 | TASKLET: 0 0 0 290 |
| 823 | SCHED: 27035 26983 26971 26746 | 837 | SCHED: 27035 26983 26971 26746 |
| 824 | HRTIMER: 0 0 0 0 | 838 | HRTIMER: 0 0 0 0 |
| 825 | RCU: 1678 1769 2178 2250 | ||
| 826 | 839 | ||
| 827 | 840 | ||
| 828 | 1.3 IDE devices in /proc/ide | 841 | 1.3 IDE devices in /proc/ide |
| @@ -1170,6 +1183,30 @@ Table 1-12: Files in /proc/fs/ext4/<devname> | |||
| 1170 | mb_groups details of multiblock allocator buddy cache of free blocks | 1183 | mb_groups details of multiblock allocator buddy cache of free blocks |
| 1171 | .............................................................................. | 1184 | .............................................................................. |
| 1172 | 1185 | ||
| 1186 | 2.0 /proc/consoles | ||
| 1187 | ------------------ | ||
| 1188 | Shows registered system console lines. | ||
| 1189 | |||
| 1190 | To see which character device lines are currently used for the system console | ||
| 1191 | /dev/console, you may simply look into the file /proc/consoles: | ||
| 1192 | |||
| 1193 | > cat /proc/consoles | ||
| 1194 | tty0 -WU (ECp) 4:7 | ||
| 1195 | ttyS0 -W- (Ep) 4:64 | ||
| 1196 | |||
| 1197 | The columns are: | ||
| 1198 | |||
| 1199 | device name of the device | ||
| 1200 | operations R = can do read operations | ||
| 1201 | W = can do write operations | ||
| 1202 | U = can do unblank | ||
| 1203 | flags E = it is enabled | ||
| 1204 | C = it is preferred console | ||
| 1205 | B = it is primary boot console | ||
| 1206 | p = it is used for printk buffer | ||
| 1207 | b = it is not a TTY but a Braille device | ||
| 1208 | a = it is safe to use when cpu is offline | ||
| 1209 | major:minor major and minor number of the device separated by a colon | ||
| 1173 | 1210 | ||
| 1174 | ------------------------------------------------------------------------------ | 1211 | ------------------------------------------------------------------------------ |
| 1175 | Summary | 1212 | Summary |
| @@ -1285,11 +1322,15 @@ scaled linearly with /proc/<pid>/oom_score_adj. | |||
| 1285 | Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the | 1322 | Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the |
| 1286 | other with its scaled value. | 1323 | other with its scaled value. |
| 1287 | 1324 | ||
| 1325 | The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last | ||
| 1326 | value set by a CAP_SYS_RESOURCE process. To reduce the value any lower | ||
| 1327 | requires CAP_SYS_RESOURCE. | ||
| 1328 | |||
| 1288 | NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see | 1329 | NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see |
| 1289 | Documentation/feature-removal-schedule.txt. | 1330 | Documentation/feature-removal-schedule.txt. |
| 1290 | 1331 | ||
| 1291 | Caveat: when a parent task is selected, the oom killer will sacrifice any first | 1332 | Caveat: when a parent task is selected, the oom killer will sacrifice any first |
| 1292 | generation children with seperate address spaces instead, if possible. This | 1333 | generation children with separate address spaces instead, if possible. This |
| 1293 | avoids servers and important system daemons from being killed and loses the | 1334 | avoids servers and important system daemons from being killed and loses the |
| 1294 | minimal amount of work. | 1335 | minimal amount of work. |
| 1295 | 1336 | ||
diff --git a/Documentation/filesystems/romfs.txt b/Documentation/filesystems/romfs.txt index 2d2a7b2a16b9..e2b07cc9120a 100644 --- a/Documentation/filesystems/romfs.txt +++ b/Documentation/filesystems/romfs.txt | |||
| @@ -17,8 +17,7 @@ comparison, an actual rescue disk used up 3202 blocks with ext2, while | |||
| 17 | with romfs, it needed 3079 blocks. | 17 | with romfs, it needed 3079 blocks. |
| 18 | 18 | ||
| 19 | To create such a file system, you'll need a user program named | 19 | To create such a file system, you'll need a user program named |
| 20 | genromfs. It is available via anonymous ftp on sunsite.unc.edu and | 20 | genromfs. It is available on http://romfs.sourceforge.net/ |
| 21 | its mirrors, in the /pub/Linux/system/recovery/ directory. | ||
| 22 | 21 | ||
| 23 | As the name suggests, romfs could be also used (space-efficiently) on | 22 | As the name suggests, romfs could be also used (space-efficiently) on |
| 24 | various read-only media, like (E)EPROM disks if someone will have the | 23 | various read-only media, like (E)EPROM disks if someone will have the |
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index fc0e39af43c3..4ede421c9687 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt | |||
| @@ -62,10 +62,10 @@ replicas continue to be exactly same. | |||
| 62 | # mount /dev/sd0 /tmp/a | 62 | # mount /dev/sd0 /tmp/a |
| 63 | 63 | ||
| 64 | #ls /tmp/a | 64 | #ls /tmp/a |
| 65 | t1 t2 t2 | 65 | t1 t2 t3 |
| 66 | 66 | ||
| 67 | #ls /mnt/a | 67 | #ls /mnt/a |
| 68 | t1 t2 t2 | 68 | t1 t2 t3 |
| 69 | 69 | ||
| 70 | Note that the mount has propagated to the mount at /mnt as well. | 70 | Note that the mount has propagated to the mount at /mnt as well. |
| 71 | 71 | ||
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt deleted file mode 100644 index 194fb0decd2c..000000000000 --- a/Documentation/filesystems/smbfs.txt +++ /dev/null | |||
| @@ -1,8 +0,0 @@ | |||
| 1 | Smbfs is a filesystem that implements the SMB protocol, which is the | ||
| 2 | protocol used by Windows for Workgroups, Windows 95 and Windows NT. | ||
| 3 | Smbfs was inspired by Samba, the program written by Andrew Tridgell | ||
| 4 | that turns any Unix host into a file server for DOS or Windows clients. | ||
| 5 | |||
| 6 | Smbfs is a SMB client, but uses parts of samba for its operation. For | ||
| 7 | more info on samba, including documentation, please go to | ||
| 8 | http://www.samba.org/ and then on to your nearest mirror. | ||
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index 66699afd66ca..d4d41465a0b1 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt | |||
| @@ -59,12 +59,15 @@ obtained from this site also. | |||
| 59 | 3. SQUASHFS FILESYSTEM DESIGN | 59 | 3. SQUASHFS FILESYSTEM DESIGN |
| 60 | ----------------------------- | 60 | ----------------------------- |
| 61 | 61 | ||
| 62 | A squashfs filesystem consists of a maximum of eight parts, packed together on a byte | 62 | A squashfs filesystem consists of a maximum of nine parts, packed together on a |
| 63 | alignment: | 63 | byte alignment: |
| 64 | 64 | ||
| 65 | --------------- | 65 | --------------- |
| 66 | | superblock | | 66 | | superblock | |
| 67 | |---------------| | 67 | |---------------| |
| 68 | | compression | | ||
| 69 | | options | | ||
| 70 | |---------------| | ||
| 68 | | datablocks | | 71 | | datablocks | |
| 69 | | & fragments | | 72 | | & fragments | |
| 70 | |---------------| | 73 | |---------------| |
| @@ -91,7 +94,14 @@ the source directory, and checked for duplicates. Once all file data has been | |||
| 91 | written the completed inode, directory, fragment, export and uid/gid lookup | 94 | written the completed inode, directory, fragment, export and uid/gid lookup |
| 92 | tables are written. | 95 | tables are written. |
| 93 | 96 | ||
| 94 | 3.1 Inodes | 97 | 3.1 Compression options |
| 98 | ----------------------- | ||
| 99 | |||
| 100 | Compressors can optionally support compression specific options (e.g. | ||
| 101 | dictionary size). If non-default compression options have been used, then | ||
| 102 | these are stored here. | ||
| 103 | |||
| 104 | 3.2 Inodes | ||
| 95 | ---------- | 105 | ---------- |
| 96 | 106 | ||
| 97 | Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each | 107 | Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each |
| @@ -114,7 +124,7 @@ directory inode are defined: inodes optimised for frequently occurring | |||
| 114 | regular files and directories, and extended types where extra | 124 | regular files and directories, and extended types where extra |
| 115 | information has to be stored. | 125 | information has to be stored. |
| 116 | 126 | ||
| 117 | 3.2 Directories | 127 | 3.3 Directories |
| 118 | --------------- | 128 | --------------- |
| 119 | 129 | ||
| 120 | Like inodes, directories are packed into compressed metadata blocks, stored | 130 | Like inodes, directories are packed into compressed metadata blocks, stored |
| @@ -144,7 +154,7 @@ decompressed to do a lookup irrespective of the length of the directory. | |||
| 144 | This scheme has the advantage that it doesn't require extra memory overhead | 154 | This scheme has the advantage that it doesn't require extra memory overhead |
| 145 | and doesn't require much extra storage on disk. | 155 | and doesn't require much extra storage on disk. |
| 146 | 156 | ||
| 147 | 3.3 File data | 157 | 3.4 File data |
| 148 | ------------- | 158 | ------------- |
| 149 | 159 | ||
| 150 | Regular files consist of a sequence of contiguous compressed blocks, and/or a | 160 | Regular files consist of a sequence of contiguous compressed blocks, and/or a |
| @@ -163,7 +173,7 @@ Larger files use multiple slots, with 1.75 TiB files using all 8 slots. | |||
| 163 | The index cache is designed to be memory efficient, and by default uses | 173 | The index cache is designed to be memory efficient, and by default uses |
| 164 | 16 KiB. | 174 | 16 KiB. |
| 165 | 175 | ||
| 166 | 3.4 Fragment lookup table | 176 | 3.5 Fragment lookup table |
| 167 | ------------------------- | 177 | ------------------------- |
| 168 | 178 | ||
| 169 | Regular files can contain a fragment index which is mapped to a fragment | 179 | Regular files can contain a fragment index which is mapped to a fragment |
| @@ -173,7 +183,7 @@ A second index table is used to locate these. This second index table for | |||
| 173 | speed of access (and because it is small) is read at mount time and cached | 183 | speed of access (and because it is small) is read at mount time and cached |
| 174 | in memory. | 184 | in memory. |
| 175 | 185 | ||
| 176 | 3.5 Uid/gid lookup table | 186 | 3.6 Uid/gid lookup table |
| 177 | ------------------------ | 187 | ------------------------ |
| 178 | 188 | ||
| 179 | For space efficiency regular files store uid and gid indexes, which are | 189 | For space efficiency regular files store uid and gid indexes, which are |
| @@ -182,7 +192,7 @@ stored compressed into metadata blocks. A second index table is used to | |||
| 182 | locate these. This second index table for speed of access (and because it | 192 | locate these. This second index table for speed of access (and because it |
| 183 | is small) is read at mount time and cached in memory. | 193 | is small) is read at mount time and cached in memory. |
| 184 | 194 | ||
| 185 | 3.6 Export table | 195 | 3.7 Export table |
| 186 | ---------------- | 196 | ---------------- |
| 187 | 197 | ||
| 188 | To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems | 198 | To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems |
| @@ -196,7 +206,7 @@ This table is stored compressed into metadata blocks. A second index table is | |||
| 196 | used to locate these. This second index table for speed of access (and because | 206 | used to locate these. This second index table for speed of access (and because |
| 197 | it is small) is read at mount time and cached in memory. | 207 | it is small) is read at mount time and cached in memory. |
| 198 | 208 | ||
| 199 | 3.7 Xattr table | 209 | 3.8 Xattr table |
| 200 | --------------- | 210 | --------------- |
| 201 | 211 | ||
| 202 | The xattr table contains extended attributes for each inode. The xattrs | 212 | The xattr table contains extended attributes for each inode. The xattrs |
| @@ -209,7 +219,7 @@ or if it is stored out of line (in which case the value field stores a | |||
| 209 | reference to where the actual value is stored). This allows large values | 219 | reference to where the actual value is stored). This allows large values |
| 210 | to be stored out of line improving scanning and lookup performance and it | 220 | to be stored out of line improving scanning and lookup performance and it |
| 211 | also allows values to be de-duplicated, the value being stored once, and | 221 | also allows values to be de-duplicated, the value being stored once, and |
| 212 | all other occurences holding an out of line reference to that value. | 222 | all other occurrences holding an out of line reference to that value. |
| 213 | 223 | ||
| 214 | The xattr lists are packed into compressed 8K metadata blocks. | 224 | The xattr lists are packed into compressed 8K metadata blocks. |
| 215 | To reduce overhead in inodes, rather than storing the on-disk | 225 | To reduce overhead in inodes, rather than storing the on-disk |
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 5d1335faec2d..597f728e7b4e 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt | |||
| @@ -39,10 +39,12 @@ userspace. Top-level directories in sysfs represent the common | |||
| 39 | ancestors of object hierarchies; i.e. the subsystems the objects | 39 | ancestors of object hierarchies; i.e. the subsystems the objects |
| 40 | belong to. | 40 | belong to. |
| 41 | 41 | ||
| 42 | Sysfs internally stores the kobject that owns the directory in the | 42 | Sysfs internally stores a pointer to the kobject that implements a |
| 43 | ->d_fsdata pointer of the directory's dentry. This allows sysfs to do | 43 | directory in the sysfs_dirent object associated with the directory. In |
| 44 | reference counting directly on the kobject when the file is opened and | 44 | the past this kobject pointer has been used by sysfs to do reference |
| 45 | closed. | 45 | counting directly on the kobject whenever the file is opened or closed. |
| 46 | With the current sysfs implementation the kobject reference count is | ||
| 47 | only modified directly by the function sysfs_schedule_callback(). | ||
| 46 | 48 | ||
| 47 | 49 | ||
| 48 | Attributes | 50 | Attributes |
| @@ -60,7 +62,7 @@ values of the same type. | |||
| 60 | 62 | ||
| 61 | Mixing types, expressing multiple lines of data, and doing fancy | 63 | Mixing types, expressing multiple lines of data, and doing fancy |
| 62 | formatting of data is heavily frowned upon. Doing these things may get | 64 | formatting of data is heavily frowned upon. Doing these things may get |
| 63 | you publically humiliated and your code rewritten without notice. | 65 | you publicly humiliated and your code rewritten without notice. |
| 64 | 66 | ||
| 65 | 67 | ||
| 66 | An attribute definition is simply: | 68 | An attribute definition is simply: |
| @@ -208,9 +210,9 @@ Other notes: | |||
| 208 | is 4096. | 210 | is 4096. |
| 209 | 211 | ||
| 210 | - show() methods should return the number of bytes printed into the | 212 | - show() methods should return the number of bytes printed into the |
| 211 | buffer. This is the return value of snprintf(). | 213 | buffer. This is the return value of scnprintf(). |
| 212 | 214 | ||
| 213 | - show() should always use snprintf(). | 215 | - show() should always use scnprintf(). |
| 214 | 216 | ||
| 215 | - store() should return the number of bytes used from the buffer. If the | 217 | - store() should return the number of bytes used from the buffer. If the |
| 216 | entire buffer has been used, just return the count argument. | 218 | entire buffer has been used, just return the count argument. |
| @@ -229,7 +231,7 @@ A very simple (and naive) implementation of a device attribute is: | |||
| 229 | static ssize_t show_name(struct device *dev, struct device_attribute *attr, | 231 | static ssize_t show_name(struct device *dev, struct device_attribute *attr, |
| 230 | char *buf) | 232 | char *buf) |
| 231 | { | 233 | { |
| 232 | return snprintf(buf, PAGE_SIZE, "%s\n", dev->name); | 234 | return scnprintf(buf, PAGE_SIZE, "%s\n", dev->name); |
| 233 | } | 235 | } |
| 234 | 236 | ||
| 235 | static ssize_t store_name(struct device *dev, struct device_attribute *attr, | 237 | static ssize_t store_name(struct device *dev, struct device_attribute *attr, |
diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt index 12fedb7834c6..d7b13b01e980 100644 --- a/Documentation/filesystems/ubifs.txt +++ b/Documentation/filesystems/ubifs.txt | |||
| @@ -82,12 +82,12 @@ Mount options | |||
| 82 | bulk_read read more in one go to take advantage of flash | 82 | bulk_read read more in one go to take advantage of flash |
| 83 | media that read faster sequentially | 83 | media that read faster sequentially |
| 84 | no_bulk_read (*) do not bulk-read | 84 | no_bulk_read (*) do not bulk-read |
| 85 | no_chk_data_crc skip checking of CRCs on data nodes in order to | 85 | no_chk_data_crc (*) skip checking of CRCs on data nodes in order to |
| 86 | improve read performance. Use this option only | 86 | improve read performance. Use this option only |
| 87 | if the flash media is highly reliable. The effect | 87 | if the flash media is highly reliable. The effect |
| 88 | of this option is that corruption of the contents | 88 | of this option is that corruption of the contents |
| 89 | of a file can go unnoticed. | 89 | of a file can go unnoticed. |
| 90 | chk_data_crc (*) do not skip checking CRCs on data nodes | 90 | chk_data_crc do not skip checking CRCs on data nodes |
| 91 | compr=none override default compressor and set it to "none" | 91 | compr=none override default compressor and set it to "none" |
| 92 | compr=lzo override default compressor and set it to "lzo" | 92 | compr=lzo override default compressor and set it to "lzo" |
| 93 | compr=zlib override default compressor and set it to "zlib" | 93 | compr=zlib override default compressor and set it to "zlib" |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index ed7e5efc06d8..21a7dc467bba 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
| @@ -95,10 +95,11 @@ functions: | |||
| 95 | extern int unregister_filesystem(struct file_system_type *); | 95 | extern int unregister_filesystem(struct file_system_type *); |
| 96 | 96 | ||
| 97 | The passed struct file_system_type describes your filesystem. When a | 97 | The passed struct file_system_type describes your filesystem. When a |
| 98 | request is made to mount a device onto a directory in your filespace, | 98 | request is made to mount a filesystem onto a directory in your namespace, |
| 99 | the VFS will call the appropriate get_sb() method for the specific | 99 | the VFS will call the appropriate mount() method for the specific |
| 100 | filesystem. The dentry for the mount point will then be updated to | 100 | filesystem. A new vfsmount referring to the tree returned by ->mount() |
| 101 | point to the root inode for the new filesystem. | 101 | will be attached to the mountpoint, so that when pathname resolution |
| 102 | reaches the mountpoint it will jump into the root of that vfsmount. | ||
| 102 | 103 | ||
| 103 | You can see all filesystems that are registered to the kernel in the | 104 | You can see all filesystems that are registered to the kernel in the |
| 104 | file /proc/filesystems. | 105 | file /proc/filesystems. |
| @@ -107,14 +108,14 @@ file /proc/filesystems. | |||
| 107 | struct file_system_type | 108 | struct file_system_type |
| 108 | ----------------------- | 109 | ----------------------- |
| 109 | 110 | ||
| 110 | This describes the filesystem. As of kernel 2.6.22, the following | 111 | This describes the filesystem. As of kernel 2.6.39, the following |
| 111 | members are defined: | 112 | members are defined: |
| 112 | 113 | ||
| 113 | struct file_system_type { | 114 | struct file_system_type { |
| 114 | const char *name; | 115 | const char *name; |
| 115 | int fs_flags; | 116 | int fs_flags; |
| 116 | int (*get_sb) (struct file_system_type *, int, | 117 | struct dentry *(*mount) (struct file_system_type *, int, |
| 117 | const char *, void *, struct vfsmount *); | 118 | const char *, void *); |
| 118 | void (*kill_sb) (struct super_block *); | 119 | void (*kill_sb) (struct super_block *); |
| 119 | struct module *owner; | 120 | struct module *owner; |
| 120 | struct file_system_type * next; | 121 | struct file_system_type * next; |
| @@ -128,11 +129,11 @@ struct file_system_type { | |||
| 128 | 129 | ||
| 129 | fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) | 130 | fs_flags: various flags (i.e. FS_REQUIRES_DEV, FS_NO_DCACHE, etc.) |
| 130 | 131 | ||
| 131 | get_sb: the method to call when a new instance of this | 132 | mount: the method to call when a new instance of this |
| 132 | filesystem should be mounted | 133 | filesystem should be mounted |
| 133 | 134 | ||
| 134 | kill_sb: the method to call when an instance of this filesystem | 135 | kill_sb: the method to call when an instance of this filesystem |
| 135 | should be unmounted | 136 | should be shut down |
| 136 | 137 | ||
| 137 | owner: for internal VFS use: you should initialize this to THIS_MODULE in | 138 | owner: for internal VFS use: you should initialize this to THIS_MODULE in |
| 138 | most cases. | 139 | most cases. |
| @@ -141,7 +142,7 @@ struct file_system_type { | |||
| 141 | 142 | ||
| 142 | s_lock_key, s_umount_key: lockdep-specific | 143 | s_lock_key, s_umount_key: lockdep-specific |
| 143 | 144 | ||
| 144 | The get_sb() method has the following arguments: | 145 | The mount() method has the following arguments: |
| 145 | 146 | ||
| 146 | struct file_system_type *fs_type: describes the filesystem, partly initialized | 147 | struct file_system_type *fs_type: describes the filesystem, partly initialized |
| 147 | by the specific filesystem code | 148 | by the specific filesystem code |
| @@ -153,32 +154,39 @@ The get_sb() method has the following arguments: | |||
| 153 | void *data: arbitrary mount options, usually comes as an ASCII | 154 | void *data: arbitrary mount options, usually comes as an ASCII |
| 154 | string (see "Mount Options" section) | 155 | string (see "Mount Options" section) |
| 155 | 156 | ||
| 156 | struct vfsmount *mnt: a vfs-internal representation of a mount point | 157 | The mount() method must return the root dentry of the tree requested by |
| 158 | caller. An active reference to its superblock must be grabbed and the | ||
| 159 | superblock must be locked. On failure it should return ERR_PTR(error). | ||
| 157 | 160 | ||
| 158 | The get_sb() method must determine if the block device specified | 161 | The arguments match those of mount(2) and their interpretation |
| 159 | in the dev_name and fs_type contains a filesystem of the type the method | 162 | depends on filesystem type. E.g. for block filesystems, dev_name is |
| 160 | supports. If it succeeds in opening the named block device, it initializes a | 163 | interpreted as block device name, that device is opened and if it |
| 161 | struct super_block descriptor for the filesystem contained by the block device. | 164 | contains a suitable filesystem image the method creates and initializes |
| 162 | On failure it returns an error. | 165 | struct super_block accordingly, returning its root dentry to caller. |
| 166 | |||
| 167 | ->mount() may choose to return a subtree of existing filesystem - it | ||
| 168 | doesn't have to create a new one. The main result from the caller's | ||
| 169 | point of view is a reference to dentry at the root of (sub)tree to | ||
| 170 | be attached; creation of new superblock is a common side effect. | ||
| 163 | 171 | ||
| 164 | The most interesting member of the superblock structure that the | 172 | The most interesting member of the superblock structure that the |
| 165 | get_sb() method fills in is the "s_op" field. This is a pointer to | 173 | mount() method fills in is the "s_op" field. This is a pointer to |
| 166 | a "struct super_operations" which describes the next level of the | 174 | a "struct super_operations" which describes the next level of the |
| 167 | filesystem implementation. | 175 | filesystem implementation. |
| 168 | 176 | ||
| 169 | Usually, a filesystem uses one of the generic get_sb() implementations | 177 | Usually, a filesystem uses one of the generic mount() implementations |
| 170 | and provides a fill_super() method instead. The generic methods are: | 178 | and provides a fill_super() callback instead. The generic variants are: |
| 171 | 179 | ||
| 172 | get_sb_bdev: mount a filesystem residing on a block device | 180 | mount_bdev: mount a filesystem residing on a block device |
| 173 | 181 | ||
| 174 | get_sb_nodev: mount a filesystem that is not backed by a device | 182 | mount_nodev: mount a filesystem that is not backed by a device |
| 175 | 183 | ||
| 176 | get_sb_single: mount a filesystem which shares the instance between | 184 | mount_single: mount a filesystem which shares the instance between |
| 177 | all mounts | 185 | all mounts |
| 178 | 186 | ||
| 179 | A fill_super() method implementation has the following arguments: | 187 | A fill_super() callback implementation has the following arguments: |
| 180 | 188 | ||
| 181 | struct super_block *sb: the superblock structure. The method fill_super() | 189 | struct super_block *sb: the superblock structure. The callback |
| 182 | must initialize this properly. | 190 | must initialize this properly. |
| 183 | 191 | ||
| 184 | void *data: arbitrary mount options, usually comes as an ASCII | 192 | void *data: arbitrary mount options, usually comes as an ASCII |
| @@ -246,7 +254,7 @@ or bottom half). | |||
| 246 | should be synchronous or not, not all filesystems check this flag. | 254 | should be synchronous or not, not all filesystems check this flag. |
| 247 | 255 | ||
| 248 | drop_inode: called when the last access to the inode is dropped, | 256 | drop_inode: called when the last access to the inode is dropped, |
| 249 | with the inode_lock spinlock held. | 257 | with the inode->i_lock spinlock held. |
| 250 | 258 | ||
| 251 | This method should be either NULL (normal UNIX filesystem | 259 | This method should be either NULL (normal UNIX filesystem |
| 252 | semantics) or "generic_delete_inode" (for filesystems that do not | 260 | semantics) or "generic_delete_inode" (for filesystems that do not |
| @@ -325,7 +333,8 @@ struct inode_operations { | |||
| 325 | void * (*follow_link) (struct dentry *, struct nameidata *); | 333 | void * (*follow_link) (struct dentry *, struct nameidata *); |
| 326 | void (*put_link) (struct dentry *, struct nameidata *, void *); | 334 | void (*put_link) (struct dentry *, struct nameidata *, void *); |
| 327 | void (*truncate) (struct inode *); | 335 | void (*truncate) (struct inode *); |
| 328 | int (*permission) (struct inode *, int, struct nameidata *); | 336 | int (*permission) (struct inode *, int, unsigned int); |
| 337 | int (*check_acl)(struct inode *, int, unsigned int); | ||
| 329 | int (*setattr) (struct dentry *, struct iattr *); | 338 | int (*setattr) (struct dentry *, struct iattr *); |
| 330 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); | 339 | int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); |
| 331 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); | 340 | int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); |
| @@ -414,6 +423,13 @@ otherwise noted. | |||
| 414 | permission: called by the VFS to check for access rights on a POSIX-like | 423 | permission: called by the VFS to check for access rights on a POSIX-like |
| 415 | filesystem. | 424 | filesystem. |
| 416 | 425 | ||
| 426 | May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk | ||
| 427 | mode, the filesystem must check the permission without blocking or | ||
| 428 | storing to the inode. | ||
| 429 | |||
| 430 | If a situation is encountered that rcu-walk cannot handle, return | ||
| 431 | -ECHILD and it will be called again in ref-walk mode. | ||
| 432 | |||
| 417 | setattr: called by the VFS to set attributes for a file. This method | 433 | setattr: called by the VFS to set attributes for a file. This method |
| 418 | is called by chmod(2) and related system calls. | 434 | is called by chmod(2) and related system calls. |
| 419 | 435 | ||
| @@ -534,6 +550,7 @@ struct address_space_operations { | |||
| 534 | sector_t (*bmap)(struct address_space *, sector_t); | 550 | sector_t (*bmap)(struct address_space *, sector_t); |
| 535 | int (*invalidatepage) (struct page *, unsigned long); | 551 | int (*invalidatepage) (struct page *, unsigned long); |
| 536 | int (*releasepage) (struct page *, int); | 552 | int (*releasepage) (struct page *, int); |
| 553 | void (*freepage)(struct page *); | ||
| 537 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, | 554 | ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov, |
| 538 | loff_t offset, unsigned long nr_segs); | 555 | loff_t offset, unsigned long nr_segs); |
| 539 | struct page* (*get_xip_page)(struct address_space *, sector_t, | 556 | struct page* (*get_xip_page)(struct address_space *, sector_t, |
| @@ -660,11 +677,10 @@ struct address_space_operations { | |||
| 660 | releasepage: releasepage is called on PagePrivate pages to indicate | 677 | releasepage: releasepage is called on PagePrivate pages to indicate |
| 661 | that the page should be freed if possible. ->releasepage | 678 | that the page should be freed if possible. ->releasepage |
| 662 | should remove any private data from the page and clear the | 679 | should remove any private data from the page and clear the |
| 663 | PagePrivate flag. It may also remove the page from the | 680 | PagePrivate flag. If releasepage() fails for some reason, it must |
| 664 | address_space. If this fails for some reason, it may indicate | 681 | indicate failure with a 0 return value. |
| 665 | failure with a 0 return value. | 682 | releasepage() is used in two distinct though related cases. The |
| 666 | This is used in two distinct though related cases. The first | 683 | first is when the VM finds a clean page with no active users and |
| 667 | is when the VM finds a clean page with no active users and | ||
| 668 | wants to make it a free page. If ->releasepage succeeds, the | 684 | wants to make it a free page. If ->releasepage succeeds, the |
| 669 | page will be removed from the address_space and become free. | 685 | page will be removed from the address_space and become free. |
| 670 | 686 | ||
| @@ -679,6 +695,12 @@ struct address_space_operations { | |||
| 679 | need to ensure this. Possibly it can clear the PageUptodate | 695 | need to ensure this. Possibly it can clear the PageUptodate |
| 680 | bit if it cannot free private data yet. | 696 | bit if it cannot free private data yet. |
| 681 | 697 | ||
| 698 | freepage: freepage is called once the page is no longer visible in | ||
| 699 | the page cache in order to allow the cleanup of any private | ||
| 700 | data. Since it may be called by the memory reclaimer, it | ||
| 701 | should not assume that the original address_space mapping still | ||
| 702 | exists, and it should not block. | ||
| 703 | |||
| 682 | direct_IO: called by the generic read/write routines to perform | 704 | direct_IO: called by the generic read/write routines to perform |
| 683 | direct_IO - that is IO requests which bypass the page cache | 705 | direct_IO - that is IO requests which bypass the page cache |
| 684 | and transfer data directly between the storage and the | 706 | and transfer data directly between the storage and the |
| @@ -841,12 +863,17 @@ defined: | |||
| 841 | 863 | ||
| 842 | struct dentry_operations { | 864 | struct dentry_operations { |
| 843 | int (*d_revalidate)(struct dentry *, struct nameidata *); | 865 | int (*d_revalidate)(struct dentry *, struct nameidata *); |
| 844 | int (*d_hash) (struct dentry *, struct qstr *); | 866 | int (*d_hash)(const struct dentry *, const struct inode *, |
| 845 | int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); | 867 | struct qstr *); |
| 846 | int (*d_delete)(struct dentry *); | 868 | int (*d_compare)(const struct dentry *, const struct inode *, |
| 869 | const struct dentry *, const struct inode *, | ||
| 870 | unsigned int, const char *, const struct qstr *); | ||
| 871 | int (*d_delete)(const struct dentry *); | ||
| 847 | void (*d_release)(struct dentry *); | 872 | void (*d_release)(struct dentry *); |
| 848 | void (*d_iput)(struct dentry *, struct inode *); | 873 | void (*d_iput)(struct dentry *, struct inode *); |
| 849 | char *(*d_dname)(struct dentry *, char *, int); | 874 | char *(*d_dname)(struct dentry *, char *, int); |
| 875 | struct vfsmount *(*d_automount)(struct path *); | ||
| 876 | int (*d_manage)(struct dentry *, bool); | ||
| 850 | }; | 877 | }; |
| 851 | 878 | ||
| 852 | d_revalidate: called when the VFS needs to revalidate a dentry. This | 879 | d_revalidate: called when the VFS needs to revalidate a dentry. This |
| @@ -854,13 +881,45 @@ struct dentry_operations { | |||
| 854 | dcache. Most filesystems leave this as NULL, because all their | 881 | dcache. Most filesystems leave this as NULL, because all their |
| 855 | dentries in the dcache are valid | 882 | dentries in the dcache are valid |
| 856 | 883 | ||
| 857 | d_hash: called when the VFS adds a dentry to the hash table | 884 | d_revalidate may be called in rcu-walk mode (nd->flags & LOOKUP_RCU). |
| 885 | If in rcu-walk mode, the filesystem must revalidate the dentry without | ||
| 886 | blocking or storing to the dentry; d_parent and d_inode should not be | ||
| 887 | used without care (because they can go NULL), instead nd->inode should | ||
| 888 | be used. | ||
| 889 | |||
| 890 | If a situation is encountered that rcu-walk cannot handle, return | ||
| 891 | -ECHILD and it will be called again in ref-walk mode. | ||
| 892 | |||
| 893 | d_hash: called when the VFS adds a dentry to the hash table. The first | ||
| 894 | dentry passed to d_hash is the parent directory that the name is | ||
| 895 | to be hashed into. The inode is the dentry's inode. | ||
| 858 | 896 | ||
| 859 | d_compare: called when a dentry should be compared with another | 897 | Same locking and synchronisation rules as d_compare regarding |
| 898 | what is safe to dereference etc. | ||
| 860 | 899 | ||
| 861 | d_delete: called when the last reference to a dentry is | 900 | d_compare: called to compare a dentry name with a given name. The first |
| 862 | deleted. This means no-one is using the dentry, however it is | 901 | dentry is the parent of the dentry to be compared, the second is |
| 863 | still valid and in the dcache | 902 | the parent's inode, then the dentry and inode (may be NULL) of the |
| 903 | child dentry. len and name string are properties of the dentry to be | ||
| 904 | compared. qstr is the name to compare it with. | ||
| 905 | |||
| 906 | Must be constant and idempotent, and should not take locks if | ||
| 907 | possible, and should not store into the dentry or inodes. | ||
| 908 | Should not dereference pointers outside the dentry or inodes without | ||
| 909 | lots of care (eg. d_parent, d_inode, d_name should not be used). | ||
| 910 | |||
| 911 | However, our vfsmount is pinned, and RCU held, so the dentries and | ||
| 912 | inodes won't disappear, neither will our sb or filesystem module. | ||
| 913 | ->i_sb and ->d_sb may be used. | ||
| 914 | |||
| 915 | It is a tricky calling convention because it needs to be called under | ||
| 916 | "rcu-walk", ie. without any locks or references on things. | ||
| 917 | |||
| 918 | d_delete: called when the last reference to a dentry is dropped and the | ||
| 919 | dcache is deciding whether or not to cache it. Return 1 to delete | ||
| 920 | immediately, or 0 to cache the dentry. Default is NULL which means to | ||
| 921 | always cache a reachable dentry. d_delete must be constant and | ||
| 922 | idempotent. | ||
| 864 | 923 | ||
| 865 | d_release: called when a dentry is really deallocated | 924 | d_release: called when a dentry is really deallocated |
| 866 | 925 | ||
| @@ -881,6 +940,43 @@ struct dentry_operations { | |||
| 881 | at the end of the buffer, and returns a pointer to the first char. | 940 | at the end of the buffer, and returns a pointer to the first char. |
| 882 | dynamic_dname() helper function is provided to take care of this. | 941 | dynamic_dname() helper function is provided to take care of this. |
| 883 | 942 | ||
| 943 | d_automount: called when an automount dentry is to be traversed (optional). | ||
| 944 | This should create a new VFS mount record and return the record to the | ||
| 945 | caller. The caller is supplied with a path parameter giving the | ||
| 946 | automount directory to describe the automount target and the parent | ||
| 947 | VFS mount record to provide inheritable mount parameters. NULL should | ||
| 948 | be returned if someone else managed to make the automount first. If | ||
| 949 | the vfsmount creation failed, then an error code should be returned. | ||
| 950 | If -EISDIR is returned, then the directory will be treated as an | ||
| 951 | ordinary directory and returned to pathwalk to continue walking. | ||
| 952 | |||
| 953 | If a vfsmount is returned, the caller will attempt to mount it on the | ||
| 954 | mountpoint and will remove the vfsmount from its expiration list in | ||
| 955 | the case of failure. The vfsmount should be returned with 2 refs on | ||
| 956 | it to prevent automatic expiration - the caller will clean up the | ||
| 957 | additional ref. | ||
| 958 | |||
| 959 | This function is only used if DCACHE_NEED_AUTOMOUNT is set on the | ||
| 960 | dentry. This is set by __d_instantiate() if S_AUTOMOUNT is set on the | ||
| 961 | inode being added. | ||
| 962 | |||
| 963 | d_manage: called to allow the filesystem to manage the transition from a | ||
| 964 | dentry (optional). This allows autofs, for example, to hold up clients | ||
| 965 | waiting to explore behind a 'mountpoint' whilst letting the daemon go | ||
| 966 | past and construct the subtree there. 0 should be returned to let the | ||
| 967 | calling process continue. -EISDIR can be returned to tell pathwalk to | ||
| 968 | use this directory as an ordinary directory and to ignore anything | ||
| 969 | mounted on it and not to check the automount flag. Any other error | ||
| 970 | code will abort pathwalk completely. | ||
| 971 | |||
| 972 | If the 'rcu_walk' parameter is true, then the caller is doing a | ||
| 973 | pathwalk in RCU-walk mode. Sleeping is not permitted in this mode, | ||
| 974 | and the caller can be asked to leave it and call again by returning | ||
| 975 | -ECHILD. | ||
| 976 | |||
| 977 | This function is only used if DCACHE_MANAGE_TRANSIT is set on the | ||
| 978 | dentry being transited from. | ||
| 979 | |||
| 884 | Example : | 980 | Example : |
| 885 | 981 | ||
| 886 | static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) | 982 | static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen) |
| @@ -904,14 +1000,11 @@ manipulate dentries: | |||
| 904 | the usage count) | 1000 | the usage count) |
| 905 | 1001 | ||
| 906 | dput: close a handle for a dentry (decrements the usage count). If | 1002 | dput: close a handle for a dentry (decrements the usage count). If |
| 907 | the usage count drops to 0, the "d_delete" method is called | 1003 | the usage count drops to 0, and the dentry is still in its |
| 908 | and the dentry is placed on the unused list if the dentry is | 1004 | parent's hash, the "d_delete" method is called to check whether |
| 909 | still in its parents hash list. Putting the dentry on the | 1005 | it should be cached. If it should not be cached, or if the dentry |
| 910 | unused list just means that if the system needs some RAM, it | 1006 | is not hashed, it is deleted. Otherwise cached dentries are put |
| 911 | goes through the unused list of dentries and deallocates them. | 1007 | into an LRU list to be reclaimed on memory shortage. |
| 912 | If the dentry has already been unhashed and the usage count | ||
| 913 | drops to 0, in this case the dentry is deallocated after the | ||
| 914 | "d_delete" method is called | ||
| 915 | 1008 | ||
| 916 | d_drop: this unhashes a dentry from its parents hash list. A | 1009 | d_drop: this unhashes a dentry from its parents hash list. A |
| 917 | subsequent call to dput() will deallocate the dentry if its | 1010 | subsequent call to dput() will deallocate the dentry if its |
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt index 96d0df28bed3..2ce36439c09f 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt | |||
| @@ -42,7 +42,7 @@ the aggregation of all the previous changes currently held only in the log. | |||
| 42 | This relogging technique also allows objects to be moved forward in the log so | 42 | This relogging technique also allows objects to be moved forward in the log so |
| 43 | that an object being relogged does not prevent the tail of the log from ever | 43 | that an object being relogged does not prevent the tail of the log from ever |
| 44 | moving forward. This can be seen in the table above by the changing | 44 | moving forward. This can be seen in the table above by the changing |
| 45 | (increasing) LSN of each subsquent transaction - the LSN is effectively a | 45 | (increasing) LSN of each subsequent transaction - the LSN is effectively a |
| 46 | direct encoding of the location in the log of the transaction. | 46 | direct encoding of the location in the log of the transaction. |
| 47 | 47 | ||
| 48 | This relogging is also used to implement long-running, multiple-commit | 48 | This relogging is also used to implement long-running, multiple-commit |
| @@ -338,7 +338,7 @@ the same time another transaction modifies the item and inserts the log item | |||
| 338 | into the new CIL, then checkpoint transaction commit code cannot use log items | 338 | into the new CIL, then checkpoint transaction commit code cannot use log items |
| 339 | to store the list of log vectors that need to be written into the transaction. | 339 | to store the list of log vectors that need to be written into the transaction. |
| 340 | Hence log vectors need to be able to be chained together to allow them to be | 340 | Hence log vectors need to be able to be chained together to allow them to be |
| 341 | detatched from the log items. That is, when the CIL is flushed the memory | 341 | detached from the log items. That is, when the CIL is flushed the memory |
| 342 | buffer and log vector attached to each log item needs to be attached to the | 342 | buffer and log vector attached to each log item needs to be attached to the |
| 343 | checkpoint context so that the log item can be released. In diagrammatic form, | 343 | checkpoint context so that the log item can be released. In diagrammatic form, |
| 344 | the CIL would look like this before the flush: | 344 | the CIL would look like this before the flush: |
| @@ -577,7 +577,7 @@ only becomes unpinned when all the transactions complete and there are no | |||
| 577 | pending transactions. Thus the pinning and unpinning of a log item is symmetric | 577 | pending transactions. Thus the pinning and unpinning of a log item is symmetric |
| 578 | as there is a 1:1 relationship with transaction commit and log item completion. | 578 | as there is a 1:1 relationship with transaction commit and log item completion. |
| 579 | 579 | ||
| 580 | For delayed logging, however, we have an assymetric transaction commit to | 580 | For delayed logging, however, we have an asymmetric transaction commit to |
| 581 | completion relationship. Every time an object is relogged in the CIL it goes | 581 | completion relationship. Every time an object is relogged in the CIL it goes |
| 582 | through the commit process without a corresponding completion being registered. | 582 | through the commit process without a corresponding completion being registered. |
| 583 | That is, we now have a many-to-one relationship between transaction commit and | 583 | That is, we now have a many-to-one relationship between transaction commit and |
| @@ -780,7 +780,7 @@ With delayed logging, there are new steps inserted into the life cycle: | |||
| 780 | From this, it can be seen that the only life cycle differences between the two | 780 | From this, it can be seen that the only life cycle differences between the two |
| 781 | logging methods are in the middle of the life cycle - they still have the same | 781 | logging methods are in the middle of the life cycle - they still have the same |
| 782 | beginning and end and execution constraints. The only differences are in the | 782 | beginning and end and execution constraints. The only differences are in the |
| 783 | commiting of the log items to the log itself and the completion processing. | 783 | committing of the log items to the log itself and the completion processing. |
| 784 | Hence delayed logging should not introduce any constraints on log item | 784 | Hence delayed logging should not introduce any constraints on log item |
| 785 | behaviour, allocation or freeing that don't already exist. | 785 | behaviour, allocation or freeing that don't already exist. |
| 786 | 786 | ||
| @@ -791,21 +791,3 @@ mount option. Fundamentally, there is no reason why the log manager would not | |||
| 791 | be able to swap methods automatically and transparently depending on load | 791 | be able to swap methods automatically and transparently depending on load |
| 792 | characteristics, but this should not be necessary if delayed logging works as | 792 | characteristics, but this should not be necessary if delayed logging works as |
| 793 | designed. | 793 | designed. |
| 794 | |||
| 795 | Roadmap: | ||
| 796 | |||
| 797 | 2.6.37 Remove experimental tag from mount option | ||
| 798 | => should be roughly 6 months after initial merge | ||
| 799 | => enough time to: | ||
| 800 | => gain confidence and fix problems reported by early | ||
| 801 | adopters (a.k.a. guinea pigs) | ||
| 802 | => address worst performance regressions and undesired | ||
| 803 | behaviours | ||
| 804 | => start tuning/optimising code for parallelism | ||
| 805 | => start tuning/optimising algorithms consuming | ||
| 806 | excessive CPU time | ||
| 807 | |||
| 808 | 2.6.39 Switch default mount option to use delayed logging | ||
| 809 | => should be roughly 12 months after initial merge | ||
| 810 | => enough time to shake out remaining problems before next round of | ||
| 811 | enterprise distro kernel rebases | ||
