diff options
Diffstat (limited to 'Documentation/filesystems')
31 files changed, 1102 insertions, 139 deletions
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index c0236e753bc8..f9765e8cf086 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt | |||
@@ -128,7 +128,7 @@ OPTIONS | |||
128 | RESOURCES | 128 | RESOURCES |
129 | ========= | 129 | ========= |
130 | 130 | ||
131 | Our current recommendation is to use Inferno (http://www.vitanuova.com/inferno) | 131 | Our current recommendation is to use Inferno (http://www.vitanuova.com/nferno/index.html) |
132 | as the 9p server. You can start a 9p server under Inferno by issuing the | 132 | as the 9p server. You can start a 9p server under Inferno by issuing the |
133 | following command: | 133 | following command: |
134 | ; styxlisten -A tcp!*!564 export '#U*' | 134 | ; styxlisten -A tcp!*!564 export '#U*' |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 06bbbed71206..bbcc15651a21 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -92,8 +92,8 @@ prototypes: | |||
92 | void (*destroy_inode)(struct inode *); | 92 | void (*destroy_inode)(struct inode *); |
93 | void (*dirty_inode) (struct inode *); | 93 | void (*dirty_inode) (struct inode *); |
94 | int (*write_inode) (struct inode *, int); | 94 | int (*write_inode) (struct inode *, int); |
95 | void (*drop_inode) (struct inode *); | 95 | int (*drop_inode) (struct inode *); |
96 | void (*delete_inode) (struct inode *); | 96 | void (*evict_inode) (struct inode *); |
97 | void (*put_super) (struct super_block *); | 97 | void (*put_super) (struct super_block *); |
98 | void (*write_super) (struct super_block *); | 98 | void (*write_super) (struct super_block *); |
99 | int (*sync_fs)(struct super_block *sb, int wait); | 99 | int (*sync_fs)(struct super_block *sb, int wait); |
@@ -101,14 +101,13 @@ prototypes: | |||
101 | int (*unfreeze_fs) (struct super_block *); | 101 | int (*unfreeze_fs) (struct super_block *); |
102 | int (*statfs) (struct dentry *, struct kstatfs *); | 102 | int (*statfs) (struct dentry *, struct kstatfs *); |
103 | int (*remount_fs) (struct super_block *, int *, char *); | 103 | int (*remount_fs) (struct super_block *, int *, char *); |
104 | void (*clear_inode) (struct inode *); | ||
105 | void (*umount_begin) (struct super_block *); | 104 | void (*umount_begin) (struct super_block *); |
106 | int (*show_options)(struct seq_file *, struct vfsmount *); | 105 | int (*show_options)(struct seq_file *, struct vfsmount *); |
107 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); | 106 | ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); |
108 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); | 107 | ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); |
109 | 108 | ||
110 | locking rules: | 109 | locking rules: |
111 | All may block. | 110 | All may block [not true, see below] |
112 | None have BKL | 111 | None have BKL |
113 | s_umount | 112 | s_umount |
114 | alloc_inode: | 113 | alloc_inode: |
@@ -116,22 +115,25 @@ destroy_inode: | |||
116 | dirty_inode: (must not sleep) | 115 | dirty_inode: (must not sleep) |
117 | write_inode: | 116 | write_inode: |
118 | drop_inode: !!!inode_lock!!! | 117 | drop_inode: !!!inode_lock!!! |
119 | delete_inode: | 118 | evict_inode: |
120 | put_super: write | 119 | put_super: write |
121 | write_super: read | 120 | write_super: read |
122 | sync_fs: read | 121 | sync_fs: read |
123 | freeze_fs: read | 122 | freeze_fs: read |
124 | unfreeze_fs: read | 123 | unfreeze_fs: read |
125 | statfs: no | 124 | statfs: maybe(read) (see below) |
126 | remount_fs: maybe (see below) | 125 | remount_fs: write |
127 | clear_inode: | ||
128 | umount_begin: no | 126 | umount_begin: no |
129 | show_options: no (namespace_sem) | 127 | show_options: no (namespace_sem) |
130 | quota_read: no (see below) | 128 | quota_read: no (see below) |
131 | quota_write: no (see below) | 129 | quota_write: no (see below) |
132 | 130 | ||
133 | ->remount_fs() will have the s_umount exclusive lock if it's already mounted. | 131 | ->statfs() has s_umount (shared) when called by ustat(2) (native or |
134 | When called from get_sb_single, it does NOT have the s_umount lock. | 132 | compat), but that's an accident of bad API; s_umount is used to pin |
133 | the superblock down when we only have dev_t given us by userland to | ||
134 | identify the superblock. Everything else (statfs(), fstatfs(), etc.) | ||
135 | doesn't hold it when calling ->statfs() - superblock is pinned down | ||
136 | by resolving the pathname passed to syscall. | ||
135 | ->quota_read() and ->quota_write() functions are both guaranteed to | 137 | ->quota_read() and ->quota_write() functions are both guaranteed to |
136 | be the only ones operating on the quota file by the quota code (via | 138 | be the only ones operating on the quota file by the quota code (via |
137 | dqio_sem) (unless an admin really wants to screw up something and | 139 | dqio_sem) (unless an admin really wants to screw up something and |
@@ -178,7 +180,7 @@ prototypes: | |||
178 | locking rules: | 180 | locking rules: |
179 | All except set_page_dirty may block | 181 | All except set_page_dirty may block |
180 | 182 | ||
181 | BKL PageLocked(page) i_sem | 183 | BKL PageLocked(page) i_mutex |
182 | writepage: no yes, unlocks (see below) | 184 | writepage: no yes, unlocks (see below) |
183 | readpage: no yes, unlocks | 185 | readpage: no yes, unlocks |
184 | sync_page: no maybe | 186 | sync_page: no maybe |
@@ -380,7 +382,7 @@ prototypes: | |||
380 | int (*open) (struct inode *, struct file *); | 382 | int (*open) (struct inode *, struct file *); |
381 | int (*flush) (struct file *); | 383 | int (*flush) (struct file *); |
382 | int (*release) (struct inode *, struct file *); | 384 | int (*release) (struct inode *, struct file *); |
383 | int (*fsync) (struct file *, struct dentry *, int datasync); | 385 | int (*fsync) (struct file *, int datasync); |
384 | int (*aio_fsync) (struct kiocb *, int datasync); | 386 | int (*aio_fsync) (struct kiocb *, int datasync); |
385 | int (*fasync) (int, struct file *, int); | 387 | int (*fasync) (int, struct file *, int); |
386 | int (*lock) (struct file *, int, struct file_lock *); | 388 | int (*lock) (struct file *, int, struct file_lock *); |
@@ -429,8 +431,9 @@ check_flags: no | |||
429 | implementations. If your fs is not using generic_file_llseek, you | 431 | implementations. If your fs is not using generic_file_llseek, you |
430 | need to acquire and release the appropriate locks in your ->llseek(). | 432 | need to acquire and release the appropriate locks in your ->llseek(). |
431 | For many filesystems, it is probably safe to acquire the inode | 433 | For many filesystems, it is probably safe to acquire the inode |
432 | semaphore. Note some filesystems (i.e. remote ones) provide no | 434 | mutex or just to use i_size_read() instead. |
433 | protection for i_size so you will need to use the BKL. | 435 | Note: this does not protect the file->f_pos against concurrent modifications |
436 | since this is something the userspace has to take care about. | ||
434 | 437 | ||
435 | Note: ext2_release() was *the* source of contention on fs-intensive | 438 | Note: ext2_release() was *the* source of contention on fs-intensive |
436 | loads and dropping BKL on ->release() helps to get rid of that (we still | 439 | loads and dropping BKL on ->release() helps to get rid of that (we still |
diff --git a/Documentation/filesystems/affs.txt b/Documentation/filesystems/affs.txt index 2d1524469c25..81ac488e3758 100644 --- a/Documentation/filesystems/affs.txt +++ b/Documentation/filesystems/affs.txt | |||
@@ -216,4 +216,4 @@ due to an incompatibility with the Amiga floppy controller. | |||
216 | 216 | ||
217 | If you are interested in an Amiga Emulator for Linux, look at | 217 | If you are interested in an Amiga Emulator for Linux, look at |
218 | 218 | ||
219 | http://www.freiburg.linux.de/~uae/ | 219 | http://web.archive.org/web/*/http://www.freiburg.linux.de/~uae/ |
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 8f78ded4b648..51986bf08a4d 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt | |||
@@ -146,7 +146,7 @@ found to be inadequate, in this case. The Generic Netlink system was | |||
146 | used for this as raw Netlink would lead to a significant increase in | 146 | used for this as raw Netlink would lead to a significant increase in |
147 | complexity. There's no question that the Generic Netlink system is an | 147 | complexity. There's no question that the Generic Netlink system is an |
148 | elegant solution for common case ioctl functions but it's not a complete | 148 | elegant solution for common case ioctl functions but it's not a complete |
149 | replacement probably because it's primary purpose in life is to be a | 149 | replacement probably because its primary purpose in life is to be a |
150 | message bus implementation rather than specifically an ioctl replacement. | 150 | message bus implementation rather than specifically an ioctl replacement. |
151 | While it would be possible to work around this there is one concern | 151 | While it would be possible to work around this there is one concern |
152 | that lead to the decision to not use it. This is that the autofs | 152 | that lead to the decision to not use it. This is that the autofs |
diff --git a/Documentation/filesystems/befs.txt b/Documentation/filesystems/befs.txt index 67391a15949a..6e49c363938e 100644 --- a/Documentation/filesystems/befs.txt +++ b/Documentation/filesystems/befs.txt | |||
@@ -31,7 +31,7 @@ Current maintainer: Sergey S. Kostyliov <rathamahata@php4.ru> | |||
31 | 31 | ||
32 | WHAT IS THIS DRIVER? | 32 | WHAT IS THIS DRIVER? |
33 | ================== | 33 | ================== |
34 | This module implements the native filesystem of BeOS <http://www.be.com/> | 34 | This module implements the native filesystem of BeOS http://www.beincorporated.com/ |
35 | for the linux 2.4.1 and later kernels. Currently it is a read-only | 35 | for the linux 2.4.1 and later kernels. Currently it is a read-only |
36 | implementation. | 36 | implementation. |
37 | 37 | ||
@@ -61,7 +61,7 @@ step 2. Configuration & make kernel | |||
61 | 61 | ||
62 | The linux kernel has many compile-time options. Most of them are beyond the | 62 | The linux kernel has many compile-time options. Most of them are beyond the |
63 | scope of this document. I suggest the Kernel-HOWTO document as a good general | 63 | scope of this document. I suggest the Kernel-HOWTO document as a good general |
64 | reference on this topic. <http://www.linux.com/howto/Kernel-HOWTO.html> | 64 | reference on this topic. http://www.linuxdocs.org/HOWTOs/Kernel-HOWTO-4.html |
65 | 65 | ||
66 | However, to use the BeFS module, you must enable it at configure time. | 66 | However, to use the BeFS module, you must enable it at configure time. |
67 | 67 | ||
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt index a91e2e2095b0..770267af5b3e 100644 --- a/Documentation/filesystems/caching/fscache.txt +++ b/Documentation/filesystems/caching/fscache.txt | |||
@@ -343,8 +343,8 @@ This will look something like: | |||
343 | [root@andromeda ~]# head /proc/fs/fscache/objects | 343 | [root@andromeda ~]# head /proc/fs/fscache/objects |
344 | OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA | 344 | OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA |
345 | ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ | 345 | ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================ |
346 | 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a | 346 | 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a |
347 | 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a | 347 | 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 0 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a |
348 | 348 | ||
349 | where the first set of columns before the '|' describe the object: | 349 | where the first set of columns before the '|' describe the object: |
350 | 350 | ||
@@ -362,7 +362,7 @@ where the first set of columns before the '|' describe the object: | |||
362 | EM Object's event mask | 362 | EM Object's event mask |
363 | EV Events raised on this object | 363 | EV Events raised on this object |
364 | F Object flags | 364 | F Object flags |
365 | S Object slow-work work item flags | 365 | S Object work item busy state mask (1:pending 2:running) |
366 | 366 | ||
367 | and the second set of columns describe the object's cookie, if present: | 367 | and the second set of columns describe the object's cookie, if present: |
368 | 368 | ||
@@ -395,8 +395,8 @@ and the following paired letters: | |||
395 | w Show objects that don't have pending writes | 395 | w Show objects that don't have pending writes |
396 | R Show objects that have outstanding reads | 396 | R Show objects that have outstanding reads |
397 | r Show objects that don't have outstanding reads | 397 | r Show objects that don't have outstanding reads |
398 | S Show objects that have slow work queued | 398 | S Show objects that have work queued |
399 | s Show objects that don't have slow work queued | 399 | s Show objects that don't have work queued |
400 | 400 | ||
401 | If neither side of a letter pair is given, then both are implied. For example: | 401 | If neither side of a letter pair is given, then both are implied. For example: |
402 | 402 | ||
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 0660c9f5deef..763d8ebbbebd 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt | |||
@@ -90,7 +90,7 @@ Mount Options | |||
90 | Specify the IP and/or port the client should bind to locally. | 90 | Specify the IP and/or port the client should bind to locally. |
91 | There is normally not much reason to do this. If the IP is not | 91 | There is normally not much reason to do this. If the IP is not |
92 | specified, the client's IP address is determined by looking at the | 92 | specified, the client's IP address is determined by looking at the |
93 | address it's connection to the monitor originates from. | 93 | address its connection to the monitor originates from. |
94 | 94 | ||
95 | wsize=X | 95 | wsize=X |
96 | Specify the maximum write size in bytes. By default there is no | 96 | Specify the maximum write size in bytes. By default there is no |
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt index c50bbb2d52b4..1b528b2ad809 100644 --- a/Documentation/filesystems/dlmfs.txt +++ b/Documentation/filesystems/dlmfs.txt | |||
@@ -47,7 +47,7 @@ You'll want to start heartbeating on a volume which all the nodes in | |||
47 | your lockspace can access. The easiest way to do this is via | 47 | your lockspace can access. The easiest way to do this is via |
48 | ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires | 48 | ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires |
49 | that an OCFS2 file system be in place so that it can automatically | 49 | that an OCFS2 file system be in place so that it can automatically |
50 | find it's heartbeat area, though it will eventually support heartbeat | 50 | find its heartbeat area, though it will eventually support heartbeat |
51 | against raw disks. | 51 | against raw disks. |
52 | 52 | ||
53 | Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed | 53 | Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed |
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 867c5b50cb42..272f80d5f966 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt | |||
@@ -59,8 +59,19 @@ commit=nrsec (*) Ext3 can be told to sync all its data and metadata | |||
59 | Setting it to very large values will improve | 59 | Setting it to very large values will improve |
60 | performance. | 60 | performance. |
61 | 61 | ||
62 | barrier=1 This enables/disables barriers. barrier=0 disables | 62 | barrier=<0(*)|1> This enables/disables the use of write barriers in |
63 | it, barrier=1 enables it. | 63 | barrier the jbd code. barrier=0 disables, barrier=1 enables. |
64 | nobarrier (*) This also requires an IO stack which can support | ||
65 | barriers, and if jbd gets an error on a barrier | ||
66 | write, it will disable again with a warning. | ||
67 | Write barriers enforce proper on-disk ordering | ||
68 | of journal commits, making volatile disk write caches | ||
69 | safe to use, at some performance penalty. If | ||
70 | your disks are battery-backed in one way or another, | ||
71 | disabling barriers may safely improve performance. | ||
72 | The mount options "barrier" and "nobarrier" can | ||
73 | also be used to enable or disable barriers, for | ||
74 | consistency with other ext3 mount options. | ||
64 | 75 | ||
65 | orlov (*) This enables the new Orlov block allocator. It is | 76 | orlov (*) This enables the new Orlov block allocator. It is |
66 | enabled by default. | 77 | enabled by default. |
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt index 606233cd4618..1b805a0efbb0 100644 --- a/Documentation/filesystems/fiemap.txt +++ b/Documentation/filesystems/fiemap.txt | |||
@@ -38,7 +38,7 @@ flags, it will return EBADR and the contents of fm_flags will contain | |||
38 | the set of flags which caused the error. If the kernel is compatible | 38 | the set of flags which caused the error. If the kernel is compatible |
39 | with all flags passed, the contents of fm_flags will be unmodified. | 39 | with all flags passed, the contents of fm_flags will be unmodified. |
40 | It is up to userspace to determine whether rejection of a particular | 40 | It is up to userspace to determine whether rejection of a particular |
41 | flag is fatal to it's operation. This scheme is intended to allow the | 41 | flag is fatal to its operation. This scheme is intended to allow the |
42 | fiemap interface to grow in the future but without losing | 42 | fiemap interface to grow in the future but without losing |
43 | compatibility with old software. | 43 | compatibility with old software. |
44 | 44 | ||
@@ -56,7 +56,7 @@ If this flag is set, the kernel will sync the file before mapping extents. | |||
56 | 56 | ||
57 | * FIEMAP_FLAG_XATTR | 57 | * FIEMAP_FLAG_XATTR |
58 | If this flag is set, the extents returned will describe the inodes | 58 | If this flag is set, the extents returned will describe the inodes |
59 | extended attribute lookup tree, instead of it's data tree. | 59 | extended attribute lookup tree, instead of its data tree. |
60 | 60 | ||
61 | 61 | ||
62 | Extent Mapping | 62 | Extent Mapping |
@@ -89,7 +89,7 @@ struct fiemap_extent { | |||
89 | }; | 89 | }; |
90 | 90 | ||
91 | All offsets and lengths are in bytes and mirror those on disk. It is valid | 91 | All offsets and lengths are in bytes and mirror those on disk. It is valid |
92 | for an extents logical offset to start before the request or it's logical | 92 | for an extents logical offset to start before the request or its logical |
93 | length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is | 93 | length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is |
94 | returned, fe_logical, fe_physical, and fe_length will be aligned to the | 94 | returned, fe_logical, fe_physical, and fe_length will be aligned to the |
95 | block size of the file system. With the exception of extents flagged as | 95 | block size of the file system. With the exception of extents flagged as |
@@ -125,7 +125,7 @@ been allocated for the file yet. | |||
125 | 125 | ||
126 | * FIEMAP_EXTENT_DELALLOC | 126 | * FIEMAP_EXTENT_DELALLOC |
127 | - This will also set FIEMAP_EXTENT_UNKNOWN. | 127 | - This will also set FIEMAP_EXTENT_UNKNOWN. |
128 | Delayed allocation - while there is data for this extent, it's | 128 | Delayed allocation - while there is data for this extent, its |
129 | physical location has not been allocated yet. | 129 | physical location has not been allocated yet. |
130 | 130 | ||
131 | * FIEMAP_EXTENT_ENCODED | 131 | * FIEMAP_EXTENT_ENCODED |
@@ -159,7 +159,7 @@ Data is located within a meta data block. | |||
159 | Data is packed into a block with data from other files. | 159 | Data is packed into a block with data from other files. |
160 | 160 | ||
161 | * FIEMAP_EXTENT_UNWRITTEN | 161 | * FIEMAP_EXTENT_UNWRITTEN |
162 | Unwritten extent - the extent is allocated but it's data has not been | 162 | Unwritten extent - the extent is allocated but its data has not been |
163 | initialized. This indicates the extent's data will be all zero if read | 163 | initialized. This indicates the extent's data will be all zero if read |
164 | through the filesystem but the contents are undefined if read directly from | 164 | through the filesystem but the contents are undefined if read directly from |
165 | the device. | 165 | the device. |
@@ -176,7 +176,7 @@ VFS -> File System Implementation | |||
176 | 176 | ||
177 | File systems wishing to support fiemap must implement a ->fiemap callback on | 177 | File systems wishing to support fiemap must implement a ->fiemap callback on |
178 | their inode_operations structure. The fs ->fiemap call is responsible for | 178 | their inode_operations structure. The fs ->fiemap call is responsible for |
179 | defining it's set of supported fiemap flags, and calling a helper function on | 179 | defining its set of supported fiemap flags, and calling a helper function on |
180 | each discovered extent: | 180 | each discovered extent: |
181 | 181 | ||
182 | struct inode_operations { | 182 | struct inode_operations { |
diff --git a/Documentation/filesystems/fuse.txt b/Documentation/filesystems/fuse.txt index 397a41adb4c3..13af4a49e7db 100644 --- a/Documentation/filesystems/fuse.txt +++ b/Documentation/filesystems/fuse.txt | |||
@@ -91,7 +91,7 @@ Mount options | |||
91 | 'default_permissions' | 91 | 'default_permissions' |
92 | 92 | ||
93 | By default FUSE doesn't check file access permissions, the | 93 | By default FUSE doesn't check file access permissions, the |
94 | filesystem is free to implement it's access policy or leave it to | 94 | filesystem is free to implement its access policy or leave it to |
95 | the underlying file access mechanism (e.g. in case of network | 95 | the underlying file access mechanism (e.g. in case of network |
96 | filesystems). This option enables permission checking, restricting | 96 | filesystems). This option enables permission checking, restricting |
97 | access based on file mode. It is usually useful together with the | 97 | access based on file mode. It is usually useful together with the |
@@ -171,7 +171,7 @@ or may honor them by sending a reply to the _original_ request, with | |||
171 | the error set to EINTR. | 171 | the error set to EINTR. |
172 | 172 | ||
173 | It is also possible that there's a race between processing the | 173 | It is also possible that there's a race between processing the |
174 | original request and it's INTERRUPT request. There are two possibilities: | 174 | original request and its INTERRUPT request. There are two possibilities: |
175 | 175 | ||
176 | 1) The INTERRUPT request is processed before the original request is | 176 | 1) The INTERRUPT request is processed before the original request is |
177 | processed | 177 | processed |
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 5e3ab8f3beff..0b59c0200912 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt | |||
@@ -1,7 +1,7 @@ | |||
1 | Global File System | 1 | Global File System |
2 | ------------------ | 2 | ------------------ |
3 | 3 | ||
4 | http://sources.redhat.com/cluster/ | 4 | http://sources.redhat.com/cluster/wiki/ |
5 | 5 | ||
6 | GFS is a cluster file system. It allows a cluster of computers to | 6 | GFS is a cluster file system. It allows a cluster of computers to |
7 | simultaneously use a block device that is shared between them (with FC, | 7 | simultaneously use a block device that is shared between them (with FC, |
@@ -36,11 +36,11 @@ GFS2 is not on-disk compatible with previous versions of GFS, but it | |||
36 | is pretty close. | 36 | is pretty close. |
37 | 37 | ||
38 | The following man pages can be found at the URL above: | 38 | The following man pages can be found at the URL above: |
39 | fsck.gfs2 to repair a filesystem | 39 | fsck.gfs2 to repair a filesystem |
40 | gfs2_grow to expand a filesystem online | 40 | gfs2_grow to expand a filesystem online |
41 | gfs2_jadd to add journals to a filesystem online | 41 | gfs2_jadd to add journals to a filesystem online |
42 | gfs2_tool to manipulate, examine and tune a filesystem | 42 | gfs2_tool to manipulate, examine and tune a filesystem |
43 | gfs2_quota to examine and change quota values in a filesystem | 43 | gfs2_quota to examine and change quota values in a filesystem |
44 | gfs2_convert to convert a gfs filesystem to gfs2 in-place | 44 | gfs2_convert to convert a gfs filesystem to gfs2 in-place |
45 | mount.gfs2 to help mount(8) mount a filesystem | 45 | mount.gfs2 to help mount(8) mount a filesystem |
46 | mkfs.gfs2 to make a filesystem | 46 | mkfs.gfs2 to make a filesystem |
diff --git a/Documentation/filesystems/hpfs.txt b/Documentation/filesystems/hpfs.txt index fa45c3baed98..74630bd504fb 100644 --- a/Documentation/filesystems/hpfs.txt +++ b/Documentation/filesystems/hpfs.txt | |||
@@ -103,7 +103,7 @@ to analyze or change OS2SYS.INI. | |||
103 | Codepages | 103 | Codepages |
104 | 104 | ||
105 | HPFS can contain several uppercasing tables for several codepages and each | 105 | HPFS can contain several uppercasing tables for several codepages and each |
106 | file has a pointer to codepage it's name is in. However OS/2 was created in | 106 | file has a pointer to codepage its name is in. However OS/2 was created in |
107 | America where people don't care much about codepages and so multiple codepages | 107 | America where people don't care much about codepages and so multiple codepages |
108 | support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk. | 108 | support is quite buggy. I have Czech OS/2 working in codepage 852 on my disk. |
109 | Once I booted English OS/2 working in cp 850 and I created a file on my 852 | 109 | Once I booted English OS/2 working in cp 850 and I created a file on my 852 |
diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt index 3c367c3b3608..ba0a93384de0 100644 --- a/Documentation/filesystems/isofs.txt +++ b/Documentation/filesystems/isofs.txt | |||
@@ -41,7 +41,7 @@ Mount options unique to the isofs filesystem. | |||
41 | sbsector=xxx Session begins from sector xxx | 41 | sbsector=xxx Session begins from sector xxx |
42 | 42 | ||
43 | Recommended documents about ISO 9660 standard are located at: | 43 | Recommended documents about ISO 9660 standard are located at: |
44 | http://www.y-adagio.com/public/standards/iso_cdromr/tocont.htm | 44 | http://www.y-adagio.com/ |
45 | ftp://ftp.ecma.ch/ecma-st/Ecma-119.pdf | 45 | ftp://ftp.ecma.ch/ecma-st/Ecma-119.pdf |
46 | Quoting from the PDF "This 2nd Edition of Standard ECMA-119 is technically | 46 | Quoting from the PDF "This 2nd Edition of Standard ECMA-119 is technically |
47 | identical with ISO 9660.", so it is a valid and gratis substitute of the | 47 | identical with ISO 9660.", so it is a valid and gratis substitute of the |
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt index e64c94ba401a..bca42c22a143 100644 --- a/Documentation/filesystems/logfs.txt +++ b/Documentation/filesystems/logfs.txt | |||
@@ -59,7 +59,7 @@ Levels | |||
59 | ------ | 59 | ------ |
60 | 60 | ||
61 | Garbage collection (GC) may fail if all data is written | 61 | Garbage collection (GC) may fail if all data is written |
62 | indiscriminately. One requirement of GC is that data is seperated | 62 | indiscriminately. One requirement of GC is that data is separated |
63 | roughly according to the distance between the tree root and the data. | 63 | roughly according to the distance between the tree root and the data. |
64 | Effectively that means all file data is on level 0, indirect blocks | 64 | Effectively that means all file data is on level 0, indirect blocks |
65 | are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, | 65 | are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, |
@@ -67,7 +67,7 @@ respectively. Inode file data is on level 6 for the inodes and 7-11 | |||
67 | for indirect blocks. | 67 | for indirect blocks. |
68 | 68 | ||
69 | Each segment contains objects of a single level only. As a result, | 69 | Each segment contains objects of a single level only. As a result, |
70 | each level requires its own seperate segment to be open for writing. | 70 | each level requires its own separate segment to be open for writing. |
71 | 71 | ||
72 | Inode File | 72 | Inode File |
73 | ---------- | 73 | ---------- |
@@ -106,9 +106,9 @@ Vim | |||
106 | --- | 106 | --- |
107 | 107 | ||
108 | By cleverly predicting the life time of data, it is possible to | 108 | By cleverly predicting the life time of data, it is possible to |
109 | seperate long-living data from short-living data and thereby reduce | 109 | separate long-living data from short-living data and thereby reduce |
110 | the GC overhead later. Each type of distinc life expectency (vim) can | 110 | the GC overhead later. Each type of distinc life expectency (vim) can |
111 | have a seperate segment open for writing. Each (level, vim) tupel can | 111 | have a separate segment open for writing. Each (level, vim) tupel can |
112 | be open just once. If an open segment with unknown vim is encountered | 112 | be open just once. If an open segment with unknown vim is encountered |
113 | at mount time, it is closed and ignored henceforth. | 113 | at mount time, it is closed and ignored henceforth. |
114 | 114 | ||
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 6a53a84afc72..04884914a1c8 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt | |||
@@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | Section 18.17 | | |||
137 | | READ | REQ | | Section 18.22 | | 137 | | READ | REQ | | Section 18.22 | |
138 | | READDIR | REQ | | Section 18.23 | | 138 | | READDIR | REQ | | Section 18.23 | |
139 | | READLINK | OPT | | Section 18.24 | | 139 | | READLINK | OPT | | Section 18.24 | |
140 | NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | | 140 | | RECLAIM_COMPLETE | REQ | | Section 18.51 | |
141 | | RELEASE_LOCKOWNER | MNI | | N/A | | 141 | | RELEASE_LOCKOWNER | MNI | | N/A | |
142 | | REMOVE | REQ | | Section 18.25 | | 142 | | REMOVE | REQ | | Section 18.25 | |
143 | | RENAME | REQ | | Section 18.26 | | 143 | | RENAME | REQ | | Section 18.26 | |
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index 3ba0b945aaf8..f2430a7974e1 100644 --- a/Documentation/filesystems/nfs/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt | |||
@@ -124,6 +124,8 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf> | |||
124 | 124 | ||
125 | <hostname> Name of the client. May be supplied by autoconfiguration, | 125 | <hostname> Name of the client. May be supplied by autoconfiguration, |
126 | but its absence will not trigger autoconfiguration. | 126 | but its absence will not trigger autoconfiguration. |
127 | If specified and DHCP is used, the user provided hostname will | ||
128 | be carried in the DHCP request to hopefully update DNS record. | ||
127 | 129 | ||
128 | Default: Client IP address is used in ASCII notation. | 130 | Default: Client IP address is used in ASCII notation. |
129 | 131 | ||
diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt index 8a382bea6808..ebcaaee21616 100644 --- a/Documentation/filesystems/nfs/rpc-cache.txt +++ b/Documentation/filesystems/nfs/rpc-cache.txt | |||
@@ -185,7 +185,7 @@ failed lookup meant a definite 'no'. | |||
185 | request/response format | 185 | request/response format |
186 | ----------------------- | 186 | ----------------------- |
187 | 187 | ||
188 | While each cache is free to use it's own format for requests | 188 | While each cache is free to use its own format for requests |
189 | and responses over channel, the following is recommended as | 189 | and responses over channel, the following is recommended as |
190 | appropriate and support routines are available to help: | 190 | appropriate and support routines are available to help: |
191 | Each request or response record should be printable ASCII | 191 | Each request or response record should be printable ASCII |
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index cf6d0d85ca82..d5c0cef38a71 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt | |||
@@ -49,9 +49,12 @@ Mount options | |||
49 | NILFS2 supports the following mount options: | 49 | NILFS2 supports the following mount options: |
50 | (*) == default | 50 | (*) == default |
51 | 51 | ||
52 | nobarrier Disables barriers. | 52 | barrier(*) This enables/disables the use of write barriers. This |
53 | errors=continue(*) Keep going on a filesystem error. | 53 | nobarrier requires an IO stack which can support barriers, and |
54 | errors=remount-ro Remount the filesystem read-only on an error. | 54 | if nilfs gets an error on a barrier write, it will |
55 | disable again with a warning. | ||
56 | errors=continue Keep going on a filesystem error. | ||
57 | errors=remount-ro(*) Remount the filesystem read-only on an error. | ||
55 | errors=panic Panic and halt the machine if an error occurs. | 58 | errors=panic Panic and halt the machine if an error occurs. |
56 | cp=n Specify the checkpoint-number of the snapshot to be | 59 | cp=n Specify the checkpoint-number of the snapshot to be |
57 | mounted. Checkpoints and snapshots are listed by lscp | 60 | mounted. Checkpoints and snapshots are listed by lscp |
@@ -74,9 +77,10 @@ norecovery Disable recovery of the filesystem on mount. | |||
74 | This disables every write access on the device for | 77 | This disables every write access on the device for |
75 | read-only mounts or snapshots. This option will fail | 78 | read-only mounts or snapshots. This option will fail |
76 | for r/w mounts on an unclean volume. | 79 | for r/w mounts on an unclean volume. |
77 | discard Issue discard/TRIM commands to the underlying block | 80 | discard This enables/disables the use of discard/TRIM commands. |
78 | device when blocks are freed. This is useful for SSD | 81 | nodiscard(*) The discard/TRIM commands are sent to the underlying |
79 | devices and sparse/thinly-provisioned LUNs. | 82 | block device when blocks are freed. This is useful |
83 | for SSD devices and sparse/thinly-provisioned LUNs. | ||
80 | 84 | ||
81 | NILFS2 usage | 85 | NILFS2 usage |
82 | ============ | 86 | ============ |
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index c58b9f5ba002..1f7ae144f6d8 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt | |||
@@ -80,3 +80,10 @@ user_xattr (*) Enables Extended User Attributes. | |||
80 | nouser_xattr Disables Extended User Attributes. | 80 | nouser_xattr Disables Extended User Attributes. |
81 | acl Enables POSIX Access Control Lists support. | 81 | acl Enables POSIX Access Control Lists support. |
82 | noacl (*) Disables POSIX Access Control Lists support. | 82 | noacl (*) Disables POSIX Access Control Lists support. |
83 | resv_level=2 (*) Set how agressive allocation reservations will be. | ||
84 | Valid values are between 0 (reservations off) to 8 | ||
85 | (maximum space for reservations). | ||
86 | dir_resv_level= (*) By default, directory reservations will scale with file | ||
87 | reservations - users should rarely need to change this | ||
88 | value. If allocation reservations are turned off, this | ||
89 | option will have no effect. | ||
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index a7e9746ee7ea..b12c89538680 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting | |||
@@ -273,3 +273,48 @@ it's safe to remove it. If you don't need it, remove it. | |||
273 | deliberate; as soon as struct block_device * is propagated in a reasonable | 273 | deliberate; as soon as struct block_device * is propagated in a reasonable |
274 | way by that code fixing will become trivial; until then nothing can be | 274 | way by that code fixing will become trivial; until then nothing can be |
275 | done. | 275 | done. |
276 | |||
277 | [mandatory] | ||
278 | |||
279 | block truncatation on error exit from ->write_begin, and ->direct_IO | ||
280 | moved from generic methods (block_write_begin, cont_write_begin, | ||
281 | nobh_write_begin, blockdev_direct_IO*) to callers. Take a look at | ||
282 | ext2_write_failed and callers for an example. | ||
283 | |||
284 | [mandatory] | ||
285 | |||
286 | ->truncate is going away. The whole truncate sequence needs to be | ||
287 | implemented in ->setattr, which is now mandatory for filesystems | ||
288 | implementing on-disk size changes. Start with a copy of the old inode_setattr | ||
289 | and vmtruncate, and the reorder the vmtruncate + foofs_vmtruncate sequence to | ||
290 | be in order of zeroing blocks using block_truncate_page or similar helpers, | ||
291 | size update and on finally on-disk truncation which should not fail. | ||
292 | inode_change_ok now includes the size checks for ATTR_SIZE and must be called | ||
293 | in the beginning of ->setattr unconditionally. | ||
294 | |||
295 | [mandatory] | ||
296 | |||
297 | ->clear_inode() and ->delete_inode() are gone; ->evict_inode() should | ||
298 | be used instead. It gets called whenever the inode is evicted, whether it has | ||
299 | remaining links or not. Caller does *not* evict the pagecache or inode-associated | ||
300 | metadata buffers; getting rid of those is responsibility of method, as it had | ||
301 | been for ->delete_inode(). | ||
302 | ->drop_inode() returns int now; it's called on final iput() with inode_lock | ||
303 | held and it returns true if filesystems wants the inode to be dropped. As before, | ||
304 | generic_drop_inode() is still the default and it's been updated appropriately. | ||
305 | generic_delete_inode() is also alive and it consists simply of return 1. Note that | ||
306 | all actual eviction work is done by caller after ->drop_inode() returns. | ||
307 | clear_inode() is gone; use end_writeback() instead. As before, it must | ||
308 | be called exactly once on each call of ->evict_inode() (as it used to be for | ||
309 | each call of ->delete_inode()). Unlike before, if you are using inode-associated | ||
310 | metadata buffers (i.e. mark_buffer_dirty_inode()), it's your responsibility to | ||
311 | call invalidate_inode_buffers() before end_writeback(). | ||
312 | No async writeback (and thus no calls of ->write_inode()) will happen | ||
313 | after end_writeback() returns, so actions that should not overlap with ->write_inode() | ||
314 | (e.g. freeing on-disk inode if i_nlink is 0) ought to be done after that call. | ||
315 | |||
316 | NOTE: checking i_nlink in the beginning of ->write_inode() and bailing out | ||
317 | if it's zero is not *and* *never* *had* *been* enough. Final unlink() and iput() | ||
318 | may happen while the inode is in the middle of ->write_inode(); e.g. if you blindly | ||
319 | free the on-disk inode, you may end up doing that while ->write_inode() is writing | ||
320 | to it. | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a4f30faa4f1f..a6aca8740883 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -33,7 +33,8 @@ Table of Contents | |||
33 | 2 Modifying System Parameters | 33 | 2 Modifying System Parameters |
34 | 34 | ||
35 | 3 Per-Process Parameters | 35 | 3 Per-Process Parameters |
36 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 36 | 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer |
37 | score | ||
37 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score | 38 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score |
38 | 3.3 /proc/<pid>/io - Display the IO accounting fields | 39 | 3.3 /proc/<pid>/io - Display the IO accounting fields |
39 | 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings | 40 | 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings |
@@ -73,9 +74,9 @@ contact Bodo Bauer at bb@ricochet.net. We'll be happy to add them to this | |||
73 | document. | 74 | document. |
74 | 75 | ||
75 | The latest version of this document is available online at | 76 | The latest version of this document is available online at |
76 | http://skaro.nightcrawler.com/~bb/Docs/Proc as HTML version. | 77 | http://tldp.org/LDP/Linux-Filesystem-Hierarchy/html/proc.html |
77 | 78 | ||
78 | If the above direction does not works for you, ypu could try the kernel | 79 | If the above direction does not works for you, you could try the kernel |
79 | mailing list at linux-kernel@vger.kernel.org and/or try to reach me at | 80 | mailing list at linux-kernel@vger.kernel.org and/or try to reach me at |
80 | comandante@zaralinux.com. | 81 | comandante@zaralinux.com. |
81 | 82 | ||
@@ -305,7 +306,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) | |||
305 | cgtime guest time of the task children in jiffies | 306 | cgtime guest time of the task children in jiffies |
306 | .............................................................................. | 307 | .............................................................................. |
307 | 308 | ||
308 | The /proc/PID/map file containing the currently mapped memory regions and | 309 | The /proc/PID/maps file containing the currently mapped memory regions and |
309 | their access permissions. | 310 | their access permissions. |
310 | 311 | ||
311 | The format is: | 312 | The format is: |
@@ -316,7 +317,7 @@ address perms offset dev inode pathname | |||
316 | 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test | 317 | 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test |
317 | 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] | 318 | 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] |
318 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 | 319 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 |
319 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] | 320 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 |
320 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 | 321 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 |
321 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 | 322 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 |
322 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 | 323 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 |
@@ -352,7 +353,6 @@ is not associated with a file: | |||
352 | [stack] = the stack of the main process | 353 | [stack] = the stack of the main process |
353 | [vdso] = the "virtual dynamic shared object", | 354 | [vdso] = the "virtual dynamic shared object", |
354 | the kernel system call handler | 355 | the kernel system call handler |
355 | [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size | ||
356 | 356 | ||
357 | or if empty, the mapping is anonymous. | 357 | or if empty, the mapping is anonymous. |
358 | 358 | ||
@@ -566,6 +566,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the | |||
566 | IRQs which have not yet been allocated/activated, and hence which lack a | 566 | IRQs which have not yet been allocated/activated, and hence which lack a |
567 | /proc/irq/[0-9]* directory. | 567 | /proc/irq/[0-9]* directory. |
568 | 568 | ||
569 | The node file on an SMP system shows the node to which the device using the IRQ | ||
570 | reports itself as being attached. This hardware locality information does not | ||
571 | include information about any possible driver locality preference. | ||
572 | |||
569 | prof_cpu_mask specifies which CPUs are to be profiled by the system wide | 573 | prof_cpu_mask specifies which CPUs are to be profiled by the system wide |
570 | profiler. Default value is ffffffff (all cpus). | 574 | profiler. Default value is ffffffff (all cpus). |
571 | 575 | ||
@@ -965,7 +969,7 @@ your system and how much traffic was routed over those devices: | |||
965 | ...] 1375103 17405 0 0 0 0 0 0 | 969 | ...] 1375103 17405 0 0 0 0 0 0 |
966 | ...] 1703981 5535 0 0 0 3 0 0 | 970 | ...] 1703981 5535 0 0 0 3 0 0 |
967 | 971 | ||
968 | In addition, each Channel Bond interface has it's own directory. For | 972 | In addition, each Channel Bond interface has its own directory. For |
969 | example, the bond0 device will have a directory called /proc/net/bond0/. | 973 | example, the bond0 device will have a directory called /proc/net/bond0/. |
970 | It will contain information that is specific to that bond, such as the | 974 | It will contain information that is specific to that bond, such as the |
971 | current slaves of the bond, the link status of the slaves, and how | 975 | current slaves of the bond, the link status of the slaves, and how |
@@ -1231,42 +1235,64 @@ of the kernel. | |||
1231 | CHAPTER 3: PER-PROCESS PARAMETERS | 1235 | CHAPTER 3: PER-PROCESS PARAMETERS |
1232 | ------------------------------------------------------------------------------ | 1236 | ------------------------------------------------------------------------------ |
1233 | 1237 | ||
1234 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 1238 | 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score |
1235 | ------------------------------------------------------ | 1239 | -------------------------------------------------------------------------------- |
1236 | 1240 | ||
1237 | This file can be used to adjust the score used to select which processes | 1241 | These file can be used to adjust the badness heuristic used to select which |
1238 | should be killed in an out-of-memory situation. Giving it a high score will | 1242 | process gets killed in out of memory conditions. |
1239 | increase the likelihood of this process being killed by the oom-killer. Valid | 1243 | |
1240 | values are in the range -16 to +15, plus the special value -17, which disables | 1244 | The badness heuristic assigns a value to each candidate task ranging from 0 |
1241 | oom-killing altogether for this process. | 1245 | (never kill) to 1000 (always kill) to determine which process is targeted. The |
1242 | 1246 | units are roughly a proportion along that range of allowed memory the process | |
1243 | The process to be killed in an out-of-memory situation is selected among all others | 1247 | may allocate from based on an estimation of its current memory and swap use. |
1244 | based on its badness score. This value equals the original memory size of the process | 1248 | For example, if a task is using all allowed memory, its badness score will be |
1245 | and is then updated according to its CPU time (utime + stime) and the | 1249 | 1000. If it is using half of its allowed memory, its score will be 500. |
1246 | run time (uptime - start time). The longer it runs the smaller is the score. | 1250 | |
1247 | Badness score is divided by the square root of the CPU time and then by | 1251 | There is an additional factor included in the badness score: root |
1248 | the double square root of the run time. | 1252 | processes are given 3% extra memory over other tasks. |
1249 | 1253 | ||
1250 | Swapped out tasks are killed first. Half of each child's memory size is added to | 1254 | The amount of "allowed" memory depends on the context in which the oom killer |
1251 | the parent's score if they do not share the same memory. Thus forking servers | 1255 | was called. If it is due to the memory assigned to the allocating task's cpuset |
1252 | are the prime candidates to be killed. Having only one 'hungry' child will make | 1256 | being exhausted, the allowed memory represents the set of mems assigned to that |
1253 | parent less preferable than the child. | 1257 | cpuset. If it is due to a mempolicy's node(s) being exhausted, the allowed |
1254 | 1258 | memory represents the set of mempolicy nodes. If it is due to a memory | |
1255 | /proc/<pid>/oom_score shows process' current badness score. | 1259 | limit (or swap limit) being reached, the allowed memory is that configured |
1256 | 1260 | limit. Finally, if it is due to the entire system being out of memory, the | |
1257 | The following heuristics are then applied: | 1261 | allowed memory represents all allocatable resources. |
1258 | * if the task was reniced, its score doubles | 1262 | |
1259 | * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE | 1263 | The value of /proc/<pid>/oom_score_adj is added to the badness score before it |
1260 | or CAP_SYS_RAWIO) have their score divided by 4 | 1264 | is used to determine which task to kill. Acceptable values range from -1000 |
1261 | * if oom condition happened in one cpuset and checked process does not belong | 1265 | (OOM_SCORE_ADJ_MIN) to +1000 (OOM_SCORE_ADJ_MAX). This allows userspace to |
1262 | to it, its score is divided by 8 | 1266 | polarize the preference for oom killing either by always preferring a certain |
1263 | * the resulting score is multiplied by two to the power of oom_adj, i.e. | 1267 | task or completely disabling it. The lowest possible value, -1000, is |
1264 | points <<= oom_adj when it is positive and | 1268 | equivalent to disabling oom killing entirely for that task since it will always |
1265 | points >>= -(oom_adj) otherwise | 1269 | report a badness score of 0. |
1266 | 1270 | ||
1267 | The task with the highest badness score is then selected and its children | 1271 | Consequently, it is very simple for userspace to define the amount of memory to |
1268 | are killed, process itself will be killed in an OOM situation when it does | 1272 | consider for each task. Setting a /proc/<pid>/oom_score_adj value of +500, for |
1269 | not have children or some of them disabled oom like described above. | 1273 | example, is roughly equivalent to allowing the remainder of tasks sharing the |
1274 | same system, cpuset, mempolicy, or memory controller resources to use at least | ||
1275 | 50% more memory. A value of -500, on the other hand, would be roughly | ||
1276 | equivalent to discounting 50% of the task's allowed memory from being considered | ||
1277 | as scoring against the task. | ||
1278 | |||
1279 | For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also | ||
1280 | be used to tune the badness score. Its acceptable values range from -16 | ||
1281 | (OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 | ||
1282 | (OOM_DISABLE) to disable oom killing entirely for that task. Its value is | ||
1283 | scaled linearly with /proc/<pid>/oom_score_adj. | ||
1284 | |||
1285 | Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the | ||
1286 | other with its scaled value. | ||
1287 | |||
1288 | NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see | ||
1289 | Documentation/feature-removal-schedule.txt. | ||
1290 | |||
1291 | Caveat: when a parent task is selected, the oom killer will sacrifice any first | ||
1292 | generation children with seperate address spaces instead, if possible. This | ||
1293 | avoids servers and important system daemons from being killed and loses the | ||
1294 | minimal amount of work. | ||
1295 | |||
1270 | 1296 | ||
1271 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score | 1297 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score |
1272 | ------------------------------------------------------------- | 1298 | ------------------------------------------------------------- |
@@ -1362,7 +1388,7 @@ been accounted as having caused 1MB of write. | |||
1362 | In other words: The number of bytes which this process caused to not happen, | 1388 | In other words: The number of bytes which this process caused to not happen, |
1363 | by truncating pagecache. A task can cause "negative" IO too. If this task | 1389 | by truncating pagecache. A task can cause "negative" IO too. If this task |
1364 | truncates some dirty pagecache, some IO which another task has been accounted | 1390 | truncates some dirty pagecache, some IO which another task has been accounted |
1365 | for (in it's write_bytes) will not be happening. We _could_ just subtract that | 1391 | for (in its write_bytes) will not be happening. We _could_ just subtract that |
1366 | from the truncating task's write_bytes, but there is information loss in doing | 1392 | from the truncating task's write_bytes, but there is information loss in doing |
1367 | that. | 1393 | that. |
1368 | 1394 | ||
diff --git a/Documentation/filesystems/smbfs.txt b/Documentation/filesystems/smbfs.txt index f673ef0de0f7..194fb0decd2c 100644 --- a/Documentation/filesystems/smbfs.txt +++ b/Documentation/filesystems/smbfs.txt | |||
@@ -3,6 +3,6 @@ protocol used by Windows for Workgroups, Windows 95 and Windows NT. | |||
3 | Smbfs was inspired by Samba, the program written by Andrew Tridgell | 3 | Smbfs was inspired by Samba, the program written by Andrew Tridgell |
4 | that turns any Unix host into a file server for DOS or Windows clients. | 4 | that turns any Unix host into a file server for DOS or Windows clients. |
5 | 5 | ||
6 | Smbfs is a SMB client, but uses parts of samba for it's operation. For | 6 | Smbfs is a SMB client, but uses parts of samba for its operation. For |
7 | more info on samba, including documentation, please go to | 7 | more info on samba, including documentation, please go to |
8 | http://www.samba.org/ and then on to your nearest mirror. | 8 | http://www.samba.org/ and then on to your nearest mirror. |
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 85354b32d731..74eaac26f8b8 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt | |||
@@ -39,7 +39,7 @@ files, each with their own function. | |||
39 | local_cpus nearby CPU mask (cpumask, ro) | 39 | local_cpus nearby CPU mask (cpumask, ro) |
40 | remove remove device from kernel's list (ascii, wo) | 40 | remove remove device from kernel's list (ascii, wo) |
41 | resource PCI resource host addresses (ascii, ro) | 41 | resource PCI resource host addresses (ascii, ro) |
42 | resource0..N PCI resource N, if present (binary, mmap) | 42 | resource0..N PCI resource N, if present (binary, mmap, rw[1]) |
43 | resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) | 43 | resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) |
44 | rom PCI ROM resource, if present (binary, ro) | 44 | rom PCI ROM resource, if present (binary, ro) |
45 | subsystem_device PCI subsystem device (ascii, ro) | 45 | subsystem_device PCI subsystem device (ascii, ro) |
@@ -54,13 +54,16 @@ files, each with their own function. | |||
54 | binary - file contains binary data | 54 | binary - file contains binary data |
55 | cpumask - file contains a cpumask type | 55 | cpumask - file contains a cpumask type |
56 | 56 | ||
57 | [1] rw for RESOURCE_IO (I/O port) regions only | ||
58 | |||
57 | The read only files are informational, writes to them will be ignored, with | 59 | The read only files are informational, writes to them will be ignored, with |
58 | the exception of the 'rom' file. Writable files can be used to perform | 60 | the exception of the 'rom' file. Writable files can be used to perform |
59 | actions on the device (e.g. changing config space, detaching a device). | 61 | actions on the device (e.g. changing config space, detaching a device). |
60 | mmapable files are available via an mmap of the file at offset 0 and can be | 62 | mmapable files are available via an mmap of the file at offset 0 and can be |
61 | used to do actual device programming from userspace. Note that some platforms | 63 | used to do actual device programming from userspace. Note that some platforms |
62 | don't support mmapping of certain resources, so be sure to check the return | 64 | don't support mmapping of certain resources, so be sure to check the return |
63 | value from any attempted mmap. | 65 | value from any attempted mmap. The most notable of these are I/O port |
66 | resources, which also provide read/write access. | ||
64 | 67 | ||
65 | The 'enable' file provides a counter that indicates how many times the device | 68 | The 'enable' file provides a counter that indicates how many times the device |
66 | has been enabled. If the 'enable' file currently returns '4', and a '1' is | 69 | has been enabled. If the 'enable' file currently returns '4', and a '1' is |
diff --git a/Documentation/filesystems/sysfs-tagging.txt b/Documentation/filesystems/sysfs-tagging.txt new file mode 100644 index 000000000000..caaaf1266d8f --- /dev/null +++ b/Documentation/filesystems/sysfs-tagging.txt | |||
@@ -0,0 +1,42 @@ | |||
1 | Sysfs tagging | ||
2 | ------------- | ||
3 | |||
4 | (Taken almost verbatim from Eric Biederman's netns tagging patch | ||
5 | commit msg) | ||
6 | |||
7 | The problem. Network devices show up in sysfs and with the network | ||
8 | namespace active multiple devices with the same name can show up in | ||
9 | the same directory, ouch! | ||
10 | |||
11 | To avoid that problem and allow existing applications in network | ||
12 | namespaces to see the same interface that is currently presented in | ||
13 | sysfs, sysfs now has tagging directory support. | ||
14 | |||
15 | By using the network namespace pointers as tags to separate out the | ||
16 | the sysfs directory entries we ensure that we don't have conflicts | ||
17 | in the directories and applications only see a limited set of | ||
18 | the network devices. | ||
19 | |||
20 | Each sysfs directory entry may be tagged with zero or one | ||
21 | namespaces. A sysfs_dirent is augmented with a void *s_ns. If a | ||
22 | directory entry is tagged, then sysfs_dirent->s_flags will have a | ||
23 | flag between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES, and s_ns will | ||
24 | point to the namespace to which it belongs. | ||
25 | |||
26 | Each sysfs superblock's sysfs_super_info contains an array void | ||
27 | *ns[KOBJ_NS_TYPES]. When a a task in a tagging namespace | ||
28 | kobj_nstype first mounts sysfs, a new superblock is created. It | ||
29 | will be differentiated from other sysfs mounts by having its | ||
30 | s_fs_info->ns[kobj_nstype] set to the new namespace. Note that | ||
31 | through bind mounting and mounts propagation, a task can easily view | ||
32 | the contents of other namespaces' sysfs mounts. Therefore, when a | ||
33 | namespace exits, it will call kobj_ns_exit() to invalidate any | ||
34 | sysfs_dirent->s_ns pointers pointing to it. | ||
35 | |||
36 | Users of this interface: | ||
37 | - define a type in the kobj_ns_type enumeration. | ||
38 | - call kobj_ns_type_register() with its kobj_ns_type_operations which has | ||
39 | - current_ns() which returns current's namespace | ||
40 | - netlink_ns() which returns a socket's namespace | ||
41 | - initial_ns() which returns the initial namesapce | ||
42 | - call kobj_ns_exit() when an individual tag is no longer valid | ||
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 931c806642c5..5d1335faec2d 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt | |||
@@ -4,7 +4,7 @@ sysfs - _The_ filesystem for exporting kernel objects. | |||
4 | Patrick Mochel <mochel@osdl.org> | 4 | Patrick Mochel <mochel@osdl.org> |
5 | Mike Murphy <mamurph@cs.clemson.edu> | 5 | Mike Murphy <mamurph@cs.clemson.edu> |
6 | 6 | ||
7 | Revised: 22 February 2009 | 7 | Revised: 15 July 2010 |
8 | Original: 10 January 2003 | 8 | Original: 10 January 2003 |
9 | 9 | ||
10 | 10 | ||
@@ -124,7 +124,7 @@ show and store methods of the attribute owners. | |||
124 | 124 | ||
125 | struct sysfs_ops { | 125 | struct sysfs_ops { |
126 | ssize_t (*show)(struct kobject *, struct attribute *, char *); | 126 | ssize_t (*show)(struct kobject *, struct attribute *, char *); |
127 | ssize_t (*store)(struct kobject *, struct attribute *, const char *); | 127 | ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); |
128 | }; | 128 | }; |
129 | 129 | ||
130 | [ Subsystems should have already defined a struct kobj_type as a | 130 | [ Subsystems should have already defined a struct kobj_type as a |
@@ -139,18 +139,22 @@ calls the associated methods. | |||
139 | 139 | ||
140 | To illustrate: | 140 | To illustrate: |
141 | 141 | ||
142 | #define to_dev(obj) container_of(obj, struct device, kobj) | ||
142 | #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) | 143 | #define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr) |
143 | #define to_dev(d) container_of(d, struct device, kobj) | ||
144 | 144 | ||
145 | static ssize_t | 145 | static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, |
146 | dev_attr_show(struct kobject * kobj, struct attribute * attr, char * buf) | 146 | char *buf) |
147 | { | 147 | { |
148 | struct device_attribute * dev_attr = to_dev_attr(attr); | 148 | struct device_attribute *dev_attr = to_dev_attr(attr); |
149 | struct device * dev = to_dev(kobj); | 149 | struct device *dev = to_dev(kobj); |
150 | ssize_t ret = 0; | 150 | ssize_t ret = -EIO; |
151 | 151 | ||
152 | if (dev_attr->show) | 152 | if (dev_attr->show) |
153 | ret = dev_attr->show(dev, buf); | 153 | ret = dev_attr->show(dev, dev_attr, buf); |
154 | if (ret >= (ssize_t)PAGE_SIZE) { | ||
155 | print_symbol("dev_attr_show: %s returned bad count\n", | ||
156 | (unsigned long)dev_attr->show); | ||
157 | } | ||
154 | return ret; | 158 | return ret; |
155 | } | 159 | } |
156 | 160 | ||
@@ -163,10 +167,9 @@ To read or write attributes, show() or store() methods must be | |||
163 | specified when declaring the attribute. The method types should be as | 167 | specified when declaring the attribute. The method types should be as |
164 | simple as those defined for device attributes: | 168 | simple as those defined for device attributes: |
165 | 169 | ||
166 | ssize_t (*show)(struct device * dev, struct device_attribute * attr, | 170 | ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); |
167 | char * buf); | 171 | ssize_t (*store)(struct device *dev, struct device_attribute *attr, |
168 | ssize_t (*store)(struct device * dev, struct device_attribute * attr, | 172 | const char *buf, size_t count); |
169 | const char * buf); | ||
170 | 173 | ||
171 | IOW, they should take only an object, an attribute, and a buffer as parameters. | 174 | IOW, they should take only an object, an attribute, and a buffer as parameters. |
172 | 175 | ||
@@ -209,8 +212,8 @@ Other notes: | |||
209 | 212 | ||
210 | - show() should always use snprintf(). | 213 | - show() should always use snprintf(). |
211 | 214 | ||
212 | - store() should return the number of bytes used from the buffer. This | 215 | - store() should return the number of bytes used from the buffer. If the |
213 | can be done using strlen(). | 216 | entire buffer has been used, just return the count argument. |
214 | 217 | ||
215 | - show() or store() can always return errors. If a bad value comes | 218 | - show() or store() can always return errors. If a bad value comes |
216 | through, be sure to return an error. | 219 | through, be sure to return an error. |
@@ -223,15 +226,18 @@ Other notes: | |||
223 | 226 | ||
224 | A very simple (and naive) implementation of a device attribute is: | 227 | A very simple (and naive) implementation of a device attribute is: |
225 | 228 | ||
226 | static ssize_t show_name(struct device *dev, struct device_attribute *attr, char *buf) | 229 | static ssize_t show_name(struct device *dev, struct device_attribute *attr, |
230 | char *buf) | ||
227 | { | 231 | { |
228 | return snprintf(buf, PAGE_SIZE, "%s\n", dev->name); | 232 | return snprintf(buf, PAGE_SIZE, "%s\n", dev->name); |
229 | } | 233 | } |
230 | 234 | ||
231 | static ssize_t store_name(struct device * dev, const char * buf) | 235 | static ssize_t store_name(struct device *dev, struct device_attribute *attr, |
236 | const char *buf, size_t count) | ||
232 | { | 237 | { |
233 | sscanf(buf, "%20s", dev->name); | 238 | snprintf(dev->name, sizeof(dev->name), "%.*s", |
234 | return strnlen(buf, PAGE_SIZE); | 239 | (int)min(count, sizeof(dev->name) - 1), buf); |
240 | return count; | ||
235 | } | 241 | } |
236 | 242 | ||
237 | static DEVICE_ATTR(name, S_IRUGO, show_name, store_name); | 243 | static DEVICE_ATTR(name, S_IRUGO, show_name, store_name); |
@@ -327,7 +333,7 @@ Structure: | |||
327 | struct bus_attribute { | 333 | struct bus_attribute { |
328 | struct attribute attr; | 334 | struct attribute attr; |
329 | ssize_t (*show)(struct bus_type *, char * buf); | 335 | ssize_t (*show)(struct bus_type *, char * buf); |
330 | ssize_t (*store)(struct bus_type *, const char * buf); | 336 | ssize_t (*store)(struct bus_type *, const char * buf, size_t count); |
331 | }; | 337 | }; |
332 | 338 | ||
333 | Declaring: | 339 | Declaring: |
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt index fe09a2cb1858..98ef55124158 100644 --- a/Documentation/filesystems/tmpfs.txt +++ b/Documentation/filesystems/tmpfs.txt | |||
@@ -94,11 +94,19 @@ NodeList format is a comma-separated list of decimal numbers and ranges, | |||
94 | a range being two hyphen-separated decimal numbers, the smallest and | 94 | a range being two hyphen-separated decimal numbers, the smallest and |
95 | largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 | 95 | largest node numbers in the range. For example, mpol=bind:0-3,5,7,9-15 |
96 | 96 | ||
97 | A memory policy with a valid NodeList will be saved, as specified, for | ||
98 | use at file creation time. When a task allocates a file in the file | ||
99 | system, the mount option memory policy will be applied with a NodeList, | ||
100 | if any, modified by the calling task's cpuset constraints | ||
101 | [See Documentation/cgroups/cpusets.txt] and any optional flags, listed | ||
102 | below. If the resulting NodeLists is the empty set, the effective memory | ||
103 | policy for the file will revert to "default" policy. | ||
104 | |||
97 | NUMA memory allocation policies have optional flags that can be used in | 105 | NUMA memory allocation policies have optional flags that can be used in |
98 | conjunction with their modes. These optional flags can be specified | 106 | conjunction with their modes. These optional flags can be specified |
99 | when tmpfs is mounted by appending them to the mode before the NodeList. | 107 | when tmpfs is mounted by appending them to the mode before the NodeList. |
100 | See Documentation/vm/numa_memory_policy.txt for a list of all available | 108 | See Documentation/vm/numa_memory_policy.txt for a list of all available |
101 | memory allocation policy mode flags. | 109 | memory allocation policy mode flags and their effect on memory policy. |
102 | 110 | ||
103 | =static is equivalent to MPOL_F_STATIC_NODES | 111 | =static is equivalent to MPOL_F_STATIC_NODES |
104 | =relative is equivalent to MPOL_F_RELATIVE_NODES | 112 | =relative is equivalent to MPOL_F_RELATIVE_NODES |
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index eed520fd0c8e..ead764b2728f 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt | |||
@@ -165,7 +165,8 @@ TEST SUITE | |||
165 | If you plan to make any modifications to the vfat filesystem, please | 165 | If you plan to make any modifications to the vfat filesystem, please |
166 | get the test suite that comes with the vfat distribution at | 166 | get the test suite that comes with the vfat distribution at |
167 | 167 | ||
168 | http://bmrc.berkeley.edu/people/chaffee/vfat.html | 168 | http://web.archive.org/web/*/http://bmrc.berkeley.edu/ |
169 | people/chaffee/vfat.html | ||
169 | 170 | ||
170 | This tests quite a few parts of the vfat filesystem and additional | 171 | This tests quite a few parts of the vfat filesystem and additional |
171 | tests for new features or untested features would be appreciated. | 172 | tests for new features or untested features would be appreciated. |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 3de2f32edd90..94677e7dcb13 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -72,7 +72,7 @@ structure (this is the kernel-side implementation of file | |||
72 | descriptors). The freshly allocated file structure is initialized with | 72 | descriptors). The freshly allocated file structure is initialized with |
73 | a pointer to the dentry and a set of file operation member functions. | 73 | a pointer to the dentry and a set of file operation member functions. |
74 | These are taken from the inode data. The open() file method is then | 74 | These are taken from the inode data. The open() file method is then |
75 | called so the specific filesystem implementation can do it's work. You | 75 | called so the specific filesystem implementation can do its work. You |
76 | can see that this is another switch performed by the VFS. The file | 76 | can see that this is another switch performed by the VFS. The file |
77 | structure is placed into the file descriptor table for the process. | 77 | structure is placed into the file descriptor table for the process. |
78 | 78 | ||
@@ -401,11 +401,16 @@ otherwise noted. | |||
401 | started might not be in the page cache at the end of the | 401 | started might not be in the page cache at the end of the |
402 | walk). | 402 | walk). |
403 | 403 | ||
404 | truncate: called by the VFS to change the size of a file. The | 404 | truncate: Deprecated. This will not be called if ->setsize is defined. |
405 | Called by the VFS to change the size of a file. The | ||
405 | i_size field of the inode is set to the desired size by the | 406 | i_size field of the inode is set to the desired size by the |
406 | VFS before this method is called. This method is called by | 407 | VFS before this method is called. This method is called by |
407 | the truncate(2) system call and related functionality. | 408 | the truncate(2) system call and related functionality. |
408 | 409 | ||
410 | Note: ->truncate and vmtruncate are deprecated. Do not add new | ||
411 | instances/calls of these. Filesystems should be converted to do their | ||
412 | truncate sequence via ->setattr(). | ||
413 | |||
409 | permission: called by the VFS to check for access rights on a POSIX-like | 414 | permission: called by the VFS to check for access rights on a POSIX-like |
410 | filesystem. | 415 | filesystem. |
411 | 416 | ||
@@ -729,7 +734,7 @@ struct file_operations { | |||
729 | int (*open) (struct inode *, struct file *); | 734 | int (*open) (struct inode *, struct file *); |
730 | int (*flush) (struct file *); | 735 | int (*flush) (struct file *); |
731 | int (*release) (struct inode *, struct file *); | 736 | int (*release) (struct inode *, struct file *); |
732 | int (*fsync) (struct file *, struct dentry *, int datasync); | 737 | int (*fsync) (struct file *, int datasync); |
733 | int (*aio_fsync) (struct kiocb *, int datasync); | 738 | int (*aio_fsync) (struct kiocb *, int datasync); |
734 | int (*fasync) (int, struct file *, int); | 739 | int (*fasync) (int, struct file *, int); |
735 | int (*lock) (struct file *, int, struct file_lock *); | 740 | int (*lock) (struct file *, int, struct file_lock *); |
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt new file mode 100644 index 000000000000..96d0df28bed3 --- /dev/null +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt | |||
@@ -0,0 +1,811 @@ | |||
1 | XFS Delayed Logging Design | ||
2 | -------------------------- | ||
3 | |||
4 | Introduction to Re-logging in XFS | ||
5 | --------------------------------- | ||
6 | |||
7 | XFS logging is a combination of logical and physical logging. Some objects, | ||
8 | such as inodes and dquots, are logged in logical format where the details | ||
9 | logged are made up of the changes to in-core structures rather than on-disk | ||
10 | structures. Other objects - typically buffers - have their physical changes | ||
11 | logged. The reason for these differences is to reduce the amount of log space | ||
12 | required for objects that are frequently logged. Some parts of inodes are more | ||
13 | frequently logged than others, and inodes are typically more frequently logged | ||
14 | than any other object (except maybe the superblock buffer) so keeping the | ||
15 | amount of metadata logged low is of prime importance. | ||
16 | |||
17 | The reason that this is such a concern is that XFS allows multiple separate | ||
18 | modifications to a single object to be carried in the log at any given time. | ||
19 | This allows the log to avoid needing to flush each change to disk before | ||
20 | recording a new change to the object. XFS does this via a method called | ||
21 | "re-logging". Conceptually, this is quite simple - all it requires is that any | ||
22 | new change to the object is recorded with a *new copy* of all the existing | ||
23 | changes in the new transaction that is written to the log. | ||
24 | |||
25 | That is, if we have a sequence of changes A through to F, and the object was | ||
26 | written to disk after change D, we would see in the log the following series | ||
27 | of transactions, their contents and the log sequence number (LSN) of the | ||
28 | transaction: | ||
29 | |||
30 | Transaction Contents LSN | ||
31 | A A X | ||
32 | B A+B X+n | ||
33 | C A+B+C X+n+m | ||
34 | D A+B+C+D X+n+m+o | ||
35 | <object written to disk> | ||
36 | E E Y (> X+n+m+o) | ||
37 | F E+F Yٍ+p | ||
38 | |||
39 | In other words, each time an object is relogged, the new transaction contains | ||
40 | the aggregation of all the previous changes currently held only in the log. | ||
41 | |||
42 | This relogging technique also allows objects to be moved forward in the log so | ||
43 | that an object being relogged does not prevent the tail of the log from ever | ||
44 | moving forward. This can be seen in the table above by the changing | ||
45 | (increasing) LSN of each subsquent transaction - the LSN is effectively a | ||
46 | direct encoding of the location in the log of the transaction. | ||
47 | |||
48 | This relogging is also used to implement long-running, multiple-commit | ||
49 | transactions. These transaction are known as rolling transactions, and require | ||
50 | a special log reservation known as a permanent transaction reservation. A | ||
51 | typical example of a rolling transaction is the removal of extents from an | ||
52 | inode which can only be done at a rate of two extents per transaction because | ||
53 | of reservation size limitations. Hence a rolling extent removal transaction | ||
54 | keeps relogging the inode and btree buffers as they get modified in each | ||
55 | removal operation. This keeps them moving forward in the log as the operation | ||
56 | progresses, ensuring that current operation never gets blocked by itself if the | ||
57 | log wraps around. | ||
58 | |||
59 | Hence it can be seen that the relogging operation is fundamental to the correct | ||
60 | working of the XFS journalling subsystem. From the above description, most | ||
61 | people should be able to see why the XFS metadata operations writes so much to | ||
62 | the log - repeated operations to the same objects write the same changes to | ||
63 | the log over and over again. Worse is the fact that objects tend to get | ||
64 | dirtier as they get relogged, so each subsequent transaction is writing more | ||
65 | metadata into the log. | ||
66 | |||
67 | Another feature of the XFS transaction subsystem is that most transactions are | ||
68 | asynchronous. That is, they don't commit to disk until either a log buffer is | ||
69 | filled (a log buffer can hold multiple transactions) or a synchronous operation | ||
70 | forces the log buffers holding the transactions to disk. This means that XFS is | ||
71 | doing aggregation of transactions in memory - batching them, if you like - to | ||
72 | minimise the impact of the log IO on transaction throughput. | ||
73 | |||
74 | The limitation on asynchronous transaction throughput is the number and size of | ||
75 | log buffers made available by the log manager. By default there are 8 log | ||
76 | buffers available and the size of each is 32kB - the size can be increased up | ||
77 | to 256kB by use of a mount option. | ||
78 | |||
79 | Effectively, this gives us the maximum bound of outstanding metadata changes | ||
80 | that can be made to the filesystem at any point in time - if all the log | ||
81 | buffers are full and under IO, then no more transactions can be committed until | ||
82 | the current batch completes. It is now common for a single current CPU core to | ||
83 | be to able to issue enough transactions to keep the log buffers full and under | ||
84 | IO permanently. Hence the XFS journalling subsystem can be considered to be IO | ||
85 | bound. | ||
86 | |||
87 | Delayed Logging: Concepts | ||
88 | ------------------------- | ||
89 | |||
90 | The key thing to note about the asynchronous logging combined with the | ||
91 | relogging technique XFS uses is that we can be relogging changed objects | ||
92 | multiple times before they are committed to disk in the log buffers. If we | ||
93 | return to the previous relogging example, it is entirely possible that | ||
94 | transactions A through D are committed to disk in the same log buffer. | ||
95 | |||
96 | That is, a single log buffer may contain multiple copies of the same object, | ||
97 | but only one of those copies needs to be there - the last one "D", as it | ||
98 | contains all the changes from the previous changes. In other words, we have one | ||
99 | necessary copy in the log buffer, and three stale copies that are simply | ||
100 | wasting space. When we are doing repeated operations on the same set of | ||
101 | objects, these "stale objects" can be over 90% of the space used in the log | ||
102 | buffers. It is clear that reducing the number of stale objects written to the | ||
103 | log would greatly reduce the amount of metadata we write to the log, and this | ||
104 | is the fundamental goal of delayed logging. | ||
105 | |||
106 | From a conceptual point of view, XFS is already doing relogging in memory (where | ||
107 | memory == log buffer), only it is doing it extremely inefficiently. It is using | ||
108 | logical to physical formatting to do the relogging because there is no | ||
109 | infrastructure to keep track of logical changes in memory prior to physically | ||
110 | formatting the changes in a transaction to the log buffer. Hence we cannot avoid | ||
111 | accumulating stale objects in the log buffers. | ||
112 | |||
113 | Delayed logging is the name we've given to keeping and tracking transactional | ||
114 | changes to objects in memory outside the log buffer infrastructure. Because of | ||
115 | the relogging concept fundamental to the XFS journalling subsystem, this is | ||
116 | actually relatively easy to do - all the changes to logged items are already | ||
117 | tracked in the current infrastructure. The big problem is how to accumulate | ||
118 | them and get them to the log in a consistent, recoverable manner. | ||
119 | Describing the problems and how they have been solved is the focus of this | ||
120 | document. | ||
121 | |||
122 | One of the key changes that delayed logging makes to the operation of the | ||
123 | journalling subsystem is that it disassociates the amount of outstanding | ||
124 | metadata changes from the size and number of log buffers available. In other | ||
125 | words, instead of there only being a maximum of 2MB of transaction changes not | ||
126 | written to the log at any point in time, there may be a much greater amount | ||
127 | being accumulated in memory. Hence the potential for loss of metadata on a | ||
128 | crash is much greater than for the existing logging mechanism. | ||
129 | |||
130 | It should be noted that this does not change the guarantee that log recovery | ||
131 | will result in a consistent filesystem. What it does mean is that as far as the | ||
132 | recovered filesystem is concerned, there may be many thousands of transactions | ||
133 | that simply did not occur as a result of the crash. This makes it even more | ||
134 | important that applications that care about their data use fsync() where they | ||
135 | need to ensure application level data integrity is maintained. | ||
136 | |||
137 | It should be noted that delayed logging is not an innovative new concept that | ||
138 | warrants rigorous proofs to determine whether it is correct or not. The method | ||
139 | of accumulating changes in memory for some period before writing them to the | ||
140 | log is used effectively in many filesystems including ext3 and ext4. Hence | ||
141 | no time is spent in this document trying to convince the reader that the | ||
142 | concept is sound. Instead it is simply considered a "solved problem" and as | ||
143 | such implementing it in XFS is purely an exercise in software engineering. | ||
144 | |||
145 | The fundamental requirements for delayed logging in XFS are simple: | ||
146 | |||
147 | 1. Reduce the amount of metadata written to the log by at least | ||
148 | an order of magnitude. | ||
149 | 2. Supply sufficient statistics to validate Requirement #1. | ||
150 | 3. Supply sufficient new tracing infrastructure to be able to debug | ||
151 | problems with the new code. | ||
152 | 4. No on-disk format change (metadata or log format). | ||
153 | 5. Enable and disable with a mount option. | ||
154 | 6. No performance regressions for synchronous transaction workloads. | ||
155 | |||
156 | Delayed Logging: Design | ||
157 | ----------------------- | ||
158 | |||
159 | Storing Changes | ||
160 | |||
161 | The problem with accumulating changes at a logical level (i.e. just using the | ||
162 | existing log item dirty region tracking) is that when it comes to writing the | ||
163 | changes to the log buffers, we need to ensure that the object we are formatting | ||
164 | is not changing while we do this. This requires locking the object to prevent | ||
165 | concurrent modification. Hence flushing the logical changes to the log would | ||
166 | require us to lock every object, format them, and then unlock them again. | ||
167 | |||
168 | This introduces lots of scope for deadlocks with transactions that are already | ||
169 | running. For example, a transaction has object A locked and modified, but needs | ||
170 | the delayed logging tracking lock to commit the transaction. However, the | ||
171 | flushing thread has the delayed logging tracking lock already held, and is | ||
172 | trying to get the lock on object A to flush it to the log buffer. This appears | ||
173 | to be an unsolvable deadlock condition, and it was solving this problem that | ||
174 | was the barrier to implementing delayed logging for so long. | ||
175 | |||
176 | The solution is relatively simple - it just took a long time to recognise it. | ||
177 | Put simply, the current logging code formats the changes to each item into an | ||
178 | vector array that points to the changed regions in the item. The log write code | ||
179 | simply copies the memory these vectors point to into the log buffer during | ||
180 | transaction commit while the item is locked in the transaction. Instead of | ||
181 | using the log buffer as the destination of the formatting code, we can use an | ||
182 | allocated memory buffer big enough to fit the formatted vector. | ||
183 | |||
184 | If we then copy the vector into the memory buffer and rewrite the vector to | ||
185 | point to the memory buffer rather than the object itself, we now have a copy of | ||
186 | the changes in a format that is compatible with the log buffer writing code. | ||
187 | that does not require us to lock the item to access. This formatting and | ||
188 | rewriting can all be done while the object is locked during transaction commit, | ||
189 | resulting in a vector that is transactionally consistent and can be accessed | ||
190 | without needing to lock the owning item. | ||
191 | |||
192 | Hence we avoid the need to lock items when we need to flush outstanding | ||
193 | asynchronous transactions to the log. The differences between the existing | ||
194 | formatting method and the delayed logging formatting can be seen in the | ||
195 | diagram below. | ||
196 | |||
197 | Current format log vector: | ||
198 | |||
199 | Object +---------------------------------------------+ | ||
200 | Vector 1 +----+ | ||
201 | Vector 2 +----+ | ||
202 | Vector 3 +----------+ | ||
203 | |||
204 | After formatting: | ||
205 | |||
206 | Log Buffer +-V1-+-V2-+----V3----+ | ||
207 | |||
208 | Delayed logging vector: | ||
209 | |||
210 | Object +---------------------------------------------+ | ||
211 | Vector 1 +----+ | ||
212 | Vector 2 +----+ | ||
213 | Vector 3 +----------+ | ||
214 | |||
215 | After formatting: | ||
216 | |||
217 | Memory Buffer +-V1-+-V2-+----V3----+ | ||
218 | Vector 1 +----+ | ||
219 | Vector 2 +----+ | ||
220 | Vector 3 +----------+ | ||
221 | |||
222 | The memory buffer and associated vector need to be passed as a single object, | ||
223 | but still need to be associated with the parent object so if the object is | ||
224 | relogged we can replace the current memory buffer with a new memory buffer that | ||
225 | contains the latest changes. | ||
226 | |||
227 | The reason for keeping the vector around after we've formatted the memory | ||
228 | buffer is to support splitting vectors across log buffer boundaries correctly. | ||
229 | If we don't keep the vector around, we do not know where the region boundaries | ||
230 | are in the item, so we'd need a new encapsulation method for regions in the log | ||
231 | buffer writing (i.e. double encapsulation). This would be an on-disk format | ||
232 | change and as such is not desirable. It also means we'd have to write the log | ||
233 | region headers in the formatting stage, which is problematic as there is per | ||
234 | region state that needs to be placed into the headers during the log write. | ||
235 | |||
236 | Hence we need to keep the vector, but by attaching the memory buffer to it and | ||
237 | rewriting the vector addresses to point at the memory buffer we end up with a | ||
238 | self-describing object that can be passed to the log buffer write code to be | ||
239 | handled in exactly the same manner as the existing log vectors are handled. | ||
240 | Hence we avoid needing a new on-disk format to handle items that have been | ||
241 | relogged in memory. | ||
242 | |||
243 | |||
244 | Tracking Changes | ||
245 | |||
246 | Now that we can record transactional changes in memory in a form that allows | ||
247 | them to be used without limitations, we need to be able to track and accumulate | ||
248 | them so that they can be written to the log at some later point in time. The | ||
249 | log item is the natural place to store this vector and buffer, and also makes sense | ||
250 | to be the object that is used to track committed objects as it will always | ||
251 | exist once the object has been included in a transaction. | ||
252 | |||
253 | The log item is already used to track the log items that have been written to | ||
254 | the log but not yet written to disk. Such log items are considered "active" | ||
255 | and as such are stored in the Active Item List (AIL) which is a LSN-ordered | ||
256 | double linked list. Items are inserted into this list during log buffer IO | ||
257 | completion, after which they are unpinned and can be written to disk. An object | ||
258 | that is in the AIL can be relogged, which causes the object to be pinned again | ||
259 | and then moved forward in the AIL when the log buffer IO completes for that | ||
260 | transaction. | ||
261 | |||
262 | Essentially, this shows that an item that is in the AIL can still be modified | ||
263 | and relogged, so any tracking must be separate to the AIL infrastructure. As | ||
264 | such, we cannot reuse the AIL list pointers for tracking committed items, nor | ||
265 | can we store state in any field that is protected by the AIL lock. Hence the | ||
266 | committed item tracking needs it's own locks, lists and state fields in the log | ||
267 | item. | ||
268 | |||
269 | Similar to the AIL, tracking of committed items is done through a new list | ||
270 | called the Committed Item List (CIL). The list tracks log items that have been | ||
271 | committed and have formatted memory buffers attached to them. It tracks objects | ||
272 | in transaction commit order, so when an object is relogged it is removed from | ||
273 | it's place in the list and re-inserted at the tail. This is entirely arbitrary | ||
274 | and done to make it easy for debugging - the last items in the list are the | ||
275 | ones that are most recently modified. Ordering of the CIL is not necessary for | ||
276 | transactional integrity (as discussed in the next section) so the ordering is | ||
277 | done for convenience/sanity of the developers. | ||
278 | |||
279 | |||
280 | Delayed Logging: Checkpoints | ||
281 | |||
282 | When we have a log synchronisation event, commonly known as a "log force", | ||
283 | all the items in the CIL must be written into the log via the log buffers. | ||
284 | We need to write these items in the order that they exist in the CIL, and they | ||
285 | need to be written as an atomic transaction. The need for all the objects to be | ||
286 | written as an atomic transaction comes from the requirements of relogging and | ||
287 | log replay - all the changes in all the objects in a given transaction must | ||
288 | either be completely replayed during log recovery, or not replayed at all. If | ||
289 | a transaction is not replayed because it is not complete in the log, then | ||
290 | no later transactions should be replayed, either. | ||
291 | |||
292 | To fulfill this requirement, we need to write the entire CIL in a single log | ||
293 | transaction. Fortunately, the XFS log code has no fixed limit on the size of a | ||
294 | transaction, nor does the log replay code. The only fundamental limit is that | ||
295 | the transaction cannot be larger than just under half the size of the log. The | ||
296 | reason for this limit is that to find the head and tail of the log, there must | ||
297 | be at least one complete transaction in the log at any given time. If a | ||
298 | transaction is larger than half the log, then there is the possibility that a | ||
299 | crash during the write of a such a transaction could partially overwrite the | ||
300 | only complete previous transaction in the log. This will result in a recovery | ||
301 | failure and an inconsistent filesystem and hence we must enforce the maximum | ||
302 | size of a checkpoint to be slightly less than a half the log. | ||
303 | |||
304 | Apart from this size requirement, a checkpoint transaction looks no different | ||
305 | to any other transaction - it contains a transaction header, a series of | ||
306 | formatted log items and a commit record at the tail. From a recovery | ||
307 | perspective, the checkpoint transaction is also no different - just a lot | ||
308 | bigger with a lot more items in it. The worst case effect of this is that we | ||
309 | might need to tune the recovery transaction object hash size. | ||
310 | |||
311 | Because the checkpoint is just another transaction and all the changes to log | ||
312 | items are stored as log vectors, we can use the existing log buffer writing | ||
313 | code to write the changes into the log. To do this efficiently, we need to | ||
314 | minimise the time we hold the CIL locked while writing the checkpoint | ||
315 | transaction. The current log write code enables us to do this easily with the | ||
316 | way it separates the writing of the transaction contents (the log vectors) from | ||
317 | the transaction commit record, but tracking this requires us to have a | ||
318 | per-checkpoint context that travels through the log write process through to | ||
319 | checkpoint completion. | ||
320 | |||
321 | Hence a checkpoint has a context that tracks the state of the current | ||
322 | checkpoint from initiation to checkpoint completion. A new context is initiated | ||
323 | at the same time a checkpoint transaction is started. That is, when we remove | ||
324 | all the current items from the CIL during a checkpoint operation, we move all | ||
325 | those changes into the current checkpoint context. We then initialise a new | ||
326 | context and attach that to the CIL for aggregation of new transactions. | ||
327 | |||
328 | This allows us to unlock the CIL immediately after transfer of all the | ||
329 | committed items and effectively allow new transactions to be issued while we | ||
330 | are formatting the checkpoint into the log. It also allows concurrent | ||
331 | checkpoints to be written into the log buffers in the case of log force heavy | ||
332 | workloads, just like the existing transaction commit code does. This, however, | ||
333 | requires that we strictly order the commit records in the log so that | ||
334 | checkpoint sequence order is maintained during log replay. | ||
335 | |||
336 | To ensure that we can be writing an item into a checkpoint transaction at | ||
337 | the same time another transaction modifies the item and inserts the log item | ||
338 | into the new CIL, then checkpoint transaction commit code cannot use log items | ||
339 | to store the list of log vectors that need to be written into the transaction. | ||
340 | Hence log vectors need to be able to be chained together to allow them to be | ||
341 | detatched from the log items. That is, when the CIL is flushed the memory | ||
342 | buffer and log vector attached to each log item needs to be attached to the | ||
343 | checkpoint context so that the log item can be released. In diagrammatic form, | ||
344 | the CIL would look like this before the flush: | ||
345 | |||
346 | CIL Head | ||
347 | | | ||
348 | V | ||
349 | Log Item <-> log vector 1 -> memory buffer | ||
350 | | -> vector array | ||
351 | V | ||
352 | Log Item <-> log vector 2 -> memory buffer | ||
353 | | -> vector array | ||
354 | V | ||
355 | ...... | ||
356 | | | ||
357 | V | ||
358 | Log Item <-> log vector N-1 -> memory buffer | ||
359 | | -> vector array | ||
360 | V | ||
361 | Log Item <-> log vector N -> memory buffer | ||
362 | -> vector array | ||
363 | |||
364 | And after the flush the CIL head is empty, and the checkpoint context log | ||
365 | vector list would look like: | ||
366 | |||
367 | Checkpoint Context | ||
368 | | | ||
369 | V | ||
370 | log vector 1 -> memory buffer | ||
371 | | -> vector array | ||
372 | | -> Log Item | ||
373 | V | ||
374 | log vector 2 -> memory buffer | ||
375 | | -> vector array | ||
376 | | -> Log Item | ||
377 | V | ||
378 | ...... | ||
379 | | | ||
380 | V | ||
381 | log vector N-1 -> memory buffer | ||
382 | | -> vector array | ||
383 | | -> Log Item | ||
384 | V | ||
385 | log vector N -> memory buffer | ||
386 | -> vector array | ||
387 | -> Log Item | ||
388 | |||
389 | Once this transfer is done, the CIL can be unlocked and new transactions can | ||
390 | start, while the checkpoint flush code works over the log vector chain to | ||
391 | commit the checkpoint. | ||
392 | |||
393 | Once the checkpoint is written into the log buffers, the checkpoint context is | ||
394 | attached to the log buffer that the commit record was written to along with a | ||
395 | completion callback. Log IO completion will call that callback, which can then | ||
396 | run transaction committed processing for the log items (i.e. insert into AIL | ||
397 | and unpin) in the log vector chain and then free the log vector chain and | ||
398 | checkpoint context. | ||
399 | |||
400 | Discussion Point: I am uncertain as to whether the log item is the most | ||
401 | efficient way to track vectors, even though it seems like the natural way to do | ||
402 | it. The fact that we walk the log items (in the CIL) just to chain the log | ||
403 | vectors and break the link between the log item and the log vector means that | ||
404 | we take a cache line hit for the log item list modification, then another for | ||
405 | the log vector chaining. If we track by the log vectors, then we only need to | ||
406 | break the link between the log item and the log vector, which means we should | ||
407 | dirty only the log item cachelines. Normally I wouldn't be concerned about one | ||
408 | vs two dirty cachelines except for the fact I've seen upwards of 80,000 log | ||
409 | vectors in one checkpoint transaction. I'd guess this is a "measure and | ||
410 | compare" situation that can be done after a working and reviewed implementation | ||
411 | is in the dev tree.... | ||
412 | |||
413 | Delayed Logging: Checkpoint Sequencing | ||
414 | |||
415 | One of the key aspects of the XFS transaction subsystem is that it tags | ||
416 | committed transactions with the log sequence number of the transaction commit. | ||
417 | This allows transactions to be issued asynchronously even though there may be | ||
418 | future operations that cannot be completed until that transaction is fully | ||
419 | committed to the log. In the rare case that a dependent operation occurs (e.g. | ||
420 | re-using a freed metadata extent for a data extent), a special, optimised log | ||
421 | force can be issued to force the dependent transaction to disk immediately. | ||
422 | |||
423 | To do this, transactions need to record the LSN of the commit record of the | ||
424 | transaction. This LSN comes directly from the log buffer the transaction is | ||
425 | written into. While this works just fine for the existing transaction | ||
426 | mechanism, it does not work for delayed logging because transactions are not | ||
427 | written directly into the log buffers. Hence some other method of sequencing | ||
428 | transactions is required. | ||
429 | |||
430 | As discussed in the checkpoint section, delayed logging uses per-checkpoint | ||
431 | contexts, and as such it is simple to assign a sequence number to each | ||
432 | checkpoint. Because the switching of checkpoint contexts must be done | ||
433 | atomically, it is simple to ensure that each new context has a monotonically | ||
434 | increasing sequence number assigned to it without the need for an external | ||
435 | atomic counter - we can just take the current context sequence number and add | ||
436 | one to it for the new context. | ||
437 | |||
438 | Then, instead of assigning a log buffer LSN to the transaction commit LSN | ||
439 | during the commit, we can assign the current checkpoint sequence. This allows | ||
440 | operations that track transactions that have not yet completed know what | ||
441 | checkpoint sequence needs to be committed before they can continue. As a | ||
442 | result, the code that forces the log to a specific LSN now needs to ensure that | ||
443 | the log forces to a specific checkpoint. | ||
444 | |||
445 | To ensure that we can do this, we need to track all the checkpoint contexts | ||
446 | that are currently committing to the log. When we flush a checkpoint, the | ||
447 | context gets added to a "committing" list which can be searched. When a | ||
448 | checkpoint commit completes, it is removed from the committing list. Because | ||
449 | the checkpoint context records the LSN of the commit record for the checkpoint, | ||
450 | we can also wait on the log buffer that contains the commit record, thereby | ||
451 | using the existing log force mechanisms to execute synchronous forces. | ||
452 | |||
453 | It should be noted that the synchronous forces may need to be extended with | ||
454 | mitigation algorithms similar to the current log buffer code to allow | ||
455 | aggregation of multiple synchronous transactions if there are already | ||
456 | synchronous transactions being flushed. Investigation of the performance of the | ||
457 | current design is needed before making any decisions here. | ||
458 | |||
459 | The main concern with log forces is to ensure that all the previous checkpoints | ||
460 | are also committed to disk before the one we need to wait for. Therefore we | ||
461 | need to check that all the prior contexts in the committing list are also | ||
462 | complete before waiting on the one we need to complete. We do this | ||
463 | synchronisation in the log force code so that we don't need to wait anywhere | ||
464 | else for such serialisation - it only matters when we do a log force. | ||
465 | |||
466 | The only remaining complexity is that a log force now also has to handle the | ||
467 | case where the forcing sequence number is the same as the current context. That | ||
468 | is, we need to flush the CIL and potentially wait for it to complete. This is a | ||
469 | simple addition to the existing log forcing code to check the sequence numbers | ||
470 | and push if required. Indeed, placing the current sequence checkpoint flush in | ||
471 | the log force code enables the current mechanism for issuing synchronous | ||
472 | transactions to remain untouched (i.e. commit an asynchronous transaction, then | ||
473 | force the log at the LSN of that transaction) and so the higher level code | ||
474 | behaves the same regardless of whether delayed logging is being used or not. | ||
475 | |||
476 | Delayed Logging: Checkpoint Log Space Accounting | ||
477 | |||
478 | The big issue for a checkpoint transaction is the log space reservation for the | ||
479 | transaction. We don't know how big a checkpoint transaction is going to be | ||
480 | ahead of time, nor how many log buffers it will take to write out, nor the | ||
481 | number of split log vector regions are going to be used. We can track the | ||
482 | amount of log space required as we add items to the commit item list, but we | ||
483 | still need to reserve the space in the log for the checkpoint. | ||
484 | |||
485 | A typical transaction reserves enough space in the log for the worst case space | ||
486 | usage of the transaction. The reservation accounts for log record headers, | ||
487 | transaction and region headers, headers for split regions, buffer tail padding, | ||
488 | etc. as well as the actual space for all the changed metadata in the | ||
489 | transaction. While some of this is fixed overhead, much of it is dependent on | ||
490 | the size of the transaction and the number of regions being logged (the number | ||
491 | of log vectors in the transaction). | ||
492 | |||
493 | An example of the differences would be logging directory changes versus logging | ||
494 | inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then | ||
495 | there are lots of transactions that only contain an inode core and an inode log | ||
496 | format structure. That is, two vectors totaling roughly 150 bytes. If we modify | ||
497 | 10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each | ||
498 | vector is 12 bytes, so the total to be logged is approximately 1.75MB. In | ||
499 | comparison, if we are logging full directory buffers, they are typically 4KB | ||
500 | each, so we in 1.5MB of directory buffers we'd have roughly 400 buffers and a | ||
501 | buffer format structure for each buffer - roughly 800 vectors or 1.51MB total | ||
502 | space. From this, it should be obvious that a static log space reservation is | ||
503 | not particularly flexible and is difficult to select the "optimal value" for | ||
504 | all workloads. | ||
505 | |||
506 | Further, if we are going to use a static reservation, which bit of the entire | ||
507 | reservation does it cover? We account for space used by the transaction | ||
508 | reservation by tracking the space currently used by the object in the CIL and | ||
509 | then calculating the increase or decrease in space used as the object is | ||
510 | relogged. This allows for a checkpoint reservation to only have to account for | ||
511 | log buffer metadata used such as log header records. | ||
512 | |||
513 | However, even using a static reservation for just the log metadata is | ||
514 | problematic. Typically log record headers use at least 16KB of log space per | ||
515 | 1MB of log space consumed (512 bytes per 32k) and the reservation needs to be | ||
516 | large enough to handle arbitrary sized checkpoint transactions. This | ||
517 | reservation needs to be made before the checkpoint is started, and we need to | ||
518 | be able to reserve the space without sleeping. For a 8MB checkpoint, we need a | ||
519 | reservation of around 150KB, which is a non-trivial amount of space. | ||
520 | |||
521 | A static reservation needs to manipulate the log grant counters - we can take a | ||
522 | permanent reservation on the space, but we still need to make sure we refresh | ||
523 | the write reservation (the actual space available to the transaction) after | ||
524 | every checkpoint transaction completion. Unfortunately, if this space is not | ||
525 | available when required, then the regrant code will sleep waiting for it. | ||
526 | |||
527 | The problem with this is that it can lead to deadlocks as we may need to commit | ||
528 | checkpoints to be able to free up log space (refer back to the description of | ||
529 | rolling transactions for an example of this). Hence we *must* always have | ||
530 | space available in the log if we are to use static reservations, and that is | ||
531 | very difficult and complex to arrange. It is possible to do, but there is a | ||
532 | simpler way. | ||
533 | |||
534 | The simpler way of doing this is tracking the entire log space used by the | ||
535 | items in the CIL and using this to dynamically calculate the amount of log | ||
536 | space required by the log metadata. If this log metadata space changes as a | ||
537 | result of a transaction commit inserting a new memory buffer into the CIL, then | ||
538 | the difference in space required is removed from the transaction that causes | ||
539 | the change. Transactions at this level will *always* have enough space | ||
540 | available in their reservation for this as they have already reserved the | ||
541 | maximal amount of log metadata space they require, and such a delta reservation | ||
542 | will always be less than or equal to the maximal amount in the reservation. | ||
543 | |||
544 | Hence we can grow the checkpoint transaction reservation dynamically as items | ||
545 | are added to the CIL and avoid the need for reserving and regranting log space | ||
546 | up front. This avoids deadlocks and removes a blocking point from the | ||
547 | checkpoint flush code. | ||
548 | |||
549 | As mentioned early, transactions can't grow to more than half the size of the | ||
550 | log. Hence as part of the reservation growing, we need to also check the size | ||
551 | of the reservation against the maximum allowed transaction size. If we reach | ||
552 | the maximum threshold, we need to push the CIL to the log. This is effectively | ||
553 | a "background flush" and is done on demand. This is identical to | ||
554 | a CIL push triggered by a log force, only that there is no waiting for the | ||
555 | checkpoint commit to complete. This background push is checked and executed by | ||
556 | transaction commit code. | ||
557 | |||
558 | If the transaction subsystem goes idle while we still have items in the CIL, | ||
559 | they will be flushed by the periodic log force issued by the xfssyncd. This log | ||
560 | force will push the CIL to disk, and if the transaction subsystem stays idle, | ||
561 | allow the idle log to be covered (effectively marked clean) in exactly the same | ||
562 | manner that is done for the existing logging method. A discussion point is | ||
563 | whether this log force needs to be done more frequently than the current rate | ||
564 | which is once every 30s. | ||
565 | |||
566 | |||
567 | Delayed Logging: Log Item Pinning | ||
568 | |||
569 | Currently log items are pinned during transaction commit while the items are | ||
570 | still locked. This happens just after the items are formatted, though it could | ||
571 | be done any time before the items are unlocked. The result of this mechanism is | ||
572 | that items get pinned once for every transaction that is committed to the log | ||
573 | buffers. Hence items that are relogged in the log buffers will have a pin count | ||
574 | for every outstanding transaction they were dirtied in. When each of these | ||
575 | transactions is completed, they will unpin the item once. As a result, the item | ||
576 | only becomes unpinned when all the transactions complete and there are no | ||
577 | pending transactions. Thus the pinning and unpinning of a log item is symmetric | ||
578 | as there is a 1:1 relationship with transaction commit and log item completion. | ||
579 | |||
580 | For delayed logging, however, we have an assymetric transaction commit to | ||
581 | completion relationship. Every time an object is relogged in the CIL it goes | ||
582 | through the commit process without a corresponding completion being registered. | ||
583 | That is, we now have a many-to-one relationship between transaction commit and | ||
584 | log item completion. The result of this is that pinning and unpinning of the | ||
585 | log items becomes unbalanced if we retain the "pin on transaction commit, unpin | ||
586 | on transaction completion" model. | ||
587 | |||
588 | To keep pin/unpin symmetry, the algorithm needs to change to a "pin on | ||
589 | insertion into the CIL, unpin on checkpoint completion". In other words, the | ||
590 | pinning and unpinning becomes symmetric around a checkpoint context. We have to | ||
591 | pin the object the first time it is inserted into the CIL - if it is already in | ||
592 | the CIL during a transaction commit, then we do not pin it again. Because there | ||
593 | can be multiple outstanding checkpoint contexts, we can still see elevated pin | ||
594 | counts, but as each checkpoint completes the pin count will retain the correct | ||
595 | value according to it's context. | ||
596 | |||
597 | Just to make matters more slightly more complex, this checkpoint level context | ||
598 | for the pin count means that the pinning of an item must take place under the | ||
599 | CIL commit/flush lock. If we pin the object outside this lock, we cannot | ||
600 | guarantee which context the pin count is associated with. This is because of | ||
601 | the fact pinning the item is dependent on whether the item is present in the | ||
602 | current CIL or not. If we don't pin the CIL first before we check and pin the | ||
603 | object, we have a race with CIL being flushed between the check and the pin | ||
604 | (or not pinning, as the case may be). Hence we must hold the CIL flush/commit | ||
605 | lock to guarantee that we pin the items correctly. | ||
606 | |||
607 | Delayed Logging: Concurrent Scalability | ||
608 | |||
609 | A fundamental requirement for the CIL is that accesses through transaction | ||
610 | commits must scale to many concurrent commits. The current transaction commit | ||
611 | code does not break down even when there are transactions coming from 2048 | ||
612 | processors at once. The current transaction code does not go any faster than if | ||
613 | there was only one CPU using it, but it does not slow down either. | ||
614 | |||
615 | As a result, the delayed logging transaction commit code needs to be designed | ||
616 | for concurrency from the ground up. It is obvious that there are serialisation | ||
617 | points in the design - the three important ones are: | ||
618 | |||
619 | 1. Locking out new transaction commits while flushing the CIL | ||
620 | 2. Adding items to the CIL and updating item space accounting | ||
621 | 3. Checkpoint commit ordering | ||
622 | |||
623 | Looking at the transaction commit and CIL flushing interactions, it is clear | ||
624 | that we have a many-to-one interaction here. That is, the only restriction on | ||
625 | the number of concurrent transactions that can be trying to commit at once is | ||
626 | the amount of space available in the log for their reservations. The practical | ||
627 | limit here is in the order of several hundred concurrent transactions for a | ||
628 | 128MB log, which means that it is generally one per CPU in a machine. | ||
629 | |||
630 | The amount of time a transaction commit needs to hold out a flush is a | ||
631 | relatively long period of time - the pinning of log items needs to be done | ||
632 | while we are holding out a CIL flush, so at the moment that means it is held | ||
633 | across the formatting of the objects into memory buffers (i.e. while memcpy()s | ||
634 | are in progress). Ultimately a two pass algorithm where the formatting is done | ||
635 | separately to the pinning of objects could be used to reduce the hold time of | ||
636 | the transaction commit side. | ||
637 | |||
638 | Because of the number of potential transaction commit side holders, the lock | ||
639 | really needs to be a sleeping lock - if the CIL flush takes the lock, we do not | ||
640 | want every other CPU in the machine spinning on the CIL lock. Given that | ||
641 | flushing the CIL could involve walking a list of tens of thousands of log | ||
642 | items, it will get held for a significant time and so spin contention is a | ||
643 | significant concern. Preventing lots of CPUs spinning doing nothing is the | ||
644 | main reason for choosing a sleeping lock even though nothing in either the | ||
645 | transaction commit or CIL flush side sleeps with the lock held. | ||
646 | |||
647 | It should also be noted that CIL flushing is also a relatively rare operation | ||
648 | compared to transaction commit for asynchronous transaction workloads - only | ||
649 | time will tell if using a read-write semaphore for exclusion will limit | ||
650 | transaction commit concurrency due to cache line bouncing of the lock on the | ||
651 | read side. | ||
652 | |||
653 | The second serialisation point is on the transaction commit side where items | ||
654 | are inserted into the CIL. Because transactions can enter this code | ||
655 | concurrently, the CIL needs to be protected separately from the above | ||
656 | commit/flush exclusion. It also needs to be an exclusive lock but it is only | ||
657 | held for a very short time and so a spin lock is appropriate here. It is | ||
658 | possible that this lock will become a contention point, but given the short | ||
659 | hold time once per transaction I think that contention is unlikely. | ||
660 | |||
661 | The final serialisation point is the checkpoint commit record ordering code | ||
662 | that is run as part of the checkpoint commit and log force sequencing. The code | ||
663 | path that triggers a CIL flush (i.e. whatever triggers the log force) will enter | ||
664 | an ordering loop after writing all the log vectors into the log buffers but | ||
665 | before writing the commit record. This loop walks the list of committing | ||
666 | checkpoints and needs to block waiting for checkpoints to complete their commit | ||
667 | record write. As a result it needs a lock and a wait variable. Log force | ||
668 | sequencing also requires the same lock, list walk, and blocking mechanism to | ||
669 | ensure completion of checkpoints. | ||
670 | |||
671 | These two sequencing operations can use the mechanism even though the | ||
672 | events they are waiting for are different. The checkpoint commit record | ||
673 | sequencing needs to wait until checkpoint contexts contain a commit LSN | ||
674 | (obtained through completion of a commit record write) while log force | ||
675 | sequencing needs to wait until previous checkpoint contexts are removed from | ||
676 | the committing list (i.e. they've completed). A simple wait variable and | ||
677 | broadcast wakeups (thundering herds) has been used to implement these two | ||
678 | serialisation queues. They use the same lock as the CIL, too. If we see too | ||
679 | much contention on the CIL lock, or too many context switches as a result of | ||
680 | the broadcast wakeups these operations can be put under a new spinlock and | ||
681 | given separate wait lists to reduce lock contention and the number of processes | ||
682 | woken by the wrong event. | ||
683 | |||
684 | |||
685 | Lifecycle Changes | ||
686 | |||
687 | The existing log item life cycle is as follows: | ||
688 | |||
689 | 1. Transaction allocate | ||
690 | 2. Transaction reserve | ||
691 | 3. Lock item | ||
692 | 4. Join item to transaction | ||
693 | If not already attached, | ||
694 | Allocate log item | ||
695 | Attach log item to owner item | ||
696 | Attach log item to transaction | ||
697 | 5. Modify item | ||
698 | Record modifications in log item | ||
699 | 6. Transaction commit | ||
700 | Pin item in memory | ||
701 | Format item into log buffer | ||
702 | Write commit LSN into transaction | ||
703 | Unlock item | ||
704 | Attach transaction to log buffer | ||
705 | |||
706 | <log buffer IO dispatched> | ||
707 | <log buffer IO completes> | ||
708 | |||
709 | 7. Transaction completion | ||
710 | Mark log item committed | ||
711 | Insert log item into AIL | ||
712 | Write commit LSN into log item | ||
713 | Unpin log item | ||
714 | 8. AIL traversal | ||
715 | Lock item | ||
716 | Mark log item clean | ||
717 | Flush item to disk | ||
718 | |||
719 | <item IO completion> | ||
720 | |||
721 | 9. Log item removed from AIL | ||
722 | Moves log tail | ||
723 | Item unlocked | ||
724 | |||
725 | Essentially, steps 1-6 operate independently from step 7, which is also | ||
726 | independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9 | ||
727 | at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur | ||
728 | at the same time. If the log item is in the AIL or between steps 6 and 7 | ||
729 | and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9 | ||
730 | are entered and completed is the object considered clean. | ||
731 | |||
732 | With delayed logging, there are new steps inserted into the life cycle: | ||
733 | |||
734 | 1. Transaction allocate | ||
735 | 2. Transaction reserve | ||
736 | 3. Lock item | ||
737 | 4. Join item to transaction | ||
738 | If not already attached, | ||
739 | Allocate log item | ||
740 | Attach log item to owner item | ||
741 | Attach log item to transaction | ||
742 | 5. Modify item | ||
743 | Record modifications in log item | ||
744 | 6. Transaction commit | ||
745 | Pin item in memory if not pinned in CIL | ||
746 | Format item into log vector + buffer | ||
747 | Attach log vector and buffer to log item | ||
748 | Insert log item into CIL | ||
749 | Write CIL context sequence into transaction | ||
750 | Unlock item | ||
751 | |||
752 | <next log force> | ||
753 | |||
754 | 7. CIL push | ||
755 | lock CIL flush | ||
756 | Chain log vectors and buffers together | ||
757 | Remove items from CIL | ||
758 | unlock CIL flush | ||
759 | write log vectors into log | ||
760 | sequence commit records | ||
761 | attach checkpoint context to log buffer | ||
762 | |||
763 | <log buffer IO dispatched> | ||
764 | <log buffer IO completes> | ||
765 | |||
766 | 8. Checkpoint completion | ||
767 | Mark log item committed | ||
768 | Insert item into AIL | ||
769 | Write commit LSN into log item | ||
770 | Unpin log item | ||
771 | 9. AIL traversal | ||
772 | Lock item | ||
773 | Mark log item clean | ||
774 | Flush item to disk | ||
775 | <item IO completion> | ||
776 | 10. Log item removed from AIL | ||
777 | Moves log tail | ||
778 | Item unlocked | ||
779 | |||
780 | From this, it can be seen that the only life cycle differences between the two | ||
781 | logging methods are in the middle of the life cycle - they still have the same | ||
782 | beginning and end and execution constraints. The only differences are in the | ||
783 | commiting of the log items to the log itself and the completion processing. | ||
784 | Hence delayed logging should not introduce any constraints on log item | ||
785 | behaviour, allocation or freeing that don't already exist. | ||
786 | |||
787 | As a result of this zero-impact "insertion" of delayed logging infrastructure | ||
788 | and the design of the internal structures to avoid on disk format changes, we | ||
789 | can basically switch between delayed logging and the existing mechanism with a | ||
790 | mount option. Fundamentally, there is no reason why the log manager would not | ||
791 | be able to swap methods automatically and transparently depending on load | ||
792 | characteristics, but this should not be necessary if delayed logging works as | ||
793 | designed. | ||
794 | |||
795 | Roadmap: | ||
796 | |||
797 | 2.6.37 Remove experimental tag from mount option | ||
798 | => should be roughly 6 months after initial merge | ||
799 | => enough time to: | ||
800 | => gain confidence and fix problems reported by early | ||
801 | adopters (a.k.a. guinea pigs) | ||
802 | => address worst performance regressions and undesired | ||
803 | behaviours | ||
804 | => start tuning/optimising code for parallelism | ||
805 | => start tuning/optimising algorithms consuming | ||
806 | excessive CPU time | ||
807 | |||
808 | 2.6.39 Switch default mount option to use delayed logging | ||
809 | => should be roughly 12 months after initial merge | ||
810 | => enough time to shake out remaining problems before next round of | ||
811 | enterprise distro kernel rebases | ||
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 9878f50d6ed6..7bff3e4f35df 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt | |||
@@ -131,17 +131,6 @@ When mounting an XFS filesystem, the following options are accepted. | |||
131 | Don't check for double mounted file systems using the file system uuid. | 131 | Don't check for double mounted file systems using the file system uuid. |
132 | This is useful to mount LVM snapshot volumes. | 132 | This is useful to mount LVM snapshot volumes. |
133 | 133 | ||
134 | osyncisosync | ||
135 | Make O_SYNC writes implement true O_SYNC. WITHOUT this option, | ||
136 | Linux XFS behaves as if an "osyncisdsync" option is used, | ||
137 | which will make writes to files opened with the O_SYNC flag set | ||
138 | behave as if the O_DSYNC flag had been used instead. | ||
139 | This can result in better performance without compromising | ||
140 | data safety. | ||
141 | However if this option is not in effect, timestamp updates from | ||
142 | O_SYNC writes can be lost if the system crashes. | ||
143 | If timestamp updates are critical, use the osyncisosync option. | ||
144 | |||
145 | uquota/usrquota/uqnoenforce/quota | 134 | uquota/usrquota/uqnoenforce/quota |
146 | User disk quota accounting enabled, and limits (optionally) | 135 | User disk quota accounting enabled, and limits (optionally) |
147 | enforced. Refer to xfs_quota(8) for further details. | 136 | enforced. Refer to xfs_quota(8) for further details. |