diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 15 | ||||
-rw-r--r-- | Documentation/filesystems/autofs4-mount-control.txt | 393 | ||||
-rw-r--r-- | Documentation/filesystems/ext3.txt | 3 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 51 | ||||
-rw-r--r-- | Documentation/filesystems/fiemap.txt | 228 | ||||
-rw-r--r-- | Documentation/filesystems/nfsroot.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ocfs2.txt | 6 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 88 | ||||
-rw-r--r-- | Documentation/filesystems/ramfs-rootfs-initramfs.txt | 2 |
9 files changed, 711 insertions, 77 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 680fb566b928..8362860e21a7 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -144,8 +144,8 @@ prototypes: | |||
144 | void (*kill_sb) (struct super_block *); | 144 | void (*kill_sb) (struct super_block *); |
145 | locking rules: | 145 | locking rules: |
146 | may block BKL | 146 | may block BKL |
147 | get_sb yes yes | 147 | get_sb yes no |
148 | kill_sb yes yes | 148 | kill_sb yes no |
149 | 149 | ||
150 | ->get_sb() returns error or 0 with locked superblock attached to the vfsmount | 150 | ->get_sb() returns error or 0 with locked superblock attached to the vfsmount |
151 | (exclusive on ->s_umount). | 151 | (exclusive on ->s_umount). |
@@ -409,12 +409,12 @@ ioctl: yes (see below) | |||
409 | unlocked_ioctl: no (see below) | 409 | unlocked_ioctl: no (see below) |
410 | compat_ioctl: no | 410 | compat_ioctl: no |
411 | mmap: no | 411 | mmap: no |
412 | open: maybe (see below) | 412 | open: no |
413 | flush: no | 413 | flush: no |
414 | release: no | 414 | release: no |
415 | fsync: no (see below) | 415 | fsync: no (see below) |
416 | aio_fsync: no | 416 | aio_fsync: no |
417 | fasync: yes (see below) | 417 | fasync: no |
418 | lock: yes | 418 | lock: yes |
419 | readv: no | 419 | readv: no |
420 | writev: no | 420 | writev: no |
@@ -431,13 +431,6 @@ For many filesystems, it is probably safe to acquire the inode | |||
431 | semaphore. Note some filesystems (i.e. remote ones) provide no | 431 | semaphore. Note some filesystems (i.e. remote ones) provide no |
432 | protection for i_size so you will need to use the BKL. | 432 | protection for i_size so you will need to use the BKL. |
433 | 433 | ||
434 | ->open() locking is in-transit: big lock partially moved into the methods. | ||
435 | The only exception is ->open() in the instances of file_operations that never | ||
436 | end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices | ||
437 | (chrdev_open() takes lock before replacing ->f_op and calling the secondary | ||
438 | method. As soon as we fix the handling of module reference counters all | ||
439 | instances of ->open() will be called without the BKL. | ||
440 | |||
441 | Note: ext2_release() was *the* source of contention on fs-intensive | 434 | Note: ext2_release() was *the* source of contention on fs-intensive |
442 | loads and dropping BKL on ->release() helps to get rid of that (we still | 435 | loads and dropping BKL on ->release() helps to get rid of that (we still |
443 | grab BKL for cases when we close a file that had been opened r/w, but that | 436 | grab BKL for cases when we close a file that had been opened r/w, but that |
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt new file mode 100644 index 000000000000..c6341745df37 --- /dev/null +++ b/Documentation/filesystems/autofs4-mount-control.txt | |||
@@ -0,0 +1,393 @@ | |||
1 | |||
2 | Miscellaneous Device control operations for the autofs4 kernel module | ||
3 | ==================================================================== | ||
4 | |||
5 | The problem | ||
6 | =========== | ||
7 | |||
8 | There is a problem with active restarts in autofs (that is to say | ||
9 | restarting autofs when there are busy mounts). | ||
10 | |||
11 | During normal operation autofs uses a file descriptor opened on the | ||
12 | directory that is being managed in order to be able to issue control | ||
13 | operations. Using a file descriptor gives ioctl operations access to | ||
14 | autofs specific information stored in the super block. The operations | ||
15 | are things such as setting an autofs mount catatonic, setting the | ||
16 | expire timeout and requesting expire checks. As is explained below, | ||
17 | certain types of autofs triggered mounts can end up covering an autofs | ||
18 | mount itself which prevents us being able to use open(2) to obtain a | ||
19 | file descriptor for these operations if we don't already have one open. | ||
20 | |||
21 | Currently autofs uses "umount -l" (lazy umount) to clear active mounts | ||
22 | at restart. While using lazy umount works for most cases, anything that | ||
23 | needs to walk back up the mount tree to construct a path, such as | ||
24 | getcwd(2) and the proc file system /proc/<pid>/cwd, no longer works | ||
25 | because the point from which the path is constructed has been detached | ||
26 | from the mount tree. | ||
27 | |||
28 | The actual problem with autofs is that it can't reconnect to existing | ||
29 | mounts. Immediately one thinks of just adding the ability to remount | ||
30 | autofs file systems would solve it, but alas, that can't work. This is | ||
31 | because autofs direct mounts and the implementation of "on demand mount | ||
32 | and expire" of nested mount trees have the file system mounted directly | ||
33 | on top of the mount trigger directory dentry. | ||
34 | |||
35 | For example, there are two types of automount maps, direct (in the kernel | ||
36 | module source you will see a third type called an offset, which is just | ||
37 | a direct mount in disguise) and indirect. | ||
38 | |||
39 | Here is a master map with direct and indirect map entries: | ||
40 | |||
41 | /- /etc/auto.direct | ||
42 | /test /etc/auto.indirect | ||
43 | |||
44 | and the corresponding map files: | ||
45 | |||
46 | /etc/auto.direct: | ||
47 | |||
48 | /automount/dparse/g6 budgie:/autofs/export1 | ||
49 | /automount/dparse/g1 shark:/autofs/export1 | ||
50 | and so on. | ||
51 | |||
52 | /etc/auto.indirect: | ||
53 | |||
54 | g1 shark:/autofs/export1 | ||
55 | g6 budgie:/autofs/export1 | ||
56 | and so on. | ||
57 | |||
58 | For the above indirect map an autofs file system is mounted on /test and | ||
59 | mounts are triggered for each sub-directory key by the inode lookup | ||
60 | operation. So we see a mount of shark:/autofs/export1 on /test/g1, for | ||
61 | example. | ||
62 | |||
63 | The way that direct mounts are handled is by making an autofs mount on | ||
64 | each full path, such as /automount/dparse/g1, and using it as a mount | ||
65 | trigger. So when we walk on the path we mount shark:/autofs/export1 "on | ||
66 | top of this mount point". Since these are always directories we can | ||
67 | use the follow_link inode operation to trigger the mount. | ||
68 | |||
69 | But, each entry in direct and indirect maps can have offsets (making | ||
70 | them multi-mount map entries). | ||
71 | |||
72 | For example, an indirect mount map entry could also be: | ||
73 | |||
74 | g1 \ | ||
75 | / shark:/autofs/export5/testing/test \ | ||
76 | /s1 shark:/autofs/export/testing/test/s1 \ | ||
77 | /s2 shark:/autofs/export5/testing/test/s2 \ | ||
78 | /s1/ss1 shark:/autofs/export1 \ | ||
79 | /s2/ss2 shark:/autofs/export2 | ||
80 | |||
81 | and a similarly a direct mount map entry could also be: | ||
82 | |||
83 | /automount/dparse/g1 \ | ||
84 | / shark:/autofs/export5/testing/test \ | ||
85 | /s1 shark:/autofs/export/testing/test/s1 \ | ||
86 | /s2 shark:/autofs/export5/testing/test/s2 \ | ||
87 | /s1/ss1 shark:/autofs/export2 \ | ||
88 | /s2/ss2 shark:/autofs/export2 | ||
89 | |||
90 | One of the issues with version 4 of autofs was that, when mounting an | ||
91 | entry with a large number of offsets, possibly with nesting, we needed | ||
92 | to mount and umount all of the offsets as a single unit. Not really a | ||
93 | problem, except for people with a large number of offsets in map entries. | ||
94 | This mechanism is used for the well known "hosts" map and we have seen | ||
95 | cases (in 2.4) where the available number of mounts are exhausted or | ||
96 | where the number of privileged ports available is exhausted. | ||
97 | |||
98 | In version 5 we mount only as we go down the tree of offsets and | ||
99 | similarly for expiring them which resolves the above problem. There is | ||
100 | somewhat more detail to the implementation but it isn't needed for the | ||
101 | sake of the problem explanation. The one important detail is that these | ||
102 | offsets are implemented using the same mechanism as the direct mounts | ||
103 | above and so the mount points can be covered by a mount. | ||
104 | |||
105 | The current autofs implementation uses an ioctl file descriptor opened | ||
106 | on the mount point for control operations. The references held by the | ||
107 | descriptor are accounted for in checks made to determine if a mount is | ||
108 | in use and is also used to access autofs file system information held | ||
109 | in the mount super block. So the use of a file handle needs to be | ||
110 | retained. | ||
111 | |||
112 | |||
113 | The Solution | ||
114 | ============ | ||
115 | |||
116 | To be able to restart autofs leaving existing direct, indirect and | ||
117 | offset mounts in place we need to be able to obtain a file handle | ||
118 | for these potentially covered autofs mount points. Rather than just | ||
119 | implement an isolated operation it was decided to re-implement the | ||
120 | existing ioctl interface and add new operations to provide this | ||
121 | functionality. | ||
122 | |||
123 | In addition, to be able to reconstruct a mount tree that has busy mounts, | ||
124 | the uid and gid of the last user that triggered the mount needs to be | ||
125 | available because these can be used as macro substitution variables in | ||
126 | autofs maps. They are recorded at mount request time and an operation | ||
127 | has been added to retrieve them. | ||
128 | |||
129 | Since we're re-implementing the control interface, a couple of other | ||
130 | problems with the existing interface have been addressed. First, when | ||
131 | a mount or expire operation completes a status is returned to the | ||
132 | kernel by either a "send ready" or a "send fail" operation. The | ||
133 | "send fail" operation of the ioctl interface could only ever send | ||
134 | ENOENT so the re-implementation allows user space to send an actual | ||
135 | status. Another expensive operation in user space, for those using | ||
136 | very large maps, is discovering if a mount is present. Usually this | ||
137 | involves scanning /proc/mounts and since it needs to be done quite | ||
138 | often it can introduce significant overhead when there are many entries | ||
139 | in the mount table. An operation to lookup the mount status of a mount | ||
140 | point dentry (covered or not) has also been added. | ||
141 | |||
142 | Current kernel development policy recommends avoiding the use of the | ||
143 | ioctl mechanism in favor of systems such as Netlink. An implementation | ||
144 | using this system was attempted to evaluate its suitability and it was | ||
145 | found to be inadequate, in this case. The Generic Netlink system was | ||
146 | used for this as raw Netlink would lead to a significant increase in | ||
147 | complexity. There's no question that the Generic Netlink system is an | ||
148 | elegant solution for common case ioctl functions but it's not a complete | ||
149 | replacement probably because it's primary purpose in life is to be a | ||
150 | message bus implementation rather than specifically an ioctl replacement. | ||
151 | While it would be possible to work around this there is one concern | ||
152 | that lead to the decision to not use it. This is that the autofs | ||
153 | expire in the daemon has become far to complex because umount | ||
154 | candidates are enumerated, almost for no other reason than to "count" | ||
155 | the number of times to call the expire ioctl. This involves scanning | ||
156 | the mount table which has proved to be a big overhead for users with | ||
157 | large maps. The best way to improve this is try and get back to the | ||
158 | way the expire was done long ago. That is, when an expire request is | ||
159 | issued for a mount (file handle) we should continually call back to | ||
160 | the daemon until we can't umount any more mounts, then return the | ||
161 | appropriate status to the daemon. At the moment we just expire one | ||
162 | mount at a time. A Generic Netlink implementation would exclude this | ||
163 | possibility for future development due to the requirements of the | ||
164 | message bus architecture. | ||
165 | |||
166 | |||
167 | autofs4 Miscellaneous Device mount control interface | ||
168 | ==================================================== | ||
169 | |||
170 | The control interface is opening a device node, typically /dev/autofs. | ||
171 | |||
172 | All the ioctls use a common structure to pass the needed parameter | ||
173 | information and return operation results: | ||
174 | |||
175 | struct autofs_dev_ioctl { | ||
176 | __u32 ver_major; | ||
177 | __u32 ver_minor; | ||
178 | __u32 size; /* total size of data passed in | ||
179 | * including this struct */ | ||
180 | __s32 ioctlfd; /* automount command fd */ | ||
181 | |||
182 | __u32 arg1; /* Command parameters */ | ||
183 | __u32 arg2; | ||
184 | |||
185 | char path[0]; | ||
186 | }; | ||
187 | |||
188 | The ioctlfd field is a mount point file descriptor of an autofs mount | ||
189 | point. It is returned by the open call and is used by all calls except | ||
190 | the check for whether a given path is a mount point, where it may | ||
191 | optionally be used to check a specific mount corresponding to a given | ||
192 | mount point file descriptor, and when requesting the uid and gid of the | ||
193 | last successful mount on a directory within the autofs file system. | ||
194 | |||
195 | The fields arg1 and arg2 are used to communicate parameters and results of | ||
196 | calls made as described below. | ||
197 | |||
198 | The path field is used to pass a path where it is needed and the size field | ||
199 | is used account for the increased structure length when translating the | ||
200 | structure sent from user space. | ||
201 | |||
202 | This structure can be initialized before setting specific fields by using | ||
203 | the void function call init_autofs_dev_ioctl(struct autofs_dev_ioctl *). | ||
204 | |||
205 | All of the ioctls perform a copy of this structure from user space to | ||
206 | kernel space and return -EINVAL if the size parameter is smaller than | ||
207 | the structure size itself, -ENOMEM if the kernel memory allocation fails | ||
208 | or -EFAULT if the copy itself fails. Other checks include a version check | ||
209 | of the compiled in user space version against the module version and a | ||
210 | mismatch results in a -EINVAL return. If the size field is greater than | ||
211 | the structure size then a path is assumed to be present and is checked to | ||
212 | ensure it begins with a "/" and is NULL terminated, otherwise -EINVAL is | ||
213 | returned. Following these checks, for all ioctl commands except | ||
214 | AUTOFS_DEV_IOCTL_VERSION_CMD, AUTOFS_DEV_IOCTL_OPENMOUNT_CMD and | ||
215 | AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD the ioctlfd is validated and if it is | ||
216 | not a valid descriptor or doesn't correspond to an autofs mount point | ||
217 | an error of -EBADF, -ENOTTY or -EINVAL (not an autofs descriptor) is | ||
218 | returned. | ||
219 | |||
220 | |||
221 | The ioctls | ||
222 | ========== | ||
223 | |||
224 | An example of an implementation which uses this interface can be seen | ||
225 | in autofs version 5.0.4 and later in file lib/dev-ioctl-lib.c of the | ||
226 | distribution tar available for download from kernel.org in directory | ||
227 | /pub/linux/daemons/autofs/v5. | ||
228 | |||
229 | The device node ioctl operations implemented by this interface are: | ||
230 | |||
231 | |||
232 | AUTOFS_DEV_IOCTL_VERSION | ||
233 | ------------------------ | ||
234 | |||
235 | Get the major and minor version of the autofs4 device ioctl kernel module | ||
236 | implementation. It requires an initialized struct autofs_dev_ioctl as an | ||
237 | input parameter and sets the version information in the passed in structure. | ||
238 | It returns 0 on success or the error -EINVAL if a version mismatch is | ||
239 | detected. | ||
240 | |||
241 | |||
242 | AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD | ||
243 | ------------------------------------------------------------------ | ||
244 | |||
245 | Get the major and minor version of the autofs4 protocol version understood | ||
246 | by loaded module. This call requires an initialized struct autofs_dev_ioctl | ||
247 | with the ioctlfd field set to a valid autofs mount point descriptor | ||
248 | and sets the requested version number in structure field arg1. These | ||
249 | commands return 0 on success or one of the negative error codes if | ||
250 | validation fails. | ||
251 | |||
252 | |||
253 | AUTOFS_DEV_IOCTL_OPENMOUNT and AUTOFS_DEV_IOCTL_CLOSEMOUNT | ||
254 | ---------------------------------------------------------- | ||
255 | |||
256 | Obtain and release a file descriptor for an autofs managed mount point | ||
257 | path. The open call requires an initialized struct autofs_dev_ioctl with | ||
258 | the the path field set and the size field adjusted appropriately as well | ||
259 | as the arg1 field set to the device number of the autofs mount. The | ||
260 | device number can be obtained from the mount options shown in | ||
261 | /proc/mounts. The close call requires an initialized struct | ||
262 | autofs_dev_ioct with the ioctlfd field set to the descriptor obtained | ||
263 | from the open call. The release of the file descriptor can also be done | ||
264 | with close(2) so any open descriptors will also be closed at process exit. | ||
265 | The close call is included in the implemented operations largely for | ||
266 | completeness and to provide for a consistent user space implementation. | ||
267 | |||
268 | |||
269 | AUTOFS_DEV_IOCTL_READY_CMD and AUTOFS_DEV_IOCTL_FAIL_CMD | ||
270 | -------------------------------------------------------- | ||
271 | |||
272 | Return mount and expire result status from user space to the kernel. | ||
273 | Both of these calls require an initialized struct autofs_dev_ioctl | ||
274 | with the ioctlfd field set to the descriptor obtained from the open | ||
275 | call and the arg1 field set to the wait queue token number, received | ||
276 | by user space in the foregoing mount or expire request. The arg2 field | ||
277 | is set to the status to be returned. For the ready call this is always | ||
278 | 0 and for the fail call it is set to the errno of the operation. | ||
279 | |||
280 | |||
281 | AUTOFS_DEV_IOCTL_SETPIPEFD_CMD | ||
282 | ------------------------------ | ||
283 | |||
284 | Set the pipe file descriptor used for kernel communication to the daemon. | ||
285 | Normally this is set at mount time using an option but when reconnecting | ||
286 | to a existing mount we need to use this to tell the autofs mount about | ||
287 | the new kernel pipe descriptor. In order to protect mounts against | ||
288 | incorrectly setting the pipe descriptor we also require that the autofs | ||
289 | mount be catatonic (see next call). | ||
290 | |||
291 | The call requires an initialized struct autofs_dev_ioctl with the | ||
292 | ioctlfd field set to the descriptor obtained from the open call and | ||
293 | the arg1 field set to descriptor of the pipe. On success the call | ||
294 | also sets the process group id used to identify the controlling process | ||
295 | (eg. the owning automount(8) daemon) to the process group of the caller. | ||
296 | |||
297 | |||
298 | AUTOFS_DEV_IOCTL_CATATONIC_CMD | ||
299 | ------------------------------ | ||
300 | |||
301 | Make the autofs mount point catatonic. The autofs mount will no longer | ||
302 | issue mount requests, the kernel communication pipe descriptor is released | ||
303 | and any remaining waits in the queue released. | ||
304 | |||
305 | The call requires an initialized struct autofs_dev_ioctl with the | ||
306 | ioctlfd field set to the descriptor obtained from the open call. | ||
307 | |||
308 | |||
309 | AUTOFS_DEV_IOCTL_TIMEOUT_CMD | ||
310 | ---------------------------- | ||
311 | |||
312 | Set the expire timeout for mounts withing an autofs mount point. | ||
313 | |||
314 | The call requires an initialized struct autofs_dev_ioctl with the | ||
315 | ioctlfd field set to the descriptor obtained from the open call. | ||
316 | |||
317 | |||
318 | AUTOFS_DEV_IOCTL_REQUESTER_CMD | ||
319 | ------------------------------ | ||
320 | |||
321 | Return the uid and gid of the last process to successfully trigger a the | ||
322 | mount on the given path dentry. | ||
323 | |||
324 | The call requires an initialized struct autofs_dev_ioctl with the path | ||
325 | field set to the mount point in question and the size field adjusted | ||
326 | appropriately as well as the arg1 field set to the device number of the | ||
327 | containing autofs mount. Upon return the struct field arg1 contains the | ||
328 | uid and arg2 the gid. | ||
329 | |||
330 | When reconstructing an autofs mount tree with active mounts we need to | ||
331 | re-connect to mounts that may have used the original process uid and | ||
332 | gid (or string variations of them) for mount lookups within the map entry. | ||
333 | This call provides the ability to obtain this uid and gid so they may be | ||
334 | used by user space for the mount map lookups. | ||
335 | |||
336 | |||
337 | AUTOFS_DEV_IOCTL_EXPIRE_CMD | ||
338 | --------------------------- | ||
339 | |||
340 | Issue an expire request to the kernel for an autofs mount. Typically | ||
341 | this ioctl is called until no further expire candidates are found. | ||
342 | |||
343 | The call requires an initialized struct autofs_dev_ioctl with the | ||
344 | ioctlfd field set to the descriptor obtained from the open call. In | ||
345 | addition an immediate expire, independent of the mount timeout, can be | ||
346 | requested by setting the arg1 field to 1. If no expire candidates can | ||
347 | be found the ioctl returns -1 with errno set to EAGAIN. | ||
348 | |||
349 | This call causes the kernel module to check the mount corresponding | ||
350 | to the given ioctlfd for mounts that can be expired, issues an expire | ||
351 | request back to the daemon and waits for completion. | ||
352 | |||
353 | AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD | ||
354 | ------------------------------ | ||
355 | |||
356 | Checks if an autofs mount point is in use. | ||
357 | |||
358 | The call requires an initialized struct autofs_dev_ioctl with the | ||
359 | ioctlfd field set to the descriptor obtained from the open call and | ||
360 | it returns the result in the arg1 field, 1 for busy and 0 otherwise. | ||
361 | |||
362 | |||
363 | AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD | ||
364 | --------------------------------- | ||
365 | |||
366 | Check if the given path is a mountpoint. | ||
367 | |||
368 | The call requires an initialized struct autofs_dev_ioctl. There are two | ||
369 | possible variations. Both use the path field set to the path of the mount | ||
370 | point to check and the size field adjusted appropriately. One uses the | ||
371 | ioctlfd field to identify a specific mount point to check while the other | ||
372 | variation uses the path and optionaly arg1 set to an autofs mount type. | ||
373 | The call returns 1 if this is a mount point and sets arg1 to the device | ||
374 | number of the mount and field arg2 to the relevant super block magic | ||
375 | number (described below) or 0 if it isn't a mountpoint. In both cases | ||
376 | the the device number (as returned by new_encode_dev()) is returned | ||
377 | in field arg1. | ||
378 | |||
379 | If supplied with a file descriptor we're looking for a specific mount, | ||
380 | not necessarily at the top of the mounted stack. In this case the path | ||
381 | the descriptor corresponds to is considered a mountpoint if it is itself | ||
382 | a mountpoint or contains a mount, such as a multi-mount without a root | ||
383 | mount. In this case we return 1 if the descriptor corresponds to a mount | ||
384 | point and and also returns the super magic of the covering mount if there | ||
385 | is one or 0 if it isn't a mountpoint. | ||
386 | |||
387 | If a path is supplied (and the ioctlfd field is set to -1) then the path | ||
388 | is looked up and is checked to see if it is the root of a mount. If a | ||
389 | type is also given we are looking for a particular autofs mount and if | ||
390 | a match isn't found a fail is returned. If the the located path is the | ||
391 | root of a mount 1 is returned along with the super magic of the mount | ||
392 | or 0 otherwise. | ||
393 | |||
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index b45f3c1b8b43..295f26cd895a 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt | |||
@@ -193,6 +193,5 @@ kernel source: <file:fs/ext3/> | |||
193 | programs: http://e2fsprogs.sourceforge.net/ | 193 | programs: http://e2fsprogs.sourceforge.net/ |
194 | http://ext2resize.sourceforge.net | 194 | http://ext2resize.sourceforge.net |
195 | 195 | ||
196 | useful links: http://www.zip.com.au/~akpm/linux/ext3/ext3-usage.html | 196 | useful links: http://www-106.ibm.com/developerworks/linux/library/l-fs7/ |
197 | http://www-106.ibm.com/developerworks/linux/library/l-fs7/ | ||
198 | http://www-106.ibm.com/developerworks/linux/library/l-fs8/ | 197 | http://www-106.ibm.com/developerworks/linux/library/l-fs8/ |
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 0d5394920a31..174eaff7ded9 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -2,19 +2,24 @@ | |||
2 | Ext4 Filesystem | 2 | Ext4 Filesystem |
3 | =============== | 3 | =============== |
4 | 4 | ||
5 | This is a development version of the ext4 filesystem, an advanced level | 5 | Ext4 is an an advanced level of the ext3 filesystem which incorporates |
6 | of the ext3 filesystem which incorporates scalability and reliability | 6 | scalability and reliability enhancements for supporting large filesystems |
7 | enhancements for supporting large filesystems (64 bit) in keeping with | 7 | (64 bit) in keeping with increasing disk capacities and state-of-the-art |
8 | increasing disk capacities and state-of-the-art feature requirements. | 8 | feature requirements. |
9 | 9 | ||
10 | Mailing list: linux-ext4@vger.kernel.org | 10 | Mailing list: linux-ext4@vger.kernel.org |
11 | Web site: http://ext4.wiki.kernel.org | ||
11 | 12 | ||
12 | 13 | ||
13 | 1. Quick usage instructions: | 14 | 1. Quick usage instructions: |
14 | =========================== | 15 | =========================== |
15 | 16 | ||
17 | Note: More extensive information for getting started with ext4 can be | ||
18 | found at the ext4 wiki site at the URL: | ||
19 | http://ext4.wiki.kernel.org/index.php/Ext4_Howto | ||
20 | |||
16 | - Compile and install the latest version of e2fsprogs (as of this | 21 | - Compile and install the latest version of e2fsprogs (as of this |
17 | writing version 1.41) from: | 22 | writing version 1.41.3) from: |
18 | 23 | ||
19 | http://sourceforge.net/project/showfiles.php?group_id=2406 | 24 | http://sourceforge.net/project/showfiles.php?group_id=2406 |
20 | 25 | ||
@@ -32,28 +37,26 @@ Mailing list: linux-ext4@vger.kernel.org | |||
32 | you will need to merge your changes with the version from e2fsprogs | 37 | you will need to merge your changes with the version from e2fsprogs |
33 | 1.41.x. | 38 | 1.41.x. |
34 | 39 | ||
35 | - Create a new filesystem using the ext4dev filesystem type: | 40 | - Create a new filesystem using the ext4 filesystem type: |
36 | 41 | ||
37 | # mke2fs -t ext4dev /dev/hda1 | 42 | # mke2fs -t ext4 /dev/hda1 |
38 | 43 | ||
39 | Or configure an existing ext3 filesystem to support extents and set | 44 | Or to configure an existing ext3 filesystem to support extents: |
40 | the test_fs flag to indicate that it's ok for an in-development | ||
41 | filesystem to touch this filesystem: | ||
42 | 45 | ||
43 | # tune2fs -O extents -E test_fs /dev/hda1 | 46 | # tune2fs -O extents /dev/hda1 |
44 | 47 | ||
45 | If the filesystem was created with 128 byte inodes, it can be | 48 | If the filesystem was created with 128 byte inodes, it can be |
46 | converted to use 256 byte for greater efficiency via: | 49 | converted to use 256 byte for greater efficiency via: |
47 | 50 | ||
48 | # tune2fs -I 256 /dev/hda1 | 51 | # tune2fs -I 256 /dev/hda1 |
49 | 52 | ||
50 | (Note: we currently do not have tools to convert an ext4dev | 53 | (Note: we currently do not have tools to convert an ext4 |
51 | filesystem back to ext3; so please do not do try this on production | 54 | filesystem back to ext3; so please do not do try this on production |
52 | filesystems.) | 55 | filesystems.) |
53 | 56 | ||
54 | - Mounting: | 57 | - Mounting: |
55 | 58 | ||
56 | # mount -t ext4dev /dev/hda1 /wherever | 59 | # mount -t ext4 /dev/hda1 /wherever |
57 | 60 | ||
58 | - When comparing performance with other filesystems, remember that | 61 | - When comparing performance with other filesystems, remember that |
59 | ext3/4 by default offers higher data integrity guarantees than most. | 62 | ext3/4 by default offers higher data integrity guarantees than most. |
@@ -104,8 +107,8 @@ exist yet so I'm not sure they're in the near-term roadmap. | |||
104 | The big performance win will come with mballoc, delalloc and flex_bg | 107 | The big performance win will come with mballoc, delalloc and flex_bg |
105 | grouping of bitmaps and inode tables. Some test results available here: | 108 | grouping of bitmaps and inode tables. Some test results available here: |
106 | 109 | ||
107 | - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html | 110 | - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html |
108 | - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html | 111 | - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html |
109 | 112 | ||
110 | 3. Options | 113 | 3. Options |
111 | ========== | 114 | ========== |
@@ -177,6 +180,11 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in | |||
177 | your disks are battery-backed in one way or another, | 180 | your disks are battery-backed in one way or another, |
178 | disabling barriers may safely improve performance. | 181 | disabling barriers may safely improve performance. |
179 | 182 | ||
183 | inode_readahead=n This tuning parameter controls the maximum | ||
184 | number of inode table blocks that ext4's inode | ||
185 | table readahead algorithm will pre-read into | ||
186 | the buffer cache. The default value is 32 blocks. | ||
187 | |||
180 | orlov (*) This enables the new Orlov block allocator. It is | 188 | orlov (*) This enables the new Orlov block allocator. It is |
181 | enabled by default. | 189 | enabled by default. |
182 | 190 | ||
@@ -209,15 +217,17 @@ noreservation | |||
209 | bsddf (*) Make 'df' act like BSD. | 217 | bsddf (*) Make 'df' act like BSD. |
210 | minixdf Make 'df' act like Minix. | 218 | minixdf Make 'df' act like Minix. |
211 | 219 | ||
212 | check=none Don't do extra checking of bitmaps on mount. | ||
213 | nocheck | ||
214 | |||
215 | debug Extra debugging information is sent to syslog. | 220 | debug Extra debugging information is sent to syslog. |
216 | 221 | ||
217 | errors=remount-ro(*) Remount the filesystem read-only on an error. | 222 | errors=remount-ro(*) Remount the filesystem read-only on an error. |
218 | errors=continue Keep going on a filesystem error. | 223 | errors=continue Keep going on a filesystem error. |
219 | errors=panic Panic and halt the machine if an error occurs. | 224 | errors=panic Panic and halt the machine if an error occurs. |
220 | 225 | ||
226 | data_err=ignore(*) Just print an error message if an error occurs | ||
227 | in a file data buffer in ordered mode. | ||
228 | data_err=abort Abort the journal if an error occurs in a file | ||
229 | data buffer in ordered mode. | ||
230 | |||
221 | grpid Give objects the same group ID as their creator. | 231 | grpid Give objects the same group ID as their creator. |
222 | bsdgroups | 232 | bsdgroups |
223 | 233 | ||
@@ -243,8 +253,6 @@ nobh (a) cache disk block mapping information | |||
243 | "nobh" option tries to avoid associating buffer | 253 | "nobh" option tries to avoid associating buffer |
244 | heads (supported only for "writeback" mode). | 254 | heads (supported only for "writeback" mode). |
245 | 255 | ||
246 | mballoc (*) Use the multiple block allocator for block allocation | ||
247 | nomballoc disabled multiple block allocator for block allocation. | ||
248 | stripe=n Number of filesystem blocks that mballoc will try | 256 | stripe=n Number of filesystem blocks that mballoc will try |
249 | to use for allocation size and alignment. For RAID5/6 | 257 | to use for allocation size and alignment. For RAID5/6 |
250 | systems this should be the number of data | 258 | systems this should be the number of data |
@@ -252,6 +260,7 @@ stripe=n Number of filesystem blocks that mballoc will try | |||
252 | delalloc (*) Deferring block allocation until write-out time. | 260 | delalloc (*) Deferring block allocation until write-out time. |
253 | nodelalloc Disable delayed allocation. Blocks are allocation | 261 | nodelalloc Disable delayed allocation. Blocks are allocation |
254 | when data is copied from user to page cache. | 262 | when data is copied from user to page cache. |
263 | |||
255 | Data Mode | 264 | Data Mode |
256 | ========= | 265 | ========= |
257 | There are 3 different data modes: | 266 | There are 3 different data modes: |
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt new file mode 100644 index 000000000000..1e3defcfe50b --- /dev/null +++ b/Documentation/filesystems/fiemap.txt | |||
@@ -0,0 +1,228 @@ | |||
1 | ============ | ||
2 | Fiemap Ioctl | ||
3 | ============ | ||
4 | |||
5 | The fiemap ioctl is an efficient method for userspace to get file | ||
6 | extent mappings. Instead of block-by-block mapping (such as bmap), fiemap | ||
7 | returns a list of extents. | ||
8 | |||
9 | |||
10 | Request Basics | ||
11 | -------------- | ||
12 | |||
13 | A fiemap request is encoded within struct fiemap: | ||
14 | |||
15 | struct fiemap { | ||
16 | __u64 fm_start; /* logical offset (inclusive) at | ||
17 | * which to start mapping (in) */ | ||
18 | __u64 fm_length; /* logical length of mapping which | ||
19 | * userspace cares about (in) */ | ||
20 | __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ | ||
21 | __u32 fm_mapped_extents; /* number of extents that were | ||
22 | * mapped (out) */ | ||
23 | __u32 fm_extent_count; /* size of fm_extents array (in) */ | ||
24 | __u32 fm_reserved; | ||
25 | struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ | ||
26 | }; | ||
27 | |||
28 | |||
29 | fm_start, and fm_length specify the logical range within the file | ||
30 | which the process would like mappings for. Extents returned mirror | ||
31 | those on disk - that is, the logical offset of the 1st returned extent | ||
32 | may start before fm_start, and the range covered by the last returned | ||
33 | extent may end after fm_length. All offsets and lengths are in bytes. | ||
34 | |||
35 | Certain flags to modify the way in which mappings are looked up can be | ||
36 | set in fm_flags. If the kernel doesn't understand some particular | ||
37 | flags, it will return EBADR and the contents of fm_flags will contain | ||
38 | the set of flags which caused the error. If the kernel is compatible | ||
39 | with all flags passed, the contents of fm_flags will be unmodified. | ||
40 | It is up to userspace to determine whether rejection of a particular | ||
41 | flag is fatal to it's operation. This scheme is intended to allow the | ||
42 | fiemap interface to grow in the future but without losing | ||
43 | compatibility with old software. | ||
44 | |||
45 | fm_extent_count specifies the number of elements in the fm_extents[] array | ||
46 | that can be used to return extents. If fm_extent_count is zero, then the | ||
47 | fm_extents[] array is ignored (no extents will be returned), and the | ||
48 | fm_mapped_extents count will hold the number of extents needed in | ||
49 | fm_extents[] to hold the file's current mapping. Note that there is | ||
50 | nothing to prevent the file from changing between calls to FIEMAP. | ||
51 | |||
52 | The following flags can be set in fm_flags: | ||
53 | |||
54 | * FIEMAP_FLAG_SYNC | ||
55 | If this flag is set, the kernel will sync the file before mapping extents. | ||
56 | |||
57 | * FIEMAP_FLAG_XATTR | ||
58 | If this flag is set, the extents returned will describe the inodes | ||
59 | extended attribute lookup tree, instead of it's data tree. | ||
60 | |||
61 | |||
62 | Extent Mapping | ||
63 | -------------- | ||
64 | |||
65 | Extent information is returned within the embedded fm_extents array | ||
66 | which userspace must allocate along with the fiemap structure. The | ||
67 | number of elements in the fiemap_extents[] array should be passed via | ||
68 | fm_extent_count. The number of extents mapped by kernel will be | ||
69 | returned via fm_mapped_extents. If the number of fiemap_extents | ||
70 | allocated is less than would be required to map the requested range, | ||
71 | the maximum number of extents that can be mapped in the fm_extent[] | ||
72 | array will be returned and fm_mapped_extents will be equal to | ||
73 | fm_extent_count. In that case, the last extent in the array will not | ||
74 | complete the requested range and will not have the FIEMAP_EXTENT_LAST | ||
75 | flag set (see the next section on extent flags). | ||
76 | |||
77 | Each extent is described by a single fiemap_extent structure as | ||
78 | returned in fm_extents. | ||
79 | |||
80 | struct fiemap_extent { | ||
81 | __u64 fe_logical; /* logical offset in bytes for the start of | ||
82 | * the extent */ | ||
83 | __u64 fe_physical; /* physical offset in bytes for the start | ||
84 | * of the extent */ | ||
85 | __u64 fe_length; /* length in bytes for the extent */ | ||
86 | __u64 fe_reserved64[2]; | ||
87 | __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ | ||
88 | __u32 fe_reserved[3]; | ||
89 | }; | ||
90 | |||
91 | All offsets and lengths are in bytes and mirror those on disk. It is valid | ||
92 | for an extents logical offset to start before the request or it's logical | ||
93 | length to extend past the request. Unless FIEMAP_EXTENT_NOT_ALIGNED is | ||
94 | returned, fe_logical, fe_physical, and fe_length will be aligned to the | ||
95 | block size of the file system. With the exception of extents flagged as | ||
96 | FIEMAP_EXTENT_MERGED, adjacent extents will not be merged. | ||
97 | |||
98 | The fe_flags field contains flags which describe the extent returned. | ||
99 | A special flag, FIEMAP_EXTENT_LAST is always set on the last extent in | ||
100 | the file so that the process making fiemap calls can determine when no | ||
101 | more extents are available, without having to call the ioctl again. | ||
102 | |||
103 | Some flags are intentionally vague and will always be set in the | ||
104 | presence of other more specific flags. This way a program looking for | ||
105 | a general property does not have to know all existing and future flags | ||
106 | which imply that property. | ||
107 | |||
108 | For example, if FIEMAP_EXTENT_DATA_INLINE or FIEMAP_EXTENT_DATA_TAIL | ||
109 | are set, FIEMAP_EXTENT_NOT_ALIGNED will also be set. A program looking | ||
110 | for inline or tail-packed data can key on the specific flag. Software | ||
111 | which simply cares not to try operating on non-aligned extents | ||
112 | however, can just key on FIEMAP_EXTENT_NOT_ALIGNED, and not have to | ||
113 | worry about all present and future flags which might imply unaligned | ||
114 | data. Note that the opposite is not true - it would be valid for | ||
115 | FIEMAP_EXTENT_NOT_ALIGNED to appear alone. | ||
116 | |||
117 | * FIEMAP_EXTENT_LAST | ||
118 | This is the last extent in the file. A mapping attempt past this | ||
119 | extent will return nothing. | ||
120 | |||
121 | * FIEMAP_EXTENT_UNKNOWN | ||
122 | The location of this extent is currently unknown. This may indicate | ||
123 | the data is stored on an inaccessible volume or that no storage has | ||
124 | been allocated for the file yet. | ||
125 | |||
126 | * FIEMAP_EXTENT_DELALLOC | ||
127 | - This will also set FIEMAP_EXTENT_UNKNOWN. | ||
128 | Delayed allocation - while there is data for this extent, it's | ||
129 | physical location has not been allocated yet. | ||
130 | |||
131 | * FIEMAP_EXTENT_ENCODED | ||
132 | This extent does not consist of plain filesystem blocks but is | ||
133 | encoded (e.g. encrypted or compressed). Reading the data in this | ||
134 | extent via I/O to the block device will have undefined results. | ||
135 | |||
136 | Note that it is *always* undefined to try to update the data | ||
137 | in-place by writing to the indicated location without the | ||
138 | assistance of the filesystem, or to access the data using the | ||
139 | information returned by the FIEMAP interface while the filesystem | ||
140 | is mounted. In other words, user applications may only read the | ||
141 | extent data via I/O to the block device while the filesystem is | ||
142 | unmounted, and then only if the FIEMAP_EXTENT_ENCODED flag is | ||
143 | clear; user applications must not try reading or writing to the | ||
144 | filesystem via the block device under any other circumstances. | ||
145 | |||
146 | * FIEMAP_EXTENT_DATA_ENCRYPTED | ||
147 | - This will also set FIEMAP_EXTENT_ENCODED | ||
148 | The data in this extent has been encrypted by the file system. | ||
149 | |||
150 | * FIEMAP_EXTENT_NOT_ALIGNED | ||
151 | Extent offsets and length are not guaranteed to be block aligned. | ||
152 | |||
153 | * FIEMAP_EXTENT_DATA_INLINE | ||
154 | This will also set FIEMAP_EXTENT_NOT_ALIGNED | ||
155 | Data is located within a meta data block. | ||
156 | |||
157 | * FIEMAP_EXTENT_DATA_TAIL | ||
158 | This will also set FIEMAP_EXTENT_NOT_ALIGNED | ||
159 | Data is packed into a block with data from other files. | ||
160 | |||
161 | * FIEMAP_EXTENT_UNWRITTEN | ||
162 | Unwritten extent - the extent is allocated but it's data has not been | ||
163 | initialized. This indicates the extent's data will be all zero if read | ||
164 | through the filesystem but the contents are undefined if read directly from | ||
165 | the device. | ||
166 | |||
167 | * FIEMAP_EXTENT_MERGED | ||
168 | This will be set when a file does not support extents, i.e., it uses a block | ||
169 | based addressing scheme. Since returning an extent for each block back to | ||
170 | userspace would be highly inefficient, the kernel will try to merge most | ||
171 | adjacent blocks into 'extents'. | ||
172 | |||
173 | |||
174 | VFS -> File System Implementation | ||
175 | --------------------------------- | ||
176 | |||
177 | File systems wishing to support fiemap must implement a ->fiemap callback on | ||
178 | their inode_operations structure. The fs ->fiemap call is responsible for | ||
179 | defining it's set of supported fiemap flags, and calling a helper function on | ||
180 | each discovered extent: | ||
181 | |||
182 | struct inode_operations { | ||
183 | ... | ||
184 | |||
185 | int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, | ||
186 | u64 len); | ||
187 | |||
188 | ->fiemap is passed struct fiemap_extent_info which describes the | ||
189 | fiemap request: | ||
190 | |||
191 | struct fiemap_extent_info { | ||
192 | unsigned int fi_flags; /* Flags as passed from user */ | ||
193 | unsigned int fi_extents_mapped; /* Number of mapped extents */ | ||
194 | unsigned int fi_extents_max; /* Size of fiemap_extent array */ | ||
195 | struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */ | ||
196 | }; | ||
197 | |||
198 | It is intended that the file system should not need to access any of this | ||
199 | structure directly. | ||
200 | |||
201 | |||
202 | Flag checking should be done at the beginning of the ->fiemap callback via the | ||
203 | fiemap_check_flags() helper: | ||
204 | |||
205 | int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); | ||
206 | |||
207 | The struct fieinfo should be passed in as recieved from ioctl_fiemap(). The | ||
208 | set of fiemap flags which the fs understands should be passed via fs_flags. If | ||
209 | fiemap_check_flags finds invalid user flags, it will place the bad values in | ||
210 | fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from | ||
211 | fiemap_check_flags(), it should immediately exit, returning that error back to | ||
212 | ioctl_fiemap(). | ||
213 | |||
214 | |||
215 | For each extent in the request range, the file system should call | ||
216 | the helper function, fiemap_fill_next_extent(): | ||
217 | |||
218 | int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, | ||
219 | u64 phys, u64 len, u32 flags, u32 dev); | ||
220 | |||
221 | fiemap_fill_next_extent() will use the passed values to populate the | ||
222 | next free extent in the fm_extents array. 'General' extent flags will | ||
223 | automatically be set from specific flags on behalf of the calling file | ||
224 | system so that the userspace API is not broken. | ||
225 | |||
226 | fiemap_fill_next_extent() returns 0 on success, and 1 when the | ||
227 | user-supplied fm_extents array is full. If an error is encountered | ||
228 | while copying the extent to user memory, -EFAULT will be returned. | ||
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt index 31b329172343..68baddf3c3e0 100644 --- a/Documentation/filesystems/nfsroot.txt +++ b/Documentation/filesystems/nfsroot.txt | |||
@@ -169,7 +169,7 @@ They depend on various facilities being available: | |||
169 | 3.1) Booting from a floppy using syslinux | 169 | 3.1) Booting from a floppy using syslinux |
170 | 170 | ||
171 | When building kernels, an easy way to create a boot floppy that uses | 171 | When building kernels, an easy way to create a boot floppy that uses |
172 | syslinux is to use the zdisk or bzdisk make targets which use | 172 | syslinux is to use the zdisk or bzdisk make targets which use zimage |
173 | and bzimage images respectively. Both targets accept the | 173 | and bzimage images respectively. Both targets accept the |
174 | FDARGS parameter which can be used to set the kernel command line. | 174 | FDARGS parameter which can be used to set the kernel command line. |
175 | 175 | ||
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index c318a8bbb1ef..4340cc825796 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt | |||
@@ -76,3 +76,9 @@ localalloc=8(*) Allows custom localalloc size in MB. If the value is too | |||
76 | large, the fs will silently revert it to the default. | 76 | large, the fs will silently revert it to the default. |
77 | Localalloc is not enabled for local mounts. | 77 | Localalloc is not enabled for local mounts. |
78 | localflocks This disables cluster aware flock. | 78 | localflocks This disables cluster aware flock. |
79 | inode64 Indicates that Ocfs2 is allowed to create inodes at | ||
80 | any location in the filesystem, including those which | ||
81 | will result in inode numbers occupying more than 32 | ||
82 | bits of significance. | ||
83 | user_xattr (*) Enables Extended User Attributes. | ||
84 | nouser_xattr Disables Extended User Attributes. | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 394eb2cc1c39..c032bf39e8b9 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -923,45 +923,44 @@ CPUs. | |||
923 | The "procs_blocked" line gives the number of processes currently blocked, | 923 | The "procs_blocked" line gives the number of processes currently blocked, |
924 | waiting for I/O to complete. | 924 | waiting for I/O to complete. |
925 | 925 | ||
926 | |||
926 | 1.9 Ext4 file system parameters | 927 | 1.9 Ext4 file system parameters |
927 | ------------------------------ | 928 | ------------------------------ |
928 | Ext4 file system have one directory per partition under /proc/fs/ext4/ | ||
929 | # ls /proc/fs/ext4/hdc/ | ||
930 | group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req | ||
931 | stats stream_req | ||
932 | |||
933 | mb_groups: | ||
934 | This file gives the details of multiblock allocator buddy cache of free blocks | ||
935 | |||
936 | mb_history: | ||
937 | Multiblock allocation history. | ||
938 | |||
939 | stats: | ||
940 | This file indicate whether the multiblock allocator should start collecting | ||
941 | statistics. The statistics are shown during unmount | ||
942 | 929 | ||
943 | group_prealloc: | 930 | Information about mounted ext4 file systems can be found in |
944 | The multiblock allocator normalize the block allocation request to | 931 | /proc/fs/ext4. Each mounted filesystem will have a directory in |
945 | group_prealloc filesystem blocks if we don't have strip value set. | 932 | /proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or |
946 | The stripe value can be specified at mount time or during mke2fs. | 933 | /proc/fs/ext4/dm-0). The files in each per-device directory are shown |
934 | in Table 1-10, below. | ||
947 | 935 | ||
948 | max_to_scan: | 936 | Table 1-10: Files in /proc/fs/ext4/<devname> |
949 | How long multiblock allocator can look for a best extent (in found extents) | 937 | .............................................................................. |
950 | 938 | File Content | |
951 | min_to_scan: | 939 | mb_groups details of multiblock allocator buddy cache of free blocks |
952 | How long multiblock allocator must look for a best extent | 940 | mb_history multiblock allocation history |
953 | 941 | stats controls whether the multiblock allocator should start | |
954 | order2_req: | 942 | collecting statistics, which are shown during the unmount |
955 | Multiblock allocator use 2^N search using buddies only for requests greater | 943 | group_prealloc the multiblock allocator will round up allocation |
956 | than or equal to order2_req. The request size is specfied in file system | 944 | requests to a multiple of this tuning parameter if the |
957 | blocks. A value of 2 indicate only if the requests are greater than or equal | 945 | stripe size is not set in the ext4 superblock |
958 | to 4 blocks. | 946 | max_to_scan The maximum number of extents the multiblock allocator |
947 | will search to find the best extent | ||
948 | min_to_scan The minimum number of extents the multiblock allocator | ||
949 | will search to find the best extent | ||
950 | order2_req Tuning parameter which controls the minimum size for | ||
951 | requests (as a power of 2) where the buddy cache is | ||
952 | used | ||
953 | stream_req Files which have fewer blocks than this tunable | ||
954 | parameter will have their blocks allocated out of a | ||
955 | block group specific preallocation pool, so that small | ||
956 | files are packed closely together. Each large file | ||
957 | will have its blocks allocated out of its own unique | ||
958 | preallocation pool. | ||
959 | inode_readahead Tuning parameter which controls the maximum number of | ||
960 | inode table blocks that ext4's inode table readahead | ||
961 | algorithm will pre-read into the buffer cache | ||
962 | .............................................................................. | ||
959 | 963 | ||
960 | stream_req: | ||
961 | Files smaller than stream_req are served by the stream allocator, whose | ||
962 | purpose is to pack requests as close each to other as possible to | ||
963 | produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16 | ||
964 | filesystem block size will use group based preallocation. | ||
965 | 964 | ||
966 | ------------------------------------------------------------------------------ | 965 | ------------------------------------------------------------------------------ |
967 | Summary | 966 | Summary |
@@ -1322,6 +1321,18 @@ debugging information is displayed on console. | |||
1322 | NMI switch that most IA32 servers have fires unknown NMI up, for example. | 1321 | NMI switch that most IA32 servers have fires unknown NMI up, for example. |
1323 | If a system hangs up, try pressing the NMI switch. | 1322 | If a system hangs up, try pressing the NMI switch. |
1324 | 1323 | ||
1324 | panic_on_unrecovered_nmi | ||
1325 | ------------------------ | ||
1326 | |||
1327 | The default Linux behaviour on an NMI of either memory or unknown is to continue | ||
1328 | operation. For many environments such as scientific computing it is preferable | ||
1329 | that the box is taken out and the error dealt with than an uncorrected | ||
1330 | parity/ECC error get propogated. | ||
1331 | |||
1332 | A small number of systems do generate NMI's for bizarre random reasons such as | ||
1333 | power management so the default is off. That sysctl works like the existing | ||
1334 | panic controls already in that directory. | ||
1335 | |||
1325 | nmi_watchdog | 1336 | nmi_watchdog |
1326 | ------------ | 1337 | ------------ |
1327 | 1338 | ||
@@ -1332,13 +1343,6 @@ determine whether or not they are still functioning properly. | |||
1332 | Because the NMI watchdog shares registers with oprofile, by disabling the NMI | 1343 | Because the NMI watchdog shares registers with oprofile, by disabling the NMI |
1333 | watchdog, oprofile may have more registers to utilize. | 1344 | watchdog, oprofile may have more registers to utilize. |
1334 | 1345 | ||
1335 | maps_protect | ||
1336 | ------------ | ||
1337 | |||
1338 | Enables/Disables the protection of the per-process proc entries "maps" and | ||
1339 | "smaps". When enabled, the contents of these files are visible only to | ||
1340 | readers that are allowed to ptrace() the given process. | ||
1341 | |||
1342 | msgmni | 1346 | msgmni |
1343 | ------ | 1347 | ------ |
1344 | 1348 | ||
@@ -2413,6 +2417,8 @@ The following 4 memory types are supported: | |||
2413 | - (bit 1) anonymous shared memory | 2417 | - (bit 1) anonymous shared memory |
2414 | - (bit 2) file-backed private memory | 2418 | - (bit 2) file-backed private memory |
2415 | - (bit 3) file-backed shared memory | 2419 | - (bit 3) file-backed shared memory |
2420 | - (bit 4) ELF header pages in file-backed private memory areas (it is | ||
2421 | effective only if the bit 2 is cleared) | ||
2416 | 2422 | ||
2417 | Note that MMIO pages such as frame buffer are never dumped and vDSO pages | 2423 | Note that MMIO pages such as frame buffer are never dumped and vDSO pages |
2418 | are always dumped regardless of the bitmask status. | 2424 | are always dumped regardless of the bitmask status. |
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index 7be232b44ee4..62fe9b1e0890 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt | |||
@@ -263,7 +263,7 @@ User Mode Linux, like so: | |||
263 | sleep(999999999); | 263 | sleep(999999999); |
264 | } | 264 | } |
265 | EOF | 265 | EOF |
266 | gcc -static hello2.c -o init | 266 | gcc -static hello.c -o init |
267 | echo init | cpio -o -H newc | gzip > test.cpio.gz | 267 | echo init | cpio -o -H newc | gzip > test.cpio.gz |
268 | # Testing external initramfs using the initrd loading mechanism. | 268 | # Testing external initramfs using the initrd loading mechanism. |
269 | qemu -kernel /boot/vmlinuz -initrd test.cpio.gz /dev/zero | 269 | qemu -kernel /boot/vmlinuz -initrd test.cpio.gz /dev/zero |