diff options
author | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 04:22:15 -0400 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 04:22:59 -0400 |
commit | 07f9479a40cc778bc1462ada11f95b01360ae4ff (patch) | |
tree | 0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /Documentation/filesystems | |
parent | 9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff) | |
parent | cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff) |
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be
applied for files that didn't exist on the old branch.
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 2 | ||||
-rw-r--r-- | Documentation/filesystems/adfs.txt | 18 | ||||
-rw-r--r-- | Documentation/filesystems/autofs4-mount-control.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/caching/netfs-api.txt | 18 | ||||
-rw-r--r-- | Documentation/filesystems/configfs/configfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/exofs.txt | 10 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 211 | ||||
-rw-r--r-- | Documentation/filesystems/gfs2-uevents.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/gfs2.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ntfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ocfs2.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/path-lookup.txt | 4 | ||||
-rw-r--r-- | Documentation/filesystems/pohmelfs/network_protocol.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/porting | 16 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 6 | ||||
-rw-r--r-- | Documentation/filesystems/squashfs.txt | 30 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 4 | ||||
-rw-r--r-- | Documentation/filesystems/xfs-delayed-logging-design.txt | 15 |
19 files changed, 295 insertions, 55 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2e994efe12cb..61b31acb9176 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -128,7 +128,7 @@ alloc_inode: | |||
128 | destroy_inode: | 128 | destroy_inode: |
129 | dirty_inode: (must not sleep) | 129 | dirty_inode: (must not sleep) |
130 | write_inode: | 130 | write_inode: |
131 | drop_inode: !!!inode_lock!!! | 131 | drop_inode: !!!inode->i_lock!!! |
132 | evict_inode: | 132 | evict_inode: |
133 | put_super: write | 133 | put_super: write |
134 | write_super: read | 134 | write_super: read |
diff --git a/Documentation/filesystems/adfs.txt b/Documentation/filesystems/adfs.txt index 9e8811f92b84..5949766353f7 100644 --- a/Documentation/filesystems/adfs.txt +++ b/Documentation/filesystems/adfs.txt | |||
@@ -9,6 +9,9 @@ Mount options for ADFS | |||
9 | will be nnn. Default 0700. | 9 | will be nnn. Default 0700. |
10 | othmask=nnn The permission mask for ADFS 'other' permissions | 10 | othmask=nnn The permission mask for ADFS 'other' permissions |
11 | will be nnn. Default 0077. | 11 | will be nnn. Default 0077. |
12 | ftsuffix=n When ftsuffix=0, no file type suffix will be applied. | ||
13 | When ftsuffix=1, a hexadecimal suffix corresponding to | ||
14 | the RISC OS file type will be added. Default 0. | ||
12 | 15 | ||
13 | Mapping of ADFS permissions to Linux permissions | 16 | Mapping of ADFS permissions to Linux permissions |
14 | ------------------------------------------------ | 17 | ------------------------------------------------ |
@@ -55,3 +58,18 @@ Mapping of ADFS permissions to Linux permissions | |||
55 | 58 | ||
56 | You can therefore tailor the permission translation to whatever you | 59 | You can therefore tailor the permission translation to whatever you |
57 | desire the permissions should be under Linux. | 60 | desire the permissions should be under Linux. |
61 | |||
62 | RISC OS file type suffix | ||
63 | ------------------------ | ||
64 | |||
65 | RISC OS file types are stored in bits 19..8 of the file load address. | ||
66 | |||
67 | To enable non-RISC OS systems to be used to store files without losing | ||
68 | file type information, a file naming convention was devised (initially | ||
69 | for use with NFS) such that a hexadecimal suffix of the form ,xyz | ||
70 | denoted the file type: e.g. BasicFile,ffb is a BASIC (0xffb) file. This | ||
71 | naming convention is now also used by RISC OS emulators such as RPCEmu. | ||
72 | |||
73 | Mounting an ADFS disc with option ftsuffix=1 will cause appropriate file | ||
74 | type suffixes to be appended to file names read from a directory. If the | ||
75 | ftsuffix option is zero or omitted, no file type suffixes will be added. | ||
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 51986bf08a4d..4c95935cbcf4 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt | |||
@@ -309,7 +309,7 @@ ioctlfd field set to the descriptor obtained from the open call. | |||
309 | AUTOFS_DEV_IOCTL_TIMEOUT_CMD | 309 | AUTOFS_DEV_IOCTL_TIMEOUT_CMD |
310 | ---------------------------- | 310 | ---------------------------- |
311 | 311 | ||
312 | Set the expire timeout for mounts withing an autofs mount point. | 312 | Set the expire timeout for mounts within an autofs mount point. |
313 | 313 | ||
314 | The call requires an initialized struct autofs_dev_ioctl with the | 314 | The call requires an initialized struct autofs_dev_ioctl with the |
315 | ioctlfd field set to the descriptor obtained from the open call. | 315 | ioctlfd field set to the descriptor obtained from the open call. |
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt index 1902c57b72ef..a167ab876c35 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.txt | |||
@@ -95,7 +95,7 @@ restraints as possible on how an index is structured and where it is placed in | |||
95 | the tree. The netfs can even mix indices and data files at the same level, but | 95 | the tree. The netfs can even mix indices and data files at the same level, but |
96 | it's not recommended. | 96 | it's not recommended. |
97 | 97 | ||
98 | Each index entry consists of a key of indeterminate length plus some auxilliary | 98 | Each index entry consists of a key of indeterminate length plus some auxiliary |
99 | data, also of indeterminate length. | 99 | data, also of indeterminate length. |
100 | 100 | ||
101 | There are some limits on indices: | 101 | There are some limits on indices: |
@@ -203,23 +203,23 @@ This has the following fields: | |||
203 | 203 | ||
204 | If the function is absent, a file size of 0 is assumed. | 204 | If the function is absent, a file size of 0 is assumed. |
205 | 205 | ||
206 | (6) A function to retrieve auxilliary data from the netfs [optional]. | 206 | (6) A function to retrieve auxiliary data from the netfs [optional]. |
207 | 207 | ||
208 | This function will be called with the netfs data that was passed to the | 208 | This function will be called with the netfs data that was passed to the |
209 | cookie acquisition function and the maximum length of auxilliary data that | 209 | cookie acquisition function and the maximum length of auxiliary data that |
210 | it may provide. It should write the auxilliary data into the given buffer | 210 | it may provide. It should write the auxiliary data into the given buffer |
211 | and return the quantity it wrote. | 211 | and return the quantity it wrote. |
212 | 212 | ||
213 | If this function is absent, the auxilliary data length will be set to 0. | 213 | If this function is absent, the auxiliary data length will be set to 0. |
214 | 214 | ||
215 | The length of the auxilliary data buffer may be dependent on the key | 215 | The length of the auxiliary data buffer may be dependent on the key |
216 | length. A netfs mustn't rely on being able to provide more than 400 bytes | 216 | length. A netfs mustn't rely on being able to provide more than 400 bytes |
217 | for both. | 217 | for both. |
218 | 218 | ||
219 | (7) A function to check the auxilliary data [optional]. | 219 | (7) A function to check the auxiliary data [optional]. |
220 | 220 | ||
221 | This function will be called to check that a match found in the cache for | 221 | This function will be called to check that a match found in the cache for |
222 | this object is valid. For instance with AFS it could check the auxilliary | 222 | this object is valid. For instance with AFS it could check the auxiliary |
223 | data against the data version number returned by the server to determine | 223 | data against the data version number returned by the server to determine |
224 | whether the index entry in a cache is still valid. | 224 | whether the index entry in a cache is still valid. |
225 | 225 | ||
@@ -232,7 +232,7 @@ This has the following fields: | |||
232 | (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update | 232 | (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update |
233 | (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted | 233 | (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted |
234 | 234 | ||
235 | This function can also be used to extract data from the auxilliary data in | 235 | This function can also be used to extract data from the auxiliary data in |
236 | the cache and copy it into the netfs's structures. | 236 | the cache and copy it into the netfs's structures. |
237 | 237 | ||
238 | (8) A pair of functions to manage contexts for the completion callback | 238 | (8) A pair of functions to manage contexts for the completion callback |
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index fabcb0e00f25..dd57bb6bb390 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt | |||
@@ -409,7 +409,7 @@ As a consequence of this, default_groups cannot be removed directly via | |||
409 | rmdir(2). They also are not considered when rmdir(2) on the parent | 409 | rmdir(2). They also are not considered when rmdir(2) on the parent |
410 | group is checking for children. | 410 | group is checking for children. |
411 | 411 | ||
412 | [Dependant Subsystems] | 412 | [Dependent Subsystems] |
413 | 413 | ||
414 | Sometimes other drivers depend on particular configfs items. For | 414 | Sometimes other drivers depend on particular configfs items. For |
415 | example, ocfs2 mounts depend on a heartbeat region item. If that | 415 | example, ocfs2 mounts depend on a heartbeat region item. If that |
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt index abd2a9b5b787..23583a136975 100644 --- a/Documentation/filesystems/exofs.txt +++ b/Documentation/filesystems/exofs.txt | |||
@@ -104,7 +104,15 @@ Where: | |||
104 | exofs specific options: Options are separated by commas (,) | 104 | exofs specific options: Options are separated by commas (,) |
105 | pid=<integer> - The partition number to mount/create as | 105 | pid=<integer> - The partition number to mount/create as |
106 | container of the filesystem. | 106 | container of the filesystem. |
107 | This option is mandatory. | 107 | This option is mandatory. The integer can be |
108 | given in hex by prefixing the number with 0x. | ||
109 | osdname=<id> - Mount by a device's osdname. | ||
110 | osdname is usually a 36 character uuid of the | ||
111 | form "d2683732-c906-4ee1-9dbd-c10c27bb40df". | ||
112 | It is one of the device's uuid specified in the | ||
113 | mkfs.exofs format command. | ||
114 | If this option is specified then the /dev/osdX | ||
115 | above can be empty and is ignored. | ||
108 | to=<integer> - Timeout in ticks for a single command. | 116 | to=<integer> - Timeout in ticks for a single command. |
109 | default is (60 * HZ) [for debugging only] | 117 | default is (60 * HZ) [for debugging only] |
110 | 118 | ||
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6ab9442d7eeb..c79ec58fd7f6 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -97,7 +97,7 @@ Note: More extensive information for getting started with ext4 can be | |||
97 | * Inode allocation using large virtual block groups via flex_bg | 97 | * Inode allocation using large virtual block groups via flex_bg |
98 | * delayed allocation | 98 | * delayed allocation |
99 | * large block (up to pagesize) support | 99 | * large block (up to pagesize) support |
100 | * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force | 100 | * efficient new ordered mode in JBD2 and ext4(avoid using buffer head to force |
101 | the ordering) | 101 | the ordering) |
102 | 102 | ||
103 | [1] Filesystems with a block size of 1k may see a limit imposed by the | 103 | [1] Filesystems with a block size of 1k may see a limit imposed by the |
@@ -106,7 +106,7 @@ directory hash tree having a maximum depth of two. | |||
106 | 2.2 Candidate features for future inclusion | 106 | 2.2 Candidate features for future inclusion |
107 | 107 | ||
108 | * Online defrag (patches available but not well tested) | 108 | * Online defrag (patches available but not well tested) |
109 | * reduced mke2fs time via lazy itable initialization in conjuction with | 109 | * reduced mke2fs time via lazy itable initialization in conjunction with |
110 | the uninit_bg feature (capability to do this is available in e2fsprogs | 110 | the uninit_bg feature (capability to do this is available in e2fsprogs |
111 | but a kernel thread to do lazy zeroing of unused inode table blocks | 111 | but a kernel thread to do lazy zeroing of unused inode table blocks |
112 | after filesystem is first mounted is required for safety) | 112 | after filesystem is first mounted is required for safety) |
@@ -367,12 +367,47 @@ init_itable=n The lazy itable init code will wait n times the | |||
367 | minimizes the impact on the system performance | 367 | minimizes the impact on the system performance |
368 | while file system's inode table is being initialized. | 368 | while file system's inode table is being initialized. |
369 | 369 | ||
370 | discard Controls whether ext4 should issue discard/TRIM | 370 | discard Controls whether ext4 should issue discard/TRIM |
371 | nodiscard(*) commands to the underlying block device when | 371 | nodiscard(*) commands to the underlying block device when |
372 | blocks are freed. This is useful for SSD devices | 372 | blocks are freed. This is useful for SSD devices |
373 | and sparse/thinly-provisioned LUNs, but it is off | 373 | and sparse/thinly-provisioned LUNs, but it is off |
374 | by default until sufficient testing has been done. | 374 | by default until sufficient testing has been done. |
375 | 375 | ||
376 | nouid32 Disables 32-bit UIDs and GIDs. This is for | ||
377 | interoperability with older kernels which only | ||
378 | store and expect 16-bit values. | ||
379 | |||
380 | resize Allows to resize filesystem to the end of the last | ||
381 | existing block group, further resize has to be done | ||
382 | with resize2fs either online, or offline. It can be | ||
383 | used only in conjunction with remount. | ||
384 | |||
385 | block_validity This option enables or disables the in-kernel | ||
386 | noblock_validity facility for tracking filesystem metadata blocks | ||
387 | within internal data structures. This allows multi- | ||
388 | block allocator and other routines to quickly locate | ||
389 | extents which might overlap with filesystem metadata | ||
390 | blocks. This option is intended for debugging | ||
391 | purposes and since it negatively affects the | ||
392 | performance, it is off by default. | ||
393 | |||
394 | dioread_lock Controls whether or not ext4 should use the DIO read | ||
395 | dioread_nolock locking. If the dioread_nolock option is specified | ||
396 | ext4 will allocate uninitialized extent before buffer | ||
397 | write and convert the extent to initialized after IO | ||
398 | completes. This approach allows ext4 code to avoid | ||
399 | using inode mutex, which improves scalability on high | ||
400 | speed storages. However this does not work with nobh | ||
401 | option and the mount will fail. Nor does it work with | ||
402 | data journaling and dioread_nolock option will be | ||
403 | ignored with kernel warning. Note that dioread_nolock | ||
404 | code path is only used for extent-based files. | ||
405 | Because of the restrictions this option comprises | ||
406 | it is off by default (i.e. dioread_lock). | ||
407 | |||
408 | i_version Enable 64-bit inode version support. This option is | ||
409 | off by default. | ||
410 | |||
376 | Data Mode | 411 | Data Mode |
377 | ========= | 412 | ========= |
378 | There are 3 different data modes: | 413 | There are 3 different data modes: |
@@ -400,6 +435,176 @@ needs to be read from and written to disk at the same time where it | |||
400 | outperforms all others modes. Currently ext4 does not have delayed | 435 | outperforms all others modes. Currently ext4 does not have delayed |
401 | allocation support if this data journalling mode is selected. | 436 | allocation support if this data journalling mode is selected. |
402 | 437 | ||
438 | /proc entries | ||
439 | ============= | ||
440 | |||
441 | Information about mounted ext4 file systems can be found in | ||
442 | /proc/fs/ext4. Each mounted filesystem will have a directory in | ||
443 | /proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or | ||
444 | /proc/fs/ext4/dm-0). The files in each per-device directory are shown | ||
445 | in the table below. | ||
446 | |||
447 | Files in /proc/fs/ext4/<devname> | ||
448 | .............................................................................. | ||
449 | File Content | ||
450 | mb_groups details of multiblock allocator buddy cache of free blocks | ||
451 | .............................................................................. | ||
452 | |||
453 | /sys entries | ||
454 | ============ | ||
455 | |||
456 | Information about mounted ext4 file systems can be found in | ||
457 | /sys/fs/ext4. Each mounted filesystem will have a directory in | ||
458 | /sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or | ||
459 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown | ||
460 | in the table below. | ||
461 | |||
462 | Files in /sys/fs/ext4/<devname> | ||
463 | (see also Documentation/ABI/testing/sysfs-fs-ext4) | ||
464 | .............................................................................. | ||
465 | File Content | ||
466 | |||
467 | delayed_allocation_blocks This file is read-only and shows the number of | ||
468 | blocks that are dirty in the page cache, but | ||
469 | which do not have their location in the | ||
470 | filesystem allocated yet. | ||
471 | |||
472 | inode_goal Tuning parameter which (if non-zero) controls | ||
473 | the goal inode used by the inode allocator in | ||
474 | preference to all other allocation heuristics. | ||
475 | This is intended for debugging use only, and | ||
476 | should be 0 on production systems. | ||
477 | |||
478 | inode_readahead_blks Tuning parameter which controls the maximum | ||
479 | number of inode table blocks that ext4's inode | ||
480 | table readahead algorithm will pre-read into | ||
481 | the buffer cache | ||
482 | |||
483 | lifetime_write_kbytes This file is read-only and shows the number of | ||
484 | kilobytes of data that have been written to this | ||
485 | filesystem since it was created. | ||
486 | |||
487 | max_writeback_mb_bump The maximum number of megabytes the writeback | ||
488 | code will try to write out before moving on to | ||
489 | another inode. | ||
490 | |||
491 | mb_group_prealloc The multiblock allocator will round up allocation | ||
492 | requests to a multiple of this tuning parameter if | ||
493 | the stripe size is not set in the ext4 superblock | ||
494 | |||
495 | mb_max_to_scan The maximum number of extents the multiblock | ||
496 | allocator will search to find the best extent | ||
497 | |||
498 | mb_min_to_scan The minimum number of extents the multiblock | ||
499 | allocator will search to find the best extent | ||
500 | |||
501 | mb_order2_req Tuning parameter which controls the minimum size | ||
502 | for requests (as a power of 2) where the buddy | ||
503 | cache is used | ||
504 | |||
505 | mb_stats Controls whether the multiblock allocator should | ||
506 | collect statistics, which are shown during the | ||
507 | unmount. 1 means to collect statistics, 0 means | ||
508 | not to collect statistics | ||
509 | |||
510 | mb_stream_req Files which have fewer blocks than this tunable | ||
511 | parameter will have their blocks allocated out | ||
512 | of a block group specific preallocation pool, so | ||
513 | that small files are packed closely together. | ||
514 | Each large file will have its blocks allocated | ||
515 | out of its own unique preallocation pool. | ||
516 | |||
517 | session_write_kbytes This file is read-only and shows the number of | ||
518 | kilobytes of data that have been written to this | ||
519 | filesystem since it was mounted. | ||
520 | .............................................................................. | ||
521 | |||
522 | Ioctls | ||
523 | ====== | ||
524 | |||
525 | There is some Ext4 specific functionality which can be accessed by applications | ||
526 | through the system call interfaces. The list of all Ext4 specific ioctls is | ||
527 | shown in the table below. | ||
528 | |||
529 | Table of Ext4 specific ioctls | ||
530 | .............................................................................. | ||
531 | Ioctl Description | ||
532 | EXT4_IOC_GETFLAGS Get additional attributes associated with inode. | ||
533 | The ioctl argument is an integer bitfield, with | ||
534 | bit values described in ext4.h. This ioctl is an | ||
535 | alias for FS_IOC_GETFLAGS. | ||
536 | |||
537 | EXT4_IOC_SETFLAGS Set additional attributes associated with inode. | ||
538 | The ioctl argument is an integer bitfield, with | ||
539 | bit values described in ext4.h. This ioctl is an | ||
540 | alias for FS_IOC_SETFLAGS. | ||
541 | |||
542 | EXT4_IOC_GETVERSION | ||
543 | EXT4_IOC_GETVERSION_OLD | ||
544 | Get the inode i_generation number stored for | ||
545 | each inode. The i_generation number is normally | ||
546 | changed only when new inode is created and it is | ||
547 | particularly useful for network filesystems. The | ||
548 | '_OLD' version of this ioctl is an alias for | ||
549 | FS_IOC_GETVERSION. | ||
550 | |||
551 | EXT4_IOC_SETVERSION | ||
552 | EXT4_IOC_SETVERSION_OLD | ||
553 | Set the inode i_generation number stored for | ||
554 | each inode. The '_OLD' version of this ioctl | ||
555 | is an alias for FS_IOC_SETVERSION. | ||
556 | |||
557 | EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize | ||
558 | mount option. It allows to resize filesystem | ||
559 | to the end of the last existing block group, | ||
560 | further resize has to be done with resize2fs, | ||
561 | either online, or offline. The argument points | ||
562 | to the unsigned long number representing the | ||
563 | filesystem new block count. | ||
564 | |||
565 | EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one | ||
566 | this ioctl is pointing to) to the donor_fd (the | ||
567 | one specified in move_extent structure passed | ||
568 | as an argument to this ioctl). Then, exchange | ||
569 | inode metadata between orig_fd and donor_fd. | ||
570 | This is especially useful for online | ||
571 | defragmentation, because the allocator has the | ||
572 | opportunity to allocate moved blocks better, | ||
573 | ideally into one contiguous extent. | ||
574 | |||
575 | EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or | ||
576 | new group descriptor block. The new group | ||
577 | descriptor is described by ext4_new_group_input | ||
578 | structure, which is passed as an argument to | ||
579 | this ioctl. This is especially useful in | ||
580 | conjunction with EXT4_IOC_GROUP_EXTEND, | ||
581 | which allows online resize of the filesystem | ||
582 | to the end of the last existing block group. | ||
583 | Those two ioctls combined are used in userspace | ||
584 | online resize tool (e.g. resize2fs). | ||
585 | |||
586 | EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself. | ||
587 | It converts (migrates) ext3 indirect block mapped | ||
588 | inode to ext4 extent mapped inode by walking | ||
589 | through indirect block mapping of the original | ||
590 | inode and converting contiguous block ranges | ||
591 | into ext4 extents of the temporary inode. Then, | ||
592 | inodes are swapped. This ioctl might help, when | ||
593 | migrating from ext3 to ext4 filesystem, however | ||
594 | suggestion is to create fresh ext4 filesystem | ||
595 | and copy data from the backup. Note, that | ||
596 | filesystem has to support extents for this ioctl | ||
597 | to work. | ||
598 | |||
599 | EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be | ||
600 | allocated to preserve application-expected ext3 | ||
601 | behaviour. Note that this will also start | ||
602 | triggering a write of the data blocks, but this | ||
603 | behaviour may change in the future as it is | ||
604 | not necessary and has been done this way only | ||
605 | for sake of simplicity. | ||
606 | .............................................................................. | ||
607 | |||
403 | References | 608 | References |
404 | ========== | 609 | ========== |
405 | 610 | ||
diff --git a/Documentation/filesystems/gfs2-uevents.txt b/Documentation/filesystems/gfs2-uevents.txt index fd966dc9979a..d81889669293 100644 --- a/Documentation/filesystems/gfs2-uevents.txt +++ b/Documentation/filesystems/gfs2-uevents.txt | |||
@@ -62,7 +62,7 @@ be fixed. | |||
62 | 62 | ||
63 | The REMOVE uevent is generated at the end of an unsuccessful mount | 63 | The REMOVE uevent is generated at the end of an unsuccessful mount |
64 | or at the end of a umount of the filesystem. All REMOVE uevents will | 64 | or at the end of a umount of the filesystem. All REMOVE uevents will |
65 | have been preceeded by at least an ADD uevent for the same fileystem, | 65 | have been preceded by at least an ADD uevent for the same fileystem, |
66 | and unlike the other uevents is generated automatically by the kernel's | 66 | and unlike the other uevents is generated automatically by the kernel's |
67 | kobject subsystem. | 67 | kobject subsystem. |
68 | 68 | ||
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 0b59c0200912..4cda926628aa 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt | |||
@@ -11,7 +11,7 @@ their I/O so file system consistency is maintained. One of the nifty | |||
11 | features of GFS is perfect consistency -- changes made to the file system | 11 | features of GFS is perfect consistency -- changes made to the file system |
12 | on one machine show up immediately on all other machines in the cluster. | 12 | on one machine show up immediately on all other machines in the cluster. |
13 | 13 | ||
14 | GFS uses interchangable inter-node locking mechanisms, the currently | 14 | GFS uses interchangeable inter-node locking mechanisms, the currently |
15 | supported mechanisms are: | 15 | supported mechanisms are: |
16 | 16 | ||
17 | lock_nolock -- allows gfs to be used as a local file system | 17 | lock_nolock -- allows gfs to be used as a local file system |
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index 933bc66ccff1..791af8dac065 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt | |||
@@ -350,7 +350,7 @@ Note the "Should sync?" parameter "nosync" means that the two mirrors are | |||
350 | already in sync which will be the case on a clean shutdown of Windows. If the | 350 | already in sync which will be the case on a clean shutdown of Windows. If the |
351 | mirrors are not clean, you can specify the "sync" option instead of "nosync" | 351 | mirrors are not clean, you can specify the "sync" option instead of "nosync" |
352 | and the Device-Mapper driver will then copy the entirety of the "Source Device" | 352 | and the Device-Mapper driver will then copy the entirety of the "Source Device" |
353 | to the "Target Device" or if you specified multipled target devices to all of | 353 | to the "Target Device" or if you specified multiple target devices to all of |
354 | them. | 354 | them. |
355 | 355 | ||
356 | Once you have your table, save it in a file somewhere (e.g. /etc/ntfsvolume1), | 356 | Once you have your table, save it in a file somewhere (e.g. /etc/ntfsvolume1), |
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index 5393e6611691..9ed920a8cd79 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt | |||
@@ -80,7 +80,7 @@ user_xattr (*) Enables Extended User Attributes. | |||
80 | nouser_xattr Disables Extended User Attributes. | 80 | nouser_xattr Disables Extended User Attributes. |
81 | acl Enables POSIX Access Control Lists support. | 81 | acl Enables POSIX Access Control Lists support. |
82 | noacl (*) Disables POSIX Access Control Lists support. | 82 | noacl (*) Disables POSIX Access Control Lists support. |
83 | resv_level=2 (*) Set how agressive allocation reservations will be. | 83 | resv_level=2 (*) Set how aggressive allocation reservations will be. |
84 | Valid values are between 0 (reservations off) to 8 | 84 | Valid values are between 0 (reservations off) to 8 |
85 | (maximum space for reservations). | 85 | (maximum space for reservations). |
86 | dir_resv_level= (*) By default, directory reservations will scale with file | 86 | dir_resv_level= (*) By default, directory reservations will scale with file |
diff --git a/Documentation/filesystems/path-lookup.txt b/Documentation/filesystems/path-lookup.txt index eb59c8b44be9..3571667c7105 100644 --- a/Documentation/filesystems/path-lookup.txt +++ b/Documentation/filesystems/path-lookup.txt | |||
@@ -42,7 +42,7 @@ Path walking overview | |||
42 | A name string specifies a start (root directory, cwd, fd-relative) and a | 42 | A name string specifies a start (root directory, cwd, fd-relative) and a |
43 | sequence of elements (directory entry names), which together refer to a path in | 43 | sequence of elements (directory entry names), which together refer to a path in |
44 | the namespace. A path is represented as a (dentry, vfsmount) tuple. The name | 44 | the namespace. A path is represented as a (dentry, vfsmount) tuple. The name |
45 | elements are sub-strings, seperated by '/'. | 45 | elements are sub-strings, separated by '/'. |
46 | 46 | ||
47 | Name lookups will want to find a particular path that a name string refers to | 47 | Name lookups will want to find a particular path that a name string refers to |
48 | (usually the final element, or parent of final element). This is done by taking | 48 | (usually the final element, or parent of final element). This is done by taking |
@@ -354,7 +354,7 @@ vfstest 24185492 4945 708725(2.9%) 1076136(4.4%) 0 2651 | |||
354 | 354 | ||
355 | What this shows is that failed rcu-walk lookups, ie. ones that are restarted | 355 | What this shows is that failed rcu-walk lookups, ie. ones that are restarted |
356 | entirely with ref-walk, are quite rare. Even the "vfstest" case which | 356 | entirely with ref-walk, are quite rare. Even the "vfstest" case which |
357 | specifically has concurrent renames/mkdir/rmdir/ creat/unlink/etc to excercise | 357 | specifically has concurrent renames/mkdir/rmdir/ creat/unlink/etc to exercise |
358 | such races is not showing a huge amount of restarts. | 358 | such races is not showing a huge amount of restarts. |
359 | 359 | ||
360 | Dropping from rcu-walk to ref-walk mean that we have encountered a dentry where | 360 | Dropping from rcu-walk to ref-walk mean that we have encountered a dentry where |
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt index 40ea6c295afb..65e03dd44823 100644 --- a/Documentation/filesystems/pohmelfs/network_protocol.txt +++ b/Documentation/filesystems/pohmelfs/network_protocol.txt | |||
@@ -20,7 +20,7 @@ Commands can be embedded into transaction command (which in turn has own command | |||
20 | so one can extend protocol as needed without breaking backward compatibility as long | 20 | so one can extend protocol as needed without breaking backward compatibility as long |
21 | as old commands are supported. All string lengths include tail 0 byte. | 21 | as old commands are supported. All string lengths include tail 0 byte. |
22 | 22 | ||
23 | All commans are transfered over the network in big-endian. CPU endianness is used at the end peers. | 23 | All commands are transferred over the network in big-endian. CPU endianness is used at the end peers. |
24 | 24 | ||
25 | @cmd - command number, which specifies command to be processed. Following | 25 | @cmd - command number, which specifies command to be processed. Following |
26 | commands are used currently: | 26 | commands are used currently: |
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 0c986c9e8519..6e29954851a2 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting | |||
@@ -298,11 +298,14 @@ be used instead. It gets called whenever the inode is evicted, whether it has | |||
298 | remaining links or not. Caller does *not* evict the pagecache or inode-associated | 298 | remaining links or not. Caller does *not* evict the pagecache or inode-associated |
299 | metadata buffers; getting rid of those is the responsibility of the method, as it had | 299 | metadata buffers; getting rid of those is the responsibility of the method, as it had |
300 | been for ->delete_inode(). | 300 | been for ->delete_inode(). |
301 | ->drop_inode() returns int now; it's called on final iput() with inode_lock | 301 | |
302 | held and it returns true if filesystems wants the inode to be dropped. As before, | 302 | ->drop_inode() returns int now; it's called on final iput() with |
303 | generic_drop_inode() is still the default and it's been updated appropriately. | 303 | inode->i_lock held and it returns true if filesystems wants the inode to be |
304 | generic_delete_inode() is also alive and it consists simply of return 1. Note that | 304 | dropped. As before, generic_drop_inode() is still the default and it's been |
305 | all actual eviction work is done by caller after ->drop_inode() returns. | 305 | updated appropriately. generic_delete_inode() is also alive and it consists |
306 | simply of return 1. Note that all actual eviction work is done by caller after | ||
307 | ->drop_inode() returns. | ||
308 | |||
306 | clear_inode() is gone; use end_writeback() instead. As before, it must | 309 | clear_inode() is gone; use end_writeback() instead. As before, it must |
307 | be called exactly once on each call of ->evict_inode() (as it used to be for | 310 | be called exactly once on each call of ->evict_inode() (as it used to be for |
308 | each call of ->delete_inode()). Unlike before, if you are using inode-associated | 311 | each call of ->delete_inode()). Unlike before, if you are using inode-associated |
@@ -397,6 +400,9 @@ a file off. | |||
397 | 400 | ||
398 | -- | 401 | -- |
399 | [mandatory] | 402 | [mandatory] |
403 | |||
404 | -- | ||
405 | [mandatory] | ||
400 | ->get_sb() is gone. Switch to use of ->mount(). Typically it's just | 406 | ->get_sb() is gone. Switch to use of ->mount(). Typically it's just |
401 | a matter of switching from calling get_sb_... to mount_... and changing the | 407 | a matter of switching from calling get_sb_... to mount_... and changing the |
402 | function type. If you were doing it manually, just switch from setting ->mnt_root | 408 | function type. If you were doing it manually, just switch from setting ->mnt_root |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 23cae6548d3a..b0b814d75ca1 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -543,7 +543,7 @@ just those considered 'most important'. The new vectors are: | |||
543 | their statistics are used by kernel developers and interested users to | 543 | their statistics are used by kernel developers and interested users to |
544 | determine the occurrence of interrupts of the given type. | 544 | determine the occurrence of interrupts of the given type. |
545 | 545 | ||
546 | The above IRQ vectors are displayed only when relevent. For example, | 546 | The above IRQ vectors are displayed only when relevant. For example, |
547 | the threshold vector does not exist on x86_64 platforms. Others are | 547 | the threshold vector does not exist on x86_64 platforms. Others are |
548 | suppressed when the system is a uniprocessor. As of this writing, only | 548 | suppressed when the system is a uniprocessor. As of this writing, only |
549 | i386 and x86_64 platforms support the new IRQ vector displays. | 549 | i386 and x86_64 platforms support the new IRQ vector displays. |
@@ -1202,7 +1202,7 @@ The columns are: | |||
1202 | W = can do write operations | 1202 | W = can do write operations |
1203 | U = can do unblank | 1203 | U = can do unblank |
1204 | flags E = it is enabled | 1204 | flags E = it is enabled |
1205 | C = it is prefered console | 1205 | C = it is preferred console |
1206 | B = it is primary boot console | 1206 | B = it is primary boot console |
1207 | p = it is used for printk buffer | 1207 | p = it is used for printk buffer |
1208 | b = it is not a TTY but a Braille device | 1208 | b = it is not a TTY but a Braille device |
@@ -1331,7 +1331,7 @@ NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see | |||
1331 | Documentation/feature-removal-schedule.txt. | 1331 | Documentation/feature-removal-schedule.txt. |
1332 | 1332 | ||
1333 | Caveat: when a parent task is selected, the oom killer will sacrifice any first | 1333 | Caveat: when a parent task is selected, the oom killer will sacrifice any first |
1334 | generation children with seperate address spaces instead, if possible. This | 1334 | generation children with separate address spaces instead, if possible. This |
1335 | prevents servers and important system daemons from being killed and loses the | 1335 | prevents servers and important system daemons from being killed and loses the |
1336 | minimal amount of work. | 1336 | minimal amount of work. |
1337 | 1337 | ||
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index 66699afd66ca..d4d41465a0b1 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt | |||
@@ -59,12 +59,15 @@ obtained from this site also. | |||
59 | 3. SQUASHFS FILESYSTEM DESIGN | 59 | 3. SQUASHFS FILESYSTEM DESIGN |
60 | ----------------------------- | 60 | ----------------------------- |
61 | 61 | ||
62 | A squashfs filesystem consists of a maximum of eight parts, packed together on a byte | 62 | A squashfs filesystem consists of a maximum of nine parts, packed together on a |
63 | alignment: | 63 | byte alignment: |
64 | 64 | ||
65 | --------------- | 65 | --------------- |
66 | | superblock | | 66 | | superblock | |
67 | |---------------| | 67 | |---------------| |
68 | | compression | | ||
69 | | options | | ||
70 | |---------------| | ||
68 | | datablocks | | 71 | | datablocks | |
69 | | & fragments | | 72 | | & fragments | |
70 | |---------------| | 73 | |---------------| |
@@ -91,7 +94,14 @@ the source directory, and checked for duplicates. Once all file data has been | |||
91 | written the completed inode, directory, fragment, export and uid/gid lookup | 94 | written the completed inode, directory, fragment, export and uid/gid lookup |
92 | tables are written. | 95 | tables are written. |
93 | 96 | ||
94 | 3.1 Inodes | 97 | 3.1 Compression options |
98 | ----------------------- | ||
99 | |||
100 | Compressors can optionally support compression specific options (e.g. | ||
101 | dictionary size). If non-default compression options have been used, then | ||
102 | these are stored here. | ||
103 | |||
104 | 3.2 Inodes | ||
95 | ---------- | 105 | ---------- |
96 | 106 | ||
97 | Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each | 107 | Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each |
@@ -114,7 +124,7 @@ directory inode are defined: inodes optimised for frequently occurring | |||
114 | regular files and directories, and extended types where extra | 124 | regular files and directories, and extended types where extra |
115 | information has to be stored. | 125 | information has to be stored. |
116 | 126 | ||
117 | 3.2 Directories | 127 | 3.3 Directories |
118 | --------------- | 128 | --------------- |
119 | 129 | ||
120 | Like inodes, directories are packed into compressed metadata blocks, stored | 130 | Like inodes, directories are packed into compressed metadata blocks, stored |
@@ -144,7 +154,7 @@ decompressed to do a lookup irrespective of the length of the directory. | |||
144 | This scheme has the advantage that it doesn't require extra memory overhead | 154 | This scheme has the advantage that it doesn't require extra memory overhead |
145 | and doesn't require much extra storage on disk. | 155 | and doesn't require much extra storage on disk. |
146 | 156 | ||
147 | 3.3 File data | 157 | 3.4 File data |
148 | ------------- | 158 | ------------- |
149 | 159 | ||
150 | Regular files consist of a sequence of contiguous compressed blocks, and/or a | 160 | Regular files consist of a sequence of contiguous compressed blocks, and/or a |
@@ -163,7 +173,7 @@ Larger files use multiple slots, with 1.75 TiB files using all 8 slots. | |||
163 | The index cache is designed to be memory efficient, and by default uses | 173 | The index cache is designed to be memory efficient, and by default uses |
164 | 16 KiB. | 174 | 16 KiB. |
165 | 175 | ||
166 | 3.4 Fragment lookup table | 176 | 3.5 Fragment lookup table |
167 | ------------------------- | 177 | ------------------------- |
168 | 178 | ||
169 | Regular files can contain a fragment index which is mapped to a fragment | 179 | Regular files can contain a fragment index which is mapped to a fragment |
@@ -173,7 +183,7 @@ A second index table is used to locate these. This second index table for | |||
173 | speed of access (and because it is small) is read at mount time and cached | 183 | speed of access (and because it is small) is read at mount time and cached |
174 | in memory. | 184 | in memory. |
175 | 185 | ||
176 | 3.5 Uid/gid lookup table | 186 | 3.6 Uid/gid lookup table |
177 | ------------------------ | 187 | ------------------------ |
178 | 188 | ||
179 | For space efficiency regular files store uid and gid indexes, which are | 189 | For space efficiency regular files store uid and gid indexes, which are |
@@ -182,7 +192,7 @@ stored compressed into metadata blocks. A second index table is used to | |||
182 | locate these. This second index table for speed of access (and because it | 192 | locate these. This second index table for speed of access (and because it |
183 | is small) is read at mount time and cached in memory. | 193 | is small) is read at mount time and cached in memory. |
184 | 194 | ||
185 | 3.6 Export table | 195 | 3.7 Export table |
186 | ---------------- | 196 | ---------------- |
187 | 197 | ||
188 | To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems | 198 | To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems |
@@ -196,7 +206,7 @@ This table is stored compressed into metadata blocks. A second index table is | |||
196 | used to locate these. This second index table for speed of access (and because | 206 | used to locate these. This second index table for speed of access (and because |
197 | it is small) is read at mount time and cached in memory. | 207 | it is small) is read at mount time and cached in memory. |
198 | 208 | ||
199 | 3.7 Xattr table | 209 | 3.8 Xattr table |
200 | --------------- | 210 | --------------- |
201 | 211 | ||
202 | The xattr table contains extended attributes for each inode. The xattrs | 212 | The xattr table contains extended attributes for each inode. The xattrs |
@@ -209,7 +219,7 @@ or if it is stored out of line (in which case the value field stores a | |||
209 | reference to where the actual value is stored). This allows large values | 219 | reference to where the actual value is stored). This allows large values |
210 | to be stored out of line improving scanning and lookup performance and it | 220 | to be stored out of line improving scanning and lookup performance and it |
211 | also allows values to be de-duplicated, the value being stored once, and | 221 | also allows values to be de-duplicated, the value being stored once, and |
212 | all other occurences holding an out of line reference to that value. | 222 | all other occurrences holding an out of line reference to that value. |
213 | 223 | ||
214 | The xattr lists are packed into compressed 8K metadata blocks. | 224 | The xattr lists are packed into compressed 8K metadata blocks. |
215 | To reduce overhead in inodes, rather than storing the on-disk | 225 | To reduce overhead in inodes, rather than storing the on-disk |
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index f806e50aaa63..597f728e7b4e 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt | |||
@@ -62,7 +62,7 @@ values of the same type. | |||
62 | 62 | ||
63 | Mixing types, expressing multiple lines of data, and doing fancy | 63 | Mixing types, expressing multiple lines of data, and doing fancy |
64 | formatting of data is heavily frowned upon. Doing these things may get | 64 | formatting of data is heavily frowned upon. Doing these things may get |
65 | you publically humiliated and your code rewritten without notice. | 65 | you publicly humiliated and your code rewritten without notice. |
66 | 66 | ||
67 | 67 | ||
68 | An attribute definition is simply: | 68 | An attribute definition is simply: |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 306f0ae8df09..21a7dc467bba 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -97,7 +97,7 @@ functions: | |||
97 | The passed struct file_system_type describes your filesystem. When a | 97 | The passed struct file_system_type describes your filesystem. When a |
98 | request is made to mount a filesystem onto a directory in your namespace, | 98 | request is made to mount a filesystem onto a directory in your namespace, |
99 | the VFS will call the appropriate mount() method for the specific | 99 | the VFS will call the appropriate mount() method for the specific |
100 | filesystem. New vfsmount refering to the tree returned by ->mount() | 100 | filesystem. New vfsmount referring to the tree returned by ->mount() |
101 | will be attached to the mountpoint, so that when pathname resolution | 101 | will be attached to the mountpoint, so that when pathname resolution |
102 | reaches the mountpoint it will jump into the root of that vfsmount. | 102 | reaches the mountpoint it will jump into the root of that vfsmount. |
103 | 103 | ||
@@ -254,7 +254,7 @@ or bottom half). | |||
254 | should be synchronous or not, not all filesystems check this flag. | 254 | should be synchronous or not, not all filesystems check this flag. |
255 | 255 | ||
256 | drop_inode: called when the last access to the inode is dropped, | 256 | drop_inode: called when the last access to the inode is dropped, |
257 | with the inode_lock spinlock held. | 257 | with the inode->i_lock spinlock held. |
258 | 258 | ||
259 | This method should be either NULL (normal UNIX filesystem | 259 | This method should be either NULL (normal UNIX filesystem |
260 | semantics) or "generic_delete_inode" (for filesystems that do not | 260 | semantics) or "generic_delete_inode" (for filesystems that do not |
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt index 7445bf335dae..2ce36439c09f 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.txt +++ b/Documentation/filesystems/xfs-delayed-logging-design.txt | |||
@@ -42,7 +42,7 @@ the aggregation of all the previous changes currently held only in the log. | |||
42 | This relogging technique also allows objects to be moved forward in the log so | 42 | This relogging technique also allows objects to be moved forward in the log so |
43 | that an object being relogged does not prevent the tail of the log from ever | 43 | that an object being relogged does not prevent the tail of the log from ever |
44 | moving forward. This can be seen in the table above by the changing | 44 | moving forward. This can be seen in the table above by the changing |
45 | (increasing) LSN of each subsquent transaction - the LSN is effectively a | 45 | (increasing) LSN of each subsequent transaction - the LSN is effectively a |
46 | direct encoding of the location in the log of the transaction. | 46 | direct encoding of the location in the log of the transaction. |
47 | 47 | ||
48 | This relogging is also used to implement long-running, multiple-commit | 48 | This relogging is also used to implement long-running, multiple-commit |
@@ -338,7 +338,7 @@ the same time another transaction modifies the item and inserts the log item | |||
338 | into the new CIL, then checkpoint transaction commit code cannot use log items | 338 | into the new CIL, then checkpoint transaction commit code cannot use log items |
339 | to store the list of log vectors that need to be written into the transaction. | 339 | to store the list of log vectors that need to be written into the transaction. |
340 | Hence log vectors need to be able to be chained together to allow them to be | 340 | Hence log vectors need to be able to be chained together to allow them to be |
341 | detatched from the log items. That is, when the CIL is flushed the memory | 341 | detached from the log items. That is, when the CIL is flushed the memory |
342 | buffer and log vector attached to each log item needs to be attached to the | 342 | buffer and log vector attached to each log item needs to be attached to the |
343 | checkpoint context so that the log item can be released. In diagrammatic form, | 343 | checkpoint context so that the log item can be released. In diagrammatic form, |
344 | the CIL would look like this before the flush: | 344 | the CIL would look like this before the flush: |
@@ -577,7 +577,7 @@ only becomes unpinned when all the transactions complete and there are no | |||
577 | pending transactions. Thus the pinning and unpinning of a log item is symmetric | 577 | pending transactions. Thus the pinning and unpinning of a log item is symmetric |
578 | as there is a 1:1 relationship with transaction commit and log item completion. | 578 | as there is a 1:1 relationship with transaction commit and log item completion. |
579 | 579 | ||
580 | For delayed logging, however, we have an assymetric transaction commit to | 580 | For delayed logging, however, we have an asymmetric transaction commit to |
581 | completion relationship. Every time an object is relogged in the CIL it goes | 581 | completion relationship. Every time an object is relogged in the CIL it goes |
582 | through the commit process without a corresponding completion being registered. | 582 | through the commit process without a corresponding completion being registered. |
583 | That is, we now have a many-to-one relationship between transaction commit and | 583 | That is, we now have a many-to-one relationship between transaction commit and |
@@ -780,7 +780,7 @@ With delayed logging, there are new steps inserted into the life cycle: | |||
780 | From this, it can be seen that the only life cycle differences between the two | 780 | From this, it can be seen that the only life cycle differences between the two |
781 | logging methods are in the middle of the life cycle - they still have the same | 781 | logging methods are in the middle of the life cycle - they still have the same |
782 | beginning and end and execution constraints. The only differences are in the | 782 | beginning and end and execution constraints. The only differences are in the |
783 | commiting of the log items to the log itself and the completion processing. | 783 | committing of the log items to the log itself and the completion processing. |
784 | Hence delayed logging should not introduce any constraints on log item | 784 | Hence delayed logging should not introduce any constraints on log item |
785 | behaviour, allocation or freeing that don't already exist. | 785 | behaviour, allocation or freeing that don't already exist. |
786 | 786 | ||
@@ -791,10 +791,3 @@ mount option. Fundamentally, there is no reason why the log manager would not | |||
791 | be able to swap methods automatically and transparently depending on load | 791 | be able to swap methods automatically and transparently depending on load |
792 | characteristics, but this should not be necessary if delayed logging works as | 792 | characteristics, but this should not be necessary if delayed logging works as |
793 | designed. | 793 | designed. |
794 | |||
795 | Roadmap: | ||
796 | |||
797 | 2.6.39 Switch default mount option to use delayed logging | ||
798 | => should be roughly 12 months after initial merge | ||
799 | => enough time to shake out remaining problems before next round of | ||
800 | enterprise distro kernel rebases | ||