diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 01:34:47 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 01:34:47 -0400 |
commit | 10f3e23f07cb0c20f9bcb77a5b5a7eb2a1b2a2fe (patch) | |
tree | 1fcb34309b3542512c6f3345f092f7adb8c3312c | |
parent | 3bb37da509e576c80180fa0e4d1cfcaddf0cb82e (diff) | |
parent | 863c37fcb14f8b66ea831b45fb35a53ac4a8d69e (diff) |
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
- Convert content from the ext4 wiki to Documentation rst files so it
is more likely to be updated as we add new features to ext4.
- Add 64-bit timestamp support to ext4's superblock fields.
- ... and the usual bug fixes and cleanups, including a Spectre gadget
fixup and some hardening against maliciously corrupted file systems.
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (34 commits)
ext4: remove unneeded variable "err" in ext4_mb_release_inode_pa()
ext4: improve code readability in ext4_iget()
ext4: fix spectre gadget in ext4_mb_regular_allocator()
ext4: check for NUL characters in extended attribute's name
ext4: use ext4_warning() for sb_getblk failure
ext4: fix race when setting the bitmap corrupted flag
ext4: reset error code in ext4_find_entry in fallback
ext4: handle layout changes to pinned DAX mappings
dax: dax_layout_busy_page() warn on !exceptional
docs: fix up the obviously obsolete bits in the new ext4 documentation
docs: add new ext4 superblock time extension fields
docs: create filesystem internal section
ext4: use swap macro in mext_page_double_lock
ext4: check allocation failure when duplicating "data" in ext4_remount()
ext4: fix warning message in ext4_enable_quotas()
ext4: super: extend timestamps to 40 bits
jbd2: replace current_kernel_time64 with ktime equivalent
ext4: use timespec64 for all inode times
ext4: use ktime_get_real_seconds for i_dtime
ext4: use 64-bit timestamps for mmp_time
...
42 files changed, 4036 insertions, 150 deletions
diff --git a/Documentation/conf.py b/Documentation/conf.py index 62ac5a9f3a9f..b691af4831fa 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py | |||
@@ -34,7 +34,7 @@ needs_sphinx = '1.3' | |||
34 | # Add any Sphinx extension module names here, as strings. They can be | 34 | # Add any Sphinx extension module names here, as strings. They can be |
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom | 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom |
36 | # ones. | 36 | # ones. |
37 | extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure'] | 37 | extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig'] |
38 | 38 | ||
39 | # The name of the math extension changed on Sphinx 1.4 | 39 | # The name of the math extension changed on Sphinx 1.4 |
40 | if major == 1 and minor > 3: | 40 | if major == 1 and minor > 3: |
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4/ext4.rst index 7f628b9f7c4b..9d4368d591fa 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4/ext4.rst | |||
@@ -1,6 +1,8 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
1 | 2 | ||
2 | Ext4 Filesystem | 3 | ======================== |
3 | =============== | 4 | General Information |
5 | ======================== | ||
4 | 6 | ||
5 | Ext4 is an advanced level of the ext3 filesystem which incorporates | 7 | Ext4 is an advanced level of the ext3 filesystem which incorporates |
6 | scalability and reliability enhancements for supporting large filesystems | 8 | scalability and reliability enhancements for supporting large filesystems |
@@ -11,37 +13,30 @@ Mailing list: linux-ext4@vger.kernel.org | |||
11 | Web site: http://ext4.wiki.kernel.org | 13 | Web site: http://ext4.wiki.kernel.org |
12 | 14 | ||
13 | 15 | ||
14 | 1. Quick usage instructions: | 16 | Quick usage instructions |
15 | =========================== | 17 | ======================== |
16 | 18 | ||
17 | Note: More extensive information for getting started with ext4 can be | 19 | Note: More extensive information for getting started with ext4 can be |
18 | found at the ext4 wiki site at the URL: | 20 | found at the ext4 wiki site at the URL: |
19 | http://ext4.wiki.kernel.org/index.php/Ext4_Howto | 21 | http://ext4.wiki.kernel.org/index.php/Ext4_Howto |
20 | 22 | ||
21 | - Compile and install the latest version of e2fsprogs (as of this | 23 | - The latest version of e2fsprogs can be found at: |
22 | writing version 1.41.3) from: | 24 | |
25 | https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | ||
23 | 26 | ||
24 | http://sourceforge.net/project/showfiles.php?group_id=2406 | ||
25 | |||
26 | or | 27 | or |
27 | 28 | ||
28 | https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | 29 | http://sourceforge.net/project/showfiles.php?group_id=2406 |
29 | 30 | ||
30 | or grab the latest git repository from: | 31 | or grab the latest git repository from: |
31 | 32 | ||
32 | git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git | 33 | https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git |
33 | |||
34 | - Note that it is highly important to install the mke2fs.conf file | ||
35 | that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If | ||
36 | you have edited the /etc/mke2fs.conf file installed on your system, | ||
37 | you will need to merge your changes with the version from e2fsprogs | ||
38 | 1.41.x. | ||
39 | 34 | ||
40 | - Create a new filesystem using the ext4 filesystem type: | 35 | - Create a new filesystem using the ext4 filesystem type: |
41 | 36 | ||
42 | # mke2fs -t ext4 /dev/hda1 | 37 | # mke2fs -t ext4 /dev/hda1 |
43 | 38 | ||
44 | Or to configure an existing ext3 filesystem to support extents: | 39 | Or to configure an existing ext3 filesystem to support extents: |
45 | 40 | ||
46 | # tune2fs -O extents /dev/hda1 | 41 | # tune2fs -O extents /dev/hda1 |
47 | 42 | ||
@@ -50,10 +45,6 @@ Note: More extensive information for getting started with ext4 can be | |||
50 | 45 | ||
51 | # tune2fs -I 256 /dev/hda1 | 46 | # tune2fs -I 256 /dev/hda1 |
52 | 47 | ||
53 | (Note: we currently do not have tools to convert an ext4 | ||
54 | filesystem back to ext3; so please do not do try this on production | ||
55 | filesystems.) | ||
56 | |||
57 | - Mounting: | 48 | - Mounting: |
58 | 49 | ||
59 | # mount -t ext4 /dev/hda1 /wherever | 50 | # mount -t ext4 /dev/hda1 /wherever |
@@ -75,10 +66,11 @@ Note: More extensive information for getting started with ext4 can be | |||
75 | the filesystem with a large journal can also be helpful for | 66 | the filesystem with a large journal can also be helpful for |
76 | metadata-intensive workloads. | 67 | metadata-intensive workloads. |
77 | 68 | ||
78 | 2. Features | 69 | Features |
79 | =========== | 70 | ======== |
80 | 71 | ||
81 | 2.1 Currently available | 72 | Currently Available |
73 | ------------------- | ||
82 | 74 | ||
83 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) | 75 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) |
84 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | 76 | * extent format reduces metadata overhead (RAM, IO for access, transactions) |
@@ -103,31 +95,15 @@ Note: More extensive information for getting started with ext4 can be | |||
103 | [1] Filesystems with a block size of 1k may see a limit imposed by the | 95 | [1] Filesystems with a block size of 1k may see a limit imposed by the |
104 | directory hash tree having a maximum depth of two. | 96 | directory hash tree having a maximum depth of two. |
105 | 97 | ||
106 | 2.2 Candidate features for future inclusion | 98 | Options |
107 | 99 | ======= | |
108 | * online defrag (patches available but not well tested) | ||
109 | * reduced mke2fs time via lazy itable initialization in conjunction with | ||
110 | the uninit_bg feature (capability to do this is available in e2fsprogs | ||
111 | but a kernel thread to do lazy zeroing of unused inode table blocks | ||
112 | after filesystem is first mounted is required for safety) | ||
113 | |||
114 | There are several others under discussion, whether they all make it in is | ||
115 | partly a function of how much time everyone has to work on them. Features like | ||
116 | metadata checksumming have been discussed and planned for a bit but no patches | ||
117 | exist yet so I'm not sure they're in the near-term roadmap. | ||
118 | |||
119 | The big performance win will come with mballoc, delalloc and flex_bg | ||
120 | grouping of bitmaps and inode tables. Some test results available here: | ||
121 | |||
122 | - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html | ||
123 | - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html | ||
124 | |||
125 | 3. Options | ||
126 | ========== | ||
127 | 100 | ||
128 | When mounting an ext4 filesystem, the following options are accepted: | 101 | When mounting an ext4 filesystem, the following options are accepted: |
129 | (*) == default | 102 | (*) == default |
130 | 103 | ||
104 | ======================= ======================================================= | ||
105 | Mount Option Description | ||
106 | ======================= ======================================================= | ||
131 | ro Mount filesystem read only. Note that ext4 will | 107 | ro Mount filesystem read only. Note that ext4 will |
132 | replay the journal (and thus write to the | 108 | replay the journal (and thus write to the |
133 | partition) even when mounted "read only". The | 109 | partition) even when mounted "read only". The |
@@ -387,33 +363,38 @@ i_version Enable 64-bit inode version support. This option is | |||
387 | dax Use direct access (no page cache). See | 363 | dax Use direct access (no page cache). See |
388 | Documentation/filesystems/dax.txt. Note that | 364 | Documentation/filesystems/dax.txt. Note that |
389 | this option is incompatible with data=journal. | 365 | this option is incompatible with data=journal. |
366 | ======================= ======================================================= | ||
390 | 367 | ||
391 | Data Mode | 368 | Data Mode |
392 | ========= | 369 | ========= |
393 | There are 3 different data modes: | 370 | There are 3 different data modes: |
394 | 371 | ||
395 | * writeback mode | 372 | * writeback mode |
396 | In data=writeback mode, ext4 does not journal data at all. This mode provides | 373 | |
397 | a similar level of journaling as that of XFS, JFS, and ReiserFS in its default | 374 | In data=writeback mode, ext4 does not journal data at all. This mode provides |
398 | mode - metadata journaling. A crash+recovery can cause incorrect data to | 375 | a similar level of journaling as that of XFS, JFS, and ReiserFS in its default |
399 | appear in files which were written shortly before the crash. This mode will | 376 | mode - metadata journaling. A crash+recovery can cause incorrect data to |
400 | typically provide the best ext4 performance. | 377 | appear in files which were written shortly before the crash. This mode will |
378 | typically provide the best ext4 performance. | ||
401 | 379 | ||
402 | * ordered mode | 380 | * ordered mode |
403 | In data=ordered mode, ext4 only officially journals metadata, but it logically | 381 | |
404 | groups metadata information related to data changes with the data blocks into a | 382 | In data=ordered mode, ext4 only officially journals metadata, but it logically |
405 | single unit called a transaction. When it's time to write the new metadata | 383 | groups metadata information related to data changes with the data blocks into |
406 | out to disk, the associated data blocks are written first. In general, | 384 | a single unit called a transaction. When it's time to write the new metadata |
407 | this mode performs slightly slower than writeback but significantly faster than journal mode. | 385 | out to disk, the associated data blocks are written first. In general, this |
386 | mode performs slightly slower than writeback but significantly faster than | ||
387 | journal mode. | ||
408 | 388 | ||
409 | * journal mode | 389 | * journal mode |
410 | data=journal mode provides full data and metadata journaling. All new data is | 390 | |
411 | written to the journal first, and then to its final location. | 391 | data=journal mode provides full data and metadata journaling. All new data is |
412 | In the event of a crash, the journal can be replayed, bringing both data and | 392 | written to the journal first, and then to its final location. In the event of |
413 | metadata into a consistent state. This mode is the slowest except when data | 393 | a crash, the journal can be replayed, bringing both data and metadata into a |
414 | needs to be read from and written to disk at the same time where it | 394 | consistent state. This mode is the slowest except when data needs to be read |
415 | outperforms all other modes. Enabling this mode will disable delayed | 395 | from and written to disk at the same time where it outperforms all other |
416 | allocation and O_DIRECT support. | 396 | modes. Enabling this mode will disable delayed allocation and O_DIRECT |
397 | support. | ||
417 | 398 | ||
418 | /proc entries | 399 | /proc entries |
419 | ============= | 400 | ============= |
@@ -425,10 +406,12 @@ Information about mounted ext4 file systems can be found in | |||
425 | in the table below. | 406 | in the table below. |
426 | 407 | ||
427 | Files in /proc/fs/ext4/<devname> | 408 | Files in /proc/fs/ext4/<devname> |
428 | .............................................................................. | 409 | |
410 | ================ ======= | ||
429 | File Content | 411 | File Content |
412 | ================ ======= | ||
430 | mb_groups details of multiblock allocator buddy cache of free blocks | 413 | mb_groups details of multiblock allocator buddy cache of free blocks |
431 | .............................................................................. | 414 | ================ ======= |
432 | 415 | ||
433 | /sys entries | 416 | /sys entries |
434 | ============ | 417 | ============ |
@@ -439,28 +422,30 @@ Information about mounted ext4 file systems can be found in | |||
439 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown | 422 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown |
440 | in the table below. | 423 | in the table below. |
441 | 424 | ||
442 | Files in /sys/fs/ext4/<devname> | 425 | Files in /sys/fs/ext4/<devname>: |
426 | |||
443 | (see also Documentation/ABI/testing/sysfs-fs-ext4) | 427 | (see also Documentation/ABI/testing/sysfs-fs-ext4) |
444 | .............................................................................. | ||
445 | File Content | ||
446 | 428 | ||
429 | ============================= ================================================= | ||
430 | File Content | ||
431 | ============================= ================================================= | ||
447 | delayed_allocation_blocks This file is read-only and shows the number of | 432 | delayed_allocation_blocks This file is read-only and shows the number of |
448 | blocks that are dirty in the page cache, but | 433 | blocks that are dirty in the page cache, but |
449 | which do not have their location in the | 434 | which do not have their location in the |
450 | filesystem allocated yet. | 435 | filesystem allocated yet. |
451 | 436 | ||
452 | inode_goal Tuning parameter which (if non-zero) controls | 437 | inode_goal Tuning parameter which (if non-zero) controls |
453 | the goal inode used by the inode allocator in | 438 | the goal inode used by the inode allocator in |
454 | preference to all other allocation heuristics. | 439 | preference to all other allocation heuristics. |
455 | This is intended for debugging use only, and | 440 | This is intended for debugging use only, and |
456 | should be 0 on production systems. | 441 | should be 0 on production systems. |
457 | 442 | ||
458 | inode_readahead_blks Tuning parameter which controls the maximum | 443 | inode_readahead_blks Tuning parameter which controls the maximum |
459 | number of inode table blocks that ext4's inode | 444 | number of inode table blocks that ext4's inode |
460 | table readahead algorithm will pre-read into | 445 | table readahead algorithm will pre-read into |
461 | the buffer cache | 446 | the buffer cache |
462 | 447 | ||
463 | lifetime_write_kbytes This file is read-only and shows the number of | 448 | lifetime_write_kbytes This file is read-only and shows the number of |
464 | kilobytes of data that have been written to this | 449 | kilobytes of data that have been written to this |
465 | filesystem since it was created. | 450 | filesystem since it was created. |
466 | 451 | ||
@@ -508,7 +493,7 @@ Files in /sys/fs/ext4/<devname> | |||
508 | in the file system. If there is not enough space | 493 | in the file system. If there is not enough space |
509 | for the reserved space when mounting the file | 494 | for the reserved space when mounting the file |
510 | mount will _not_ fail. | 495 | mount will _not_ fail. |
511 | .............................................................................. | 496 | ============================= ================================================= |
512 | 497 | ||
513 | Ioctls | 498 | Ioctls |
514 | ====== | 499 | ====== |
@@ -518,8 +503,10 @@ through the system call interfaces. The list of all Ext4 specific ioctls are | |||
518 | shown in the table below. | 503 | shown in the table below. |
519 | 504 | ||
520 | Table of Ext4 specific ioctls | 505 | Table of Ext4 specific ioctls |
521 | .............................................................................. | 506 | |
522 | Ioctl Description | 507 | ============================= ================================================= |
508 | Ioctl Description | ||
509 | ============================= ================================================= | ||
523 | EXT4_IOC_GETFLAGS Get additional attributes associated with inode. | 510 | EXT4_IOC_GETFLAGS Get additional attributes associated with inode. |
524 | The ioctl argument is an integer bitfield, with | 511 | The ioctl argument is an integer bitfield, with |
525 | bit values described in ext4.h. This ioctl is an | 512 | bit values described in ext4.h. This ioctl is an |
@@ -610,8 +597,7 @@ Table of Ext4 specific ioctls | |||
610 | normal user by accident. | 597 | normal user by accident. |
611 | The data blocks of the previous boot loader | 598 | The data blocks of the previous boot loader |
612 | will be associated with the given inode. | 599 | will be associated with the given inode. |
613 | 600 | ============================= ================================================= | |
614 | .............................................................................. | ||
615 | 601 | ||
616 | References | 602 | References |
617 | ========== | 603 | ========== |
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst new file mode 100644 index 000000000000..71121605558c --- /dev/null +++ b/Documentation/filesystems/ext4/index.rst | |||
@@ -0,0 +1,17 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | =============== | ||
4 | ext4 Filesystem | ||
5 | =============== | ||
6 | |||
7 | General usage and on-disk artifacts written by ext4. More documentation may | |
8 | be ported from the wiki as time permits. This should be considered the | ||
9 | canonical source of information as the details here have been reviewed by | ||
10 | the ext4 community. | ||
11 | |||
12 | .. toctree:: | ||
13 | :maxdepth: 5 | ||
14 | :numbered: | ||
15 | |||
16 | ext4 | ||
17 | ondisk/index | ||
diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/ondisk/about.rst new file mode 100644 index 000000000000..0aadba052264 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/about.rst | |||
@@ -0,0 +1,44 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | About this Book | ||
4 | =============== | ||
5 | |||
6 | This document attempts to describe the on-disk format for ext4 | ||
7 | filesystems. The same general ideas should apply to ext2/3 filesystems | ||
8 | as well, though they do not support all the features that ext4 supports, | ||
9 | and the fields will be shorter. | ||
10 | |||
11 | **NOTE**: This is a work in progress, based on notes that the author | ||
12 | (djwong) made while picking apart a filesystem by hand. The data | ||
13 | structure definitions should be current as of Linux 4.18 and | ||
14 | e2fsprogs-1.44. All comments and corrections are welcome, since there is | ||
15 | undoubtedly plenty of lore that might not be reflected in freshly | ||
16 | created demonstration filesystems. | ||
17 | |||
18 | License | ||
19 | ------- | ||
20 | This book is licensed under the terms of the GNU General Public License, v2. | |
21 | |||
22 | Terminology | ||
23 | ----------- | ||
24 | |||
25 | ext4 divides a storage device into an array of logical blocks both to | ||
26 | reduce bookkeeping overhead and to increase throughput by forcing larger | ||
27 | transfer sizes. Generally, the block size will be 4KiB (the same size as | ||
28 | pages on x86 and the block layer's default block size), though the | ||
29 | actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes. | ||
30 | Throughout this document, disk locations are given in terms of these | ||
31 | logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of | ||
32 | convenience, the logical block size will be referred to as | ||
33 | ``$block_size`` throughout the rest of the document. | ||
34 | |||
35 | When referenced in ``preformatted text`` blocks, ``sb`` refers to fields | ||
36 | in the super block, and ``inode`` refers to fields in an inode table | ||
37 | entry. | ||
38 | |||
39 | Other References | ||
40 | ---------------- | ||
41 | |||
42 | Also see http://www.nongnu.org/ext2-doc/ for quite a collection of | ||
43 | information about ext2/3. Here's another old reference: | ||
44 | http://wiki.osdev.org/Ext2 | ||
diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/ondisk/allocators.rst new file mode 100644 index 000000000000..7aa85152ace3 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/allocators.rst | |||
@@ -0,0 +1,56 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Block and Inode Allocation Policy | ||
4 | --------------------------------- | ||
5 | |||
6 | ext4 recognizes (better than ext3, anyway) that data locality is | ||
7 | generally a desirable quality of a filesystem. On a spinning disk, | |
8 | keeping related blocks near each other reduces the amount of movement | ||
9 | that the head actuator and disk must perform to access a data block, | ||
10 | thus speeding up disk IO. On an SSD there of course are no moving parts, | ||
11 | but locality can increase the size of each transfer request while | ||
12 | reducing the total number of requests. This locality may also have the | ||
13 | effect of concentrating writes on a single erase block, which can speed | ||
14 | up file rewrites significantly. Therefore, it is useful to reduce | ||
15 | fragmentation whenever possible. | ||
16 | |||
17 | The first tool that ext4 uses to combat fragmentation is the multi-block | ||
18 | allocator. When a file is first created, the block allocator | ||
19 | speculatively allocates 8KiB of disk space to the file on the assumption | ||
20 | that the space will get written soon. When the file is closed, the | ||
21 | unused speculative allocations are of course freed, but if the | ||
22 | speculation is correct (typically the case for full writes of small | ||
23 | files) then the file data gets written out in a single multi-block | ||
24 | extent. A second related trick that ext4 uses is delayed allocation. | ||
25 | Under this scheme, when a file needs more blocks to absorb file writes, | ||
26 | the filesystem defers deciding the exact placement on the disk until all | ||
27 | the dirty buffers are being written out to disk. By not committing to a | ||
28 | particular placement until it's absolutely necessary (the commit timeout | ||
29 | is hit, or sync() is called, or the kernel runs out of memory), the hope | ||
30 | is that the filesystem can make better location decisions. | ||
31 | |||
32 | The third trick that ext4 (and ext3) uses is that it tries to keep a | ||
33 | file's data blocks in the same block group as its inode. This cuts down | ||
34 | on the seek penalty when the filesystem first has to read a file's inode | ||
35 | to learn where the file's data blocks live and then seek over to the | ||
36 | file's data blocks to begin I/O operations. | ||
37 | |||
38 | The fourth trick is that all the inodes in a directory are placed in the | ||
39 | same block group as the directory, when feasible. The working assumption | ||
40 | here is that all the files in a directory might be related, therefore it | ||
41 | is useful to try to keep them all together. | ||
42 | |||
43 | The fifth trick is that the disk volume is cut up into 128MB block | ||
44 | groups; these mini-containers are used as outlined above to try to | ||
45 | maintain data locality. However, there is a deliberate quirk -- when a | ||
46 | directory is created in the root directory, the inode allocator scans | ||
47 | the block groups and puts that directory into the least heavily loaded | ||
48 | block group that it can find. This encourages directories to spread out | ||
49 | over a disk; as the top-level directory/file blobs fill up one block | ||
50 | group, the allocators simply move on to the next block group. Allegedly | ||
51 | this scheme evens out the loading on the block groups, though the author | ||
52 | suspects that the directories which are so unlucky as to land towards | ||
53 | the end of a spinning drive get a raw deal performance-wise. | ||
54 | |||
55 | Of course if all of these mechanisms fail, one can always use e4defrag | ||
56 | to defragment files. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst new file mode 100644 index 000000000000..0b01b67b81fe --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/attributes.rst | |||
@@ -0,0 +1,191 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Extended Attributes | ||
4 | ------------------- | ||
5 | |||
6 | Extended attributes (xattrs) are typically stored in a separate data | ||
7 | block on the disk and referenced from inodes via ``inode.i_file_acl*``. | ||
8 | The first use of extended attributes seems to have been for storing file | ||
9 | ACLs and other security data (selinux). With the ``user_xattr`` mount | ||
10 | option it is possible for users to store extended attributes so long as | ||
11 | all attribute names begin with “user”; this restriction seems to have | ||
12 | disappeared as of Linux 3.0. | ||
13 | |||
14 | There are two places where extended attributes can be found. The first | ||
15 | place is between the end of each inode entry and the beginning of the | ||
16 | next inode entry. For example, if inode.i\_extra\_isize = 28 and | ||
17 | sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes | ||
18 | available for in-inode extended attribute storage. The second place | ||
19 | where extended attributes can be found is in the block pointed to by | ||
20 | ``inode.i_file_acl``. As of Linux 3.11, it is not possible for this | ||
21 | block to contain a pointer to a second extended attribute block (or even | ||
22 | the remaining blocks of a cluster). In theory it is possible for each | ||
23 | attribute's value to be stored in a separate data block, though as of | ||
24 | Linux 3.11 the code does not permit this. | ||
25 | |||
26 | Keys are generally assumed to be ASCIIZ strings, whereas values can be | ||
27 | strings or binary data. | ||
28 | |||
29 | Extended attributes, when stored after the inode, have a header | ||
30 | ``ext4_xattr_ibody_header`` that is 4 bytes long: | ||
31 | |||
32 | .. list-table:: | ||
33 | :widths: 1 1 1 77 | ||
34 | :header-rows: 1 | ||
35 | |||
36 | * - Offset | ||
37 | - Type | ||
38 | - Name | ||
39 | - Description | ||
40 | * - 0x0 | ||
41 | - \_\_le32 | ||
42 | - h\_magic | ||
43 | - Magic number for identification, 0xEA020000. This value is set by the | ||
44 | Linux driver, though e2fsprogs doesn't seem to check it(?) | ||
45 | |||
46 | The beginning of an extended attribute block is in | ||
47 | ``struct ext4_xattr_header``, which is 32 bytes long: | ||
48 | |||
49 | .. list-table:: | ||
50 | :widths: 1 1 1 77 | ||
51 | :header-rows: 1 | ||
52 | |||
53 | * - Offset | ||
54 | - Type | ||
55 | - Name | ||
56 | - Description | ||
57 | * - 0x0 | ||
58 | - \_\_le32 | ||
59 | - h\_magic | ||
60 | - Magic number for identification, 0xEA020000. | ||
61 | * - 0x4 | ||
62 | - \_\_le32 | ||
63 | - h\_refcount | ||
64 | - Reference count. | ||
65 | * - 0x8 | ||
66 | - \_\_le32 | ||
67 | - h\_blocks | ||
68 | - Number of disk blocks used. | ||
69 | * - 0xC | ||
70 | - \_\_le32 | ||
71 | - h\_hash | ||
72 | - Hash value of all attributes. | ||
73 | * - 0x10 | ||
74 | - \_\_le32 | ||
75 | - h\_checksum | ||
76 | - Checksum of the extended attribute block. | ||
77 | * - 0x14 | ||
78 | - \_\_u32 | ||
79 | - h\_reserved[2] | ||
80 | - Zero. | ||
81 | |||
82 | The checksum is calculated against the FS UUID, the 64-bit block number | ||
83 | of the extended attribute block, and the entire block (header + | ||
84 | entries). | ||
85 | |||
86 | Following the ``struct ext4_xattr_header`` or | ||
87 | ``struct ext4_xattr_ibody_header`` is an array of | ||
88 | ``struct ext4_xattr_entry``; each of these entries is at least 16 bytes | ||
89 | long. When stored in an external block, the ``struct ext4_xattr_entry`` | ||
90 | entries must be stored in sorted order. The sort order is | ||
91 | ``e_name_index``, then ``e_name_len``, and finally ``e_name``. | ||
92 | Attributes stored inside an inode do not need to be stored in sorted order. | |
93 | |||
94 | .. list-table:: | ||
95 | :widths: 1 1 1 77 | ||
96 | :header-rows: 1 | ||
97 | |||
98 | * - Offset | ||
99 | - Type | ||
100 | - Name | ||
101 | - Description | ||
102 | * - 0x0 | ||
103 | - \_\_u8 | ||
104 | - e\_name\_len | ||
105 | - Length of name. | ||
106 | * - 0x1 | ||
107 | - \_\_u8 | ||
108 | - e\_name\_index | ||
109 | - Attribute name index. There is a discussion of this below. | ||
110 | * - 0x2 | ||
111 | - \_\_le16 | ||
112 | - e\_value\_offs | ||
113 | - Location of this attribute's value on the disk block where it is stored. | ||
114 | Multiple attributes can share the same value. For an inode attribute | ||
115 | this value is relative to the start of the first entry; for a block this | ||
116 | value is relative to the start of the block (i.e. the header). | ||
117 | * - 0x4 | ||
118 | - \_\_le32 | ||
119 | - e\_value\_inum | ||
120 | - The inode where the value is stored. Zero indicates the value is in the | ||
121 | same block as this entry. This field is only used if the | ||
122 | INCOMPAT\_EA\_INODE feature is enabled. | ||
123 | * - 0x8 | ||
124 | - \_\_le32 | ||
125 | - e\_value\_size | ||
126 | - Length of attribute value. | ||
127 | * - 0xC | ||
128 | - \_\_le32 | ||
129 | - e\_hash | ||
130 | - Hash value of attribute name and attribute value. The kernel doesn't | ||
131 | update the hash for in-inode attributes, so for that case this value | ||
132 | must be zero, because e2fsck validates any non-zero hash regardless of | ||
133 | where the xattr lives. | ||
134 | * - 0x10 | ||
135 | - char | ||
136 | - e\_name[e\_name\_len] | ||
137 | - Attribute name. Does not include trailing NUL. | |
138 | |||
139 | Attribute values can follow the end of the entry table. There appears to | ||
140 | be a requirement that they be aligned to 4-byte boundaries. The values | ||
141 | are stored starting at the end of the block and grow towards the | ||
142 | xattr\_header/xattr\_entry table. When the two collide, the overflow is | ||
143 | put into a separate disk block. If the disk block fills up, the | ||
144 | filesystem returns -ENOSPC. | ||
145 | |||
146 | The first four fields of the ``ext4_xattr_entry`` are set to zero to | ||
147 | mark the end of the key list. | ||
148 | |||
149 | Attribute Name Indices | ||
150 | ~~~~~~~~~~~~~~~~~~~~~~ | ||
151 | |||
152 | Logically speaking, extended attributes are a series of key=value pairs. | ||
153 | The keys are assumed to be NUL-terminated strings. To reduce the amount | |
154 | of on-disk space that the keys consume, the beginning of the key string | ||
155 | is matched against the attribute name index. If a match is found, the | ||
156 | attribute name index field is set, and the matching string is removed from | |
157 | the key name. Here is a map of name index values to key prefixes: | ||
158 | |||
159 | .. list-table:: | ||
160 | :widths: 1 79 | ||
161 | :header-rows: 1 | ||
162 | |||
163 | * - Name Index | ||
164 | - Key Prefix | ||
165 | * - 0 | ||
166 | - (no prefix) | ||
167 | * - 1 | ||
168 | - “user.” | ||
169 | * - 2 | ||
170 | - “system.posix\_acl\_access” | ||
171 | * - 3 | ||
172 | - “system.posix\_acl\_default” | ||
173 | * - 4 | ||
174 | - “trusted.” | ||
175 | * - 6 | ||
176 | - “security.” | ||
177 | * - 7 | ||
178 | - “system.” (inline\_data only?) | ||
179 | * - 8 | ||
180 | - “system.richacl” (SuSE kernels only?) | ||
181 | |||
182 | For example, if the attribute key is “user.fubar”, the attribute name | ||
183 | index is set to 1 and the “fubar” name is recorded on disk. | ||
184 | |||
185 | POSIX ACLs | ||
186 | ~~~~~~~~~~ | ||
187 | |||
188 | POSIX ACLs are stored in a reduced version of the Linux kernel (and | ||
189 | libacl's) internal ACL format. The key difference is that the version | ||
190 | number is different (1) and the ``e_id`` field is only stored for named | ||
191 | user and group ACLs. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/ondisk/bigalloc.rst new file mode 100644 index 000000000000..c6d88557553c --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/bigalloc.rst | |||
@@ -0,0 +1,22 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Bigalloc | ||
4 | -------- | ||
5 | |||
6 | At the moment, the default size of a block is 4KiB, which is a commonly | ||
7 | supported page size on most MMU-capable hardware. This is fortunate, as | ||
8 | ext4 code is not prepared to handle the case where the block size | ||
9 | exceeds the page size. However, for a filesystem of mostly huge files, | ||
10 | it is desirable to be able to allocate disk blocks in units of multiple | ||
11 | blocks to reduce both fragmentation and metadata overhead. The | ||
12 | bigalloc feature provides exactly this ability. The | ||
13 | administrator can set a block cluster size at mkfs time (which is stored | ||
14 | in the s\_log\_cluster\_size field in the superblock); from then on, the | ||
15 | block bitmaps track clusters, not individual blocks. This means that | ||
16 | block groups can be several gigabytes in size (instead of just 128MiB); | ||
17 | however, the minimum allocation unit becomes a cluster, not a block, | ||
18 | even for directories. TaoBao had a patchset to extend the “use units of | ||
19 | clusters instead of blocks” to the extent tree, though it is not clear | ||
20 | where those patches went-- they eventually morphed into “extent tree v2” | ||
21 | but that code has not landed as of May 2015. | ||
22 | |||
diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/ondisk/bitmaps.rst new file mode 100644 index 000000000000..c7546dbc197a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/bitmaps.rst | |||
@@ -0,0 +1,28 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Block and inode Bitmaps | ||
4 | ----------------------- | ||
5 | |||
6 | The data block bitmap tracks the usage of data blocks within the block | ||
7 | group. | ||
8 | |||
9 | The inode bitmap records which entries in the inode table are in use. | ||
10 | |||
11 | As with most bitmaps, one bit represents the usage status of one data | ||
12 | block or inode table entry. This implies a block group size of 8 \* | ||
13 | number\_of\_bytes\_in\_a\_logical\_block. | ||
14 | |||
15 | NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts | ||
16 | of the kernel and e2fsprogs code pretend that the block bitmap contains | ||
17 | zeros (i.e. all blocks in the group are free). However, it is not | ||
18 | necessarily the case that no blocks are in use -- if ``meta_bg`` is set, | ||
19 | the bitmaps and group descriptor live inside the group. Unfortunately, | ||
20 | ext2fs\_test\_block\_bitmap2() will return '0' for those locations, | ||
21 | which produces confusing debugfs output. | ||
22 | |||
23 | Inode Table | ||
24 | ----------- | ||
25 | Inode tables are statically allocated at mkfs time. Each block group | ||
26 | descriptor points to the start of the table, and the superblock records | ||
27 | the number of inodes per group. See the section on inodes for more | ||
28 | information. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/ondisk/blockgroup.rst new file mode 100644 index 000000000000..baf888e4c06a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blockgroup.rst | |||
@@ -0,0 +1,135 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Layout | ||
4 | ------ | ||
5 | |||
6 | The layout of a standard block group is approximately as follows (each | ||
7 | of these fields is discussed in a separate section below): | ||
8 | |||
9 | .. list-table:: | ||
10 | :widths: 1 1 1 1 1 1 1 1 | ||
11 | :header-rows: 1 | ||
12 | |||
13 | * - Group 0 Padding | ||
14 | - ext4 Super Block | ||
15 | - Group Descriptors | ||
16 | - Reserved GDT Blocks | ||
17 | - Data Block Bitmap | ||
18 | - inode Bitmap | ||
19 | - inode Table | ||
20 | - Data Blocks | ||
21 | * - 1024 bytes | ||
22 | - 1 block | ||
23 | - many blocks | ||
24 | - many blocks | ||
25 | - 1 block | ||
26 | - 1 block | ||
27 | - many blocks | ||
28 | - many more blocks | ||
29 | |||
30 | For the special case of block group 0, the first 1024 bytes are unused, | ||
31 | to allow for the installation of x86 boot sectors and other oddities. | ||
32 | The superblock will start at offset 1024 bytes, whichever block that | ||
33 | happens to be (usually 0). However, if for some reason the block size = | ||
34 | 1024, then block 0 is marked in use and the superblock goes in block 1. | ||
35 | For all other block groups, there is no padding. | ||
36 | |||
37 | The ext4 driver primarily works with the superblock and the group | ||
38 | descriptors that are found in block group 0. Redundant copies of the | ||
39 | superblock and group descriptors are written to some of the block groups | ||
40 | across the disk in case the beginning of the disk gets trashed, though | ||
41 | not all block groups necessarily host a redundant copy (see following | ||
42 | paragraph for more details). If the group does not have a redundant | ||
43 | copy, the block group begins with the data block bitmap. Note also that | ||
44 | when the filesystem is freshly formatted, mkfs will allocate “reserve | ||
45 | GDT block” space after the block group descriptors and before the start | ||
46 | of the block bitmaps to allow for future expansion of the filesystem. By | ||
47 | default, a filesystem is allowed to increase in size by a factor of | ||
48 | 1024x over the original filesystem size. | ||
49 | |||
50 | The location of the inode table is given by ``grp.bg_inode_table_*``. It | ||
51 | is a contiguous range of blocks large enough to contain | ||
52 | ``sb.s_inodes_per_group * sb.s_inode_size`` bytes. | ||
53 | |||
54 | As for the ordering of items in a block group, it is generally | ||
55 | established that the super block and the group descriptor table, if | ||
56 | present, will be at the beginning of the block group. The bitmaps and | ||
57 | the inode table can be anywhere, and it is quite possible for the | ||
58 | bitmaps to come after the inode table, or for both to be in different | ||
59 | groups (flex\_bg). Leftover space is used for file data blocks, indirect | ||
60 | block maps, extent tree blocks, and extended attributes. | ||
61 | |||
62 | Flexible Block Groups | ||
63 | --------------------- | ||
64 | |||
65 | Starting in ext4, there is a new feature called flexible block groups | ||
66 | (flex\_bg). In a flex\_bg, several block groups are tied together as one | ||
67 | logical block group; the bitmap spaces and the inode table space in the | ||
68 | first block group of the flex\_bg are expanded to include the bitmaps | ||
69 | and inode tables of all other block groups in the flex\_bg. For example, | ||
70 | if the flex\_bg size is 4, then group 0 will contain (in order) the | ||
71 | superblock, group descriptors, data block bitmaps for groups 0-3, inode | ||
72 | bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining | ||
73 | space in group 0 is for file data. The effect of this is to group the | ||
74 | block metadata close together for faster loading, and to enable large | ||
75 | files to be contiguous on disk. Backup copies of the superblock and | ||
76 | group descriptors are always at the beginning of block groups, even if | ||
77 | flex\_bg is enabled. The number of block groups that make up a flex\_bg | ||
78 | is given by 2 ^ ``sb.s_log_groups_per_flex``. | ||
79 | |||
80 | Meta Block Groups | ||
81 | ----------------- | ||
82 | |||
83 | Without the option META\_BG, for safety concerns, all block group | ||
84 | descriptor copies are kept in the first block group. Given the default | ||
85 | 128MiB (2^27 bytes) block group size and 64-byte group descriptors, ext4 | ||
86 | can have at most 2^27/64 = 2^21 block groups. This limits the entire | ||
87 | filesystem size to 2^21 ∗ 2^27 = 2^48 bytes or 256TiB. | ||
88 | |||
89 | The solution to this problem is to use the metablock group feature | ||
90 | (META\_BG), which is already in ext3 for all 2.6 releases. With the | ||
91 | META\_BG feature, ext4 filesystems are partitioned into many metablock | ||
92 | groups. Each metablock group is a cluster of block groups whose group | ||
93 | descriptor structures can be stored in a single disk block. For ext4 | ||
94 | filesystems with 4 KB block size, a single metablock group partition | ||
95 | includes 64 block groups, or 8 GiB of disk space. The metablock group | ||
96 | feature moves the location of the group descriptors from the congested | ||
97 | first block group of the whole filesystem into the first group of each | ||
98 | metablock group itself. The backups are in the second and last group of | ||
99 | each metablock group. This increases the 2^21 maximum block groups limit | ||
100 | to the hard limit 2^32, allowing support for a 512PiB filesystem. | ||
101 | |||
102 | The change in the filesystem format replaces the current scheme where | ||
103 | the superblock is followed by a variable-length set of block group | ||
104 | descriptors. Instead, the superblock and a single block group descriptor | ||
105 | block are placed at the beginning of the first, second, and last block | ||
106 | groups in a meta-block group. A meta-block group is a collection of | ||
107 | block groups which can be described by a single block group descriptor | ||
108 | block. Since the size of the block group descriptor structure is 32 | ||
109 | bytes, a meta-block group contains 32 block groups for filesystems with | ||
110 | a 1KB block size, and 128 block groups for filesystems with a 4KB | ||
111 | blocksize. Filesystems can either be created using this new block group | ||
112 | descriptor layout, or existing filesystems can be resized on-line, and | ||
113 | the field s\_first\_meta\_bg in the superblock will indicate the first | ||
114 | block group using this new layout. | ||
115 | |||
116 | Please see an important note about ``BLOCK_UNINIT`` in the section about | ||
117 | block and inode bitmaps. | ||
118 | |||
119 | Lazy Block Group Initialization | ||
120 | ------------------------------- | ||
121 | |||
122 | A new feature for ext4 is a set of three block group descriptor flags that | ||
123 | enable mkfs to skip initializing other parts of the block group | ||
124 | metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean | ||
125 | that the inode and block bitmaps for that group can be calculated and | ||
126 | therefore the on-disk bitmap blocks are not initialized. This is | ||
127 | generally the case for an empty block group or a block group containing | ||
128 | only fixed-location block group metadata. The INODE\_ZEROED flag means | ||
129 | that the inode table has been initialized; mkfs will unset this flag and | ||
130 | rely on the kernel to initialize the inode tables in the background. | ||
131 | |||
132 | By not writing zeroes to the bitmaps and inode table, mkfs time is | ||
133 | reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM, | ||
134 | but the dumpe2fs output prints this as “uninit\_bg”. They are the same | ||
135 | thing. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/ondisk/blockmap.rst new file mode 100644 index 000000000000..30e25750d88a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blockmap.rst | |||
@@ -0,0 +1,49 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
4 | | i.i\_block Offset | Where It Points | | ||
5 | +=====================+==============================================================================================================================================================================================================================+ | ||
6 | | 0 to 11 | Direct map to file blocks 0 to 11. | | ||
7 | +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
8 | | 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) | | ||
9 | | | | | ||
10 | | | +------------------------------+--------------------------------------------------------------------+ | | ||
11 | | | | Indirect Block Offset | Where It Points | | | ||
12 | | | +==============================+====================================================================+ | | ||
13 | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | ||
14 | | | +------------------------------+--------------------------------------------------------------------+ | | ||
15 | +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
16 | | 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) | | ||
17 | | | | | ||
18 | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | ||
19 | | | | Double Indirect Block Offset | Where It Points | | | ||
20 | | | +================================+=========================================================================================================+ | | ||
21 | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | | ||
22 | | | | | | | | ||
23 | | | | | +------------------------------+--------------------------------------------------------------------+ | | | ||
24 | | | | | | Indirect Block Offset | Where It Points | | | | ||
25 | | | | | +==============================+====================================================================+ | | | ||
26 | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | | ||
27 | | | | | +------------------------------+--------------------------------------------------------------------+ | | | ||
28 | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | ||
29 | +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
30 | | 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1049612 to 1074791435 if 4KiB blocks) | | ||
31 | | | | | ||
32 | | | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | | ||
33 | | | | Triple Indirect Block Offset | Where It Points | | | ||
34 | | | +================================+================================================================================================================================================+ | | ||
35 | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | | | ||
36 | | | | | | | | ||
37 | | | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | | ||
38 | | | | | | Double Indirect Block Offset | Where It Points | | | | ||
39 | | | | | +================================+=========================================================================================================+ | | | ||
40 | | | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | | | ||
41 | | | | | | | | | | | ||
42 | | | | | | | +------------------------------+--------------------------------------------------------------------+ | | | | ||
43 | | | | | | | | Indirect Block Offset | Where It Points | | | | | ||
44 | | | | | | | +==============================+====================================================================+ | | | | ||
45 | | | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | | | ||
46 | | | | | | | +------------------------------+--------------------------------------------------------------------+ | | | | ||
47 | | | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | | | ||
48 | | | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ | | ||
49 | +---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/ondisk/blocks.rst new file mode 100644 index 000000000000..73d4dc0f7bda --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/blocks.rst | |||
@@ -0,0 +1,142 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Blocks | ||
4 | ------ | ||
5 | |||
6 | ext4 allocates storage space in units of “blocks”. A block is a group of | ||
7 | sectors between 1KiB and 64KiB, and the number of sectors must be an | ||
8 | integral power of 2. Blocks are in turn grouped into larger units called | ||
9 | block groups. Block size is specified at mkfs time and typically is | ||
10 | 4KiB. You may experience mounting problems if block size is greater than | ||
11 | page size (i.e. 64KiB blocks on an i386 which only has 4KiB memory | ||
12 | pages). By default a filesystem can contain 2^32 blocks; if the '64bit' | ||
13 | feature is enabled, then a filesystem can have 2^64 blocks. | ||
14 | |||
15 | For 32-bit filesystems, limits are as follows: | ||
16 | |||
17 | .. list-table:: | ||
18 | :widths: 1 1 1 1 1 | ||
19 | :header-rows: 1 | ||
20 | |||
21 | * - Item | ||
22 | - 1KiB | ||
23 | - 2KiB | ||
24 | - 4KiB | ||
25 | - 64KiB | ||
26 | * - Blocks | ||
27 | - 2^32 | ||
28 | - 2^32 | ||
29 | - 2^32 | ||
30 | - 2^32 | ||
31 | * - Inodes | ||
32 | - 2^32 | ||
33 | - 2^32 | ||
34 | - 2^32 | ||
35 | - 2^32 | ||
36 | * - File System Size | ||
37 | - 4TiB | ||
38 | - 8TiB | ||
39 | - 16TiB | ||
40 | - 256TiB | ||
41 | * - Blocks Per Block Group | ||
42 | - 8,192 | ||
43 | - 16,384 | ||
44 | - 32,768 | ||
45 | - 524,288 | ||
46 | * - Inodes Per Block Group | ||
47 | - 8,192 | ||
48 | - 16,384 | ||
49 | - 32,768 | ||
50 | - 524,288 | ||
51 | * - Block Group Size | ||
52 | - 8MiB | ||
53 | - 32MiB | ||
54 | - 128MiB | ||
55 | - 32GiB | ||
56 | * - Blocks Per File, Extents | ||
57 | - 2^32 | ||
58 | - 2^32 | ||
59 | - 2^32 | ||
60 | - 2^32 | ||
61 | * - Blocks Per File, Block Maps | ||
62 | - 16,843,020 | ||
63 | - 134,480,396 | ||
64 | - 1,074,791,436 | ||
65 | - 4,398,314,962,956 (really 2^32 due to field size limitations) | ||
66 | * - File Size, Extents | ||
67 | - 4TiB | ||
68 | - 8TiB | ||
69 | - 16TiB | ||
70 | - 256TiB | ||
71 | * - File Size, Block Maps | ||
72 | - 16GiB | ||
73 | - 256GiB | ||
74 | - 4TiB | ||
75 | - 256TiB | ||
76 | |||
77 | For 64-bit filesystems, limits are as follows: | ||
78 | |||
79 | .. list-table:: | ||
80 | :widths: 1 1 1 1 1 | ||
81 | :header-rows: 1 | ||
82 | |||
83 | * - Item | ||
84 | - 1KiB | ||
85 | - 2KiB | ||
86 | - 4KiB | ||
87 | - 64KiB | ||
88 | * - Blocks | ||
89 | - 2^64 | ||
90 | - 2^64 | ||
91 | - 2^64 | ||
92 | - 2^64 | ||
93 | * - Inodes | ||
94 | - 2^32 | ||
95 | - 2^32 | ||
96 | - 2^32 | ||
97 | - 2^32 | ||
98 | * - File System Size | ||
99 | - 16ZiB | ||
100 | - 32ZiB | ||
101 | - 64ZiB | ||
102 | - 1YiB | ||
103 | * - Blocks Per Block Group | ||
104 | - 8,192 | ||
105 | - 16,384 | ||
106 | - 32,768 | ||
107 | - 524,288 | ||
108 | * - Inodes Per Block Group | ||
109 | - 8,192 | ||
110 | - 16,384 | ||
111 | - 32,768 | ||
112 | - 524,288 | ||
113 | * - Block Group Size | ||
114 | - 8MiB | ||
115 | - 32MiB | ||
116 | - 128MiB | ||
117 | - 32GiB | ||
118 | * - Blocks Per File, Extents | ||
119 | - 2^32 | ||
120 | - 2^32 | ||
121 | - 2^32 | ||
122 | - 2^32 | ||
123 | * - Blocks Per File, Block Maps | ||
124 | - 16,843,020 | ||
125 | - 134,480,396 | ||
126 | - 1,074,791,436 | ||
127 | - 4,398,314,962,956 (really 2^32 due to field size limitations) | ||
128 | * - File Size, Extents | ||
129 | - 4TiB | ||
130 | - 8TiB | ||
131 | - 16TiB | ||
132 | - 256TiB | ||
133 | * - File Size, Block Maps | ||
134 | - 16GiB | ||
135 | - 256GiB | ||
136 | - 4TiB | ||
137 | - 256TiB | ||
138 | |||
139 | Note: Files not using extents (i.e. files using block maps) must be | ||
140 | placed within the first 2^32 blocks of a filesystem. Files with extents | ||
141 | must be placed within the first 2^48 blocks of a filesystem. It's not | ||
142 | clear what happens with larger filesystems. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst new file mode 100644 index 000000000000..9d6a793b2e03 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/checksums.rst | |||
@@ -0,0 +1,73 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Checksums | ||
4 | --------- | ||
5 | |||
6 | Starting in early 2012, metadata checksums were added to all major ext4 | ||
7 | and jbd2 data structures. The associated feature flag is metadata\_csum. | ||
8 | The desired checksum algorithm is indicated in the superblock, though as | ||
9 | of October 2012 the only supported algorithm is crc32c. Some data | ||
10 | structures did not have space to fit a full 32-bit checksum, so only the | ||
11 | lower 16 bits are stored. Enabling the 64bit feature increases the data | ||
12 | structure size so that full 32-bit checksums can be stored for many data | ||
13 | structures. However, existing 32-bit filesystems cannot be extended to | ||
14 | enable 64bit mode, at least not without the experimental resize2fs | ||
15 | patches to do so. | ||
16 | |||
17 | Existing filesystems can have checksumming added by running | ||
18 | ``tune2fs -O metadata_csum`` against the underlying device. If tune2fs | ||
19 | encounters directory blocks that lack sufficient empty space to add a | ||
20 | checksum, it will request that you run ``e2fsck -D`` to have the | ||
21 | directories rebuilt with checksums. This has the added benefit of | ||
22 | removing slack space from the directory files and rebalancing the htree | ||
23 | indexes. If you *ignore* this step, your directories will not be | ||
24 | protected by a checksum! | ||
25 | |||
26 | The following table describes the data elements that go into each type | ||
27 | of checksum. The checksum function is whatever the superblock describes | ||
28 | (crc32c as of October 2013) unless noted otherwise. | ||
29 | |||
30 | .. list-table:: | ||
31 | :widths: 1 1 4 | ||
32 | :header-rows: 1 | ||
33 | |||
34 | * - Metadata | ||
35 | - Length | ||
36 | - Ingredients | ||
37 | * - Superblock | ||
38 | - \_\_le32 | ||
39 | - The entire superblock up to the checksum field. The UUID lives inside | ||
40 | the superblock. | ||
41 | * - MMP | ||
42 | - \_\_le32 | ||
43 | - UUID + the entire MMP block up to the checksum field. | ||
44 | * - Extended Attributes | ||
45 | - \_\_le32 | ||
46 | - UUID + the entire extended attribute block. The checksum field is set to | ||
47 | zero. | ||
48 | * - Directory Entries | ||
49 | - \_\_le32 | ||
50 | - UUID + inode number + inode generation + the directory block up to the | ||
51 | fake entry enclosing the checksum field. | ||
52 | * - HTREE Nodes | ||
53 | - \_\_le32 | ||
54 | - UUID + inode number + inode generation + all valid extents + HTREE tail. | ||
55 | The checksum field is set to zero. | ||
56 | * - Extents | ||
57 | - \_\_le32 | ||
58 | - UUID + inode number + inode generation + the entire extent block up to | ||
59 | the checksum field. | ||
60 | * - Bitmaps | ||
61 | - \_\_le32 or \_\_le16 | ||
62 | - UUID + the entire bitmap. Checksums are stored in the group descriptor, | ||
63 | and truncated if the group descriptor size is 32 bytes (i.e. ^64bit) | ||
64 | * - Inodes | ||
65 | - \_\_le32 | ||
66 | - UUID + inode number + inode generation + the entire inode. The checksum | ||
67 | field is set to zero. Each inode has its own checksum. | ||
68 | * - Group Descriptors | ||
69 | - \_\_le16 | ||
70 | - If metadata\_csum, then UUID + group number + the entire descriptor; | ||
71 | else if gdt\_csum, then crc16(UUID + group number + the entire | ||
72 | descriptor). In all cases, only the lower 16 bits are stored. | ||
73 | |||
diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst new file mode 100644 index 000000000000..8fcba68c2884 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/directory.rst | |||
@@ -0,0 +1,426 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Directory Entries | ||
4 | ----------------- | ||
5 | |||
6 | In an ext4 filesystem, a directory is more or less a flat file that maps | ||
7 | an arbitrary byte string (usually ASCII) to an inode number on the | ||
8 | filesystem. There can be many directory entries across the filesystem | ||
9 | that reference the same inode number--these are known as hard links, and | ||
10 | that is why hard links cannot reference files on other filesystems. As | ||
11 | such, directory entries are found by reading the data block(s) | ||
12 | associated with a directory file for the particular directory entry that | ||
13 | is desired. | ||
14 | |||
15 | Linear (Classic) Directories | ||
16 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
17 | |||
18 | By default, each directory lists its entries in an “almost-linear” | ||
19 | array. I write “almost” because it's not a linear array in the memory | ||
20 | sense because directory entries are not split across filesystem blocks. | ||
21 | Therefore, it is more accurate to say that a directory is a series of | ||
22 | data blocks and that each block contains a linear array of directory | ||
23 | entries. The end of each per-block array is signified by reaching the | ||
24 | end of the block; the last entry in the block has a record length that | ||
25 | takes it all the way to the end of the block. The end of the entire | ||
26 | directory is of course signified by reaching the end of the file. Unused | ||
27 | directory entries are signified by inode = 0. By default the filesystem | ||
28 | uses ``struct ext4_dir_entry_2`` for directory entries unless the | ||
29 | “filetype” feature flag is not set, in which case it uses | ||
30 | ``struct ext4_dir_entry``. | ||
31 | |||
32 | The original directory entry format is ``struct ext4_dir_entry``, which | ||
33 | is at most 263 bytes long, though on disk you'll need to reference | ||
34 | ``dirent.rec_len`` to know for sure. | ||
35 | |||
36 | .. list-table:: | ||
37 | :widths: 1 1 1 77 | ||
38 | :header-rows: 1 | ||
39 | |||
40 | * - Offset | ||
41 | - Size | ||
42 | - Name | ||
43 | - Description | ||
44 | * - 0x0 | ||
45 | - \_\_le32 | ||
46 | - inode | ||
47 | - Number of the inode that this directory entry points to. | ||
48 | * - 0x4 | ||
49 | - \_\_le16 | ||
50 | - rec\_len | ||
51 | - Length of this directory entry. Must be a multiple of 4. | ||
52 | * - 0x6 | ||
53 | - \_\_le16 | ||
54 | - name\_len | ||
55 | - Length of the file name. | ||
56 | * - 0x8 | ||
57 | - char | ||
58 | - name[EXT4\_NAME\_LEN] | ||
59 | - File name. | ||
60 | |||
61 | Since file names cannot be longer than 255 bytes, the new directory | ||
62 | entry format shortens the name\_len field and uses the space for a file | ||
63 | type flag, probably to avoid having to load every inode during directory | ||
64 | tree traversal. This format is ``ext4_dir_entry_2``, which is at most | ||
65 | 263 bytes long, though on disk you'll need to reference | ||
66 | ``dirent.rec_len`` to know for sure. | ||
67 | |||
68 | .. list-table:: | ||
69 | :widths: 1 1 1 77 | ||
70 | :header-rows: 1 | ||
71 | |||
72 | * - Offset | ||
73 | - Size | ||
74 | - Name | ||
75 | - Description | ||
76 | * - 0x0 | ||
77 | - \_\_le32 | ||
78 | - inode | ||
79 | - Number of the inode that this directory entry points to. | ||
80 | * - 0x4 | ||
81 | - \_\_le16 | ||
82 | - rec\_len | ||
83 | - Length of this directory entry. | ||
84 | * - 0x6 | ||
85 | - \_\_u8 | ||
86 | - name\_len | ||
87 | - Length of the file name. | ||
88 | * - 0x7 | ||
89 | - \_\_u8 | ||
90 | - file\_type | ||
91 | - File type code, see ftype_ table below. | ||
92 | * - 0x8 | ||
93 | - char | ||
94 | - name[EXT4\_NAME\_LEN] | ||
95 | - File name. | ||
96 | |||
97 | .. _ftype: | ||
98 | |||
99 | The directory file type is one of the following values: | ||
100 | |||
101 | .. list-table:: | ||
102 | :widths: 1 79 | ||
103 | :header-rows: 1 | ||
104 | |||
105 | * - Value | ||
106 | - Description | ||
107 | * - 0x0 | ||
108 | - Unknown. | ||
109 | * - 0x1 | ||
110 | - Regular file. | ||
111 | * - 0x2 | ||
112 | - Directory. | ||
113 | * - 0x3 | ||
114 | - Character device file. | ||
115 | * - 0x4 | ||
116 | - Block device file. | ||
117 | * - 0x5 | ||
118 | - FIFO. | ||
119 | * - 0x6 | ||
120 | - Socket. | ||
121 | * - 0x7 | ||
122 | - Symbolic link. | ||
123 | |||
124 | In order to add checksums to these classic directory blocks, a phony | ||
125 | ``struct ext4_dir_entry`` is placed at the end of each leaf block to | ||
126 | hold the checksum. The directory entry is 12 bytes long. The inode | ||
127 | number and name\_len fields are set to zero to fool old software into | ||
128 | ignoring an apparently empty directory entry, and the checksum is stored | ||
129 | in the place where the name normally goes. The structure is | ||
130 | ``struct ext4_dir_entry_tail``: | ||
131 | |||
132 | .. list-table:: | ||
133 | :widths: 1 1 1 77 | ||
134 | :header-rows: 1 | ||
135 | |||
136 | * - Offset | ||
137 | - Size | ||
138 | - Name | ||
139 | - Description | ||
140 | * - 0x0 | ||
141 | - \_\_le32 | ||
142 | - det\_reserved\_zero1 | ||
143 | - Inode number, which must be zero. | ||
144 | * - 0x4 | ||
145 | - \_\_le16 | ||
146 | - det\_rec\_len | ||
147 | - Length of this directory entry, which must be 12. | ||
148 | * - 0x6 | ||
149 | - \_\_u8 | ||
150 | - det\_reserved\_zero2 | ||
151 | - Length of the file name, which must be zero. | ||
152 | * - 0x7 | ||
153 | - \_\_u8 | ||
154 | - det\_reserved\_ft | ||
155 | - File type, which must be 0xDE. | ||
156 | * - 0x8 | ||
157 | - \_\_le32 | ||
158 | - det\_checksum | ||
159 | - Directory leaf block checksum. | ||
160 | |||
161 | The leaf directory block checksum is calculated against the FS UUID, the | ||
162 | directory's inode number, the directory's inode generation number, and | ||
163 | the entire directory entry block up to (but not including) the fake | ||
164 | directory entry. | ||
165 | |||
166 | Hash Tree Directories | ||
167 | ~~~~~~~~~~~~~~~~~~~~~ | ||
168 | |||
169 | A linear array of directory entries isn't great for performance, so a | ||
170 | new feature was added to ext3 to provide a faster (but peculiar) | ||
171 | balanced tree keyed off a hash of the directory entry name. If the | ||
172 | EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a | ||
173 | hashed btree (htree) to organize and find directory entries. For | ||
174 | backwards read-only compatibility with ext2, this tree is actually | ||
175 | hidden inside the directory file, masquerading as “empty” directory data | ||
176 | blocks! It was stated previously that the end of the linear directory | ||
177 | entry table was signified with an entry pointing to inode 0; this is | ||
178 | (ab)used to fool the old linear-scan algorithm into thinking that the | ||
179 | rest of the directory block is empty so that it moves on. | ||
180 | |||
181 | The root of the tree always lives in the first data block of the | ||
182 | directory. By ext2 custom, the '.' and '..' entries must appear at the | ||
183 | beginning of this first block, so they are put here as two | ||
184 | ``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of | ||
185 | the root node contains metadata about the tree and finally a hash->block | ||
186 | map to find nodes that are lower in the htree. If | ||
187 | ``dx_root.info.indirect_levels`` is non-zero then the htree has two | ||
188 | levels; the data block pointed to by the root node's map is an interior | ||
189 | node, which is indexed by a minor hash. Interior nodes in this tree | ||
190 | contain a zeroed out ``struct ext4_dir_entry_2`` followed by a | ||
191 | minor\_hash->block map to find leaf nodes. Leaf nodes contain a linear | ||
192 | array of all ``struct ext4_dir_entry_2``; all of these entries | ||
193 | (presumably) hash to the same value. If there is an overflow, the | ||
194 | entries simply overflow into the next leaf node, and the | ||
195 | least-significant bit of the hash (in the interior node map) that gets | ||
196 | us to this next leaf node is set. | ||
197 | |||
198 | To traverse the directory as an htree, the code calculates the hash of | ||
199 | the desired file name and uses it to find the corresponding block | ||
200 | number. If the tree is flat, the block is a linear array of directory | ||
201 | entries that can be searched; otherwise, the minor hash of the file name | ||
202 | is computed and used against this second block to find the corresponding | ||
203 | third block number. That third block number will be a linear array of | ||
204 | directory entries. | ||
205 | |||
206 | To traverse the directory as a linear array (such as the old code does), | ||
207 | the code simply reads every data block in the directory. The blocks used | ||
208 | for the htree will appear to have no entries (aside from '.' and '..') | ||
209 | and so only the leaf nodes will appear to have any interesting content. | ||
210 | |||
211 | The root of the htree is in ``struct dx_root``, which is the full length | ||
212 | of a data block: | ||
213 | |||
214 | .. list-table:: | ||
215 | :widths: 1 1 1 77 | ||
216 | :header-rows: 1 | ||
217 | |||
218 | * - Offset | ||
219 | - Type | ||
220 | - Name | ||
221 | - Description | ||
222 | * - 0x0 | ||
223 | - \_\_le32 | ||
224 | - dot.inode | ||
225 | - inode number of this directory. | ||
226 | * - 0x4 | ||
227 | - \_\_le16 | ||
228 | - dot.rec\_len | ||
229 | - Length of this record, 12. | ||
230 | * - 0x6 | ||
231 | - u8 | ||
232 | - dot.name\_len | ||
233 | - Length of the name, 1. | ||
234 | * - 0x7 | ||
235 | - u8 | ||
236 | - dot.file\_type | ||
237 | - File type of this entry, 0x2 (directory) (if the feature flag is set). | ||
238 | * - 0x8 | ||
239 | - char | ||
240 | - dot.name[4] | ||
241 | - “.\\0\\0\\0” | ||
242 | * - 0xC | ||
243 | - \_\_le32 | ||
244 | - dotdot.inode | ||
245 | - inode number of parent directory. | ||
246 | * - 0x10 | ||
247 | - \_\_le16 | ||
248 | - dotdot.rec\_len | ||
249 | - block\_size - 12. The record length is long enough to cover all htree | ||
250 | data. | ||
251 | * - 0x12 | ||
252 | - u8 | ||
253 | - dotdot.name\_len | ||
254 | - Length of the name, 2. | ||
255 | * - 0x13 | ||
256 | - u8 | ||
257 | - dotdot.file\_type | ||
258 | - File type of this entry, 0x2 (directory) (if the feature flag is set). | ||
259 | * - 0x14 | ||
260 | - char | ||
261 | - dotdot\_name[4] | ||
262 | - “..\\0\\0” | ||
263 | * - 0x18 | ||
264 | - \_\_le32 | ||
265 | - struct dx\_root\_info.reserved\_zero | ||
266 | - Zero. | ||
267 | * - 0x1C | ||
268 | - u8 | ||
269 | - struct dx\_root\_info.hash\_version | ||
270 | - Hash type, see dirhash_ table below. | ||
271 | * - 0x1D | ||
272 | - u8 | ||
273 | - struct dx\_root\_info.info\_length | ||
274 | - Length of the tree information, 0x8. | ||
275 | * - 0x1E | ||
276 | - u8 | ||
277 | - struct dx\_root\_info.indirect\_levels | ||
278 | - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR | ||
279 | feature is set; cannot be larger than 2 otherwise. | ||
280 | * - 0x1F | ||
281 | - u8 | ||
282 | - struct dx\_root\_info.unused\_flags | ||
283 | - | ||
284 | * - 0x20 | ||
285 | - \_\_le16 | ||
286 | - limit | ||
287 | - Maximum number of dx\_entries that can follow this header, plus 1 for | ||
288 | the header itself. | ||
289 | * - 0x22 | ||
290 | - \_\_le16 | ||
291 | - count | ||
292 | - Actual number of dx\_entries that follow this header, plus 1 for the | ||
293 | header itself. | ||
294 | * - 0x24 | ||
295 | - \_\_le32 | ||
296 | - block | ||
297 | - The block number (within the directory file) that goes with hash=0. | ||
298 | * - 0x28 | ||
299 | - struct dx\_entry | ||
300 | - entries[0] | ||
301 | - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. | ||
302 | |||
303 | .. _dirhash: | ||
304 | |||
305 | The directory hash is one of the following values: | ||
306 | |||
307 | .. list-table:: | ||
308 | :widths: 1 79 | ||
309 | :header-rows: 1 | ||
310 | |||
311 | * - Value | ||
312 | - Description | ||
313 | * - 0x0 | ||
314 | - Legacy. | ||
315 | * - 0x1 | ||
316 | - Half MD4. | ||
317 | * - 0x2 | ||
318 | - Tea. | ||
319 | * - 0x3 | ||
320 | - Legacy, unsigned. | ||
321 | * - 0x4 | ||
322 | - Half MD4, unsigned. | ||
323 | * - 0x5 | ||
324 | - Tea, unsigned. | ||
325 | |||
326 | Interior nodes of an htree are recorded as ``struct dx_node``, which is | ||
327 | also the full length of a data block: | ||
328 | |||
329 | .. list-table:: | ||
330 | :widths: 1 1 1 77 | ||
331 | :header-rows: 1 | ||
332 | |||
333 | * - Offset | ||
334 | - Type | ||
335 | - Name | ||
336 | - Description | ||
337 | * - 0x0 | ||
338 | - \_\_le32 | ||
339 | - fake.inode | ||
340 | - Zero, to make it look like this entry is not in use. | ||
341 | * - 0x4 | ||
342 | - \_\_le16 | ||
343 | - fake.rec\_len | ||
344 | - The size of the block, in order to hide all of the dx\_node data. | ||
345 | * - 0x6 | ||
346 | - u8 | ||
347 | - name\_len | ||
348 | - Zero. There is no name for this “unused” directory entry. | ||
349 | * - 0x7 | ||
350 | - u8 | ||
351 | - file\_type | ||
352 | - Zero. There is no file type for this “unused” directory entry. | ||
353 | * - 0x8 | ||
354 | - \_\_le16 | ||
355 | - limit | ||
356 | - Maximum number of dx\_entries that can follow this header, plus 1 for | ||
357 | the header itself. | ||
358 | * - 0xA | ||
359 | - \_\_le16 | ||
360 | - count | ||
361 | - Actual number of dx\_entries that follow this header, plus 1 for the | ||
362 | header itself. | ||
363 | * - 0xC | ||
364 | - \_\_le32 | ||
365 | - block | ||
366 | - The block number (within the directory file) that goes with the lowest | ||
367 | hash value of this block. This value is stored in the parent block. | ||
368 | * - 0x10 | ||
369 | - struct dx\_entry | ||
370 | - entries[0] | ||
371 | - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block. | ||
372 | |||
373 | The hash maps that exist in both ``struct dx_root`` and | ||
374 | ``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes | ||
375 | long: | ||
376 | |||
377 | .. list-table:: | ||
378 | :widths: 1 1 1 77 | ||
379 | :header-rows: 1 | ||
380 | |||
381 | * - Offset | ||
382 | - Type | ||
383 | - Name | ||
384 | - Description | ||
385 | * - 0x0 | ||
386 | - \_\_le32 | ||
387 | - hash | ||
388 | - Hash code. | ||
389 | * - 0x4 | ||
390 | - \_\_le32 | ||
391 | - block | ||
392 | - Block number (within the directory file, not filesystem blocks) of the | ||
393 | next node in the htree. | ||
394 | |||
395 | (If you think this is all quite clever and peculiar, so does the | ||
396 | author.) | ||
397 | |||
398 | If metadata checksums are enabled, the last 8 bytes of the directory | ||
399 | block (precisely the length of one dx\_entry) are used to store a | ||
400 | ``struct dx_tail``, which contains the checksum. The ``limit`` and | ||
401 | ``count`` entries in the dx\_root/dx\_node structures are adjusted as | ||
402 | necessary to fit the dx\_tail into the block. If there is no space for | ||
403 | the dx\_tail, the user is notified to run e2fsck -D to rebuild the | ||
404 | directory index (which will ensure that there's space for the checksum). | ||
405 | The dx\_tail structure is 8 bytes long and looks like this: | ||
406 | |||
407 | .. list-table:: | ||
408 | :widths: 1 1 1 77 | ||
409 | :header-rows: 1 | ||
410 | |||
411 | * - Offset | ||
412 | - Type | ||
413 | - Name | ||
414 | - Description | ||
415 | * - 0x0 | ||
416 | - u32 | ||
417 | - dt\_reserved | ||
418 | - Zero. | ||
419 | * - 0x4 | ||
420 | - \_\_le32 | ||
421 | - dt\_checksum | ||
422 | - Checksum of the htree directory block. | ||
423 | |||
424 | The checksum is calculated against the FS UUID, the htree index header | ||
425 | (dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in | ||
426 | use, and the tail block (dx\_tail). | ||
diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/ondisk/dynamic.rst new file mode 100644 index 000000000000..bb0c84333341 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/dynamic.rst | |||
@@ -0,0 +1,12 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Dynamic Structures | ||
4 | ================== | ||
5 | |||
6 | Dynamic metadata are created on the fly when files are created and | ||
7 | blocks are allocated to files. | ||
8 | |||
9 | .. include:: inodes.rst | ||
10 | .. include:: ifork.rst | ||
11 | .. include:: directory.rst | ||
12 | .. include:: attributes.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/ondisk/eainode.rst new file mode 100644 index 000000000000..ecc0d01a0a72 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/eainode.rst | |||
@@ -0,0 +1,18 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Large Extended Attribute Values | ||
4 | ------------------------------- | ||
5 | |||
6 | To enable ext4 to store extended attribute values that do not fit in the | ||
7 | inode or in the single extended attribute block attached to an inode, | ||
8 | the EA\_INODE feature allows us to store the value in the data blocks of | ||
9 | a regular file inode. This “EA inode” is linked only from the extended | ||
10 | attribute name index and must not appear in a directory entry. The | ||
11 | inode's i\_atime field is used to store a checksum of the xattr value; | ||
12 | and i\_ctime/i\_version store a 64-bit reference count, which enables | ||
13 | sharing of large xattr values between multiple owning inodes. For | ||
14 | backward compatibility with older versions of this feature, the | ||
15 | i\_mtime/i\_generation *may* store a back-reference to the inode number | ||
16 | and i\_generation of the **one** owning inode (in cases where the EA | ||
17 | inode is not referenced by multiple inodes) to verify that the EA inode | ||
18 | is the correct one being accessed. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/ondisk/globals.rst new file mode 100644 index 000000000000..368bf7662b96 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/globals.rst | |||
@@ -0,0 +1,13 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Global Structures | ||
4 | ================= | ||
5 | |||
6 | The filesystem is sharded into a number of block groups, each of which | ||
7 | has static metadata at fixed locations. | ||
8 | |||
9 | .. include:: super.rst | ||
10 | .. include:: group_descr.rst | ||
11 | .. include:: bitmaps.rst | ||
12 | .. include:: mmp.rst | ||
13 | .. include:: journal.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst new file mode 100644 index 000000000000..759827e5d2cf --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/group_descr.rst | |||
@@ -0,0 +1,170 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Block Group Descriptors | ||
4 | ----------------------- | ||
5 | |||
6 | Each block group on the filesystem has one of these descriptors | ||
7 | associated with it. As noted in the Layout section above, the group | ||
8 | descriptors (if present) are the second item in the block group. The | ||
9 | standard configuration is for each block group to contain a full copy of | ||
10 | the block group descriptor table unless the sparse\_super feature flag | ||
11 | is set. | ||
12 | |||
13 | Notice how the group descriptor records the location of both bitmaps and | ||
14 | the inode table (i.e. they can float). This means that within a block | ||
15 | group, the only data structures with fixed locations are the superblock | ||
16 | and the group descriptor table. The flex\_bg mechanism uses this | ||
17 | property to group several block groups into a flex group and lay out all | ||
18 | of the groups' bitmaps and inode tables into one long run in the first | ||
19 | group of the flex group. | ||
20 | |||
21 | If the meta\_bg feature flag is set, then several block groups are | ||
22 | grouped together into a meta group. Note that in the meta\_bg case, | ||
23 | however, the first and last two block groups within the larger meta | ||
24 | group contain only group descriptors for the groups inside the meta | ||
25 | group. | ||
26 | |||
27 | flex\_bg and meta\_bg do not appear to be mutually exclusive features. | ||
28 | |||
29 | In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the | ||
30 | block group descriptor was only 32 bytes long and therefore ends at | ||
31 | bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the | ||
32 | block group descriptor expands to at least the 64 bytes described below; | ||
33 | the size is stored in the superblock. | ||
34 | |||
35 | If gdt\_csum is set and metadata\_csum is not set, the block group | ||
36 | checksum is the crc16 of the FS UUID, the group number, and the group | ||
37 | descriptor structure. If metadata\_csum is set, then the block group | ||
38 | checksum is the lower 16 bits of the checksum of the FS UUID, the group | ||
39 | number, and the group descriptor structure. Both block and inode bitmap | ||
40 | checksums are calculated against the FS UUID, the group number, and the | ||
41 | entire bitmap. | ||
42 | |||
43 | The block group descriptor is laid out in ``struct ext4_group_desc``. | ||
44 | |||
45 | .. list-table:: | ||
46 | :widths: 1 1 1 77 | ||
47 | :header-rows: 1 | ||
48 | |||
49 | * - Offset | ||
50 | - Size | ||
51 | - Name | ||
52 | - Description | ||
53 | * - 0x0 | ||
54 | - \_\_le32 | ||
55 | - bg\_block\_bitmap\_lo | ||
56 | - Lower 32-bits of location of block bitmap. | ||
57 | * - 0x4 | ||
58 | - \_\_le32 | ||
59 | - bg\_inode\_bitmap\_lo | ||
60 | - Lower 32-bits of location of inode bitmap. | ||
61 | * - 0x8 | ||
62 | - \_\_le32 | ||
63 | - bg\_inode\_table\_lo | ||
64 | - Lower 32-bits of location of inode table. | ||
65 | * - 0xC | ||
66 | - \_\_le16 | ||
67 | - bg\_free\_blocks\_count\_lo | ||
68 | - Lower 16-bits of free block count. | ||
69 | * - 0xE | ||
70 | - \_\_le16 | ||
71 | - bg\_free\_inodes\_count\_lo | ||
72 | - Lower 16-bits of free inode count. | ||
73 | * - 0x10 | ||
74 | - \_\_le16 | ||
75 | - bg\_used\_dirs\_count\_lo | ||
76 | - Lower 16-bits of directory count. | ||
77 | * - 0x12 | ||
78 | - \_\_le16 | ||
79 | - bg\_flags | ||
80 | - Block group flags. See the bgflags_ table below. | ||
81 | * - 0x14 | ||
82 | - \_\_le32 | ||
83 | - bg\_exclude\_bitmap\_lo | ||
84 | - Lower 32-bits of location of snapshot exclusion bitmap. | ||
85 | * - 0x18 | ||
86 | - \_\_le16 | ||
87 | - bg\_block\_bitmap\_csum\_lo | ||
88 | - Lower 16-bits of the block bitmap checksum. | ||
89 | * - 0x1A | ||
90 | - \_\_le16 | ||
91 | - bg\_inode\_bitmap\_csum\_lo | ||
92 | - Lower 16-bits of the inode bitmap checksum. | ||
93 | * - 0x1C | ||
94 | - \_\_le16 | ||
95 | - bg\_itable\_unused\_lo | ||
96 | - Lower 16-bits of unused inode count. If set, we needn't scan past the | ||
97 | ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the | ||
98 | inode table for this group. | ||
99 | * - 0x1E | ||
100 | - \_\_le16 | ||
101 | - bg\_checksum | ||
102 | - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the | ||
103 | RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group+desc) & | ||
104 | 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. | ||
105 | * - | ||
106 | - | ||
107 | - | ||
108 | - These fields only exist if the 64bit feature is enabled and s_desc_size | ||
109 | > 32. | ||
110 | * - 0x20 | ||
111 | - \_\_le32 | ||
112 | - bg\_block\_bitmap\_hi | ||
113 | - Upper 32-bits of location of block bitmap. | ||
114 | * - 0x24 | ||
115 | - \_\_le32 | ||
116 | - bg\_inode\_bitmap\_hi | ||
117 | - Upper 32-bits of location of inode bitmap. | ||
118 | * - 0x28 | ||
119 | - \_\_le32 | ||
120 | - bg\_inode\_table\_hi | ||
121 | - Upper 32-bits of location of inodes table. | ||
122 | * - 0x2C | ||
123 | - \_\_le16 | ||
124 | - bg\_free\_blocks\_count\_hi | ||
125 | - Upper 16-bits of free block count. | ||
126 | * - 0x2E | ||
127 | - \_\_le16 | ||
128 | - bg\_free\_inodes\_count\_hi | ||
129 | - Upper 16-bits of free inode count. | ||
130 | * - 0x30 | ||
131 | - \_\_le16 | ||
132 | - bg\_used\_dirs\_count\_hi | ||
133 | - Upper 16-bits of directory count. | ||
134 | * - 0x32 | ||
135 | - \_\_le16 | ||
136 | - bg\_itable\_unused\_hi | ||
137 | - Upper 16-bits of unused inode count. | ||
138 | * - 0x34 | ||
139 | - \_\_le32 | ||
140 | - bg\_exclude\_bitmap\_hi | ||
141 | - Upper 32-bits of location of snapshot exclusion bitmap. | ||
142 | * - 0x38 | ||
143 | - \_\_le16 | ||
144 | - bg\_block\_bitmap\_csum\_hi | ||
145 | - Upper 16-bits of the block bitmap checksum. | ||
146 | * - 0x3A | ||
147 | - \_\_le16 | ||
148 | - bg\_inode\_bitmap\_csum\_hi | ||
149 | - Upper 16-bits of the inode bitmap checksum. | ||
150 | * - 0x3C | ||
151 | - \_\_u32 | ||
152 | - bg\_reserved | ||
153 | - Padding to 64 bytes. | ||
154 | |||
155 | .. _bgflags: | ||
156 | |||
157 | Block group flags can be any combination of the following: | ||
158 | |||
159 | .. list-table:: | ||
160 | :widths: 1 79 | ||
161 | :header-rows: 1 | ||
162 | |||
163 | * - Value | ||
164 | - Description | ||
165 | * - 0x1 | ||
166 | - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT). | ||
167 | * - 0x2 | ||
168 | - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT). | ||
169 | * - 0x4 | ||
170 | - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED). | ||
diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst new file mode 100644 index 000000000000..5dbe3b2b121a --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/ifork.rst | |||
@@ -0,0 +1,194 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | The Contents of inode.i\_block | ||
4 | ------------------------------ | ||
5 | |||
6 | Depending on the type of file an inode describes, the 60 bytes of | ||
7 | storage in ``inode.i_block`` can be used in different ways. In general, | ||
8 | regular files and directories will use it for file block indexing | ||
9 | information, and special files will use it for special purposes. | ||
10 | |||
11 | Symbolic Links | ||
12 | ~~~~~~~~~~~~~~ | ||
13 | |||
14 | The target of a symbolic link will be stored in this field if the target | ||
15 | string is less than 60 bytes long. Otherwise, either extents or block | ||
16 | maps will be used to allocate data blocks to store the link target. | ||
17 | |||
18 | Direct/Indirect Block Addressing | ||
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
20 | |||
21 | In ext2/3, file block numbers were mapped to logical block numbers by | ||
22 | means of an (up to) three level 1-1 block map. To find the logical block | ||
23 | that stores a particular file block, the code would navigate through | ||
24 | this increasingly complicated structure. Notice that there is neither a | ||
25 | magic number nor a checksum to provide any level of confidence that the | ||
26 | block isn't full of garbage. | ||
27 | |||
28 | .. ifconfig:: builder != 'latex' | ||
29 | |||
30 | .. include:: blockmap.rst | ||
31 | |||
32 | .. ifconfig:: builder == 'latex' | ||
33 | |||
34 | [Table omitted because LaTeX doesn't support nested tables.] | ||
35 | |||
36 | Note that with this block mapping scheme, it is necessary to fill out a | ||
37 | lot of mapping data even for a large contiguous file! This inefficiency | ||
38 | led to the creation of the extent mapping scheme, discussed below. | ||
39 | |||
40 | Notice also that a file using this mapping scheme cannot be placed | ||
41 | higher than 2^32 blocks. | ||
42 | |||
43 | Extent Tree | ||
44 | ~~~~~~~~~~~ | ||
45 | |||
46 | In ext4, the file to logical block map has been replaced with an extent | ||
47 | tree. Under the old scheme, allocating a contiguous run of 1,000 blocks | ||
48 | requires an indirect block to map all 1,000 entries; with extents, the | ||
49 | mapping is reduced to a single ``struct ext4_extent`` with | ||
50 | ``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate | ||
51 | very large files with a single extent, at a considerable reduction in | ||
52 | metadata block use, and some improvement in disk efficiency. The inode | ||
53 | must have the extents flag (0x80000) set for this feature to be in | ||
54 | use. | ||
55 | |||
56 | Extents are arranged as a tree. Each node of the tree begins with a | ||
57 | ``struct ext4_extent_header``. If the node is an interior node | ||
58 | (``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries`` | ||
59 | instances of ``struct ext4_extent_idx``; each of these index entries | ||
60 | points to a block containing more nodes in the extent tree. If the node | ||
61 | is a leaf node (``eh.eh_depth == 0``), then the header is followed by | ||
62 | ``eh.eh_entries`` instances of ``struct ext4_extent``; these instances | ||
63 | point to the file's data blocks. The root node of the extent tree is | ||
64 | stored in ``inode.i_block``, which allows for the first four extents to | ||
65 | be recorded without the use of extra metadata blocks. | ||
66 | |||
67 | The extent tree header is recorded in ``struct ext4_extent_header``, | ||
68 | which is 12 bytes long: | ||
69 | |||
70 | .. list-table:: | ||
71 | :widths: 1 1 1 77 | ||
72 | :header-rows: 1 | ||
73 | |||
74 | * - Offset | ||
75 | - Size | ||
76 | - Name | ||
77 | - Description | ||
78 | * - 0x0 | ||
79 | - \_\_le16 | ||
80 | - eh\_magic | ||
81 | - Magic number, 0xF30A. | ||
82 | * - 0x2 | ||
83 | - \_\_le16 | ||
84 | - eh\_entries | ||
85 | - Number of valid entries following the header. | ||
86 | * - 0x4 | ||
87 | - \_\_le16 | ||
88 | - eh\_max | ||
89 | - Maximum number of entries that could follow the header. | ||
90 | * - 0x6 | ||
91 | - \_\_le16 | ||
92 | - eh\_depth | ||
93 | - Depth of this extent node in the extent tree. 0 = this extent node | ||
94 | points to data blocks; otherwise, this extent node points to other | ||
95 | extent nodes. The extent tree can be at most 5 levels deep: a logical | ||
96 | block number can be at most ``2^32``, and the smallest ``n`` that | ||
97 | satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5. | ||
98 | * - 0x8 | ||
99 | - \_\_le32 | ||
100 | - eh\_generation | ||
101 | - Generation of the tree. (Used by Lustre, but not standard ext4). | ||
102 | |||
103 | Internal nodes of the extent tree, also known as index nodes, are | ||
104 | recorded as ``struct ext4_extent_idx``, and are 12 bytes long: | ||
105 | |||
106 | .. list-table:: | ||
107 | :widths: 1 1 1 77 | ||
108 | :header-rows: 1 | ||
109 | |||
110 | * - Offset | ||
111 | - Size | ||
112 | - Name | ||
113 | - Description | ||
114 | * - 0x0 | ||
115 | - \_\_le32 | ||
116 | - ei\_block | ||
117 | - This index node covers file blocks from 'block' onward. | ||
118 | * - 0x4 | ||
119 | - \_\_le32 | ||
120 | - ei\_leaf\_lo | ||
121 | - Lower 32-bits of the block number of the extent node that is the next | ||
122 | level lower in the tree. The tree node pointed to can be either another | ||
123 | internal node or a leaf node, described below. | ||
124 | * - 0x8 | ||
125 | - \_\_le16 | ||
126 | - ei\_leaf\_hi | ||
127 | - Upper 16-bits of the previous field. | ||
128 | * - 0xA | ||
129 | - \_\_u16 | ||
130 | - ei\_unused | ||
131 | - | ||
132 | |||
133 | Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, | ||
134 | and are also 12 bytes long: | ||
135 | |||
136 | .. list-table:: | ||
137 | :widths: 1 1 1 77 | ||
138 | :header-rows: 1 | ||
139 | |||
140 | * - Offset | ||
141 | - Size | ||
142 | - Name | ||
143 | - Description | ||
144 | * - 0x0 | ||
145 | - \_\_le32 | ||
146 | - ee\_block | ||
147 | - First file block number that this extent covers. | ||
148 | * - 0x4 | ||
149 | - \_\_le16 | ||
150 | - ee\_len | ||
151 | - Number of blocks covered by extent. If the value of this field is <= | ||
152 | 32768, the extent is initialized. If the value of the field is > 32768, | ||
153 | the extent is uninitialized and the actual extent length is ``ee_len`` - | ||
154 | 32768. Therefore, the maximum length of an initialized extent is 32768 | ||
155 | blocks, and the maximum length of an uninitialized extent is 32767. | ||
156 | * - 0x6 | ||
157 | - \_\_le16 | ||
158 | - ee\_start\_hi | ||
159 | - Upper 16-bits of the block number to which this extent points. | ||
160 | * - 0x8 | ||
161 | - \_\_le32 | ||
162 | - ee\_start\_lo | ||
163 | - Lower 32-bits of the block number to which this extent points. | ||
164 | |||
165 | Prior to the introduction of metadata checksums, the extent header + | ||
166 | extent entries always left at least 4 bytes of unallocated space at the | ||
167 | end of each extent tree data block (because (2^x % 12) >= 4). Therefore, | ||
168 | the 32-bit checksum is inserted into this space. The 4 extents in the | ||
169 | inode do not need checksumming, since the inode is already checksummed. | ||
170 | The checksum is calculated against the FS UUID, the inode number, the | ||
171 | inode generation, and the entire extent block leading up to (but not | ||
172 | including) the checksum itself. | ||
173 | |||
174 | ``struct ext4_extent_tail`` is 4 bytes long: | ||
175 | |||
176 | .. list-table:: | ||
177 | :widths: 1 1 1 77 | ||
178 | :header-rows: 1 | ||
179 | |||
180 | * - Offset | ||
181 | - Size | ||
182 | - Name | ||
183 | - Description | ||
184 | * - 0x0 | ||
185 | - \_\_le32 | ||
186 | - eb\_checksum | ||
187 | - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock) | ||
188 | |||
189 | Inline Data | ||
190 | ~~~~~~~~~~~ | ||
191 | |||
192 | If the inline data feature is enabled for the filesystem and the flag is | ||
193 | set for the inode, it is possible that the first 60 bytes of the file | ||
194 | data are stored here. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst new file mode 100644 index 000000000000..f7d082c3a435 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/index.rst | |||
@@ -0,0 +1,9 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | ============================== | ||
4 | Data Structures and Algorithms | ||
5 | ============================== | ||
6 | .. include:: about.rst | ||
7 | .. include:: overview.rst | ||
8 | .. include:: globals.rst | ||
9 | .. include:: dynamic.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/ondisk/inlinedata.rst new file mode 100644 index 000000000000..d1075178ce0b --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/inlinedata.rst | |||
@@ -0,0 +1,37 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Inline Data | ||
4 | ----------- | ||
5 | |||
6 | The inline data feature was designed to handle the case that a file's | ||
7 | data is so tiny that it readily fits inside the inode, which | ||
8 | (theoretically) reduces disk block consumption and reduces seeks. If the | ||
9 | file is smaller than 60 bytes, then the data are stored inline in | ||
10 | ``inode.i_block``. If the rest of the file would fit inside the extended | ||
11 | attribute space, then it might be found as an extended attribute | ||
12 | “system.data” within the inode body (“ibody EA”). This of course | ||
13 | constrains the amount of extended attributes one can attach to an inode. | ||
14 | If the data size increases beyond i\_block + ibody EA, a regular block | ||
15 | is allocated and the contents moved to that block. | ||
16 | |||
17 | Pending a change to compact the extended attribute key used to store | ||
18 | inline data, one ought to be able to store 160 bytes of data in a | ||
19 | 256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to | ||
20 | that, the limit was 156 bytes due to inefficient use of inode space. | ||
21 | |||
22 | The inline data feature requires the presence of an extended attribute | ||
23 | for “system.data”, even if the attribute value is zero length. | ||
24 | |||
25 | Inline Directories | ||
26 | ~~~~~~~~~~~~~~~~~~ | ||
27 | |||
28 | The first four bytes of i\_block are the inode number of the parent | ||
29 | directory. Following that is a 56-byte space for an array of directory | ||
30 | entries; see ``struct ext4_dir_entry``. If there is a “system.data” | ||
31 | attribute in the inode body, the EA value is an array of | ||
32 | ``struct ext4_dir_entry`` as well. Note that for inline directories, the | ||
33 | i\_block and EA space are treated as separate dirent blocks; directory | ||
34 | entries cannot span the two. | ||
35 | |||
36 | Inline directory entries are not checksummed, as the inode checksum | ||
37 | should protect all inline data contents. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst new file mode 100644 index 000000000000..655ce898f3f5 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/inodes.rst | |||
@@ -0,0 +1,575 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Index Nodes | ||
4 | ----------- | ||
5 | |||
6 | In a regular UNIX filesystem, the inode stores all the metadata | ||
7 | pertaining to the file (time stamps, block maps, extended attributes, | ||
8 | etc), not the directory entry. To find the information associated with a | ||
9 | file, one must traverse the directory files to find the directory entry | ||
10 | associated with a file, then load the inode to find the metadata for | ||
11 | that file. ext4 appears to cheat (for performance reasons) a little bit | ||
12 | by storing a copy of the file type (normally stored in the inode) in the | ||
13 | directory entry. (Compare all this to FAT, which stores all the file | ||
14 | information directly in the directory entry, but does not support hard | ||
15 | links and is in general more seek-happy than ext4 due to its simpler | ||
16 | block allocator and extensive use of linked lists.) | ||
17 | |||
18 | The inode table is a linear array of ``struct ext4_inode``. The table is | ||
19 | sized to have enough blocks to store at least | ||
20 | ``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the | ||
21 | block group containing an inode can be calculated as | ||
22 | ``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the | ||
23 | group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There | ||
24 | is no inode 0. | ||
25 | |||
26 | The inode checksum is calculated against the FS UUID, the inode number, | ||
27 | and the inode structure itself. | ||
28 | |||
29 | The inode table entry is laid out in ``struct ext4_inode``. | ||
30 | |||
31 | .. list-table:: | ||
32 | :widths: 1 1 1 77 | ||
33 | :header-rows: 1 | ||
34 | |||
35 | * - Offset | ||
36 | - Size | ||
37 | - Name | ||
38 | - Description | ||
39 | * - 0x0 | ||
40 | - \_\_le16 | ||
41 | - i\_mode | ||
42 | - File mode. See the table i_mode_ below. | ||
43 | * - 0x2 | ||
44 | - \_\_le16 | ||
45 | - i\_uid | ||
46 | - Lower 16-bits of Owner UID. | ||
47 | * - 0x4 | ||
48 | - \_\_le32 | ||
49 | - i\_size\_lo | ||
50 | - Lower 32-bits of size in bytes. | ||
51 | * - 0x8 | ||
52 | - \_\_le32 | ||
53 | - i\_atime | ||
54 | - Last access time, in seconds since the epoch. However, if the EA\_INODE | ||
55 | inode flag is set, this inode stores an extended attribute value and | ||
56 | this field contains the checksum of the value. | ||
57 | * - 0xC | ||
58 | - \_\_le32 | ||
59 | - i\_ctime | ||
60 | - Last inode change time, in seconds since the epoch. However, if the | ||
61 | EA\_INODE inode flag is set, this inode stores an extended attribute | ||
62 | value and this field contains the lower 32 bits of the attribute value's | ||
63 | reference count. | ||
64 | * - 0x10 | ||
65 | - \_\_le32 | ||
66 | - i\_mtime | ||
67 | - Last data modification time, in seconds since the epoch. However, if the | ||
68 | EA\_INODE inode flag is set, this inode stores an extended attribute | ||
69 | value and this field contains the number of the inode that owns the | ||
70 | extended attribute. | ||
71 | * - 0x14 | ||
72 | - \_\_le32 | ||
73 | - i\_dtime | ||
74 | - Deletion Time, in seconds since the epoch. | ||
75 | * - 0x18 | ||
76 | - \_\_le16 | ||
77 | - i\_gid | ||
78 | - Lower 16-bits of GID. | ||
79 | * - 0x1A | ||
80 | - \_\_le16 | ||
81 | - i\_links\_count | ||
82 | - Hard link count. Normally, ext4 does not permit an inode to have more | ||
83 | than 65,000 hard links. This applies to files as well as directories, | ||
84 | which means that there cannot be more than 64,998 subdirectories in a | ||
85 | directory (each subdirectory's '..' entry counts as a hard link, as does | ||
86 | the '.' entry in the directory itself). With the DIR\_NLINK feature | ||
87 | enabled, ext4 supports more than 64,998 subdirectories by setting this | ||
88 | field to 1 to indicate that the number of hard links is not known. | ||
89 | * - 0x1C | ||
90 | - \_\_le32 | ||
91 | - i\_blocks\_lo | ||
92 | - Lower 32-bits of “block” count. If the huge\_file feature flag is not | ||
93 | set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks | ||
94 | on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in | ||
95 | ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi | ||
96 | << 32)`` 512-byte blocks on disk. If huge\_file is set and | ||
97 | EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file | ||
98 | consumes ``i_blocks_lo + (i_blocks_hi << 32)`` filesystem blocks on | ||
99 | disk. | ||
100 | * - 0x20 | ||
101 | - \_\_le32 | ||
102 | - i\_flags | ||
103 | - Inode flags. See the table i_flags_ below. | ||
104 | * - 0x24 | ||
105 | - 4 bytes | ||
106 | - i\_osd1 | ||
107 | - See the table i_osd1_ for more details. | ||
108 | * - 0x28 | ||
109 | - 60 bytes | ||
110 | - i\_block[EXT4\_N\_BLOCKS=15] | ||
111 | - Block map or extent tree. See the section “The Contents of inode.i\_block”. | ||
112 | * - 0x64 | ||
113 | - \_\_le32 | ||
114 | - i\_generation | ||
115 | - File version (for NFS). | ||
116 | * - 0x68 | ||
117 | - \_\_le32 | ||
118 | - i\_file\_acl\_lo | ||
119 | - Lower 32-bits of extended attribute block. ACLs are of course one of | ||
120 | many possible extended attributes; I think the name of this field is a | ||
121 | result of the first use of extended attributes being for ACLs. | ||
122 | * - 0x6C | ||
123 | - \_\_le32 | ||
124 | - i\_size\_high / i\_dir\_acl | ||
125 | - Upper 32-bits of file/directory size. In ext2/3 this field was named | ||
126 | i\_dir\_acl, though it was usually set to zero and never used. | ||
127 | * - 0x70 | ||
128 | - \_\_le32 | ||
129 | - i\_obso\_faddr | ||
130 | - (Obsolete) fragment address. | ||
131 | * - 0x74 | ||
132 | - 12 bytes | ||
133 | - i\_osd2 | ||
134 | - See the table i_osd2_ for more details. | ||
135 | * - 0x80 | ||
136 | - \_\_le16 | ||
137 | - i\_extra\_isize | ||
138 | - Size of this inode - 128. Alternately, the size of the extended inode | ||
139 | fields beyond the original ext2 inode, including this field. | ||
140 | * - 0x82 | ||
141 | - \_\_le16 | ||
142 | - i\_checksum\_hi | ||
143 | - Upper 16-bits of the inode checksum. | ||
144 | * - 0x84 | ||
145 | - \_\_le32 | ||
146 | - i\_ctime\_extra | ||
147 | - Extra change time bits. This provides sub-second precision. See Inode | ||
148 | Timestamps section. | ||
149 | * - 0x88 | ||
150 | - \_\_le32 | ||
151 | - i\_mtime\_extra | ||
152 | - Extra modification time bits. This provides sub-second precision. | ||
153 | * - 0x8C | ||
154 | - \_\_le32 | ||
155 | - i\_atime\_extra | ||
156 | - Extra access time bits. This provides sub-second precision. | ||
157 | * - 0x90 | ||
158 | - \_\_le32 | ||
159 | - i\_crtime | ||
160 | - File creation time, in seconds since the epoch. | ||
161 | * - 0x94 | ||
162 | - \_\_le32 | ||
163 | - i\_crtime\_extra | ||
164 | - Extra file creation time bits. This provides sub-second precision. | ||
165 | * - 0x98 | ||
166 | - \_\_le32 | ||
167 | - i\_version\_hi | ||
168 | - Upper 32-bits for version number. | ||
169 | * - 0x9C | ||
170 | - \_\_le32 | ||
171 | - i\_projid | ||
172 | - Project ID. | ||
173 | |||
174 | .. _i_mode: | ||
175 | |||
176 | The ``i_mode`` value is a combination of the following flags: | ||
177 | |||
178 | .. list-table:: | ||
179 | :widths: 1 79 | ||
180 | :header-rows: 1 | ||
181 | |||
182 | * - Value | ||
183 | - Description | ||
184 | * - 0x1 | ||
185 | - S\_IXOTH (Others may execute) | ||
186 | * - 0x2 | ||
187 | - S\_IWOTH (Others may write) | ||
188 | * - 0x4 | ||
189 | - S\_IROTH (Others may read) | ||
190 | * - 0x8 | ||
191 | - S\_IXGRP (Group members may execute) | ||
192 | * - 0x10 | ||
193 | - S\_IWGRP (Group members may write) | ||
194 | * - 0x20 | ||
195 | - S\_IRGRP (Group members may read) | ||
196 | * - 0x40 | ||
197 | - S\_IXUSR (Owner may execute) | ||
198 | * - 0x80 | ||
199 | - S\_IWUSR (Owner may write) | ||
200 | * - 0x100 | ||
201 | - S\_IRUSR (Owner may read) | ||
202 | * - 0x200 | ||
203 | - S\_ISVTX (Sticky bit) | ||
204 | * - 0x400 | ||
205 | - S\_ISGID (Set GID) | ||
206 | * - 0x800 | ||
207 | - S\_ISUID (Set UID) | ||
208 | * - | ||
209 | - These are mutually-exclusive file types: | ||
210 | * - 0x1000 | ||
211 | - S\_IFIFO (FIFO) | ||
212 | * - 0x2000 | ||
213 | - S\_IFCHR (Character device) | ||
214 | * - 0x4000 | ||
215 | - S\_IFDIR (Directory) | ||
216 | * - 0x6000 | ||
217 | - S\_IFBLK (Block device) | ||
218 | * - 0x8000 | ||
219 | - S\_IFREG (Regular file) | ||
220 | * - 0xA000 | ||
221 | - S\_IFLNK (Symbolic link) | ||
222 | * - 0xC000 | ||
223 | - S\_IFSOCK (Socket) | ||
224 | |||
225 | .. _i_flags: | ||
226 | |||
227 | The ``i_flags`` field is a combination of these values: | ||
228 | |||
229 | .. list-table:: | ||
230 | :widths: 1 79 | ||
231 | :header-rows: 1 | ||
232 | |||
233 | * - Value | ||
234 | - Description | ||
235 | * - 0x1 | ||
236 | - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented) | ||
237 | * - 0x2 | ||
238 | - This file should be preserved, should undeletion be desired | ||
239 | (EXT4\_UNRM\_FL). (not implemented) | ||
240 | * - 0x4 | ||
241 | - File is compressed (EXT4\_COMPR\_FL). (not really implemented) | ||
242 | * - 0x8 | ||
243 | - All writes to the file must be synchronous (EXT4\_SYNC\_FL). | ||
244 | * - 0x10 | ||
245 | - File is immutable (EXT4\_IMMUTABLE\_FL). | ||
246 | * - 0x20 | ||
247 | - File can only be appended (EXT4\_APPEND\_FL). | ||
248 | * - 0x40 | ||
249 | - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL). | ||
250 | * - 0x80 | ||
251 | - Do not update access time (EXT4\_NOATIME\_FL). | ||
252 | * - 0x100 | ||
253 | - Dirty compressed file (EXT4\_DIRTY\_FL). (not used) | ||
254 | * - 0x200 | ||
255 | - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used) | ||
256 | * - 0x400 | ||
257 | - Do not compress file (EXT4\_NOCOMPR\_FL). (not used) | ||
258 | * - 0x800 | ||
259 | - Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was | ||
260 | EXT4\_ECOMPR\_FL (compression error), which was never used. | ||
261 | * - 0x1000 | ||
262 | - Directory has hashed indexes (EXT4\_INDEX\_FL). | ||
263 | * - 0x2000 | ||
264 | - AFS magic directory (EXT4\_IMAGIC\_FL). | ||
265 | * - 0x4000 | ||
266 | - File data must always be written through the journal | ||
267 | (EXT4\_JOURNAL\_DATA\_FL). | ||
268 | * - 0x8000 | ||
269 | - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4) | ||
270 | * - 0x10000 | ||
271 | - All directory entry data should be written synchronously (see | ||
272 | ``dirsync``) (EXT4\_DIRSYNC\_FL). | ||
273 | * - 0x20000 | ||
274 | - Top of directory hierarchy (EXT4\_TOPDIR\_FL). | ||
275 | * - 0x40000 | ||
276 | - This is a huge file (EXT4\_HUGE\_FILE\_FL). | ||
277 | * - 0x80000 | ||
278 | - Inode uses extents (EXT4\_EXTENTS\_FL). | ||
279 | * - 0x200000 | ||
280 | - Inode stores a large extended attribute value in its data blocks | ||
281 | (EXT4\_EA\_INODE\_FL). | ||
282 | * - 0x400000 | ||
283 | - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL). | ||
284 | (deprecated) | ||
285 | * - 0x01000000 | ||
286 | - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline) | ||
287 | * - 0x04000000 | ||
288 | - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in | ||
289 | mainline) | ||
290 | * - 0x08000000 | ||
291 | - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in | ||
292 | mainline) | ||
293 | * - 0x10000000 | ||
294 | - Inode has inline data (EXT4\_INLINE\_DATA\_FL). | ||
295 | * - 0x20000000 | ||
296 | - Create children with the same project ID (EXT4\_PROJINHERIT\_FL). | ||
297 | * - 0x80000000 | ||
298 | - Reserved for ext4 library (EXT4\_RESERVED\_FL). | ||
299 | * - | ||
300 | - Aggregate flags: | ||
301 | * - 0x4BDFFF | ||
302 | - User-visible flags. | ||
303 | * - 0x4B80FF | ||
304 | - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and | ||
305 | EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's | ||
306 | EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of | ||
307 | these flags in a special manner and they are masked out of the set of | ||
308 | flags that are saved directly to i\_flags. | ||
309 | |||
310 | .. _i_osd1: | ||
311 | |||
312 | The ``osd1`` field has multiple meanings depending on the creator: | ||
313 | |||
314 | Linux: | ||
315 | |||
316 | .. list-table:: | ||
317 | :widths: 1 1 1 77 | ||
318 | :header-rows: 1 | ||
319 | |||
320 | * - Offset | ||
321 | - Size | ||
322 | - Name | ||
323 | - Description | ||
324 | * - 0x0 | ||
325 | - \_\_le32 | ||
326 | - l\_i\_version | ||
327 | - Inode version. However, if the EA\_INODE inode flag is set, this inode | ||
328 | stores an extended attribute value and this field contains the upper 32 | ||
329 | bits of the attribute value's reference count. | ||
330 | |||
331 | Hurd: | ||
332 | |||
333 | .. list-table:: | ||
334 | :widths: 1 1 1 77 | ||
335 | :header-rows: 1 | ||
336 | |||
337 | * - Offset | ||
338 | - Size | ||
339 | - Name | ||
340 | - Description | ||
341 | * - 0x0 | ||
342 | - \_\_le32 | ||
343 | - h\_i\_translator | ||
344 | - ?? | ||
345 | |||
346 | Masix: | ||
347 | |||
348 | .. list-table:: | ||
349 | :widths: 1 1 1 77 | ||
350 | :header-rows: 1 | ||
351 | |||
352 | * - Offset | ||
353 | - Size | ||
354 | - Name | ||
355 | - Description | ||
356 | * - 0x0 | ||
357 | - \_\_le32 | ||
358 | - m\_i\_reserved | ||
359 | - ?? | ||
360 | |||
361 | .. _i_osd2: | ||
362 | |||
363 | The ``osd2`` field has multiple meanings depending on the filesystem creator: | ||
364 | |||
365 | Linux: | ||
366 | |||
367 | .. list-table:: | ||
368 | :widths: 1 1 1 77 | ||
369 | :header-rows: 1 | ||
370 | |||
371 | * - Offset | ||
372 | - Size | ||
373 | - Name | ||
374 | - Description | ||
375 | * - 0x0 | ||
376 | - \_\_le16 | ||
377 | - l\_i\_blocks\_high | ||
378 | - Upper 16-bits of the block count. Please see the note attached to | ||
379 | i\_blocks\_lo. | ||
380 | * - 0x2 | ||
381 | - \_\_le16 | ||
382 | - l\_i\_file\_acl\_high | ||
383 | - Upper 16-bits of the extended attribute block (historically, the file | ||
384 | ACL location). See the Extended Attributes section below. | ||
385 | * - 0x4 | ||
386 | - \_\_le16 | ||
387 | - l\_i\_uid\_high | ||
388 | - Upper 16-bits of the Owner UID. | ||
389 | * - 0x6 | ||
390 | - \_\_le16 | ||
391 | - l\_i\_gid\_high | ||
392 | - Upper 16-bits of the GID. | ||
393 | * - 0x8 | ||
394 | - \_\_le16 | ||
395 | - l\_i\_checksum\_lo | ||
396 | - Lower 16-bits of the inode checksum. | ||
397 | * - 0xA | ||
398 | - \_\_le16 | ||
399 | - l\_i\_reserved | ||
400 | - Unused. | ||
401 | |||
402 | Hurd: | ||
403 | |||
404 | .. list-table:: | ||
405 | :widths: 1 1 1 77 | ||
406 | :header-rows: 1 | ||
407 | |||
408 | * - Offset | ||
409 | - Size | ||
410 | - Name | ||
411 | - Description | ||
412 | * - 0x0 | ||
413 | - \_\_le16 | ||
414 | - h\_i\_reserved1 | ||
415 | - ?? | ||
416 | * - 0x2 | ||
417 | - \_\_u16 | ||
418 | - h\_i\_mode\_high | ||
419 | - Upper 16-bits of the file mode. | ||
420 | * - 0x4 | ||
421 | - \_\_le16 | ||
422 | - h\_i\_uid\_high | ||
423 | - Upper 16-bits of the Owner UID. | ||
424 | * - 0x6 | ||
425 | - \_\_le16 | ||
426 | - h\_i\_gid\_high | ||
427 | - Upper 16-bits of the GID. | ||
428 | * - 0x8 | ||
429 | - \_\_u32 | ||
430 | - h\_i\_author | ||
431 | - Author code? | ||
432 | |||
433 | Masix: | ||
434 | |||
435 | .. list-table:: | ||
436 | :widths: 1 1 1 77 | ||
437 | :header-rows: 1 | ||
438 | |||
439 | * - Offset | ||
440 | - Size | ||
441 | - Name | ||
442 | - Description | ||
443 | * - 0x0 | ||
444 | - \_\_le16 | ||
445 | - h\_i\_reserved1 | ||
446 | - ?? | ||
447 | * - 0x2 | ||
448 | - \_\_u16 | ||
449 | - m\_i\_file\_acl\_high | ||
450 | - Upper 16-bits of the extended attribute block (historically, the file | ||
451 | ACL location). | ||
452 | * - 0x4 | ||
453 | - \_\_u32 | ||
454 | - m\_i\_reserved2[2] | ||
455 | - ?? | ||
456 | |||
457 | Inode Size | ||
458 | ~~~~~~~~~~ | ||
459 | |||
460 | In ext2 and ext3, the inode structure size was fixed at 128 bytes | ||
461 | (``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of | ||
462 | 128 bytes. Starting with ext4, it is possible to allocate a larger | ||
463 | on-disk inode at format time for all inodes in the filesystem to provide | ||
464 | space beyond the end of the original ext2 inode. The on-disk inode | ||
465 | record size is recorded in the superblock as ``s_inode_size``. The | ||
466 | number of bytes actually used by struct ext4\_inode beyond the original | ||
467 | 128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each | ||
468 | inode, which allows struct ext4\_inode to grow for a new kernel without | ||
469 | having to upgrade all of the on-disk inodes. Access to fields beyond | ||
470 | EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within | ||
471 | ``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as | ||
472 | of October 2013) the inode structure is 156 bytes | ||
473 | (``i_extra_isize = 28``). The extra space between the end of the inode | ||
474 | structure and the end of the inode record can be used to store extended | ||
475 | attributes. Each inode record can be as large as the filesystem block | ||
476 | size, though this is not terribly efficient. | ||
477 | |||
478 | Finding an Inode | ||
479 | ~~~~~~~~~~~~~~~~ | ||
480 | |||
481 | Each block group contains ``sb->s_inodes_per_group`` inodes. Because | ||
482 | inode 0 is defined not to exist, this formula can be used to find the | ||
483 | block group that an inode lives in: | ||
484 | ``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode | ||
485 | can be found within the block group's inode table at | ||
486 | ``index = (inode_num - 1) % sb->s_inodes_per_group``. To get the byte | ||
487 | address within the inode table, use | ||
488 | ``offset = index * sb->s_inode_size``. | ||
489 | |||
490 | Inode Timestamps | ||
491 | ~~~~~~~~~~~~~~~~ | ||
492 | |||
493 | Four timestamps are recorded in the lower 128 bytes of the inode | ||
494 | structure -- inode change time (ctime), access time (atime), data | ||
495 | modification time (mtime), and deletion time (dtime). The four fields | ||
496 | are 32-bit signed integers that represent seconds since the Unix epoch | ||
497 | (1970-01-01 00:00:00 GMT), which means that the fields will overflow in | ||
498 | January 2038. For inodes that are not linked from any directory but are | ||
499 | still open (orphan inodes), the dtime field is overloaded for use with | ||
500 | the orphan list. The superblock field ``s_last_orphan`` points to the | ||
501 | first inode in the orphan list; dtime is then the number of the next | ||
502 | orphaned inode, or zero if there are no more orphans. | ||
503 | |||
504 | If the inode structure size ``sb->s_inode_size`` is larger than 128 | ||
505 | bytes and the ``i_extra_isize`` field is large enough to encompass the | ||
506 | respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime | ||
507 | inode fields are widened to 64 bits. Within this “extra” 32-bit field, | ||
508 | the lower two bits are used to extend the 32-bit seconds field to be 34 | ||
509 | bit wide; the upper 30 bits are used to provide nanosecond timestamp | ||
510 | accuracy. Therefore, timestamps should not overflow until May 2446. | ||
511 | dtime was not widened. There is also a fifth timestamp to record inode | ||
512 | creation time (crtime); this field is 64-bits wide and decoded in the | ||
513 | same manner as 64-bit [cma]time. Neither crtime nor dtime is accessible | ||
514 | through the regular stat() interface, though debugfs will report them. | ||
515 | |||
516 | We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)). | ||
517 | In other words: | ||
518 | |||
519 | .. list-table:: | ||
520 | :widths: 20 20 20 20 20 | ||
521 | :header-rows: 1 | ||
522 | |||
523 | * - Extra epoch bits | ||
524 | - MSB of 32-bit time | ||
525 | - Adjustment for signed 32-bit to 64-bit tv\_sec | ||
526 | - Decoded 64-bit tv\_sec | ||
527 | - valid time range | ||
528 | * - 0 0 | ||
529 | - 1 | ||
530 | - 0 | ||
531 | - ``-0x80000000 - -0x00000001`` | ||
532 | - 1901-12-13 to 1969-12-31 | ||
533 | * - 0 0 | ||
534 | - 0 | ||
535 | - 0 | ||
536 | - ``0x000000000 - 0x07fffffff`` | ||
537 | - 1970-01-01 to 2038-01-19 | ||
538 | * - 0 1 | ||
539 | - 1 | ||
540 | - 0x100000000 | ||
541 | - ``0x080000000 - 0x0ffffffff`` | ||
542 | - 2038-01-19 to 2106-02-07 | ||
543 | * - 0 1 | ||
544 | - 0 | ||
545 | - 0x100000000 | ||
546 | - ``0x100000000 - 0x17fffffff`` | ||
547 | - 2106-02-07 to 2174-02-25 | ||
548 | * - 1 0 | ||
549 | - 1 | ||
550 | - 0x200000000 | ||
551 | - ``0x180000000 - 0x1ffffffff`` | ||
552 | - 2174-02-25 to 2242-03-16 | ||
553 | * - 1 0 | ||
554 | - 0 | ||
555 | - 0x200000000 | ||
556 | - ``0x200000000 - 0x27fffffff`` | ||
557 | - 2242-03-16 to 2310-04-04 | ||
558 | * - 1 1 | ||
559 | - 1 | ||
560 | - 0x300000000 | ||
561 | - ``0x280000000 - 0x2ffffffff`` | ||
562 | - 2310-04-04 to 2378-04-22 | ||
563 | * - 1 1 | ||
564 | - 0 | ||
565 | - 0x300000000 | ||
566 | - ``0x300000000 - 0x37fffffff`` | ||
567 | - 2378-04-22 to 2446-05-10 | ||
568 | |||
569 | This is a somewhat odd encoding since there are effectively seven times | ||
570 | as many positive values as negative values. There have also been | ||
571 | long-standing bugs decoding and encoding dates beyond 2038, which don't | ||
572 | seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels | ||
573 | incorrectly use the extra epoch bits 1,1 for dates between 1901 and | ||
574 | 1970. At some point the kernel will be fixed and e2fsck will fix this | ||
575 | situation, assuming that it is run before 2310. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst new file mode 100644 index 000000000000..e7031af86876 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/journal.rst | |||
@@ -0,0 +1,611 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Journal (jbd2) | ||
4 | -------------- | ||
5 | |||
6 | Introduced in ext3, the ext4 filesystem employs a journal to protect the | ||
7 | filesystem against corruption in the case of a system crash. A small | ||
8 | contiguous region of disk (default 128MiB) is reserved inside the | ||
9 | filesystem as a place to land “important” data writes on-disk as quickly | ||
10 | as possible. Once the important data transaction is fully written to the | ||
11 | disk and flushed from the disk write cache, a record of the data being | ||
12 | committed is also written to the journal. At some later point in time, | ||
13 | the journal code writes the transactions to their final locations on | ||
14 | disk (this could involve a lot of seeking or a lot of small | ||
15 | read-write-erases) before erasing the commit record. Should the system | ||
16 | crash during the second slow write, the journal can be replayed all the | ||
17 | way to the latest commit record, guaranteeing the atomicity of whatever | ||
18 | gets written through the journal to the disk. The effect of this is to | ||
19 | guarantee that the filesystem does not become stuck midway through a | ||
20 | metadata update. | ||
21 | |||
22 | For performance reasons, ext4 by default only writes filesystem metadata | ||
23 | through the journal. This means that file data blocks are *not* | ||
24 | guaranteed to be in any consistent state after a crash. If this default | ||
25 | guarantee level (``data=ordered``) is not satisfactory, there is a mount | ||
26 | option to control journal behavior. If ``data=journal``, all data and | ||
27 | metadata are written to disk through the journal. This is slower but | ||
28 | safest. If ``data=writeback``, dirty data blocks are not flushed to the | ||
29 | disk before the metadata are written to disk through the journal. | ||
30 | |||
31 | The journal inode is typically inode 8. The first 68 bytes of the | ||
32 | journal inode are replicated in the ext4 superblock. The journal itself | ||
33 | is a normal (but hidden) file within the filesystem. The file usually | ||
34 | consumes an entire block group, though mke2fs tries to put it in the | ||
35 | middle of the disk. | ||
36 | |||
37 | All fields in jbd2 are written to disk in big-endian order. This is the | ||
38 | opposite of ext4. | ||
39 | |||
40 | NOTE: Both ext4 and ocfs2 use jbd2. | ||
41 | |||
42 | The maximum size of a journal embedded in an ext4 filesystem is 2^32 | ||
43 | blocks. jbd2 itself does not seem to care. | ||
44 | |||
45 | Layout | ||
46 | ~~~~~~ | ||
47 | |||
48 | Generally speaking, the journal has this format: | ||
49 | |||
50 | .. list-table:: | ||
51 | :widths: 1 1 78 | ||
52 | :header-rows: 1 | ||
53 | |||
54 | * - Superblock | ||
55 | - descriptor\_block (data\_blocks or revocation\_block) [more data or | ||
56 | revocations] commit\_block | ||
57 | - [more transactions...] | ||
58 | * - | ||
59 | - One transaction | ||
60 | - | ||
61 | |||
62 | Notice that a transaction begins with either a descriptor and some data, | ||
63 | or a block revocation list. A finished transaction always ends with a | ||
64 | commit. If there is no commit record (or the checksums don't match), the | ||
65 | transaction will be discarded during replay. | ||
66 | |||
67 | External Journal | ||
68 | ~~~~~~~~~~~~~~~~ | ||
69 | |||
70 | Optionally, an ext4 filesystem can be created with an external journal | ||
71 | device (as opposed to an internal journal, which uses a reserved inode). | ||
72 | In this case, on the filesystem device, ``s_journal_inum`` should be | ||
73 | zero and ``s_journal_uuid`` should be set. On the journal device there | ||
74 | will be an ext4 super block in the usual place, with a matching UUID. | ||
75 | The journal superblock will be in the next full block after the | ||
76 | superblock. | ||
77 | |||
78 | .. list-table:: | ||
79 | :widths: 1 1 1 1 76 | ||
80 | :header-rows: 1 | ||
81 | |||
82 | * - 1024 bytes of padding | ||
83 | - ext4 Superblock | ||
84 | - Journal Superblock | ||
85 | - descriptor\_block (data\_blocks or revocation\_block) [more data or | ||
86 | revocations] commit\_block | ||
87 | - [more transactions...] | ||
88 | * - | ||
89 | - | ||
90 | - | ||
91 | - One transaction | ||
92 | - | ||
93 | |||
94 | Block Header | ||
95 | ~~~~~~~~~~~~ | ||
96 | |||
97 | Every block in the journal starts with a common 12-byte header | ||
98 | ``struct journal_header_s``: | ||
99 | |||
100 | .. list-table:: | ||
101 | :widths: 1 1 1 77 | ||
102 | :header-rows: 1 | ||
103 | |||
104 | * - Offset | ||
105 | - Type | ||
106 | - Name | ||
107 | - Description | ||
108 | * - 0x0 | ||
109 | - \_\_be32 | ||
110 | - h\_magic | ||
111 | - jbd2 magic number, 0xC03B3998. | ||
112 | * - 0x4 | ||
113 | - \_\_be32 | ||
114 | - h\_blocktype | ||
115 | - Description of what this block contains. See the jbd2_blocktype_ table | ||
116 | below. | ||
117 | * - 0x8 | ||
118 | - \_\_be32 | ||
119 | - h\_sequence | ||
120 | - The transaction ID that goes with this block. | ||
121 | |||
122 | .. _jbd2_blocktype: | ||
123 | |||
124 | The journal block type can be any one of: | ||
125 | |||
126 | .. list-table:: | ||
127 | :widths: 1 79 | ||
128 | :header-rows: 1 | ||
129 | |||
130 | * - Value | ||
131 | - Description | ||
132 | * - 1 | ||
133 | - Descriptor. This block precedes a series of data blocks that were | ||
134 | written through the journal during a transaction. | ||
135 | * - 2 | ||
136 | - Block commit record. This block signifies the completion of a | ||
137 | transaction. | ||
138 | * - 3 | ||
139 | - Journal superblock, v1. | ||
140 | * - 4 | ||
141 | - Journal superblock, v2. | ||
142 | * - 5 | ||
143 | - Block revocation records. This speeds up recovery by enabling the | ||
144 | journal to skip writing blocks that were subsequently rewritten. | ||
145 | |||
146 | Super Block | ||
147 | ~~~~~~~~~~~ | ||
148 | |||
149 | The super block for the journal is much simpler than ext4's. | ||
150 | The key data kept within are the size of the journal and where to find the | ||
151 | start of the log of transactions. | ||
152 | |||
153 | The journal superblock is recorded as ``struct journal_superblock_s``, | ||
154 | which is 1024 bytes long: | ||
155 | |||
156 | .. list-table:: | ||
157 | :widths: 1 1 1 77 | ||
158 | :header-rows: 1 | ||
159 | |||
160 | * - Offset | ||
161 | - Type | ||
162 | - Name | ||
163 | - Description | ||
164 | * - | ||
165 | - | ||
166 | - | ||
167 | - Static information describing the journal. | ||
168 | * - 0x0 | ||
169 | - journal\_header\_t (12 bytes) | ||
170 | - s\_header | ||
171 | - Common header identifying this as a superblock. | ||
172 | * - 0xC | ||
173 | - \_\_be32 | ||
174 | - s\_blocksize | ||
175 | - Journal device block size. | ||
176 | * - 0x10 | ||
177 | - \_\_be32 | ||
178 | - s\_maxlen | ||
179 | - Total number of blocks in this journal. | ||
180 | * - 0x14 | ||
181 | - \_\_be32 | ||
182 | - s\_first | ||
183 | - First block of log information. | ||
184 | * - | ||
185 | - | ||
186 | - | ||
187 | - Dynamic information describing the current state of the log. | ||
188 | * - 0x18 | ||
189 | - \_\_be32 | ||
190 | - s\_sequence | ||
191 | - First commit ID expected in log. | ||
192 | * - 0x1C | ||
193 | - \_\_be32 | ||
194 | - s\_start | ||
195 | - Block number of the start of log. Contrary to the comments, this field | ||
196 | being zero does not imply that the journal is clean! | ||
197 | * - 0x20 | ||
198 | - \_\_be32 | ||
199 | - s\_errno | ||
200 | - Error value, as set by jbd2\_journal\_abort(). | ||
201 | * - | ||
202 | - | ||
203 | - | ||
204 | - The remaining fields are only valid in a v2 superblock. | ||
205 | * - 0x24 | ||
206 | - \_\_be32 | ||
207 | - s\_feature\_compat | ||
208 | - Compatible feature set. See the table jbd2_compat_ below. | ||
209 | * - 0x28 | ||
210 | - \_\_be32 | ||
211 | - s\_feature\_incompat | ||
212 | - Incompatible feature set. See the table jbd2_incompat_ below. | ||
213 | * - 0x2C | ||
214 | - \_\_be32 | ||
215 | - s\_feature\_ro\_compat | ||
216 | - Read-only compatible feature set. There aren't any of these currently. | ||
217 | * - 0x30 | ||
218 | - \_\_u8 | ||
219 | - s\_uuid[16] | ||
220 | - 128-bit uuid for journal. This is compared against the copy in the ext4 | ||
221 | super block at mount time. | ||
222 | * - 0x40 | ||
223 | - \_\_be32 | ||
224 | - s\_nr\_users | ||
225 | - Number of file systems sharing this journal. | ||
226 | * - 0x44 | ||
227 | - \_\_be32 | ||
228 | - s\_dynsuper | ||
229 | - Location of dynamic super block copy. (Not used?) | ||
230 | * - 0x48 | ||
231 | - \_\_be32 | ||
232 | - s\_max\_transaction | ||
233 | - Limit of journal blocks per transaction. (Not used?) | ||
234 | * - 0x4C | ||
235 | - \_\_be32 | ||
236 | - s\_max\_trans\_data | ||
237 | - Limit of data blocks per transaction. (Not used?) | ||
238 | * - 0x50 | ||
239 | - \_\_u8 | ||
240 | - s\_checksum\_type | ||
241 | - Checksum algorithm used for the journal. See jbd2_checksum_type_ for | ||
242 | more info. | ||
243 | * - 0x51 | ||
244 | - \_\_u8[3] | ||
245 | - s\_padding2 | ||
246 | - | ||
247 | * - 0x54 | ||
248 | - \_\_u32 | ||
249 | - s\_padding[42] | ||
250 | - | ||
251 | * - 0xFC | ||
252 | - \_\_be32 | ||
253 | - s\_checksum | ||
254 | - Checksum of the entire superblock, with this field set to zero. | ||
255 | * - 0x100 | ||
256 | - \_\_u8 | ||
257 | - s\_users[16\*48] | ||
258 | - ids of all file systems sharing the log. e2fsprogs/Linux don't allow | ||
259 | shared external journals, but I imagine Lustre (or ocfs2?), which use | ||
260 | the jbd2 code, might. | ||
261 | |||
262 | .. _jbd2_compat: | ||
263 | |||
264 | The journal compat features are any combination of the following: | ||
265 | |||
266 | .. list-table:: | ||
267 | :widths: 1 79 | ||
268 | :header-rows: 1 | ||
269 | |||
270 | * - Value | ||
271 | - Description | ||
272 | * - 0x1 | ||
273 | - Journal maintains checksums on the data blocks. | ||
274 | (JBD2\_FEATURE\_COMPAT\_CHECKSUM) | ||
275 | |||
276 | .. _jbd2_incompat: | ||
277 | |||
278 | The journal incompat features are any combination of the following: | ||
279 | |||
280 | .. list-table:: | ||
281 | :widths: 1 79 | ||
282 | :header-rows: 1 | ||
283 | |||
284 | * - Value | ||
285 | - Description | ||
286 | * - 0x1 | ||
287 | - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE) | ||
288 | * - 0x2 | ||
289 | - Journal can deal with 64-bit block numbers. | ||
290 | (JBD2\_FEATURE\_INCOMPAT\_64BIT) | ||
291 | * - 0x4 | ||
292 | - Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT) | ||
293 | * - 0x8 | ||
294 | - This journal uses v2 of the checksum on-disk format. Each journal | ||
295 | metadata block gets its own checksum, and the block tags in the | ||
296 | descriptor table contain checksums for each of the data blocks in the | ||
297 | journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2) | ||
298 | * - 0x10 | ||
299 | - This journal uses v3 of the checksum on-disk format. This is the same as | ||
300 | v2, but the journal block tag size is fixed regardless of the size of | ||
301 | block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3) | ||
302 | |||
303 | .. _jbd2_checksum_type: | ||
304 | |||
305 | Journal checksum type codes are one of the following. crc32 or crc32c are the | ||
306 | most likely choices. | ||
307 | |||
308 | .. list-table:: | ||
309 | :widths: 1 79 | ||
310 | :header-rows: 1 | ||
311 | |||
312 | * - Value | ||
313 | - Description | ||
314 | * - 1 | ||
315 | - CRC32 | ||
316 | * - 2 | ||
317 | - MD5 | ||
318 | * - 3 | ||
319 | - SHA1 | ||
320 | * - 4 | ||
321 | - CRC32C | ||
322 | |||
323 | Descriptor Block | ||
324 | ~~~~~~~~~~~~~~~~ | ||
325 | |||
326 | The descriptor block contains an array of journal block tags that | ||
327 | describe the final locations of the data blocks that follow in the | ||
328 | journal. Descriptor blocks are open-coded instead of being completely | ||
329 | described by a data structure, but here is the block structure anyway. | ||
330 | Descriptor blocks consume at least 36 bytes, but use a full block: | ||
331 | |||
332 | .. list-table:: | ||
333 | :widths: 1 1 1 77 | ||
334 | :header-rows: 1 | ||
335 | |||
336 | * - Offset | ||
337 | - Type | ||
338 | - Name | ||
339 | - Description | ||
340 | * - 0x0 | ||
341 | - journal\_header\_t | ||
342 | - (open coded) | ||
343 | - Common block header. | ||
344 | * - 0xC | ||
345 | - struct journal\_block\_tag\_s | ||
346 | - open coded array[] | ||
347 | - Enough tags either to fill up the block or to describe all the data | ||
348 | blocks that follow this descriptor block. | ||
349 | |||
350 | Journal block tags have any of the following formats, depending on which | ||
351 | journal feature and block tag flags are set. | ||
352 | |||
353 | If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is | ||
354 | defined as ``struct journal_block_tag3_s``, which looks like the | ||
355 | following. The size is 16 or 32 bytes. | ||
356 | |||
357 | .. list-table:: | ||
358 | :widths: 1 1 1 77 | ||
359 | :header-rows: 1 | ||
360 | |||
361 | * - Offset | ||
362 | - Type | ||
363 | - Name | ||
364 | - Description | ||
365 | * - 0x0 | ||
366 | - \_\_be32 | ||
367 | - t\_blocknr | ||
368 | - Lower 32-bits of the location of where the corresponding data block | ||
369 | should end up on disk. | ||
370 | * - 0x4 | ||
371 | - \_\_be32 | ||
372 | - t\_flags | ||
373 | - Flags that go with the descriptor. See the table jbd2_tag_flags_ for | ||
374 | more info. | ||
375 | * - 0x8 | ||
376 | - \_\_be32 | ||
377 | - t\_blocknr\_high | ||
378 | - Upper 32-bits of the location of where the corresponding data block | ||
379 | should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is | ||
380 | not enabled. | ||
381 | * - 0xC | ||
382 | - \_\_be32 | ||
383 | - t\_checksum | ||
384 | - Checksum of the journal UUID, the sequence number, and the data block. | ||
385 | * - | ||
386 | - | ||
387 | - | ||
388 | - This field appears to be open coded. It always comes at the end of the | ||
389 | tag, after t_checksum. This field is not present if the "same UUID" flag | ||
390 | is set. | ||
391 | * - 0x10 | ||
392 | - char | ||
393 | - uuid[16] | ||
394 | - A UUID to go with this tag. This field appears to be copied from the | ||
395 | ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that | ||
396 | field. | ||
397 | |||
398 | .. _jbd2_tag_flags: | ||
399 | |||
400 | The journal tag flags are any combination of the following: | ||
401 | |||
402 | .. list-table:: | ||
403 | :widths: 1 79 | ||
404 | :header-rows: 1 | ||
405 | |||
406 | * - Value | ||
407 | - Description | ||
408 | * - 0x1 | ||
409 | - On-disk block is escaped. The first four bytes of the data block just | ||
410 | happened to match the jbd2 magic number. | ||
411 | * - 0x2 | ||
412 | - This block has the same UUID as previous, therefore the UUID field is | ||
413 | omitted. | ||
414 | * - 0x4 | ||
415 | - The data block was deleted by the transaction. (Not used?) | ||
416 | * - 0x8 | ||
417 | - This is the last tag in this descriptor block. | ||
418 | |||
419 | If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag | ||
420 | is defined as ``struct journal_block_tag_s``, which looks like the | ||
421 | following. The size is 8, 12, 24, or 28 bytes: | ||
422 | |||
423 | .. list-table:: | ||
424 | :widths: 1 1 1 77 | ||
425 | :header-rows: 1 | ||
426 | |||
427 | * - Offset | ||
428 | - Type | ||
429 | - Name | ||
430 | - Description | ||
431 | * - 0x0 | ||
432 | - \_\_be32 | ||
433 | - t\_blocknr | ||
434 | - Lower 32-bits of the location of where the corresponding data block | ||
435 | should end up on disk. | ||
436 | * - 0x4 | ||
437 | - \_\_be16 | ||
438 | - t\_checksum | ||
439 | - Checksum of the journal UUID, the sequence number, and the data block. | ||
440 | Note that only the lower 16 bits are stored. | ||
441 | * - 0x6 | ||
442 | - \_\_be16 | ||
443 | - t\_flags | ||
444 | - Flags that go with the descriptor. See the table jbd2_tag_flags_ for | ||
445 | more info. | ||
446 | * - | ||
447 | - | ||
448 | - | ||
449 | - This next field is only present if the super block indicates support for | ||
450 | 64-bit block numbers. | ||
451 | * - 0x8 | ||
452 | - \_\_be32 | ||
453 | - t\_blocknr\_high | ||
454 | - Upper 32-bits of the location of where the corresponding data block | ||
455 | should end up on disk. | ||
456 | * - | ||
457 | - | ||
458 | - | ||
459 | - This field appears to be open coded. It always comes at the end of the | ||
460 | tag, after t_flags or t_blocknr_high. This field is not present if the | ||
461 | "same UUID" flag is set. | ||
462 | * - 0x8 or 0xC | ||
463 | - char | ||
464 | - uuid[16] | ||
465 | - A UUID to go with this tag. This field appears to be copied from the | ||
466 | ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that | ||
467 | field. | ||
468 | |||
469 | If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or | ||
470 | JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a | ||
471 | ``struct jbd2_journal_block_tail``, which looks like this: | ||
472 | |||
473 | .. list-table:: | ||
474 | :widths: 1 1 1 77 | ||
475 | :header-rows: 1 | ||
476 | |||
477 | * - Offset | ||
478 | - Type | ||
479 | - Name | ||
480 | - Description | ||
481 | * - 0x0 | ||
482 | - \_\_be32 | ||
483 | - t\_checksum | ||
484 | - Checksum of the journal UUID + the descriptor block, with this field set | ||
485 | to zero. | ||
486 | |||
487 | Data Block | ||
488 | ~~~~~~~~~~ | ||
489 | |||
490 | In general, the data blocks being written to disk through the journal | ||
491 | are written verbatim into the journal file after the descriptor block. | ||
492 | However, if the first four bytes of the block match the jbd2 magic | ||
493 | number then those four bytes are replaced with zeroes and the “escaped” | ||
494 | flag is set in the descriptor block tag. | ||
495 | |||
496 | Revocation Block | ||
497 | ~~~~~~~~~~~~~~~~ | ||
498 | |||
499 | A revocation block is used to prevent replay of a block in an earlier | ||
500 | transaction. This is used to mark blocks that were journalled at one | ||
501 | time but are no longer journalled. Typically this happens if a metadata | ||
502 | block is freed and re-allocated as a file data block; in this case, a | ||
503 | journal replay after the file block was written to disk will cause | ||
504 | corruption. | ||
505 | |||
506 | **NOTE**: This mechanism is NOT used to express “this journal block is | ||
507 | superseded by this other journal block”, as the author (djwong) | ||
508 | mistakenly thought. Any block being added to a transaction will cause | ||
509 | the removal of all existing revocation records for that block. | ||
510 | |||
511 | Revocation blocks are described in | ||
512 | ``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in | ||
513 | length, but use a full block: | ||
514 | |||
515 | .. list-table:: | ||
516 | :widths: 1 1 1 77 | ||
517 | :header-rows: 1 | ||
518 | |||
519 | * - Offset | ||
520 | - Type | ||
521 | - Name | ||
522 | - Description | ||
523 | * - 0x0 | ||
524 | - journal\_header\_t | ||
525 | - r\_header | ||
526 | - Common block header. | ||
527 | * - 0xC | ||
528 | - \_\_be32 | ||
529 | - r\_count | ||
530 | - Number of bytes used in this block. | ||
531 | * - 0x10 | ||
532 | - \_\_be32 or \_\_be64 | ||
533 | - blocks[0] | ||
534 | - Blocks to revoke. | ||
535 | |||
536 | After r\_count is a linear array of block numbers that are effectively | ||
537 | revoked by this transaction. The size of each block number is 8 bytes if | ||
538 | the superblock advertises 64-bit block number support, or 4 bytes | ||
539 | otherwise. | ||
540 | |||
541 | If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or | ||
542 | JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation | ||
543 | block is a ``struct jbd2_journal_revoke_tail``, which has this format: | ||
544 | |||
545 | .. list-table:: | ||
546 | :widths: 1 1 1 77 | ||
547 | :header-rows: 1 | ||
548 | |||
549 | * - Offset | ||
550 | - Type | ||
551 | - Name | ||
552 | - Description | ||
553 | * - 0x0 | ||
554 | - \_\_be32 | ||
555 | - r\_checksum | ||
556 | - Checksum of the journal UUID + revocation block | ||
557 | |||
558 | Commit Block | ||
559 | ~~~~~~~~~~~~ | ||
560 | |||
561 | The commit block is a sentry that indicates that a transaction has been | ||
562 | completely written to the journal. Once this commit block reaches the | ||
563 | journal, the data stored with this transaction can be written to their | ||
564 | final locations on disk. | ||
565 | |||
566 | The commit block is described by ``struct commit_header``, whose fields | ||
567 | occupy 60 bytes (although the commit block uses a full block): | ||
568 | |||
569 | .. list-table:: | ||
570 | :widths: 1 1 1 77 | ||
571 | :header-rows: 1 | ||
572 | |||
573 | * - Offset | ||
574 | - Type | ||
575 | - Name | ||
576 | - Description | ||
577 | * - 0x0 | ||
578 | - journal\_header\_s | ||
579 | - (open coded) | ||
580 | - Common block header. | ||
581 | * - 0xC | ||
582 | - unsigned char | ||
583 | - h\_chksum\_type | ||
584 | - The type of checksum to use to verify the integrity of the data blocks | ||
585 | in the transaction. See jbd2_checksum_type_ for more info. | ||
586 | * - 0xD | ||
587 | - unsigned char | ||
588 | - h\_chksum\_size | ||
589 | - The number of bytes used by the checksum. Most likely 4. | ||
590 | * - 0xE | ||
591 | - unsigned char | ||
592 | - h\_padding[2] | ||
593 | - | ||
594 | * - 0x10 | ||
595 | - \_\_be32 | ||
596 | - h\_chksum[JBD2\_CHECKSUM\_BYTES] | ||
597 | - 32 bytes of space to store checksums. If | ||
598 | JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 | ||
599 | are set, the first ``__be32`` is the checksum of the journal UUID and | ||
600 | the entire commit block, with this field zeroed. If | ||
601 | JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the | ||
602 | crc32 of all the blocks already written to the transaction. | ||
603 | * - 0x30 | ||
604 | - \_\_be64 | ||
605 | - h\_commit\_sec | ||
606 | - The time that the transaction was committed, in seconds since the epoch. | ||
607 | * - 0x38 | ||
608 | - \_\_be32 | ||
609 | - h\_commit\_nsec | ||
610 | - Nanoseconds component of the above timestamp. | ||
611 | |||
diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst new file mode 100644 index 000000000000..b7d7a3137f80 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/mmp.rst | |||
@@ -0,0 +1,77 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Multiple Mount Protection | ||
4 | ------------------------- | ||
5 | |||
6 | Multiple mount protection (MMP) is a feature that protects the | ||
7 | filesystem against multiple hosts trying to use the filesystem | ||
8 | simultaneously. When a filesystem is opened (for mounting, or fsck, | ||
9 | etc.), the MMP code running on the node (call it node A) checks a | ||
10 | sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the | ||
11 | open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then | ||
12 | fsck is (hopefully) running, and open fails immediately. Otherwise, the | ||
13 | open code will wait for twice the specified MMP check interval and check | ||
14 | the sequence number again. If the sequence number has changed, then the | ||
15 | filesystem is active on another machine and the open fails. If the MMP | ||
16 | code passes all of those checks, a new MMP sequence number is generated | ||
17 | and written to the MMP block, and the mount proceeds. | ||
18 | |||
19 | While the filesystem is live, the kernel sets up a timer to re-check the | ||
20 | MMP block at the specified MMP check interval. To perform the re-check, | ||
21 | the MMP sequence number is re-read; if it does not match the in-memory | ||
22 | MMP sequence number, then another node (node B) has mounted the | ||
23 | filesystem, and node A remounts the filesystem read-only. If the | ||
24 | sequence numbers match, the sequence number is incremented both in | ||
25 | memory and on disk, and the re-check is complete. | ||
26 | |||
27 | The hostname and device filename are written into the MMP block whenever | ||
28 | an open operation succeeds. The MMP code does not use these values; they | ||
29 | are provided purely for informational purposes. | ||
30 | |||
31 | The checksum is calculated against the FS UUID and the MMP structure. | ||
32 | The MMP structure (``struct mmp_struct``) is as follows: | ||
33 | |||
34 | .. list-table:: | ||
35 | :widths: 1 1 1 77 | ||
36 | :header-rows: 1 | ||
37 | |||
38 | * - Offset | ||
39 | - Type | ||
40 | - Name | ||
41 | - Description | ||
42 | * - 0x0 | ||
43 | - \_\_le32 | ||
44 | - mmp\_magic | ||
45 | - Magic number for MMP, 0x004D4D50 (“MMP”). | ||
46 | * - 0x4 | ||
47 | - \_\_le32 | ||
48 | - mmp\_seq | ||
49 | - Sequence number, updated periodically. | ||
50 | * - 0x8 | ||
51 | - \_\_le64 | ||
52 | - mmp\_time | ||
53 | - Time that the MMP block was last updated. | ||
54 | * - 0x10 | ||
55 | - char[64] | ||
56 | - mmp\_nodename | ||
57 | - Hostname of the node that opened the filesystem. | ||
58 | * - 0x50 | ||
59 | - char[32] | ||
60 | - mmp\_bdevname | ||
61 | - Block device name of the filesystem. | ||
62 | * - 0x70 | ||
63 | - \_\_le16 | ||
64 | - mmp\_check\_interval | ||
65 | - The MMP re-check interval, in seconds. | ||
66 | * - 0x72 | ||
67 | - \_\_le16 | ||
68 | - mmp\_pad1 | ||
69 | - Zero. | ||
70 | * - 0x74 | ||
71 | - \_\_le32[226] | ||
72 | - mmp\_pad2 | ||
73 | - Zero. | ||
74 | * - 0x3FC | ||
75 | - \_\_le32 | ||
76 | - mmp\_checksum | ||
77 | - Checksum of the MMP block. | ||
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/ondisk/overview.rst new file mode 100644 index 000000000000..cbab18baba12 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/overview.rst | |||
@@ -0,0 +1,26 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | High Level Design | ||
4 | ================= | ||
5 | |||
6 | An ext4 file system is split into a series of block groups. To reduce | ||
7 | performance difficulties due to fragmentation, the block allocator tries | ||
8 | very hard to keep each file's blocks within the same group, thereby | ||
9 | reducing seek times. The size of a block group is specified in | ||
10 | ``sb.s_blocks_per_group`` blocks, though it can also be calculated as 8 \* | ||
11 | ``block_size_in_bytes``. With the default block size of 4KiB, each group | ||
12 | will contain 32,768 blocks, for a length of 128MiB. The number of block | ||
13 | groups is the size of the device divided by the size of a block group. | ||
14 | |||
15 | All fields in ext4 are written to disk in little-endian order. HOWEVER, | ||
16 | all fields in jbd2 (the journal) are written to disk in big-endian | ||
17 | order. | ||
18 | |||
19 | .. include:: blocks.rst | ||
20 | .. include:: blockgroup.rst | ||
21 | .. include:: special_inodes.rst | ||
22 | .. include:: allocators.rst | ||
23 | .. include:: checksums.rst | ||
24 | .. include:: bigalloc.rst | ||
25 | .. include:: inlinedata.rst | ||
26 | .. include:: eainode.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst new file mode 100644 index 000000000000..a82f70c9baeb --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/special_inodes.rst | |||
@@ -0,0 +1,38 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Special inodes | ||
4 | -------------- | ||
5 | |||
6 | ext4 reserves some inodes for special features, as follows: | ||
7 | |||
8 | .. list-table:: | ||
9 | :widths: 1 79 | ||
10 | :header-rows: 1 | ||
11 | |||
12 | * - inode Number | ||
13 | - Purpose | ||
14 | * - 0 | ||
15 | - Doesn't exist; there is no inode 0. | ||
16 | * - 1 | ||
17 | - List of defective blocks. | ||
18 | * - 2 | ||
19 | - Root directory. | ||
20 | * - 3 | ||
21 | - User quota. | ||
22 | * - 4 | ||
23 | - Group quota. | ||
24 | * - 5 | ||
25 | - Boot loader. | ||
26 | * - 6 | ||
27 | - Undelete directory. | ||
28 | * - 7 | ||
29 | - Reserved group descriptors inode. (“resize inode”) | ||
30 | * - 8 | ||
31 | - Journal inode. | ||
32 | * - 9 | ||
33 | - The “exclude” inode, for snapshots(?) | ||
34 | * - 10 | ||
35 | - Replica inode, used for some non-upstream feature? | ||
36 | * - 11 | ||
37 | - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock. | ||
38 | |||
diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst new file mode 100644 index 000000000000..5f81dd87e0b9 --- /dev/null +++ b/Documentation/filesystems/ext4/ondisk/super.rst | |||
@@ -0,0 +1,801 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | Super Block | ||
4 | ----------- | ||
5 | |||
6 | The superblock records various information about the enclosing | ||
7 | filesystem, such as block counts, inode counts, supported features, | ||
8 | maintenance information, and more. | ||
9 | |||
10 | If the sparse\_super feature flag is set, redundant copies of the | ||
11 | superblock and group descriptors are kept only in the groups whose group | ||
12 | number is either 0 or a power of 3, 5, or 7. If the flag is not set, | ||
13 | redundant copies are kept in all groups. | ||
14 | |||
15 | The superblock checksum is calculated against the superblock structure, | ||
16 | which includes the FS UUID. | ||
17 | |||
18 | The ext4 superblock is laid out as follows in | ||
19 | ``struct ext4_super_block``: | ||
20 | |||
21 | .. list-table:: | ||
22 | :widths: 1 1 1 77 | ||
23 | :header-rows: 1 | ||
24 | |||
25 | * - Offset | ||
26 | - Size | ||
27 | - Name | ||
28 | - Description | ||
29 | * - 0x0 | ||
30 | - \_\_le32 | ||
31 | - s\_inodes\_count | ||
32 | - Total inode count. | ||
33 | * - 0x4 | ||
34 | - \_\_le32 | ||
35 | - s\_blocks\_count\_lo | ||
36 | - Total block count. | ||
37 | * - 0x8 | ||
38 | - \_\_le32 | ||
39 | - s\_r\_blocks\_count\_lo | ||
40 | - This number of blocks can only be allocated by the super-user. | ||
41 | * - 0xC | ||
42 | - \_\_le32 | ||
43 | - s\_free\_blocks\_count\_lo | ||
44 | - Free block count. | ||
45 | * - 0x10 | ||
46 | - \_\_le32 | ||
47 | - s\_free\_inodes\_count | ||
48 | - Free inode count. | ||
49 | * - 0x14 | ||
50 | - \_\_le32 | ||
51 | - s\_first\_data\_block | ||
52 | - First data block. This must be at least 1 for 1k-block filesystems and | ||
53 | is typically 0 for all other block sizes. | ||
54 | * - 0x18 | ||
55 | - \_\_le32 | ||
56 | - s\_log\_block\_size | ||
57 | - Block size is 2 ^ (10 + s\_log\_block\_size). | ||
58 | * - 0x1C | ||
59 | - \_\_le32 | ||
60 | - s\_log\_cluster\_size | ||
61 | - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is | ||
62 | enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. | ||
63 | * - 0x20 | ||
64 | - \_\_le32 | ||
65 | - s\_blocks\_per\_group | ||
66 | - Blocks per group. | ||
67 | * - 0x24 | ||
68 | - \_\_le32 | ||
69 | - s\_clusters\_per\_group | ||
70 | - Clusters per group, if bigalloc is enabled. Otherwise | ||
71 | s\_clusters\_per\_group must equal s\_blocks\_per\_group. | ||
72 | * - 0x28 | ||
73 | - \_\_le32 | ||
74 | - s\_inodes\_per\_group | ||
75 | - Inodes per group. | ||
76 | * - 0x2C | ||
77 | - \_\_le32 | ||
78 | - s\_mtime | ||
79 | - Mount time, in seconds since the epoch. | ||
80 | * - 0x30 | ||
81 | - \_\_le32 | ||
82 | - s\_wtime | ||
83 | - Write time, in seconds since the epoch. | ||
84 | * - 0x34 | ||
85 | - \_\_le16 | ||
86 | - s\_mnt\_count | ||
87 | - Number of mounts since the last fsck. | ||
88 | * - 0x36 | ||
89 | - \_\_le16 | ||
90 | - s\_max\_mnt\_count | ||
91 | - Number of mounts beyond which a fsck is needed. | ||
92 | * - 0x38 | ||
93 | - \_\_le16 | ||
94 | - s\_magic | ||
95 | - Magic signature, 0xEF53 | ||
96 | * - 0x3A | ||
97 | - \_\_le16 | ||
98 | - s\_state | ||
99 | - File system state. See super_state_ for more info. | ||
100 | * - 0x3C | ||
101 | - \_\_le16 | ||
102 | - s\_errors | ||
103 | - Behaviour when detecting errors. See super_errors_ for more info. | ||
104 | * - 0x3E | ||
105 | - \_\_le16 | ||
106 | - s\_minor\_rev\_level | ||
107 | - Minor revision level. | ||
108 | * - 0x40 | ||
109 | - \_\_le32 | ||
110 | - s\_lastcheck | ||
111 | - Time of last check, in seconds since the epoch. | ||
112 | * - 0x44 | ||
113 | - \_\_le32 | ||
114 | - s\_checkinterval | ||
115 | - Maximum time between checks, in seconds. | ||
116 | * - 0x48 | ||
117 | - \_\_le32 | ||
118 | - s\_creator\_os | ||
119 | - Creator OS. See the table super_creator_ for more info. | ||
120 | * - 0x4C | ||
121 | - \_\_le32 | ||
122 | - s\_rev\_level | ||
123 | - Revision level. See the table super_revision_ for more info. | ||
124 | * - 0x50 | ||
125 | - \_\_le16 | ||
126 | - s\_def\_resuid | ||
127 | - Default uid for reserved blocks. | ||
128 | * - 0x52 | ||
129 | - \_\_le16 | ||
130 | - s\_def\_resgid | ||
131 | - Default gid for reserved blocks. | ||
132 | * - | ||
133 | - | ||
134 | - | ||
135 | - These fields are for EXT4_DYNAMIC_REV superblocks only. | ||
136 | |||
137 | Note: the difference between the compatible feature set and the | ||
138 | incompatible feature set is that if there is a bit set in the | ||
139 | incompatible feature set that the kernel doesn't know about, it should | ||
140 | refuse to mount the filesystem. | ||
141 | |||
142 | e2fsck's requirements are more strict; if it doesn't know | ||
143 | about a feature in either the compatible or incompatible feature set, it | ||
144 | must abort and not try to meddle with things it doesn't understand... | ||
145 | * - 0x54 | ||
146 | - \_\_le32 | ||
147 | - s\_first\_ino | ||
148 | - First non-reserved inode. | ||
149 | * - 0x58 | ||
150 | - \_\_le16 | ||
151 | - s\_inode\_size | ||
152 | - Size of inode structure, in bytes. | ||
153 | * - 0x5A | ||
154 | - \_\_le16 | ||
155 | - s\_block\_group\_nr | ||
156 | - Block group # of this superblock. | ||
157 | * - 0x5C | ||
158 | - \_\_le32 | ||
159 | - s\_feature\_compat | ||
160 | - Compatible feature set flags. Kernel can still read/write this fs even | ||
161 | if it doesn't understand a flag; fsck should not do that. See the | ||
162 | super_compat_ table for more info. | ||
163 | * - 0x60 | ||
164 | - \_\_le32 | ||
165 | - s\_feature\_incompat | ||
166 | - Incompatible feature set. If the kernel or fsck doesn't understand one | ||
167 | of these bits, it should stop. See the super_incompat_ table for more | ||
168 | info. | ||
169 | * - 0x64 | ||
170 | - \_\_le32 | ||
171 | - s\_feature\_ro\_compat | ||
172 | - Readonly-compatible feature set. If the kernel doesn't understand one of | ||
173 | these bits, it can still mount read-only. See the super_rocompat_ table | ||
174 | for more info. | ||
175 | * - 0x68 | ||
176 | - \_\_u8 | ||
177 | - s\_uuid[16] | ||
178 | - 128-bit UUID for volume. | ||
179 | * - 0x78 | ||
180 | - char | ||
181 | - s\_volume\_name[16] | ||
182 | - Volume label. | ||
183 | * - 0x88 | ||
184 | - char | ||
185 | - s\_last\_mounted[64] | ||
186 | - Directory where filesystem was last mounted. | ||
187 | * - 0xC8 | ||
188 | - \_\_le32 | ||
189 | - s\_algorithm\_usage\_bitmap | ||
190 | - For compression (Not used in e2fsprogs/Linux) | ||
191 | * - | ||
192 | - | ||
193 | - | ||
194 | - Performance hints. Directory preallocation should only happen if the | ||
195 | EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. | ||
196 | * - 0xCC | ||
197 | - \_\_u8 | ||
198 | - s\_prealloc\_blocks | ||
199 | - Number of blocks to try to preallocate for ... files? (Not used in | ||
200 | e2fsprogs/Linux) | ||
201 | * - 0xCD | ||
202 | - \_\_u8 | ||
203 | - s\_prealloc\_dir\_blocks | ||
204 | - Number of blocks to preallocate for directories. (Not used in | ||
205 | e2fsprogs/Linux) | ||
206 | * - 0xCE | ||
207 | - \_\_le16 | ||
208 | - s\_reserved\_gdt\_blocks | ||
209 | - Number of reserved GDT entries for future filesystem expansion. | ||
210 | * - | ||
211 | - | ||
212 | - | ||
213 | - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is | ||
214 | set. | ||
215 | * - 0xD0 | ||
216 | - \_\_u8 | ||
217 | - s\_journal\_uuid[16] | ||
218 | - UUID of journal superblock | ||
219 | * - 0xE0 | ||
220 | - \_\_le32 | ||
221 | - s\_journal\_inum | ||
222 | - inode number of journal file. | ||
223 | * - 0xE4 | ||
224 | - \_\_le32 | ||
225 | - s\_journal\_dev | ||
226 | - Device number of journal file, if the external journal feature flag is | ||
227 | set. | ||
228 | * - 0xE8 | ||
229 | - \_\_le32 | ||
230 | - s\_last\_orphan | ||
231 | - Start of list of orphaned inodes to delete. | ||
232 | * - 0xEC | ||
233 | - \_\_le32 | ||
234 | - s\_hash\_seed[4] | ||
235 | - HTREE hash seed. | ||
236 | * - 0xFC | ||
237 | - \_\_u8 | ||
238 | - s\_def\_hash\_version | ||
239 | - Default hash algorithm to use for directory hashes. See super_def_hash_ | ||
240 | for more info. | ||
241 | * - 0xFD | ||
242 | - \_\_u8 | ||
243 | - s\_jnl\_backup\_type | ||
244 | - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the | ||
245 | ``s_jnl_blocks`` field contains a duplicate copy of the inode's | ||
246 | ``i_block[]`` array and ``i_size``. | ||
247 | * - 0xFE | ||
248 | - \_\_le16 | ||
249 | - s\_desc\_size | ||
250 | - Size of group descriptors, in bytes, if the 64bit incompat feature flag | ||
251 | is set. | ||
252 | * - 0x100 | ||
253 | - \_\_le32 | ||
254 | - s\_default\_mount\_opts | ||
255 | - Default mount options. See the super_mountopts_ table for more info. | ||
256 | * - 0x104 | ||
257 | - \_\_le32 | ||
258 | - s\_first\_meta\_bg | ||
259 | - First metablock block group, if the meta\_bg feature is enabled. | ||
260 | * - 0x108 | ||
261 | - \_\_le32 | ||
262 | - s\_mkfs\_time | ||
263 | - When the filesystem was created, in seconds since the epoch. | ||
264 | * - 0x10C | ||
265 | - \_\_le32 | ||
266 | - s\_jnl\_blocks[17] | ||
267 | - Backup copy of the journal inode's ``i_block[]`` array in the first 15 | ||
268 | elements and i\_size\_high and i\_size in the 16th and 17th elements, | ||
269 | respectively. | ||
270 | * - | ||
271 | - | ||
272 | - | ||
273 | - 64bit support is valid only if EXT4_FEATURE_INCOMPAT_64BIT is set. | ||
274 | * - 0x150 | ||
275 | - \_\_le32 | ||
276 | - s\_blocks\_count\_hi | ||
277 | - High 32-bits of the block count. | ||
278 | * - 0x154 | ||
279 | - \_\_le32 | ||
280 | - s\_r\_blocks\_count\_hi | ||
281 | - High 32-bits of the reserved block count. | ||
282 | * - 0x158 | ||
283 | - \_\_le32 | ||
284 | - s\_free\_blocks\_count\_hi | ||
285 | - High 32-bits of the free block count. | ||
286 | * - 0x15C | ||
287 | - \_\_le16 | ||
288 | - s\_min\_extra\_isize | ||
289 | - All inodes have at least # bytes. | ||
290 | * - 0x15E | ||
291 | - \_\_le16 | ||
292 | - s\_want\_extra\_isize | ||
293 | - New inodes should reserve # bytes. | ||
294 | * - 0x160 | ||
295 | - \_\_le32 | ||
296 | - s\_flags | ||
297 | - Miscellaneous flags. See the super_flags_ table for more info. | ||
298 | * - 0x164 | ||
299 | - \_\_le16 | ||
300 | - s\_raid\_stride | ||
301 | - RAID stride. This is the number of logical blocks read from or written | ||
302 | to the disk before moving to the next disk. This affects the placement | ||
303 | of filesystem metadata, which will hopefully make RAID storage faster. | ||
304 | * - 0x166 | ||
305 | - \_\_le16 | ||
306 | - s\_mmp\_interval | ||
307 | - Number of seconds to wait in multi-mount prevention (MMP) checking. In theory, | ||
308 | MMP is a mechanism to record in the superblock which host and device | ||
309 | have mounted the filesystem, in order to prevent multiple mounts. This | ||
310 | feature does not seem to be implemented... | ||
311 | * - 0x168 | ||
312 | - \_\_le64 | ||
313 | - s\_mmp\_block | ||
314 | - Block # for multi-mount protection data. | ||
315 | * - 0x170 | ||
316 | - \_\_le32 | ||
317 | - s\_raid\_stripe\_width | ||
318 | - RAID stripe width. This is the number of logical blocks read from or | ||
319 | written to the disk before coming back to the current disk. This is used | ||
320 | by the block allocator to try to reduce the number of read-modify-write | ||
321 | operations in a RAID5/6. | ||
322 | * - 0x174 | ||
323 | - \_\_u8 | ||
324 | - s\_log\_groups\_per\_flex | ||
325 | - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``. | ||
326 | * - 0x175 | ||
327 | - \_\_u8 | ||
328 | - s\_checksum\_type | ||
329 | - Metadata checksum algorithm type. The only valid value is 1 (crc32c). | ||
330 | * - 0x176 | ||
331 | - \_\_le16 | ||
332 | - s\_reserved\_pad | ||
333 | - | ||
334 | * - 0x178 | ||
335 | - \_\_le64 | ||
336 | - s\_kbytes\_written | ||
337 | - Number of KiB written to this filesystem over its lifetime. | ||
338 | * - 0x180 | ||
339 | - \_\_le32 | ||
340 | - s\_snapshot\_inum | ||
341 | - inode number of active snapshot. (Not used in e2fsprogs/Linux.) | ||
342 | * - 0x184 | ||
343 | - \_\_le32 | ||
344 | - s\_snapshot\_id | ||
345 | - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.) | ||
346 | * - 0x188 | ||
347 | - \_\_le64 | ||
348 | - s\_snapshot\_r\_blocks\_count | ||
349 | - Number of blocks reserved for active snapshot's future use. (Not used in | ||
350 | e2fsprogs/Linux.) | ||
351 | * - 0x190 | ||
352 | - \_\_le32 | ||
353 | - s\_snapshot\_list | ||
354 | - inode number of the head of the on-disk snapshot list. (Not used in | ||
355 | e2fsprogs/Linux.) | ||
356 | * - 0x194 | ||
357 | - \_\_le32 | ||
358 | - s\_error\_count | ||
359 | - Number of errors seen. | ||
360 | * - 0x198 | ||
361 | - \_\_le32 | ||
362 | - s\_first\_error\_time | ||
363 | - First time an error happened, in seconds since the epoch. | ||
364 | * - 0x19C | ||
365 | - \_\_le32 | ||
366 | - s\_first\_error\_ino | ||
367 | - inode involved in first error. | ||
368 | * - 0x1A0 | ||
369 | - \_\_le64 | ||
370 | - s\_first\_error\_block | ||
371 | - Number of block involved in first error. | ||
372 | * - 0x1A8 | ||
373 | - \_\_u8 | ||
374 | - s\_first\_error\_func[32] | ||
375 | - Name of function where the error happened. | ||
376 | * - 0x1C8 | ||
377 | - \_\_le32 | ||
378 | - s\_first\_error\_line | ||
379 | - Line number where error happened. | ||
380 | * - 0x1CC | ||
381 | - \_\_le32 | ||
382 | - s\_last\_error\_time | ||
383 | - Time of most recent error, in seconds since the epoch. | ||
384 | * - 0x1D0 | ||
385 | - \_\_le32 | ||
386 | - s\_last\_error\_ino | ||
387 | - inode involved in most recent error. | ||
388 | * - 0x1D4 | ||
389 | - \_\_le32 | ||
390 | - s\_last\_error\_line | ||
391 | - Line number where most recent error happened. | ||
392 | * - 0x1D8 | ||
393 | - \_\_le64 | ||
394 | - s\_last\_error\_block | ||
395 | - Number of block involved in most recent error. | ||
396 | * - 0x1E0 | ||
397 | - \_\_u8 | ||
398 | - s\_last\_error\_func[32] | ||
399 | - Name of function where the most recent error happened. | ||
400 | * - 0x200 | ||
401 | - \_\_u8 | ||
402 | - s\_mount\_opts[64] | ||
403 | - ASCIIZ string of mount options. | ||
404 | * - 0x240 | ||
405 | - \_\_le32 | ||
406 | - s\_usr\_quota\_inum | ||
407 | - Inode number of user `quota <quota>`__ file. | ||
408 | * - 0x244 | ||
409 | - \_\_le32 | ||
410 | - s\_grp\_quota\_inum | ||
411 | - Inode number of group `quota <quota>`__ file. | ||
412 | * - 0x248 | ||
413 | - \_\_le32 | ||
414 | - s\_overhead\_blocks | ||
415 | - Overhead blocks/clusters in fs. (Huh? This field is always zero, which | ||
416 | means that the kernel calculates it dynamically.) | ||
417 | * - 0x24C | ||
418 | - \_\_le32 | ||
419 | - s\_backup\_bgs[2] | ||
420 | - Block groups containing superblock backups (if sparse\_super2) | ||
421 | * - 0x254 | ||
422 | - \_\_u8 | ||
423 | - s\_encrypt\_algos[4] | ||
424 | - Encryption algorithms in use. There can be up to four algorithms in use | ||
425 | at any time; valid algorithm codes are given in the super_encrypt_ table | ||
426 | below. | ||
427 | * - 0x258 | ||
428 | - \_\_u8 | ||
429 | - s\_encrypt\_pw\_salt[16] | ||
430 | - Salt for the string2key algorithm for encryption. | ||
431 | * - 0x268 | ||
432 | - \_\_le32 | ||
433 | - s\_lpf\_ino | ||
434 | - Inode number of lost+found | ||
435 | * - 0x26C | ||
436 | - \_\_le32 | ||
437 | - s\_prj\_quota\_inum | ||
438 | - Inode that tracks project quotas. | ||
439 | * - 0x270 | ||
440 | - \_\_le32 | ||
441 | - s\_checksum\_seed | ||
442 | - Checksum seed used for metadata\_csum calculations. This value is | ||
443 | crc32c(~0, $orig\_fs\_uuid). | ||
444 | * - 0x274 | ||
445 | - \_\_u8 | ||
446 | - s\_wtime_hi | ||
447 | - Upper 8 bits of the s_wtime field. | ||
448 | * - 0x275 | ||
449 | - \_\_u8 | ||
450 | - s\_mtime_hi | ||
451 | - Upper 8 bits of the s_mtime field. | ||
452 | * - 0x276 | ||
453 | - \_\_u8 | ||
454 | - s\_mkfs_time_hi | ||
455 | - Upper 8 bits of the s_mkfs_time field. | ||
456 | * - 0x277 | ||
457 | - \_\_u8 | ||
458 | - s\_lastcheck_hi | ||
459 | - Upper 8 bits of the s_lastcheck field. | ||
460 | * - 0x278 | ||
461 | - \_\_u8 | ||
462 | - s\_first_error_time_hi | ||
463 | - Upper 8 bits of the s_first_error_time field. | ||
464 | * - 0x279 | ||
465 | - \_\_u8 | ||
466 | - s\_last_error_time_hi | ||
467 | - Upper 8 bits of the s_last_error_time field. | ||
468 | * - 0x27A | ||
469 | - \_\_u8[2] | ||
470 | - s\_pad | ||
471 | - Zero padding. | ||
472 | * - 0x27C | ||
473 | - \_\_le32 | ||
474 | - s\_reserved[96] | ||
475 | - Padding to the end of the block. | ||
476 | * - 0x3FC | ||
477 | - \_\_le32 | ||
478 | - s\_checksum | ||
479 | - Superblock checksum. | ||
480 | |||
481 | .. _super_state: | ||
482 | |||
483 | The superblock state is some combination of the following: | ||
484 | |||
485 | .. list-table:: | ||
486 | :widths: 1 79 | ||
487 | :header-rows: 1 | ||
488 | |||
489 | * - Value | ||
490 | - Description | ||
491 | * - 0x0001 | ||
492 | - Cleanly umounted | ||
493 | * - 0x0002 | ||
494 | - Errors detected | ||
495 | * - 0x0004 | ||
496 | - Orphans being recovered | ||
497 | |||
498 | .. _super_errors: | ||
499 | |||
500 | The superblock error policy is one of the following: | ||
501 | |||
502 | .. list-table:: | ||
503 | :widths: 1 79 | ||
504 | :header-rows: 1 | ||
505 | |||
506 | * - Value | ||
507 | - Description | ||
508 | * - 1 | ||
509 | - Continue | ||
510 | * - 2 | ||
511 | - Remount read-only | ||
512 | * - 3 | ||
513 | - Panic | ||
514 | |||
515 | .. _super_creator: | ||
516 | |||
517 | The filesystem creator is one of the following: | ||
518 | |||
519 | .. list-table:: | ||
520 | :widths: 1 79 | ||
521 | :header-rows: 1 | ||
522 | |||
523 | * - Value | ||
524 | - Description | ||
525 | * - 0 | ||
526 | - Linux | ||
527 | * - 1 | ||
528 | - Hurd | ||
529 | * - 2 | ||
530 | - Masix | ||
531 | * - 3 | ||
532 | - FreeBSD | ||
533 | * - 4 | ||
534 | - Lites | ||
535 | |||
536 | .. _super_revision: | ||
537 | |||
538 | The superblock revision is one of the following: | ||
539 | |||
540 | .. list-table:: | ||
541 | :widths: 1 79 | ||
542 | :header-rows: 1 | ||
543 | |||
544 | * - Value | ||
545 | - Description | ||
546 | * - 0 | ||
547 | - Original format | ||
548 | * - 1 | ||
549 | - v2 format w/ dynamic inode sizes | ||
550 | |||
551 | Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem. | ||
552 | |||
553 | .. _super_compat: | ||
554 | |||
555 | The superblock compatible features field is a combination of any of the | ||
556 | following: | ||
557 | |||
558 | .. list-table:: | ||
559 | :widths: 1 79 | ||
560 | :header-rows: 1 | ||
561 | |||
562 | * - Value | ||
563 | - Description | ||
564 | * - 0x1 | ||
565 | - Directory preallocation (COMPAT\_DIR\_PREALLOC). | ||
566 | * - 0x2 | ||
567 | - “imagic inodes”. Not clear from the code what this does | ||
568 | (COMPAT\_IMAGIC\_INODES). | ||
569 | * - 0x4 | ||
570 | - Has a journal (COMPAT\_HAS\_JOURNAL). | ||
571 | * - 0x8 | ||
572 | - Supports extended attributes (COMPAT\_EXT\_ATTR). | ||
573 | * - 0x10 | ||
574 | - Has reserved GDT blocks for filesystem expansion | ||
575 | (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER. | ||
576 | * - 0x20 | ||
577 | - Has directory indices (COMPAT\_DIR\_INDEX). | ||
578 | * - 0x40 | ||
579 | - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized | ||
580 | block groups? (COMPAT\_LAZY\_BG) | ||
581 | * - 0x80 | ||
582 | - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE). | ||
583 | * - 0x100 | ||
584 | - “Exclude bitmap”. Seems to be used to indicate the presence of | ||
585 | snapshot-related exclude bitmaps? Not defined in kernel or used in | ||
586 | e2fsprogs (COMPAT\_EXCLUDE\_BITMAP). | ||
587 | * - 0x200 | ||
588 | - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs | ||
589 | points to the two block groups that contain backup superblocks | ||
590 | (COMPAT\_SPARSE\_SUPER2). | ||
591 | |||
592 | .. _super_incompat: | ||
593 | |||
594 | The superblock incompatible features field is a combination of any of the | ||
595 | following: | ||
596 | |||
597 | .. list-table:: | ||
598 | :widths: 1 79 | ||
599 | :header-rows: 1 | ||
600 | |||
601 | * - Value | ||
602 | - Description | ||
603 | * - 0x1 | ||
604 | - Compression (INCOMPAT\_COMPRESSION). | ||
605 | * - 0x2 | ||
606 | - Directory entries record the file type. See ext4\_dir\_entry\_2 below | ||
607 | (INCOMPAT\_FILETYPE). | ||
608 | * - 0x4 | ||
609 | - Filesystem needs recovery (INCOMPAT\_RECOVER). | ||
610 | * - 0x8 | ||
611 | - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV). | ||
612 | * - 0x10 | ||
613 | - Meta block groups. See the earlier discussion of this feature | ||
614 | (INCOMPAT\_META\_BG). | ||
615 | * - 0x40 | ||
616 | - Files in this filesystem use extents (INCOMPAT\_EXTENTS). | ||
617 | * - 0x80 | ||
618 | - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). | ||
619 | * - 0x100 | ||
620 | - Multiple mount protection (INCOMPAT\_MMP). | ||
621 | * - 0x200 | ||
622 | - Flexible block groups. See the earlier discussion of this feature | ||
623 | (INCOMPAT\_FLEX\_BG). | ||
624 | * - 0x400 | ||
625 | - Inodes can be used to store large extended attribute values | ||
626 | (INCOMPAT\_EA\_INODE). | ||
627 | * - 0x1000 | ||
628 | - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?) | ||
629 | * - 0x2000 | ||
630 | - Metadata checksum seed is stored in the superblock. This feature enables | ||
631 | the administrator to change the UUID of a metadata\_csum filesystem | ||
632 | while the filesystem is mounted; without it, the checksum definition | ||
633 | requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED). | ||
634 | * - 0x4000 | ||
635 | - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to | ||
636 | this feature, directories could not be larger than 4GiB and could not | ||
637 | have an htree more than 2 levels deep. If this feature is enabled, | ||
638 | directories can be larger than 4GiB and have a maximum htree depth of 3. | ||
639 | * - 0x8000 | ||
640 | - Data in inode (INCOMPAT\_INLINE\_DATA). | ||
641 | * - 0x10000 | ||
642 | - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT). | ||
643 | |||
644 | .. _super_rocompat: | ||
645 | |||
646 | The superblock read-only compatible features field is a combination of any of | ||
647 | the following: | ||
648 | |||
649 | .. list-table:: | ||
650 | :widths: 1 79 | ||
651 | :header-rows: 1 | ||
652 | |||
653 | * - Value | ||
654 | - Description | ||
655 | * - 0x1 | ||
656 | - Sparse superblocks. See the earlier discussion of this feature | ||
657 | (RO\_COMPAT\_SPARSE\_SUPER). | ||
658 | * - 0x2 | ||
659 | - This filesystem has been used to store a file greater than 2GiB | ||
660 | (RO\_COMPAT\_LARGE\_FILE). | ||
661 | * - 0x4 | ||
662 | - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR). | ||
663 | * - 0x8 | ||
664 | - This filesystem has files whose sizes are represented in units of | ||
665 | logical blocks, not 512-byte sectors. This implies a very large file | ||
666 | indeed! (RO\_COMPAT\_HUGE\_FILE) | ||
667 | * - 0x10 | ||
668 | - Group descriptors have checksums. In addition to detecting corruption, | ||
669 | this is useful for lazy formatting with uninitialized groups | ||
670 | (RO\_COMPAT\_GDT\_CSUM). | ||
671 | * - 0x20 | ||
672 | - Indicates that the old ext3 32,000 subdirectory limit no longer applies | ||
673 | (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1 | ||
674 | if it is incremented past 64,999. | ||
675 | * - 0x40 | ||
676 | - Indicates that large inodes exist on this filesystem | ||
677 | (RO\_COMPAT\_EXTRA\_ISIZE). | ||
678 | * - 0x80 | ||
679 | - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT). | ||
680 | * - 0x100 | ||
681 | - `Quota <Quota>`__ (RO\_COMPAT\_QUOTA). | ||
682 | * - 0x200 | ||
683 | - This filesystem supports “bigalloc”, which means that file extents are | ||
684 | tracked in units of clusters (of blocks) instead of blocks | ||
685 | (RO\_COMPAT\_BIGALLOC). | ||
686 | * - 0x400 | ||
687 | - This filesystem supports metadata checksumming. | ||
688 | (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though | ||
689 | GDT\_CSUM must not be set) | ||
690 | * - 0x800 | ||
691 | - Filesystem supports replicas. This feature is neither in the kernel nor | ||
692 | e2fsprogs. (RO\_COMPAT\_REPLICA) | ||
693 | * - 0x1000 | ||
694 | - Read-only filesystem image; the kernel will not mount this image | ||
695 | read-write and most tools will refuse to write to the image. | ||
696 | (RO\_COMPAT\_READONLY) | ||
697 | * - 0x2000 | ||
698 | - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT) | ||
699 | |||
700 | .. _super_def_hash: | ||
701 | |||
702 | The ``s_def_hash_version`` field is one of the following: | ||
703 | |||
704 | .. list-table:: | ||
705 | :widths: 1 79 | ||
706 | :header-rows: 1 | ||
707 | |||
708 | * - Value | ||
709 | - Description | ||
710 | * - 0x0 | ||
711 | - Legacy. | ||
712 | * - 0x1 | ||
713 | - Half MD4. | ||
714 | * - 0x2 | ||
715 | - Tea. | ||
716 | * - 0x3 | ||
717 | - Legacy, unsigned. | ||
718 | * - 0x4 | ||
719 | - Half MD4, unsigned. | ||
720 | * - 0x5 | ||
721 | - Tea, unsigned. | ||
722 | |||
723 | .. _super_mountopts: | ||
724 | |||
725 | The ``s_default_mount_opts`` field is any combination of the following: | ||
726 | |||
727 | .. list-table:: | ||
728 | :widths: 1 79 | ||
729 | :header-rows: 1 | ||
730 | |||
731 | * - Value | ||
732 | - Description | ||
733 | * - 0x0001 | ||
734 | - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG) | ||
735 | * - 0x0002 | ||
736 | - New files take the gid of the containing directory (instead of the fsgid | ||
737 | of the current process). (EXT4\_DEFM\_BSDGROUPS) | ||
738 | * - 0x0004 | ||
739 | - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER) | ||
740 | * - 0x0008 | ||
741 | - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL) | ||
742 | * - 0x0010 | ||
743 | - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16) | ||
744 | * - 0x0020 | ||
745 | - All data and metadata are committed to the journal. | ||
746 | (EXT4\_DEFM\_JMODE\_DATA) | ||
747 | * - 0x0040 | ||
748 | - All data are flushed to the disk before metadata are committed to the | ||
749 | journal. (EXT4\_DEFM\_JMODE\_ORDERED) | ||
750 | * - 0x0060 | ||
751 | - Data ordering is not preserved; data may be written after the metadata | ||
752 | has been written. (EXT4\_DEFM\_JMODE\_WBACK) | ||
753 | * - 0x0100 | ||
754 | - Disable write flushes. (EXT4\_DEFM\_NOBARRIER) | ||
755 | * - 0x0200 | ||
756 | - Track which blocks in a filesystem are metadata and therefore should not | ||
757 | be used as data blocks. This option will be enabled by default on 3.18, | ||
758 | hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY) | ||
759 | * - 0x0400 | ||
760 | - Enable DISCARD support, where the storage device is told about blocks | ||
761 | becoming unused. (EXT4\_DEFM\_DISCARD) | ||
762 | * - 0x0800 | ||
763 | - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC) | ||
764 | |||
765 | .. _super_flags: | ||
766 | |||
767 | The ``s_flags`` field is any combination of the following: | ||
768 | |||
769 | .. list-table:: | ||
770 | :widths: 1 79 | ||
771 | :header-rows: 1 | ||
772 | |||
773 | * - Value | ||
774 | - Description | ||
775 | * - 0x0001 | ||
776 | - Signed directory hash in use. | ||
777 | * - 0x0002 | ||
778 | - Unsigned directory hash in use. | ||
779 | * - 0x0004 | ||
780 | - To test development code. | ||
781 | |||
782 | .. _super_encrypt: | ||
783 | |||
784 | The ``s_encrypt_algos`` list can contain any of the following: | ||
785 | |||
786 | .. list-table:: | ||
787 | :widths: 1 79 | ||
788 | :header-rows: 1 | ||
789 | |||
790 | * - Value | ||
791 | - Description | ||
792 | * - 0 | ||
793 | - Invalid algorithm (ENCRYPTION\_MODE\_INVALID). | ||
794 | * - 1 | ||
795 | - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS). | ||
796 | * - 2 | ||
797 | - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM). | ||
798 | * - 3 | ||
799 | - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC). | ||
800 | |||
801 | Total size of the superblock is 1024 bytes. | ||
diff --git a/Documentation/index.rst b/Documentation/index.rst index fdc585703498..f95ba981f8cd 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst | |||
@@ -102,6 +102,17 @@ implementation. | |||
102 | 102 | ||
103 | sh/index | 103 | sh/index |
104 | 104 | ||
105 | Filesystem Documentation | ||
106 | ------------------------ | ||
107 | |||
108 | The documentation in this section is provided by specific filesystem | ||
109 | subprojects. | ||
110 | |||
111 | .. toctree:: | ||
112 | :maxdepth: 2 | ||
113 | |||
114 | filesystems/ext4/index | ||
115 | |||
105 | Korean translations | 116 | Korean translations |
106 | ------------------- | 117 | ------------------- |
107 | 118 | ||
@@ -566,7 +566,8 @@ struct page *dax_layout_busy_page(struct address_space *mapping) | |||
566 | if (index >= end) | 566 | if (index >= end) |
567 | break; | 567 | break; |
568 | 568 | ||
569 | if (!radix_tree_exceptional_entry(pvec_ent)) | 569 | if (WARN_ON_ONCE( |
570 | !radix_tree_exceptional_entry(pvec_ent))) | ||
570 | continue; | 571 | continue; |
571 | 572 | ||
572 | xa_lock_irq(&mapping->i_pages); | 573 | xa_lock_irq(&mapping->i_pages); |
@@ -578,6 +579,13 @@ struct page *dax_layout_busy_page(struct address_space *mapping) | |||
578 | if (page) | 579 | if (page) |
579 | break; | 580 | break; |
580 | } | 581 | } |
582 | |||
583 | /* | ||
584 | * We don't expect normal struct page entries to exist in our | ||
585 | * tree, but we keep these pagevec calls so that this code is | ||
586 | * consistent with the common pattern for handling pagevecs | ||
587 | * throughout the kernel. | ||
588 | */ | ||
581 | pagevec_remove_exceptionals(&pvec); | 589 | pagevec_remove_exceptionals(&pvec); |
582 | pagevec_release(&pvec); | 590 | pagevec_release(&pvec); |
583 | index++; | 591 | index++; |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index aa52d87985aa..e5d6ee61ff48 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -426,9 +426,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) | |||
426 | } | 426 | } |
427 | bh = sb_getblk(sb, bitmap_blk); | 427 | bh = sb_getblk(sb, bitmap_blk); |
428 | if (unlikely(!bh)) { | 428 | if (unlikely(!bh)) { |
429 | ext4_error(sb, "Cannot get buffer for block bitmap - " | 429 | ext4_warning(sb, "Cannot get buffer for block bitmap - " |
430 | "block_group = %u, block_bitmap = %llu", | 430 | "block_group = %u, block_bitmap = %llu", |
431 | block_group, bitmap_blk); | 431 | block_group, bitmap_blk); |
432 | return ERR_PTR(-ENOMEM); | 432 | return ERR_PTR(-ENOMEM); |
433 | } | 433 | } |
434 | 434 | ||
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7c7123f265c2..1fc013f3d944 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -789,17 +789,16 @@ struct move_extent { | |||
789 | * affected filesystem before 2242. | 789 | * affected filesystem before 2242. |
790 | */ | 790 | */ |
791 | 791 | ||
792 | static inline __le32 ext4_encode_extra_time(struct timespec *time) | 792 | static inline __le32 ext4_encode_extra_time(struct timespec64 *time) |
793 | { | 793 | { |
794 | u32 extra = sizeof(time->tv_sec) > 4 ? | 794 | u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; |
795 | ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0; | ||
796 | return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); | 795 | return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); |
797 | } | 796 | } |
798 | 797 | ||
799 | static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) | 798 | static inline void ext4_decode_extra_time(struct timespec64 *time, |
799 | __le32 extra) | ||
800 | { | 800 | { |
801 | if (unlikely(sizeof(time->tv_sec) > 4 && | 801 | if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { |
802 | (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { | ||
803 | 802 | ||
804 | #if 1 | 803 | #if 1 |
805 | /* Handle legacy encoding of pre-1970 dates with epoch | 804 | /* Handle legacy encoding of pre-1970 dates with epoch |
@@ -821,9 +820,8 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) | |||
821 | do { \ | 820 | do { \ |
822 | (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ | 821 | (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ |
823 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ | 822 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ |
824 | struct timespec ts = timespec64_to_timespec((inode)->xtime); \ | ||
825 | (raw_inode)->xtime ## _extra = \ | 823 | (raw_inode)->xtime ## _extra = \ |
826 | ext4_encode_extra_time(&ts); \ | 824 | ext4_encode_extra_time(&(inode)->xtime); \ |
827 | } \ | 825 | } \ |
828 | } while (0) | 826 | } while (0) |
829 | 827 | ||
@@ -840,10 +838,8 @@ do { \ | |||
840 | do { \ | 838 | do { \ |
841 | (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ | 839 | (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ |
842 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ | 840 | if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ |
843 | struct timespec ts = timespec64_to_timespec((inode)->xtime); \ | 841 | ext4_decode_extra_time(&(inode)->xtime, \ |
844 | ext4_decode_extra_time(&ts, \ | ||
845 | raw_inode->xtime ## _extra); \ | 842 | raw_inode->xtime ## _extra); \ |
846 | (inode)->xtime = timespec_to_timespec64(ts); \ | ||
847 | } \ | 843 | } \ |
848 | else \ | 844 | else \ |
849 | (inode)->xtime.tv_nsec = 0; \ | 845 | (inode)->xtime.tv_nsec = 0; \ |
@@ -993,9 +989,9 @@ struct ext4_inode_info { | |||
993 | 989 | ||
994 | /* | 990 | /* |
995 | * File creation time. Its function is same as that of | 991 | * File creation time. Its function is same as that of |
996 | * struct timespec i_{a,c,m}time in the generic inode. | 992 | * struct timespec64 i_{a,c,m}time in the generic inode. |
997 | */ | 993 | */ |
998 | struct timespec i_crtime; | 994 | struct timespec64 i_crtime; |
999 | 995 | ||
1000 | /* mballoc */ | 996 | /* mballoc */ |
1001 | struct list_head i_prealloc_list; | 997 | struct list_head i_prealloc_list; |
@@ -1299,7 +1295,14 @@ struct ext4_super_block { | |||
1299 | __le32 s_lpf_ino; /* Location of the lost+found inode */ | 1295 | __le32 s_lpf_ino; /* Location of the lost+found inode */ |
1300 | __le32 s_prj_quota_inum; /* inode for tracking project quota */ | 1296 | __le32 s_prj_quota_inum; /* inode for tracking project quota */ |
1301 | __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ | 1297 | __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ |
1302 | __le32 s_reserved[98]; /* Padding to the end of the block */ | 1298 | __u8 s_wtime_hi; |
1299 | __u8 s_mtime_hi; | ||
1300 | __u8 s_mkfs_time_hi; | ||
1301 | __u8 s_lastcheck_hi; | ||
1302 | __u8 s_first_error_time_hi; | ||
1303 | __u8 s_last_error_time_hi; | ||
1304 | __u8 s_pad[2]; | ||
1305 | __le32 s_reserved[96]; /* Padding to the end of the block */ | ||
1303 | __le32 s_checksum; /* crc32c(superblock) */ | 1306 | __le32 s_checksum; /* crc32c(superblock) */ |
1304 | }; | 1307 | }; |
1305 | 1308 | ||
@@ -2456,6 +2459,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | |||
2456 | extern int ext4_inode_attach_jinode(struct inode *inode); | 2459 | extern int ext4_inode_attach_jinode(struct inode *inode); |
2457 | extern int ext4_can_truncate(struct inode *inode); | 2460 | extern int ext4_can_truncate(struct inode *inode); |
2458 | extern int ext4_truncate(struct inode *); | 2461 | extern int ext4_truncate(struct inode *); |
2462 | extern int ext4_break_layouts(struct inode *); | ||
2459 | extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); | 2463 | extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); |
2460 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); | 2464 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); |
2461 | extern void ext4_set_inode_flags(struct inode *); | 2465 | extern void ext4_set_inode_flags(struct inode *); |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8ce6fd5b10dd..72a361d5ef74 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -4826,6 +4826,13 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
4826 | * released from page cache. | 4826 | * released from page cache. |
4827 | */ | 4827 | */ |
4828 | down_write(&EXT4_I(inode)->i_mmap_sem); | 4828 | down_write(&EXT4_I(inode)->i_mmap_sem); |
4829 | |||
4830 | ret = ext4_break_layouts(inode); | ||
4831 | if (ret) { | ||
4832 | up_write(&EXT4_I(inode)->i_mmap_sem); | ||
4833 | goto out_mutex; | ||
4834 | } | ||
4835 | |||
4829 | ret = ext4_update_disksize_before_punch(inode, offset, len); | 4836 | ret = ext4_update_disksize_before_punch(inode, offset, len); |
4830 | if (ret) { | 4837 | if (ret) { |
4831 | up_write(&EXT4_I(inode)->i_mmap_sem); | 4838 | up_write(&EXT4_I(inode)->i_mmap_sem); |
@@ -5499,6 +5506,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) | |||
5499 | * page cache. | 5506 | * page cache. |
5500 | */ | 5507 | */ |
5501 | down_write(&EXT4_I(inode)->i_mmap_sem); | 5508 | down_write(&EXT4_I(inode)->i_mmap_sem); |
5509 | |||
5510 | ret = ext4_break_layouts(inode); | ||
5511 | if (ret) | ||
5512 | goto out_mmap; | ||
5513 | |||
5502 | /* | 5514 | /* |
5503 | * Need to round down offset to be aligned with page size boundary | 5515 | * Need to round down offset to be aligned with page size boundary |
5504 | * for page size > block size. | 5516 | * for page size > block size. |
@@ -5647,6 +5659,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) | |||
5647 | * page cache. | 5659 | * page cache. |
5648 | */ | 5660 | */ |
5649 | down_write(&EXT4_I(inode)->i_mmap_sem); | 5661 | down_write(&EXT4_I(inode)->i_mmap_sem); |
5662 | |||
5663 | ret = ext4_break_layouts(inode); | ||
5664 | if (ret) | ||
5665 | goto out_mmap; | ||
5666 | |||
5650 | /* | 5667 | /* |
5651 | * Need to round down to align start offset to page size boundary | 5668 | * Need to round down to align start offset to page size boundary |
5652 | * for page size > block size. | 5669 | * for page size > block size. |
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f336cbc6e932..2addcb8730e1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -138,9 +138,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |||
138 | } | 138 | } |
139 | bh = sb_getblk(sb, bitmap_blk); | 139 | bh = sb_getblk(sb, bitmap_blk); |
140 | if (unlikely(!bh)) { | 140 | if (unlikely(!bh)) { |
141 | ext4_error(sb, "Cannot read inode bitmap - " | 141 | ext4_warning(sb, "Cannot read inode bitmap - " |
142 | "block_group = %u, inode_bitmap = %llu", | 142 | "block_group = %u, inode_bitmap = %llu", |
143 | block_group, bitmap_blk); | 143 | block_group, bitmap_blk); |
144 | return ERR_PTR(-ENOMEM); | 144 | return ERR_PTR(-ENOMEM); |
145 | } | 145 | } |
146 | if (bitmap_uptodate(bh)) | 146 | if (bitmap_uptodate(bh)) |
@@ -1086,7 +1086,7 @@ got: | |||
1086 | /* This is the optimal IO size (for stat), not the fs block size */ | 1086 | /* This is the optimal IO size (for stat), not the fs block size */ |
1087 | inode->i_blocks = 0; | 1087 | inode->i_blocks = 0; |
1088 | inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); | 1088 | inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); |
1089 | ei->i_crtime = timespec64_to_timespec(inode->i_mtime); | 1089 | ei->i_crtime = inode->i_mtime; |
1090 | 1090 | ||
1091 | memset(ei->i_data, 0, sizeof(ei->i_data)); | 1091 | memset(ei->i_data, 0, sizeof(ei->i_data)); |
1092 | ei->i_dir_start_lookup = 0; | 1092 | ei->i_dir_start_lookup = 0; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4efe77286ecd..8f6ad7667974 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -317,7 +317,7 @@ stop_handle: | |||
317 | * (Well, we could do this if we need to, but heck - it works) | 317 | * (Well, we could do this if we need to, but heck - it works) |
318 | */ | 318 | */ |
319 | ext4_orphan_del(handle, inode); | 319 | ext4_orphan_del(handle, inode); |
320 | EXT4_I(inode)->i_dtime = get_seconds(); | 320 | EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); |
321 | 321 | ||
322 | /* | 322 | /* |
323 | * One subtle ordering requirement: if anything has gone wrong | 323 | * One subtle ordering requirement: if anything has gone wrong |
@@ -4191,6 +4191,39 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, | |||
4191 | return 0; | 4191 | return 0; |
4192 | } | 4192 | } |
4193 | 4193 | ||
4194 | static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock) | ||
4195 | { | ||
4196 | *did_unlock = true; | ||
4197 | up_write(&ei->i_mmap_sem); | ||
4198 | schedule(); | ||
4199 | down_write(&ei->i_mmap_sem); | ||
4200 | } | ||
4201 | |||
4202 | int ext4_break_layouts(struct inode *inode) | ||
4203 | { | ||
4204 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
4205 | struct page *page; | ||
4206 | bool retry; | ||
4207 | int error; | ||
4208 | |||
4209 | if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) | ||
4210 | return -EINVAL; | ||
4211 | |||
4212 | do { | ||
4213 | retry = false; | ||
4214 | page = dax_layout_busy_page(inode->i_mapping); | ||
4215 | if (!page) | ||
4216 | return 0; | ||
4217 | |||
4218 | error = ___wait_var_event(&page->_refcount, | ||
4219 | atomic_read(&page->_refcount) == 1, | ||
4220 | TASK_INTERRUPTIBLE, 0, 0, | ||
4221 | ext4_wait_dax_page(ei, &retry)); | ||
4222 | } while (error == 0 && retry); | ||
4223 | |||
4224 | return error; | ||
4225 | } | ||
4226 | |||
4194 | /* | 4227 | /* |
4195 | * ext4_punch_hole: punches a hole in a file by releasing the blocks | 4228 | * ext4_punch_hole: punches a hole in a file by releasing the blocks |
4196 | * associated with the given offset and length | 4229 | * associated with the given offset and length |
@@ -4264,6 +4297,11 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) | |||
4264 | * page cache. | 4297 | * page cache. |
4265 | */ | 4298 | */ |
4266 | down_write(&EXT4_I(inode)->i_mmap_sem); | 4299 | down_write(&EXT4_I(inode)->i_mmap_sem); |
4300 | |||
4301 | ret = ext4_break_layouts(inode); | ||
4302 | if (ret) | ||
4303 | goto out_dio; | ||
4304 | |||
4267 | first_block_offset = round_up(offset, sb->s_blocksize); | 4305 | first_block_offset = round_up(offset, sb->s_blocksize); |
4268 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; | 4306 | last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; |
4269 | 4307 | ||
@@ -4944,17 +4982,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
4944 | ret = -EFSCORRUPTED; | 4982 | ret = -EFSCORRUPTED; |
4945 | goto bad_inode; | 4983 | goto bad_inode; |
4946 | } else if (!ext4_has_inline_data(inode)) { | 4984 | } else if (!ext4_has_inline_data(inode)) { |
4947 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 4985 | /* validate the block references in the inode */ |
4948 | if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 4986 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
4949 | (S_ISLNK(inode->i_mode) && | 4987 | (S_ISLNK(inode->i_mode) && |
4950 | !ext4_inode_is_fast_symlink(inode)))) | 4988 | !ext4_inode_is_fast_symlink(inode))) { |
4951 | /* Validate extent which is part of inode */ | 4989 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
4952 | ret = ext4_ext_check_inode(inode); | 4990 | ret = ext4_ext_check_inode(inode); |
4953 | } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 4991 | else |
4954 | (S_ISLNK(inode->i_mode) && | 4992 | ret = ext4_ind_check_inode(inode); |
4955 | !ext4_inode_is_fast_symlink(inode))) { | ||
4956 | /* Validate block references which are part of inode */ | ||
4957 | ret = ext4_ind_check_inode(inode); | ||
4958 | } | 4993 | } |
4959 | } | 4994 | } |
4960 | if (ret) | 4995 | if (ret) |
@@ -5553,6 +5588,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
5553 | ext4_wait_for_tail_page_commit(inode); | 5588 | ext4_wait_for_tail_page_commit(inode); |
5554 | } | 5589 | } |
5555 | down_write(&EXT4_I(inode)->i_mmap_sem); | 5590 | down_write(&EXT4_I(inode)->i_mmap_sem); |
5591 | |||
5592 | rc = ext4_break_layouts(inode); | ||
5593 | if (rc) { | ||
5594 | up_write(&EXT4_I(inode)->i_mmap_sem); | ||
5595 | error = rc; | ||
5596 | goto err_out; | ||
5597 | } | ||
5598 | |||
5556 | /* | 5599 | /* |
5557 | * Truncate pagecache after we've waited for commit | 5600 | * Truncate pagecache after we've waited for commit |
5558 | * in data=journal mode to make pages freeable. | 5601 | * in data=journal mode to make pages freeable. |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f7ab34088162..e29fce2fbf25 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/log2.h> | 14 | #include <linux/log2.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/nospec.h> | ||
17 | #include <linux/backing-dev.h> | 18 | #include <linux/backing-dev.h> |
18 | #include <trace/events/ext4.h> | 19 | #include <trace/events/ext4.h> |
19 | 20 | ||
@@ -2140,7 +2141,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
2140 | * This should tell if fe_len is exactly power of 2 | 2141 | * This should tell if fe_len is exactly power of 2 |
2141 | */ | 2142 | */ |
2142 | if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) | 2143 | if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) |
2143 | ac->ac_2order = i - 1; | 2144 | ac->ac_2order = array_index_nospec(i - 1, |
2145 | sb->s_blocksize_bits + 2); | ||
2144 | } | 2146 | } |
2145 | 2147 | ||
2146 | /* if stream allocation is enabled, use global goal */ | 2148 | /* if stream allocation is enabled, use global goal */ |
@@ -3799,7 +3801,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3799 | ext4_group_t group; | 3801 | ext4_group_t group; |
3800 | ext4_grpblk_t bit; | 3802 | ext4_grpblk_t bit; |
3801 | unsigned long long grp_blk_start; | 3803 | unsigned long long grp_blk_start; |
3802 | int err = 0; | ||
3803 | int free = 0; | 3804 | int free = 0; |
3804 | 3805 | ||
3805 | BUG_ON(pa->pa_deleted == 0); | 3806 | BUG_ON(pa->pa_deleted == 0); |
@@ -3840,7 +3841,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3840 | } | 3841 | } |
3841 | atomic_add(free, &sbi->s_mb_discarded); | 3842 | atomic_add(free, &sbi->s_mb_discarded); |
3842 | 3843 | ||
3843 | return err; | 3844 | return 0; |
3844 | } | 3845 | } |
3845 | 3846 | ||
3846 | static noinline_for_stack int | 3847 | static noinline_for_stack int |
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 638ad4743477..39b07c2d3384 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c | |||
@@ -147,7 +147,7 @@ static int kmmpd(void *data) | |||
147 | 147 | ||
148 | mmp_block = le64_to_cpu(es->s_mmp_block); | 148 | mmp_block = le64_to_cpu(es->s_mmp_block); |
149 | mmp = (struct mmp_struct *)(bh->b_data); | 149 | mmp = (struct mmp_struct *)(bh->b_data); |
150 | mmp->mmp_time = cpu_to_le64(get_seconds()); | 150 | mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); |
151 | /* | 151 | /* |
152 | * Start with the higher mmp_check_interval and reduce it if | 152 | * Start with the higher mmp_check_interval and reduce it if |
153 | * the MMP block is being updated on time. | 153 | * the MMP block is being updated on time. |
@@ -165,7 +165,7 @@ static int kmmpd(void *data) | |||
165 | seq = 1; | 165 | seq = 1; |
166 | 166 | ||
167 | mmp->mmp_seq = cpu_to_le32(seq); | 167 | mmp->mmp_seq = cpu_to_le32(seq); |
168 | mmp->mmp_time = cpu_to_le64(get_seconds()); | 168 | mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); |
169 | last_update_time = jiffies; | 169 | last_update_time = jiffies; |
170 | 170 | ||
171 | retval = write_mmp_block(sb, bh); | 171 | retval = write_mmp_block(sb, bh); |
@@ -241,7 +241,7 @@ static int kmmpd(void *data) | |||
241 | * Unmount seems to be clean. | 241 | * Unmount seems to be clean. |
242 | */ | 242 | */ |
243 | mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); | 243 | mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); |
244 | mmp->mmp_time = cpu_to_le64(get_seconds()); | 244 | mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds()); |
245 | 245 | ||
246 | retval = write_mmp_block(sb, bh); | 246 | retval = write_mmp_block(sb, bh); |
247 | 247 | ||
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8e17efdcbf11..a409ff70d67b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -134,9 +134,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, | |||
134 | mapping[0] = inode1->i_mapping; | 134 | mapping[0] = inode1->i_mapping; |
135 | mapping[1] = inode2->i_mapping; | 135 | mapping[1] = inode2->i_mapping; |
136 | } else { | 136 | } else { |
137 | pgoff_t tmp = index1; | 137 | swap(index1, index2); |
138 | index1 = index2; | ||
139 | index2 = tmp; | ||
140 | mapping[0] = inode2->i_mapping; | 138 | mapping[0] = inode2->i_mapping; |
141 | mapping[1] = inode1->i_mapping; | 139 | mapping[1] = inode1->i_mapping; |
142 | } | 140 | } |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2a4c25c4681d..116ff68c5bd4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -1398,6 +1398,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, | |||
1398 | goto cleanup_and_exit; | 1398 | goto cleanup_and_exit; |
1399 | dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " | 1399 | dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " |
1400 | "falling back\n")); | 1400 | "falling back\n")); |
1401 | ret = NULL; | ||
1401 | } | 1402 | } |
1402 | nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); | 1403 | nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); |
1403 | if (!nblocks) { | 1404 | if (!nblocks) { |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b7f7922061be..f7750bc5b85a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -312,6 +312,24 @@ void ext4_itable_unused_set(struct super_block *sb, | |||
312 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); | 312 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); |
313 | } | 313 | } |
314 | 314 | ||
315 | static void __ext4_update_tstamp(__le32 *lo, __u8 *hi) | ||
316 | { | ||
317 | time64_t now = ktime_get_real_seconds(); | ||
318 | |||
319 | now = clamp_val(now, 0, (1ull << 40) - 1); | ||
320 | |||
321 | *lo = cpu_to_le32(lower_32_bits(now)); | ||
322 | *hi = upper_32_bits(now); | ||
323 | } | ||
324 | |||
325 | static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) | ||
326 | { | ||
327 | return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); | ||
328 | } | ||
329 | #define ext4_update_tstamp(es, tstamp) \ | ||
330 | __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) | ||
331 | #define ext4_get_tstamp(es, tstamp) \ | ||
332 | __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) | ||
315 | 333 | ||
316 | static void __save_error_info(struct super_block *sb, const char *func, | 334 | static void __save_error_info(struct super_block *sb, const char *func, |
317 | unsigned int line) | 335 | unsigned int line) |
@@ -322,11 +340,12 @@ static void __save_error_info(struct super_block *sb, const char *func, | |||
322 | if (bdev_read_only(sb->s_bdev)) | 340 | if (bdev_read_only(sb->s_bdev)) |
323 | return; | 341 | return; |
324 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | 342 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); |
325 | es->s_last_error_time = cpu_to_le32(get_seconds()); | 343 | ext4_update_tstamp(es, s_last_error_time); |
326 | strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); | 344 | strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); |
327 | es->s_last_error_line = cpu_to_le32(line); | 345 | es->s_last_error_line = cpu_to_le32(line); |
328 | if (!es->s_first_error_time) { | 346 | if (!es->s_first_error_time) { |
329 | es->s_first_error_time = es->s_last_error_time; | 347 | es->s_first_error_time = es->s_last_error_time; |
348 | es->s_first_error_time_hi = es->s_last_error_time_hi; | ||
330 | strncpy(es->s_first_error_func, func, | 349 | strncpy(es->s_first_error_func, func, |
331 | sizeof(es->s_first_error_func)); | 350 | sizeof(es->s_first_error_func)); |
332 | es->s_first_error_line = cpu_to_le32(line); | 351 | es->s_first_error_line = cpu_to_le32(line); |
@@ -776,26 +795,26 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb, | |||
776 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 795 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
777 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | 796 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); |
778 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); | 797 | struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); |
798 | int ret; | ||
779 | 799 | ||
780 | if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && | 800 | if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { |
781 | !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { | 801 | ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, |
782 | percpu_counter_sub(&sbi->s_freeclusters_counter, | 802 | &grp->bb_state); |
783 | grp->bb_free); | 803 | if (!ret) |
784 | set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, | 804 | percpu_counter_sub(&sbi->s_freeclusters_counter, |
785 | &grp->bb_state); | 805 | grp->bb_free); |
786 | } | 806 | } |
787 | 807 | ||
788 | if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && | 808 | if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { |
789 | !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { | 809 | ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, |
790 | if (gdp) { | 810 | &grp->bb_state); |
811 | if (!ret && gdp) { | ||
791 | int count; | 812 | int count; |
792 | 813 | ||
793 | count = ext4_free_inodes_count(sb, gdp); | 814 | count = ext4_free_inodes_count(sb, gdp); |
794 | percpu_counter_sub(&sbi->s_freeinodes_counter, | 815 | percpu_counter_sub(&sbi->s_freeinodes_counter, |
795 | count); | 816 | count); |
796 | } | 817 | } |
797 | set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, | ||
798 | &grp->bb_state); | ||
799 | } | 818 | } |
800 | } | 819 | } |
801 | 820 | ||
@@ -2174,8 +2193,8 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
2174 | "warning: maximal mount count reached, " | 2193 | "warning: maximal mount count reached, " |
2175 | "running e2fsck is recommended"); | 2194 | "running e2fsck is recommended"); |
2176 | else if (le32_to_cpu(es->s_checkinterval) && | 2195 | else if (le32_to_cpu(es->s_checkinterval) && |
2177 | (le32_to_cpu(es->s_lastcheck) + | 2196 | (ext4_get_tstamp(es, s_lastcheck) + |
2178 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) | 2197 | le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) |
2179 | ext4_msg(sb, KERN_WARNING, | 2198 | ext4_msg(sb, KERN_WARNING, |
2180 | "warning: checktime reached, " | 2199 | "warning: checktime reached, " |
2181 | "running e2fsck is recommended"); | 2200 | "running e2fsck is recommended"); |
@@ -2184,7 +2203,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
2184 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) | 2203 | if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) |
2185 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); | 2204 | es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); |
2186 | le16_add_cpu(&es->s_mnt_count, 1); | 2205 | le16_add_cpu(&es->s_mnt_count, 1); |
2187 | es->s_mtime = cpu_to_le32(get_seconds()); | 2206 | ext4_update_tstamp(es, s_mtime); |
2188 | ext4_update_dynamic_rev(sb); | 2207 | ext4_update_dynamic_rev(sb); |
2189 | if (sbi->s_journal) | 2208 | if (sbi->s_journal) |
2190 | ext4_set_feature_journal_needs_recovery(sb); | 2209 | ext4_set_feature_journal_needs_recovery(sb); |
@@ -2875,8 +2894,9 @@ static void print_daily_error_info(struct timer_list *t) | |||
2875 | ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", | 2894 | ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", |
2876 | le32_to_cpu(es->s_error_count)); | 2895 | le32_to_cpu(es->s_error_count)); |
2877 | if (es->s_first_error_time) { | 2896 | if (es->s_first_error_time) { |
2878 | printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", | 2897 | printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", |
2879 | sb->s_id, le32_to_cpu(es->s_first_error_time), | 2898 | sb->s_id, |
2899 | ext4_get_tstamp(es, s_first_error_time), | ||
2880 | (int) sizeof(es->s_first_error_func), | 2900 | (int) sizeof(es->s_first_error_func), |
2881 | es->s_first_error_func, | 2901 | es->s_first_error_func, |
2882 | le32_to_cpu(es->s_first_error_line)); | 2902 | le32_to_cpu(es->s_first_error_line)); |
@@ -2889,8 +2909,9 @@ static void print_daily_error_info(struct timer_list *t) | |||
2889 | printk(KERN_CONT "\n"); | 2909 | printk(KERN_CONT "\n"); |
2890 | } | 2910 | } |
2891 | if (es->s_last_error_time) { | 2911 | if (es->s_last_error_time) { |
2892 | printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", | 2912 | printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", |
2893 | sb->s_id, le32_to_cpu(es->s_last_error_time), | 2913 | sb->s_id, |
2914 | ext4_get_tstamp(es, s_last_error_time), | ||
2894 | (int) sizeof(es->s_last_error_func), | 2915 | (int) sizeof(es->s_last_error_func), |
2895 | es->s_last_error_func, | 2916 | es->s_last_error_func, |
2896 | le32_to_cpu(es->s_last_error_line)); | 2917 | le32_to_cpu(es->s_last_error_line)); |
@@ -4813,7 +4834,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) | |||
4813 | * to complain and force a full file system check. | 4834 | * to complain and force a full file system check. |
4814 | */ | 4835 | */ |
4815 | if (!(sb->s_flags & SB_RDONLY)) | 4836 | if (!(sb->s_flags & SB_RDONLY)) |
4816 | es->s_wtime = cpu_to_le32(get_seconds()); | 4837 | ext4_update_tstamp(es, s_wtime); |
4817 | if (sb->s_bdev->bd_part) | 4838 | if (sb->s_bdev->bd_part) |
4818 | es->s_kbytes_written = | 4839 | es->s_kbytes_written = |
4819 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + | 4840 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + |
@@ -5080,6 +5101,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
5080 | #endif | 5101 | #endif |
5081 | char *orig_data = kstrdup(data, GFP_KERNEL); | 5102 | char *orig_data = kstrdup(data, GFP_KERNEL); |
5082 | 5103 | ||
5104 | if (data && !orig_data) | ||
5105 | return -ENOMEM; | ||
5106 | |||
5083 | /* Store the original options */ | 5107 | /* Store the original options */ |
5084 | old_sb_flags = sb->s_flags; | 5108 | old_sb_flags = sb->s_flags; |
5085 | old_opts.s_mount_opt = sbi->s_mount_opt; | 5109 | old_opts.s_mount_opt = sbi->s_mount_opt; |
@@ -5665,13 +5689,13 @@ static int ext4_enable_quotas(struct super_block *sb) | |||
5665 | DQUOT_USAGE_ENABLED | | 5689 | DQUOT_USAGE_ENABLED | |
5666 | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); | 5690 | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); |
5667 | if (err) { | 5691 | if (err) { |
5668 | for (type--; type >= 0; type--) | ||
5669 | dquot_quota_off(sb, type); | ||
5670 | |||
5671 | ext4_warning(sb, | 5692 | ext4_warning(sb, |
5672 | "Failed to enable quota tracking " | 5693 | "Failed to enable quota tracking " |
5673 | "(type=%d, err=%d). Please run " | 5694 | "(type=%d, err=%d). Please run " |
5674 | "e2fsck to fix.", type, err); | 5695 | "e2fsck to fix.", type, err); |
5696 | for (type--; type >= 0; type--) | ||
5697 | dquot_quota_off(sb, type); | ||
5698 | |||
5675 | return err; | 5699 | return err; |
5676 | } | 5700 | } |
5677 | } | 5701 | } |
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index f34da0bb8f17..e60cc5e89023 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c | |||
@@ -25,6 +25,8 @@ typedef enum { | |||
25 | attr_reserved_clusters, | 25 | attr_reserved_clusters, |
26 | attr_inode_readahead, | 26 | attr_inode_readahead, |
27 | attr_trigger_test_error, | 27 | attr_trigger_test_error, |
28 | attr_first_error_time, | ||
29 | attr_last_error_time, | ||
28 | attr_feature, | 30 | attr_feature, |
29 | attr_pointer_ui, | 31 | attr_pointer_ui, |
30 | attr_pointer_atomic, | 32 | attr_pointer_atomic, |
@@ -182,8 +184,8 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); | |||
182 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); | 184 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); |
183 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); | 185 | EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); |
184 | EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); | 186 | EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); |
185 | EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); | 187 | EXT4_ATTR(first_error_time, 0444, first_error_time); |
186 | EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); | 188 | EXT4_ATTR(last_error_time, 0444, last_error_time); |
187 | 189 | ||
188 | static unsigned int old_bump_val = 128; | 190 | static unsigned int old_bump_val = 128; |
189 | EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); | 191 | EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); |
@@ -249,6 +251,15 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) | |||
249 | return NULL; | 251 | return NULL; |
250 | } | 252 | } |
251 | 253 | ||
254 | static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) | ||
255 | { | ||
256 | return snprintf(buf, PAGE_SIZE, "%lld", | ||
257 | ((time64_t)hi << 32) + le32_to_cpu(lo)); | ||
258 | } | ||
259 | |||
260 | #define print_tstamp(buf, es, tstamp) \ | ||
261 | __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) | ||
262 | |||
252 | static ssize_t ext4_attr_show(struct kobject *kobj, | 263 | static ssize_t ext4_attr_show(struct kobject *kobj, |
253 | struct attribute *attr, char *buf) | 264 | struct attribute *attr, char *buf) |
254 | { | 265 | { |
@@ -274,8 +285,12 @@ static ssize_t ext4_attr_show(struct kobject *kobj, | |||
274 | case attr_pointer_ui: | 285 | case attr_pointer_ui: |
275 | if (!ptr) | 286 | if (!ptr) |
276 | return 0; | 287 | return 0; |
277 | return snprintf(buf, PAGE_SIZE, "%u\n", | 288 | if (a->attr_ptr == ptr_ext4_super_block_offset) |
278 | *((unsigned int *) ptr)); | 289 | return snprintf(buf, PAGE_SIZE, "%u\n", |
290 | le32_to_cpup(ptr)); | ||
291 | else | ||
292 | return snprintf(buf, PAGE_SIZE, "%u\n", | ||
293 | *((unsigned int *) ptr)); | ||
279 | case attr_pointer_atomic: | 294 | case attr_pointer_atomic: |
280 | if (!ptr) | 295 | if (!ptr) |
281 | return 0; | 296 | return 0; |
@@ -283,6 +298,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj, | |||
283 | atomic_read((atomic_t *) ptr)); | 298 | atomic_read((atomic_t *) ptr)); |
284 | case attr_feature: | 299 | case attr_feature: |
285 | return snprintf(buf, PAGE_SIZE, "supported\n"); | 300 | return snprintf(buf, PAGE_SIZE, "supported\n"); |
301 | case attr_first_error_time: | ||
302 | return print_tstamp(buf, sbi->s_es, s_first_error_time); | ||
303 | case attr_last_error_time: | ||
304 | return print_tstamp(buf, sbi->s_es, s_last_error_time); | ||
286 | } | 305 | } |
287 | 306 | ||
288 | return 0; | 307 | return 0; |
@@ -308,7 +327,10 @@ static ssize_t ext4_attr_store(struct kobject *kobj, | |||
308 | ret = kstrtoul(skip_spaces(buf), 0, &t); | 327 | ret = kstrtoul(skip_spaces(buf), 0, &t); |
309 | if (ret) | 328 | if (ret) |
310 | return ret; | 329 | return ret; |
311 | *((unsigned int *) ptr) = t; | 330 | if (a->attr_ptr == ptr_ext4_super_block_offset) |
331 | *((__le32 *) ptr) = cpu_to_le32(t); | ||
332 | else | ||
333 | *((unsigned int *) ptr) = t; | ||
312 | return len; | 334 | return len; |
313 | case attr_inode_readahead: | 335 | case attr_inode_readahead: |
314 | return inode_readahead_blks_store(sbi, buf, len); | 336 | return inode_readahead_blks_store(sbi, buf, len); |
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 0cb13badf473..bcbe3668c1d4 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h | |||
@@ -11,6 +11,10 @@ | |||
11 | */ | 11 | */ |
12 | static inline void ext4_truncate_failed_write(struct inode *inode) | 12 | static inline void ext4_truncate_failed_write(struct inode *inode) |
13 | { | 13 | { |
14 | /* | ||
15 | * We don't need to call ext4_break_layouts() because the blocks we | ||
16 | * are truncating were never visible to userspace. | ||
17 | */ | ||
14 | down_write(&EXT4_I(inode)->i_mmap_sem); | 18 | down_write(&EXT4_I(inode)->i_mmap_sem); |
15 | truncate_inode_pages(inode->i_mapping, inode->i_size); | 19 | truncate_inode_pages(inode->i_mapping, inode->i_size); |
16 | ext4_truncate(inode); | 20 | ext4_truncate(inode); |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 723df14f4084..f36fc5d5b257 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -190,6 +190,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, | |||
190 | struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); | 190 | struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); |
191 | if ((void *)next >= end) | 191 | if ((void *)next >= end) |
192 | return -EFSCORRUPTED; | 192 | return -EFSCORRUPTED; |
193 | if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) | ||
194 | return -EFSCORRUPTED; | ||
193 | e = next; | 195 | e = next; |
194 | } | 196 | } |
195 | 197 | ||
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8de0e7723316..150cc030b4d7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -121,7 +121,7 @@ static int journal_submit_commit_record(journal_t *journal, | |||
121 | struct commit_header *tmp; | 121 | struct commit_header *tmp; |
122 | struct buffer_head *bh; | 122 | struct buffer_head *bh; |
123 | int ret; | 123 | int ret; |
124 | struct timespec64 now = current_kernel_time64(); | 124 | struct timespec64 now; |
125 | 125 | ||
126 | *cbh = NULL; | 126 | *cbh = NULL; |
127 | 127 | ||
@@ -134,6 +134,7 @@ static int journal_submit_commit_record(journal_t *journal, | |||
134 | return 1; | 134 | return 1; |
135 | 135 | ||
136 | tmp = (struct commit_header *)bh->b_data; | 136 | tmp = (struct commit_header *)bh->b_data; |
137 | ktime_get_coarse_real_ts64(&now); | ||
137 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); | 138 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); |
138 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); | 139 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); |
139 | 140 | ||