aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-14 01:34:47 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-14 01:34:47 -0400
commit10f3e23f07cb0c20f9bcb77a5b5a7eb2a1b2a2fe (patch)
tree1fcb34309b3542512c6f3345f092f7adb8c3312c
parent3bb37da509e576c80180fa0e4d1cfcaddf0cb82e (diff)
parent863c37fcb14f8b66ea831b45fb35a53ac4a8d69e (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: - Convert content from the ext4 wiki to Documentation rst files so it is more likely to be updated as we add new features to ext4. - Add 64-bit timestamp support to ext4's superblock fields. - ... and the usual bug fixes and cleanups, including a Spectre gadget fixup and some hardening against maliciously corrupted file systems. * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (34 commits) ext4: remove unneeded variable "err" in ext4_mb_release_inode_pa() ext4: improve code readability in ext4_iget() ext4: fix spectre gadget in ext4_mb_regular_allocator() ext4: check for NUL characters in extended attribute's name ext4: use ext4_warning() for sb_getblk failure ext4: fix race when setting the bitmap corrupted flag ext4: reset error code in ext4_find_entry in fallback ext4: handle layout changes to pinned DAX mappings dax: dax_layout_busy_page() warn on !exceptional docs: fix up the obviously obsolete bits in the new ext4 documentation docs: add new ext4 superblock time extension fields docs: create filesystem internal section ext4: use swap macro in mext_page_double_lock ext4: check allocation failure when duplicating "data" in ext4_remount() ext4: fix warning message in ext4_enable_quotas() ext4: super: extend timestamps to 40 bits jbd2: replace current_kernel_time64 with ktime equivalent ext4: use timespec64 for all inode times ext4: use ktime_get_real_seconds for i_dtime ext4: use 64-bit timestamps for mmp_time ...
-rw-r--r--Documentation/conf.py2
-rw-r--r--Documentation/filesystems/ext4/ext4.rst (renamed from Documentation/filesystems/ext4.txt)142
-rw-r--r--Documentation/filesystems/ext4/index.rst17
-rw-r--r--Documentation/filesystems/ext4/ondisk/about.rst44
-rw-r--r--Documentation/filesystems/ext4/ondisk/allocators.rst56
-rw-r--r--Documentation/filesystems/ext4/ondisk/attributes.rst191
-rw-r--r--Documentation/filesystems/ext4/ondisk/bigalloc.rst22
-rw-r--r--Documentation/filesystems/ext4/ondisk/bitmaps.rst28
-rw-r--r--Documentation/filesystems/ext4/ondisk/blockgroup.rst135
-rw-r--r--Documentation/filesystems/ext4/ondisk/blockmap.rst49
-rw-r--r--Documentation/filesystems/ext4/ondisk/blocks.rst142
-rw-r--r--Documentation/filesystems/ext4/ondisk/checksums.rst73
-rw-r--r--Documentation/filesystems/ext4/ondisk/directory.rst426
-rw-r--r--Documentation/filesystems/ext4/ondisk/dynamic.rst12
-rw-r--r--Documentation/filesystems/ext4/ondisk/eainode.rst18
-rw-r--r--Documentation/filesystems/ext4/ondisk/globals.rst13
-rw-r--r--Documentation/filesystems/ext4/ondisk/group_descr.rst170
-rw-r--r--Documentation/filesystems/ext4/ondisk/ifork.rst194
-rw-r--r--Documentation/filesystems/ext4/ondisk/index.rst9
-rw-r--r--Documentation/filesystems/ext4/ondisk/inlinedata.rst37
-rw-r--r--Documentation/filesystems/ext4/ondisk/inodes.rst575
-rw-r--r--Documentation/filesystems/ext4/ondisk/journal.rst611
-rw-r--r--Documentation/filesystems/ext4/ondisk/mmp.rst77
-rw-r--r--Documentation/filesystems/ext4/ondisk/overview.rst26
-rw-r--r--Documentation/filesystems/ext4/ondisk/special_inodes.rst38
-rw-r--r--Documentation/filesystems/ext4/ondisk/super.rst801
-rw-r--r--Documentation/index.rst11
-rw-r--r--fs/dax.c10
-rw-r--r--fs/ext4/balloc.c6
-rw-r--r--fs/ext4/ext4.h32
-rw-r--r--fs/ext4/extents.c17
-rw-r--r--fs/ext4/ialloc.c8
-rw-r--r--fs/ext4/inode.c65
-rw-r--r--fs/ext4/mballoc.c7
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/move_extent.c4
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/ext4/super.c70
-rw-r--r--fs/ext4/sysfs.c32
-rw-r--r--fs/ext4/truncate.h4
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/jbd2/commit.c3
42 files changed, 4036 insertions, 150 deletions
diff --git a/Documentation/conf.py b/Documentation/conf.py
index 62ac5a9f3a9f..b691af4831fa 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -34,7 +34,7 @@ needs_sphinx = '1.3'
34# Add any Sphinx extension module names here, as strings. They can be 34# Add any Sphinx extension module names here, as strings. They can be
35# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36# ones. 36# ones.
37extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure'] 37extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig']
38 38
39# The name of the math extension changed on Sphinx 1.4 39# The name of the math extension changed on Sphinx 1.4
40if major == 1 and minor > 3: 40if major == 1 and minor > 3:
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4/ext4.rst
index 7f628b9f7c4b..9d4368d591fa 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4/ext4.rst
@@ -1,6 +1,8 @@
1.. SPDX-License-Identifier: GPL-2.0
1 2
2Ext4 Filesystem 3========================
3=============== 4General Information
5========================
4 6
5Ext4 is an advanced level of the ext3 filesystem which incorporates 7Ext4 is an advanced level of the ext3 filesystem which incorporates
6scalability and reliability enhancements for supporting large filesystems 8scalability and reliability enhancements for supporting large filesystems
@@ -11,37 +13,30 @@ Mailing list: linux-ext4@vger.kernel.org
11Web site: http://ext4.wiki.kernel.org 13Web site: http://ext4.wiki.kernel.org
12 14
13 15
141. Quick usage instructions: 16Quick usage instructions
15=========================== 17========================
16 18
17Note: More extensive information for getting started with ext4 can be 19Note: More extensive information for getting started with ext4 can be
18 found at the ext4 wiki site at the URL: 20found at the ext4 wiki site at the URL:
19 http://ext4.wiki.kernel.org/index.php/Ext4_Howto 21http://ext4.wiki.kernel.org/index.php/Ext4_Howto
20 22
21 - Compile and install the latest version of e2fsprogs (as of this 23 - The latest version of e2fsprogs can be found at:
22 writing version 1.41.3) from: 24
25 https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
23 26
24 http://sourceforge.net/project/showfiles.php?group_id=2406
25
26 or 27 or
27 28
28 https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ 29 http://sourceforge.net/project/showfiles.php?group_id=2406
29 30
30 or grab the latest git repository from: 31 or grab the latest git repository from:
31 32
32 git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git 33 https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
33
34 - Note that it is highly important to install the mke2fs.conf file
35 that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
36 you have edited the /etc/mke2fs.conf file installed on your system,
37 you will need to merge your changes with the version from e2fsprogs
38 1.41.x.
39 34
40 - Create a new filesystem using the ext4 filesystem type: 35 - Create a new filesystem using the ext4 filesystem type:
41 36
42 # mke2fs -t ext4 /dev/hda1 37 # mke2fs -t ext4 /dev/hda1
43 38
44 Or to configure an existing ext3 filesystem to support extents: 39 Or to configure an existing ext3 filesystem to support extents:
45 40
46 # tune2fs -O extents /dev/hda1 41 # tune2fs -O extents /dev/hda1
47 42
@@ -50,10 +45,6 @@ Note: More extensive information for getting started with ext4 can be
50 45
51 # tune2fs -I 256 /dev/hda1 46 # tune2fs -I 256 /dev/hda1
52 47
53 (Note: we currently do not have tools to convert an ext4
54 filesystem back to ext3; so please do not do try this on production
55 filesystems.)
56
57 - Mounting: 48 - Mounting:
58 49
59 # mount -t ext4 /dev/hda1 /wherever 50 # mount -t ext4 /dev/hda1 /wherever
@@ -75,10 +66,11 @@ Note: More extensive information for getting started with ext4 can be
75 the filesystem with a large journal can also be helpful for 66 the filesystem with a large journal can also be helpful for
76 metadata-intensive workloads. 67 metadata-intensive workloads.
77 68
782. Features 69Features
79=========== 70========
80 71
812.1 Currently available 72Currently Available
73-------------------
82 74
83* ability to use filesystems > 16TB (e2fsprogs support not available yet) 75* ability to use filesystems > 16TB (e2fsprogs support not available yet)
84* extent format reduces metadata overhead (RAM, IO for access, transactions) 76* extent format reduces metadata overhead (RAM, IO for access, transactions)
@@ -103,31 +95,15 @@ Note: More extensive information for getting started with ext4 can be
103[1] Filesystems with a block size of 1k may see a limit imposed by the 95[1] Filesystems with a block size of 1k may see a limit imposed by the
104directory hash tree having a maximum depth of two. 96directory hash tree having a maximum depth of two.
105 97
1062.2 Candidate features for future inclusion 98Options
107 99=======
108* online defrag (patches available but not well tested)
109* reduced mke2fs time via lazy itable initialization in conjunction with
110 the uninit_bg feature (capability to do this is available in e2fsprogs
111 but a kernel thread to do lazy zeroing of unused inode table blocks
112 after filesystem is first mounted is required for safety)
113
114There are several others under discussion, whether they all make it in is
115partly a function of how much time everyone has to work on them. Features like
116metadata checksumming have been discussed and planned for a bit but no patches
117exist yet so I'm not sure they're in the near-term roadmap.
118
119The big performance win will come with mballoc, delalloc and flex_bg
120grouping of bitmaps and inode tables. Some test results available here:
121
122 - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
123 - http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html
124
1253. Options
126==========
127 100
128When mounting an ext4 filesystem, the following options are accepted: 101When mounting an ext4 filesystem, the following options are accepted:
129(*) == default 102(*) == default
130 103
104======================= =======================================================
105Mount Option Description
106======================= =======================================================
131ro Mount filesystem read only. Note that ext4 will 107ro Mount filesystem read only. Note that ext4 will
132 replay the journal (and thus write to the 108 replay the journal (and thus write to the
133 partition) even when mounted "read only". The 109 partition) even when mounted "read only". The
@@ -387,33 +363,38 @@ i_version Enable 64-bit inode version support. This option is
387dax Use direct access (no page cache). See 363dax Use direct access (no page cache). See
388 Documentation/filesystems/dax.txt. Note that 364 Documentation/filesystems/dax.txt. Note that
389 this option is incompatible with data=journal. 365 this option is incompatible with data=journal.
366======================= =======================================================
390 367
391Data Mode 368Data Mode
392========= 369=========
393There are 3 different data modes: 370There are 3 different data modes:
394 371
395* writeback mode 372* writeback mode
396In data=writeback mode, ext4 does not journal data at all. This mode provides 373
397a similar level of journaling as that of XFS, JFS, and ReiserFS in its default 374 In data=writeback mode, ext4 does not journal data at all. This mode provides
398mode - metadata journaling. A crash+recovery can cause incorrect data to 375 a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
399appear in files which were written shortly before the crash. This mode will 376 mode - metadata journaling. A crash+recovery can cause incorrect data to
400typically provide the best ext4 performance. 377 appear in files which were written shortly before the crash. This mode will
378 typically provide the best ext4 performance.
401 379
402* ordered mode 380* ordered mode
403In data=ordered mode, ext4 only officially journals metadata, but it logically 381
404groups metadata information related to data changes with the data blocks into a 382 In data=ordered mode, ext4 only officially journals metadata, but it logically
405single unit called a transaction. When it's time to write the new metadata 383 groups metadata information related to data changes with the data blocks into
406out to disk, the associated data blocks are written first. In general, 384 a single unit called a transaction. When it's time to write the new metadata
407this mode performs slightly slower than writeback but significantly faster than journal mode. 385 out to disk, the associated data blocks are written first. In general, this
386 mode performs slightly slower than writeback but significantly faster than
387 journal mode.
408 388
409* journal mode 389* journal mode
410data=journal mode provides full data and metadata journaling. All new data is 390
411written to the journal first, and then to its final location. 391 data=journal mode provides full data and metadata journaling. All new data is
412In the event of a crash, the journal can be replayed, bringing both data and 392 written to the journal first, and then to its final location. In the event of
413metadata into a consistent state. This mode is the slowest except when data 393 a crash, the journal can be replayed, bringing both data and metadata into a
414needs to be read from and written to disk at the same time where it 394 consistent state. This mode is the slowest except when data needs to be read
415outperforms all other modes. Enabling this mode will disable delayed 395 from and written to disk at the same time where it outperforms all other
416allocation and O_DIRECT support. 396 modes. Enabling this mode will disable delayed allocation and O_DIRECT
397 support.
417 398
418/proc entries 399/proc entries
419============= 400=============
@@ -425,10 +406,12 @@ Information about mounted ext4 file systems can be found in
425in table below. 406in table below.
426 407
427Files in /proc/fs/ext4/<devname> 408Files in /proc/fs/ext4/<devname>
428.............................................................................. 409
410================ =======
429 File Content 411 File Content
412================ =======
430 mb_groups details of multiblock allocator buddy cache of free blocks 413 mb_groups details of multiblock allocator buddy cache of free blocks
431.............................................................................. 414================ =======
432 415
433/sys entries 416/sys entries
434============ 417============
@@ -439,28 +422,30 @@ Information about mounted ext4 file systems can be found in
439/sys/fs/ext4/dm-0). The files in each per-device directory are shown 422/sys/fs/ext4/dm-0). The files in each per-device directory are shown
440in table below. 423in table below.
441 424
442Files in /sys/fs/ext4/<devname> 425Files in /sys/fs/ext4/<devname>:
426
443(see also Documentation/ABI/testing/sysfs-fs-ext4) 427(see also Documentation/ABI/testing/sysfs-fs-ext4)
444..............................................................................
445 File Content
446 428
429============================= =================================================
430File Content
431============================= =================================================
447 delayed_allocation_blocks This file is read-only and shows the number of 432 delayed_allocation_blocks This file is read-only and shows the number of
448 blocks that are dirty in the page cache, but 433 blocks that are dirty in the page cache, but
449 which do not have their location in the 434 which do not have their location in the
450 filesystem allocated yet. 435 filesystem allocated yet.
451 436
452 inode_goal Tuning parameter which (if non-zero) controls 437inode_goal Tuning parameter which (if non-zero) controls
453 the goal inode used by the inode allocator in 438 the goal inode used by the inode allocator in
454 preference to all other allocation heuristics. 439 preference to all other allocation heuristics.
455 This is intended for debugging use only, and 440 This is intended for debugging use only, and
456 should be 0 on production systems. 441 should be 0 on production systems.
457 442
458 inode_readahead_blks Tuning parameter which controls the maximum 443inode_readahead_blks Tuning parameter which controls the maximum
459 number of inode table blocks that ext4's inode 444 number of inode table blocks that ext4's inode
460 table readahead algorithm will pre-read into 445 table readahead algorithm will pre-read into
461 the buffer cache 446 the buffer cache
462 447
463 lifetime_write_kbytes This file is read-only and shows the number of 448lifetime_write_kbytes This file is read-only and shows the number of
464 kilobytes of data that have been written to this 449 kilobytes of data that have been written to this
465 filesystem since it was created. 450 filesystem since it was created.
466 451
@@ -508,7 +493,7 @@ Files in /sys/fs/ext4/<devname>
508 in the file system. If there is not enough space 493 in the file system. If there is not enough space
509 for the reserved space when mounting the file 494 for the reserved space when mounting the file
510 mount will _not_ fail. 495 mount will _not_ fail.
511.............................................................................. 496============================= =================================================
512 497
513Ioctls 498Ioctls
514====== 499======
@@ -518,8 +503,10 @@ through the system call interfaces. The list of all Ext4 specific ioctls are
518shown in the table below. 503shown in the table below.
519 504
520Table of Ext4 specific ioctls 505Table of Ext4 specific ioctls
521.............................................................................. 506
522 Ioctl Description 507============================= =================================================
508Ioctl Description
509============================= =================================================
523 EXT4_IOC_GETFLAGS Get additional attributes associated with inode. 510 EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
524 The ioctl argument is an integer bitfield, with 511 The ioctl argument is an integer bitfield, with
525 bit values described in ext4.h. This ioctl is an 512 bit values described in ext4.h. This ioctl is an
@@ -610,8 +597,7 @@ Table of Ext4 specific ioctls
610 normal user by accident. 597 normal user by accident.
611 The data blocks of the previous boot loader 598 The data blocks of the previous boot loader
612 will be associated with the given inode. 599 will be associated with the given inode.
613 600============================= =================================================
614..............................................................................
615 601
616References 602References
617========== 603==========
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst
new file mode 100644
index 000000000000..71121605558c
--- /dev/null
+++ b/Documentation/filesystems/ext4/index.rst
@@ -0,0 +1,17 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3===============
4ext4 Filesystem
5===============
6
7General usage and on-disk artifacts written by ext4. More documentation may
8be ported from the wiki as time permits. This should be considered the
9canonical source of information as the details here have been reviewed by
10the ext4 community.
11
12.. toctree::
13 :maxdepth: 5
14 :numbered:
15
16 ext4
17 ondisk/index
diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/ondisk/about.rst
new file mode 100644
index 000000000000..0aadba052264
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/about.rst
@@ -0,0 +1,44 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3About this Book
4===============
5
6This document attempts to describe the on-disk format for ext4
7filesystems. The same general ideas should apply to ext2/3 filesystems
8as well, though they do not support all the features that ext4 supports,
9and the fields will be shorter.
10
11**NOTE**: This is a work in progress, based on notes that the author
12(djwong) made while picking apart a filesystem by hand. The data
13structure definitions should be current as of Linux 4.18 and
14e2fsprogs-1.44. All comments and corrections are welcome, since there is
15undoubtedly plenty of lore that might not be reflected in freshly
16created demonstration filesystems.
17
18License
19-------
20This book is licensed under the terms of the GNU General Public License, v2.
21
22Terminology
23-----------
24
25ext4 divides a storage device into an array of logical blocks both to
26reduce bookkeeping overhead and to increase throughput by forcing larger
27transfer sizes. Generally, the block size will be 4KiB (the same size as
28pages on x86 and the block layer's default block size), though the
29actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes.
30Throughout this document, disk locations are given in terms of these
31logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of
32convenience, the logical block size will be referred to as
33``$block_size`` throughout the rest of the document.
34
35When referenced in ``preformatted text`` blocks, ``sb`` refers to fields
36in the super block, and ``inode`` refers to fields in an inode table
37entry.
38
39Other References
40----------------
41
42Also see http://www.nongnu.org/ext2-doc/ for quite a collection of
43information about ext2/3. Here's another old reference:
44http://wiki.osdev.org/Ext2
diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/ondisk/allocators.rst
new file mode 100644
index 000000000000..7aa85152ace3
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/allocators.rst
@@ -0,0 +1,56 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Block and Inode Allocation Policy
4---------------------------------
5
6ext4 recognizes (better than ext3, anyway) that data locality is
7generally a desirable quality of a filesystem. On a spinning disk,
8keeping related blocks near each other reduces the amount of movement
9that the head actuator and disk must perform to access a data block,
10thus speeding up disk IO. On an SSD there of course are no moving parts,
11but locality can increase the size of each transfer request while
12reducing the total number of requests. This locality may also have the
13effect of concentrating writes on a single erase block, which can speed
14up file rewrites significantly. Therefore, it is useful to reduce
15fragmentation whenever possible.
16
17The first tool that ext4 uses to combat fragmentation is the multi-block
18allocator. When a file is first created, the block allocator
19speculatively allocates 8KiB of disk space to the file on the assumption
20that the space will get written soon. When the file is closed, the
21unused speculative allocations are of course freed, but if the
22speculation is correct (typically the case for full writes of small
23files) then the file data gets written out in a single multi-block
24extent. A second related trick that ext4 uses is delayed allocation.
25Under this scheme, when a file needs more blocks to absorb file writes,
26the filesystem defers deciding the exact placement on the disk until all
27the dirty buffers are being written out to disk. By not committing to a
28particular placement until it's absolutely necessary (the commit timeout
29is hit, or sync() is called, or the kernel runs out of memory), the hope
30is that the filesystem can make better location decisions.
31
32The third trick that ext4 (and ext3) uses is that it tries to keep a
33file's data blocks in the same block group as its inode. This cuts down
34on the seek penalty when the filesystem first has to read a file's inode
35to learn where the file's data blocks live and then seek over to the
36file's data blocks to begin I/O operations.
37
38The fourth trick is that all the inodes in a directory are placed in the
39same block group as the directory, when feasible. The working assumption
40here is that all the files in a directory might be related, therefore it
41is useful to try to keep them all together.
42
43The fifth trick is that the disk volume is cut up into 128MB block
44groups; these mini-containers are used as outlined above to try to
45maintain data locality. However, there is a deliberate quirk -- when a
46directory is created in the root directory, the inode allocator scans
47the block groups and puts that directory into the least heavily loaded
48block group that it can find. This encourages directories to spread out
49over a disk; as the top-level directory/file blobs fill up one block
50group, the allocators simply move on to the next block group. Allegedly
51this scheme evens out the loading on the block groups, though the author
52suspects that the directories which are so unlucky as to land towards
53the end of a spinning drive get a raw deal performance-wise.
54
55Of course if all of these mechanisms fail, one can always use e4defrag
56to defragment files.
diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/ondisk/attributes.rst
new file mode 100644
index 000000000000..0b01b67b81fe
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/attributes.rst
@@ -0,0 +1,191 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Extended Attributes
4-------------------
5
6Extended attributes (xattrs) are typically stored in a separate data
7block on the disk and referenced from inodes via ``inode.i_file_acl*``.
8The first use of extended attributes seems to have been for storing file
9ACLs and other security data (selinux). With the ``user_xattr`` mount
10option it is possible for users to store extended attributes so long as
11all attribute names begin with “user”; this restriction seems to have
12disappeared as of Linux 3.0.
13
14There are two places where extended attributes can be found. The first
15place is between the end of each inode entry and the beginning of the
16next inode entry. For example, if inode.i\_extra\_isize = 28 and
17sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes
18available for in-inode extended attribute storage. The second place
19where extended attributes can be found is in the block pointed to by
20``inode.i_file_acl``. As of Linux 3.11, it is not possible for this
21block to contain a pointer to a second extended attribute block (or even
22the remaining blocks of a cluster). In theory it is possible for each
23attribute's value to be stored in a separate data block, though as of
24Linux 3.11 the code does not permit this.
25
26Keys are generally assumed to be ASCIIZ strings, whereas values can be
27strings or binary data.
28
29Extended attributes, when stored after the inode, have a header
30``ext4_xattr_ibody_header`` that is 4 bytes long:
31
32.. list-table::
33 :widths: 1 1 1 77
34 :header-rows: 1
35
36 * - Offset
37 - Type
38 - Name
39 - Description
40 * - 0x0
41 - \_\_le32
42 - h\_magic
43 - Magic number for identification, 0xEA020000. This value is set by the
44 Linux driver, though e2fsprogs doesn't seem to check it(?)
45
46The beginning of an extended attribute block is in
47``struct ext4_xattr_header``, which is 32 bytes long:
48
49.. list-table::
50 :widths: 1 1 1 77
51 :header-rows: 1
52
53 * - Offset
54 - Type
55 - Name
56 - Description
57 * - 0x0
58 - \_\_le32
59 - h\_magic
60 - Magic number for identification, 0xEA020000.
61 * - 0x4
62 - \_\_le32
63 - h\_refcount
64 - Reference count.
65 * - 0x8
66 - \_\_le32
67 - h\_blocks
68 - Number of disk blocks used.
69 * - 0xC
70 - \_\_le32
71 - h\_hash
72 - Hash value of all attributes.
73 * - 0x10
74 - \_\_le32
75 - h\_checksum
76 - Checksum of the extended attribute block.
77 * - 0x14
78 - \_\_u32
79 - h\_reserved[3]
80 - Zero.
81
82The checksum is calculated against the FS UUID, the 64-bit block number
83of the extended attribute block, and the entire block (header +
84entries).
85
86Following the ``struct ext4_xattr_header`` or
87``struct ext4_xattr_ibody_header`` is an array of
88``struct ext4_xattr_entry``; each of these entries is at least 16 bytes
89long. When stored in an external block, the ``struct ext4_xattr_entry``
90entries must be stored in sorted order. The sort order is
91``e_name_index``, then ``e_name_len``, and finally ``e_name``.
92Attributes stored inside an inode do not need to be stored in sorted order.
93
94.. list-table::
95 :widths: 1 1 1 77
96 :header-rows: 1
97
98 * - Offset
99 - Type
100 - Name
101 - Description
102 * - 0x0
103 - \_\_u8
104 - e\_name\_len
105 - Length of name.
106 * - 0x1
107 - \_\_u8
108 - e\_name\_index
109 - Attribute name index. There is a discussion of this below.
110 * - 0x2
111 - \_\_le16
112 - e\_value\_offs
113 - Location of this attribute's value on the disk block where it is stored.
114 Multiple attributes can share the same value. For an inode attribute
115 this value is relative to the start of the first entry; for a block this
116 value is relative to the start of the block (i.e. the header).
117 * - 0x4
118 - \_\_le32
119 - e\_value\_inum
120 - The inode where the value is stored. Zero indicates the value is in the
121 same block as this entry. This field is only used if the
122 INCOMPAT\_EA\_INODE feature is enabled.
123 * - 0x8
124 - \_\_le32
125 - e\_value\_size
126 - Length of attribute value.
127 * - 0xC
128 - \_\_le32
129 - e\_hash
130 - Hash value of attribute name and attribute value. The kernel doesn't
131 update the hash for in-inode attributes, so for that case this value
132 must be zero, because e2fsck validates any non-zero hash regardless of
133 where the xattr lives.
134 * - 0x10
135 - char
136 - e\_name[e\_name\_len]
137 - Attribute name. Does not include trailing NUL.
138
139Attribute values can follow the end of the entry table. There appears to
140be a requirement that they be aligned to 4-byte boundaries. The values
141are stored starting at the end of the block and grow towards the
142xattr\_header/xattr\_entry table. When the two collide, the overflow is
143put into a separate disk block. If the disk block fills up, the
144filesystem returns -ENOSPC.
145
146The first four fields of the ``ext4_xattr_entry`` are set to zero to
147mark the end of the key list.
148
149Attribute Name Indices
150~~~~~~~~~~~~~~~~~~~~~~
151
152Logically speaking, extended attributes are a series of key=value pairs.
153The keys are assumed to be NUL-terminated strings. To reduce the amount
154of on-disk space that the keys consume, the beginning of the key string
155is matched against the attribute name index. If a match is found, the
156attribute name index field is set, and matching string is removed from
157the key name. Here is a map of name index values to key prefixes:
158
159.. list-table::
160 :widths: 1 79
161 :header-rows: 1
162
163 * - Name Index
164 - Key Prefix
165 * - 0
166 - (no prefix)
167 * - 1
168 - “user.”
169 * - 2
170 - “system.posix\_acl\_access”
171 * - 3
172 - “system.posix\_acl\_default”
173 * - 4
174 - “trusted.”
175 * - 6
176 - “security.”
177 * - 7
178 - “system.” (inline\_data only?)
179 * - 8
180 - “system.richacl” (SuSE kernels only?)
181
182For example, if the attribute key is “user.fubar”, the attribute name
183index is set to 1 and the “fubar” name is recorded on disk.
184
185POSIX ACLs
186~~~~~~~~~~
187
188POSIX ACLs are stored in a reduced version of the Linux kernel (and
189libacl's) internal ACL format. The key difference is that the version
190number is different (1) and the ``e_id`` field is only stored for named
191user and group ACLs.
diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/ondisk/bigalloc.rst
new file mode 100644
index 000000000000..c6d88557553c
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/bigalloc.rst
@@ -0,0 +1,22 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Bigalloc
4--------
5
6At the moment, the default size of a block is 4KiB, which is a commonly
7supported page size on most MMU-capable hardware. This is fortunate, as
8ext4 code is not prepared to handle the case where the block size
9exceeds the page size. However, for a filesystem of mostly huge files,
10it is desirable to be able to allocate disk blocks in units of multiple
11blocks to reduce both fragmentation and metadata overhead. The
12bigalloc feature provides exactly this ability. The
13administrator can set a block cluster size at mkfs time (which is stored
14in the s\_log\_cluster\_size field in the superblock); from then on, the
15block bitmaps track clusters, not individual blocks. This means that
16block groups can be several gigabytes in size (instead of just 128MiB);
17however, the minimum allocation unit becomes a cluster, not a block,
18even for directories. TaoBao had a patchset to extend the “use units of
19clusters instead of blocks” to the extent tree, though it is not clear
20where those patches went-- they eventually morphed into “extent tree v2”
21but that code has not landed as of May 2015.
22
diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/ondisk/bitmaps.rst
new file mode 100644
index 000000000000..c7546dbc197a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/bitmaps.rst
@@ -0,0 +1,28 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Block and inode Bitmaps
4-----------------------
5
6The data block bitmap tracks the usage of data blocks within the block
7group.
8
9The inode bitmap records which entries in the inode table are in use.
10
11As with most bitmaps, one bit represents the usage status of one data
12block or inode table entry. This implies a block group size of 8 \*
13number\_of\_bytes\_in\_a\_logical\_block.
14
15NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts
16of the kernel and e2fsprogs code pretend that the block bitmap contains
17zeros (i.e. all blocks in the group are free). However, it is not
18necessarily the case that no blocks are in use -- if ``meta_bg`` is set,
19the bitmaps and group descriptor live inside the group. Unfortunately,
20ext2fs\_test\_block\_bitmap2() will return '0' for those locations,
21which produces confusing debugfs output.
22
23Inode Table
24-----------
25Inode tables are statically allocated at mkfs time. Each block group
26descriptor points to the start of the table, and the superblock records
27the number of inodes per group. See the section on inodes for more
28information.
diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/ondisk/blockgroup.rst
new file mode 100644
index 000000000000..baf888e4c06a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/blockgroup.rst
@@ -0,0 +1,135 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Layout
4------
5
6The layout of a standard block group is approximately as follows (each
7of these fields is discussed in a separate section below):
8
9.. list-table::
10 :widths: 1 1 1 1 1 1 1 1
11 :header-rows: 1
12
13 * - Group 0 Padding
14 - ext4 Super Block
15 - Group Descriptors
16 - Reserved GDT Blocks
17 - Data Block Bitmap
18 - inode Bitmap
19 - inode Table
20 - Data Blocks
21 * - 1024 bytes
22 - 1 block
23 - many blocks
24 - many blocks
25 - 1 block
26 - 1 block
27 - many blocks
28 - many more blocks
29
30For the special case of block group 0, the first 1024 bytes are unused,
31to allow for the installation of x86 boot sectors and other oddities.
32The superblock will start at offset 1024 bytes, whichever block that
33happens to be (usually 0). However, if for some reason the block size =
341024, then block 0 is marked in use and the superblock goes in block 1.
35For all other block groups, there is no padding.
36
37The ext4 driver primarily works with the superblock and the group
38descriptors that are found in block group 0. Redundant copies of the
39superblock and group descriptors are written to some of the block groups
40across the disk in case the beginning of the disk gets trashed, though
41not all block groups necessarily host a redundant copy (see following
42paragraph for more details). If the group does not have a redundant
43copy, the block group begins with the data block bitmap. Note also that
44when the filesystem is freshly formatted, mkfs will allocate “reserve
45GDT block” space after the block group descriptors and before the start
46of the block bitmaps to allow for future expansion of the filesystem. By
47default, a filesystem is allowed to increase in size by a factor of
481024x over the original filesystem size.
49
50The location of the inode table is given by ``grp.bg_inode_table_*``. It
51is a contiguous range of blocks large enough to contain
52``sb.s_inodes_per_group * sb.s_inode_size`` bytes.
53
54As for the ordering of items in a block group, it is generally
55established that the super block and the group descriptor table, if
56present, will be at the beginning of the block group. The bitmaps and
57the inode table can be anywhere, and it is quite possible for the
58bitmaps to come after the inode table, or for both to be in different
59groups (flex\_bg). Leftover space is used for file data blocks, indirect
60block maps, extent tree blocks, and extended attributes.
61
62Flexible Block Groups
63---------------------
64
65Starting in ext4, there is a new feature called flexible block groups
66(flex\_bg). In a flex\_bg, several block groups are tied together as one
67logical block group; the bitmap spaces and the inode table space in the
68first block group of the flex\_bg are expanded to include the bitmaps
69and inode tables of all other block groups in the flex\_bg. For example,
70if the flex\_bg size is 4, then group 0 will contain (in order) the
71superblock, group descriptors, data block bitmaps for groups 0-3, inode
72bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining
73space in group 0 is for file data. The effect of this is to group the
74block metadata close together for faster loading, and to enable large
75files to be contiguous on disk. Backup copies of the superblock and
76group descriptors are always at the beginning of block groups, even if
77flex\_bg is enabled. The number of block groups that make up a flex\_bg
78is given by 2 ^ ``sb.s_log_groups_per_flex``.
79
80Meta Block Groups
81-----------------
82
83Without the option META\_BG, for safety concerns, all block group
84descriptor copies are kept in the first block group. Given the default
85128MiB (2^27 bytes) block group size and 64-byte group descriptors, ext4
86can have at most 2^27/64 = 2^21 block groups. This limits the entire
87filesystem size to 2^21 ∗ 2^27 = 2^48 bytes or 256TiB.
88
89The solution to this problem is to use the metablock group feature
90(META\_BG), which is already in ext3 for all 2.6 releases. With the
91META\_BG feature, ext4 filesystems are partitioned into many metablock
92groups. Each metablock group is a cluster of block groups whose group
93descriptor structures can be stored in a single disk block. For ext4
94filesystems with 4 KB block size, a single metablock group partition
95includes 64 block groups, or 8 GiB of disk space. The metablock group
96feature moves the location of the group descriptors from the congested
97first block group of the whole filesystem into the first group of each
98metablock group itself. The backups are in the second and last group of
99each metablock group. This increases the 2^21 maximum block groups limit
100to the hard limit 2^32, allowing support for a 512PiB filesystem.
101
102The change in the filesystem format replaces the current scheme where
103the superblock is followed by a variable-length set of block group
104descriptors. Instead, the superblock and a single block group descriptor
105block are placed at the beginning of the first, second, and last block
106groups in a meta-block group. A meta-block group is a collection of
107block groups which can be described by a single block group descriptor
108block. Since the size of the block group descriptor structure is 32
109bytes, a meta-block group contains 32 block groups for filesystems with
110a 1KB block size, and 128 block groups for filesystems with a 4KB
111blocksize. Filesystems can either be created using this new block group
112descriptor layout, or existing filesystems can be resized on-line, and
113the field s\_first\_meta\_bg in the superblock will indicate the first
114block group using this new layout.
115
116Please see an important note about ``BLOCK_UNINIT`` in the section about
117block and inode bitmaps.
118
119Lazy Block Group Initialization
120-------------------------------
121
122A new feature for ext4 is a set of three block group descriptor flags that
123enable mkfs to skip initializing other parts of the block group
124metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean
125that the inode and block bitmaps for that group can be calculated and
126therefore the on-disk bitmap blocks are not initialized. This is
127generally the case for an empty block group or a block group containing
128only fixed-location block group metadata. The INODE\_ZEROED flag means
129that the inode table has been initialized; mkfs will unset this flag and
130rely on the kernel to initialize the inode tables in the background.
131
132By not writing zeroes to the bitmaps and inode table, mkfs time is
133reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM,
134but the dumpe2fs output prints this as “uninit\_bg”. They are the same
135thing.
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/ondisk/blockmap.rst
new file mode 100644
index 000000000000..30e25750d88a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/blockmap.rst
@@ -0,0 +1,49 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
4| i.i\_block Offset | Where It Points |
5+=====================+==============================================================================================================================================================================================================================+
6| 0 to 11 | Direct map to file blocks 0 to 11. |
7+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
8| 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) |
9| | |
10| | +------------------------------+--------------------------------------------------------------------+ |
11| | | Indirect Block Offset | Where It Points | |
12| | +==============================+====================================================================+ |
13| | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | |
14| | +------------------------------+--------------------------------------------------------------------+ |
15+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
16| 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) |
17| | |
18| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ |
19| | | Double Indirect Block Offset | Where It Points | |
20| | +================================+=========================================================================================================+ |
21| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | |
22| | | | | |
23| | | | +------------------------------+--------------------------------------------------------------------+ | |
24| | | | | Indirect Block Offset | Where It Points | | |
25| | | | +==============================+====================================================================+ | |
26| | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | |
27| | | | +------------------------------+--------------------------------------------------------------------+ | |
28| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ |
29+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
30| 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1049612 to 1074791435 if 4KiB blocks) |
31| | |
32| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ |
33| | | Triple Indirect Block Offset | Where It Points | |
34| | +================================+================================================================================================================================================+ |
35| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | |
36| | | | | |
37| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | |
38| | | | | Double Indirect Block Offset | Where It Points | | |
39| | | | +================================+=========================================================================================================+ | |
40| | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | |
41| | | | | | | | |
42| | | | | | +------------------------------+--------------------------------------------------------------------+ | | |
43| | | | | | | Indirect Block Offset | Where It Points | | | |
44| | | | | | +==============================+====================================================================+ | | |
45| | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | |
46| | | | | | +------------------------------+--------------------------------------------------------------------+ | | |
47| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | |
48| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ |
49+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/ondisk/blocks.rst
new file mode 100644
index 000000000000..73d4dc0f7bda
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/blocks.rst
@@ -0,0 +1,142 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Blocks
4------
5
6ext4 allocates storage space in units of “blocks”. A block is a group of
7sectors between 1KiB and 64KiB, and the number of sectors must be an
8integral power of 2. Blocks are in turn grouped into larger units called
9block groups. Block size is specified at mkfs time and typically is
104KiB. You may experience mounting problems if block size is greater than
11page size (i.e. 64KiB blocks on an i386 which only has 4KiB memory
12pages). By default a filesystem can contain 2^32 blocks; if the '64bit'
13feature is enabled, then a filesystem can have 2^64 blocks.
14
15For 32-bit filesystems, limits are as follows:
16
17.. list-table::
18 :widths: 1 1 1 1 1
19 :header-rows: 1
20
21 * - Item
22 - 1KiB
23 - 2KiB
24 - 4KiB
25 - 64KiB
26 * - Blocks
27 - 2^32
28 - 2^32
29 - 2^32
30 - 2^32
31 * - Inodes
32 - 2^32
33 - 2^32
34 - 2^32
35 - 2^32
36 * - File System Size
37 - 4TiB
38 - 8TiB
39 - 16TiB
40 - 256TiB
41 * - Blocks Per Block Group
42 - 8,192
43 - 16,384
44 - 32,768
45 - 524,288
46 * - Inodes Per Block Group
47 - 8,192
48 - 16,384
49 - 32,768
50 - 524,288
51 * - Block Group Size
52 - 8MiB
53 - 32MiB
54 - 128MiB
55 - 32GiB
56 * - Blocks Per File, Extents
57 - 2^32
58 - 2^32
59 - 2^32
60 - 2^32
61 * - Blocks Per File, Block Maps
62 - 16,843,020
63 - 134,480,396
64 - 1,074,791,436
65 - 4,398,314,962,956 (really 2^32 due to field size limitations)
66 * - File Size, Extents
67 - 4TiB
68 - 8TiB
69 - 16TiB
70 - 256TiB
71 * - File Size, Block Maps
72 - 16GiB
73 - 256GiB
74 - 4TiB
75 - 256TiB
76
77For 64-bit filesystems, limits are as follows:
78
79.. list-table::
80 :widths: 1 1 1 1 1
81 :header-rows: 1
82
83 * - Item
84 - 1KiB
85 - 2KiB
86 - 4KiB
87 - 64KiB
88 * - Blocks
89 - 2^64
90 - 2^64
91 - 2^64
92 - 2^64
93 * - Inodes
94 - 2^32
95 - 2^32
96 - 2^32
97 - 2^32
98 * - File System Size
99 - 16ZiB
100 - 32ZiB
101 - 64ZiB
102 - 1YiB
103 * - Blocks Per Block Group
104 - 8,192
105 - 16,384
106 - 32,768
107 - 524,288
108 * - Inodes Per Block Group
109 - 8,192
110 - 16,384
111 - 32,768
112 - 524,288
113 * - Block Group Size
114 - 8MiB
115 - 32MiB
116 - 128MiB
117 - 32GiB
118 * - Blocks Per File, Extents
119 - 2^32
120 - 2^32
121 - 2^32
122 - 2^32
123 * - Blocks Per File, Block Maps
124 - 16,843,020
125 - 134,480,396
126 - 1,074,791,436
127 - 4,398,314,962,956 (really 2^32 due to field size limitations)
128 * - File Size, Extents
129 - 4TiB
130 - 8TiB
131 - 16TiB
132 - 256TiB
133 * - File Size, Block Maps
134 - 16GiB
135 - 256GiB
136 - 4TiB
137 - 256TiB
138
139Note: Files not using extents (i.e. files using block maps) must be
140placed within the first 2^32 blocks of a filesystem. Files with extents
141must be placed within the first 2^48 blocks of a filesystem. It's not
142clear what happens with larger filesystems.
diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/ondisk/checksums.rst
new file mode 100644
index 000000000000..9d6a793b2e03
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/checksums.rst
@@ -0,0 +1,73 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Checksums
4---------
5
6Starting in early 2012, metadata checksums were added to all major ext4
7and jbd2 data structures. The associated feature flag is metadata\_csum.
8The desired checksum algorithm is indicated in the superblock, though as
9of October 2012 the only supported algorithm is crc32c. Some data
10structures did not have space to fit a full 32-bit checksum, so only the
11lower 16 bits are stored. Enabling the 64bit feature increases the data
12structure size so that full 32-bit checksums can be stored for many data
13structures. However, existing 32-bit filesystems cannot be extended to
14enable 64bit mode, at least not without the experimental resize2fs
15patches to do so.
16
17Existing filesystems can have checksumming added by running
18``tune2fs -O metadata_csum`` against the underlying device. If tune2fs
19encounters directory blocks that lack sufficient empty space to add a
20checksum, it will request that you run ``e2fsck -D`` to have the
21directories rebuilt with checksums. This has the added benefit of
22removing slack space from the directory files and rebalancing the htree
23indexes. If you *ignore* this step, your directories will not be
24protected by a checksum!
25
26The following table describes the data elements that go into each type
27of checksum. The checksum function is whatever the superblock describes
28(crc32c as of October 2013) unless noted otherwise.
29
30.. list-table::
31 :widths: 1 1 4
32 :header-rows: 1
33
34 * - Metadata
35 - Length
36 - Ingredients
37 * - Superblock
38 - \_\_le32
39 - The entire superblock up to the checksum field. The UUID lives inside
40 the superblock.
41 * - MMP
42 - \_\_le32
43 - UUID + the entire MMP block up to the checksum field.
44 * - Extended Attributes
45 - \_\_le32
46 - UUID + the entire extended attribute block. The checksum field is set to
47 zero.
48 * - Directory Entries
49 - \_\_le32
50 - UUID + inode number + inode generation + the directory block up to the
51 fake entry enclosing the checksum field.
52 * - HTREE Nodes
53 - \_\_le32
54 - UUID + inode number + inode generation + all valid extents + HTREE tail.
55 The checksum field is set to zero.
56 * - Extents
57 - \_\_le32
58 - UUID + inode number + inode generation + the entire extent block up to
59 the checksum field.
60 * - Bitmaps
61 - \_\_le32 or \_\_le16
62 - UUID + the entire bitmap. Checksums are stored in the group descriptor,
63 and truncated if the group descriptor size is 32 bytes (i.e. ^64bit)
64 * - Inodes
65 - \_\_le32
66 - UUID + inode number + inode generation + the entire inode. The checksum
67 field is set to zero. Each inode has its own checksum.
68 * - Group Descriptors
69 - \_\_le16
70 - If metadata\_csum, then UUID + group number + the entire descriptor;
71 else if gdt\_csum, then crc16(UUID + group number + the entire
72 descriptor). In all cases, only the lower 16 bits are stored.
73
diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/ondisk/directory.rst
new file mode 100644
index 000000000000..8fcba68c2884
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/directory.rst
@@ -0,0 +1,426 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Directory Entries
4-----------------
5
6In an ext4 filesystem, a directory is more or less a flat file that maps
7an arbitrary byte string (usually ASCII) to an inode number on the
8filesystem. There can be many directory entries across the filesystem
9that reference the same inode number--these are known as hard links, and
10that is why hard links cannot reference files on other filesystems. As
11such, directory entries are found by reading the data block(s)
12associated with a directory file for the particular directory entry that
13is desired.
14
15Linear (Classic) Directories
16~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17
18By default, each directory lists its entries in an “almost-linear”
19array. I write “almost” because it's not a linear array in the memory
20sense because directory entries are not split across filesystem blocks.
21Therefore, it is more accurate to say that a directory is a series of
22data blocks and that each block contains a linear array of directory
23entries. The end of each per-block array is signified by reaching the
24end of the block; the last entry in the block has a record length that
25takes it all the way to the end of the block. The end of the entire
26directory is of course signified by reaching the end of the file. Unused
27directory entries are signified by inode = 0. By default the filesystem
28uses ``struct ext4_dir_entry_2`` for directory entries unless the
29“filetype” feature flag is not set, in which case it uses
30``struct ext4_dir_entry``.
31
32The original directory entry format is ``struct ext4_dir_entry``, which
33is at most 263 bytes long, though on disk you'll need to reference
34``dirent.rec_len`` to know for sure.
35
36.. list-table::
37 :widths: 1 1 1 77
38 :header-rows: 1
39
40 * - Offset
41 - Size
42 - Name
43 - Description
44 * - 0x0
45 - \_\_le32
46 - inode
47 - Number of the inode that this directory entry points to.
48 * - 0x4
49 - \_\_le16
50 - rec\_len
51 - Length of this directory entry. Must be a multiple of 4.
52 * - 0x6
53 - \_\_le16
54 - name\_len
55 - Length of the file name.
56 * - 0x8
57 - char
58 - name[EXT4\_NAME\_LEN]
59 - File name.
60
61Since file names cannot be longer than 255 bytes, the new directory
62entry format shortens the name\_len field and uses the space for a file
63type flag, probably to avoid having to load every inode during directory
64tree traversal. This format is ``ext4_dir_entry_2``, which is at most
65263 bytes long, though on disk you'll need to reference
66``dirent.rec_len`` to know for sure.
67
68.. list-table::
69 :widths: 1 1 1 77
70 :header-rows: 1
71
72 * - Offset
73 - Size
74 - Name
75 - Description
76 * - 0x0
77 - \_\_le32
78 - inode
79 - Number of the inode that this directory entry points to.
80 * - 0x4
81 - \_\_le16
82 - rec\_len
83 - Length of this directory entry.
84 * - 0x6
85 - \_\_u8
86 - name\_len
87 - Length of the file name.
88 * - 0x7
89 - \_\_u8
90 - file\_type
91 - File type code, see ftype_ table below.
92 * - 0x8
93 - char
94 - name[EXT4\_NAME\_LEN]
95 - File name.
96
97.. _ftype:
98
99The directory file type is one of the following values:
100
101.. list-table::
102 :widths: 1 79
103 :header-rows: 1
104
105 * - Value
106 - Description
107 * - 0x0
108 - Unknown.
109 * - 0x1
110 - Regular file.
111 * - 0x2
112 - Directory.
113 * - 0x3
114 - Character device file.
115 * - 0x4
116 - Block device file.
117 * - 0x5
118 - FIFO.
119 * - 0x6
120 - Socket.
121 * - 0x7
122 - Symbolic link.
123
124In order to add checksums to these classic directory blocks, a phony
125``struct ext4_dir_entry`` is placed at the end of each leaf block to
126hold the checksum. The directory entry is 12 bytes long. The inode
127number and name\_len fields are set to zero to fool old software into
128ignoring an apparently empty directory entry, and the checksum is stored
129in the place where the name normally goes. The structure is
130``struct ext4_dir_entry_tail``:
131
132.. list-table::
133 :widths: 1 1 1 77
134 :header-rows: 1
135
136 * - Offset
137 - Size
138 - Name
139 - Description
140 * - 0x0
141 - \_\_le32
142 - det\_reserved\_zero1
143 - Inode number, which must be zero.
144 * - 0x4
145 - \_\_le16
146 - det\_rec\_len
147 - Length of this directory entry, which must be 12.
148 * - 0x6
149 - \_\_u8
150 - det\_reserved\_zero2
151 - Length of the file name, which must be zero.
152 * - 0x7
153 - \_\_u8
154 - det\_reserved\_ft
155 - File type, which must be 0xDE.
156 * - 0x8
157 - \_\_le32
158 - det\_checksum
159 - Directory leaf block checksum.
160
161The leaf directory block checksum is calculated against the FS UUID, the
162directory's inode number, the directory's inode generation number, and
163the entire directory entry block up to (but not including) the fake
164directory entry.
165
166Hash Tree Directories
167~~~~~~~~~~~~~~~~~~~~~
168
169A linear array of directory entries isn't great for performance, so a
170new feature was added to ext3 to provide a faster (but peculiar)
171balanced tree keyed off a hash of the directory entry name. If the
172EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a
173hashed btree (htree) to organize and find directory entries. For
174backwards read-only compatibility with ext2, this tree is actually
175hidden inside the directory file, masquerading as “empty” directory data
176blocks! It was stated previously that the end of the linear directory
177entry table was signified with an entry pointing to inode 0; this is
178(ab)used to fool the old linear-scan algorithm into thinking that the
179rest of the directory block is empty so that it moves on.
180
181The root of the tree always lives in the first data block of the
182directory. By ext2 custom, the '.' and '..' entries must appear at the
183beginning of this first block, so they are put here as two
184``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of
185the root node contains metadata about the tree and finally a hash->block
186map to find nodes that are lower in the htree. If
187``dx_root.info.indirect_levels`` is non-zero then the htree has two
188levels; the data block pointed to by the root node's map is an interior
189node, which is indexed by a minor hash. Interior nodes in this tree
190contain a zeroed out ``struct ext4_dir_entry_2`` followed by a
191minor\_hash->block map to find leaf nodes. Leaf nodes contain a linear
192array of all ``struct ext4_dir_entry_2``; all of these entries
193(presumably) hash to the same value. If there is an overflow, the
194entries simply overflow into the next leaf node, and the
195least-significant bit of the hash (in the interior node map) that gets
196us to this next leaf node is set.
197
198To traverse the directory as a htree, the code calculates the hash of
199the desired file name and uses it to find the corresponding block
200number. If the tree is flat, the block is a linear array of directory
201entries that can be searched; otherwise, the minor hash of the file name
202is computed and used against this second block to find the corresponding
203third block number. That third block number will be a linear array of
204directory entries.
205
206To traverse the directory as a linear array (such as the old code does),
207the code simply reads every data block in the directory. The blocks used
208for the htree will appear to have no entries (aside from '.' and '..')
209and so only the leaf nodes will appear to have any interesting content.
210
211The root of the htree is in ``struct dx_root``, which is the full length
212of a data block:
213
214.. list-table::
215 :widths: 1 1 1 77
216 :header-rows: 1
217
218 * - Offset
219 - Type
220 - Name
221 - Description
222 * - 0x0
223 - \_\_le32
224 - dot.inode
225 - inode number of this directory.
226 * - 0x4
227 - \_\_le16
228 - dot.rec\_len
229 - Length of this record, 12.
230 * - 0x6
231 - u8
232 - dot.name\_len
233 - Length of the name, 1.
234 * - 0x7
235 - u8
236 - dot.file\_type
237 - File type of this entry, 0x2 (directory) (if the feature flag is set).
238 * - 0x8
239 - char
240 - dot.name[4]
241 - “.\\0\\0\\0”
242 * - 0xC
243 - \_\_le32
244 - dotdot.inode
245 - inode number of parent directory.
246 * - 0x10
247 - \_\_le16
248 - dotdot.rec\_len
249 - block\_size - 12. The record length is long enough to cover all htree
250 data.
251 * - 0x12
252 - u8
253 - dotdot.name\_len
254 - Length of the name, 2.
255 * - 0x13
256 - u8
257 - dotdot.file\_type
258 - File type of this entry, 0x2 (directory) (if the feature flag is set).
259 * - 0x14
260 - char
261 - dotdot.name[4]
262 - “..\\0\\0”
263 * - 0x18
264 - \_\_le32
265 - struct dx\_root\_info.reserved\_zero
266 - Zero.
267 * - 0x1C
268 - u8
269 - struct dx\_root\_info.hash\_version
270 - Hash type, see dirhash_ table below.
271 * - 0x1D
272 - u8
273 - struct dx\_root\_info.info\_length
274 - Length of the tree information, 0x8.
275 * - 0x1E
276 - u8
277 - struct dx\_root\_info.indirect\_levels
278 - Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR
279 feature is set; cannot be larger than 2 otherwise.
280 * - 0x1F
281 - u8
282 - struct dx\_root\_info.unused\_flags
283 -
284 * - 0x20
285 - \_\_le16
286 - limit
287 - Maximum number of dx\_entries that can follow this header, plus 1 for
288 the header itself.
289 * - 0x22
290 - \_\_le16
291 - count
292 - Actual number of dx\_entries that follow this header, plus 1 for the
293 header itself.
294 * - 0x24
295 - \_\_le32
296 - block
297 - The block number (within the directory file) that goes with hash=0.
298 * - 0x28
299 - struct dx\_entry
300 - entries[0]
301 - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
302
303.. _dirhash:
304
305The directory hash is one of the following values:
306
307.. list-table::
308 :widths: 1 79
309 :header-rows: 1
310
311 * - Value
312 - Description
313 * - 0x0
314 - Legacy.
315 * - 0x1
316 - Half MD4.
317 * - 0x2
318 - Tea.
319 * - 0x3
320 - Legacy, unsigned.
321 * - 0x4
322 - Half MD4, unsigned.
323 * - 0x5
324 - Tea, unsigned.
325
326Interior nodes of an htree are recorded as ``struct dx_node``, which is
327also the full length of a data block:
328
329.. list-table::
330 :widths: 1 1 1 77
331 :header-rows: 1
332
333 * - Offset
334 - Type
335 - Name
336 - Description
337 * - 0x0
338 - \_\_le32
339 - fake.inode
340 - Zero, to make it look like this entry is not in use.
341 * - 0x4
342 - \_\_le16
343 - fake.rec\_len
344 - The size of the block, in order to hide all of the dx\_node data.
345 * - 0x6
346 - u8
347 - name\_len
348 - Zero. There is no name for this “unused” directory entry.
349 * - 0x7
350 - u8
351 - file\_type
352 - Zero. There is no file type for this “unused” directory entry.
353 * - 0x8
354 - \_\_le16
355 - limit
356 - Maximum number of dx\_entries that can follow this header, plus 1 for
357 the header itself.
358 * - 0xA
359 - \_\_le16
360 - count
361 - Actual number of dx\_entries that follow this header, plus 1 for the
362 header itself.
363 * - 0xC
364 - \_\_le32
365 - block
366 - The block number (within the directory file) that goes with the lowest
367 hash value of this block. This value is stored in the parent block.
368 * - 0x10
369 - struct dx\_entry
370 - entries[0]
371 - As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
372
373The hash maps that exist in both ``struct dx_root`` and
374``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes
375long:
376
377.. list-table::
378 :widths: 1 1 1 77
379 :header-rows: 1
380
381 * - Offset
382 - Type
383 - Name
384 - Description
385 * - 0x0
386 - \_\_le32
387 - hash
388 - Hash code.
389 * - 0x4
390 - \_\_le32
391 - block
392 - Block number (within the directory file, not filesystem blocks) of the
393 next node in the htree.
394
395(If you think this is all quite clever and peculiar, so does the
396author.)
397
398If metadata checksums are enabled, the last 8 bytes of the directory
399block (precisely the length of one dx\_entry) are used to store a
400``struct dx_tail``, which contains the checksum. The ``limit`` and
401``count`` entries in the dx\_root/dx\_node structures are adjusted as
402necessary to fit the dx\_tail into the block. If there is no space for
403the dx\_tail, the user is notified to run e2fsck -D to rebuild the
404directory index (which will ensure that there's space for the checksum).
405The dx\_tail structure is 8 bytes long and looks like this:
406
407.. list-table::
408 :widths: 1 1 1 77
409 :header-rows: 1
410
411 * - Offset
412 - Type
413 - Name
414 - Description
415 * - 0x0
416 - u32
417 - dt\_reserved
418 - Zero.
419 * - 0x4
420 - \_\_le32
421 - dt\_checksum
422 - Checksum of the htree directory block.
423
424The checksum is calculated against the FS UUID, the htree index header
425(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in
426use, and the tail block (dx\_tail).
diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/ondisk/dynamic.rst
new file mode 100644
index 000000000000..bb0c84333341
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/dynamic.rst
@@ -0,0 +1,12 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Dynamic Structures
4==================
5
6Dynamic metadata are created on the fly when files and blocks are
7allocated to files.
8
9.. include:: inodes.rst
10.. include:: ifork.rst
11.. include:: directory.rst
12.. include:: attributes.rst
diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/ondisk/eainode.rst
new file mode 100644
index 000000000000..ecc0d01a0a72
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/eainode.rst
@@ -0,0 +1,18 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Large Extended Attribute Values
4-------------------------------
5
6To enable ext4 to store extended attribute values that do not fit in the
7inode or in the single extended attribute block attached to an inode,
8the EA\_INODE feature allows us to store the value in the data blocks of
9a regular file inode. This “EA inode” is linked only from the extended
10attribute name index and must not appear in a directory entry. The
11inode's i\_atime field is used to store a checksum of the xattr value;
12and i\_ctime/i\_version store a 64-bit reference count, which enables
13sharing of large xattr values between multiple owning inodes. For
14backward compatibility with older versions of this feature, the
15i\_mtime/i\_generation *may* store a back-reference to the inode number
16and i\_generation of the **one** owning inode (in cases where the EA
17inode is not referenced by multiple inodes) to verify that the EA inode
18is the correct one being accessed.
diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/ondisk/globals.rst
new file mode 100644
index 000000000000..368bf7662b96
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/globals.rst
@@ -0,0 +1,13 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Global Structures
4=================
5
6The filesystem is sharded into a number of block groups, each of which
7has static metadata at fixed locations.
8
9.. include:: super.rst
10.. include:: group_descr.rst
11.. include:: bitmaps.rst
12.. include:: mmp.rst
13.. include:: journal.rst
diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/ondisk/group_descr.rst
new file mode 100644
index 000000000000..759827e5d2cf
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/group_descr.rst
@@ -0,0 +1,170 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Block Group Descriptors
4-----------------------
5
6Each block group on the filesystem has one of these descriptors
7associated with it. As noted in the Layout section above, the group
8descriptors (if present) are the second item in the block group. The
9standard configuration is for each block group to contain a full copy of
10the block group descriptor table unless the sparse\_super feature flag
11is set.
12
13Notice how the group descriptor records the location of both bitmaps and
14the inode table (i.e. they can float). This means that within a block
15group, the only data structures with fixed locations are the superblock
16and the group descriptor table. The flex\_bg mechanism uses this
17property to group several block groups into a flex group and lay out all
18of the groups' bitmaps and inode tables into one long run in the first
19group of the flex group.
20
21If the meta\_bg feature flag is set, then several block groups are
22grouped together into a meta group. Note that in the meta\_bg case,
23however, the first and last two block groups within the larger meta
24group contain only group descriptors for the groups inside the meta
25group.
26
27flex\_bg and meta\_bg do not appear to be mutually exclusive features.
28
29In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the
30block group descriptor was only 32 bytes long and therefore ends at
31bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the
32block group descriptor expands to at least the 64 bytes described below;
33the size is stored in the superblock.
34
35If gdt\_csum is set and metadata\_csum is not set, the block group
36checksum is the crc16 of the FS UUID, the group number, and the group
37descriptor structure. If metadata\_csum is set, then the block group
38checksum is the lower 16 bits of the checksum of the FS UUID, the group
39number, and the group descriptor structure. Both block and inode bitmap
40checksums are calculated against the FS UUID, the group number, and the
41entire bitmap.
42
43The block group descriptor is laid out in ``struct ext4_group_desc``.
44
45.. list-table::
46 :widths: 1 1 1 77
47 :header-rows: 1
48
49 * - Offset
50 - Size
51 - Name
52 - Description
53 * - 0x0
54 - \_\_le32
55 - bg\_block\_bitmap\_lo
56 - Lower 32-bits of location of block bitmap.
57 * - 0x4
58 - \_\_le32
59 - bg\_inode\_bitmap\_lo
60 - Lower 32-bits of location of inode bitmap.
61 * - 0x8
62 - \_\_le32
63 - bg\_inode\_table\_lo
64 - Lower 32-bits of location of inode table.
65 * - 0xC
66 - \_\_le16
67 - bg\_free\_blocks\_count\_lo
68 - Lower 16-bits of free block count.
69 * - 0xE
70 - \_\_le16
71 - bg\_free\_inodes\_count\_lo
72 - Lower 16-bits of free inode count.
73 * - 0x10
74 - \_\_le16
75 - bg\_used\_dirs\_count\_lo
76 - Lower 16-bits of directory count.
77 * - 0x12
78 - \_\_le16
79 - bg\_flags
80 - Block group flags. See the bgflags_ table below.
81 * - 0x14
82 - \_\_le32
83 - bg\_exclude\_bitmap\_lo
84 - Lower 32-bits of location of snapshot exclusion bitmap.
85 * - 0x18
86 - \_\_le16
87 - bg\_block\_bitmap\_csum\_lo
88 - Lower 16-bits of the block bitmap checksum.
89 * - 0x1A
90 - \_\_le16
91 - bg\_inode\_bitmap\_csum\_lo
92 - Lower 16-bits of the inode bitmap checksum.
93 * - 0x1C
94 - \_\_le16
95 - bg\_itable\_unused\_lo
96 - Lower 16-bits of unused inode count. If set, we needn't scan past the
97 ``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the
98 inode table for this group.
99 * - 0x1E
100 - \_\_le16
101 - bg\_checksum
102 - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the
103 RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) &
104 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set.
105 * -
106 -
107 -
108 - These fields only exist if the 64bit feature is enabled and s_desc_size
109 > 32.
110 * - 0x20
111 - \_\_le32
112 - bg\_block\_bitmap\_hi
113 - Upper 32-bits of location of block bitmap.
114 * - 0x24
115 - \_\_le32
116 - bg\_inode\_bitmap\_hi
117 - Upper 32-bits of location of inode bitmap.
118 * - 0x28
119 - \_\_le32
120 - bg\_inode\_table\_hi
121 - Upper 32-bits of location of inode table.
122 * - 0x2C
123 - \_\_le16
124 - bg\_free\_blocks\_count\_hi
125 - Upper 16-bits of free block count.
126 * - 0x2E
127 - \_\_le16
128 - bg\_free\_inodes\_count\_hi
129 - Upper 16-bits of free inode count.
130 * - 0x30
131 - \_\_le16
132 - bg\_used\_dirs\_count\_hi
133 - Upper 16-bits of directory count.
134 * - 0x32
135 - \_\_le16
136 - bg\_itable\_unused\_hi
137 - Upper 16-bits of unused inode count.
138 * - 0x34
139 - \_\_le32
140 - bg\_exclude\_bitmap\_hi
141 - Upper 32-bits of location of snapshot exclusion bitmap.
142 * - 0x38
143 - \_\_le16
144 - bg\_block\_bitmap\_csum\_hi
145 - Upper 16-bits of the block bitmap checksum.
146 * - 0x3A
147 - \_\_le16
148 - bg\_inode\_bitmap\_csum\_hi
149 - Upper 16-bits of the inode bitmap checksum.
150 * - 0x3C
151 - \_\_u32
152 - bg\_reserved
153 - Padding to 64 bytes.
154
155.. _bgflags:
156
157Block group flags can be any combination of the following:
158
159.. list-table::
160 :widths: 1 79
161 :header-rows: 1
162
163 * - Value
164 - Description
165 * - 0x1
166 - inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT).
167 * - 0x2
168 - block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT).
169 * - 0x4
170 - inode table is zeroed (EXT4\_BG\_INODE\_ZEROED).
diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ondisk/ifork.rst
new file mode 100644
index 000000000000..5dbe3b2b121a
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/ifork.rst
@@ -0,0 +1,194 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3The Contents of inode.i\_block
4------------------------------
5
6Depending on the type of file an inode describes, the 60 bytes of
7storage in ``inode.i_block`` can be used in different ways. In general,
8regular files and directories will use it for file block indexing
9information, and special files will use it for special purposes.
10
11Symbolic Links
12~~~~~~~~~~~~~~
13
14The target of a symbolic link will be stored in this field if the target
15string is less than 60 bytes long. Otherwise, either extents or block
16maps will be used to allocate data blocks to store the link target.
17
18Direct/Indirect Block Addressing
19~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20
21In ext2/3, file block numbers were mapped to logical block numbers by
22means of an (up to) three level 1-1 block map. To find the logical block
23that stores a particular file block, the code would navigate through
24this increasingly complicated structure. Notice that there is neither a
25magic number nor a checksum to provide any level of confidence that the
26block isn't full of garbage.
27
28.. ifconfig:: builder != 'latex'
29
30 .. include:: blockmap.rst
31
32.. ifconfig:: builder == 'latex'
33
34 [Table omitted because LaTeX doesn't support nested tables.]
35
36Note that with this block mapping scheme, it is necessary to fill out a
37lot of mapping data even for a large contiguous file! This inefficiency
38led to the creation of the extent mapping scheme, discussed below.
39
40Notice also that a file using this mapping scheme cannot be placed
41higher than 2^32 blocks.
42
43Extent Tree
44~~~~~~~~~~~
45
46In ext4, the file to logical block map has been replaced with an extent
47tree. Under the old scheme, allocating a contiguous run of 1,000 blocks
48requires an indirect block to map all 1,000 entries; with extents, the
49mapping is reduced to a single ``struct ext4_extent`` with
50``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate
51very large files with a single extent, at a considerable reduction in
52metadata block use, and some improvement in disk efficiency. The inode
53must have the extents flag (0x80000) set for this feature to be in
54use.
55
56Extents are arranged as a tree. Each node of the tree begins with a
57``struct ext4_extent_header``. If the node is an interior node
58(``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries``
59instances of ``struct ext4_extent_idx``; each of these index entries
60points to a block containing more nodes in the extent tree. If the node
61is a leaf node (``eh.eh_depth == 0``), then the header is followed by
62``eh.eh_entries`` instances of ``struct ext4_extent``; these instances
63point to the file's data blocks. The root node of the extent tree is
64stored in ``inode.i_block``, which allows for the first four extents to
65be recorded without the use of extra metadata blocks.
66
67The extent tree header is recorded in ``struct ext4_extent_header``,
68which is 12 bytes long:
69
70.. list-table::
71 :widths: 1 1 1 77
72 :header-rows: 1
73
74 * - Offset
75 - Size
76 - Name
77 - Description
78 * - 0x0
79 - \_\_le16
80 - eh\_magic
81 - Magic number, 0xF30A.
82 * - 0x2
83 - \_\_le16
84 - eh\_entries
85 - Number of valid entries following the header.
86 * - 0x4
87 - \_\_le16
88 - eh\_max
89 - Maximum number of entries that could follow the header.
90 * - 0x6
91 - \_\_le16
92 - eh\_depth
93 - Depth of this extent node in the extent tree. 0 = this extent node
94 points to data blocks; otherwise, this extent node points to other
95 extent nodes. The extent tree can be at most 5 levels deep: a logical
96 block number can be at most ``2^32``, and the smallest ``n`` that
97 satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5.
98 * - 0x8
99 - \_\_le32
100 - eh\_generation
101 - Generation of the tree. (Used by Lustre, but not standard ext4).
102
103Internal nodes of the extent tree, also known as index nodes, are
104recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
105
106.. list-table::
107 :widths: 1 1 1 77
108 :header-rows: 1
109
110 * - Offset
111 - Size
112 - Name
113 - Description
114 * - 0x0
115 - \_\_le32
116 - ei\_block
117 - This index node covers file blocks from 'block' onward.
118 * - 0x4
119 - \_\_le32
120 - ei\_leaf\_lo
121 - Lower 32-bits of the block number of the extent node that is the next
122 level lower in the tree. The tree node pointed to can be either another
123 internal node or a leaf node, described below.
124 * - 0x8
125 - \_\_le16
126 - ei\_leaf\_hi
127 - Upper 16-bits of the previous field.
128 * - 0xA
129 - \_\_u16
130 - ei\_unused
131 -
132
133Leaf nodes of the extent tree are recorded as ``struct ext4_extent``,
134and are also 12 bytes long:
135
136.. list-table::
137 :widths: 1 1 1 77
138 :header-rows: 1
139
140 * - Offset
141 - Size
142 - Name
143 - Description
144 * - 0x0
145 - \_\_le32
146 - ee\_block
147 - First file block number that this extent covers.
148 * - 0x4
149 - \_\_le16
150 - ee\_len
151 - Number of blocks covered by extent. If the value of this field is <=
152 32768, the extent is initialized. If the value of the field is > 32768,
153 the extent is uninitialized and the actual extent length is ``ee_len`` -
154 32768. Therefore, the maximum length of an initialized extent is 32768
155 blocks, and the maximum length of an uninitialized extent is 32767.
156 * - 0x6
157 - \_\_le16
158 - ee\_start\_hi
159 - Upper 16-bits of the block number to which this extent points.
160 * - 0x8
161 - \_\_le32
162 - ee\_start\_lo
163 - Lower 32-bits of the block number to which this extent points.
164
165Prior to the introduction of metadata checksums, the extent header +
166extent entries always left at least 4 bytes of unallocated space at the
167end of each extent tree data block (because (2^x % 12) >= 4). Therefore,
168the 32-bit checksum is inserted into this space. The 4 extents in the
169inode do not need checksumming, since the inode is already checksummed.
170The checksum is calculated against the FS UUID, the inode number, the
171inode generation, and the entire extent block leading up to (but not
172including) the checksum itself.
173
174``struct ext4_extent_tail`` is 4 bytes long:
175
176.. list-table::
177 :widths: 1 1 1 77
178 :header-rows: 1
179
180 * - Offset
181 - Size
182 - Name
183 - Description
184 * - 0x0
185 - \_\_le32
186 - eb\_checksum
187 - Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock)
188
189Inline Data
190~~~~~~~~~~~
191
192If the inline data feature is enabled for the filesystem and the flag is
193set for the inode, it is possible that the first 60 bytes of the file
194data are stored here.
diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst
new file mode 100644
index 000000000000..f7d082c3a435
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/index.rst
@@ -0,0 +1,9 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3==============================
4Data Structures and Algorithms
5==============================
6.. include:: about.rst
7.. include:: overview.rst
8.. include:: globals.rst
9.. include:: dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/ondisk/inlinedata.rst
new file mode 100644
index 000000000000..d1075178ce0b
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/inlinedata.rst
@@ -0,0 +1,37 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Inline Data
4-----------
5
6The inline data feature was designed to handle the case that a file's
7data is so tiny that it readily fits inside the inode, which
8(theoretically) reduces disk block consumption and reduces seeks. If the
9file is smaller than 60 bytes, then the data are stored inline in
10``inode.i_block``. If the rest of the file would fit inside the extended
11attribute space, then it might be found as an extended attribute
12“system.data” within the inode body (“ibody EA”). This of course
13constrains the amount of extended attributes one can attach to an inode.
14If the data size increases beyond i\_block + ibody EA, a regular block
15is allocated and the contents moved to that block.
16
17Pending a change to compact the extended attribute key used to store
18inline data, one ought to be able to store 160 bytes of data in a
19256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to
20that, the limit was 156 bytes due to inefficient use of inode space.
21
22The inline data feature requires the presence of an extended attribute
23for “system.data”, even if the attribute value is zero length.
24
25Inline Directories
26~~~~~~~~~~~~~~~~~~
27
28The first four bytes of i\_block are the inode number of the parent
29directory. Following that is a 56-byte space for an array of directory
30entries; see ``struct ext4_dir_entry``. If there is a “system.data”
31attribute in the inode body, the EA value is an array of
32``struct ext4_dir_entry`` as well. Note that for inline directories, the
33i\_block and EA space are treated as separate dirent blocks; directory
34entries cannot span the two.
35
36Inline directory entries are not checksummed, as the inode checksum
37should protect all inline data contents.
diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/ondisk/inodes.rst
new file mode 100644
index 000000000000..655ce898f3f5
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/inodes.rst
@@ -0,0 +1,575 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Index Nodes
4-----------
5
6In a regular UNIX filesystem, the inode stores all the metadata
7pertaining to the file (time stamps, block maps, extended attributes,
8etc), not the directory entry. To find the information associated with a
9file, one must traverse the directory files to find the directory entry
10associated with a file, then load the inode to find the metadata for
11that file. ext4 appears to cheat (for performance reasons) a little bit
12by storing a copy of the file type (normally stored in the inode) in the
13directory entry. (Compare all this to FAT, which stores all the file
14information directly in the directory entry, but does not support hard
15links and is in general more seek-happy than ext4 due to its simpler
16block allocator and extensive use of linked lists.)
17
18The inode table is a linear array of ``struct ext4_inode``. The table is
19sized to have enough blocks to store at least
20``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the
21block group containing an inode can be calculated as
22``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the
23group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There
24is no inode 0.
25
26The inode checksum is calculated against the FS UUID, the inode number,
27and the inode structure itself.
28
29The inode table entry is laid out in ``struct ext4_inode``.
30
31.. list-table::
32 :widths: 1 1 1 77
33 :header-rows: 1
34
35 * - Offset
36 - Size
37 - Name
38 - Description
39 * - 0x0
40 - \_\_le16
41 - i\_mode
42 - File mode. See the table i_mode_ below.
43 * - 0x2
44 - \_\_le16
45 - i\_uid
46 - Lower 16-bits of Owner UID.
47 * - 0x4
48 - \_\_le32
49 - i\_size\_lo
50 - Lower 32-bits of size in bytes.
51 * - 0x8
52 - \_\_le32
53 - i\_atime
54 - Last access time, in seconds since the epoch. However, if the EA\_INODE
55 inode flag is set, this inode stores an extended attribute value and
56 this field contains the checksum of the value.
57 * - 0xC
58 - \_\_le32
59 - i\_ctime
60 - Last inode change time, in seconds since the epoch. However, if the
61 EA\_INODE inode flag is set, this inode stores an extended attribute
62 value and this field contains the lower 32 bits of the attribute value's
63 reference count.
64 * - 0x10
65 - \_\_le32
66 - i\_mtime
67 - Last data modification time, in seconds since the epoch. However, if the
68 EA\_INODE inode flag is set, this inode stores an extended attribute
69 value and this field contains the number of the inode that owns the
70 extended attribute.
71 * - 0x14
72 - \_\_le32
73 - i\_dtime
74 - Deletion Time, in seconds since the epoch.
75 * - 0x18
76 - \_\_le16
77 - i\_gid
78 - Lower 16-bits of GID.
79 * - 0x1A
80 - \_\_le16
81 - i\_links\_count
82 - Hard link count. Normally, ext4 does not permit an inode to have more
83 than 65,000 hard links. This applies to files as well as directories,
84 which means that there cannot be more than 64,998 subdirectories in a
85 directory (each subdirectory's '..' entry counts as a hard link, as does
86 the '.' entry in the directory itself). With the DIR\_NLINK feature
87 enabled, ext4 supports more than 64,998 subdirectories by setting this
88 field to 1 to indicate that the number of hard links is not known.
89 * - 0x1C
90 - \_\_le32
91 - i\_blocks\_lo
92 - Lower 32-bits of “block” count. If the huge\_file feature flag is not
93 set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks
94 on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in
95 ``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi
96 << 32)`` 512-byte blocks on disk. If huge\_file is set and
97 EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file
98 consumes ``i_blocks_lo + (i_blocks_hi << 32)`` filesystem blocks on
99 disk.
100 * - 0x20
101 - \_\_le32
102 - i\_flags
103 - Inode flags. See the table i_flags_ below.
104 * - 0x24
105 - 4 bytes
106 - i\_osd1
107 - See the table i_osd1_ for more details.
108 * - 0x28
109 - 60 bytes
110 - i\_block[EXT4\_N\_BLOCKS=15]
111 - Block map or extent tree. See the section “The Contents of inode.i\_block”.
112 * - 0x64
113 - \_\_le32
114 - i\_generation
115 - File version (for NFS).
116 * - 0x68
117 - \_\_le32
118 - i\_file\_acl\_lo
119 - Lower 32-bits of extended attribute block. ACLs are of course one of
120 many possible extended attributes; I think the name of this field is a
121 result of the first use of extended attributes being for ACLs.
122 * - 0x6C
123 - \_\_le32
124 - i\_size\_high / i\_dir\_acl
125 - Upper 32-bits of file/directory size. In ext2/3 this field was named
126 i\_dir\_acl, though it was usually set to zero and never used.
127 * - 0x70
128 - \_\_le32
129 - i\_obso\_faddr
130 - (Obsolete) fragment address.
131 * - 0x74
132 - 12 bytes
133 - i\_osd2
134 - See the table i_osd2_ for more details.
135 * - 0x80
136 - \_\_le16
137 - i\_extra\_isize
138 - Size of this inode - 128. Alternately, the size of the extended inode
139 fields beyond the original ext2 inode, including this field.
140 * - 0x82
141 - \_\_le16
142 - i\_checksum\_hi
143 - Upper 16-bits of the inode checksum.
144 * - 0x84
145 - \_\_le32
146 - i\_ctime\_extra
147 - Extra change time bits. This provides sub-second precision. See Inode
148 Timestamps section.
149 * - 0x88
150 - \_\_le32
151 - i\_mtime\_extra
152 - Extra modification time bits. This provides sub-second precision.
153 * - 0x8C
154 - \_\_le32
155 - i\_atime\_extra
156 - Extra access time bits. This provides sub-second precision.
157 * - 0x90
158 - \_\_le32
159 - i\_crtime
160 - File creation time, in seconds since the epoch.
161 * - 0x94
162 - \_\_le32
163 - i\_crtime\_extra
164 - Extra file creation time bits. This provides sub-second precision.
165 * - 0x98
166 - \_\_le32
167 - i\_version\_hi
168 - Upper 32-bits for version number.
169 * - 0x9C
170 - \_\_le32
171 - i\_projid
172 - Project ID.
173
174.. _i_mode:
175
176The ``i_mode`` value is a combination of the following flags:
177
178.. list-table::
179 :widths: 1 79
180 :header-rows: 1
181
182 * - Value
183 - Description
184 * - 0x1
185 - S\_IXOTH (Others may execute)
186 * - 0x2
187 - S\_IWOTH (Others may write)
188 * - 0x4
189 - S\_IROTH (Others may read)
190 * - 0x8
191 - S\_IXGRP (Group members may execute)
192 * - 0x10
193 - S\_IWGRP (Group members may write)
194 * - 0x20
195 - S\_IRGRP (Group members may read)
196 * - 0x40
197 - S\_IXUSR (Owner may execute)
198 * - 0x80
199 - S\_IWUSR (Owner may write)
200 * - 0x100
201 - S\_IRUSR (Owner may read)
202 * - 0x200
203 - S\_ISVTX (Sticky bit)
204 * - 0x400
205 - S\_ISGID (Set GID)
206 * - 0x800
207 - S\_ISUID (Set UID)
208 * -
209 - These are mutually-exclusive file types:
210 * - 0x1000
211 - S\_IFIFO (FIFO)
212 * - 0x2000
213 - S\_IFCHR (Character device)
214 * - 0x4000
215 - S\_IFDIR (Directory)
216 * - 0x6000
217 - S\_IFBLK (Block device)
218 * - 0x8000
219 - S\_IFREG (Regular file)
220 * - 0xA000
221 - S\_IFLNK (Symbolic link)
222 * - 0xC000
223 - S\_IFSOCK (Socket)
224
225.. _i_flags:
226
227The ``i_flags`` field is a combination of these values:
228
229.. list-table::
230 :widths: 1 79
231 :header-rows: 1
232
233 * - Value
234 - Description
235 * - 0x1
236 - This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented)
237 * - 0x2
238 - This file should be preserved, should undeletion be desired
239 (EXT4\_UNRM\_FL). (not implemented)
240 * - 0x4
241 - File is compressed (EXT4\_COMPR\_FL). (not really implemented)
242 * - 0x8
243 - All writes to the file must be synchronous (EXT4\_SYNC\_FL).
244 * - 0x10
245 - File is immutable (EXT4\_IMMUTABLE\_FL).
246 * - 0x20
247 - File can only be appended (EXT4\_APPEND\_FL).
248 * - 0x40
249 - The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL).
250 * - 0x80
251 - Do not update access time (EXT4\_NOATIME\_FL).
252 * - 0x100
253 - Dirty compressed file (EXT4\_DIRTY\_FL). (not used)
254 * - 0x200
255 - File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used)
256 * - 0x400
257 - Do not compress file (EXT4\_NOCOMPR\_FL). (not used)
258 * - 0x800
259 - Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was
260 EXT4\_ECOMPR\_FL (compression error), which was never used.
261 * - 0x1000
262 - Directory has hashed indexes (EXT4\_INDEX\_FL).
263 * - 0x2000
264 - AFS magic directory (EXT4\_IMAGIC\_FL).
265 * - 0x4000
266 - File data must always be written through the journal
267 (EXT4\_JOURNAL\_DATA\_FL).
268 * - 0x8000
269 - File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4)
270 * - 0x10000
271 - All directory entry data should be written synchronously (see
272 ``dirsync``) (EXT4\_DIRSYNC\_FL).
273 * - 0x20000
274 - Top of directory hierarchy (EXT4\_TOPDIR\_FL).
275 * - 0x40000
276 - This is a huge file (EXT4\_HUGE\_FILE\_FL).
277 * - 0x80000
278 - Inode uses extents (EXT4\_EXTENTS\_FL).
279 * - 0x200000
280 - Inode stores a large extended attribute value in its data blocks
281 (EXT4\_EA\_INODE\_FL).
282 * - 0x400000
283 - This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL).
284 (deprecated)
285 * - 0x01000000
286 - Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline)
287 * - 0x04000000
288 - Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in
289 mainline)
290 * - 0x08000000
291 - Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in
292 mainline)
293 * - 0x10000000
294 - Inode has inline data (EXT4\_INLINE\_DATA\_FL).
295 * - 0x20000000
296 - Create children with the same project ID (EXT4\_PROJINHERIT\_FL).
297 * - 0x80000000
298 - Reserved for ext4 library (EXT4\_RESERVED\_FL).
299 * -
300 - Aggregate flags:
301 * - 0x4BDFFF
302 - User-visible flags.
303 * - 0x4B80FF
304 - User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and
305 EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's
306 EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of
307 these flags in a special manner and they are masked out of the set of
308 flags that are saved directly to i\_flags.
309
310.. _i_osd1:
311
312The ``osd1`` field has multiple meanings depending on the creator:
313
314Linux:
315
316.. list-table::
317 :widths: 1 1 1 77
318 :header-rows: 1
319
320 * - Offset
321 - Size
322 - Name
323 - Description
324 * - 0x0
325 - \_\_le32
326 - l\_i\_version
327 - Inode version. However, if the EA\_INODE inode flag is set, this inode
328 stores an extended attribute value and this field contains the upper 32
329 bits of the attribute value's reference count.
330
331Hurd:
332
333.. list-table::
334 :widths: 1 1 1 77
335 :header-rows: 1
336
337 * - Offset
338 - Size
339 - Name
340 - Description
341 * - 0x0
342 - \_\_le32
343 - h\_i\_translator
344 - ??
345
346Masix:
347
348.. list-table::
349 :widths: 1 1 1 77
350 :header-rows: 1
351
352 * - Offset
353 - Size
354 - Name
355 - Description
356 * - 0x0
357 - \_\_le32
358 - m\_i\_reserved
359 - ??
360
361.. _i_osd2:
362
363The ``osd2`` field has multiple meanings depending on the filesystem creator:
364
365Linux:
366
367.. list-table::
368 :widths: 1 1 1 77
369 :header-rows: 1
370
371 * - Offset
372 - Size
373 - Name
374 - Description
375 * - 0x0
376 - \_\_le16
377 - l\_i\_blocks\_high
378 - Upper 16-bits of the block count. Please see the note attached to
379 i\_blocks\_lo.
380 * - 0x2
381 - \_\_le16
382 - l\_i\_file\_acl\_high
383 - Upper 16-bits of the extended attribute block (historically, the file
384 ACL location). See the Extended Attributes section below.
385 * - 0x4
386 - \_\_le16
387 - l\_i\_uid\_high
388 - Upper 16-bits of the Owner UID.
389 * - 0x6
390 - \_\_le16
391 - l\_i\_gid\_high
392 - Upper 16-bits of the GID.
393 * - 0x8
394 - \_\_le16
395 - l\_i\_checksum\_lo
396 - Lower 16-bits of the inode checksum.
397 * - 0xA
398 - \_\_le16
399 - l\_i\_reserved
400 - Unused.
401
402Hurd:
403
404.. list-table::
405 :widths: 1 1 1 77
406 :header-rows: 1
407
408 * - Offset
409 - Size
410 - Name
411 - Description
412 * - 0x0
413 - \_\_le16
414 - h\_i\_reserved1
415 - ??
416 * - 0x2
417 - \_\_u16
418 - h\_i\_mode\_high
419 - Upper 16-bits of the file mode.
420 * - 0x4
421 - \_\_le16
422 - h\_i\_uid\_high
423 - Upper 16-bits of the Owner UID.
424 * - 0x6
425 - \_\_le16
426 - h\_i\_gid\_high
427 - Upper 16-bits of the GID.
428 * - 0x8
429 - \_\_u32
430 - h\_i\_author
431 - Author code?
432
433Masix:
434
435.. list-table::
436 :widths: 1 1 1 77
437 :header-rows: 1
438
439 * - Offset
440 - Size
441 - Name
442 - Description
443 * - 0x0
444 - \_\_le16
445 - h\_i\_reserved1
446 - ??
447 * - 0x2
448 - \_\_u16
449 - m\_i\_file\_acl\_high
450 - Upper 16-bits of the extended attribute block (historically, the file
451 ACL location).
452 * - 0x4
453 - \_\_u32
454 - m\_i\_reserved2[2]
455 - ??
456
457Inode Size
458~~~~~~~~~~
459
460In ext2 and ext3, the inode structure size was fixed at 128 bytes
461(``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of
462128 bytes. Starting with ext4, it is possible to allocate a larger
463on-disk inode at format time for all inodes in the filesystem to provide
464space beyond the end of the original ext2 inode. The on-disk inode
465record size is recorded in the superblock as ``s_inode_size``. The
466number of bytes actually used by struct ext4\_inode beyond the original
467128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each
468inode, which allows struct ext4\_inode to grow for a new kernel without
469having to upgrade all of the on-disk inodes. Access to fields beyond
470EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within
471``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as
472of October 2013) the inode structure is 156 bytes
473(``i_extra_isize = 28``). The extra space between the end of the inode
474structure and the end of the inode record can be used to store extended
475attributes. Each inode record can be as large as the filesystem block
476size, though this is not terribly efficient.
477
478Finding an Inode
479~~~~~~~~~~~~~~~~
480
481Each block group contains ``sb->s_inodes_per_group`` inodes. Because
482inode 0 is defined not to exist, this formula can be used to find the
483block group that an inode lives in:
484``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode
485can be found within the block group's inode table at
486``index = (inode_num - 1) % sb->s_inodes_per_group``. To get the byte
487address within the inode table, use
488``offset = index * sb->s_inode_size``.
489
490Inode Timestamps
491~~~~~~~~~~~~~~~~
492
493Four timestamps are recorded in the lower 128 bytes of the inode
494structure -- inode change time (ctime), access time (atime), data
495modification time (mtime), and deletion time (dtime). The four fields
496are 32-bit signed integers that represent seconds since the Unix epoch
497(1970-01-01 00:00:00 GMT), which means that the fields will overflow in
498January 2038. For inodes that are not linked from any directory but are
499still open (orphan inodes), the dtime field is overloaded for use with
500the orphan list. The superblock field ``s_last_orphan`` points to the
501first inode in the orphan list; dtime is then the number of the next
502orphaned inode, or zero if there are no more orphans.
503
504If the inode structure size ``sb->s_inode_size`` is larger than 128
505bytes and the ``i_extra_isize`` field is large enough to encompass the
506respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime
507inode fields are widened to 64 bits. Within this “extra” 32-bit field,
508the lower two bits are used to extend the 32-bit seconds field to be 34
509bit wide; the upper 30 bits are used to provide nanosecond timestamp
510accuracy. Therefore, timestamps should not overflow until May 2446.
511dtime was not widened. There is also a fifth timestamp to record inode
512creation time (crtime); this field is 64-bits wide and decoded in the
513same manner as 64-bit [cma]time. Neither crtime nor dtime is accessible
514through the regular stat() interface, though debugfs will report them.
515
516We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)).
517In other words:
518
519.. list-table::
520 :widths: 20 20 20 20 20
521 :header-rows: 1
522
523 * - Extra epoch bits
524 - MSB of 32-bit time
525 - Adjustment for signed 32-bit to 64-bit tv\_sec
526 - Decoded 64-bit tv\_sec
527 - valid time range
528 * - 0 0
529 - 1
530 - 0
531 - ``-0x80000000 - -0x00000001``
532 - 1901-12-13 to 1969-12-31
533 * - 0 0
534 - 0
535 - 0
536 - ``0x000000000 - 0x07fffffff``
537 - 1970-01-01 to 2038-01-19
538 * - 0 1
539 - 1
540 - 0x100000000
541 - ``0x080000000 - 0x0ffffffff``
542 - 2038-01-19 to 2106-02-07
543 * - 0 1
544 - 0
545 - 0x100000000
546 - ``0x100000000 - 0x17fffffff``
547 - 2106-02-07 to 2174-02-25
548 * - 1 0
549 - 1
550 - 0x200000000
551 - ``0x180000000 - 0x1ffffffff``
552 - 2174-02-25 to 2242-03-16
553 * - 1 0
554 - 0
555 - 0x200000000
556 - ``0x200000000 - 0x27fffffff``
557 - 2242-03-16 to 2310-04-04
558 * - 1 1
559 - 1
560 - 0x300000000
561 - ``0x280000000 - 0x2ffffffff``
562 - 2310-04-04 to 2378-04-22
563 * - 1 1
564 - 0
565 - 0x300000000
566 - ``0x300000000 - 0x37fffffff``
567 - 2378-04-22 to 2446-05-10
568
569This is a somewhat odd encoding since there are effectively seven times
570as many positive values as negative values. There have also been
571long-standing bugs decoding and encoding dates beyond 2038, which don't
572seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels
573incorrectly use the extra epoch bits 1,1 for dates between 1901 and
5741970. At some point the kernel will be fixed and e2fsck will fix this
575situation, assuming that it is run before 2310.
diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/ondisk/journal.rst
new file mode 100644
index 000000000000..e7031af86876
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/journal.rst
@@ -0,0 +1,611 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Journal (jbd2)
4--------------
5
6Introduced in ext3, the ext4 filesystem employs a journal to protect the
7filesystem against corruption in the case of a system crash. A small
8continuous region of disk (default 128MiB) is reserved inside the
9filesystem as a place to land “important” data writes on-disk as quickly
10as possible. Once the important data transaction is fully written to the
11disk and flushed from the disk write cache, a record of the data being
12committed is also written to the journal. At some later point in time,
13the journal code writes the transactions to their final locations on
14disk (this could involve a lot of seeking or a lot of small
15read-write-erases) before erasing the commit record. Should the system
16crash during the second slow write, the journal can be replayed all the
17way to the latest commit record, guaranteeing the atomicity of whatever
18gets written through the journal to the disk. The effect of this is to
19guarantee that the filesystem does not become stuck midway through a
20metadata update.
21
22For performance reasons, ext4 by default only writes filesystem metadata
23through the journal. This means that file data blocks are *not*
24guaranteed to be in any consistent state after a crash. If this default
25guarantee level (``data=ordered``) is not satisfactory, there is a mount
26option to control journal behavior. If ``data=journal``, all data and
27metadata are written to disk through the journal. This is slower but
28safest. If ``data=writeback``, dirty data blocks are not flushed to the
29disk before the metadata are written to disk through the journal.
30
31The journal inode is typically inode 8. The first 68 bytes of the
32journal inode are replicated in the ext4 superblock. The journal itself
33is a normal (but hidden) file within the filesystem. The file usually
34consumes an entire block group, though mke2fs tries to put it in the
35middle of the disk.
36
37All fields in jbd2 are written to disk in big-endian order. This is the
38opposite of ext4.
39
40NOTE: Both ext4 and ocfs2 use jbd2.
41
42The maximum size of a journal embedded in an ext4 filesystem is 2^32
43blocks. jbd2 itself does not seem to care.
44
45Layout
46~~~~~~
47
48Generally speaking, the journal has this format:
49
50.. list-table::
51 :widths: 1 1 78
52 :header-rows: 1
53
54 * - Superblock
55 - descriptor\_block (data\_blocks or revocation\_block) [more data or
56 revocations] commit\_block
57 - [more transactions...]
58 * -
59 - One transaction
60 -
61
62Notice that a transaction begins with either a descriptor and some data,
63or a block revocation list. A finished transaction always ends with a
64commit. If there is no commit record (or the checksums don't match), the
65transaction will be discarded during replay.
66
67External Journal
68~~~~~~~~~~~~~~~~
69
70Optionally, an ext4 filesystem can be created with an external journal
71device (as opposed to an internal journal, which uses a reserved inode).
72In this case, on the filesystem device, ``s_journal_inum`` should be
73zero and ``s_journal_uuid`` should be set. On the journal device there
74will be an ext4 super block in the usual place, with a matching UUID.
75The journal superblock will be in the next full block after the
76superblock.
77
78.. list-table::
79 :widths: 1 1 1 1 76
80 :header-rows: 1
81
82 * - 1024 bytes of padding
83 - ext4 Superblock
84 - Journal Superblock
85 - descriptor\_block (data\_blocks or revocation\_block) [more data or
86 revocations] commit\_block
87 - [more transactions...]
88 * -
89 -
90 -
91 - One transaction
92 -
93
94Block Header
95~~~~~~~~~~~~
96
97Every block in the journal starts with a common 12-byte header
98``struct journal_header_s``:
99
100.. list-table::
101 :widths: 1 1 1 77
102 :header-rows: 1
103
104 * - Offset
105 - Type
106 - Name
107 - Description
108 * - 0x0
109 - \_\_be32
110 - h\_magic
111 - jbd2 magic number, 0xC03B3998.
112 * - 0x4
113 - \_\_be32
114 - h\_blocktype
115 - Description of what this block contains. See the jbd2_blocktype_ table
116 below.
117 * - 0x8
118 - \_\_be32
119 - h\_sequence
120 - The transaction ID that goes with this block.
121
122.. _jbd2_blocktype:
123
124The journal block type can be any one of:
125
126.. list-table::
127 :widths: 1 79
128 :header-rows: 1
129
130 * - Value
131 - Description
132 * - 1
133 - Descriptor. This block precedes a series of data blocks that were
134 written through the journal during a transaction.
135 * - 2
136 - Block commit record. This block signifies the completion of a
137 transaction.
138 * - 3
139 - Journal superblock, v1.
140 * - 4
141 - Journal superblock, v2.
142 * - 5
143 - Block revocation records. This speeds up recovery by enabling the
144 journal to skip writing blocks that were subsequently rewritten.
145
146Super Block
147~~~~~~~~~~~
148
149The super block for the journal is much simpler as compared to ext4's.
150The key data kept within are size of the journal, and where to find the
151start of the log of transactions.
152
153The journal superblock is recorded as ``struct journal_superblock_s``,
154which is 1024 bytes long:
155
156.. list-table::
157 :widths: 1 1 1 77
158 :header-rows: 1
159
160 * - Offset
161 - Type
162 - Name
163 - Description
164 * -
165 -
166 -
167 - Static information describing the journal.
168 * - 0x0
169 - journal\_header\_t (12 bytes)
170 - s\_header
171 - Common header identifying this as a superblock.
172 * - 0xC
173 - \_\_be32
174 - s\_blocksize
175 - Journal device block size.
176 * - 0x10
177 - \_\_be32
178 - s\_maxlen
179 - Total number of blocks in this journal.
180 * - 0x14
181 - \_\_be32
182 - s\_first
183 - First block of log information.
184 * -
185 -
186 -
187 - Dynamic information describing the current state of the log.
188 * - 0x18
189 - \_\_be32
190 - s\_sequence
191 - First commit ID expected in log.
192 * - 0x1C
193 - \_\_be32
194 - s\_start
195 - Block number of the start of log. Contrary to the comments, this field
196 being zero does not imply that the journal is clean!
197 * - 0x20
198 - \_\_be32
199 - s\_errno
200 - Error value, as set by jbd2\_journal\_abort().
201 * -
202 -
203 -
204 - The remaining fields are only valid in a v2 superblock.
205 * - 0x24
206 - \_\_be32
207 - s\_feature\_compat
208 - Compatible feature set. See the table jbd2_compat_ below.
209 * - 0x28
210 - \_\_be32
211 - s\_feature\_incompat
212 - Incompatible feature set. See the table jbd2_incompat_ below.
213 * - 0x2C
214 - \_\_be32
215 - s\_feature\_ro\_compat
216 - Read-only compatible feature set. There aren't any of these currently.
217 * - 0x30
218 - \_\_u8
219 - s\_uuid[16]
220 - 128-bit uuid for journal. This is compared against the copy in the ext4
221 super block at mount time.
222 * - 0x40
223 - \_\_be32
224 - s\_nr\_users
225 - Number of file systems sharing this journal.
226 * - 0x44
227 - \_\_be32
228 - s\_dynsuper
229 - Location of dynamic super block copy. (Not used?)
230 * - 0x48
231 - \_\_be32
232 - s\_max\_transaction
233 - Limit of journal blocks per transaction. (Not used?)
234 * - 0x4C
235 - \_\_be32
236 - s\_max\_trans\_data
237 - Limit of data blocks per transaction. (Not used?)
238 * - 0x50
239 - \_\_u8
240 - s\_checksum\_type
241 - Checksum algorithm used for the journal. See jbd2_checksum_type_ for
242 more info.
243 * - 0x51
244 - \_\_u8[3]
245 - s\_padding2
246 -
247 * - 0x54
248 - \_\_u32
249 - s\_padding[42]
250 -
251 * - 0xFC
252 - \_\_be32
253 - s\_checksum
254 - Checksum of the entire superblock, with this field set to zero.
255 * - 0x100
256 - \_\_u8
257 - s\_users[16\*48]
258 - ids of all file systems sharing the log. e2fsprogs/Linux don't allow
259 shared external journals, but I imagine Lustre (or ocfs2?), which use
260 the jbd2 code, might.
261
262.. _jbd2_compat:
263
264The journal compat features are any combination of the following:
265
266.. list-table::
267 :widths: 1 79
268 :header-rows: 1
269
270 * - Value
271 - Description
272 * - 0x1
273 - Journal maintains checksums on the data blocks.
274 (JBD2\_FEATURE\_COMPAT\_CHECKSUM)
275
276.. _jbd2_incompat:
277
278The journal incompat features are any combination of the following:
279
280.. list-table::
281 :widths: 1 79
282 :header-rows: 1
283
284 * - Value
285 - Description
286 * - 0x1
287 - Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE)
288 * - 0x2
289 - Journal can deal with 64-bit block numbers.
290 (JBD2\_FEATURE\_INCOMPAT\_64BIT)
291 * - 0x4
292 - Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT)
293 * - 0x8
294 - This journal uses v2 of the checksum on-disk format. Each journal
295 metadata block gets its own checksum, and the block tags in the
296 descriptor table contain checksums for each of the data blocks in the
297 journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2)
298 * - 0x10
299 - This journal uses v3 of the checksum on-disk format. This is the same as
300 v2, but the journal block tag size is fixed regardless of the size of
301 block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3)
302
303.. _jbd2_checksum_type:
304
305Journal checksum type codes are one of the following. crc32 or crc32c are the
306most likely choices.
307
308.. list-table::
309 :widths: 1 79
310 :header-rows: 1
311
312 * - Value
313 - Description
314 * - 1
315 - CRC32
316 * - 2
317 - MD5
318 * - 3
319 - SHA1
320 * - 4
321 - CRC32C
322
323Descriptor Block
324~~~~~~~~~~~~~~~~
325
326The descriptor block contains an array of journal block tags that
327describe the final locations of the data blocks that follow in the
328journal. Descriptor blocks are open-coded instead of being completely
329described by a data structure, but here is the block structure anyway.
330Descriptor blocks consume at least 36 bytes, but use a full block:
331
332.. list-table::
333 :widths: 1 1 1 77
334 :header-rows: 1
335
336 * - Offset
337 - Type
338 - Name
339 - Description
340 * - 0x0
341 - journal\_header\_t
342 - (open coded)
343 - Common block header.
344 * - 0xC
345 - struct journal\_block\_tag\_s
346 - open coded array[]
347 - Enough tags either to fill up the block or to describe all the data
348 blocks that follow this descriptor block.
349
350Journal block tags have any of the following formats, depending on which
351journal feature and block tag flags are set.
352
353If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is
354defined as ``struct journal_block_tag3_s``, which looks like the
355following. The size is 16 or 32 bytes.
356
357.. list-table::
358 :widths: 1 1 1 77
359 :header-rows: 1
360
361 * - Offset
362 - Type
363 - Name
364 - Description
365 * - 0x0
366 - \_\_be32
367 - t\_blocknr
368 - Lower 32-bits of the location of where the corresponding data block
369 should end up on disk.
370 * - 0x4
371 - \_\_be32
372 - t\_flags
373 - Flags that go with the descriptor. See the table jbd2_tag_flags_ for
374 more info.
375 * - 0x8
376 - \_\_be32
377 - t\_blocknr\_high
378 - Upper 32-bits of the location of where the corresponding data block
379 should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is
380 not enabled.
381 * - 0xC
382 - \_\_be32
383 - t\_checksum
384 - Checksum of the journal UUID, the sequence number, and the data block.
385 * -
386 -
387 -
388 - This field appears to be open coded. It always comes at the end of the
389 tag, after t_checksum. This field is not present if the "same UUID" flag
390 is set.
391 * - 0x10
392 - char
393 - uuid[16]
394 - A UUID to go with this tag. This field appears to be copied from the
395 ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that
396 field.
397
398.. _jbd2_tag_flags:
399
400The journal tag flags are any combination of the following:
401
402.. list-table::
403 :widths: 1 79
404 :header-rows: 1
405
406 * - Value
407 - Description
408 * - 0x1
409 - On-disk block is escaped. The first four bytes of the data block just
410 happened to match the jbd2 magic number.
411 * - 0x2
412 - This block has the same UUID as previous, therefore the UUID field is
413 omitted.
414 * - 0x4
415 - The data block was deleted by the transaction. (Not used?)
416 * - 0x8
417 - This is the last tag in this descriptor block.
418
419If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag
420is defined as ``struct journal_block_tag_s``, which looks like the
421following. The size is 8, 12, 24, or 28 bytes:
422
423.. list-table::
424 :widths: 1 1 1 77
425 :header-rows: 1
426
427 * - Offset
428 - Type
429 - Name
430 - Description
431 * - 0x0
432 - \_\_be32
433 - t\_blocknr
434 - Lower 32-bits of the location of where the corresponding data block
435 should end up on disk.
436 * - 0x4
437 - \_\_be16
438 - t\_checksum
439 - Checksum of the journal UUID, the sequence number, and the data block.
440 Note that only the lower 16 bits are stored.
441 * - 0x6
442 - \_\_be16
443 - t\_flags
444 - Flags that go with the descriptor. See the table jbd2_tag_flags_ for
445 more info.
446 * -
447 -
448 -
449 - This next field is only present if the super block indicates support for
450 64-bit block numbers.
451 * - 0x8
452 - \_\_be32
453 - t\_blocknr\_high
454 - Upper 32-bits of the location of where the corresponding data block
455 should end up on disk.
456 * -
457 -
458 -
459 - This field appears to be open coded. It always comes at the end of the
460 tag, after t_flags or t_blocknr_high. This field is not present if the
461 "same UUID" flag is set.
462 * - 0x8 or 0xC
463 - char
464 - uuid[16]
465 - A UUID to go with this tag. This field appears to be copied from the
466 ``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that
467 field.
468
469If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
470JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
471``struct jbd2_journal_block_tail``, which looks like this:
472
473.. list-table::
474 :widths: 1 1 1 77
475 :header-rows: 1
476
477 * - Offset
478 - Type
479 - Name
480 - Description
481 * - 0x0
482 - \_\_be32
483 - t\_checksum
484 - Checksum of the journal UUID + the descriptor block, with this field set
485 to zero.
486
487Data Block
488~~~~~~~~~~
489
490In general, the data blocks being written to disk through the journal
491are written verbatim into the journal file after the descriptor block.
492However, if the first four bytes of the block match the jbd2 magic
493number then those four bytes are replaced with zeroes and the “escaped”
494flag is set in the descriptor block tag.
495
496Revocation Block
497~~~~~~~~~~~~~~~~
498
499A revocation block is used to prevent replay of a block in an earlier
500transaction. This is used to mark blocks that were journalled at one
501time but are no longer journalled. Typically this happens if a metadata
502block is freed and re-allocated as a file data block; in this case, a
503journal replay after the file block was written to disk will cause
504corruption.
505
506**NOTE**: This mechanism is NOT used to express “this journal block is
507superseded by this other journal block”, as the author (djwong)
508mistakenly thought. Any block being added to a transaction will cause
509the removal of all existing revocation records for that block.
510
511Revocation blocks are described in
512``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in
513length, but use a full block:
514
515.. list-table::
516 :widths: 1 1 1 77
517 :header-rows: 1
518
519 * - Offset
520 - Type
521 - Name
522 - Description
523 * - 0x0
524 - journal\_header\_t
525 - r\_header
526 - Common block header.
527 * - 0xC
528 - \_\_be32
529 - r\_count
530 - Number of bytes used in this block.
531 * - 0x10
532 - \_\_be32 or \_\_be64
533 - blocks[0]
534 - Blocks to revoke.
535
536After r\_count is a linear array of block numbers that are effectively
537revoked by this transaction. The size of each block number is 8 bytes if
538the superblock advertises 64-bit block number support, or 4 bytes
539otherwise.
540
541If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
542JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation
543block is a ``struct jbd2_journal_revoke_tail``, which has this format:
544
545.. list-table::
546 :widths: 1 1 1 77
547 :header-rows: 1
548
549 * - Offset
550 - Type
551 - Name
552 - Description
553 * - 0x0
554 - \_\_be32
555 - r\_checksum
556 - Checksum of the journal UUID + revocation block
557
558Commit Block
559~~~~~~~~~~~~
560
561The commit block is a sentry that indicates that a transaction has been
562completely written to the journal. Once this commit block reaches the
563journal, the data stored with this transaction can be written to their
564final locations on disk.
565
566The commit block is described by ``struct commit_header``, which is 60
567bytes long (but uses a full block):
568
569.. list-table::
570 :widths: 1 1 1 77
571 :header-rows: 1
572
573 * - Offset
574 - Type
575 - Name
576 - Description
577 * - 0x0
578 - journal\_header\_s
579 - (open coded)
580 - Common block header.
581 * - 0xC
582 - unsigned char
583 - h\_chksum\_type
584 - The type of checksum to use to verify the integrity of the data blocks
585 in the transaction. See jbd2_checksum_type_ for more info.
586 * - 0xD
587 - unsigned char
588 - h\_chksum\_size
589 - The number of bytes used by the checksum. Most likely 4.
590 * - 0xE
591 - unsigned char
592 - h\_padding[2]
593 -
594 * - 0x10
595 - \_\_be32
596 - h\_chksum[JBD2\_CHECKSUM\_BYTES]
597 - 32 bytes of space to store checksums. If
598 JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3
599 are set, the first ``__be32`` is the checksum of the journal UUID and
600 the entire commit block, with this field zeroed. If
601 JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the
602 crc32 of all the blocks already written to the transaction.
603 * - 0x30
604 - \_\_be64
605 - h\_commit\_sec
606 - The time that the transaction was committed, in seconds since the epoch.
607 * - 0x38
608 - \_\_be32
609 - h\_commit\_nsec
610 - Nanoseconds component of the above timestamp.
611
diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/ondisk/mmp.rst
new file mode 100644
index 000000000000..b7d7a3137f80
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/mmp.rst
@@ -0,0 +1,77 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Multiple Mount Protection
4-------------------------
5
6Multiple mount protection (MMP) is a feature that protects the
7filesystem against multiple hosts trying to use the filesystem
8simultaneously. When a filesystem is opened (for mounting, or fsck,
9etc.), the MMP code running on the node (call it node A) checks a
10sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the
11open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then
12fsck is (hopefully) running, and open fails immediately. Otherwise, the
13open code will wait for twice the specified MMP check interval and check
14the sequence number again. If the sequence number has changed, then the
15filesystem is active on another machine and the open fails. If the MMP
16code passes all of those checks, a new MMP sequence number is generated
17and written to the MMP block, and the mount proceeds.
18
19While the filesystem is live, the kernel sets up a timer to re-check the
20MMP block at the specified MMP check interval. To perform the re-check,
21the MMP sequence number is re-read; if it does not match the in-memory
22MMP sequence number, then another node (node B) has mounted the
23filesystem, and node A remounts the filesystem read-only. If the
24sequence numbers match, the sequence number is incremented both in
25memory and on disk, and the re-check is complete.
26
27The hostname and device filename are written into the MMP block whenever
28an open operation succeeds. The MMP code does not use these values; they
29are provided purely for informational purposes.
30
31The checksum is calculated against the FS UUID and the MMP structure.
32The MMP structure (``struct mmp_struct``) is as follows:
33
34.. list-table::
35 :widths: 1 1 1 77
36 :header-rows: 1
37
38 * - Offset
39 - Type
40 - Name
41 - Description
42 * - 0x0
43 - \_\_le32
44 - mmp\_magic
45 - Magic number for MMP, 0x004D4D50 (“MMP”).
46 * - 0x4
47 - \_\_le32
48 - mmp\_seq
49 - Sequence number, updated periodically.
50 * - 0x8
51 - \_\_le64
52 - mmp\_time
53 - Time that the MMP block was last updated.
54 * - 0x10
55 - char[64]
56 - mmp\_nodename
57 - Hostname of the node that opened the filesystem.
58 * - 0x50
59 - char[32]
60 - mmp\_bdevname
61 - Block device name of the filesystem.
62 * - 0x70
63 - \_\_le16
64 - mmp\_check\_interval
65 - The MMP re-check interval, in seconds.
66 * - 0x72
67 - \_\_le16
68 - mmp\_pad1
69 - Zero.
70 * - 0x74
71 - \_\_le32[226]
72 - mmp\_pad2
73 - Zero.
74 * - 0x3FC
75 - \_\_le32
76 - mmp\_checksum
77 - Checksum of the MMP block.
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/ondisk/overview.rst
new file mode 100644
index 000000000000..cbab18baba12
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/overview.rst
@@ -0,0 +1,26 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3High Level Design
4=================
5
6An ext4 file system is split into a series of block groups. To reduce
7performance difficulties due to fragmentation, the block allocator tries
8very hard to keep each file's blocks within the same group, thereby
9reducing seek times. The size of a block group is specified in
10``sb.s_blocks_per_group`` blocks, though it can also be calculated as 8 \*
11``block_size_in_bytes``. With the default block size of 4KiB, each group
12will contain 32,768 blocks, for a length of 128MiB. The number of block
13groups is the size of the device divided by the size of a block group.
14
15All fields in ext4 are written to disk in little-endian order. HOWEVER,
16all fields in jbd2 (the journal) are written to disk in big-endian
17order.
18
19.. include:: blocks.rst
20.. include:: blockgroup.rst
21.. include:: special_inodes.rst
22.. include:: allocators.rst
23.. include:: checksums.rst
24.. include:: bigalloc.rst
25.. include:: inlinedata.rst
26.. include:: eainode.rst
diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/ondisk/special_inodes.rst
new file mode 100644
index 000000000000..a82f70c9baeb
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/special_inodes.rst
@@ -0,0 +1,38 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Special inodes
4--------------
5
6ext4 reserves some inodes for special features, as follows:
7
8.. list-table::
9 :widths: 1 79
10 :header-rows: 1
11
12 * - inode Number
13 - Purpose
14 * - 0
15 - Doesn't exist; there is no inode 0.
16 * - 1
17 - List of defective blocks.
18 * - 2
19 - Root directory.
20 * - 3
21 - User quota.
22 * - 4
23 - Group quota.
24 * - 5
25 - Boot loader.
26 * - 6
27 - Undelete directory.
28 * - 7
29 - Reserved group descriptors inode. (“resize inode”)
30 * - 8
31 - Journal inode.
32 * - 9
33 - The “exclude” inode, for snapshots(?)
34 * - 10
35 - Replica inode, used for some non-upstream feature?
36 * - 11
37 - Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock.
38
diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/ondisk/super.rst
new file mode 100644
index 000000000000..5f81dd87e0b9
--- /dev/null
+++ b/Documentation/filesystems/ext4/ondisk/super.rst
@@ -0,0 +1,801 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3Super Block
4-----------
5
6The superblock records various information about the enclosing
7filesystem, such as block counts, inode counts, supported features,
8maintenance information, and more.
9
10If the sparse\_super feature flag is set, redundant copies of the
11superblock and group descriptors are kept only in the groups whose group
12number is either 0 or a power of 3, 5, or 7. If the flag is not set,
13redundant copies are kept in all groups.
14
15The superblock checksum is calculated against the superblock structure,
16which includes the FS UUID.
17
18The ext4 superblock is laid out as follows in
19``struct ext4_super_block``:
20
21.. list-table::
22 :widths: 1 1 1 77
23 :header-rows: 1
24
25 * - Offset
26 - Size
27 - Name
28 - Description
29 * - 0x0
30 - \_\_le32
31 - s\_inodes\_count
32 - Total inode count.
33 * - 0x4
34 - \_\_le32
35 - s\_blocks\_count\_lo
36 - Total block count.
37 * - 0x8
38 - \_\_le32
39 - s\_r\_blocks\_count\_lo
40 - This number of blocks can only be allocated by the super-user.
41 * - 0xC
42 - \_\_le32
43 - s\_free\_blocks\_count\_lo
44 - Free block count.
45 * - 0x10
46 - \_\_le32
47 - s\_free\_inodes\_count
48 - Free inode count.
49 * - 0x14
50 - \_\_le32
51 - s\_first\_data\_block
52 - First data block. This must be at least 1 for 1k-block filesystems and
53 is typically 0 for all other block sizes.
54 * - 0x18
55 - \_\_le32
56 - s\_log\_block\_size
57 - Block size is 2 ^ (10 + s\_log\_block\_size).
58 * - 0x1C
59 - \_\_le32
60 - s\_log\_cluster\_size
61 - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is
62 enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size.
63 * - 0x20
64 - \_\_le32
65 - s\_blocks\_per\_group
66 - Blocks per group.
67 * - 0x24
68 - \_\_le32
69 - s\_clusters\_per\_group
70 - Clusters per group, if bigalloc is enabled. Otherwise
71 s\_clusters\_per\_group must equal s\_blocks\_per\_group.
72 * - 0x28
73 - \_\_le32
74 - s\_inodes\_per\_group
75 - Inodes per group.
76 * - 0x2C
77 - \_\_le32
78 - s\_mtime
79 - Mount time, in seconds since the epoch.
80 * - 0x30
81 - \_\_le32
82 - s\_wtime
83 - Write time, in seconds since the epoch.
84 * - 0x34
85 - \_\_le16
86 - s\_mnt\_count
87 - Number of mounts since the last fsck.
88 * - 0x36
89 - \_\_le16
90 - s\_max\_mnt\_count
91 - Number of mounts beyond which a fsck is needed.
92 * - 0x38
93 - \_\_le16
94 - s\_magic
95 - Magic signature, 0xEF53
96 * - 0x3A
97 - \_\_le16
98 - s\_state
99 - File system state. See super_state_ for more info.
100 * - 0x3C
101 - \_\_le16
102 - s\_errors
103 - Behaviour when detecting errors. See super_errors_ for more info.
104 * - 0x3E
105 - \_\_le16
106 - s\_minor\_rev\_level
107 - Minor revision level.
108 * - 0x40
109 - \_\_le32
110 - s\_lastcheck
111 - Time of last check, in seconds since the epoch.
112 * - 0x44
113 - \_\_le32
114 - s\_checkinterval
115 - Maximum time between checks, in seconds.
116 * - 0x48
117 - \_\_le32
118 - s\_creator\_os
119 - Creator OS. See the table super_creator_ for more info.
120 * - 0x4C
121 - \_\_le32
122 - s\_rev\_level
123 - Revision level. See the table super_revision_ for more info.
124 * - 0x50
125 - \_\_le16
126 - s\_def\_resuid
127 - Default uid for reserved blocks.
128 * - 0x52
129 - \_\_le16
130 - s\_def\_resgid
131 - Default gid for reserved blocks.
132 * -
133 -
134 -
135 - These fields are for EXT4_DYNAMIC_REV superblocks only.
136
137 Note: the difference between the compatible feature set and the
138 incompatible feature set is that if there is a bit set in the
139 incompatible feature set that the kernel doesn't know about, it should
140 refuse to mount the filesystem.
141
142 e2fsck's requirements are more strict; if it doesn't know
143 about a feature in either the compatible or incompatible feature set, it
144 must abort and not try to meddle with things it doesn't understand...
145 * - 0x54
146 - \_\_le32
147 - s\_first\_ino
148 - First non-reserved inode.
149 * - 0x58
150 - \_\_le16
151 - s\_inode\_size
152 - Size of inode structure, in bytes.
153 * - 0x5A
154 - \_\_le16
155 - s\_block\_group\_nr
156 - Block group # of this superblock.
157 * - 0x5C
158 - \_\_le32
159 - s\_feature\_compat
160 - Compatible feature set flags. Kernel can still read/write this fs even
161 if it doesn't understand a flag; fsck should not do that. See the
162 super_compat_ table for more info.
163 * - 0x60
164 - \_\_le32
165 - s\_feature\_incompat
166 - Incompatible feature set. If the kernel or fsck doesn't understand one
167 of these bits, it should stop. See the super_incompat_ table for more
168 info.
169 * - 0x64
170 - \_\_le32
171 - s\_feature\_ro\_compat
172 - Readonly-compatible feature set. If the kernel doesn't understand one of
173 these bits, it can still mount read-only. See the super_rocompat_ table
174 for more info.
175 * - 0x68
176 - \_\_u8
177 - s\_uuid[16]
178 - 128-bit UUID for volume.
179 * - 0x78
180 - char
181 - s\_volume\_name[16]
182 - Volume label.
183 * - 0x88
184 - char
185 - s\_last\_mounted[64]
186 - Directory where filesystem was last mounted.
187 * - 0xC8
188 - \_\_le32
189 - s\_algorithm\_usage\_bitmap
190 - For compression (Not used in e2fsprogs/Linux)
191 * -
192 -
193 -
194 - Performance hints. Directory preallocation should only happen if the
195 EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
196 * - 0xCC
197 - \_\_u8
198 - s\_prealloc\_blocks
199 - Number of blocks to try to preallocate for ... files? (Not used in
200 e2fsprogs/Linux)
201 * - 0xCD
202 - \_\_u8
203 - s\_prealloc\_dir\_blocks
204 - Number of blocks to preallocate for directories. (Not used in
205 e2fsprogs/Linux)
206 * - 0xCE
207 - \_\_le16
208 - s\_reserved\_gdt\_blocks
209 - Number of reserved GDT entries for future filesystem expansion.
210 * -
211 -
212 -
213 - Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is
214 set.
215 * - 0xD0
216 - \_\_u8
217 - s\_journal\_uuid[16]
218 - UUID of journal superblock
219 * - 0xE0
220 - \_\_le32
221 - s\_journal\_inum
222 - inode number of journal file.
223 * - 0xE4
224 - \_\_le32
225 - s\_journal\_dev
226 - Device number of journal file, if the external journal feature flag is
227 set.
228 * - 0xE8
229 - \_\_le32
230 - s\_last\_orphan
231 - Start of list of orphaned inodes to delete.
232 * - 0xEC
233 - \_\_le32
234 - s\_hash\_seed[4]
235 - HTREE hash seed.
236 * - 0xFC
237 - \_\_u8
238 - s\_def\_hash\_version
239 - Default hash algorithm to use for directory hashes. See super_def_hash_
240 for more info.
241 * - 0xFD
242 - \_\_u8
243 - s\_jnl\_backup\_type
244 - If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the
245 ``s_jnl_blocks`` field contains a duplicate copy of the inode's
246 ``i_block[]`` array and ``i_size``.
247 * - 0xFE
248 - \_\_le16
249 - s\_desc\_size
250 - Size of group descriptors, in bytes, if the 64bit incompat feature flag
251 is set.
252 * - 0x100
253 - \_\_le32
254 - s\_default\_mount\_opts
255 - Default mount options. See the super_mountopts_ table for more info.
256 * - 0x104
257 - \_\_le32
258 - s\_first\_meta\_bg
259 - First metablock block group, if the meta\_bg feature is enabled.
260 * - 0x108
261 - \_\_le32
262 - s\_mkfs\_time
263 - When the filesystem was created, in seconds since the epoch.
264 * - 0x10C
265 - \_\_le32
266 - s\_jnl\_blocks[17]
267 - Backup copy of the journal inode's ``i_block[]`` array in the first 15
268 elements and i\_size\_high and i\_size in the 16th and 17th elements,
269 respectively.
270 * -
271 -
272 -
273 - 64bit support is valid only if EXT4_FEATURE_INCOMPAT_64BIT is set.
274 * - 0x150
275 - \_\_le32
276 - s\_blocks\_count\_hi
277 - High 32-bits of the block count.
278 * - 0x154
279 - \_\_le32
280 - s\_r\_blocks\_count\_hi
281 - High 32-bits of the reserved block count.
282 * - 0x158
283 - \_\_le32
284 - s\_free\_blocks\_count\_hi
285 - High 32-bits of the free block count.
286 * - 0x15C
287 - \_\_le16
288 - s\_min\_extra\_isize
289 - All inodes have at least # bytes.
290 * - 0x15E
291 - \_\_le16
292 - s\_want\_extra\_isize
293 - New inodes should reserve # bytes.
294 * - 0x160
295 - \_\_le32
296 - s\_flags
297 - Miscellaneous flags. See the super_flags_ table for more info.
298 * - 0x164
299 - \_\_le16
300 - s\_raid\_stride
301 - RAID stride. This is the number of logical blocks read from or written
302 to the disk before moving to the next disk. This affects the placement
303 of filesystem metadata, which will hopefully make RAID storage faster.
304 * - 0x166
305 - \_\_le16
306 - s\_mmp\_interval
307 - Number of seconds to wait in multi-mount prevention (MMP) checking. In theory,
308 MMP is a mechanism to record in the superblock which host and device
309 have mounted the filesystem, in order to prevent multiple mounts. This
310 feature does not seem to be implemented...
311 * - 0x168
312 - \_\_le64
313 - s\_mmp\_block
314 - Block # for multi-mount protection data.
315 * - 0x170
316 - \_\_le32
317 - s\_raid\_stripe\_width
318 - RAID stripe width. This is the number of logical blocks read from or
319 written to the disk before coming back to the current disk. This is used
320 by the block allocator to try to reduce the number of read-modify-write
321 operations in a RAID5/6.
322 * - 0x174
323 - \_\_u8
324 - s\_log\_groups\_per\_flex
325 - Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``.
326 * - 0x175
327 - \_\_u8
328 - s\_checksum\_type
329 - Metadata checksum algorithm type. The only valid value is 1 (crc32c).
330 * - 0x176
331 - \_\_le16
332 - s\_reserved\_pad
333 -
334 * - 0x178
335 - \_\_le64
336 - s\_kbytes\_written
337 - Number of KiB written to this filesystem over its lifetime.
338 * - 0x180
339 - \_\_le32
340 - s\_snapshot\_inum
341 - inode number of active snapshot. (Not used in e2fsprogs/Linux.)
342 * - 0x184
343 - \_\_le32
344 - s\_snapshot\_id
345 - Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.)
346 * - 0x188
347 - \_\_le64
348 - s\_snapshot\_r\_blocks\_count
349 - Number of blocks reserved for active snapshot's future use. (Not used in
350 e2fsprogs/Linux.)
351 * - 0x190
352 - \_\_le32
353 - s\_snapshot\_list
354 - inode number of the head of the on-disk snapshot list. (Not used in
355 e2fsprogs/Linux.)
356 * - 0x194
357 - \_\_le32
358 - s\_error\_count
359 - Number of errors seen.
360 * - 0x198
361 - \_\_le32
362 - s\_first\_error\_time
363 - First time an error happened, in seconds since the epoch.
364 * - 0x19C
365 - \_\_le32
366 - s\_first\_error\_ino
367 - inode involved in first error.
368 * - 0x1A0
369 - \_\_le64
370 - s\_first\_error\_block
371 - Number of block involved in first error.
372 * - 0x1A8
373 - \_\_u8
374 - s\_first\_error\_func[32]
375 - Name of function where the error happened.
376 * - 0x1C8
377 - \_\_le32
378 - s\_first\_error\_line
379 - Line number where error happened.
380 * - 0x1CC
381 - \_\_le32
382 - s\_last\_error\_time
383 - Time of most recent error, in seconds since the epoch.
384 * - 0x1D0
385 - \_\_le32
386 - s\_last\_error\_ino
387 - inode involved in most recent error.
388 * - 0x1D4
389 - \_\_le32
390 - s\_last\_error\_line
391 - Line number where most recent error happened.
392 * - 0x1D8
393 - \_\_le64
394 - s\_last\_error\_block
395 - Number of block involved in most recent error.
396 * - 0x1E0
397 - \_\_u8
398 - s\_last\_error\_func[32]
399 - Name of function where the most recent error happened.
400 * - 0x200
401 - \_\_u8
402 - s\_mount\_opts[64]
403 - ASCIIZ string of mount options.
404 * - 0x240
405 - \_\_le32
406 - s\_usr\_quota\_inum
407 - Inode number of user `quota <quota>`__ file.
408 * - 0x244
409 - \_\_le32
410 - s\_grp\_quota\_inum
411 - Inode number of group `quota <quota>`__ file.
412 * - 0x248
413 - \_\_le32
414 - s\_overhead\_blocks
415 - Overhead blocks/clusters in fs. (Huh? This field is always zero, which
416 means that the kernel calculates it dynamically.)
417 * - 0x24C
418 - \_\_le32
419 - s\_backup\_bgs[2]
420 - Block groups containing superblock backups (if sparse\_super2)
421 * - 0x254
422 - \_\_u8
423 - s\_encrypt\_algos[4]
424 - Encryption algorithms in use. There can be up to four algorithms in use
425 at any time; valid algorithm codes are given in the super_encrypt_ table
426 below.
427 * - 0x258
428 - \_\_u8
429 - s\_encrypt\_pw\_salt[16]
430 - Salt for the string2key algorithm for encryption.
431 * - 0x268
432 - \_\_le32
433 - s\_lpf\_ino
434 - Inode number of lost+found
435 * - 0x26C
436 - \_\_le32
437 - s\_prj\_quota\_inum
438 - Inode that tracks project quotas.
439 * - 0x270
440 - \_\_le32
441 - s\_checksum\_seed
442 - Checksum seed used for metadata\_csum calculations. This value is
443 crc32c(~0, $orig\_fs\_uuid).
444 * - 0x274
445 - \_\_u8
446 - s\_wtime_hi
447 - Upper 8 bits of the s_wtime field.
448 * - 0x275
449 - \_\_u8
450 - s\_mtime_hi
451 - Upper 8 bits of the s_mtime field.
452 * - 0x276
453 - \_\_u8
454 - s\_mkfs_time_hi
455 - Upper 8 bits of the s_mkfs_time field.
456 * - 0x277
457 - \_\_u8
458 - s\_lastcheck_hi
459 - Upper 8 bits of the s_lastcheck field.
460 * - 0x278
461 - \_\_u8
462 - s\_first_error_time_hi
463 - Upper 8 bits of the s_first_error_time field.
464 * - 0x279
465 - \_\_u8
466 - s\_last_error_time_hi
467 - Upper 8 bits of the s_last_error_time field.
468 * - 0x27A
469 - \_\_u8[2]
470 - s\_pad
471 - Zero padding.
472 * - 0x27C
473 - \_\_le32
474 - s\_reserved[96]
475 - Padding to the end of the block.
476 * - 0x3FC
477 - \_\_le32
478 - s\_checksum
479 - Superblock checksum.
480
481.. _super_state:
482
483The superblock state is some combination of the following:
484
485.. list-table::
486 :widths: 1 79
487 :header-rows: 1
488
489 * - Value
490 - Description
491 * - 0x0001
492 - Cleanly umounted
493 * - 0x0002
494 - Errors detected
495 * - 0x0004
496 - Orphans being recovered
497
498.. _super_errors:
499
500The superblock error policy is one of the following:
501
502.. list-table::
503 :widths: 1 79
504 :header-rows: 1
505
506 * - Value
507 - Description
508 * - 1
509 - Continue
510 * - 2
511 - Remount read-only
512 * - 3
513 - Panic
514
515.. _super_creator:
516
517The filesystem creator is one of the following:
518
519.. list-table::
520 :widths: 1 79
521 :header-rows: 1
522
523 * - Value
524 - Description
525 * - 0
526 - Linux
527 * - 1
528 - Hurd
529 * - 2
530 - Masix
531 * - 3
532 - FreeBSD
533 * - 4
534 - Lites
535
536.. _super_revision:
537
538The superblock revision is one of the following:
539
540.. list-table::
541 :widths: 1 79
542 :header-rows: 1
543
544 * - Value
545 - Description
546 * - 0
547 - Original format
548 * - 1
549 - v2 format w/ dynamic inode sizes
550
551Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem.
552
553.. _super_compat:
554
555The superblock compatible features field is a combination of any of the
556following:
557
558.. list-table::
559 :widths: 1 79
560 :header-rows: 1
561
562 * - Value
563 - Description
564 * - 0x1
565 - Directory preallocation (COMPAT\_DIR\_PREALLOC).
566 * - 0x2
567 - “imagic inodes”. Not clear from the code what this does
568 (COMPAT\_IMAGIC\_INODES).
569 * - 0x4
570 - Has a journal (COMPAT\_HAS\_JOURNAL).
571 * - 0x8
572 - Supports extended attributes (COMPAT\_EXT\_ATTR).
573 * - 0x10
574 - Has reserved GDT blocks for filesystem expansion
575 (COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER.
576 * - 0x20
577 - Has directory indices (COMPAT\_DIR\_INDEX).
578 * - 0x40
579 - “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized
580 block groups? (COMPAT\_LAZY\_BG)
581 * - 0x80
582 - “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE).
583 * - 0x100
584 - “Exclude bitmap”. Seems to be used to indicate the presence of
585 snapshot-related exclude bitmaps? Not defined in kernel or used in
586 e2fsprogs (COMPAT\_EXCLUDE\_BITMAP).
587 * - 0x200
588 - Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs
589 points to the two block groups that contain backup superblocks
590 (COMPAT\_SPARSE\_SUPER2).
591
592.. _super_incompat:
593
594The superblock incompatible features field is a combination of any of the
595following:
596
597.. list-table::
598 :widths: 1 79
599 :header-rows: 1
600
601 * - Value
602 - Description
603 * - 0x1
604 - Compression (INCOMPAT\_COMPRESSION).
605 * - 0x2
606 - Directory entries record the file type. See ext4\_dir\_entry\_2 below
607 (INCOMPAT\_FILETYPE).
608 * - 0x4
609 - Filesystem needs recovery (INCOMPAT\_RECOVER).
610 * - 0x8
611 - Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV).
612 * - 0x10
613 - Meta block groups. See the earlier discussion of this feature
614 (INCOMPAT\_META\_BG).
615 * - 0x40
616 - Files in this filesystem use extents (INCOMPAT\_EXTENTS).
617 * - 0x80
618 - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT).
619 * - 0x100
620 - Multiple mount protection. Not implemented (INCOMPAT\_MMP).
621 * - 0x200
622 - Flexible block groups. See the earlier discussion of this feature
623 (INCOMPAT\_FLEX\_BG).
624 * - 0x400
625 - Inodes can be used to store large extended attribute values
626 (INCOMPAT\_EA\_INODE).
627 * - 0x1000
628 - Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?)
629 * - 0x2000
630 - Metadata checksum seed is stored in the superblock. This feature enables
631 the administrator to change the UUID of a metadata\_csum filesystem
632 while the filesystem is mounted; without it, the checksum definition
633 requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED).
634 * - 0x4000
635 - Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to
636 this feature, directories could not be larger than 4GiB and could not
637 have an htree more than 2 levels deep. If this feature is enabled,
638 directories can be larger than 4GiB and have a maximum htree depth of 3.
639 * - 0x8000
640 - Data in inode (INCOMPAT\_INLINE\_DATA).
641 * - 0x10000
642 - Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT).
643
644.. _super_rocompat:
645
646The superblock read-only compatible features field is a combination of any of
647the following:
648
649.. list-table::
650 :widths: 1 79
651 :header-rows: 1
652
653 * - Value
654 - Description
655 * - 0x1
656 - Sparse superblocks. See the earlier discussion of this feature
657 (RO\_COMPAT\_SPARSE\_SUPER).
658 * - 0x2
659 - This filesystem has been used to store a file greater than 2GiB
660 (RO\_COMPAT\_LARGE\_FILE).
661 * - 0x4
662 - Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR).
663 * - 0x8
664 - This filesystem has files whose sizes are represented in units of
665 logical blocks, not 512-byte sectors. This implies a very large file
666 indeed! (RO\_COMPAT\_HUGE\_FILE)
667 * - 0x10
668 - Group descriptors have checksums. In addition to detecting corruption,
669 this is useful for lazy formatting with uninitialized groups
670 (RO\_COMPAT\_GDT\_CSUM).
671 * - 0x20
672 - Indicates that the old ext3 32,000 subdirectory limit no longer applies
673 (RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1
674 if it is incremented past 64,999.
675 * - 0x40
676 - Indicates that large inodes exist on this filesystem
677 (RO\_COMPAT\_EXTRA\_ISIZE).
678 * - 0x80
679 - This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT).
680 * - 0x100
681 - `Quota <Quota>`__ (RO\_COMPAT\_QUOTA).
682 * - 0x200
683 - This filesystem supports “bigalloc”, which means that file extents are
684 tracked in units of clusters (of blocks) instead of blocks
685 (RO\_COMPAT\_BIGALLOC).
686 * - 0x400
687 - This filesystem supports metadata checksumming.
688 (RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though
689 GDT\_CSUM must not be set)
690 * - 0x800
691 - Filesystem supports replicas. This feature is neither in the kernel nor
692 e2fsprogs. (RO\_COMPAT\_REPLICA)
693 * - 0x1000
694 - Read-only filesystem image; the kernel will not mount this image
695 read-write and most tools will refuse to write to the image.
696 (RO\_COMPAT\_READONLY)
697 * - 0x2000
698 - Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT)
699
700.. _super_def_hash:
701
702The ``s_def_hash_version`` field is one of the following:
703
704.. list-table::
705 :widths: 1 79
706 :header-rows: 1
707
708 * - Value
709 - Description
710 * - 0x0
711 - Legacy.
712 * - 0x1
713 - Half MD4.
714 * - 0x2
715 - Tea.
716 * - 0x3
717 - Legacy, unsigned.
718 * - 0x4
719 - Half MD4, unsigned.
720 * - 0x5
721 - Tea, unsigned.
722
723.. _super_mountopts:
724
725The ``s_default_mount_opts`` field is any combination of the following:
726
727.. list-table::
728 :widths: 1 79
729 :header-rows: 1
730
731 * - Value
732 - Description
733 * - 0x0001
734 - Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG)
735 * - 0x0002
736 - New files take the gid of the containing directory (instead of the fsgid
737 of the current process). (EXT4\_DEFM\_BSDGROUPS)
738 * - 0x0004
739 - Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER)
740 * - 0x0008
741 - Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL)
742 * - 0x0010
743 - Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16)
744 * - 0x0020
745 - All data and metadata are committed to the journal.
746 (EXT4\_DEFM\_JMODE\_DATA)
747 * - 0x0040
748 - All data are flushed to the disk before metadata are committed to the
749 journal. (EXT4\_DEFM\_JMODE\_ORDERED)
750 * - 0x0060
751 - Data ordering is not preserved; data may be written after the metadata
752 has been written. (EXT4\_DEFM\_JMODE\_WBACK)
753 * - 0x0100
754 - Disable write flushes. (EXT4\_DEFM\_NOBARRIER)
755 * - 0x0200
756 - Track which blocks in a filesystem are metadata and therefore should not
757 be used as data blocks. This option will be enabled by default on 3.18,
758 hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY)
759 * - 0x0400
760 - Enable DISCARD support, where the storage device is told about blocks
761 becoming unused. (EXT4\_DEFM\_DISCARD)
762 * - 0x0800
763 - Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC)
764
765.. _super_flags:
766
767The ``s_flags`` field is any combination of the following:
768
769.. list-table::
770 :widths: 1 79
771 :header-rows: 1
772
773 * - Value
774 - Description
775 * - 0x0001
776 - Signed directory hash in use.
777 * - 0x0002
778 - Unsigned directory hash in use.
779 * - 0x0004
780 - To test development code.
781
782.. _super_encrypt:
783
784The ``s_encrypt_algos`` list can contain any of the following:
785
786.. list-table::
787 :widths: 1 79
788 :header-rows: 1
789
790 * - Value
791 - Description
792 * - 0
793 - Invalid algorithm (ENCRYPTION\_MODE\_INVALID).
794 * - 1
795 - 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS).
796 * - 2
797 - 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM).
798 * - 3
799 - 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC).
800
801Total size of the superblock is 1024 bytes.
diff --git a/Documentation/index.rst b/Documentation/index.rst
index fdc585703498..f95ba981f8cd 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -102,6 +102,17 @@ implementation.
102 102
103 sh/index 103 sh/index
104 104
105Filesystem Documentation
106------------------------
107
108The documentation in this section is provided by specific filesystem
109subprojects.
110
111.. toctree::
112 :maxdepth: 2
113
114 filesystems/ext4/index
115
105Korean translations 116Korean translations
106------------------- 117-------------------
107 118
diff --git a/fs/dax.c b/fs/dax.c
index 641192808bb6..897b51e41d8f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -566,7 +566,8 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
566 if (index >= end) 566 if (index >= end)
567 break; 567 break;
568 568
569 if (!radix_tree_exceptional_entry(pvec_ent)) 569 if (WARN_ON_ONCE(
570 !radix_tree_exceptional_entry(pvec_ent)))
570 continue; 571 continue;
571 572
572 xa_lock_irq(&mapping->i_pages); 573 xa_lock_irq(&mapping->i_pages);
@@ -578,6 +579,13 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
578 if (page) 579 if (page)
579 break; 580 break;
580 } 581 }
582
583 /*
584 * We don't expect normal struct page entries to exist in our
585 * tree, but we keep these pagevec calls so that this code is
586 * consistent with the common pattern for handling pagevecs
587 * throughout the kernel.
588 */
581 pagevec_remove_exceptionals(&pvec); 589 pagevec_remove_exceptionals(&pvec);
582 pagevec_release(&pvec); 590 pagevec_release(&pvec);
583 index++; 591 index++;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index aa52d87985aa..e5d6ee61ff48 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -426,9 +426,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
426 } 426 }
427 bh = sb_getblk(sb, bitmap_blk); 427 bh = sb_getblk(sb, bitmap_blk);
428 if (unlikely(!bh)) { 428 if (unlikely(!bh)) {
429 ext4_error(sb, "Cannot get buffer for block bitmap - " 429 ext4_warning(sb, "Cannot get buffer for block bitmap - "
430 "block_group = %u, block_bitmap = %llu", 430 "block_group = %u, block_bitmap = %llu",
431 block_group, bitmap_blk); 431 block_group, bitmap_blk);
432 return ERR_PTR(-ENOMEM); 432 return ERR_PTR(-ENOMEM);
433 } 433 }
434 434
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7c7123f265c2..1fc013f3d944 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -789,17 +789,16 @@ struct move_extent {
789 * affected filesystem before 2242. 789 * affected filesystem before 2242.
790 */ 790 */
791 791
792static inline __le32 ext4_encode_extra_time(struct timespec *time) 792static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
793{ 793{
794 u32 extra = sizeof(time->tv_sec) > 4 ? 794 u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK;
795 ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0;
796 return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); 795 return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
797} 796}
798 797
799static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 798static inline void ext4_decode_extra_time(struct timespec64 *time,
799 __le32 extra)
800{ 800{
801 if (unlikely(sizeof(time->tv_sec) > 4 && 801 if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
802 (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
803 802
804#if 1 803#if 1
805 /* Handle legacy encoding of pre-1970 dates with epoch 804 /* Handle legacy encoding of pre-1970 dates with epoch
@@ -821,9 +820,8 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
821do { \ 820do { \
822 (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ 821 (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
823 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ 822 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
824 struct timespec ts = timespec64_to_timespec((inode)->xtime); \
825 (raw_inode)->xtime ## _extra = \ 823 (raw_inode)->xtime ## _extra = \
826 ext4_encode_extra_time(&ts); \ 824 ext4_encode_extra_time(&(inode)->xtime); \
827 } \ 825 } \
828} while (0) 826} while (0)
829 827
@@ -840,10 +838,8 @@ do { \
840do { \ 838do { \
841 (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ 839 (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
842 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ 840 if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
843 struct timespec ts = timespec64_to_timespec((inode)->xtime); \ 841 ext4_decode_extra_time(&(inode)->xtime, \
844 ext4_decode_extra_time(&ts, \
845 raw_inode->xtime ## _extra); \ 842 raw_inode->xtime ## _extra); \
846 (inode)->xtime = timespec_to_timespec64(ts); \
847 } \ 843 } \
848 else \ 844 else \
849 (inode)->xtime.tv_nsec = 0; \ 845 (inode)->xtime.tv_nsec = 0; \
@@ -993,9 +989,9 @@ struct ext4_inode_info {
993 989
994 /* 990 /*
995 * File creation time. Its function is same as that of 991 * File creation time. Its function is same as that of
996 * struct timespec i_{a,c,m}time in the generic inode. 992 * struct timespec64 i_{a,c,m}time in the generic inode.
997 */ 993 */
998 struct timespec i_crtime; 994 struct timespec64 i_crtime;
999 995
1000 /* mballoc */ 996 /* mballoc */
1001 struct list_head i_prealloc_list; 997 struct list_head i_prealloc_list;
@@ -1299,7 +1295,14 @@ struct ext4_super_block {
1299 __le32 s_lpf_ino; /* Location of the lost+found inode */ 1295 __le32 s_lpf_ino; /* Location of the lost+found inode */
1300 __le32 s_prj_quota_inum; /* inode for tracking project quota */ 1296 __le32 s_prj_quota_inum; /* inode for tracking project quota */
1301 __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ 1297 __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */
1302 __le32 s_reserved[98]; /* Padding to the end of the block */ 1298 __u8 s_wtime_hi;
1299 __u8 s_mtime_hi;
1300 __u8 s_mkfs_time_hi;
1301 __u8 s_lastcheck_hi;
1302 __u8 s_first_error_time_hi;
1303 __u8 s_last_error_time_hi;
1304 __u8 s_pad[2];
1305 __le32 s_reserved[96]; /* Padding to the end of the block */
1303 __le32 s_checksum; /* crc32c(superblock) */ 1306 __le32 s_checksum; /* crc32c(superblock) */
1304}; 1307};
1305 1308
@@ -2456,6 +2459,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
2456extern int ext4_inode_attach_jinode(struct inode *inode); 2459extern int ext4_inode_attach_jinode(struct inode *inode);
2457extern int ext4_can_truncate(struct inode *inode); 2460extern int ext4_can_truncate(struct inode *inode);
2458extern int ext4_truncate(struct inode *); 2461extern int ext4_truncate(struct inode *);
2462extern int ext4_break_layouts(struct inode *);
2459extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); 2463extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
2460extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); 2464extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
2461extern void ext4_set_inode_flags(struct inode *); 2465extern void ext4_set_inode_flags(struct inode *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8ce6fd5b10dd..72a361d5ef74 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4826,6 +4826,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4826 * released from page cache. 4826 * released from page cache.
4827 */ 4827 */
4828 down_write(&EXT4_I(inode)->i_mmap_sem); 4828 down_write(&EXT4_I(inode)->i_mmap_sem);
4829
4830 ret = ext4_break_layouts(inode);
4831 if (ret) {
4832 up_write(&EXT4_I(inode)->i_mmap_sem);
4833 goto out_mutex;
4834 }
4835
4829 ret = ext4_update_disksize_before_punch(inode, offset, len); 4836 ret = ext4_update_disksize_before_punch(inode, offset, len);
4830 if (ret) { 4837 if (ret) {
4831 up_write(&EXT4_I(inode)->i_mmap_sem); 4838 up_write(&EXT4_I(inode)->i_mmap_sem);
@@ -5499,6 +5506,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5499 * page cache. 5506 * page cache.
5500 */ 5507 */
5501 down_write(&EXT4_I(inode)->i_mmap_sem); 5508 down_write(&EXT4_I(inode)->i_mmap_sem);
5509
5510 ret = ext4_break_layouts(inode);
5511 if (ret)
5512 goto out_mmap;
5513
5502 /* 5514 /*
5503 * Need to round down offset to be aligned with page size boundary 5515 * Need to round down offset to be aligned with page size boundary
5504 * for page size > block size. 5516 * for page size > block size.
@@ -5647,6 +5659,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5647 * page cache. 5659 * page cache.
5648 */ 5660 */
5649 down_write(&EXT4_I(inode)->i_mmap_sem); 5661 down_write(&EXT4_I(inode)->i_mmap_sem);
5662
5663 ret = ext4_break_layouts(inode);
5664 if (ret)
5665 goto out_mmap;
5666
5650 /* 5667 /*
5651 * Need to round down to align start offset to page size boundary 5668 * Need to round down to align start offset to page size boundary
5652 * for page size > block size. 5669 * for page size > block size.
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f336cbc6e932..2addcb8730e1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -138,9 +138,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
138 } 138 }
139 bh = sb_getblk(sb, bitmap_blk); 139 bh = sb_getblk(sb, bitmap_blk);
140 if (unlikely(!bh)) { 140 if (unlikely(!bh)) {
141 ext4_error(sb, "Cannot read inode bitmap - " 141 ext4_warning(sb, "Cannot read inode bitmap - "
142 "block_group = %u, inode_bitmap = %llu", 142 "block_group = %u, inode_bitmap = %llu",
143 block_group, bitmap_blk); 143 block_group, bitmap_blk);
144 return ERR_PTR(-ENOMEM); 144 return ERR_PTR(-ENOMEM);
145 } 145 }
146 if (bitmap_uptodate(bh)) 146 if (bitmap_uptodate(bh))
@@ -1086,7 +1086,7 @@ got:
1086 /* This is the optimal IO size (for stat), not the fs block size */ 1086 /* This is the optimal IO size (for stat), not the fs block size */
1087 inode->i_blocks = 0; 1087 inode->i_blocks = 0;
1088 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); 1088 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
1089 ei->i_crtime = timespec64_to_timespec(inode->i_mtime); 1089 ei->i_crtime = inode->i_mtime;
1090 1090
1091 memset(ei->i_data, 0, sizeof(ei->i_data)); 1091 memset(ei->i_data, 0, sizeof(ei->i_data));
1092 ei->i_dir_start_lookup = 0; 1092 ei->i_dir_start_lookup = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4efe77286ecd..8f6ad7667974 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -317,7 +317,7 @@ stop_handle:
317 * (Well, we could do this if we need to, but heck - it works) 317 * (Well, we could do this if we need to, but heck - it works)
318 */ 318 */
319 ext4_orphan_del(handle, inode); 319 ext4_orphan_del(handle, inode);
320 EXT4_I(inode)->i_dtime = get_seconds(); 320 EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();
321 321
322 /* 322 /*
323 * One subtle ordering requirement: if anything has gone wrong 323 * One subtle ordering requirement: if anything has gone wrong
@@ -4191,6 +4191,39 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
4191 return 0; 4191 return 0;
4192} 4192}
4193 4193
4194static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock)
4195{
4196 *did_unlock = true;
4197 up_write(&ei->i_mmap_sem);
4198 schedule();
4199 down_write(&ei->i_mmap_sem);
4200}
4201
4202int ext4_break_layouts(struct inode *inode)
4203{
4204 struct ext4_inode_info *ei = EXT4_I(inode);
4205 struct page *page;
4206 bool retry;
4207 int error;
4208
4209 if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
4210 return -EINVAL;
4211
4212 do {
4213 retry = false;
4214 page = dax_layout_busy_page(inode->i_mapping);
4215 if (!page)
4216 return 0;
4217
4218 error = ___wait_var_event(&page->_refcount,
4219 atomic_read(&page->_refcount) == 1,
4220 TASK_INTERRUPTIBLE, 0, 0,
4221 ext4_wait_dax_page(ei, &retry));
4222 } while (error == 0 && retry);
4223
4224 return error;
4225}
4226
4194/* 4227/*
4195 * ext4_punch_hole: punches a hole in a file by releasing the blocks 4228 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4196 * associated with the given offset and length 4229 * associated with the given offset and length
@@ -4264,6 +4297,11 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
4264 * page cache. 4297 * page cache.
4265 */ 4298 */
4266 down_write(&EXT4_I(inode)->i_mmap_sem); 4299 down_write(&EXT4_I(inode)->i_mmap_sem);
4300
4301 ret = ext4_break_layouts(inode);
4302 if (ret)
4303 goto out_dio;
4304
4267 first_block_offset = round_up(offset, sb->s_blocksize); 4305 first_block_offset = round_up(offset, sb->s_blocksize);
4268 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 4306 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
4269 4307
@@ -4944,17 +4982,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4944 ret = -EFSCORRUPTED; 4982 ret = -EFSCORRUPTED;
4945 goto bad_inode; 4983 goto bad_inode;
4946 } else if (!ext4_has_inline_data(inode)) { 4984 } else if (!ext4_has_inline_data(inode)) {
4947 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4985 /* validate the block references in the inode */
4948 if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4986 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4949 (S_ISLNK(inode->i_mode) && 4987 (S_ISLNK(inode->i_mode) &&
4950 !ext4_inode_is_fast_symlink(inode)))) 4988 !ext4_inode_is_fast_symlink(inode))) {
4951 /* Validate extent which is part of inode */ 4989 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4952 ret = ext4_ext_check_inode(inode); 4990 ret = ext4_ext_check_inode(inode);
4953 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4991 else
4954 (S_ISLNK(inode->i_mode) && 4992 ret = ext4_ind_check_inode(inode);
4955 !ext4_inode_is_fast_symlink(inode))) {
4956 /* Validate block references which are part of inode */
4957 ret = ext4_ind_check_inode(inode);
4958 } 4993 }
4959 } 4994 }
4960 if (ret) 4995 if (ret)
@@ -5553,6 +5588,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5553 ext4_wait_for_tail_page_commit(inode); 5588 ext4_wait_for_tail_page_commit(inode);
5554 } 5589 }
5555 down_write(&EXT4_I(inode)->i_mmap_sem); 5590 down_write(&EXT4_I(inode)->i_mmap_sem);
5591
5592 rc = ext4_break_layouts(inode);
5593 if (rc) {
5594 up_write(&EXT4_I(inode)->i_mmap_sem);
5595 error = rc;
5596 goto err_out;
5597 }
5598
5556 /* 5599 /*
5557 * Truncate pagecache after we've waited for commit 5600 * Truncate pagecache after we've waited for commit
5558 * in data=journal mode to make pages freeable. 5601 * in data=journal mode to make pages freeable.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f7ab34088162..e29fce2fbf25 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -14,6 +14,7 @@
14#include <linux/log2.h> 14#include <linux/log2.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/nospec.h>
17#include <linux/backing-dev.h> 18#include <linux/backing-dev.h>
18#include <trace/events/ext4.h> 19#include <trace/events/ext4.h>
19 20
@@ -2140,7 +2141,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2140 * This should tell if fe_len is exactly power of 2 2141 * This should tell if fe_len is exactly power of 2
2141 */ 2142 */
2142 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2143 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2143 ac->ac_2order = i - 1; 2144 ac->ac_2order = array_index_nospec(i - 1,
2145 sb->s_blocksize_bits + 2);
2144 } 2146 }
2145 2147
2146 /* if stream allocation is enabled, use global goal */ 2148 /* if stream allocation is enabled, use global goal */
@@ -3799,7 +3801,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3799 ext4_group_t group; 3801 ext4_group_t group;
3800 ext4_grpblk_t bit; 3802 ext4_grpblk_t bit;
3801 unsigned long long grp_blk_start; 3803 unsigned long long grp_blk_start;
3802 int err = 0;
3803 int free = 0; 3804 int free = 0;
3804 3805
3805 BUG_ON(pa->pa_deleted == 0); 3806 BUG_ON(pa->pa_deleted == 0);
@@ -3840,7 +3841,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3840 } 3841 }
3841 atomic_add(free, &sbi->s_mb_discarded); 3842 atomic_add(free, &sbi->s_mb_discarded);
3842 3843
3843 return err; 3844 return 0;
3844} 3845}
3845 3846
3846static noinline_for_stack int 3847static noinline_for_stack int
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 638ad4743477..39b07c2d3384 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -147,7 +147,7 @@ static int kmmpd(void *data)
147 147
148 mmp_block = le64_to_cpu(es->s_mmp_block); 148 mmp_block = le64_to_cpu(es->s_mmp_block);
149 mmp = (struct mmp_struct *)(bh->b_data); 149 mmp = (struct mmp_struct *)(bh->b_data);
150 mmp->mmp_time = cpu_to_le64(get_seconds()); 150 mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
151 /* 151 /*
152 * Start with the higher mmp_check_interval and reduce it if 152 * Start with the higher mmp_check_interval and reduce it if
153 * the MMP block is being updated on time. 153 * the MMP block is being updated on time.
@@ -165,7 +165,7 @@ static int kmmpd(void *data)
165 seq = 1; 165 seq = 1;
166 166
167 mmp->mmp_seq = cpu_to_le32(seq); 167 mmp->mmp_seq = cpu_to_le32(seq);
168 mmp->mmp_time = cpu_to_le64(get_seconds()); 168 mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
169 last_update_time = jiffies; 169 last_update_time = jiffies;
170 170
171 retval = write_mmp_block(sb, bh); 171 retval = write_mmp_block(sb, bh);
@@ -241,7 +241,7 @@ static int kmmpd(void *data)
241 * Unmount seems to be clean. 241 * Unmount seems to be clean.
242 */ 242 */
243 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); 243 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
244 mmp->mmp_time = cpu_to_le64(get_seconds()); 244 mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
245 245
246 retval = write_mmp_block(sb, bh); 246 retval = write_mmp_block(sb, bh);
247 247
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 8e17efdcbf11..a409ff70d67b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -134,9 +134,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
134 mapping[0] = inode1->i_mapping; 134 mapping[0] = inode1->i_mapping;
135 mapping[1] = inode2->i_mapping; 135 mapping[1] = inode2->i_mapping;
136 } else { 136 } else {
137 pgoff_t tmp = index1; 137 swap(index1, index2);
138 index1 = index2;
139 index2 = tmp;
140 mapping[0] = inode2->i_mapping; 138 mapping[0] = inode2->i_mapping;
141 mapping[1] = inode1->i_mapping; 139 mapping[1] = inode1->i_mapping;
142 } 140 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2a4c25c4681d..116ff68c5bd4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1398,6 +1398,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1398 goto cleanup_and_exit; 1398 goto cleanup_and_exit;
1399 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1399 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1400 "falling back\n")); 1400 "falling back\n"));
1401 ret = NULL;
1401 } 1402 }
1402 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); 1403 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
1403 if (!nblocks) { 1404 if (!nblocks) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b7f7922061be..f7750bc5b85a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -312,6 +312,24 @@ void ext4_itable_unused_set(struct super_block *sb,
312 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 312 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
313} 313}
314 314
315static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
316{
317 time64_t now = ktime_get_real_seconds();
318
319 now = clamp_val(now, 0, (1ull << 40) - 1);
320
321 *lo = cpu_to_le32(lower_32_bits(now));
322 *hi = upper_32_bits(now);
323}
324
325static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
326{
327 return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
328}
329#define ext4_update_tstamp(es, tstamp) \
330 __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
331#define ext4_get_tstamp(es, tstamp) \
332 __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
315 333
316static void __save_error_info(struct super_block *sb, const char *func, 334static void __save_error_info(struct super_block *sb, const char *func,
317 unsigned int line) 335 unsigned int line)
@@ -322,11 +340,12 @@ static void __save_error_info(struct super_block *sb, const char *func,
322 if (bdev_read_only(sb->s_bdev)) 340 if (bdev_read_only(sb->s_bdev))
323 return; 341 return;
324 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 342 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
325 es->s_last_error_time = cpu_to_le32(get_seconds()); 343 ext4_update_tstamp(es, s_last_error_time);
326 strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); 344 strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
327 es->s_last_error_line = cpu_to_le32(line); 345 es->s_last_error_line = cpu_to_le32(line);
328 if (!es->s_first_error_time) { 346 if (!es->s_first_error_time) {
329 es->s_first_error_time = es->s_last_error_time; 347 es->s_first_error_time = es->s_last_error_time;
348 es->s_first_error_time_hi = es->s_last_error_time_hi;
330 strncpy(es->s_first_error_func, func, 349 strncpy(es->s_first_error_func, func,
331 sizeof(es->s_first_error_func)); 350 sizeof(es->s_first_error_func));
332 es->s_first_error_line = cpu_to_le32(line); 351 es->s_first_error_line = cpu_to_le32(line);
@@ -776,26 +795,26 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
776 struct ext4_sb_info *sbi = EXT4_SB(sb); 795 struct ext4_sb_info *sbi = EXT4_SB(sb);
777 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 796 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
778 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 797 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
798 int ret;
779 799
780 if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && 800 if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
781 !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { 801 ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
782 percpu_counter_sub(&sbi->s_freeclusters_counter, 802 &grp->bb_state);
783 grp->bb_free); 803 if (!ret)
784 set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, 804 percpu_counter_sub(&sbi->s_freeclusters_counter,
785 &grp->bb_state); 805 grp->bb_free);
786 } 806 }
787 807
788 if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && 808 if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
789 !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { 809 ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
790 if (gdp) { 810 &grp->bb_state);
811 if (!ret && gdp) {
791 int count; 812 int count;
792 813
793 count = ext4_free_inodes_count(sb, gdp); 814 count = ext4_free_inodes_count(sb, gdp);
794 percpu_counter_sub(&sbi->s_freeinodes_counter, 815 percpu_counter_sub(&sbi->s_freeinodes_counter,
795 count); 816 count);
796 } 817 }
797 set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
798 &grp->bb_state);
799 } 818 }
800} 819}
801 820
@@ -2174,8 +2193,8 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
2174 "warning: maximal mount count reached, " 2193 "warning: maximal mount count reached, "
2175 "running e2fsck is recommended"); 2194 "running e2fsck is recommended");
2176 else if (le32_to_cpu(es->s_checkinterval) && 2195 else if (le32_to_cpu(es->s_checkinterval) &&
2177 (le32_to_cpu(es->s_lastcheck) + 2196 (ext4_get_tstamp(es, s_lastcheck) +
2178 le32_to_cpu(es->s_checkinterval) <= get_seconds())) 2197 le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
2179 ext4_msg(sb, KERN_WARNING, 2198 ext4_msg(sb, KERN_WARNING,
2180 "warning: checktime reached, " 2199 "warning: checktime reached, "
2181 "running e2fsck is recommended"); 2200 "running e2fsck is recommended");
@@ -2184,7 +2203,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
2184 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 2203 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
2185 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 2204 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
2186 le16_add_cpu(&es->s_mnt_count, 1); 2205 le16_add_cpu(&es->s_mnt_count, 1);
2187 es->s_mtime = cpu_to_le32(get_seconds()); 2206 ext4_update_tstamp(es, s_mtime);
2188 ext4_update_dynamic_rev(sb); 2207 ext4_update_dynamic_rev(sb);
2189 if (sbi->s_journal) 2208 if (sbi->s_journal)
2190 ext4_set_feature_journal_needs_recovery(sb); 2209 ext4_set_feature_journal_needs_recovery(sb);
@@ -2875,8 +2894,9 @@ static void print_daily_error_info(struct timer_list *t)
2875 ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", 2894 ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
2876 le32_to_cpu(es->s_error_count)); 2895 le32_to_cpu(es->s_error_count));
2877 if (es->s_first_error_time) { 2896 if (es->s_first_error_time) {
2878 printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", 2897 printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
2879 sb->s_id, le32_to_cpu(es->s_first_error_time), 2898 sb->s_id,
2899 ext4_get_tstamp(es, s_first_error_time),
2880 (int) sizeof(es->s_first_error_func), 2900 (int) sizeof(es->s_first_error_func),
2881 es->s_first_error_func, 2901 es->s_first_error_func,
2882 le32_to_cpu(es->s_first_error_line)); 2902 le32_to_cpu(es->s_first_error_line));
@@ -2889,8 +2909,9 @@ static void print_daily_error_info(struct timer_list *t)
2889 printk(KERN_CONT "\n"); 2909 printk(KERN_CONT "\n");
2890 } 2910 }
2891 if (es->s_last_error_time) { 2911 if (es->s_last_error_time) {
2892 printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", 2912 printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
2893 sb->s_id, le32_to_cpu(es->s_last_error_time), 2913 sb->s_id,
2914 ext4_get_tstamp(es, s_last_error_time),
2894 (int) sizeof(es->s_last_error_func), 2915 (int) sizeof(es->s_last_error_func),
2895 es->s_last_error_func, 2916 es->s_last_error_func,
2896 le32_to_cpu(es->s_last_error_line)); 2917 le32_to_cpu(es->s_last_error_line));
@@ -4813,7 +4834,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4813 * to complain and force a full file system check. 4834 * to complain and force a full file system check.
4814 */ 4835 */
4815 if (!(sb->s_flags & SB_RDONLY)) 4836 if (!(sb->s_flags & SB_RDONLY))
4816 es->s_wtime = cpu_to_le32(get_seconds()); 4837 ext4_update_tstamp(es, s_wtime);
4817 if (sb->s_bdev->bd_part) 4838 if (sb->s_bdev->bd_part)
4818 es->s_kbytes_written = 4839 es->s_kbytes_written =
4819 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 4840 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
@@ -5080,6 +5101,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
5080#endif 5101#endif
5081 char *orig_data = kstrdup(data, GFP_KERNEL); 5102 char *orig_data = kstrdup(data, GFP_KERNEL);
5082 5103
5104 if (data && !orig_data)
5105 return -ENOMEM;
5106
5083 /* Store the original options */ 5107 /* Store the original options */
5084 old_sb_flags = sb->s_flags; 5108 old_sb_flags = sb->s_flags;
5085 old_opts.s_mount_opt = sbi->s_mount_opt; 5109 old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -5665,13 +5689,13 @@ static int ext4_enable_quotas(struct super_block *sb)
5665 DQUOT_USAGE_ENABLED | 5689 DQUOT_USAGE_ENABLED |
5666 (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); 5690 (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
5667 if (err) { 5691 if (err) {
5668 for (type--; type >= 0; type--)
5669 dquot_quota_off(sb, type);
5670
5671 ext4_warning(sb, 5692 ext4_warning(sb,
5672 "Failed to enable quota tracking " 5693 "Failed to enable quota tracking "
5673 "(type=%d, err=%d). Please run " 5694 "(type=%d, err=%d). Please run "
5674 "e2fsck to fix.", type, err); 5695 "e2fsck to fix.", type, err);
5696 for (type--; type >= 0; type--)
5697 dquot_quota_off(sb, type);
5698
5675 return err; 5699 return err;
5676 } 5700 }
5677 } 5701 }
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index f34da0bb8f17..e60cc5e89023 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -25,6 +25,8 @@ typedef enum {
25 attr_reserved_clusters, 25 attr_reserved_clusters,
26 attr_inode_readahead, 26 attr_inode_readahead,
27 attr_trigger_test_error, 27 attr_trigger_test_error,
28 attr_first_error_time,
29 attr_last_error_time,
28 attr_feature, 30 attr_feature,
29 attr_pointer_ui, 31 attr_pointer_ui,
30 attr_pointer_atomic, 32 attr_pointer_atomic,
@@ -182,8 +184,8 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
182EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); 184EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
183EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 185EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
184EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); 186EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
185EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); 187EXT4_ATTR(first_error_time, 0444, first_error_time);
186EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); 188EXT4_ATTR(last_error_time, 0444, last_error_time);
187 189
188static unsigned int old_bump_val = 128; 190static unsigned int old_bump_val = 128;
189EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); 191EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -249,6 +251,15 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
249 return NULL; 251 return NULL;
250} 252}
251 253
254static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
255{
256 return snprintf(buf, PAGE_SIZE, "%lld",
257 ((time64_t)hi << 32) + le32_to_cpu(lo));
258}
259
260#define print_tstamp(buf, es, tstamp) \
261 __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
262
252static ssize_t ext4_attr_show(struct kobject *kobj, 263static ssize_t ext4_attr_show(struct kobject *kobj,
253 struct attribute *attr, char *buf) 264 struct attribute *attr, char *buf)
254{ 265{
@@ -274,8 +285,12 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
274 case attr_pointer_ui: 285 case attr_pointer_ui:
275 if (!ptr) 286 if (!ptr)
276 return 0; 287 return 0;
277 return snprintf(buf, PAGE_SIZE, "%u\n", 288 if (a->attr_ptr == ptr_ext4_super_block_offset)
278 *((unsigned int *) ptr)); 289 return snprintf(buf, PAGE_SIZE, "%u\n",
290 le32_to_cpup(ptr));
291 else
292 return snprintf(buf, PAGE_SIZE, "%u\n",
293 *((unsigned int *) ptr));
279 case attr_pointer_atomic: 294 case attr_pointer_atomic:
280 if (!ptr) 295 if (!ptr)
281 return 0; 296 return 0;
@@ -283,6 +298,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
283 atomic_read((atomic_t *) ptr)); 298 atomic_read((atomic_t *) ptr));
284 case attr_feature: 299 case attr_feature:
285 return snprintf(buf, PAGE_SIZE, "supported\n"); 300 return snprintf(buf, PAGE_SIZE, "supported\n");
301 case attr_first_error_time:
302 return print_tstamp(buf, sbi->s_es, s_first_error_time);
303 case attr_last_error_time:
304 return print_tstamp(buf, sbi->s_es, s_last_error_time);
286 } 305 }
287 306
288 return 0; 307 return 0;
@@ -308,7 +327,10 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
308 ret = kstrtoul(skip_spaces(buf), 0, &t); 327 ret = kstrtoul(skip_spaces(buf), 0, &t);
309 if (ret) 328 if (ret)
310 return ret; 329 return ret;
311 *((unsigned int *) ptr) = t; 330 if (a->attr_ptr == ptr_ext4_super_block_offset)
331 *((__le32 *) ptr) = cpu_to_le32(t);
332 else
333 *((unsigned int *) ptr) = t;
312 return len; 334 return len;
313 case attr_inode_readahead: 335 case attr_inode_readahead:
314 return inode_readahead_blks_store(sbi, buf, len); 336 return inode_readahead_blks_store(sbi, buf, len);
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 0cb13badf473..bcbe3668c1d4 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -11,6 +11,10 @@
11 */ 11 */
12static inline void ext4_truncate_failed_write(struct inode *inode) 12static inline void ext4_truncate_failed_write(struct inode *inode)
13{ 13{
14 /*
15 * We don't need to call ext4_break_layouts() because the blocks we
16 * are truncating were never visible to userspace.
17 */
14 down_write(&EXT4_I(inode)->i_mmap_sem); 18 down_write(&EXT4_I(inode)->i_mmap_sem);
15 truncate_inode_pages(inode->i_mapping, inode->i_size); 19 truncate_inode_pages(inode->i_mapping, inode->i_size);
16 ext4_truncate(inode); 20 ext4_truncate(inode);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 723df14f4084..f36fc5d5b257 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -190,6 +190,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
190 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); 190 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
191 if ((void *)next >= end) 191 if ((void *)next >= end)
192 return -EFSCORRUPTED; 192 return -EFSCORRUPTED;
193 if (strnlen(e->e_name, e->e_name_len) != e->e_name_len)
194 return -EFSCORRUPTED;
193 e = next; 195 e = next;
194 } 196 }
195 197
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8de0e7723316..150cc030b4d7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -121,7 +121,7 @@ static int journal_submit_commit_record(journal_t *journal,
121 struct commit_header *tmp; 121 struct commit_header *tmp;
122 struct buffer_head *bh; 122 struct buffer_head *bh;
123 int ret; 123 int ret;
124 struct timespec64 now = current_kernel_time64(); 124 struct timespec64 now;
125 125
126 *cbh = NULL; 126 *cbh = NULL;
127 127
@@ -134,6 +134,7 @@ static int journal_submit_commit_record(journal_t *journal,
134 return 1; 134 return 1;
135 135
136 tmp = (struct commit_header *)bh->b_data; 136 tmp = (struct commit_header *)bh->b_data;
137 ktime_get_coarse_real_ts64(&now);
137 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 138 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
138 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 139 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
139 140