diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-24 12:42:24 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-24 12:42:24 -0400 |
| commit | 5993692f09582accb4cb7af11d344598af43c3b8 (patch) | |
| tree | 062447eb44769d6da6e50302853eac1bb1d6e5d3 | |
| parent | d6edff78fe9e34dbea1bec7dc26cfce92c6d96d5 (diff) | |
| parent | 33458eaba4dfe778a426df6a19b7aad2ff9f7eec (diff) | |
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
- further restructure ext4 documentation
- fix up ext4's delayed allocation for bigalloc file systems
- fix up some syzbot-detected races in EXT4_IOC_MOVE_EXT,
EXT4_IOC_SWAP_BOOT, and ext4_remount
- ... and a few other miscellaneous bugs and optimizations.
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits)
ext4: fix use-after-free race in ext4_remount()'s error path
ext4: cache NULL when both default_acl and acl are NULL
docs: promote the ext4 data structures book to top level
docs: move ext4 administrative docs to admin-guide/
jbd2: fix use after free in jbd2_log_do_checkpoint()
ext4: propagate error from dquot_initialize() in EXT4_IOC_FSSETXATTR
ext4: fix setattr project check in fssetxattr ioctl
docs: make ext4 readme tables readable
docs: fix ext4 documentation table formatting problems
docs: generate a separate ext4 pdf file from the documentation
ext4: convert fault handler to use vm_fault_t type
ext4: initialize retries variable in ext4_da_write_inline_data_begin()
ext4: fix EXT4_IOC_SWAP_BOOT
ext4: fix build error when DX_DEBUG is defined
ext4: fix argument checking in EXT4_IOC_MOVE_EXT
ext4: fix reserved cluster accounting at page invalidation time
ext4: adjust reserved cluster count when removing extents
ext4: reduce reserved cluster count by number of allocated clusters
ext4: fix reserved cluster accounting at delayed write time
ext4: add new pending reservation mechanism
...
| -rw-r--r-- | Documentation/admin-guide/ext4.rst | 574 | ||||
| -rw-r--r-- | Documentation/admin-guide/index.rst | 1 | ||||
| -rw-r--r-- | Documentation/conf.py | 4 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/about.rst (renamed from Documentation/filesystems/ext4/ondisk/about.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/allocators.rst (renamed from Documentation/filesystems/ext4/ondisk/allocators.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/attributes.rst (renamed from Documentation/filesystems/ext4/ondisk/attributes.rst) | 8 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/bigalloc.rst (renamed from Documentation/filesystems/ext4/ondisk/bigalloc.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/bitmaps.rst (renamed from Documentation/filesystems/ext4/ondisk/bitmaps.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/blockgroup.rst (renamed from Documentation/filesystems/ext4/ondisk/blockgroup.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/blockmap.rst (renamed from Documentation/filesystems/ext4/ondisk/blockmap.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/blocks.rst (renamed from Documentation/filesystems/ext4/ondisk/blocks.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/checksums.rst (renamed from Documentation/filesystems/ext4/ondisk/checksums.rst) | 2 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/directory.rst (renamed from Documentation/filesystems/ext4/ondisk/directory.rst) | 18 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/dynamic.rst (renamed from Documentation/filesystems/ext4/ondisk/dynamic.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/eainode.rst (renamed from Documentation/filesystems/ext4/ondisk/eainode.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/ext4.rst | 613 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/globals.rst (renamed from Documentation/filesystems/ext4/ondisk/globals.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/group_descr.rst (renamed from Documentation/filesystems/ext4/ondisk/group_descr.rst) | 4 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/ifork.rst (renamed from Documentation/filesystems/ext4/ondisk/ifork.rst) | 8 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/index.rst | 19 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/inlinedata.rst (renamed from Documentation/filesystems/ext4/ondisk/inlinedata.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/inodes.rst) | 19 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/journal.rst (renamed from Documentation/filesystems/ext4/ondisk/journal.rst) | 32 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/mmp.rst (renamed from Documentation/filesystems/ext4/ondisk/mmp.rst) | 2 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/ondisk/index.rst | 9 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/overview.rst (renamed from Documentation/filesystems/ext4/ondisk/overview.rst) | 0 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/special_inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/special_inodes.rst) | 2 | ||||
| -rw-r--r-- | Documentation/filesystems/ext4/super.rst (renamed from Documentation/filesystems/ext4/ondisk/super.rst) | 24 | ||||
| -rw-r--r-- | fs/ext4/acl.c | 4 | ||||
| -rw-r--r-- | fs/ext4/ext4.h | 17 | ||||
| -rw-r--r-- | fs/ext4/ext4_extents.h | 13 | ||||
| -rw-r--r-- | fs/ext4/extents.c | 595 | ||||
| -rw-r--r-- | fs/ext4/extents_status.c | 654 | ||||
| -rw-r--r-- | fs/ext4/extents_status.h | 80 | ||||
| -rw-r--r-- | fs/ext4/inline.c | 2 | ||||
| -rw-r--r-- | fs/ext4/inode.c | 142 | ||||
| -rw-r--r-- | fs/ext4/ioctl.c | 97 | ||||
| -rw-r--r-- | fs/ext4/mballoc.c | 14 | ||||
| -rw-r--r-- | fs/ext4/move_extent.c | 8 | ||||
| -rw-r--r-- | fs/ext4/namei.c | 2 | ||||
| -rw-r--r-- | fs/ext4/super.c | 81 | ||||
| -rw-r--r-- | fs/jbd2/checkpoint.c | 4 | ||||
| -rw-r--r-- | include/linux/buffer_head.h | 2 | ||||
| -rw-r--r-- | include/trace/events/ext4.h | 99 |
44 files changed, 1984 insertions, 1169 deletions
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst new file mode 100644 index 000000000000..e506d3dae510 --- /dev/null +++ b/Documentation/admin-guide/ext4.rst | |||
| @@ -0,0 +1,574 @@ | |||
| 1 | .. SPDX-License-Identifier: GPL-2.0 | ||
| 2 | |||
| 3 | ======================== | ||
| 4 | ext4 General Information | ||
| 5 | ======================== | ||
| 6 | |||
| 7 | Ext4 is an advanced level of the ext3 filesystem which incorporates | ||
| 8 | scalability and reliability enhancements for supporting large filesystems | ||
| 9 | (64 bit) in keeping with increasing disk capacities and state-of-the-art | ||
| 10 | feature requirements. | ||
| 11 | |||
| 12 | Mailing list: linux-ext4@vger.kernel.org | ||
| 13 | Web site: http://ext4.wiki.kernel.org | ||
| 14 | |||
| 15 | |||
| 16 | Quick usage instructions | ||
| 17 | ======================== | ||
| 18 | |||
| 19 | Note: More extensive information for getting started with ext4 can be | ||
| 20 | found at the ext4 wiki site at the URL: | ||
| 21 | http://ext4.wiki.kernel.org/index.php/Ext4_Howto | ||
| 22 | |||
| 23 | - The latest version of e2fsprogs can be found at: | ||
| 24 | |||
| 25 | https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | ||
| 26 | |||
| 27 | or | ||
| 28 | |||
| 29 | http://sourceforge.net/project/showfiles.php?group_id=2406 | ||
| 30 | |||
| 31 | or grab the latest git repository from: | ||
| 32 | |||
| 33 | https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git | ||
| 34 | |||
| 35 | - Create a new filesystem using the ext4 filesystem type: | ||
| 36 | |||
| 37 | # mke2fs -t ext4 /dev/hda1 | ||
| 38 | |||
| 39 | Or to configure an existing ext3 filesystem to support extents: | ||
| 40 | |||
| 41 | # tune2fs -O extents /dev/hda1 | ||
| 42 | |||
| 43 | If the filesystem was created with 128 byte inodes, it can be | ||
| 44 | converted to use 256 byte inodes for greater efficiency via: | ||
| 45 | |||
| 46 | # tune2fs -I 256 /dev/hda1 | ||
| 47 | |||
| 48 | - Mounting: | ||
| 49 | |||
| 50 | # mount -t ext4 /dev/hda1 /wherever | ||
| 51 | |||
| 52 | - When comparing performance with other filesystems, it's always | ||
| 53 | important to try multiple workloads; very often a subtle change in a | ||
| 54 | workload parameter can completely change the ranking of which | ||
| 55 | filesystems do well compared to others. When comparing versus ext3, | ||
| 56 | note that ext4 enables write barriers by default, while ext3 does | ||
| 57 | not enable write barriers by default. So it is useful to | ||
| 58 | explicitly specify whether barriers are enabled or not via the | ||
| 59 | '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems | ||
| 60 | for a fair comparison. When tuning ext3 for best benchmark numbers, | ||
| 61 | it is often worthwhile to try changing the data journaling mode; '-o | ||
| 62 | data=writeback' can be faster for some workloads. (Note however that | ||
| 63 | running mounted with data=writeback can potentially leave stale data | ||
| 64 | exposed in recently written files in case of an unclean shutdown, | ||
| 65 | which could be a security exposure in some situations.) Configuring | ||
| 66 | the filesystem with a large journal can also be helpful for | ||
| 67 | metadata-intensive workloads. | ||
| 68 | |||
| 69 | Features | ||
| 70 | ======== | ||
| 71 | |||
| 72 | Currently Available | ||
| 73 | ------------------- | ||
| 74 | |||
| 75 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) | ||
| 76 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | ||
| 77 | * extent format more robust in face of on-disk corruption due to magics, | ||
| 78 | internal redundancy in tree | ||
| 79 | * improved file allocation (multi-block alloc) | ||
| 80 | * lift 32000 subdirectory limit imposed by i_links_count[1] | ||
| 81 | * nsec timestamps for mtime, atime, ctime, create time | ||
| 82 | * inode version field on disk (NFSv4, Lustre) | ||
| 83 | * reduced e2fsck time via uninit_bg feature | ||
| 84 | * journal checksumming for robustness, performance | ||
| 85 | * persistent file preallocation (e.g. for streaming media, databases) | ||
| 86 | * ability to pack bitmaps and inode tables into larger virtual groups via the | ||
| 87 | flex_bg feature | ||
| 88 | * large file support | ||
| 89 | * inode allocation using large virtual block groups via flex_bg | ||
| 90 | * delayed allocation | ||
| 91 | * large block (up to pagesize) support | ||
| 92 | * efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force | ||
| 93 | the ordering) | ||
| 94 | |||
| 95 | [1] Filesystems with a block size of 1k may see a limit imposed by the | ||
| 96 | directory hash tree having a maximum depth of two. | ||
| 97 | |||
| 98 | Options | ||
| 99 | ======= | ||
| 100 | |||
| 101 | When mounting an ext4 filesystem, the following options are accepted: | ||
| 102 | (*) == default | ||
| 103 | |||
| 104 | ro | ||
| 105 | Mount filesystem read only. Note that ext4 will replay the journal (and | ||
| 106 | thus write to the partition) even when mounted "read only". The mount | ||
| 107 | options "ro,noload" can be used to prevent writes to the filesystem. | ||
| 108 | |||
| 109 | journal_checksum | ||
| 110 | Enable checksumming of the journal transactions. This will allow the | ||
| 111 | recovery code in e2fsck and the kernel to detect corruption in the | ||
| 112 | journal. It is a compatible change and will be ignored by older | ||
| 113 | kernels. | ||
| 114 | |||
| 115 | journal_async_commit | ||
| 116 | Commit block can be written to disk without waiting for descriptor | ||
| 117 | blocks. If enabled older kernels cannot mount the device. This will | ||
| 118 | enable 'journal_checksum' internally. | ||
| 119 | |||
| 120 | journal_path=path, journal_dev=devnum | ||
| 121 | When the external journal device's major/minor numbers have changed, | ||
| 122 | these options allow the user to specify the new journal location. The | ||
| 123 | journal device is identified through either its new major/minor numbers | ||
| 124 | encoded in devnum, or via a path to the device. | ||
| 125 | |||
| 126 | norecovery, noload | ||
| 127 | Don't load the journal on mounting. Note that if the filesystem was | ||
| 128 | not unmounted cleanly, skipping the journal replay will lead to the | ||
| 129 | filesystem containing inconsistencies that can lead to any number of | ||
| 130 | problems. | ||
| 131 | |||
| 132 | data=journal | ||
| 133 | All data are committed into the journal prior to being written into the | ||
| 134 | main file system. Enabling this mode will disable delayed allocation | ||
| 135 | and O_DIRECT support. | ||
| 136 | |||
| 137 | data=ordered (*) | ||
| 138 | All data are forced directly out to the main file system prior to its | ||
| 139 | metadata being committed to the journal. | ||
| 140 | |||
| 141 | data=writeback | ||
| 142 | Data ordering is not preserved, data may be written into the main file | ||
| 143 | system after its metadata has been committed to the journal. | ||
| 144 | |||
| 145 | commit=nrsec (*) | ||
| 146 | Ext4 can be told to sync all its data and metadata every 'nrsec' | ||
| 147 | seconds. The default value is 5 seconds. This means that if you lose | ||
| 148 | your power, you will lose as much as the latest 5 seconds of work (your | ||
| 149 | filesystem will not be damaged though, thanks to the journaling). This | ||
| 150 | default value (or any low value) will hurt performance, but it's good | ||
| 151 | for data-safety. Setting it to 0 will have the same effect as leaving | ||
| 152 | it at the default (5 seconds). Setting it to very large values will | ||
| 153 | improve performance. | ||
| 154 | |||
| 155 | barrier=<0|1(*)>, barrier(*), nobarrier | ||
| 156 | This enables/disables the use of write barriers in the jbd code. | ||
| 157 | barrier=0 disables, barrier=1 enables. This also requires an IO stack | ||
| 158 | which can support barriers, and if jbd gets an error on a barrier | ||
| 159 | write, it will disable again with a warning. Write barriers enforce | ||
| 160 | proper on-disk ordering of journal commits, making volatile disk write | ||
| 161 | caches safe to use, at some performance penalty. If your disks are | ||
| 162 | battery-backed in one way or another, disabling barriers may safely | ||
| 163 | improve performance. The mount options "barrier" and "nobarrier" can | ||
| 164 | also be used to enable or disable barriers, for consistency with other | ||
| 165 | ext4 mount options. | ||
| 166 | |||
| 167 | inode_readahead_blks=n | ||
| 168 | This tuning parameter controls the maximum number of inode table blocks | ||
| 169 | that ext4's inode table readahead algorithm will pre-read into the | ||
| 170 | buffer cache. The default value is 32 blocks. | ||
| 171 | |||
| 172 | nouser_xattr | ||
| 173 | Disables Extended User Attributes. See the attr(5) manual page for | ||
| 174 | more information about extended attributes. | ||
| 175 | |||
| 176 | noacl | ||
| 177 | This option disables POSIX Access Control List support. If ACL support | ||
| 178 | is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL | ||
| 179 | is enabled by default on mount. See the acl(5) manual page for more | ||
| 180 | information about acl. | ||
| 181 | |||
| 182 | bsddf (*) | ||
| 183 | Make 'df' act like BSD. | ||
| 184 | |||
| 185 | minixdf | ||
| 186 | Make 'df' act like Minix. | ||
| 187 | |||
| 188 | debug | ||
| 189 | Extra debugging information is sent to syslog. | ||
| 190 | |||
| 191 | abort | ||
| 192 | Simulate the effects of calling ext4_abort() for debugging purposes. | ||
| 193 | This is normally used while remounting a filesystem which is already | ||
| 194 | mounted. | ||
| 195 | |||
| 196 | errors=remount-ro | ||
| 197 | Remount the filesystem read-only on an error. | ||
| 198 | |||
| 199 | errors=continue | ||
| 200 | Keep going on a filesystem error. | ||
| 201 | |||
| 202 | errors=panic | ||
| 203 | Panic and halt the machine if an error occurs. (These mount options | ||
| 204 | override the errors behavior specified in the superblock, which can be | ||
| 205 | configured using tune2fs) | ||
| 206 | |||
| 207 | data_err=ignore(*) | ||
| 208 | Just print an error message if an error occurs in a file data buffer in | ||
| 209 | ordered mode. | ||
| 210 | data_err=abort | ||
| 211 | Abort the journal if an error occurs in a file data buffer in ordered | ||
| 212 | mode. | ||
| 213 | |||
| 214 | grpid | bsdgroups | ||
| 215 | New objects have the group ID of their parent. | ||
| 216 | |||
| 217 | nogrpid (*) | sysvgroups | ||
| 218 | New objects have the group ID of their creator. | ||
| 219 | |||
| 220 | resgid=n | ||
| 221 | The group ID which may use the reserved blocks. | ||
| 222 | |||
| 223 | resuid=n | ||
| 224 | The user ID which may use the reserved blocks. | ||
| 225 | |||
| 226 | sb= | ||
| 227 | Use alternate superblock at this location. | ||
| 228 | |||
| 229 | quota, noquota, grpquota, usrquota | ||
| 230 | These options are ignored by the filesystem. They are used only by | ||
| 231 | quota tools to recognize volumes where quota should be turned on. See | ||
| 232 | documentation in the quota-tools package for more details | ||
| 233 | (http://sourceforge.net/projects/linuxquota). | ||
| 234 | |||
| 235 | jqfmt=<quota type>, usrjquota=<file>, grpjquota=<file> | ||
| 236 | These options tell filesystem details about quota so that quota | ||
| 237 | information can be properly updated during journal replay. They replace | ||
| 238 | the above quota options. See documentation in the quota-tools package | ||
| 239 | for more details (http://sourceforge.net/projects/linuxquota). | ||
| 240 | |||
| 241 | stripe=n | ||
| 242 | Number of filesystem blocks that mballoc will try to use for allocation | ||
| 243 | size and alignment. For RAID5/6 systems this should be the number of | ||
| 244 | data disks * RAID chunk size in file system blocks. | ||
| 245 | |||
| 246 | delalloc (*) | ||
| 247 | Defer block allocation until just before ext4 writes out the block(s) | ||
| 248 | in question. This allows ext4 to make better allocation decisions more | ||
| 249 | efficiently. | ||
| 250 | |||
| 251 | nodelalloc | ||
| 252 | Disable delayed allocation. Blocks are allocated when the data is | ||
| 253 | copied from userspace to the page cache, either via the write(2) system | ||
| 254 | call or when an mmap'ed page which was previously unallocated is | ||
| 255 | written for the first time. | ||
| 256 | |||
| 257 | max_batch_time=usec | ||
| 258 | Maximum amount of time ext4 should wait for additional filesystem | ||
| 259 | operations to be batched together with a synchronous write operation. | ||
| 260 | Since a synchronous write operation is going to force a commit and then | ||
| 261 | a wait for the I/O to complete, it doesn't cost much, and can be a huge | ||
| 262 | throughput win, we wait for a small amount of time to see if any other | ||
| 263 | transactions can piggyback on the synchronous write. The algorithm | ||
| 264 | used is designed to automatically tune for the speed of the disk, by | ||
| 265 | measuring the amount of time (on average) that it takes to finish | ||
| 266 | committing a transaction. Call this time the "commit time". If the | ||
| 267 | time that the transaction has been running is less than the commit | ||
| 268 | time, ext4 will try sleeping for the commit time to see if other | ||
| 269 | operations will join the transaction. The commit time is capped by | ||
| 270 | the max_batch_time, which defaults to 15000us (15ms). This | ||
| 271 | optimization can be turned off entirely by setting max_batch_time to 0. | ||
| 272 | |||
| 273 | min_batch_time=usec | ||
| 274 | This parameter sets the commit time (as described above) to be at least | ||
| 275 | min_batch_time. It defaults to zero microseconds. Increasing this | ||
| 276 | parameter may improve the throughput of multi-threaded, synchronous | ||
| 277 | workloads on very fast disks, at the cost of increasing latency. | ||
| 278 | |||
| 279 | journal_ioprio=prio | ||
| 280 | The I/O priority (from 0 to 7, where 0 is the highest priority) which | ||
| 281 | should be used for I/O operations submitted by kjournald2 during a | ||
| 282 | commit operation. This defaults to 3, which is a slightly higher | ||
| 283 | priority than the default I/O priority. | ||
| 284 | |||
| 285 | auto_da_alloc(*), noauto_da_alloc | ||
| 286 | Many broken applications don't use fsync() when replacing existing | ||
| 287 | files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/ | ||
| 288 | rename("foo.new", "foo"), or worse yet, fd = open("foo", | ||
| 289 | O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4 | ||
| 290 | will detect the replace-via-rename and replace-via-truncate patterns | ||
| 291 | and force that any delayed allocation blocks are allocated such that at | ||
| 292 | the next journal commit, in the default data=ordered mode, the data | ||
| 293 | blocks of the new file are forced to disk before the rename() operation | ||
| 294 | is committed. This provides roughly the same level of guarantees as | ||
| 295 | ext3, and avoids the "zero-length" problem that can happen when a | ||
| 296 | system crashes before the delayed allocation blocks are forced to disk. | ||
| 297 | |||
| 298 | noinit_itable | ||
| 299 | Do not initialize any uninitialized inode table blocks in the | ||
| 300 | background. This feature may be used by installation CD's so that the | ||
| 301 | install process can complete as quickly as possible; the inode table | ||
| 302 | initialization process would then be deferred until the next time the | ||
| 303 | file system is unmounted. | ||
| 304 | |||
| 305 | init_itable=n | ||
| 306 | The lazy itable init code will wait n times the number of milliseconds | ||
| 307 | it took to zero out the previous block group's inode table. This | ||
| 308 | minimizes the impact on the system performance while file system's | ||
| 309 | inode table is being initialized. | ||
| 310 | |||
| 311 | discard, nodiscard(*) | ||
| 312 | Controls whether ext4 should issue discard/TRIM commands to the | ||
| 313 | underlying block device when blocks are freed. This is useful for SSD | ||
| 314 | devices and sparse/thinly-provisioned LUNs, but it is off by default | ||
| 315 | until sufficient testing has been done. | ||
| 316 | |||
| 317 | nouid32 | ||
| 318 | Disables 32-bit UIDs and GIDs. This is for interoperability with | ||
| 319 | older kernels which only store and expect 16-bit values. | ||
| 320 | |||
| 321 | block_validity(*), noblock_validity | ||
| 322 | These options enable or disable the in-kernel facility for tracking | ||
| 323 | filesystem metadata blocks within internal data structures. This | ||
| 324 | allows the multi-block allocator and other routines to notice bugs or | ||
| 325 | corrupted allocation bitmaps which cause blocks to be allocated which | ||
| 326 | overlap with filesystem metadata blocks. | ||
| 327 | |||
| 328 | dioread_lock, dioread_nolock | ||
| 329 | Controls whether or not ext4 should use the DIO read locking. If the | ||
| 330 | dioread_nolock option is specified ext4 will allocate uninitialized | ||
| 331 | extent before buffer write and convert the extent to initialized after | ||
| 332 | IO completes. This approach allows ext4 code to avoid using inode | ||
| 333 | mutex, which improves scalability on high speed storage. However, this | ||
| 334 | does not work with data journaling, and the dioread_nolock option will be | ||
| 335 | ignored with a kernel warning. Note that the dioread_nolock code path is only | ||
| 336 | used for extent-based files. Because of the restrictions this option | ||
| 337 | comprises, it is off by default (i.e. dioread_lock). | ||
| 338 | |||
| 339 | max_dir_size_kb=n | ||
| 340 | This limits the size of directories so that any attempt to expand them | ||
| 341 | beyond the specified limit in kilobytes will cause an ENOSPC error. | ||
| 342 | This is useful in memory constrained environments, where a very large | ||
| 343 | directory can cause severe performance problems or even provoke the Out | ||
| 344 | Of Memory killer. (For example, if there is only 512mb memory | ||
| 345 | available, a 176mb directory may seriously cramp the system's style.) | ||
| 346 | |||
| 347 | i_version | ||
| 348 | Enable 64-bit inode version support. This option is off by default. | ||
| 349 | |||
| 350 | dax | ||
| 351 | Use direct access (no page cache). See | ||
| 352 | Documentation/filesystems/dax.txt. Note that this option is | ||
| 353 | incompatible with data=journal. | ||
| 354 | |||
| 355 | Data Mode | ||
| 356 | ========= | ||
| 357 | There are 3 different data modes: | ||
| 358 | |||
| 359 | * writeback mode | ||
| 360 | |||
| 361 | In data=writeback mode, ext4 does not journal data at all. This mode provides | ||
| 362 | a similar level of journaling as that of XFS, JFS, and ReiserFS in its default | ||
| 363 | mode - metadata journaling. A crash+recovery can cause incorrect data to | ||
| 364 | appear in files which were written shortly before the crash. This mode will | ||
| 365 | typically provide the best ext4 performance. | ||
| 366 | |||
| 367 | * ordered mode | ||
| 368 | |||
| 369 | In data=ordered mode, ext4 only officially journals metadata, but it logically | ||
| 370 | groups metadata information related to data changes with the data blocks into | ||
| 371 | a single unit called a transaction. When it's time to write the new metadata | ||
| 372 | out to disk, the associated data blocks are written first. In general, this | ||
| 373 | mode performs slightly slower than writeback but significantly faster than | ||
| 374 | journal mode. | ||
| 375 | |||
| 376 | * journal mode | ||
| 377 | |||
| 378 | data=journal mode provides full data and metadata journaling. All new data is | ||
| 379 | written to the journal first, and then to its final location. In the event of | ||
| 380 | a crash, the journal can be replayed, bringing both data and metadata into a | ||
| 381 | consistent state. This mode is the slowest except when data needs to be read | ||
| 382 | from and written to disk at the same time where it outperforms all other | ||
| 383 | modes. Enabling this mode will disable delayed allocation and O_DIRECT | ||
| 384 | support. | ||
| 385 | |||
| 386 | /proc entries | ||
| 387 | ============= | ||
| 388 | |||
| 389 | Information about mounted ext4 file systems can be found in | ||
| 390 | /proc/fs/ext4. Each mounted filesystem will have a directory in | ||
| 391 | /proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or | ||
| 392 | /proc/fs/ext4/dm-0). The files in each per-device directory are shown | ||
| 393 | in the table below. | ||
| 394 | |||
| 395 | Files in /proc/fs/ext4/<devname> | ||
| 396 | |||
| 397 | mb_groups | ||
| 398 | details of multiblock allocator buddy cache of free blocks | ||
| 399 | |||
| 400 | /sys entries | ||
| 401 | ============ | ||
| 402 | |||
| 403 | Information about mounted ext4 file systems can be found in | ||
| 404 | /sys/fs/ext4. Each mounted filesystem will have a directory in | ||
| 405 | /sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or | ||
| 406 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown | ||
| 407 | in the table below. | ||
| 408 | |||
| 409 | Files in /sys/fs/ext4/<devname>: | ||
| 410 | |||
| 411 | (see also Documentation/ABI/testing/sysfs-fs-ext4) | ||
| 412 | |||
| 413 | delayed_allocation_blocks | ||
| 414 | This file is read-only and shows the number of blocks that are dirty in | ||
| 415 | the page cache, but which do not have their location in the filesystem | ||
| 416 | allocated yet. | ||
| 417 | |||
| 418 | inode_goal | ||
| 419 | Tuning parameter which (if non-zero) controls the goal inode used by | ||
| 420 | the inode allocator in preference to all other allocation heuristics. | ||
| 421 | This is intended for debugging use only, and should be 0 on production | ||
| 422 | systems. | ||
| 423 | |||
| 424 | inode_readahead_blks | ||
| 425 | Tuning parameter which controls the maximum number of inode table | ||
| 426 | blocks that ext4's inode table readahead algorithm will pre-read into | ||
| 427 | the buffer cache. | ||
| 428 | |||
| 429 | lifetime_write_kbytes | ||
| 430 | This file is read-only and shows the number of kilobytes of data that | ||
| 431 | have been written to this filesystem since it was created. | ||
| 432 | |||
| 433 | max_writeback_mb_bump | ||
| 434 | The maximum number of megabytes the writeback code will try to write | ||
| 435 | out before moving on to another inode. | ||
| 436 | |||
| 437 | mb_group_prealloc | ||
| 438 | The multiblock allocator will round up allocation requests to a | ||
| 439 | multiple of this tuning parameter if the stripe size is not set in the | ||
| 440 | ext4 superblock. | ||
| 441 | |||
| 442 | mb_max_to_scan | ||
| 443 | The maximum number of extents the multiblock allocator will search to | ||
| 444 | find the best extent. | ||
| 445 | |||
| 446 | mb_min_to_scan | ||
| 447 | The minimum number of extents the multiblock allocator will search to | ||
| 448 | find the best extent. | ||
| 449 | |||
| 450 | mb_order2_req | ||
| 451 | Tuning parameter which controls the minimum size for requests (as a | ||
| 452 | power of 2) where the buddy cache is used. | ||
| 453 | |||
| 454 | mb_stats | ||
| 455 | Controls whether the multiblock allocator should collect statistics, | ||
| 456 | which are shown during the unmount. 1 means to collect statistics, 0 | ||
| 457 | means not to collect statistics. | ||
| 458 | |||
| 459 | mb_stream_req | ||
| 460 | Files which have fewer blocks than this tunable parameter will have | ||
| 461 | their blocks allocated out of a block group specific preallocation | ||
| 462 | pool, so that small files are packed closely together. Each large file | ||
| 463 | will have its blocks allocated out of its own unique preallocation | ||
| 464 | pool. | ||
| 465 | |||
| 466 | session_write_kbytes | ||
| 467 | This file is read-only and shows the number of kilobytes of data that | ||
| 468 | have been written to this filesystem since it was mounted. | ||
| 469 | |||
| 470 | reserved_clusters | ||
| 471 | This is a RW file and contains the number of reserved clusters in the file | ||
| 472 | system which will be used in specific situations to avoid costly | ||
| 473 | zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or | ||
| 474 | 4096 clusters, whichever is smaller; this can be changed, however it | ||
| 475 | can never exceed the number of clusters in the file system. If there is not | ||
| 476 | enough space for the reserved space when mounting the file system, the mount will | ||
| 477 | _not_ fail. | ||
| 478 | |||
| 479 | Ioctls | ||
| 480 | ====== | ||
| 481 | |||
| 482 | There is some Ext4 specific functionality which can be accessed by applications | ||
| 483 | through the system call interfaces. The list of all Ext4 specific ioctls is | ||
| 484 | shown in the table below. | ||
| 485 | |||
| 486 | Table of Ext4 specific ioctls | ||
| 487 | |||
| 488 | EXT4_IOC_GETFLAGS | ||
| 489 | Get additional attributes associated with inode. The ioctl argument is | ||
| 490 | an integer bitfield, with bit values described in ext4.h. This ioctl is | ||
| 491 | an alias for FS_IOC_GETFLAGS. | ||
| 492 | |||
| 493 | EXT4_IOC_SETFLAGS | ||
| 494 | Set additional attributes associated with inode. The ioctl argument is | ||
| 495 | an integer bitfield, with bit values described in ext4.h. This ioctl is | ||
| 496 | an alias for FS_IOC_SETFLAGS. | ||
| 497 | |||
| 498 | EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD | ||
| 499 | Get the inode i_generation number stored for each inode. The | ||
| 500 | i_generation number is normally changed only when a new inode is created | ||
| 501 | and it is particularly useful for network filesystems. The '_OLD' | ||
| 502 | version of this ioctl is an alias for FS_IOC_GETVERSION. | ||
| 503 | |||
| 504 | EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD | ||
| 505 | Set the inode i_generation number stored for each inode. The '_OLD' | ||
| 506 | version of this ioctl is an alias for FS_IOC_SETVERSION. | ||
| 507 | |||
| 508 | EXT4_IOC_GROUP_EXTEND | ||
| 509 | This ioctl has the same purpose as the resize mount option. It allows | ||
| 510 | to resize filesystem to the end of the last existing block group, | ||
| 511 | further resize has to be done with resize2fs, either online, or | ||
| 512 | offline. The argument points to the unsigned long number representing | ||
| 513 | the filesystem new block count. | ||
| 514 | |||
| 515 | EXT4_IOC_MOVE_EXT | ||
| 516 | Move the block extents from orig_fd (the one this ioctl is pointing to) | ||
| 517 | to the donor_fd (the one specified in move_extent structure passed as | ||
| 518 | an argument to this ioctl). Then, exchange inode metadata between | ||
| 519 | orig_fd and donor_fd. This is especially useful for online | ||
| 520 | defragmentation, because the allocator has the opportunity to allocate | ||
| 521 | moved blocks better, ideally into one contiguous extent. | ||
| 522 | |||
| 523 | EXT4_IOC_GROUP_ADD | ||
| 524 | Add a new group descriptor to an existing or new group descriptor | ||
| 525 | block. The new group descriptor is described by ext4_new_group_input | ||
| 526 | structure, which is passed as an argument to this ioctl. This is | ||
| 527 | especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which | ||
| 528 | allows online resize of the filesystem to the end of the last existing | ||
| 529 | block group. Those two ioctls combined are used in userspace online | ||
| 530 | resize tool (e.g. resize2fs). | ||
| 531 | |||
| 532 | EXT4_IOC_MIGRATE | ||
| 533 | This ioctl operates on the filesystem itself. It converts (migrates) | ||
| 534 | ext3 indirect block mapped inode to ext4 extent mapped inode by walking | ||
| 535 | through indirect block mapping of the original inode and converting | ||
| 536 | contiguous block ranges into ext4 extents of the temporary inode. Then, | ||
| 537 | inodes are swapped. This ioctl might help, when migrating from ext3 to | ||
| 538 | ext4 filesystem, however suggestion is to create fresh ext4 filesystem | ||
| 539 | and copy data from the backup. Note, that filesystem has to support | ||
| 540 | extents for this ioctl to work. | ||
| 541 | |||
| 542 | EXT4_IOC_ALLOC_DA_BLKS | ||
| 543 | Force all of the delay allocated blocks to be allocated to preserve | ||
| 544 | application-expected ext3 behaviour. Note that this will also start | ||
| 545 | triggering a write of the data blocks, but this behaviour may change in | ||
| 546 | the future as it is not necessary and has been done this way only for | ||
| 547 | sake of simplicity. | ||
| 548 | |||
| 549 | EXT4_IOC_RESIZE_FS | ||
| 550 | Resize the filesystem to a new size. The number of blocks of resized | ||
| 551 | filesystem is passed in via 64 bit integer argument. The kernel | ||
| 552 | allocates bitmaps and inode table, the userspace tool thus just passes | ||
| 553 | the new number of blocks. | ||
| 554 | |||
| 555 | EXT4_IOC_SWAP_BOOT | ||
| 556 | Swap i_blocks and associated attributes (like i_blocks, i_size, | ||
| 557 | i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO | ||
| 558 | (#5). This is typically used to store a boot loader in a secure part of | ||
| 559 | the filesystem, where it can't be changed by a normal user by accident. | ||
| 560 | The data blocks of the previous boot loader will be associated with the | ||
| 561 | given inode. | ||
| 562 | |||
| 563 | References | ||
| 564 | ========== | ||
| 565 | |||
| 566 | kernel source: <file:fs/ext4/> | ||
| 567 | <file:fs/jbd2/> | ||
| 568 | |||
| 569 | programs: http://e2fsprogs.sourceforge.net/ | ||
| 570 | |||
| 571 | useful links: http://fedoraproject.org/wiki/ext3-devel | ||
| 572 | http://www.bullopensource.org/ext4/ | ||
| 573 | http://ext4.wiki.kernel.org/index.php/Main_Page | ||
| 574 | http://fedoraproject.org/wiki/Features/Ext4 | ||
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 0873685bab0f..965745d5fb9a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst | |||
| @@ -71,6 +71,7 @@ configure specific aspects of kernel behavior to your liking. | |||
| 71 | java | 71 | java |
| 72 | ras | 72 | ras |
| 73 | bcache | 73 | bcache |
| 74 | ext4 | ||
| 74 | pm/index | 75 | pm/index |
| 75 | thunderbolt | 76 | thunderbolt |
| 76 | LSM/index | 77 | LSM/index |
diff --git a/Documentation/conf.py b/Documentation/conf.py index b691af4831fa..ede67ccafc29 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py | |||
| @@ -383,6 +383,10 @@ latex_documents = [ | |||
| 383 | 'The kernel development community', 'manual'), | 383 | 'The kernel development community', 'manual'), |
| 384 | ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', | 384 | ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', |
| 385 | 'The kernel development community', 'manual'), | 385 | 'The kernel development community', 'manual'), |
| 386 | ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', | ||
| 387 | 'ext4 Community', 'manual'), | ||
| 388 | ('filesystems/ext4/index', 'ext4-data-structures.tex', | ||
| 389 | 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'), | ||
| 386 | ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', | 390 | ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', |
| 387 | 'The kernel development community', 'manual'), | 391 | 'The kernel development community', 'manual'), |
| 388 | ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', | 392 | ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', |
diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/about.rst index 0aadba052264..0aadba052264 100644 --- a/Documentation/filesystems/ext4/ondisk/about.rst +++ b/Documentation/filesystems/ext4/about.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/allocators.rst index 7aa85152ace3..7aa85152ace3 100644 --- a/Documentation/filesystems/ext4/ondisk/allocators.rst +++ b/Documentation/filesystems/ext4/allocators.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 0b01b67b81fe..54386a010a8d 100644 --- a/Documentation/filesystems/ext4/ondisk/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst | |||
| @@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header | |||
| 30 | ``ext4_xattr_ibody_header`` that is 4 bytes long: | 30 | ``ext4_xattr_ibody_header`` that is 4 bytes long: |
| 31 | 31 | ||
| 32 | .. list-table:: | 32 | .. list-table:: |
| 33 | :widths: 1 1 1 77 | 33 | :widths: 8 8 24 40 |
| 34 | :header-rows: 1 | 34 | :header-rows: 1 |
| 35 | 35 | ||
| 36 | * - Offset | 36 | * - Offset |
| @@ -47,7 +47,7 @@ The beginning of an extended attribute block is in | |||
| 47 | ``struct ext4_xattr_header``, which is 32 bytes long: | 47 | ``struct ext4_xattr_header``, which is 32 bytes long: |
| 48 | 48 | ||
| 49 | .. list-table:: | 49 | .. list-table:: |
| 50 | :widths: 1 1 1 77 | 50 | :widths: 8 8 24 40 |
| 51 | :header-rows: 1 | 51 | :header-rows: 1 |
| 52 | 52 | ||
| 53 | * - Offset | 53 | * - Offset |
| @@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is | |||
| 92 | Attributes stored inside an inode do not need to be stored in sorted order. | 92 | Attributes stored inside an inode do not need to be stored in sorted order. |
| 93 | 93 | ||
| 94 | .. list-table:: | 94 | .. list-table:: |
| 95 | :widths: 1 1 1 77 | 95 | :widths: 8 8 24 40 |
| 96 | :header-rows: 1 | 96 | :header-rows: 1 |
| 97 | 97 | ||
| 98 | * - Offset | 98 | * - Offset |
| @@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from | |||
| 157 | the key name. Here is a map of name index values to key prefixes: | 157 | the key name. Here is a map of name index values to key prefixes: |
| 158 | 158 | ||
| 159 | .. list-table:: | 159 | .. list-table:: |
| 160 | :widths: 1 79 | 160 | :widths: 16 64 |
| 161 | :header-rows: 1 | 161 | :header-rows: 1 |
| 162 | 162 | ||
| 163 | * - Name Index | 163 | * - Name Index |
diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst index c6d88557553c..c6d88557553c 100644 --- a/Documentation/filesystems/ext4/ondisk/bigalloc.rst +++ b/Documentation/filesystems/ext4/bigalloc.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst index c7546dbc197a..c7546dbc197a 100644 --- a/Documentation/filesystems/ext4/ondisk/bitmaps.rst +++ b/Documentation/filesystems/ext4/bitmaps.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst index baf888e4c06a..baf888e4c06a 100644 --- a/Documentation/filesystems/ext4/ondisk/blockgroup.rst +++ b/Documentation/filesystems/ext4/blockgroup.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst index 30e25750d88a..30e25750d88a 100644 --- a/Documentation/filesystems/ext4/ondisk/blockmap.rst +++ b/Documentation/filesystems/ext4/blockmap.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/blocks.rst index 73d4dc0f7bda..73d4dc0f7bda 100644 --- a/Documentation/filesystems/ext4/ondisk/blocks.rst +++ b/Documentation/filesystems/ext4/blocks.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/checksums.rst index 9d6a793b2e03..5519e253810d 100644 --- a/Documentation/filesystems/ext4/ondisk/checksums.rst +++ b/Documentation/filesystems/ext4/checksums.rst | |||
| @@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes | |||
| 28 | (crc32c as of October 2013) unless noted otherwise. | 28 | (crc32c as of October 2013) unless noted otherwise. |
| 29 | 29 | ||
| 30 | .. list-table:: | 30 | .. list-table:: |
| 31 | :widths: 1 1 4 | 31 | :widths: 20 8 50 |
| 32 | :header-rows: 1 | 32 | :header-rows: 1 |
| 33 | 33 | ||
| 34 | * - Metadata | 34 | * - Metadata |
diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/directory.rst index 8fcba68c2884..614034e24669 100644 --- a/Documentation/filesystems/ext4/ondisk/directory.rst +++ b/Documentation/filesystems/ext4/directory.rst | |||
| @@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference | |||
| 34 | ``dirent.rec_len`` to know for sure. | 34 | ``dirent.rec_len`` to know for sure. |
| 35 | 35 | ||
| 36 | .. list-table:: | 36 | .. list-table:: |
| 37 | :widths: 1 1 1 77 | 37 | :widths: 8 8 24 40 |
| 38 | :header-rows: 1 | 38 | :header-rows: 1 |
| 39 | 39 | ||
| 40 | * - Offset | 40 | * - Offset |
| @@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most | |||
| 66 | ``dirent.rec_len`` to know for sure. | 66 | ``dirent.rec_len`` to know for sure. |
| 67 | 67 | ||
| 68 | .. list-table:: | 68 | .. list-table:: |
| 69 | :widths: 1 1 1 77 | 69 | :widths: 8 8 24 40 |
| 70 | :header-rows: 1 | 70 | :header-rows: 1 |
| 71 | 71 | ||
| 72 | * - Offset | 72 | * - Offset |
| @@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most | |||
| 99 | The directory file type is one of the following values: | 99 | The directory file type is one of the following values: |
| 100 | 100 | ||
| 101 | .. list-table:: | 101 | .. list-table:: |
| 102 | :widths: 1 79 | 102 | :widths: 16 64 |
| 103 | :header-rows: 1 | 103 | :header-rows: 1 |
| 104 | 104 | ||
| 105 | * - Value | 105 | * - Value |
| @@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is | |||
| 130 | ``struct ext4_dir_entry_tail``: | 130 | ``struct ext4_dir_entry_tail``: |
| 131 | 131 | ||
| 132 | .. list-table:: | 132 | .. list-table:: |
| 133 | :widths: 1 1 1 77 | 133 | :widths: 8 8 24 40 |
| 134 | :header-rows: 1 | 134 | :header-rows: 1 |
| 135 | 135 | ||
| 136 | * - Offset | 136 | * - Offset |
| @@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length | |||
| 212 | of a data block: | 212 | of a data block: |
| 213 | 213 | ||
| 214 | .. list-table:: | 214 | .. list-table:: |
| 215 | :widths: 1 1 1 77 | 215 | :widths: 8 8 24 40 |
| 216 | :header-rows: 1 | 216 | :header-rows: 1 |
| 217 | 217 | ||
| 218 | * - Offset | 218 | * - Offset |
| @@ -305,7 +305,7 @@ of a data block: | |||
| 305 | The directory hash is one of the following values: | 305 | The directory hash is one of the following values: |
| 306 | 306 | ||
| 307 | .. list-table:: | 307 | .. list-table:: |
| 308 | :widths: 1 79 | 308 | :widths: 16 64 |
| 309 | :header-rows: 1 | 309 | :header-rows: 1 |
| 310 | 310 | ||
| 311 | * - Value | 311 | * - Value |
| @@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is | |||
| 327 | also the full length of a data block: | 327 | also the full length of a data block: |
| 328 | 328 | ||
| 329 | .. list-table:: | 329 | .. list-table:: |
| 330 | :widths: 1 1 1 77 | 330 | :widths: 8 8 24 40 |
| 331 | :header-rows: 1 | 331 | :header-rows: 1 |
| 332 | 332 | ||
| 333 | * - Offset | 333 | * - Offset |
| @@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and | |||
| 375 | long: | 375 | long: |
| 376 | 376 | ||
| 377 | .. list-table:: | 377 | .. list-table:: |
| 378 | :widths: 1 1 1 77 | 378 | :widths: 8 8 24 40 |
| 379 | :header-rows: 1 | 379 | :header-rows: 1 |
| 380 | 380 | ||
| 381 | * - Offset | 381 | * - Offset |
| @@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum. | |||
| 405 | The dx\_tail structure is 8 bytes long and looks like this: | 405 | The dx\_tail structure is 8 bytes long and looks like this: |
| 406 | 406 | ||
| 407 | .. list-table:: | 407 | .. list-table:: |
| 408 | :widths: 1 1 1 77 | 408 | :widths: 8 8 24 40 |
| 409 | :header-rows: 1 | 409 | :header-rows: 1 |
| 410 | 410 | ||
| 411 | * - Offset | 411 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/dynamic.rst index bb0c84333341..bb0c84333341 100644 --- a/Documentation/filesystems/ext4/ondisk/dynamic.rst +++ b/Documentation/filesystems/ext4/dynamic.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/eainode.rst index ecc0d01a0a72..ecc0d01a0a72 100644 --- a/Documentation/filesystems/ext4/ondisk/eainode.rst +++ b/Documentation/filesystems/ext4/eainode.rst | |||
diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst deleted file mode 100644 index 9d4368d591fa..000000000000 --- a/Documentation/filesystems/ext4/ext4.rst +++ /dev/null | |||
| @@ -1,613 +0,0 @@ | |||
| 1 | .. SPDX-License-Identifier: GPL-2.0 | ||
| 2 | |||
| 3 | ======================== | ||
| 4 | General Information | ||
| 5 | ======================== | ||
| 6 | |||
| 7 | Ext4 is an advanced level of the ext3 filesystem which incorporates | ||
| 8 | scalability and reliability enhancements for supporting large filesystems | ||
| 9 | (64 bit) in keeping with increasing disk capacities and state-of-the-art | ||
| 10 | feature requirements. | ||
| 11 | |||
| 12 | Mailing list: linux-ext4@vger.kernel.org | ||
| 13 | Web site: http://ext4.wiki.kernel.org | ||
| 14 | |||
| 15 | |||
| 16 | Quick usage instructions | ||
| 17 | ======================== | ||
| 18 | |||
| 19 | Note: More extensive information for getting started with ext4 can be | ||
| 20 | found at the ext4 wiki site at the URL: | ||
| 21 | http://ext4.wiki.kernel.org/index.php/Ext4_Howto | ||
| 22 | |||
| 23 | - The latest version of e2fsprogs can be found at: | ||
| 24 | |||
| 25 | https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | ||
| 26 | |||
| 27 | or | ||
| 28 | |||
| 29 | http://sourceforge.net/project/showfiles.php?group_id=2406 | ||
| 30 | |||
| 31 | or grab the latest git repository from: | ||
| 32 | |||
| 33 | https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git | ||
| 34 | |||
| 35 | - Create a new filesystem using the ext4 filesystem type: | ||
| 36 | |||
| 37 | # mke2fs -t ext4 /dev/hda1 | ||
| 38 | |||
| 39 | Or to configure an existing ext3 filesystem to support extents: | ||
| 40 | |||
| 41 | # tune2fs -O extents /dev/hda1 | ||
| 42 | |||
| 43 | If the filesystem was created with 128 byte inodes, it can be | ||
| 44 | converted to use 256 byte for greater efficiency via: | ||
| 45 | |||
| 46 | # tune2fs -I 256 /dev/hda1 | ||
| 47 | |||
| 48 | - Mounting: | ||
| 49 | |||
| 50 | # mount -t ext4 /dev/hda1 /wherever | ||
| 51 | |||
| 52 | - When comparing performance with other filesystems, it's always | ||
| 53 | important to try multiple workloads; very often a subtle change in a | ||
| 54 | workload parameter can completely change the ranking of which | ||
| 55 | filesystems do well compared to others. When comparing versus ext3, | ||
| 56 | note that ext4 enables write barriers by default, while ext3 does | ||
| 57 | not enable write barriers by default. So it is useful to | ||
| 58 | explicitly specify whether barriers are enabled or not via the | ||
| 59 | '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems | ||
| 60 | for a fair comparison. When tuning ext3 for best benchmark numbers, | ||
| 61 | it is often worthwhile to try changing the data journaling mode; '-o | ||
| 62 | data=writeback' can be faster for some workloads. (Note however that | ||
| 63 | running mounted with data=writeback can potentially leave stale data | ||
| 64 | exposed in recently written files in case of an unclean shutdown, | ||
| 65 | which could be a security exposure in some situations.) Configuring | ||
| 66 | the filesystem with a large journal can also be helpful for | ||
| 67 | metadata-intensive workloads. | ||
| 68 | |||
| 69 | Features | ||
| 70 | ======== | ||
| 71 | |||
| 72 | Currently Available | ||
| 73 | ------------------- | ||
| 74 | |||
| 75 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) | ||
| 76 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | ||
| 77 | * extent format more robust in face of on-disk corruption due to magics, | ||
| 78 | * internal redundancy in tree | ||
| 79 | * improved file allocation (multi-block alloc) | ||
| 80 | * lift 32000 subdirectory limit imposed by i_links_count[1] | ||
| 81 | * nsec timestamps for mtime, atime, ctime, create time | ||
| 82 | * inode version field on disk (NFSv4, Lustre) | ||
| 83 | * reduced e2fsck time via uninit_bg feature | ||
| 84 | * journal checksumming for robustness, performance | ||
| 85 | * persistent file preallocation (e.g for streaming media, databases) | ||
| 86 | * ability to pack bitmaps and inode tables into larger virtual groups via the | ||
| 87 | flex_bg feature | ||
| 88 | * large file support | ||
| 89 | * inode allocation using large virtual block groups via flex_bg | ||
| 90 | * delayed allocation | ||
| 91 | * large block (up to pagesize) support | ||
| 92 | * efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force | ||
| 93 | the ordering) | ||
| 94 | |||
| 95 | [1] Filesystems with a block size of 1k may see a limit imposed by the | ||
| 96 | directory hash tree having a maximum depth of two. | ||
| 97 | |||
| 98 | Options | ||
| 99 | ======= | ||
| 100 | |||
| 101 | When mounting an ext4 filesystem, the following options are accepted: | ||
| 102 | (*) == default | ||
| 103 | |||
| 104 | ======================= ======================================================= | ||
| 105 | Mount Option Description | ||
| 106 | ======================= ======================================================= | ||
| 107 | ro Mount filesystem read only. Note that ext4 will | ||
| 108 | replay the journal (and thus write to the | ||
| 109 | partition) even when mounted "read only". The | ||
| 110 | mount options "ro,noload" can be used to prevent | ||
| 111 | writes to the filesystem. | ||
| 112 | |||
| 113 | journal_checksum Enable checksumming of the journal transactions. | ||
| 114 | This will allow the recovery code in e2fsck and the | ||
| 115 | kernel to detect corruption in the journal. It is a | ||
| 116 | compatible change and will be ignored by older kernels. | ||
| 117 | |||
| 118 | journal_async_commit Commit block can be written to disk without waiting | ||
| 119 | for descriptor blocks. If enabled older kernels cannot | ||
| 120 | mount the device. This will enable 'journal_checksum' | ||
| 121 | internally. | ||
| 122 | |||
| 123 | journal_path=path | ||
| 124 | journal_dev=devnum When the external journal device's major/minor numbers | ||
| 125 | have changed, these options allow the user to specify | ||
| 126 | the new journal location. The journal device is | ||
| 127 | identified through either its new major/minor numbers | ||
| 128 | encoded in devnum, or via a path to the device. | ||
| 129 | |||
| 130 | norecovery Don't load the journal on mounting. Note that | ||
| 131 | noload if the filesystem was not unmounted cleanly, | ||
| 132 | skipping the journal replay will lead to the | ||
| 133 | filesystem containing inconsistencies that can | ||
| 134 | lead to any number of problems. | ||
| 135 | |||
| 136 | data=journal All data are committed into the journal prior to being | ||
| 137 | written into the main file system. Enabling | ||
| 138 | this mode will disable delayed allocation and | ||
| 139 | O_DIRECT support. | ||
| 140 | |||
| 141 | data=ordered (*) All data are forced directly out to the main file | ||
| 142 | system prior to its metadata being committed to the | ||
| 143 | journal. | ||
| 144 | |||
| 145 | data=writeback Data ordering is not preserved, data may be written | ||
| 146 | into the main file system after its metadata has been | ||
| 147 | committed to the journal. | ||
| 148 | |||
| 149 | commit=nrsec (*) Ext4 can be told to sync all its data and metadata | ||
| 150 | every 'nrsec' seconds. The default value is 5 seconds. | ||
| 151 | This means that if you lose your power, you will lose | ||
| 152 | as much as the latest 5 seconds of work (your | ||
| 153 | filesystem will not be damaged though, thanks to the | ||
| 154 | journaling). This default value (or any low value) | ||
| 155 | will hurt performance, but it's good for data-safety. | ||
| 156 | Setting it to 0 will have the same effect as leaving | ||
| 157 | it at the default (5 seconds). | ||
| 158 | Setting it to very large values will improve | ||
| 159 | performance. | ||
| 160 | |||
| 161 | barrier=<0|1(*)> This enables/disables the use of write barriers in | ||
| 162 | barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. | ||
| 163 | nobarrier This also requires an IO stack which can support | ||
| 164 | barriers, and if jbd gets an error on a barrier | ||
| 165 | write, it will disable again with a warning. | ||
| 166 | Write barriers enforce proper on-disk ordering | ||
| 167 | of journal commits, making volatile disk write caches | ||
| 168 | safe to use, at some performance penalty. If | ||
| 169 | your disks are battery-backed in one way or another, | ||
| 170 | disabling barriers may safely improve performance. | ||
| 171 | The mount options "barrier" and "nobarrier" can | ||
| 172 | also be used to enable or disable barriers, for | ||
| 173 | consistency with other ext4 mount options. | ||
| 174 | |||
| 175 | inode_readahead_blks=n This tuning parameter controls the maximum | ||
| 176 | number of inode table blocks that ext4's inode | ||
| 177 | table readahead algorithm will pre-read into | ||
| 178 | the buffer cache. The default value is 32 blocks. | ||
| 179 | |||
| 180 | nouser_xattr Disables Extended User Attributes. See the | ||
| 181 | attr(5) manual page for more information about | ||
| 182 | extended attributes. | ||
| 183 | |||
| 184 | noacl This option disables POSIX Access Control List | ||
| 185 | support. If ACL support is enabled in the kernel | ||
| 186 | configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is | ||
| 187 | enabled by default on mount. See the acl(5) manual | ||
| 188 | page for more information about acl. | ||
| 189 | |||
| 190 | bsddf (*) Make 'df' act like BSD. | ||
| 191 | minixdf Make 'df' act like Minix. | ||
| 192 | |||
| 193 | debug Extra debugging information is sent to syslog. | ||
| 194 | |||
| 195 | abort Simulate the effects of calling ext4_abort() for | ||
| 196 | debugging purposes. This is normally used while | ||
| 197 | remounting a filesystem which is already mounted. | ||
| 198 | |||
| 199 | errors=remount-ro Remount the filesystem read-only on an error. | ||
| 200 | errors=continue Keep going on a filesystem error. | ||
| 201 | errors=panic Panic and halt the machine if an error occurs. | ||
| 202 | (These mount options override the errors behavior | ||
| 203 | specified in the superblock, which can be configured | ||
| 204 | using tune2fs) | ||
| 205 | |||
| 206 | data_err=ignore(*) Just print an error message if an error occurs | ||
| 207 | in a file data buffer in ordered mode. | ||
| 208 | data_err=abort Abort the journal if an error occurs in a file | ||
| 209 | data buffer in ordered mode. | ||
| 210 | |||
| 211 | grpid New objects have the group ID of their parent. | ||
| 212 | bsdgroups | ||
| 213 | |||
| 214 | nogrpid (*) New objects have the group ID of their creator. | ||
| 215 | sysvgroups | ||
| 216 | |||
| 217 | resgid=n The group ID which may use the reserved blocks. | ||
| 218 | |||
| 219 | resuid=n The user ID which may use the reserved blocks. | ||
| 220 | |||
| 221 | sb=n Use alternate superblock at this location. | ||
| 222 | |||
| 223 | quota These options are ignored by the filesystem. They | ||
| 224 | noquota are used only by quota tools to recognize volumes | ||
| 225 | grpquota where quota should be turned on. See documentation | ||
| 226 | usrquota in the quota-tools package for more details | ||
| 227 | (http://sourceforge.net/projects/linuxquota). | ||
| 228 | |||
| 229 | jqfmt=<quota type> These options tell filesystem details about quota | ||
| 230 | usrjquota=<file> so that quota information can be properly updated | ||
| 231 | grpjquota=<file> during journal replay. They replace the above | ||
| 232 | quota options. See documentation in the quota-tools | ||
| 233 | package for more details | ||
| 234 | (http://sourceforge.net/projects/linuxquota). | ||
| 235 | |||
| 236 | stripe=n Number of filesystem blocks that mballoc will try | ||
| 237 | to use for allocation size and alignment. For RAID5/6 | ||
| 238 | systems this should be the number of data | ||
| 239 | disks * RAID chunk size in file system blocks. | ||
| 240 | |||
| 241 | delalloc (*) Defer block allocation until just before ext4 | ||
| 242 | writes out the block(s) in question. This | ||
| 243 | allows ext4 to make better allocation decisions | ||
| 244 | more efficiently. | ||
| 245 | nodelalloc Disable delayed allocation. Blocks are allocated | ||
| 246 | when the data is copied from userspace to the | ||
| 247 | page cache, either via the write(2) system call | ||
| 248 | or when an mmap'ed page which was previously | ||
| 249 | unallocated is written for the first time. | ||
| 250 | |||
| 251 | max_batch_time=usec Maximum amount of time ext4 should wait for | ||
| 252 | additional filesystem operations to be batched | ||
| 253 | together with a synchronous write operation. | ||
| 254 | Since a synchronous write operation is going to | ||
| 255 | force a commit and then a wait for the I/O to | ||
| 256 | complete, it doesn't cost much, and can be a | ||
| 257 | huge throughput win, we wait for a small amount | ||
| 258 | of time to see if any other transactions can | ||
| 259 | piggyback on the synchronous write. The | ||
| 260 | algorithm used is designed to automatically tune | ||
| 261 | for the speed of the disk, by measuring the | ||
| 262 | amount of time (on average) that it takes to | ||
| 263 | finish committing a transaction. Call this time | ||
| 264 | the "commit time". If the time that the | ||
| 265 | transaction has been running is less than the | ||
| 266 | commit time, ext4 will try sleeping for the | ||
| 267 | commit time to see if other operations will join | ||
| 268 | the transaction. The commit time is capped by | ||
| 269 | the max_batch_time, which defaults to 15000us | ||
| 270 | (15ms). This optimization can be turned off | ||
| 271 | entirely by setting max_batch_time to 0. | ||
| 272 | |||
| 273 | min_batch_time=usec This parameter sets the commit time (as | ||
| 274 | described above) to be at least min_batch_time. | ||
| 275 | It defaults to zero microseconds. Increasing | ||
| 276 | this parameter may improve the throughput of | ||
| 277 | multi-threaded, synchronous workloads on very | ||
| 278 | fast disks, at the cost of increasing latency. | ||
| 279 | |||
| 280 | journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the | ||
| 281 | highest priority) which should be used for I/O | ||
| 282 | operations submitted by kjournald2 during a | ||
| 283 | commit operation. This defaults to 3, which is | ||
| 284 | a slightly higher priority than the default I/O | ||
| 285 | priority. | ||
| 286 | |||
| 287 | auto_da_alloc(*) Many broken applications don't use fsync() when | ||
| 288 | noauto_da_alloc replacing existing files via patterns such as | ||
| 289 | fd = open("foo.new")/write(fd,..)/close(fd)/ | ||
| 290 | rename("foo.new", "foo"), or worse yet, | ||
| 291 | fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). | ||
| 292 | If auto_da_alloc is enabled, ext4 will detect | ||
| 293 | the replace-via-rename and replace-via-truncate | ||
| 294 | patterns and force that any delayed allocation | ||
| 295 | blocks are allocated such that at the next | ||
| 296 | journal commit, in the default data=ordered | ||
| 297 | mode, the data blocks of the new file are forced | ||
| 298 | to disk before the rename() operation is | ||
| 299 | committed. This provides roughly the same level | ||
| 300 | of guarantees as ext3, and avoids the | ||
| 301 | "zero-length" problem that can happen when a | ||
| 302 | system crashes before the delayed allocation | ||
| 303 | blocks are forced to disk. | ||
| 304 | |||
| 305 | noinit_itable Do not initialize any uninitialized inode table | ||
| 306 | blocks in the background. This feature may be | ||
| 307 | used by installation CD's so that the install | ||
| 308 | process can complete as quickly as possible; the | ||
| 309 | inode table initialization process would then be | ||
| 310 | deferred until the next time the file system | ||
| 311 | is unmounted. | ||
| 312 | |||
| 313 | init_itable=n The lazy itable init code will wait n times the | ||
| 314 | number of milliseconds it took to zero out the | ||
| 315 | previous block group's inode table. This | ||
| 316 | minimizes the impact on the system performance | ||
| 317 | while file system's inode table is being initialized. | ||
| 318 | |||
| 319 | discard Controls whether ext4 should issue discard/TRIM | ||
| 320 | nodiscard(*) commands to the underlying block device when | ||
| 321 | blocks are freed. This is useful for SSD devices | ||
| 322 | and sparse/thinly-provisioned LUNs, but it is off | ||
| 323 | by default until sufficient testing has been done. | ||
| 324 | |||
| 325 | nouid32 Disables 32-bit UIDs and GIDs. This is for | ||
| 326 | interoperability with older kernels which only | ||
| 327 | store and expect 16-bit values. | ||
| 328 | |||
| 329 | block_validity(*) These options enable or disable the in-kernel | ||
| 330 | noblock_validity facility for tracking filesystem metadata blocks | ||
| 331 | within internal data structures. This allows multi- | ||
| 332 | block allocator and other routines to notice | ||
| 333 | bugs or corrupted allocation bitmaps which cause | ||
| 334 | blocks to be allocated which overlap with | ||
| 335 | filesystem metadata blocks. | ||
| 336 | |||
| 337 | dioread_lock Controls whether or not ext4 should use the DIO read | ||
| 338 | dioread_nolock locking. If the dioread_nolock option is specified | ||
| 339 | ext4 will allocate uninitialized extent before buffer | ||
| 340 | write and convert the extent to initialized after IO | ||
| 341 | completes. This approach allows ext4 code to avoid | ||
| 342 | using inode mutex, which improves scalability on high | ||
| 343 | speed storages. However this does not work with | ||
| 344 | data journaling and dioread_nolock option will be | ||
| 345 | ignored with kernel warning. Note that dioread_nolock | ||
| 346 | code path is only used for extent-based files. | ||
| 347 | Because of the restrictions this option comprises | ||
| 348 | it is off by default (i.e. dioread_lock). | ||
| 349 | |||
| 350 | max_dir_size_kb=n This limits the size of directories so that any | ||
| 351 | attempt to expand them beyond the specified | ||
| 352 | limit in kilobytes will cause an ENOSPC error. | ||
| 353 | This is useful in memory constrained | ||
| 354 | environments, where a very large directory can | ||
| 355 | cause severe performance problems or even | ||
| 356 | provoke the Out Of Memory killer. (For example, | ||
| 357 | if there is only 512mb memory available, a 176mb | ||
| 358 | directory may seriously cramp the system's style.) | ||
| 359 | |||
| 360 | i_version Enable 64-bit inode version support. This option is | ||
| 361 | off by default. | ||
| 362 | |||
| 363 | dax Use direct access (no page cache). See | ||
| 364 | Documentation/filesystems/dax.txt. Note that | ||
| 365 | this option is incompatible with data=journal. | ||
| 366 | ======================= ======================================================= | ||
| 367 | |||
| 368 | Data Mode | ||
| 369 | ========= | ||
| 370 | There are 3 different data modes: | ||
| 371 | |||
| 372 | * writeback mode | ||
| 373 | |||
| 374 | In data=writeback mode, ext4 does not journal data at all. This mode provides | ||
| 375 | a similar level of journaling as that of XFS, JFS, and ReiserFS in its default | ||
| 376 | mode - metadata journaling. A crash+recovery can cause incorrect data to | ||
| 377 | appear in files which were written shortly before the crash. This mode will | ||
| 378 | typically provide the best ext4 performance. | ||
| 379 | |||
| 380 | * ordered mode | ||
| 381 | |||
| 382 | In data=ordered mode, ext4 only officially journals metadata, but it logically | ||
| 383 | groups metadata information related to data changes with the data blocks into | ||
| 384 | a single unit called a transaction. When it's time to write the new metadata | ||
| 385 | out to disk, the associated data blocks are written first. In general, this | ||
| 386 | mode performs slightly slower than writeback but significantly faster than | ||
| 387 | journal mode. | ||
| 388 | |||
| 389 | * journal mode | ||
| 390 | |||
| 391 | data=journal mode provides full data and metadata journaling. All new data is | ||
| 392 | written to the journal first, and then to its final location. In the event of | ||
| 393 | a crash, the journal can be replayed, bringing both data and metadata into a | ||
| 394 | consistent state. This mode is the slowest except when data needs to be read | ||
| 395 | from and written to disk at the same time where it outperforms all other | ||
| 396 | modes. Enabling this mode will disable delayed allocation and O_DIRECT | ||
| 397 | support. | ||
| 398 | |||
| 399 | /proc entries | ||
| 400 | ============= | ||
| 401 | |||
| 402 | Information about mounted ext4 file systems can be found in | ||
| 403 | /proc/fs/ext4. Each mounted filesystem will have a directory in | ||
| 404 | /proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or | ||
| 405 | /proc/fs/ext4/dm-0). The files in each per-device directory are shown | ||
| 406 | in the table below. | ||
| 407 | |||
| 408 | Files in /proc/fs/ext4/<devname> | ||
| 409 | |||
| 410 | ================ ======= | ||
| 411 | File Content | ||
| 412 | ================ ======= | ||
| 413 | mb_groups details of multiblock allocator buddy cache of free blocks | ||
| 414 | ================ ======= | ||
| 415 | |||
| 416 | /sys entries | ||
| 417 | ============ | ||
| 418 | |||
| 419 | Information about mounted ext4 file systems can be found in | ||
| 420 | /sys/fs/ext4. Each mounted filesystem will have a directory in | ||
| 421 | /sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or | ||
| 422 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown | ||
| 423 | in the table below. | ||
| 424 | |||
| 425 | Files in /sys/fs/ext4/<devname>: | ||
| 426 | |||
| 427 | (see also Documentation/ABI/testing/sysfs-fs-ext4) | ||
| 428 | |||
| 429 | ============================= ================================================= | ||
| 430 | File Content | ||
| 431 | ============================= ================================================= | ||
| 432 | delayed_allocation_blocks This file is read-only and shows the number of | ||
| 433 | blocks that are dirty in the page cache, but | ||
| 434 | which do not have their location in the | ||
| 435 | filesystem allocated yet. | ||
| 436 | |||
| 437 | inode_goal Tuning parameter which (if non-zero) controls | ||
| 438 | the goal inode used by the inode allocator in | ||
| 439 | preference to all other allocation heuristics. | ||
| 440 | This is intended for debugging use only, and | ||
| 441 | should be 0 on production systems. | ||
| 442 | |||
| 443 | inode_readahead_blks Tuning parameter which controls the maximum | ||
| 444 | number of inode table blocks that ext4's inode | ||
| 445 | table readahead algorithm will pre-read into | ||
| 446 | the buffer cache | ||
| 447 | |||
| 448 | lifetime_write_kbytes This file is read-only and shows the number of | ||
| 449 | kilobytes of data that have been written to this | ||
| 450 | filesystem since it was created. | ||
| 451 | |||
| 452 | max_writeback_mb_bump The maximum number of megabytes the writeback | ||
| 453 | code will try to write out before moving on to | ||
| 454 | another inode. | ||
| 455 | |||
| 456 | mb_group_prealloc The multiblock allocator will round up allocation | ||
| 457 | requests to a multiple of this tuning parameter if | ||
| 458 | the stripe size is not set in the ext4 superblock | ||
| 459 | |||
| 460 | mb_max_to_scan The maximum number of extents the multiblock | ||
| 461 | allocator will search to find the best extent | ||
| 462 | |||
| 463 | mb_min_to_scan The minimum number of extents the multiblock | ||
| 464 | allocator will search to find the best extent | ||
| 465 | |||
| 466 | mb_order2_req Tuning parameter which controls the minimum size | ||
| 467 | for requests (as a power of 2) where the buddy | ||
| 468 | cache is used | ||
| 469 | |||
| 470 | mb_stats Controls whether the multiblock allocator should | ||
| 471 | collect statistics, which are shown during the | ||
| 472 | unmount. 1 means to collect statistics, 0 means | ||
| 473 | not to collect statistics | ||
| 474 | |||
| 475 | mb_stream_req Files which have fewer blocks than this tunable | ||
| 476 | parameter will have their blocks allocated out | ||
| 477 | of a block group specific preallocation pool, so | ||
| 478 | that small files are packed closely together. | ||
| 479 | Each large file will have its blocks allocated | ||
| 480 | out of its own unique preallocation pool. | ||
| 481 | |||
| 482 | session_write_kbytes This file is read-only and shows the number of | ||
| 483 | kilobytes of data that have been written to this | ||
| 484 | filesystem since it was mounted. | ||
| 485 | |||
| 486 | reserved_clusters This is a RW file and contains the number of reserved | ||
| 487 | clusters in the file system which will be used | ||
| 488 | in the specific situations to avoid costly | ||
| 489 | zeroout, unexpected ENOSPC, or possible data | ||
| 490 | loss. The default is 2% or 4096 clusters, | ||
| 491 | whichever is smaller. This can be changed; | ||
| 492 | however, it can never exceed the number of clusters | ||
| 493 | in the file system. If there is not enough space | ||
| 494 | for the reserved space when mounting the file | ||
| 495 | system, the mount will _not_ fail. | ||
| 496 | ============================= ================================================= | ||
| 497 | |||
| 498 | Ioctls | ||
| 499 | ====== | ||
| 500 | |||
| 501 | There is some Ext4 specific functionality which can be accessed by applications | ||
| 502 | through the system call interfaces. The list of all Ext4 specific ioctls is | ||
| 503 | shown in the table below. | ||
| 504 | |||
| 505 | Table of Ext4 specific ioctls | ||
| 506 | |||
| 507 | ============================= ================================================= | ||
| 508 | Ioctl Description | ||
| 509 | ============================= ================================================= | ||
| 510 | EXT4_IOC_GETFLAGS Get additional attributes associated with inode. | ||
| 511 | The ioctl argument is an integer bitfield, with | ||
| 512 | bit values described in ext4.h. This ioctl is an | ||
| 513 | alias for FS_IOC_GETFLAGS. | ||
| 514 | |||
| 515 | EXT4_IOC_SETFLAGS Set additional attributes associated with inode. | ||
| 516 | The ioctl argument is an integer bitfield, with | ||
| 517 | bit values described in ext4.h. This ioctl is an | ||
| 518 | alias for FS_IOC_SETFLAGS. | ||
| 519 | |||
| 520 | EXT4_IOC_GETVERSION | ||
| 521 | EXT4_IOC_GETVERSION_OLD | ||
| 522 | Get the inode i_generation number stored for | ||
| 523 | each inode. The i_generation number is normally | ||
| 524 | changed only when new inode is created and it is | ||
| 525 | particularly useful for network filesystems. The | ||
| 526 | '_OLD' version of this ioctl is an alias for | ||
| 527 | FS_IOC_GETVERSION. | ||
| 528 | |||
| 529 | EXT4_IOC_SETVERSION | ||
| 530 | EXT4_IOC_SETVERSION_OLD | ||
| 531 | Set the inode i_generation number stored for | ||
| 532 | each inode. The '_OLD' version of this ioctl | ||
| 533 | is an alias for FS_IOC_SETVERSION. | ||
| 534 | |||
| 535 | EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize | ||
| 536 | mount option. It allows to resize filesystem | ||
| 537 | to the end of the last existing block group, | ||
| 538 | further resize has to be done with resize2fs, | ||
| 539 | either online, or offline. The argument points | ||
| 540 | to the unsigned long number representing the | ||
| 541 | filesystem's new block count. | ||
| 542 | |||
| 543 | EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one | ||
| 544 | this ioctl is pointing to) to the donor_fd (the | ||
| 545 | one specified in move_extent structure passed | ||
| 546 | as an argument to this ioctl). Then, exchange | ||
| 547 | inode metadata between orig_fd and donor_fd. | ||
| 548 | This is especially useful for online | ||
| 549 | defragmentation, because the allocator has the | ||
| 550 | opportunity to allocate moved blocks better, | ||
| 551 | ideally into one contiguous extent. | ||
| 552 | |||
| 553 | EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or | ||
| 554 | new group descriptor block. The new group | ||
| 555 | descriptor is described by ext4_new_group_input | ||
| 556 | structure, which is passed as an argument to | ||
| 557 | this ioctl. This is especially useful in | ||
| 558 | conjunction with EXT4_IOC_GROUP_EXTEND, | ||
| 559 | which allows online resize of the filesystem | ||
| 560 | to the end of the last existing block group. | ||
| 561 | Those two ioctls combined are used in the userspace | ||
| 562 | online resize tool (e.g. resize2fs). | ||
| 563 | |||
| 564 | EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself. | ||
| 565 | It converts (migrates) ext3 indirect block mapped | ||
| 566 | inode to ext4 extent mapped inode by walking | ||
| 567 | through indirect block mapping of the original | ||
| 568 | inode and converting contiguous block ranges | ||
| 569 | into ext4 extents of the temporary inode. Then, | ||
| 570 | inodes are swapped. This ioctl might help, when | ||
| 571 | migrating from ext3 to ext4 filesystem, however | ||
| 572 | suggestion is to create fresh ext4 filesystem | ||
| 573 | and copy data from the backup. Note, that | ||
| 574 | filesystem has to support extents for this ioctl | ||
| 575 | to work. | ||
| 576 | |||
| 577 | EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be | ||
| 578 | allocated to preserve application-expected ext3 | ||
| 579 | behaviour. Note that this will also start | ||
| 580 | triggering a write of the data blocks, but this | ||
| 581 | behaviour may change in the future as it is | ||
| 582 | not necessary and has been done this way only | ||
| 583 | for sake of simplicity. | ||
| 584 | |||
| 585 | EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number | ||
| 586 | of blocks of resized filesystem is passed in via | ||
| 587 | 64 bit integer argument. The kernel allocates | ||
| 588 | bitmaps and inode table, the userspace tool thus | ||
| 589 | just passes the new number of blocks. | ||
| 590 | |||
| 591 | EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes | ||
| 592 | (like i_blocks, i_size, i_flags, ...) from | ||
| 593 | the specified inode with inode | ||
| 594 | EXT4_BOOT_LOADER_INO (#5). This is typically | ||
| 595 | used to store a boot loader in a secure part of | ||
| 596 | the filesystem, where it can't be changed by a | ||
| 597 | normal user by accident. | ||
| 598 | The data blocks of the previous boot loader | ||
| 599 | will be associated with the given inode. | ||
| 600 | ============================= ================================================= | ||
| 601 | |||
| 602 | References | ||
| 603 | ========== | ||
| 604 | |||
| 605 | kernel source: <file:fs/ext4/> | ||
| 606 | <file:fs/jbd2/> | ||
| 607 | |||
| 608 | programs: http://e2fsprogs.sourceforge.net/ | ||
| 609 | |||
| 610 | useful links: http://fedoraproject.org/wiki/ext3-devel | ||
| 611 | http://www.bullopensource.org/ext4/ | ||
| 612 | http://ext4.wiki.kernel.org/index.php/Main_Page | ||
| 613 | http://fedoraproject.org/wiki/Features/Ext4 | ||
diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/globals.rst index 368bf7662b96..368bf7662b96 100644 --- a/Documentation/filesystems/ext4/ondisk/globals.rst +++ b/Documentation/filesystems/ext4/globals.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst index 759827e5d2cf..0f783ed88592 100644 --- a/Documentation/filesystems/ext4/ondisk/group_descr.rst +++ b/Documentation/filesystems/ext4/group_descr.rst | |||
| @@ -43,7 +43,7 @@ entire bitmap. | |||
| 43 | The block group descriptor is laid out in ``struct ext4_group_desc``. | 43 | The block group descriptor is laid out in ``struct ext4_group_desc``. |
| 44 | 44 | ||
| 45 | .. list-table:: | 45 | .. list-table:: |
| 46 | :widths: 1 1 1 77 | 46 | :widths: 8 8 24 40 |
| 47 | :header-rows: 1 | 47 | :header-rows: 1 |
| 48 | 48 | ||
| 49 | * - Offset | 49 | * - Offset |
| @@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. | |||
| 157 | Block group flags can be any combination of the following: | 157 | Block group flags can be any combination of the following: |
| 158 | 158 | ||
| 159 | .. list-table:: | 159 | .. list-table:: |
| 160 | :widths: 1 79 | 160 | :widths: 16 64 |
| 161 | :header-rows: 1 | 161 | :header-rows: 1 |
| 162 | 162 | ||
| 163 | * - Value | 163 | * - Value |
diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ifork.rst index 5dbe3b2b121a..b9816d5a896b 100644 --- a/Documentation/filesystems/ext4/ondisk/ifork.rst +++ b/Documentation/filesystems/ext4/ifork.rst | |||
| @@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``, | |||
| 68 | which is 12 bytes long: | 68 | which is 12 bytes long: |
| 69 | 69 | ||
| 70 | .. list-table:: | 70 | .. list-table:: |
| 71 | :widths: 1 1 1 77 | 71 | :widths: 8 8 24 40 |
| 72 | :header-rows: 1 | 72 | :header-rows: 1 |
| 73 | 73 | ||
| 74 | * - Offset | 74 | * - Offset |
| @@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are | |||
| 104 | recorded as ``struct ext4_extent_idx``, and are 12 bytes long: | 104 | recorded as ``struct ext4_extent_idx``, and are 12 bytes long: |
| 105 | 105 | ||
| 106 | .. list-table:: | 106 | .. list-table:: |
| 107 | :widths: 1 1 1 77 | 107 | :widths: 8 8 24 40 |
| 108 | :header-rows: 1 | 108 | :header-rows: 1 |
| 109 | 109 | ||
| 110 | * - Offset | 110 | * - Offset |
| @@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, | |||
| 134 | and are also 12 bytes long: | 134 | and are also 12 bytes long: |
| 135 | 135 | ||
| 136 | .. list-table:: | 136 | .. list-table:: |
| 137 | :widths: 1 1 1 77 | 137 | :widths: 8 8 24 40 |
| 138 | :header-rows: 1 | 138 | :header-rows: 1 |
| 139 | 139 | ||
| 140 | * - Offset | 140 | * - Offset |
| @@ -174,7 +174,7 @@ including) the checksum itself. | |||
| 174 | ``struct ext4_extent_tail`` is 4 bytes long: | 174 | ``struct ext4_extent_tail`` is 4 bytes long: |
| 175 | 175 | ||
| 176 | .. list-table:: | 176 | .. list-table:: |
| 177 | :widths: 1 1 1 77 | 177 | :widths: 8 8 24 40 |
| 178 | :header-rows: 1 | 178 | :header-rows: 1 |
| 179 | 179 | ||
| 180 | * - Offset | 180 | * - Offset |
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 71121605558c..3be3e54d480d 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst | |||
| @@ -1,17 +1,14 @@ | |||
| 1 | .. SPDX-License-Identifier: GPL-2.0 | 1 | .. SPDX-License-Identifier: GPL-2.0 |
| 2 | 2 | ||
| 3 | =============== | 3 | =================================== |
| 4 | ext4 Filesystem | 4 | ext4 Data Structures and Algorithms |
| 5 | =============== | 5 | =================================== |
| 6 | |||
| 7 | General usage and on-disk artifacts written by ext4. More documentation may | ||
| 8 | be ported from the wiki as time permits. This should be considered the | ||
| 9 | canonical source of information as the details here have been reviewed by | ||
| 10 | the ext4 community. | ||
| 11 | 6 | ||
| 12 | .. toctree:: | 7 | .. toctree:: |
| 13 | :maxdepth: 5 | 8 | :maxdepth: 6 |
| 14 | :numbered: | 9 | :numbered: |
| 15 | 10 | ||
| 16 | ext4 | 11 | about.rst |
| 17 | ondisk/index | 12 | overview.rst |
| 13 | globals.rst | ||
| 14 | dynamic.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst index d1075178ce0b..d1075178ce0b 100644 --- a/Documentation/filesystems/ext4/ondisk/inlinedata.rst +++ b/Documentation/filesystems/ext4/inlinedata.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index 655ce898f3f5..6bd35e506b6f 100644 --- a/Documentation/filesystems/ext4/ondisk/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst | |||
| @@ -29,8 +29,9 @@ and the inode structure itself. | |||
| 29 | The inode table entry is laid out in ``struct ext4_inode``. | 29 | The inode table entry is laid out in ``struct ext4_inode``. |
| 30 | 30 | ||
| 31 | .. list-table:: | 31 | .. list-table:: |
| 32 | :widths: 1 1 1 77 | 32 | :widths: 8 8 24 40 |
| 33 | :header-rows: 1 | 33 | :header-rows: 1 |
| 34 | :class: longtable | ||
| 34 | 35 | ||
| 35 | * - Offset | 36 | * - Offset |
| 36 | - Size | 37 | - Size |
| @@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``. | |||
| 176 | The ``i_mode`` value is a combination of the following flags: | 177 | The ``i_mode`` value is a combination of the following flags: |
| 177 | 178 | ||
| 178 | .. list-table:: | 179 | .. list-table:: |
| 179 | :widths: 1 79 | 180 | :widths: 16 64 |
| 180 | :header-rows: 1 | 181 | :header-rows: 1 |
| 181 | 182 | ||
| 182 | * - Value | 183 | * - Value |
| @@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags: | |||
| 227 | The ``i_flags`` field is a combination of these values: | 228 | The ``i_flags`` field is a combination of these values: |
| 228 | 229 | ||
| 229 | .. list-table:: | 230 | .. list-table:: |
| 230 | :widths: 1 79 | 231 | :widths: 16 64 |
| 231 | :header-rows: 1 | 232 | :header-rows: 1 |
| 232 | 233 | ||
| 233 | * - Value | 234 | * - Value |
| @@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator: | |||
| 314 | Linux: | 315 | Linux: |
| 315 | 316 | ||
| 316 | .. list-table:: | 317 | .. list-table:: |
| 317 | :widths: 1 1 1 77 | 318 | :widths: 8 8 24 40 |
| 318 | :header-rows: 1 | 319 | :header-rows: 1 |
| 319 | 320 | ||
| 320 | * - Offset | 321 | * - Offset |
| @@ -331,7 +332,7 @@ Linux: | |||
| 331 | Hurd: | 332 | Hurd: |
| 332 | 333 | ||
| 333 | .. list-table:: | 334 | .. list-table:: |
| 334 | :widths: 1 1 1 77 | 335 | :widths: 8 8 24 40 |
| 335 | :header-rows: 1 | 336 | :header-rows: 1 |
| 336 | 337 | ||
| 337 | * - Offset | 338 | * - Offset |
| @@ -346,7 +347,7 @@ Hurd: | |||
| 346 | Masix: | 347 | Masix: |
| 347 | 348 | ||
| 348 | .. list-table:: | 349 | .. list-table:: |
| 349 | :widths: 1 1 1 77 | 350 | :widths: 8 8 24 40 |
| 350 | :header-rows: 1 | 351 | :header-rows: 1 |
| 351 | 352 | ||
| 352 | * - Offset | 353 | * - Offset |
| @@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator: | |||
| 365 | Linux: | 366 | Linux: |
| 366 | 367 | ||
| 367 | .. list-table:: | 368 | .. list-table:: |
| 368 | :widths: 1 1 1 77 | 369 | :widths: 8 8 24 40 |
| 369 | :header-rows: 1 | 370 | :header-rows: 1 |
| 370 | 371 | ||
| 371 | * - Offset | 372 | * - Offset |
| @@ -402,7 +403,7 @@ Linux: | |||
| 402 | Hurd: | 403 | Hurd: |
| 403 | 404 | ||
| 404 | .. list-table:: | 405 | .. list-table:: |
| 405 | :widths: 1 1 1 77 | 406 | :widths: 8 8 24 40 |
| 406 | :header-rows: 1 | 407 | :header-rows: 1 |
| 407 | 408 | ||
| 408 | * - Offset | 409 | * - Offset |
| @@ -433,7 +434,7 @@ Hurd: | |||
| 433 | Masix: | 434 | Masix: |
| 434 | 435 | ||
| 435 | .. list-table:: | 436 | .. list-table:: |
| 436 | :widths: 1 1 1 77 | 437 | :widths: 8 8 24 40 |
| 437 | :header-rows: 1 | 438 | :header-rows: 1 |
| 438 | 439 | ||
| 439 | * - Offset | 440 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/journal.rst index e7031af86876..ea613ee701f5 100644 --- a/Documentation/filesystems/ext4/ondisk/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst | |||
| @@ -48,7 +48,7 @@ Layout | |||
| 48 | Generally speaking, the journal has this format: | 48 | Generally speaking, the journal has this format: |
| 49 | 49 | ||
| 50 | .. list-table:: | 50 | .. list-table:: |
| 51 | :widths: 1 1 78 | 51 | :widths: 16 48 16 |
| 52 | :header-rows: 1 | 52 | :header-rows: 1 |
| 53 | 53 | ||
| 54 | * - Superblock | 54 | * - Superblock |
| @@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the | |||
| 76 | superblock. | 76 | superblock. |
| 77 | 77 | ||
| 78 | .. list-table:: | 78 | .. list-table:: |
| 79 | :widths: 1 1 1 1 76 | 79 | :widths: 12 12 12 32 12 |
| 80 | :header-rows: 1 | 80 | :header-rows: 1 |
| 81 | 81 | ||
| 82 | * - 1024 bytes of padding | 82 | * - 1024 bytes of padding |
| @@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header | |||
| 98 | ``struct journal_header_s``: | 98 | ``struct journal_header_s``: |
| 99 | 99 | ||
| 100 | .. list-table:: | 100 | .. list-table:: |
| 101 | :widths: 1 1 1 77 | 101 | :widths: 8 8 24 40 |
| 102 | :header-rows: 1 | 102 | :header-rows: 1 |
| 103 | 103 | ||
| 104 | * - Offset | 104 | * - Offset |
| @@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header | |||
| 124 | The journal block type can be any one of: | 124 | The journal block type can be any one of: |
| 125 | 125 | ||
| 126 | .. list-table:: | 126 | .. list-table:: |
| 127 | :widths: 1 79 | 127 | :widths: 16 64 |
| 128 | :header-rows: 1 | 128 | :header-rows: 1 |
| 129 | 129 | ||
| 130 | * - Value | 130 | * - Value |
| @@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``, | |||
| 154 | which is 1024 bytes long: | 154 | which is 1024 bytes long: |
| 155 | 155 | ||
| 156 | .. list-table:: | 156 | .. list-table:: |
| 157 | :widths: 1 1 1 77 | 157 | :widths: 8 8 24 40 |
| 158 | :header-rows: 1 | 158 | :header-rows: 1 |
| 159 | 159 | ||
| 160 | * - Offset | 160 | * - Offset |
| @@ -264,7 +264,7 @@ which is 1024 bytes long: | |||
| 264 | The journal compat features are any combination of the following: | 264 | The journal compat features are any combination of the following: |
| 265 | 265 | ||
| 266 | .. list-table:: | 266 | .. list-table:: |
| 267 | :widths: 1 79 | 267 | :widths: 16 64 |
| 268 | :header-rows: 1 | 268 | :header-rows: 1 |
| 269 | 269 | ||
| 270 | * - Value | 270 | * - Value |
| @@ -278,7 +278,7 @@ The journal compat features are any combination of the following: | |||
| 278 | The journal incompat features are any combination of the following: | 278 | The journal incompat features are any combination of the following: |
| 279 | 279 | ||
| 280 | .. list-table:: | 280 | .. list-table:: |
| 281 | :widths: 1 79 | 281 | :widths: 16 64 |
| 282 | :header-rows: 1 | 282 | :header-rows: 1 |
| 283 | 283 | ||
| 284 | * - Value | 284 | * - Value |
| @@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the | |||
| 306 | most likely choices. | 306 | most likely choices. |
| 307 | 307 | ||
| 308 | .. list-table:: | 308 | .. list-table:: |
| 309 | :widths: 1 79 | 309 | :widths: 16 64 |
| 310 | :header-rows: 1 | 310 | :header-rows: 1 |
| 311 | 311 | ||
| 312 | * - Value | 312 | * - Value |
| @@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway. | |||
| 330 | Descriptor blocks consume at least 36 bytes, but use a full block: | 330 | Descriptor blocks consume at least 36 bytes, but use a full block: |
| 331 | 331 | ||
| 332 | .. list-table:: | 332 | .. list-table:: |
| 333 | :widths: 1 1 1 77 | 333 | :widths: 8 8 24 40 |
| 334 | :header-rows: 1 | 334 | :header-rows: 1 |
| 335 | 335 | ||
| 336 | * - Offset | 336 | * - Offset |
| @@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the | |||
| 355 | following. The size is 16 or 32 bytes. | 355 | following. The size is 16 or 32 bytes. |
| 356 | 356 | ||
| 357 | .. list-table:: | 357 | .. list-table:: |
| 358 | :widths: 1 1 1 77 | 358 | :widths: 8 8 24 40 |
| 359 | :header-rows: 1 | 359 | :header-rows: 1 |
| 360 | 360 | ||
| 361 | * - Offset | 361 | * - Offset |
| @@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes. | |||
| 400 | The journal tag flags are any combination of the following: | 400 | The journal tag flags are any combination of the following: |
| 401 | 401 | ||
| 402 | .. list-table:: | 402 | .. list-table:: |
| 403 | :widths: 1 79 | 403 | :widths: 16 64 |
| 404 | :header-rows: 1 | 404 | :header-rows: 1 |
| 405 | 405 | ||
| 406 | * - Value | 406 | * - Value |
| @@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the | |||
| 421 | following. The size is 8, 12, 24, or 28 bytes: | 421 | following. The size is 8, 12, 24, or 28 bytes: |
| 422 | 422 | ||
| 423 | .. list-table:: | 423 | .. list-table:: |
| 424 | :widths: 1 1 1 77 | 424 | :widths: 8 8 24 40 |
| 425 | :header-rows: 1 | 425 | :header-rows: 1 |
| 426 | 426 | ||
| 427 | * - Offset | 427 | * - Offset |
| @@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a | |||
| 471 | ``struct jbd2_journal_block_tail``, which looks like this: | 471 | ``struct jbd2_journal_block_tail``, which looks like this: |
| 472 | 472 | ||
| 473 | .. list-table:: | 473 | .. list-table:: |
| 474 | :widths: 1 1 1 77 | 474 | :widths: 8 8 24 40 |
| 475 | :header-rows: 1 | 475 | :header-rows: 1 |
| 476 | 476 | ||
| 477 | * - Offset | 477 | * - Offset |
| @@ -513,7 +513,7 @@ Revocation blocks are described in | |||
| 513 | length, but use a full block: | 513 | length, but use a full block: |
| 514 | 514 | ||
| 515 | .. list-table:: | 515 | .. list-table:: |
| 516 | :widths: 1 1 1 77 | 516 | :widths: 8 8 24 40 |
| 517 | :header-rows: 1 | 517 | :header-rows: 1 |
| 518 | 518 | ||
| 519 | * - Offset | 519 | * - Offset |
| @@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation | |||
| 543 | block is a ``struct jbd2_journal_revoke_tail``, which has this format: | 543 | block is a ``struct jbd2_journal_revoke_tail``, which has this format: |
| 544 | 544 | ||
| 545 | .. list-table:: | 545 | .. list-table:: |
| 546 | :widths: 1 1 1 77 | 546 | :widths: 8 8 24 40 |
| 547 | :header-rows: 1 | 547 | :header-rows: 1 |
| 548 | 548 | ||
| 549 | * - Offset | 549 | * - Offset |
| @@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32 | |||
| 567 | bytes long (but uses a full block): | 567 | bytes long (but uses a full block): |
| 568 | 568 | ||
| 569 | .. list-table:: | 569 | .. list-table:: |
| 570 | :widths: 1 1 1 77 | 570 | :widths: 8 8 24 40 |
| 571 | :header-rows: 1 | 571 | :header-rows: 1 |
| 572 | 572 | ||
| 573 | * - Offset | 573 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/mmp.rst index b7d7a3137f80..25660981d93c 100644 --- a/Documentation/filesystems/ext4/ondisk/mmp.rst +++ b/Documentation/filesystems/ext4/mmp.rst | |||
| @@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure. | |||
| 32 | The MMP structure (``struct mmp_struct``) is as follows: | 32 | The MMP structure (``struct mmp_struct``) is as follows: |
| 33 | 33 | ||
| 34 | .. list-table:: | 34 | .. list-table:: |
| 35 | :widths: 1 1 1 77 | 35 | :widths: 8 12 20 40 |
| 36 | :header-rows: 1 | 36 | :header-rows: 1 |
| 37 | 37 | ||
| 38 | * - Offset | 38 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst deleted file mode 100644 index f7d082c3a435..000000000000 --- a/Documentation/filesystems/ext4/ondisk/index.rst +++ /dev/null | |||
| @@ -1,9 +0,0 @@ | |||
| 1 | .. SPDX-License-Identifier: GPL-2.0 | ||
| 2 | |||
| 3 | ============================== | ||
| 4 | Data Structures and Algorithms | ||
| 5 | ============================== | ||
| 6 | .. include:: about.rst | ||
| 7 | .. include:: overview.rst | ||
| 8 | .. include:: globals.rst | ||
| 9 | .. include:: dynamic.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/overview.rst index cbab18baba12..cbab18baba12 100644 --- a/Documentation/filesystems/ext4/ondisk/overview.rst +++ b/Documentation/filesystems/ext4/overview.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst index a82f70c9baeb..9061aabba827 100644 --- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst +++ b/Documentation/filesystems/ext4/special_inodes.rst | |||
| @@ -6,7 +6,7 @@ Special inodes | |||
| 6 | ext4 reserves some inode for special features, as follows: | 6 | ext4 reserves some inode for special features, as follows: |
| 7 | 7 | ||
| 8 | .. list-table:: | 8 | .. list-table:: |
| 9 | :widths: 1 79 | 9 | :widths: 6 70 |
| 10 | :header-rows: 1 | 10 | :header-rows: 1 |
| 11 | 11 | ||
| 12 | * - inode Number | 12 | * - inode Number |
diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/super.rst index 5f81dd87e0b9..04ff079a2acf 100644 --- a/Documentation/filesystems/ext4/ondisk/super.rst +++ b/Documentation/filesystems/ext4/super.rst | |||
| @@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in | |||
| 19 | ``struct ext4_super_block``: | 19 | ``struct ext4_super_block``: |
| 20 | 20 | ||
| 21 | .. list-table:: | 21 | .. list-table:: |
| 22 | :widths: 1 1 1 77 | 22 | :widths: 8 8 24 40 |
| 23 | :header-rows: 1 | 23 | :header-rows: 1 |
| 24 | 24 | ||
| 25 | * - Offset | 25 | * - Offset |
| @@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in | |||
| 483 | The superblock state is some combination of the following: | 483 | The superblock state is some combination of the following: |
| 484 | 484 | ||
| 485 | .. list-table:: | 485 | .. list-table:: |
| 486 | :widths: 1 79 | 486 | :widths: 8 72 |
| 487 | :header-rows: 1 | 487 | :header-rows: 1 |
| 488 | 488 | ||
| 489 | * - Value | 489 | * - Value |
| @@ -500,7 +500,7 @@ The superblock state is some combination of the following: | |||
| 500 | The superblock error policy is one of the following: | 500 | The superblock error policy is one of the following: |
| 501 | 501 | ||
| 502 | .. list-table:: | 502 | .. list-table:: |
| 503 | :widths: 1 79 | 503 | :widths: 8 72 |
| 504 | :header-rows: 1 | 504 | :header-rows: 1 |
| 505 | 505 | ||
| 506 | * - Value | 506 | * - Value |
| @@ -517,7 +517,7 @@ The superblock error policy is one of the following: | |||
| 517 | The filesystem creator is one of the following: | 517 | The filesystem creator is one of the following: |
| 518 | 518 | ||
| 519 | .. list-table:: | 519 | .. list-table:: |
| 520 | :widths: 1 79 | 520 | :widths: 8 72 |
| 521 | :header-rows: 1 | 521 | :header-rows: 1 |
| 522 | 522 | ||
| 523 | * - Value | 523 | * - Value |
| @@ -538,7 +538,7 @@ The filesystem creator is one of the following: | |||
| 538 | The superblock revision is one of the following: | 538 | The superblock revision is one of the following: |
| 539 | 539 | ||
| 540 | .. list-table:: | 540 | .. list-table:: |
| 541 | :widths: 1 79 | 541 | :widths: 8 72 |
| 542 | :header-rows: 1 | 542 | :header-rows: 1 |
| 543 | 543 | ||
| 544 | * - Value | 544 | * - Value |
| @@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the | |||
| 556 | following: | 556 | following: |
| 557 | 557 | ||
| 558 | .. list-table:: | 558 | .. list-table:: |
| 559 | :widths: 1 79 | 559 | :widths: 16 64 |
| 560 | :header-rows: 1 | 560 | :header-rows: 1 |
| 561 | 561 | ||
| 562 | * - Value | 562 | * - Value |
| @@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the | |||
| 595 | following: | 595 | following: |
| 596 | 596 | ||
| 597 | .. list-table:: | 597 | .. list-table:: |
| 598 | :widths: 1 79 | 598 | :widths: 16 64 |
| 599 | :header-rows: 1 | 599 | :header-rows: 1 |
| 600 | 600 | ||
| 601 | * - Value | 601 | * - Value |
| @@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of | |||
| 647 | the following: | 647 | the following: |
| 648 | 648 | ||
| 649 | .. list-table:: | 649 | .. list-table:: |
| 650 | :widths: 1 79 | 650 | :widths: 16 64 |
| 651 | :header-rows: 1 | 651 | :header-rows: 1 |
| 652 | 652 | ||
| 653 | * - Value | 653 | * - Value |
| @@ -702,7 +702,7 @@ the following: | |||
| 702 | The ``s_def_hash_version`` field is one of the following: | 702 | The ``s_def_hash_version`` field is one of the following: |
| 703 | 703 | ||
| 704 | .. list-table:: | 704 | .. list-table:: |
| 705 | :widths: 1 79 | 705 | :widths: 8 72 |
| 706 | :header-rows: 1 | 706 | :header-rows: 1 |
| 707 | 707 | ||
| 708 | * - Value | 708 | * - Value |
| @@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following: | |||
| 725 | The ``s_default_mount_opts`` field is any combination of the following: | 725 | The ``s_default_mount_opts`` field is any combination of the following: |
| 726 | 726 | ||
| 727 | .. list-table:: | 727 | .. list-table:: |
| 728 | :widths: 1 79 | 728 | :widths: 8 72 |
| 729 | :header-rows: 1 | 729 | :header-rows: 1 |
| 730 | 730 | ||
| 731 | * - Value | 731 | * - Value |
| @@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following: | |||
| 767 | The ``s_flags`` field is any combination of the following: | 767 | The ``s_flags`` field is any combination of the following: |
| 768 | 768 | ||
| 769 | .. list-table:: | 769 | .. list-table:: |
| 770 | :widths: 1 79 | 770 | :widths: 8 72 |
| 771 | :header-rows: 1 | 771 | :header-rows: 1 |
| 772 | 772 | ||
| 773 | * - Value | 773 | * - Value |
| @@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following: | |||
| 784 | The ``s_encrypt_algos`` list can contain any of the following: | 784 | The ``s_encrypt_algos`` list can contain any of the following: |
| 785 | 785 | ||
| 786 | .. list-table:: | 786 | .. list-table:: |
| 787 | :widths: 1 79 | 787 | :widths: 8 72 |
| 788 | :header-rows: 1 | 788 | :header-rows: 1 |
| 789 | 789 | ||
| 790 | * - Value | 790 | * - Value |
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fb50f9aa6ead..c1d570ee1d9f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c | |||
| @@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) | |||
| 284 | error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, | 284 | error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, |
| 285 | default_acl, XATTR_CREATE); | 285 | default_acl, XATTR_CREATE); |
| 286 | posix_acl_release(default_acl); | 286 | posix_acl_release(default_acl); |
| 287 | } else { | ||
| 288 | inode->i_default_acl = NULL; | ||
| 287 | } | 289 | } |
| 288 | if (acl) { | 290 | if (acl) { |
| 289 | if (!error) | 291 | if (!error) |
| 290 | error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, | 292 | error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, |
| 291 | acl, XATTR_CREATE); | 293 | acl, XATTR_CREATE); |
| 292 | posix_acl_release(acl); | 294 | posix_acl_release(acl); |
| 295 | } else { | ||
| 296 | inode->i_acl = NULL; | ||
| 293 | } | 297 | } |
| 294 | return error; | 298 | return error; |
| 295 | } | 299 | } |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index caff935fbeb8..12f90d48ba61 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
| @@ -628,6 +628,7 @@ enum { | |||
| 628 | #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 | 628 | #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 |
| 629 | #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 | 629 | #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 |
| 630 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 | 630 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 |
| 631 | #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 | ||
| 631 | 632 | ||
| 632 | /* | 633 | /* |
| 633 | * ioctl commands | 634 | * ioctl commands |
| @@ -1030,6 +1031,9 @@ struct ext4_inode_info { | |||
| 1030 | ext4_lblk_t i_da_metadata_calc_last_lblock; | 1031 | ext4_lblk_t i_da_metadata_calc_last_lblock; |
| 1031 | int i_da_metadata_calc_len; | 1032 | int i_da_metadata_calc_len; |
| 1032 | 1033 | ||
| 1034 | /* pending cluster reservations for bigalloc file systems */ | ||
| 1035 | struct ext4_pending_tree i_pending_tree; | ||
| 1036 | |||
| 1033 | /* on-disk additional length */ | 1037 | /* on-disk additional length */ |
| 1034 | __u16 i_extra_isize; | 1038 | __u16 i_extra_isize; |
| 1035 | 1039 | ||
| @@ -1401,7 +1405,8 @@ struct ext4_sb_info { | |||
| 1401 | u32 s_min_batch_time; | 1405 | u32 s_min_batch_time; |
| 1402 | struct block_device *journal_bdev; | 1406 | struct block_device *journal_bdev; |
| 1403 | #ifdef CONFIG_QUOTA | 1407 | #ifdef CONFIG_QUOTA |
| 1404 | char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ | 1408 | /* Names of quota files with journalled quota */ |
| 1409 | char __rcu *s_qf_names[EXT4_MAXQUOTAS]; | ||
| 1405 | int s_jquota_fmt; /* Format of quota to use */ | 1410 | int s_jquota_fmt; /* Format of quota to use */ |
| 1406 | #endif | 1411 | #endif |
| 1407 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ | 1412 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ |
| @@ -2483,10 +2488,11 @@ extern int ext4_writepage_trans_blocks(struct inode *); | |||
| 2483 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); | 2488 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); |
| 2484 | extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, | 2489 | extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, |
| 2485 | loff_t lstart, loff_t lend); | 2490 | loff_t lstart, loff_t lend); |
| 2486 | extern int ext4_page_mkwrite(struct vm_fault *vmf); | 2491 | extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); |
| 2487 | extern int ext4_filemap_fault(struct vm_fault *vmf); | 2492 | extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); |
| 2488 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); | 2493 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); |
| 2489 | extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); | 2494 | extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); |
| 2495 | extern void ext4_da_release_space(struct inode *inode, int to_free); | ||
| 2490 | extern void ext4_da_update_reserve_space(struct inode *inode, | 2496 | extern void ext4_da_update_reserve_space(struct inode *inode, |
| 2491 | int used, int quota_claim); | 2497 | int used, int quota_claim); |
| 2492 | extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, | 2498 | extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, |
| @@ -3142,10 +3148,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, | |||
| 3142 | int flags); | 3148 | int flags); |
| 3143 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); | 3149 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); |
| 3144 | extern int ext4_ext_check_inode(struct inode *inode); | 3150 | extern int ext4_ext_check_inode(struct inode *inode); |
| 3145 | extern int ext4_find_delalloc_range(struct inode *inode, | ||
| 3146 | ext4_lblk_t lblk_start, | ||
| 3147 | ext4_lblk_t lblk_end); | ||
| 3148 | extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); | ||
| 3149 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); | 3151 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); |
| 3150 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 3152 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
| 3151 | __u64 start, __u64 len); | 3153 | __u64 start, __u64 len); |
| @@ -3156,6 +3158,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, | |||
| 3156 | struct inode *inode2, ext4_lblk_t lblk1, | 3158 | struct inode *inode2, ext4_lblk_t lblk1, |
| 3157 | ext4_lblk_t lblk2, ext4_lblk_t count, | 3159 | ext4_lblk_t lblk2, ext4_lblk_t count, |
| 3158 | int mark_unwritten,int *err); | 3160 | int mark_unwritten,int *err); |
| 3161 | extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); | ||
| 3159 | 3162 | ||
| 3160 | /* move_extent.c */ | 3163 | /* move_extent.c */ |
| 3161 | extern void ext4_double_down_write_data_sem(struct inode *first, | 3164 | extern void ext4_double_down_write_data_sem(struct inode *first, |
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index adf6668b596f..98bd0e9ee7df 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
| @@ -120,6 +120,19 @@ struct ext4_ext_path { | |||
| 120 | }; | 120 | }; |
| 121 | 121 | ||
| 122 | /* | 122 | /* |
| 123 | * Used to record a portion of a cluster found at the beginning or end | ||
| 124 | * of an extent while traversing the extent tree during space removal. | ||
| 125 | * A partial cluster may be removed if it does not contain blocks shared | ||
| 126 | * with extents that aren't being deleted (tofree state). Otherwise, | ||
| 127 | * it cannot be removed (nofree state). | ||
| 128 | */ | ||
| 129 | struct partial_cluster { | ||
| 130 | ext4_fsblk_t pclu; /* physical cluster number */ | ||
| 131 | ext4_lblk_t lblk; /* logical block number within logical cluster */ | ||
| 132 | enum {initial, tofree, nofree} state; | ||
| 133 | }; | ||
| 134 | |||
| 135 | /* | ||
| 123 | * structure for external API | 136 | * structure for external API |
| 124 | */ | 137 | */ |
| 125 | 138 | ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72a361d5ef74..240b6dea5441 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
| @@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, | |||
| 2351 | { | 2351 | { |
| 2352 | struct extent_status es; | 2352 | struct extent_status es; |
| 2353 | 2353 | ||
| 2354 | ext4_es_find_delayed_extent_range(inode, hole_start, | 2354 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, |
| 2355 | hole_start + hole_len - 1, &es); | 2355 | hole_start + hole_len - 1, &es); |
| 2356 | if (es.es_len) { | 2356 | if (es.es_len) { |
| 2357 | /* There's delayed extent containing lblock? */ | 2357 | /* There's delayed extent containing lblock? */ |
| 2358 | if (es.es_lblk <= hole_start) | 2358 | if (es.es_lblk <= hole_start) |
| @@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode) | |||
| 2490 | return 0; | 2490 | return 0; |
| 2491 | } | 2491 | } |
| 2492 | 2492 | ||
| 2493 | /* | ||
| 2494 | * ext4_rereserve_cluster - increment the reserved cluster count when | ||
| 2495 | * freeing a cluster with a pending reservation | ||
| 2496 | * | ||
| 2497 | * @inode - file containing the cluster | ||
| 2498 | * @lblk - logical block in cluster to be reserved | ||
| 2499 | * | ||
| 2500 | * Increments the reserved cluster count and adjusts quota in a bigalloc | ||
| 2501 | * file system when freeing a partial cluster containing at least one | ||
| 2502 | * delayed and unwritten block. A partial cluster meeting that | ||
| 2503 | * requirement will have a pending reservation. If so, the | ||
| 2504 | * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to | ||
| 2505 | * defer reserved and allocated space accounting to a subsequent call | ||
| 2506 | * to this function. | ||
| 2507 | */ | ||
| 2508 | static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) | ||
| 2509 | { | ||
| 2510 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 2511 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
| 2512 | |||
| 2513 | dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); | ||
| 2514 | |||
| 2515 | spin_lock(&ei->i_block_reservation_lock); | ||
| 2516 | ei->i_reserved_data_blocks++; | ||
| 2517 | percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); | ||
| 2518 | spin_unlock(&ei->i_block_reservation_lock); | ||
| 2519 | |||
| 2520 | percpu_counter_add(&sbi->s_freeclusters_counter, 1); | ||
| 2521 | ext4_remove_pending(inode, lblk); | ||
| 2522 | } | ||
| 2523 | |||
| 2493 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | 2524 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
| 2494 | struct ext4_extent *ex, | 2525 | struct ext4_extent *ex, |
| 2495 | long long *partial_cluster, | 2526 | struct partial_cluster *partial, |
| 2496 | ext4_lblk_t from, ext4_lblk_t to) | 2527 | ext4_lblk_t from, ext4_lblk_t to) |
| 2497 | { | 2528 | { |
| 2498 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2529 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 2499 | unsigned short ee_len = ext4_ext_get_actual_len(ex); | 2530 | unsigned short ee_len = ext4_ext_get_actual_len(ex); |
| 2500 | ext4_fsblk_t pblk; | 2531 | ext4_fsblk_t last_pblk, pblk; |
| 2501 | int flags = get_default_free_blocks_flags(inode); | 2532 | ext4_lblk_t num; |
| 2533 | int flags; | ||
| 2534 | |||
| 2535 | /* only extent tail removal is allowed */ | ||
| 2536 | if (from < le32_to_cpu(ex->ee_block) || | ||
| 2537 | to != le32_to_cpu(ex->ee_block) + ee_len - 1) { | ||
| 2538 | ext4_error(sbi->s_sb, | ||
| 2539 | "strange request: removal(2) %u-%u from %u:%u", | ||
| 2540 | from, to, le32_to_cpu(ex->ee_block), ee_len); | ||
| 2541 | return 0; | ||
| 2542 | } | ||
| 2543 | |||
| 2544 | #ifdef EXTENTS_STATS | ||
| 2545 | spin_lock(&sbi->s_ext_stats_lock); | ||
| 2546 | sbi->s_ext_blocks += ee_len; | ||
| 2547 | sbi->s_ext_extents++; | ||
| 2548 | if (ee_len < sbi->s_ext_min) | ||
| 2549 | sbi->s_ext_min = ee_len; | ||
| 2550 | if (ee_len > sbi->s_ext_max) | ||
| 2551 | sbi->s_ext_max = ee_len; | ||
| 2552 | if (ext_depth(inode) > sbi->s_depth_max) | ||
| 2553 | sbi->s_depth_max = ext_depth(inode); | ||
| 2554 | spin_unlock(&sbi->s_ext_stats_lock); | ||
| 2555 | #endif | ||
| 2556 | |||
| 2557 | trace_ext4_remove_blocks(inode, ex, from, to, partial); | ||
| 2502 | 2558 | ||
| 2503 | /* | 2559 | /* |
| 2504 | * For bigalloc file systems, we never free a partial cluster | 2560 | * if we have a partial cluster, and it's different from the |
| 2505 | * at the beginning of the extent. Instead, we make a note | 2561 | * cluster of the last block in the extent, we free it |
| 2506 | * that we tried freeing the cluster, and check to see if we | ||
| 2507 | * need to free it on a subsequent call to ext4_remove_blocks, | ||
| 2508 | * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. | ||
| 2509 | */ | 2562 | */ |
| 2510 | flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; | 2563 | last_pblk = ext4_ext_pblock(ex) + ee_len - 1; |
| 2564 | |||
| 2565 | if (partial->state != initial && | ||
| 2566 | partial->pclu != EXT4_B2C(sbi, last_pblk)) { | ||
| 2567 | if (partial->state == tofree) { | ||
| 2568 | flags = get_default_free_blocks_flags(inode); | ||
| 2569 | if (ext4_is_pending(inode, partial->lblk)) | ||
| 2570 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
| 2571 | ext4_free_blocks(handle, inode, NULL, | ||
| 2572 | EXT4_C2B(sbi, partial->pclu), | ||
| 2573 | sbi->s_cluster_ratio, flags); | ||
| 2574 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) | ||
| 2575 | ext4_rereserve_cluster(inode, partial->lblk); | ||
| 2576 | } | ||
| 2577 | partial->state = initial; | ||
| 2578 | } | ||
| 2579 | |||
| 2580 | num = le32_to_cpu(ex->ee_block) + ee_len - from; | ||
| 2581 | pblk = ext4_ext_pblock(ex) + ee_len - num; | ||
| 2511 | 2582 | ||
| 2512 | trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); | ||
| 2513 | /* | 2583 | /* |
| 2514 | * If we have a partial cluster, and it's different from the | 2584 | * We free the partial cluster at the end of the extent (if any), |
| 2515 | * cluster of the last block, we need to explicitly free the | 2585 | * unless the cluster is used by another extent (partial_cluster |
| 2516 | * partial cluster here. | 2586 | * state is nofree). If a partial cluster exists here, it must be |
| 2587 | * shared with the last block in the extent. | ||
| 2517 | */ | 2588 | */ |
| 2518 | pblk = ext4_ext_pblock(ex) + ee_len - 1; | 2589 | flags = get_default_free_blocks_flags(inode); |
| 2519 | if (*partial_cluster > 0 && | 2590 | |
| 2520 | *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { | 2591 | /* partial, left end cluster aligned, right end unaligned */ |
| 2592 | if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && | ||
| 2593 | (EXT4_LBLK_CMASK(sbi, to) >= from) && | ||
| 2594 | (partial->state != nofree)) { | ||
| 2595 | if (ext4_is_pending(inode, to)) | ||
| 2596 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
| 2521 | ext4_free_blocks(handle, inode, NULL, | 2597 | ext4_free_blocks(handle, inode, NULL, |
| 2522 | EXT4_C2B(sbi, *partial_cluster), | 2598 | EXT4_PBLK_CMASK(sbi, last_pblk), |
| 2523 | sbi->s_cluster_ratio, flags); | 2599 | sbi->s_cluster_ratio, flags); |
| 2524 | *partial_cluster = 0; | 2600 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) |
| 2601 | ext4_rereserve_cluster(inode, to); | ||
| 2602 | partial->state = initial; | ||
| 2603 | flags = get_default_free_blocks_flags(inode); | ||
| 2525 | } | 2604 | } |
| 2526 | 2605 | ||
| 2527 | #ifdef EXTENTS_STATS | 2606 | flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; |
| 2528 | { | ||
| 2529 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 2530 | spin_lock(&sbi->s_ext_stats_lock); | ||
| 2531 | sbi->s_ext_blocks += ee_len; | ||
| 2532 | sbi->s_ext_extents++; | ||
| 2533 | if (ee_len < sbi->s_ext_min) | ||
| 2534 | sbi->s_ext_min = ee_len; | ||
| 2535 | if (ee_len > sbi->s_ext_max) | ||
| 2536 | sbi->s_ext_max = ee_len; | ||
| 2537 | if (ext_depth(inode) > sbi->s_depth_max) | ||
| 2538 | sbi->s_depth_max = ext_depth(inode); | ||
| 2539 | spin_unlock(&sbi->s_ext_stats_lock); | ||
| 2540 | } | ||
| 2541 | #endif | ||
| 2542 | if (from >= le32_to_cpu(ex->ee_block) | ||
| 2543 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { | ||
| 2544 | /* tail removal */ | ||
| 2545 | ext4_lblk_t num; | ||
| 2546 | long long first_cluster; | ||
| 2547 | |||
| 2548 | num = le32_to_cpu(ex->ee_block) + ee_len - from; | ||
| 2549 | pblk = ext4_ext_pblock(ex) + ee_len - num; | ||
| 2550 | /* | ||
| 2551 | * Usually we want to free partial cluster at the end of the | ||
| 2552 | * extent, except for the situation when the cluster is still | ||
| 2553 | * used by any other extent (partial_cluster is negative). | ||
| 2554 | */ | ||
| 2555 | if (*partial_cluster < 0 && | ||
| 2556 | *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) | ||
| 2557 | flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; | ||
| 2558 | 2607 | ||
| 2559 | ext_debug("free last %u blocks starting %llu partial %lld\n", | 2608 | /* |
| 2560 | num, pblk, *partial_cluster); | 2609 | * For bigalloc file systems, we never free a partial cluster |
| 2561 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); | 2610 | * at the beginning of the extent. Instead, we check to see if we |
| 2562 | /* | 2611 | * need to free it on a subsequent call to ext4_remove_blocks, |
| 2563 | * If the block range to be freed didn't start at the | 2612 | * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. |
| 2564 | * beginning of a cluster, and we removed the entire | 2613 | */ |
| 2565 | * extent and the cluster is not used by any other extent, | 2614 | flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; |
| 2566 | * save the partial cluster here, since we might need to | 2615 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); |
| 2567 | * delete if we determine that the truncate or punch hole | 2616 | |
| 2568 | * operation has removed all of the blocks in the cluster. | 2617 | /* reset the partial cluster if we've freed past it */ |
| 2569 | * If that cluster is used by another extent, preserve its | 2618 | if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) |
| 2570 | * negative value so it isn't freed later on. | 2619 | partial->state = initial; |
| 2571 | * | 2620 | |
| 2572 | * If the whole extent wasn't freed, we've reached the | 2621 | /* |
| 2573 | * start of the truncated/punched region and have finished | 2622 | * If we've freed the entire extent but the beginning is not left |
| 2574 | * removing blocks. If there's a partial cluster here it's | 2623 | * cluster aligned and is not marked as ineligible for freeing we |
| 2575 | * shared with the remainder of the extent and is no longer | 2624 | * record the partial cluster at the beginning of the extent. It |
| 2576 | * a candidate for removal. | 2625 | * wasn't freed by the preceding ext4_free_blocks() call, and we |
| 2577 | */ | 2626 | * need to look farther to the left to determine if it's to be freed |
| 2578 | if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { | 2627 | * (not shared with another extent). Else, reset the partial |
| 2579 | first_cluster = (long long) EXT4_B2C(sbi, pblk); | 2628 | * cluster - we're either done freeing or the beginning of the |
| 2580 | if (first_cluster != -*partial_cluster) | 2629 | * extent is left cluster aligned. |
| 2581 | *partial_cluster = first_cluster; | 2630 | */ |
| 2582 | } else { | 2631 | if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { |
| 2583 | *partial_cluster = 0; | 2632 | if (partial->state == initial) { |
| 2633 | partial->pclu = EXT4_B2C(sbi, pblk); | ||
| 2634 | partial->lblk = from; | ||
| 2635 | partial->state = tofree; | ||
| 2584 | } | 2636 | } |
| 2585 | } else | 2637 | } else { |
| 2586 | ext4_error(sbi->s_sb, "strange request: removal(2) " | 2638 | partial->state = initial; |
| 2587 | "%u-%u from %u:%u", | 2639 | } |
| 2588 | from, to, le32_to_cpu(ex->ee_block), ee_len); | 2640 | |
| 2589 | return 0; | 2641 | return 0; |
| 2590 | } | 2642 | } |
| 2591 | 2643 | ||
| 2592 | |||
| 2593 | /* | 2644 | /* |
| 2594 | * ext4_ext_rm_leaf() Removes the extents associated with the | 2645 | * ext4_ext_rm_leaf() Removes the extents associated with the |
| 2595 | * blocks appearing between "start" and "end". Both "start" | 2646 | * blocks appearing between "start" and "end". Both "start" |
| @@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
| 2608 | static int | 2659 | static int |
| 2609 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | 2660 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, |
| 2610 | struct ext4_ext_path *path, | 2661 | struct ext4_ext_path *path, |
| 2611 | long long *partial_cluster, | 2662 | struct partial_cluster *partial, |
| 2612 | ext4_lblk_t start, ext4_lblk_t end) | 2663 | ext4_lblk_t start, ext4_lblk_t end) |
| 2613 | { | 2664 | { |
| 2614 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2665 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| @@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2640 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2691 | ex_ee_block = le32_to_cpu(ex->ee_block); |
| 2641 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2692 | ex_ee_len = ext4_ext_get_actual_len(ex); |
| 2642 | 2693 | ||
| 2643 | trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); | 2694 | trace_ext4_ext_rm_leaf(inode, start, ex, partial); |
| 2644 | 2695 | ||
| 2645 | while (ex >= EXT_FIRST_EXTENT(eh) && | 2696 | while (ex >= EXT_FIRST_EXTENT(eh) && |
| 2646 | ex_ee_block + ex_ee_len > start) { | 2697 | ex_ee_block + ex_ee_len > start) { |
| @@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2671 | */ | 2722 | */ |
| 2672 | if (sbi->s_cluster_ratio > 1) { | 2723 | if (sbi->s_cluster_ratio > 1) { |
| 2673 | pblk = ext4_ext_pblock(ex); | 2724 | pblk = ext4_ext_pblock(ex); |
| 2674 | *partial_cluster = | 2725 | partial->pclu = EXT4_B2C(sbi, pblk); |
| 2675 | -(long long) EXT4_B2C(sbi, pblk); | 2726 | partial->state = nofree; |
| 2676 | } | 2727 | } |
| 2677 | ex--; | 2728 | ex--; |
| 2678 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2729 | ex_ee_block = le32_to_cpu(ex->ee_block); |
| @@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2714 | if (err) | 2765 | if (err) |
| 2715 | goto out; | 2766 | goto out; |
| 2716 | 2767 | ||
| 2717 | err = ext4_remove_blocks(handle, inode, ex, partial_cluster, | 2768 | err = ext4_remove_blocks(handle, inode, ex, partial, a, b); |
| 2718 | a, b); | ||
| 2719 | if (err) | 2769 | if (err) |
| 2720 | goto out; | 2770 | goto out; |
| 2721 | 2771 | ||
| @@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
| 2769 | * If there's a partial cluster and at least one extent remains in | 2819 | * If there's a partial cluster and at least one extent remains in |
| 2770 | * the leaf, free the partial cluster if it isn't shared with the | 2820 | * the leaf, free the partial cluster if it isn't shared with the |
| 2771 | * current extent. If it is shared with the current extent | 2821 | * current extent. If it is shared with the current extent |
| 2772 | * we zero partial_cluster because we've reached the start of the | 2822 | * we reset the partial cluster because we've reached the start of the |
| 2773 | * truncated/punched region and we're done removing blocks. | 2823 | * truncated/punched region and we're done removing blocks. |
| 2774 | */ | 2824 | */ |
| 2775 | if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { | 2825 | if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { |
| 2776 | pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; | 2826 | pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; |
| 2777 | if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { | 2827 | if (partial->pclu != EXT4_B2C(sbi, pblk)) { |
| 2828 | int flags = get_default_free_blocks_flags(inode); | ||
| 2829 | |||
| 2830 | if (ext4_is_pending(inode, partial->lblk)) | ||
| 2831 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
| 2778 | ext4_free_blocks(handle, inode, NULL, | 2832 | ext4_free_blocks(handle, inode, NULL, |
| 2779 | EXT4_C2B(sbi, *partial_cluster), | 2833 | EXT4_C2B(sbi, partial->pclu), |
| 2780 | sbi->s_cluster_ratio, | 2834 | sbi->s_cluster_ratio, flags); |
| 2781 | get_default_free_blocks_flags(inode)); | 2835 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) |
| 2836 | ext4_rereserve_cluster(inode, partial->lblk); | ||
| 2782 | } | 2837 | } |
| 2783 | *partial_cluster = 0; | 2838 | partial->state = initial; |
| 2784 | } | 2839 | } |
| 2785 | 2840 | ||
| 2786 | /* if this leaf is free, then we should | 2841 | /* if this leaf is free, then we should |
| @@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | |||
| 2819 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2874 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 2820 | int depth = ext_depth(inode); | 2875 | int depth = ext_depth(inode); |
| 2821 | struct ext4_ext_path *path = NULL; | 2876 | struct ext4_ext_path *path = NULL; |
| 2822 | long long partial_cluster = 0; | 2877 | struct partial_cluster partial; |
| 2823 | handle_t *handle; | 2878 | handle_t *handle; |
| 2824 | int i = 0, err = 0; | 2879 | int i = 0, err = 0; |
| 2825 | 2880 | ||
| 2881 | partial.pclu = 0; | ||
| 2882 | partial.lblk = 0; | ||
| 2883 | partial.state = initial; | ||
| 2884 | |||
| 2826 | ext_debug("truncate since %u to %u\n", start, end); | 2885 | ext_debug("truncate since %u to %u\n", start, end); |
| 2827 | 2886 | ||
| 2828 | /* probably first extent we're gonna free will be last in block */ | 2887 | /* probably first extent we're gonna free will be last in block */ |
| @@ -2882,8 +2941,8 @@ again: | |||
| 2882 | */ | 2941 | */ |
| 2883 | if (sbi->s_cluster_ratio > 1) { | 2942 | if (sbi->s_cluster_ratio > 1) { |
| 2884 | pblk = ext4_ext_pblock(ex) + end - ee_block + 2; | 2943 | pblk = ext4_ext_pblock(ex) + end - ee_block + 2; |
| 2885 | partial_cluster = | 2944 | partial.pclu = EXT4_B2C(sbi, pblk); |
| 2886 | -(long long) EXT4_B2C(sbi, pblk); | 2945 | partial.state = nofree; |
| 2887 | } | 2946 | } |
| 2888 | 2947 | ||
| 2889 | /* | 2948 | /* |
| @@ -2911,9 +2970,10 @@ again: | |||
| 2911 | &ex); | 2970 | &ex); |
| 2912 | if (err) | 2971 | if (err) |
| 2913 | goto out; | 2972 | goto out; |
| 2914 | if (pblk) | 2973 | if (pblk) { |
| 2915 | partial_cluster = | 2974 | partial.pclu = EXT4_B2C(sbi, pblk); |
| 2916 | -(long long) EXT4_B2C(sbi, pblk); | 2975 | partial.state = nofree; |
| 2976 | } | ||
| 2917 | } | 2977 | } |
| 2918 | } | 2978 | } |
| 2919 | /* | 2979 | /* |
| @@ -2948,8 +3008,7 @@ again: | |||
| 2948 | if (i == depth) { | 3008 | if (i == depth) { |
| 2949 | /* this is leaf block */ | 3009 | /* this is leaf block */ |
| 2950 | err = ext4_ext_rm_leaf(handle, inode, path, | 3010 | err = ext4_ext_rm_leaf(handle, inode, path, |
| 2951 | &partial_cluster, start, | 3011 | &partial, start, end); |
| 2952 | end); | ||
| 2953 | /* root level has p_bh == NULL, brelse() eats this */ | 3012 | /* root level has p_bh == NULL, brelse() eats this */ |
| 2954 | brelse(path[i].p_bh); | 3013 | brelse(path[i].p_bh); |
| 2955 | path[i].p_bh = NULL; | 3014 | path[i].p_bh = NULL; |
| @@ -3021,21 +3080,24 @@ again: | |||
| 3021 | } | 3080 | } |
| 3022 | } | 3081 | } |
| 3023 | 3082 | ||
| 3024 | trace_ext4_ext_remove_space_done(inode, start, end, depth, | 3083 | trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, |
| 3025 | partial_cluster, path->p_hdr->eh_entries); | 3084 | path->p_hdr->eh_entries); |
| 3026 | 3085 | ||
| 3027 | /* | 3086 | /* |
| 3028 | * If we still have something in the partial cluster and we have removed | 3087 | * If there's a partial cluster and we have removed the first extent |
| 3029 | * even the first extent, then we should free the blocks in the partial | 3088 | * in the file, then we also free the partial cluster |
| 3030 | * cluster as well. (This code will only run when there are no leaves | ||
| 3031 | * to the immediate left of the truncated/punched region.) | ||
| 3032 | */ | 3089 | */ |
| 3033 | if (partial_cluster > 0 && err == 0) { | 3090 | if (partial.state == tofree && err == 0) { |
| 3034 | /* don't zero partial_cluster since it's not used afterwards */ | 3091 | int flags = get_default_free_blocks_flags(inode); |
| 3092 | |||
| 3093 | if (ext4_is_pending(inode, partial.lblk)) | ||
| 3094 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
| 3035 | ext4_free_blocks(handle, inode, NULL, | 3095 | ext4_free_blocks(handle, inode, NULL, |
| 3036 | EXT4_C2B(sbi, partial_cluster), | 3096 | EXT4_C2B(sbi, partial.pclu), |
| 3037 | sbi->s_cluster_ratio, | 3097 | sbi->s_cluster_ratio, flags); |
| 3038 | get_default_free_blocks_flags(inode)); | 3098 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) |
| 3099 | ext4_rereserve_cluster(inode, partial.lblk); | ||
| 3100 | partial.state = initial; | ||
| 3039 | } | 3101 | } |
| 3040 | 3102 | ||
| 3041 | /* TODO: flexible tree reduction should be here */ | 3103 | /* TODO: flexible tree reduction should be here */ |
| @@ -3819,114 +3881,6 @@ out: | |||
| 3819 | return ext4_mark_inode_dirty(handle, inode); | 3881 | return ext4_mark_inode_dirty(handle, inode); |
| 3820 | } | 3882 | } |
| 3821 | 3883 | ||
| 3822 | /** | ||
| 3823 | * ext4_find_delalloc_range: find delayed allocated block in the given range. | ||
| 3824 | * | ||
| 3825 | * Return 1 if there is a delalloc block in the range, otherwise 0. | ||
| 3826 | */ | ||
| 3827 | int ext4_find_delalloc_range(struct inode *inode, | ||
| 3828 | ext4_lblk_t lblk_start, | ||
| 3829 | ext4_lblk_t lblk_end) | ||
| 3830 | { | ||
| 3831 | struct extent_status es; | ||
| 3832 | |||
| 3833 | ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); | ||
| 3834 | if (es.es_len == 0) | ||
| 3835 | return 0; /* there is no delay extent in this tree */ | ||
| 3836 | else if (es.es_lblk <= lblk_start && | ||
| 3837 | lblk_start < es.es_lblk + es.es_len) | ||
| 3838 | return 1; | ||
| 3839 | else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) | ||
| 3840 | return 1; | ||
| 3841 | else | ||
| 3842 | return 0; | ||
| 3843 | } | ||
| 3844 | |||
| 3845 | int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) | ||
| 3846 | { | ||
| 3847 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 3848 | ext4_lblk_t lblk_start, lblk_end; | ||
| 3849 | lblk_start = EXT4_LBLK_CMASK(sbi, lblk); | ||
| 3850 | lblk_end = lblk_start + sbi->s_cluster_ratio - 1; | ||
| 3851 | |||
| 3852 | return ext4_find_delalloc_range(inode, lblk_start, lblk_end); | ||
| 3853 | } | ||
| 3854 | |||
| 3855 | /** | ||
| 3856 | * Determines how many complete clusters (out of those specified by the 'map') | ||
| 3857 | * are under delalloc and were reserved quota for. | ||
| 3858 | * This function is called when we are writing out the blocks that were | ||
| 3859 | * originally written with their allocation delayed, but then the space was | ||
| 3860 | * allocated using fallocate() before the delayed allocation could be resolved. | ||
| 3861 | * The cases to look for are: | ||
| 3862 | * ('=' indicated delayed allocated blocks | ||
| 3863 | * '-' indicates non-delayed allocated blocks) | ||
| 3864 | * (a) partial clusters towards beginning and/or end outside of allocated range | ||
| 3865 | * are not delalloc'ed. | ||
| 3866 | * Ex: | ||
| 3867 | * |----c---=|====c====|====c====|===-c----| | ||
| 3868 | * |++++++ allocated ++++++| | ||
| 3869 | * ==> 4 complete clusters in above example | ||
| 3870 | * | ||
| 3871 | * (b) partial cluster (outside of allocated range) towards either end is | ||
| 3872 | * marked for delayed allocation. In this case, we will exclude that | ||
| 3873 | * cluster. | ||
| 3874 | * Ex: | ||
| 3875 | * |----====c========|========c========| | ||
| 3876 | * |++++++ allocated ++++++| | ||
| 3877 | * ==> 1 complete clusters in above example | ||
| 3878 | * | ||
| 3879 | * Ex: | ||
| 3880 | * |================c================| | ||
| 3881 | * |++++++ allocated ++++++| | ||
| 3882 | * ==> 0 complete clusters in above example | ||
| 3883 | * | ||
| 3884 | * The ext4_da_update_reserve_space will be called only if we | ||
| 3885 | * determine here that there were some "entire" clusters that span | ||
| 3886 | * this 'allocated' range. | ||
| 3887 | * In the non-bigalloc case, this function will just end up returning num_blks | ||
| 3888 | * without ever calling ext4_find_delalloc_range. | ||
| 3889 | */ | ||
| 3890 | static unsigned int | ||
| 3891 | get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, | ||
| 3892 | unsigned int num_blks) | ||
| 3893 | { | ||
| 3894 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 3895 | ext4_lblk_t alloc_cluster_start, alloc_cluster_end; | ||
| 3896 | ext4_lblk_t lblk_from, lblk_to, c_offset; | ||
| 3897 | unsigned int allocated_clusters = 0; | ||
| 3898 | |||
| 3899 | alloc_cluster_start = EXT4_B2C(sbi, lblk_start); | ||
| 3900 | alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); | ||
| 3901 | |||
| 3902 | /* max possible clusters for this allocation */ | ||
| 3903 | allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; | ||
| 3904 | |||
| 3905 | trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); | ||
| 3906 | |||
| 3907 | /* Check towards left side */ | ||
| 3908 | c_offset = EXT4_LBLK_COFF(sbi, lblk_start); | ||
| 3909 | if (c_offset) { | ||
| 3910 | lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); | ||
| 3911 | lblk_to = lblk_from + c_offset - 1; | ||
| 3912 | |||
| 3913 | if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) | ||
| 3914 | allocated_clusters--; | ||
| 3915 | } | ||
| 3916 | |||
| 3917 | /* Now check towards right. */ | ||
| 3918 | c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); | ||
| 3919 | if (allocated_clusters && c_offset) { | ||
| 3920 | lblk_from = lblk_start + num_blks; | ||
| 3921 | lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; | ||
| 3922 | |||
| 3923 | if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) | ||
| 3924 | allocated_clusters--; | ||
| 3925 | } | ||
| 3926 | |||
| 3927 | return allocated_clusters; | ||
| 3928 | } | ||
| 3929 | |||
| 3930 | static int | 3884 | static int |
| 3931 | convert_initialized_extent(handle_t *handle, struct inode *inode, | 3885 | convert_initialized_extent(handle_t *handle, struct inode *inode, |
| 3932 | struct ext4_map_blocks *map, | 3886 | struct ext4_map_blocks *map, |
| @@ -4108,23 +4062,6 @@ out: | |||
| 4108 | } | 4062 | } |
| 4109 | map->m_len = allocated; | 4063 | map->m_len = allocated; |
| 4110 | 4064 | ||
| 4111 | /* | ||
| 4112 | * If we have done fallocate with the offset that is already | ||
| 4113 | * delayed allocated, we would have block reservation | ||
| 4114 | * and quota reservation done in the delayed write path. | ||
| 4115 | * But fallocate would have already updated quota and block | ||
| 4116 | * count for this offset. So cancel these reservation | ||
| 4117 | */ | ||
| 4118 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { | ||
| 4119 | unsigned int reserved_clusters; | ||
| 4120 | reserved_clusters = get_reserved_cluster_alloc(inode, | ||
| 4121 | map->m_lblk, map->m_len); | ||
| 4122 | if (reserved_clusters) | ||
| 4123 | ext4_da_update_reserve_space(inode, | ||
| 4124 | reserved_clusters, | ||
| 4125 | 0); | ||
| 4126 | } | ||
| 4127 | |||
| 4128 | map_out: | 4065 | map_out: |
| 4129 | map->m_flags |= EXT4_MAP_MAPPED; | 4066 | map->m_flags |= EXT4_MAP_MAPPED; |
| 4130 | if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { | 4067 | if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { |
| @@ -4513,77 +4450,39 @@ got_allocated_blocks: | |||
| 4513 | map->m_flags |= EXT4_MAP_NEW; | 4450 | map->m_flags |= EXT4_MAP_NEW; |
| 4514 | 4451 | ||
| 4515 | /* | 4452 | /* |
| 4516 | * Update reserved blocks/metadata blocks after successful | 4453 | * Reduce the reserved cluster count to reflect successful deferred |
| 4517 | * block allocation which had been deferred till now. | 4454 | * allocation of delayed allocated clusters or direct allocation of |
| 4455 | * clusters discovered to be delayed allocated. Once allocated, a | ||
| 4456 | * cluster is not included in the reserved count. | ||
| 4518 | */ | 4457 | */ |
| 4519 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { | 4458 | if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { |
| 4520 | unsigned int reserved_clusters; | 4459 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { |
| 4521 | /* | ||
| 4522 | * Check how many clusters we had reserved this allocated range | ||
| 4523 | */ | ||
| 4524 | reserved_clusters = get_reserved_cluster_alloc(inode, | ||
| 4525 | map->m_lblk, allocated); | ||
| 4526 | if (!map_from_cluster) { | ||
| 4527 | BUG_ON(allocated_clusters < reserved_clusters); | ||
| 4528 | if (reserved_clusters < allocated_clusters) { | ||
| 4529 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
| 4530 | int reservation = allocated_clusters - | ||
| 4531 | reserved_clusters; | ||
| 4532 | /* | ||
| 4533 | * It seems we claimed few clusters outside of | ||
| 4534 | * the range of this allocation. We should give | ||
| 4535 | * it back to the reservation pool. This can | ||
| 4536 | * happen in the following case: | ||
| 4537 | * | ||
| 4538 | * * Suppose s_cluster_ratio is 4 (i.e., each | ||
| 4539 | * cluster has 4 blocks. Thus, the clusters | ||
| 4540 | * are [0-3],[4-7],[8-11]... | ||
| 4541 | * * First comes delayed allocation write for | ||
| 4542 | * logical blocks 10 & 11. Since there were no | ||
| 4543 | * previous delayed allocated blocks in the | ||
| 4544 | * range [8-11], we would reserve 1 cluster | ||
| 4545 | * for this write. | ||
| 4546 | * * Next comes write for logical blocks 3 to 8. | ||
| 4547 | * In this case, we will reserve 2 clusters | ||
| 4548 | * (for [0-3] and [4-7]; and not for [8-11] as | ||
| 4549 | * that range has a delayed allocated blocks. | ||
| 4550 | * Thus total reserved clusters now becomes 3. | ||
| 4551 | * * Now, during the delayed allocation writeout | ||
| 4552 | * time, we will first write blocks [3-8] and | ||
| 4553 | * allocate 3 clusters for writing these | ||
| 4554 | * blocks. Also, we would claim all these | ||
| 4555 | * three clusters above. | ||
| 4556 | * * Now when we come here to writeout the | ||
| 4557 | * blocks [10-11], we would expect to claim | ||
| 4558 | * the reservation of 1 cluster we had made | ||
| 4559 | * (and we would claim it since there are no | ||
| 4560 | * more delayed allocated blocks in the range | ||
| 4561 | * [8-11]. But our reserved cluster count had | ||
| 4562 | * already gone to 0. | ||
| 4563 | * | ||
| 4564 | * Thus, at the step 4 above when we determine | ||
| 4565 | * that there are still some unwritten delayed | ||
| 4566 | * allocated blocks outside of our current | ||
| 4567 | * block range, we should increment the | ||
| 4568 | * reserved clusters count so that when the | ||
| 4569 | * remaining blocks finally gets written, we | ||
| 4570 | * could claim them. | ||
| 4571 | */ | ||
| 4572 | dquot_reserve_block(inode, | ||
| 4573 | EXT4_C2B(sbi, reservation)); | ||
| 4574 | spin_lock(&ei->i_block_reservation_lock); | ||
| 4575 | ei->i_reserved_data_blocks += reservation; | ||
| 4576 | spin_unlock(&ei->i_block_reservation_lock); | ||
| 4577 | } | ||
| 4578 | /* | 4460 | /* |
| 4579 | * We will claim quota for all newly allocated blocks. | 4461 | * When allocating delayed allocated clusters, simply |
| 4580 | * We're updating the reserved space *after* the | 4462 | * reduce the reserved cluster count and claim quota |
| 4581 | * correction above so we do not accidentally free | ||
| 4582 | * all the metadata reservation because we might | ||
| 4583 | * actually need it later on. | ||
| 4584 | */ | 4463 | */ |
| 4585 | ext4_da_update_reserve_space(inode, allocated_clusters, | 4464 | ext4_da_update_reserve_space(inode, allocated_clusters, |
| 4586 | 1); | 4465 | 1); |
| 4466 | } else { | ||
| 4467 | ext4_lblk_t lblk, len; | ||
| 4468 | unsigned int n; | ||
| 4469 | |||
| 4470 | /* | ||
| 4471 | * When allocating non-delayed allocated clusters | ||
| 4472 | * (from fallocate, filemap, DIO, or clusters | ||
| 4473 | * allocated when delalloc has been disabled by | ||
| 4474 | * ext4_nonda_switch), reduce the reserved cluster | ||
| 4475 | * count by the number of allocated clusters that | ||
| 4476 | * have previously been delayed allocated. Quota | ||
| 4477 | * has been claimed by ext4_mb_new_blocks() above, | ||
| 4478 | * so release the quota reservations made for any | ||
| 4479 | * previously delayed allocated clusters. | ||
| 4480 | */ | ||
| 4481 | lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); | ||
| 4482 | len = allocated_clusters << sbi->s_cluster_bits; | ||
| 4483 | n = ext4_es_delayed_clu(inode, lblk, len); | ||
| 4484 | if (n > 0) | ||
| 4485 | ext4_da_update_reserve_space(inode, (int) n, 0); | ||
| 4587 | } | 4486 | } |
| 4588 | } | 4487 | } |
| 4589 | 4488 | ||
| @@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode, | |||
| 5075 | ext4_lblk_t block, next_del; | 4974 | ext4_lblk_t block, next_del; |
| 5076 | 4975 | ||
| 5077 | if (newes->es_pblk == 0) { | 4976 | if (newes->es_pblk == 0) { |
| 5078 | ext4_es_find_delayed_extent_range(inode, newes->es_lblk, | 4977 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, |
| 5079 | newes->es_lblk + newes->es_len - 1, &es); | 4978 | newes->es_lblk, |
| 4979 | newes->es_lblk + newes->es_len - 1, | ||
| 4980 | &es); | ||
| 5080 | 4981 | ||
| 5081 | /* | 4982 | /* |
| 5082 | * No extent in extent-tree contains block @newes->es_pblk, | 4983 | * No extent in extent-tree contains block @newes->es_pblk, |
| @@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode, | |||
| 5097 | } | 4998 | } |
| 5098 | 4999 | ||
| 5099 | block = newes->es_lblk + newes->es_len; | 5000 | block = newes->es_lblk + newes->es_len; |
| 5100 | ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); | 5001 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, |
| 5002 | EXT_MAX_BLOCKS, &es); | ||
| 5101 | if (es.es_len == 0) | 5003 | if (es.es_len == 0) |
| 5102 | next_del = EXT_MAX_BLOCKS; | 5004 | next_del = EXT_MAX_BLOCKS; |
| 5103 | else | 5005 | else |
| @@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, | |||
| 5958 | } | 5860 | } |
| 5959 | return replaced_count; | 5861 | return replaced_count; |
| 5960 | } | 5862 | } |
| 5863 | |||
| 5864 | /* | ||
| 5865 | * ext4_clu_mapped - determine whether any block in a logical cluster has | ||
| 5866 | * been mapped to a physical cluster | ||
| 5867 | * | ||
| 5868 | * @inode - file containing the logical cluster | ||
| 5869 | * @lclu - logical cluster of interest | ||
| 5870 | * | ||
| 5871 | * Returns 1 if any block in the logical cluster is mapped, signifying | ||
| 5872 | * that a physical cluster has been allocated for it. Otherwise, | ||
| 5873 | * returns 0. Can also return negative error codes. Derived from | ||
| 5874 | * ext4_ext_map_blocks(). | ||
| 5875 | */ | ||
| 5876 | int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) | ||
| 5877 | { | ||
| 5878 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 5879 | struct ext4_ext_path *path; | ||
| 5880 | int depth, mapped = 0, err = 0; | ||
| 5881 | struct ext4_extent *extent; | ||
| 5882 | ext4_lblk_t first_lblk, first_lclu, last_lclu; | ||
| 5883 | |||
| 5884 | /* search for the extent closest to the first block in the cluster */ | ||
| 5885 | path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); | ||
| 5886 | if (IS_ERR(path)) { | ||
| 5887 | err = PTR_ERR(path); | ||
| 5888 | path = NULL; | ||
| 5889 | goto out; | ||
| 5890 | } | ||
| 5891 | |||
| 5892 | depth = ext_depth(inode); | ||
| 5893 | |||
| 5894 | /* | ||
| 5895 | * A consistent leaf must not be empty. This situation is possible, | ||
| 5896 | * though, _during_ tree modification, and it's why an assert can't | ||
| 5897 | * be put in ext4_find_extent(). | ||
| 5898 | */ | ||
| 5899 | if (unlikely(path[depth].p_ext == NULL && depth != 0)) { | ||
| 5900 | EXT4_ERROR_INODE(inode, | ||
| 5901 | "bad extent address - lblock: %lu, depth: %d, pblock: %lld", | ||
| 5902 | (unsigned long) EXT4_C2B(sbi, lclu), | ||
| 5903 | depth, path[depth].p_block); | ||
| 5904 | err = -EFSCORRUPTED; | ||
| 5905 | goto out; | ||
| 5906 | } | ||
| 5907 | |||
| 5908 | extent = path[depth].p_ext; | ||
| 5909 | |||
| 5910 | /* can't be mapped if the extent tree is empty */ | ||
| 5911 | if (extent == NULL) | ||
| 5912 | goto out; | ||
| 5913 | |||
| 5914 | first_lblk = le32_to_cpu(extent->ee_block); | ||
| 5915 | first_lclu = EXT4_B2C(sbi, first_lblk); | ||
| 5916 | |||
| 5917 | /* | ||
| 5918 | * Three possible outcomes at this point - found extent spanning | ||
| 5919 | * the target cluster, to the left of the target cluster, or to the | ||
| 5920 | * right of the target cluster. The first two cases are handled here. | ||
| 5921 | * The last case indicates the target cluster is not mapped. | ||
| 5922 | */ | ||
| 5923 | if (lclu >= first_lclu) { | ||
| 5924 | last_lclu = EXT4_B2C(sbi, first_lblk + | ||
| 5925 | ext4_ext_get_actual_len(extent) - 1); | ||
| 5926 | if (lclu <= last_lclu) { | ||
| 5927 | mapped = 1; | ||
| 5928 | } else { | ||
| 5929 | first_lblk = ext4_ext_next_allocated_block(path); | ||
| 5930 | first_lclu = EXT4_B2C(sbi, first_lblk); | ||
| 5931 | if (lclu == first_lclu) | ||
| 5932 | mapped = 1; | ||
| 5933 | } | ||
| 5934 | } | ||
| 5935 | |||
| 5936 | out: | ||
| 5937 | ext4_ext_drop_refs(path); | ||
| 5938 | kfree(path); | ||
| 5939 | |||
| 5940 | return err ? err : mapped; | ||
| 5941 | } | ||
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c4e6fb15101b..2b439afafe13 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c | |||
| @@ -142,6 +142,7 @@ | |||
| 142 | */ | 142 | */ |
| 143 | 143 | ||
| 144 | static struct kmem_cache *ext4_es_cachep; | 144 | static struct kmem_cache *ext4_es_cachep; |
| 145 | static struct kmem_cache *ext4_pending_cachep; | ||
| 145 | 146 | ||
| 146 | static int __es_insert_extent(struct inode *inode, struct extent_status *newes); | 147 | static int __es_insert_extent(struct inode *inode, struct extent_status *newes); |
| 147 | static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, | 148 | static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, |
| @@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, | |||
| 149 | static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); | 150 | static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); |
| 150 | static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, | 151 | static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, |
| 151 | struct ext4_inode_info *locked_ei); | 152 | struct ext4_inode_info *locked_ei); |
| 153 | static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, | ||
| 154 | ext4_lblk_t len); | ||
| 152 | 155 | ||
| 153 | int __init ext4_init_es(void) | 156 | int __init ext4_init_es(void) |
| 154 | { | 157 | { |
| @@ -233,30 +236,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root, | |||
| 233 | } | 236 | } |
| 234 | 237 | ||
| 235 | /* | 238 | /* |
| 236 | * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering | 239 | * ext4_es_find_extent_range - find extent with specified status within block |
| 237 | * @es->lblk if it exists, otherwise, the next extent after @es->lblk. | 240 | * range or next extent following block range in |
| 241 | * extents status tree | ||
| 238 | * | 242 | * |
| 239 | * @inode: the inode which owns delayed extents | 243 | * @inode - file containing the range |
| 240 | * @lblk: the offset where we start to search | 244 | * @matching_fn - pointer to function that matches extents with desired status |
| 241 | * @end: the offset where we stop to search | 245 | * @lblk - logical block defining start of range |
| 242 | * @es: delayed extent that we found | 246 | * @end - logical block defining end of range |
| 247 | * @es - extent found, if any | ||
| 248 | * | ||
| 249 | * Find the first extent within the block range specified by @lblk and @end | ||
| 250 | * in the extents status tree that satisfies @matching_fn. If a match | ||
| 251 | * is found, it's returned in @es. If not, and a matching extent is found | ||
| 252 | * beyond the block range, it's returned in @es. If no match is found, an | ||
| 253 | * extent is returned in @es whose es_lblk, es_len, and es_pblk components | ||
| 254 | * are 0. | ||
| 243 | */ | 255 | */ |
| 244 | void ext4_es_find_delayed_extent_range(struct inode *inode, | 256 | static void __es_find_extent_range(struct inode *inode, |
| 245 | ext4_lblk_t lblk, ext4_lblk_t end, | 257 | int (*matching_fn)(struct extent_status *es), |
| 246 | struct extent_status *es) | 258 | ext4_lblk_t lblk, ext4_lblk_t end, |
| 259 | struct extent_status *es) | ||
| 247 | { | 260 | { |
| 248 | struct ext4_es_tree *tree = NULL; | 261 | struct ext4_es_tree *tree = NULL; |
| 249 | struct extent_status *es1 = NULL; | 262 | struct extent_status *es1 = NULL; |
| 250 | struct rb_node *node; | 263 | struct rb_node *node; |
| 251 | 264 | ||
| 252 | BUG_ON(es == NULL); | 265 | WARN_ON(es == NULL); |
| 253 | BUG_ON(end < lblk); | 266 | WARN_ON(end < lblk); |
| 254 | trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); | ||
| 255 | 267 | ||
| 256 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
| 257 | tree = &EXT4_I(inode)->i_es_tree; | 268 | tree = &EXT4_I(inode)->i_es_tree; |
| 258 | 269 | ||
| 259 | /* find extent in cache firstly */ | 270 | /* see if the extent has been cached */ |
| 260 | es->es_lblk = es->es_len = es->es_pblk = 0; | 271 | es->es_lblk = es->es_len = es->es_pblk = 0; |
| 261 | if (tree->cache_es) { | 272 | if (tree->cache_es) { |
| 262 | es1 = tree->cache_es; | 273 | es1 = tree->cache_es; |
| @@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, | |||
| 271 | es1 = __es_tree_search(&tree->root, lblk); | 282 | es1 = __es_tree_search(&tree->root, lblk); |
| 272 | 283 | ||
| 273 | out: | 284 | out: |
| 274 | if (es1 && !ext4_es_is_delayed(es1)) { | 285 | if (es1 && !matching_fn(es1)) { |
| 275 | while ((node = rb_next(&es1->rb_node)) != NULL) { | 286 | while ((node = rb_next(&es1->rb_node)) != NULL) { |
| 276 | es1 = rb_entry(node, struct extent_status, rb_node); | 287 | es1 = rb_entry(node, struct extent_status, rb_node); |
| 277 | if (es1->es_lblk > end) { | 288 | if (es1->es_lblk > end) { |
| 278 | es1 = NULL; | 289 | es1 = NULL; |
| 279 | break; | 290 | break; |
| 280 | } | 291 | } |
| 281 | if (ext4_es_is_delayed(es1)) | 292 | if (matching_fn(es1)) |
| 282 | break; | 293 | break; |
| 283 | } | 294 | } |
| 284 | } | 295 | } |
| 285 | 296 | ||
| 286 | if (es1 && ext4_es_is_delayed(es1)) { | 297 | if (es1 && matching_fn(es1)) { |
| 287 | tree->cache_es = es1; | 298 | tree->cache_es = es1; |
| 288 | es->es_lblk = es1->es_lblk; | 299 | es->es_lblk = es1->es_lblk; |
| 289 | es->es_len = es1->es_len; | 300 | es->es_len = es1->es_len; |
| 290 | es->es_pblk = es1->es_pblk; | 301 | es->es_pblk = es1->es_pblk; |
| 291 | } | 302 | } |
| 292 | 303 | ||
| 304 | } | ||
| 305 | |||
| 306 | /* | ||
| 307 | * Locking for __es_find_extent_range() for external use | ||
| 308 | */ | ||
| 309 | void ext4_es_find_extent_range(struct inode *inode, | ||
| 310 | int (*matching_fn)(struct extent_status *es), | ||
| 311 | ext4_lblk_t lblk, ext4_lblk_t end, | ||
| 312 | struct extent_status *es) | ||
| 313 | { | ||
| 314 | trace_ext4_es_find_extent_range_enter(inode, lblk); | ||
| 315 | |||
| 316 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
| 317 | __es_find_extent_range(inode, matching_fn, lblk, end, es); | ||
| 318 | read_unlock(&EXT4_I(inode)->i_es_lock); | ||
| 319 | |||
| 320 | trace_ext4_es_find_extent_range_exit(inode, es); | ||
| 321 | } | ||
| 322 | |||
| 323 | /* | ||
| 324 | * __es_scan_range - search block range for block with specified status | ||
| 325 | * in extents status tree | ||
| 326 | * | ||
| 327 | * @inode - file containing the range | ||
| 328 | * @matching_fn - pointer to function that matches extents with desired status | ||
| 329 | * @start - logical block defining start of range | ||
| 330 | * @end - logical block defining end of range | ||
| 331 | * | ||
| 332 | * Returns true if at least one block in the specified block range satisfies | ||
| 333 | * the criterion specified by @matching_fn, and false if not. If at least | ||
| 334 | * one extent has the specified status, then there is at least one block | ||
| 335 | * in the range with that status. Should only be called by code that has | ||
| 336 | * taken i_es_lock. | ||
| 337 | */ | ||
| 338 | static bool __es_scan_range(struct inode *inode, | ||
| 339 | int (*matching_fn)(struct extent_status *es), | ||
| 340 | ext4_lblk_t start, ext4_lblk_t end) | ||
| 341 | { | ||
| 342 | struct extent_status es; | ||
| 343 | |||
| 344 | __es_find_extent_range(inode, matching_fn, start, end, &es); | ||
| 345 | if (es.es_len == 0) | ||
| 346 | return false; /* no matching extent in the tree */ | ||
| 347 | else if (es.es_lblk <= start && | ||
| 348 | start < es.es_lblk + es.es_len) | ||
| 349 | return true; | ||
| 350 | else if (start <= es.es_lblk && es.es_lblk <= end) | ||
| 351 | return true; | ||
| 352 | else | ||
| 353 | return false; | ||
| 354 | } | ||
| 355 | /* | ||
| 356 | * Locking for __es_scan_range() for external use | ||
| 357 | */ | ||
| 358 | bool ext4_es_scan_range(struct inode *inode, | ||
| 359 | int (*matching_fn)(struct extent_status *es), | ||
| 360 | ext4_lblk_t lblk, ext4_lblk_t end) | ||
| 361 | { | ||
| 362 | bool ret; | ||
| 363 | |||
| 364 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
| 365 | ret = __es_scan_range(inode, matching_fn, lblk, end); | ||
| 293 | read_unlock(&EXT4_I(inode)->i_es_lock); | 366 | read_unlock(&EXT4_I(inode)->i_es_lock); |
| 294 | 367 | ||
| 295 | trace_ext4_es_find_delayed_extent_range_exit(inode, es); | 368 | return ret; |
| 369 | } | ||
| 370 | |||
| 371 | /* | ||
| 372 | * __es_scan_clu - search cluster for block with specified status in | ||
| 373 | * extents status tree | ||
| 374 | * | ||
| 375 | * @inode - file containing the cluster | ||
| 376 | * @matching_fn - pointer to function that matches extents with desired status | ||
| 377 | * @lblk - logical block in cluster to be searched | ||
| 378 | * | ||
| 379 | * Returns true if at least one extent in the cluster containing @lblk | ||
| 380 | * satisfies the criterion specified by @matching_fn, and false if not. If at | ||
| 381 | * least one extent has the specified status, then there is at least one block | ||
| 382 | * in the cluster with that status. Should only be called by code that has | ||
| 383 | * taken i_es_lock. | ||
| 384 | */ | ||
| 385 | static bool __es_scan_clu(struct inode *inode, | ||
| 386 | int (*matching_fn)(struct extent_status *es), | ||
| 387 | ext4_lblk_t lblk) | ||
| 388 | { | ||
| 389 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 390 | ext4_lblk_t lblk_start, lblk_end; | ||
| 391 | |||
| 392 | lblk_start = EXT4_LBLK_CMASK(sbi, lblk); | ||
| 393 | lblk_end = lblk_start + sbi->s_cluster_ratio - 1; | ||
| 394 | |||
| 395 | return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); | ||
| 396 | } | ||
| 397 | |||
| 398 | /* | ||
| 399 | * Locking for __es_scan_clu() for external use | ||
| 400 | */ | ||
| 401 | bool ext4_es_scan_clu(struct inode *inode, | ||
| 402 | int (*matching_fn)(struct extent_status *es), | ||
| 403 | ext4_lblk_t lblk) | ||
| 404 | { | ||
| 405 | bool ret; | ||
| 406 | |||
| 407 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
| 408 | ret = __es_scan_clu(inode, matching_fn, lblk); | ||
| 409 | read_unlock(&EXT4_I(inode)->i_es_lock); | ||
| 410 | |||
| 411 | return ret; | ||
| 296 | } | 412 | } |
| 297 | 413 | ||
| 298 | static void ext4_es_list_add(struct inode *inode) | 414 | static void ext4_es_list_add(struct inode *inode) |
| @@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, | |||
| 694 | struct extent_status newes; | 810 | struct extent_status newes; |
| 695 | ext4_lblk_t end = lblk + len - 1; | 811 | ext4_lblk_t end = lblk + len - 1; |
| 696 | int err = 0; | 812 | int err = 0; |
| 813 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 697 | 814 | ||
| 698 | es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", | 815 | es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", |
| 699 | lblk, len, pblk, status, inode->i_ino); | 816 | lblk, len, pblk, status, inode->i_ino); |
| @@ -730,6 +847,11 @@ retry: | |||
| 730 | if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) | 847 | if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) |
| 731 | err = 0; | 848 | err = 0; |
| 732 | 849 | ||
| 850 | if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && | ||
| 851 | (status & EXTENT_STATUS_WRITTEN || | ||
| 852 | status & EXTENT_STATUS_UNWRITTEN)) | ||
| 853 | __revise_pending(inode, lblk, len); | ||
| 854 | |||
| 733 | error: | 855 | error: |
| 734 | write_unlock(&EXT4_I(inode)->i_es_lock); | 856 | write_unlock(&EXT4_I(inode)->i_es_lock); |
| 735 | 857 | ||
| @@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) | |||
| 1252 | ei->i_es_tree.cache_es = NULL; | 1374 | ei->i_es_tree.cache_es = NULL; |
| 1253 | return nr_shrunk; | 1375 | return nr_shrunk; |
| 1254 | } | 1376 | } |
| 1377 | |||
| 1378 | #ifdef ES_DEBUG__ | ||
| 1379 | static void ext4_print_pending_tree(struct inode *inode) | ||
| 1380 | { | ||
| 1381 | struct ext4_pending_tree *tree; | ||
| 1382 | struct rb_node *node; | ||
| 1383 | struct pending_reservation *pr; | ||
| 1384 | |||
| 1385 | printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); | ||
| 1386 | tree = &EXT4_I(inode)->i_pending_tree; | ||
| 1387 | node = rb_first(&tree->root); | ||
| 1388 | while (node) { | ||
| 1389 | pr = rb_entry(node, struct pending_reservation, rb_node); | ||
| 1390 | printk(KERN_DEBUG " %u", pr->lclu); | ||
| 1391 | node = rb_next(node); | ||
| 1392 | } | ||
| 1393 | printk(KERN_DEBUG "\n"); | ||
| 1394 | } | ||
| 1395 | #else | ||
| 1396 | #define ext4_print_pending_tree(inode) | ||
| 1397 | #endif | ||
| 1398 | |||
| 1399 | int __init ext4_init_pending(void) | ||
| 1400 | { | ||
| 1401 | ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", | ||
| 1402 | sizeof(struct pending_reservation), | ||
| 1403 | 0, (SLAB_RECLAIM_ACCOUNT), NULL); | ||
| 1404 | if (ext4_pending_cachep == NULL) | ||
| 1405 | return -ENOMEM; | ||
| 1406 | return 0; | ||
| 1407 | } | ||
| 1408 | |||
| 1409 | void ext4_exit_pending(void) | ||
| 1410 | { | ||
| 1411 | kmem_cache_destroy(ext4_pending_cachep); | ||
| 1412 | } | ||
| 1413 | |||
| 1414 | void ext4_init_pending_tree(struct ext4_pending_tree *tree) | ||
| 1415 | { | ||
| 1416 | tree->root = RB_ROOT; | ||
| 1417 | } | ||
| 1418 | |||
| 1419 | /* | ||
| 1420 | * __get_pending - retrieve a pointer to a pending reservation | ||
| 1421 | * | ||
| 1422 | * @inode - file containing the pending cluster reservation | ||
| 1423 | * @lclu - logical cluster of interest | ||
| 1424 | * | ||
| 1425 | * Returns a pointer to a pending reservation if it's a member of | ||
| 1426 | * the set, and NULL if not. Must be called holding i_es_lock. | ||
| 1427 | */ | ||
| 1428 | static struct pending_reservation *__get_pending(struct inode *inode, | ||
| 1429 | ext4_lblk_t lclu) | ||
| 1430 | { | ||
| 1431 | struct ext4_pending_tree *tree; | ||
| 1432 | struct rb_node *node; | ||
| 1433 | struct pending_reservation *pr = NULL; | ||
| 1434 | |||
| 1435 | tree = &EXT4_I(inode)->i_pending_tree; | ||
| 1436 | node = (&tree->root)->rb_node; | ||
| 1437 | |||
| 1438 | while (node) { | ||
| 1439 | pr = rb_entry(node, struct pending_reservation, rb_node); | ||
| 1440 | if (lclu < pr->lclu) | ||
| 1441 | node = node->rb_left; | ||
| 1442 | else if (lclu > pr->lclu) | ||
| 1443 | node = node->rb_right; | ||
| 1444 | else if (lclu == pr->lclu) | ||
| 1445 | return pr; | ||
| 1446 | } | ||
| 1447 | return NULL; | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | /* | ||
| 1451 | * __insert_pending - adds a pending cluster reservation to the set of | ||
| 1452 | * pending reservations | ||
| 1453 | * | ||
| 1454 | * @inode - file containing the cluster | ||
| 1455 | * @lblk - logical block in the cluster to be added | ||
| 1456 | * | ||
| 1457 | * Returns 0 on successful insertion and -ENOMEM on failure. If the | ||
| 1458 | * pending reservation is already in the set, returns successfully. | ||
| 1459 | */ | ||
| 1460 | static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) | ||
| 1461 | { | ||
| 1462 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1463 | struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; | ||
| 1464 | struct rb_node **p = &tree->root.rb_node; | ||
| 1465 | struct rb_node *parent = NULL; | ||
| 1466 | struct pending_reservation *pr; | ||
| 1467 | ext4_lblk_t lclu; | ||
| 1468 | int ret = 0; | ||
| 1469 | |||
| 1470 | lclu = EXT4_B2C(sbi, lblk); | ||
| 1471 | /* search to find parent for insertion */ | ||
| 1472 | while (*p) { | ||
| 1473 | parent = *p; | ||
| 1474 | pr = rb_entry(parent, struct pending_reservation, rb_node); | ||
| 1475 | |||
| 1476 | if (lclu < pr->lclu) { | ||
| 1477 | p = &(*p)->rb_left; | ||
| 1478 | } else if (lclu > pr->lclu) { | ||
| 1479 | p = &(*p)->rb_right; | ||
| 1480 | } else { | ||
| 1481 | /* pending reservation already inserted */ | ||
| 1482 | goto out; | ||
| 1483 | } | ||
| 1484 | } | ||
| 1485 | |||
| 1486 | pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); | ||
| 1487 | if (pr == NULL) { | ||
| 1488 | ret = -ENOMEM; | ||
| 1489 | goto out; | ||
| 1490 | } | ||
| 1491 | pr->lclu = lclu; | ||
| 1492 | |||
| 1493 | rb_link_node(&pr->rb_node, parent, p); | ||
| 1494 | rb_insert_color(&pr->rb_node, &tree->root); | ||
| 1495 | |||
| 1496 | out: | ||
| 1497 | return ret; | ||
| 1498 | } | ||
| 1499 | |||
| 1500 | /* | ||
| 1501 | * __remove_pending - removes a pending cluster reservation from the set | ||
| 1502 | * of pending reservations | ||
| 1503 | * | ||
| 1504 | * @inode - file containing the cluster | ||
| 1505 | * @lblk - logical block in the pending cluster reservation to be removed | ||
| 1506 | * | ||
| 1507 | * Returns successfully if pending reservation is not a member of the set. | ||
| 1508 | */ | ||
| 1509 | static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) | ||
| 1510 | { | ||
| 1511 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1512 | struct pending_reservation *pr; | ||
| 1513 | struct ext4_pending_tree *tree; | ||
| 1514 | |||
| 1515 | pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); | ||
| 1516 | if (pr != NULL) { | ||
| 1517 | tree = &EXT4_I(inode)->i_pending_tree; | ||
| 1518 | rb_erase(&pr->rb_node, &tree->root); | ||
| 1519 | kmem_cache_free(ext4_pending_cachep, pr); | ||
| 1520 | } | ||
| 1521 | } | ||
| 1522 | |||
| 1523 | /* | ||
| 1524 | * ext4_remove_pending - removes a pending cluster reservation from the set | ||
| 1525 | * of pending reservations | ||
| 1526 | * | ||
| 1527 | * @inode - file containing the cluster | ||
| 1528 | * @lblk - logical block in the pending cluster reservation to be removed | ||
| 1529 | * | ||
| 1530 | * Locking for external use of __remove_pending. | ||
| 1531 | */ | ||
| 1532 | void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) | ||
| 1533 | { | ||
| 1534 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
| 1535 | |||
| 1536 | write_lock(&ei->i_es_lock); | ||
| 1537 | __remove_pending(inode, lblk); | ||
| 1538 | write_unlock(&ei->i_es_lock); | ||
| 1539 | } | ||
| 1540 | |||
| 1541 | /* | ||
| 1542 | * ext4_is_pending - determine whether a cluster has a pending reservation | ||
| 1543 | * on it | ||
| 1544 | * | ||
| 1545 | * @inode - file containing the cluster | ||
| 1546 | * @lblk - logical block in the cluster | ||
| 1547 | * | ||
| 1548 | * Returns true if there's a pending reservation for the cluster in the | ||
| 1549 | * set of pending reservations, and false if not. | ||
| 1550 | */ | ||
| 1551 | bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) | ||
| 1552 | { | ||
| 1553 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1554 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
| 1555 | bool ret; | ||
| 1556 | |||
| 1557 | read_lock(&ei->i_es_lock); | ||
| 1558 | ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); | ||
| 1559 | read_unlock(&ei->i_es_lock); | ||
| 1560 | |||
| 1561 | return ret; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | /* | ||
| 1565 | * ext4_es_insert_delayed_block - adds a delayed block to the extents status | ||
| 1566 | * tree, adding a pending reservation where | ||
| 1567 | * needed | ||
| 1568 | * | ||
| 1569 | * @inode - file containing the newly added block | ||
| 1570 | * @lblk - logical block to be added | ||
| 1571 | * @allocated - indicates whether a physical cluster has been allocated for | ||
| 1572 | * the logical cluster that contains the block | ||
| 1573 | * | ||
| 1574 | * Returns 0 on success, negative error code on failure. | ||
| 1575 | */ | ||
| 1576 | int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, | ||
| 1577 | bool allocated) | ||
| 1578 | { | ||
| 1579 | struct extent_status newes; | ||
| 1580 | int err = 0; | ||
| 1581 | |||
| 1582 | es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", | ||
| 1583 | lblk, inode->i_ino); | ||
| 1584 | |||
| 1585 | newes.es_lblk = lblk; | ||
| 1586 | newes.es_len = 1; | ||
| 1587 | ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); | ||
| 1588 | trace_ext4_es_insert_delayed_block(inode, &newes, allocated); | ||
| 1589 | |||
| 1590 | ext4_es_insert_extent_check(inode, &newes); | ||
| 1591 | |||
| 1592 | write_lock(&EXT4_I(inode)->i_es_lock); | ||
| 1593 | |||
| 1594 | err = __es_remove_extent(inode, lblk, lblk); | ||
| 1595 | if (err != 0) | ||
| 1596 | goto error; | ||
| 1597 | retry: | ||
| 1598 | err = __es_insert_extent(inode, &newes); | ||
| 1599 | if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), | ||
| 1600 | 128, EXT4_I(inode))) | ||
| 1601 | goto retry; | ||
| 1602 | if (err != 0) | ||
| 1603 | goto error; | ||
| 1604 | |||
| 1605 | if (allocated) | ||
| 1606 | __insert_pending(inode, lblk); | ||
| 1607 | |||
| 1608 | error: | ||
| 1609 | write_unlock(&EXT4_I(inode)->i_es_lock); | ||
| 1610 | |||
| 1611 | ext4_es_print_tree(inode); | ||
| 1612 | ext4_print_pending_tree(inode); | ||
| 1613 | |||
| 1614 | return err; | ||
| 1615 | } | ||
| 1616 | |||
| 1617 | /* | ||
| 1618 | * __es_delayed_clu - count number of clusters containing blocks that | ||
| 1619 | * are delayed only | ||
| 1620 | * | ||
| 1621 | * @inode - file containing block range | ||
| 1622 | * @start - logical block defining start of range | ||
| 1623 | * @end - logical block defining end of range | ||
| 1624 | * | ||
| 1625 | * Returns the number of clusters containing only delayed (not delayed | ||
| 1626 | * and unwritten) blocks in the range specified by @start and @end. Any | ||
| 1627 | * cluster or part of a cluster within the range and containing a delayed | ||
| 1628 | * and not unwritten block within the range is counted as a whole cluster. | ||
| 1629 | */ | ||
| 1630 | static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, | ||
| 1631 | ext4_lblk_t end) | ||
| 1632 | { | ||
| 1633 | struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; | ||
| 1634 | struct extent_status *es; | ||
| 1635 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1636 | struct rb_node *node; | ||
| 1637 | ext4_lblk_t first_lclu, last_lclu; | ||
| 1638 | unsigned long long last_counted_lclu; | ||
| 1639 | unsigned int n = 0; | ||
| 1640 | |||
| 1641 | /* guaranteed to be unequal to any ext4_lblk_t value */ | ||
| 1642 | last_counted_lclu = ~0ULL; | ||
| 1643 | |||
| 1644 | es = __es_tree_search(&tree->root, start); | ||
| 1645 | |||
| 1646 | while (es && (es->es_lblk <= end)) { | ||
| 1647 | if (ext4_es_is_delonly(es)) { | ||
| 1648 | if (es->es_lblk <= start) | ||
| 1649 | first_lclu = EXT4_B2C(sbi, start); | ||
| 1650 | else | ||
| 1651 | first_lclu = EXT4_B2C(sbi, es->es_lblk); | ||
| 1652 | |||
| 1653 | if (ext4_es_end(es) >= end) | ||
| 1654 | last_lclu = EXT4_B2C(sbi, end); | ||
| 1655 | else | ||
| 1656 | last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); | ||
| 1657 | |||
| 1658 | if (first_lclu == last_counted_lclu) | ||
| 1659 | n += last_lclu - first_lclu; | ||
| 1660 | else | ||
| 1661 | n += last_lclu - first_lclu + 1; | ||
| 1662 | last_counted_lclu = last_lclu; | ||
| 1663 | } | ||
| 1664 | node = rb_next(&es->rb_node); | ||
| 1665 | if (!node) | ||
| 1666 | break; | ||
| 1667 | es = rb_entry(node, struct extent_status, rb_node); | ||
| 1668 | } | ||
| 1669 | |||
| 1670 | return n; | ||
| 1671 | } | ||
| 1672 | |||
| 1673 | /* | ||
| 1674 | * ext4_es_delayed_clu - count number of clusters containing blocks that | ||
| 1675 | * are delayed only | ||
| 1676 | * | ||
| 1677 | * @inode - file containing block range | ||
| 1678 | * @lblk - logical block defining start of range | ||
| 1679 | * @len - number of blocks in range | ||
| 1680 | * | ||
| 1681 | * Locking for external use of __es_delayed_clu(). | ||
| 1682 | */ | ||
| 1683 | unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, | ||
| 1684 | ext4_lblk_t len) | ||
| 1685 | { | ||
| 1686 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
| 1687 | ext4_lblk_t end; | ||
| 1688 | unsigned int n; | ||
| 1689 | |||
| 1690 | if (len == 0) | ||
| 1691 | return 0; | ||
| 1692 | |||
| 1693 | end = lblk + len - 1; | ||
| 1694 | WARN_ON(end < lblk); | ||
| 1695 | |||
| 1696 | read_lock(&ei->i_es_lock); | ||
| 1697 | |||
| 1698 | n = __es_delayed_clu(inode, lblk, end); | ||
| 1699 | |||
| 1700 | read_unlock(&ei->i_es_lock); | ||
| 1701 | |||
| 1702 | return n; | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | /* | ||
| 1706 | * __revise_pending - makes, cancels, or leaves unchanged pending cluster | ||
| 1707 | * reservations for a specified block range depending | ||
| 1708 | * upon the presence or absence of delayed blocks | ||
| 1709 | * outside the range within clusters at the ends of the | ||
| 1710 | * range | ||
| 1711 | * | ||
| 1712 | * @inode - file containing the range | ||
| 1713 | * @lblk - logical block defining the start of range | ||
| 1714 | * @len - length of range in blocks | ||
| 1715 | * | ||
| 1716 | * Used after a newly allocated extent is added to the extents status tree. | ||
| 1717 | * Requires that the extents in the range have either written or unwritten | ||
| 1718 | * status. Must be called while holding i_es_lock. | ||
| 1719 | */ | ||
| 1720 | static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, | ||
| 1721 | ext4_lblk_t len) | ||
| 1722 | { | ||
| 1723 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1724 | ext4_lblk_t end = lblk + len - 1; | ||
| 1725 | ext4_lblk_t first, last; | ||
| 1726 | bool f_del = false, l_del = false; | ||
| 1727 | |||
| 1728 | if (len == 0) | ||
| 1729 | return; | ||
| 1730 | |||
| 1731 | /* | ||
| 1732 | * Two cases - block range within single cluster and block range | ||
| 1733 | * spanning two or more clusters. Note that a cluster belonging | ||
| 1734 | * to a range starting and/or ending on a cluster boundary is treated | ||
| 1735 | * as if it does not contain a delayed extent. The new range may | ||
| 1736 | * have allocated space for previously delayed blocks out to the | ||
| 1737 | * cluster boundary, requiring that any pre-existing pending | ||
| 1738 | * reservation be canceled. Because this code only looks at blocks | ||
| 1739 | * outside the range, it should revise pending reservations | ||
| 1740 | * correctly even if the extent represented by the range can't be | ||
| 1741 | * inserted in the extents status tree due to ENOSPC. | ||
| 1742 | */ | ||
| 1743 | |||
| 1744 | if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { | ||
| 1745 | first = EXT4_LBLK_CMASK(sbi, lblk); | ||
| 1746 | if (first != lblk) | ||
| 1747 | f_del = __es_scan_range(inode, &ext4_es_is_delonly, | ||
| 1748 | first, lblk - 1); | ||
| 1749 | if (f_del) { | ||
| 1750 | __insert_pending(inode, first); | ||
| 1751 | } else { | ||
| 1752 | last = EXT4_LBLK_CMASK(sbi, end) + | ||
| 1753 | sbi->s_cluster_ratio - 1; | ||
| 1754 | if (last != end) | ||
| 1755 | l_del = __es_scan_range(inode, | ||
| 1756 | &ext4_es_is_delonly, | ||
| 1757 | end + 1, last); | ||
| 1758 | if (l_del) | ||
| 1759 | __insert_pending(inode, last); | ||
| 1760 | else | ||
| 1761 | __remove_pending(inode, last); | ||
| 1762 | } | ||
| 1763 | } else { | ||
| 1764 | first = EXT4_LBLK_CMASK(sbi, lblk); | ||
| 1765 | if (first != lblk) | ||
| 1766 | f_del = __es_scan_range(inode, &ext4_es_is_delonly, | ||
| 1767 | first, lblk - 1); | ||
| 1768 | if (f_del) | ||
| 1769 | __insert_pending(inode, first); | ||
| 1770 | else | ||
| 1771 | __remove_pending(inode, first); | ||
| 1772 | |||
| 1773 | last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; | ||
| 1774 | if (last != end) | ||
| 1775 | l_del = __es_scan_range(inode, &ext4_es_is_delonly, | ||
| 1776 | end + 1, last); | ||
| 1777 | if (l_del) | ||
| 1778 | __insert_pending(inode, last); | ||
| 1779 | else | ||
| 1780 | __remove_pending(inode, last); | ||
| 1781 | } | ||
| 1782 | } | ||
| 1783 | |||
| 1784 | /* | ||
| 1785 | * ext4_es_remove_blks - remove block range from extents status tree and | ||
| 1786 | * reduce reservation count or cancel pending | ||
| 1787 | * reservation as needed | ||
| 1788 | * | ||
| 1789 | * @inode - file containing range | ||
| 1790 | * @lblk - first block in range | ||
| 1791 | * @len - number of blocks to remove | ||
| 1792 | * | ||
| 1793 | */ | ||
| 1794 | void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, | ||
| 1795 | ext4_lblk_t len) | ||
| 1796 | { | ||
| 1797 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1798 | unsigned int clu_size, reserved = 0; | ||
| 1799 | ext4_lblk_t last_lclu, first, length, remainder, last; | ||
| 1800 | bool delonly; | ||
| 1801 | int err = 0; | ||
| 1802 | struct pending_reservation *pr; | ||
| 1803 | struct ext4_pending_tree *tree; | ||
| 1804 | |||
| 1805 | /* | ||
| 1806 | * Process cluster by cluster for bigalloc - there may be up to | ||
| 1807 | * two clusters in a 4k page with a 1k block size and two blocks | ||
| 1808 | * per cluster. Also necessary for systems with larger page sizes | ||
| 1809 | * and potentially larger block sizes. | ||
| 1810 | */ | ||
| 1811 | clu_size = sbi->s_cluster_ratio; | ||
| 1812 | last_lclu = EXT4_B2C(sbi, lblk + len - 1); | ||
| 1813 | |||
| 1814 | write_lock(&EXT4_I(inode)->i_es_lock); | ||
| 1815 | |||
| 1816 | for (first = lblk, remainder = len; | ||
| 1817 | remainder > 0; | ||
| 1818 | first += length, remainder -= length) { | ||
| 1819 | |||
| 1820 | if (EXT4_B2C(sbi, first) == last_lclu) | ||
| 1821 | length = remainder; | ||
| 1822 | else | ||
| 1823 | length = clu_size - EXT4_LBLK_COFF(sbi, first); | ||
| 1824 | |||
| 1825 | /* | ||
| 1826 | * The BH_Delay flag, which triggers calls to this function, | ||
| 1827 | * and the contents of the extents status tree can be | ||
| 1828 | * inconsistent due to writepages activity. So, note whether | ||
| 1829 | * the blocks to be removed actually belong to an extent with | ||
| 1830 | * delayed only status. | ||
| 1831 | */ | ||
| 1832 | delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); | ||
| 1833 | |||
| 1834 | /* | ||
| 1835 | * because of the writepages effect, written and unwritten | ||
| 1836 | * blocks could be removed here | ||
| 1837 | */ | ||
| 1838 | last = first + length - 1; | ||
| 1839 | err = __es_remove_extent(inode, first, last); | ||
| 1840 | if (err) | ||
| 1841 | ext4_warning(inode->i_sb, | ||
| 1842 | "%s: couldn't remove page (err = %d)", | ||
| 1843 | __func__, err); | ||
| 1844 | |||
| 1845 | /* non-bigalloc case: simply count the cluster for release */ | ||
| 1846 | if (sbi->s_cluster_ratio == 1 && delonly) { | ||
| 1847 | reserved++; | ||
| 1848 | continue; | ||
| 1849 | } | ||
| 1850 | |||
| 1851 | /* | ||
| 1852 | * bigalloc case: if all delayed allocated only blocks have | ||
| 1853 | * just been removed from a cluster, either cancel a pending | ||
| 1854 | * reservation if it exists or count a cluster for release | ||
| 1855 | */ | ||
| 1856 | if (delonly && | ||
| 1857 | !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { | ||
| 1858 | pr = __get_pending(inode, EXT4_B2C(sbi, first)); | ||
| 1859 | if (pr != NULL) { | ||
| 1860 | tree = &EXT4_I(inode)->i_pending_tree; | ||
| 1861 | rb_erase(&pr->rb_node, &tree->root); | ||
| 1862 | kmem_cache_free(ext4_pending_cachep, pr); | ||
| 1863 | } else { | ||
| 1864 | reserved++; | ||
| 1865 | } | ||
| 1866 | } | ||
| 1867 | } | ||
| 1868 | |||
| 1869 | write_unlock(&EXT4_I(inode)->i_es_lock); | ||
| 1870 | |||
| 1871 | ext4_da_release_space(inode, reserved); | ||
| 1872 | } | ||
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8efdeb903d6b..131a8b7df265 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h | |||
| @@ -78,6 +78,51 @@ struct ext4_es_stats { | |||
| 78 | struct percpu_counter es_stats_shk_cnt; | 78 | struct percpu_counter es_stats_shk_cnt; |
| 79 | }; | 79 | }; |
| 80 | 80 | ||
| 81 | /* | ||
| 82 | * Pending cluster reservations for bigalloc file systems | ||
| 83 | * | ||
| 84 | * A cluster with a pending reservation is a logical cluster shared by at | ||
| 85 | * least one extent in the extents status tree with delayed and not unwritten | ||
| 86 | * status and at least one other written or unwritten extent. The | ||
| 87 | * reservation is said to be pending because a cluster reservation would | ||
| 88 | * have to be taken in the event all blocks in the cluster shared with | ||
| 89 | * written or unwritten extents were deleted while the delayed and | ||
| 90 | * not unwritten blocks remained. | ||
| 91 | * | ||
| 92 | * The set of pending cluster reservations is an auxiliary data structure | ||
| 93 | * used with the extents status tree to implement reserved cluster/block | ||
| 94 | * accounting for bigalloc file systems. The set is kept in memory and | ||
| 95 | * records all pending cluster reservations. | ||
| 96 | * | ||
| 97 | * Its primary function is to avoid the need to read extents from the | ||
| 98 | * disk when invalidating pages as a result of a truncate, punch hole, or | ||
| 99 | * collapse range operation. Page invalidation requires a decrease in the | ||
| 100 | * reserved cluster count if it results in the removal of all delayed | ||
| 101 | * and not unwritten extents (blocks) from a cluster that is not shared with a | ||
| 102 | * written or unwritten extent, and no decrease otherwise. Determining | ||
| 103 | * whether the cluster is shared can be done by searching for a pending | ||
| 104 | * reservation on it. | ||
| 105 | * | ||
| 106 | * Secondarily, it provides a potentially faster method for determining | ||
| 107 | * whether the reserved cluster count should be increased when a physical | ||
| 108 | * cluster is deallocated as a result of a truncate, punch hole, or | ||
| 109 | * collapse range operation. The necessary information is also present | ||
| 110 | * in the extents status tree, but might be more rapidly accessed in | ||
| 111 | * the pending reservation set in many cases due to smaller size. | ||
| 112 | * | ||
| 113 | * The pending cluster reservation set is implemented as a red-black tree | ||
| 114 | * with the goal of minimizing per page search time overhead. | ||
| 115 | */ | ||
| 116 | |||
| 117 | struct pending_reservation { | ||
| 118 | struct rb_node rb_node; | ||
| 119 | ext4_lblk_t lclu; | ||
| 120 | }; | ||
| 121 | |||
| 122 | struct ext4_pending_tree { | ||
| 123 | struct rb_root root; | ||
| 124 | }; | ||
| 125 | |||
| 81 | extern int __init ext4_init_es(void); | 126 | extern int __init ext4_init_es(void); |
| 82 | extern void ext4_exit_es(void); | 127 | extern void ext4_exit_es(void); |
| 83 | extern void ext4_es_init_tree(struct ext4_es_tree *tree); | 128 | extern void ext4_es_init_tree(struct ext4_es_tree *tree); |
| @@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, | |||
| 90 | unsigned int status); | 135 | unsigned int status); |
| 91 | extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, | 136 | extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, |
| 92 | ext4_lblk_t len); | 137 | ext4_lblk_t len); |
| 93 | extern void ext4_es_find_delayed_extent_range(struct inode *inode, | 138 | extern void ext4_es_find_extent_range(struct inode *inode, |
| 94 | ext4_lblk_t lblk, ext4_lblk_t end, | 139 | int (*match_fn)(struct extent_status *es), |
| 95 | struct extent_status *es); | 140 | ext4_lblk_t lblk, ext4_lblk_t end, |
| 141 | struct extent_status *es); | ||
| 96 | extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, | 142 | extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, |
| 97 | struct extent_status *es); | 143 | struct extent_status *es); |
| 144 | extern bool ext4_es_scan_range(struct inode *inode, | ||
| 145 | int (*matching_fn)(struct extent_status *es), | ||
| 146 | ext4_lblk_t lblk, ext4_lblk_t end); | ||
| 147 | extern bool ext4_es_scan_clu(struct inode *inode, | ||
| 148 | int (*matching_fn)(struct extent_status *es), | ||
| 149 | ext4_lblk_t lblk); | ||
| 98 | 150 | ||
| 99 | static inline unsigned int ext4_es_status(struct extent_status *es) | 151 | static inline unsigned int ext4_es_status(struct extent_status *es) |
| 100 | { | 152 | { |
| @@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es) | |||
| 126 | return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; | 178 | return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; |
| 127 | } | 179 | } |
| 128 | 180 | ||
| 181 | static inline int ext4_es_is_mapped(struct extent_status *es) | ||
| 182 | { | ||
| 183 | return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); | ||
| 184 | } | ||
| 185 | |||
| 186 | static inline int ext4_es_is_delonly(struct extent_status *es) | ||
| 187 | { | ||
| 188 | return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); | ||
| 189 | } | ||
| 190 | |||
| 129 | static inline void ext4_es_set_referenced(struct extent_status *es) | 191 | static inline void ext4_es_set_referenced(struct extent_status *es) |
| 130 | { | 192 | { |
| 131 | es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; | 193 | es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; |
| @@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); | |||
| 175 | 237 | ||
| 176 | extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); | 238 | extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); |
| 177 | 239 | ||
| 240 | extern int __init ext4_init_pending(void); | ||
| 241 | extern void ext4_exit_pending(void); | ||
| 242 | extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); | ||
| 243 | extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); | ||
| 244 | extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); | ||
| 245 | extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, | ||
| 246 | bool allocated); | ||
| 247 | extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, | ||
| 248 | ext4_lblk_t len); | ||
| 249 | extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, | ||
| 250 | ext4_lblk_t len); | ||
| 251 | |||
| 178 | #endif /* _EXT4_EXTENTS_STATUS_H */ | 252 | #endif /* _EXT4_EXTENTS_STATUS_H */ |
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7b4736022761..9c4bac18cc6c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
| @@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, | |||
| 863 | handle_t *handle; | 863 | handle_t *handle; |
| 864 | struct page *page; | 864 | struct page *page; |
| 865 | struct ext4_iloc iloc; | 865 | struct ext4_iloc iloc; |
| 866 | int retries; | 866 | int retries = 0; |
| 867 | 867 | ||
| 868 | ret = ext4_get_inode_loc(inode, &iloc); | 868 | ret = ext4_get_inode_loc(inode, &iloc); |
| 869 | if (ret) | 869 | if (ret) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d767e993591d..c3d9a42c561e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
| 577 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; | 577 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
| 578 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && | 578 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
| 579 | !(status & EXTENT_STATUS_WRITTEN) && | 579 | !(status & EXTENT_STATUS_WRITTEN) && |
| 580 | ext4_find_delalloc_range(inode, map->m_lblk, | 580 | ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, |
| 581 | map->m_lblk + map->m_len - 1)) | 581 | map->m_lblk + map->m_len - 1)) |
| 582 | status |= EXTENT_STATUS_DELAYED; | 582 | status |= EXTENT_STATUS_DELAYED; |
| 583 | ret = ext4_es_insert_extent(inode, map->m_lblk, | 583 | ret = ext4_es_insert_extent(inode, map->m_lblk, |
| 584 | map->m_len, map->m_pblk, status); | 584 | map->m_len, map->m_pblk, status); |
| @@ -701,8 +701,8 @@ found: | |||
| 701 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; | 701 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
| 702 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && | 702 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
| 703 | !(status & EXTENT_STATUS_WRITTEN) && | 703 | !(status & EXTENT_STATUS_WRITTEN) && |
| 704 | ext4_find_delalloc_range(inode, map->m_lblk, | 704 | ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, |
| 705 | map->m_lblk + map->m_len - 1)) | 705 | map->m_lblk + map->m_len - 1)) |
| 706 | status |= EXTENT_STATUS_DELAYED; | 706 | status |= EXTENT_STATUS_DELAYED; |
| 707 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, | 707 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, |
| 708 | map->m_pblk, status); | 708 | map->m_pblk, status); |
| @@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode) | |||
| 1595 | return 0; /* success */ | 1595 | return 0; /* success */ |
| 1596 | } | 1596 | } |
| 1597 | 1597 | ||
| 1598 | static void ext4_da_release_space(struct inode *inode, int to_free) | 1598 | void ext4_da_release_space(struct inode *inode, int to_free) |
| 1599 | { | 1599 | { |
| 1600 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1600 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
| 1601 | struct ext4_inode_info *ei = EXT4_I(inode); | 1601 | struct ext4_inode_info *ei = EXT4_I(inode); |
| @@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
| 1634 | unsigned int offset, | 1634 | unsigned int offset, |
| 1635 | unsigned int length) | 1635 | unsigned int length) |
| 1636 | { | 1636 | { |
| 1637 | int to_release = 0, contiguous_blks = 0; | 1637 | int contiguous_blks = 0; |
| 1638 | struct buffer_head *head, *bh; | 1638 | struct buffer_head *head, *bh; |
| 1639 | unsigned int curr_off = 0; | 1639 | unsigned int curr_off = 0; |
| 1640 | struct inode *inode = page->mapping->host; | 1640 | struct inode *inode = page->mapping->host; |
| 1641 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1642 | unsigned int stop = offset + length; | 1641 | unsigned int stop = offset + length; |
| 1643 | int num_clusters; | ||
| 1644 | ext4_fsblk_t lblk; | 1642 | ext4_fsblk_t lblk; |
| 1645 | 1643 | ||
| 1646 | BUG_ON(stop > PAGE_SIZE || stop < length); | 1644 | BUG_ON(stop > PAGE_SIZE || stop < length); |
| @@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
| 1654 | break; | 1652 | break; |
| 1655 | 1653 | ||
| 1656 | if ((offset <= curr_off) && (buffer_delay(bh))) { | 1654 | if ((offset <= curr_off) && (buffer_delay(bh))) { |
| 1657 | to_release++; | ||
| 1658 | contiguous_blks++; | 1655 | contiguous_blks++; |
| 1659 | clear_buffer_delay(bh); | 1656 | clear_buffer_delay(bh); |
| 1660 | } else if (contiguous_blks) { | 1657 | } else if (contiguous_blks) { |
| @@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
| 1662 | (PAGE_SHIFT - inode->i_blkbits); | 1659 | (PAGE_SHIFT - inode->i_blkbits); |
| 1663 | lblk += (curr_off >> inode->i_blkbits) - | 1660 | lblk += (curr_off >> inode->i_blkbits) - |
| 1664 | contiguous_blks; | 1661 | contiguous_blks; |
| 1665 | ext4_es_remove_extent(inode, lblk, contiguous_blks); | 1662 | ext4_es_remove_blks(inode, lblk, contiguous_blks); |
| 1666 | contiguous_blks = 0; | 1663 | contiguous_blks = 0; |
| 1667 | } | 1664 | } |
| 1668 | curr_off = next_off; | 1665 | curr_off = next_off; |
| @@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
| 1671 | if (contiguous_blks) { | 1668 | if (contiguous_blks) { |
| 1672 | lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); | 1669 | lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); |
| 1673 | lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; | 1670 | lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; |
| 1674 | ext4_es_remove_extent(inode, lblk, contiguous_blks); | 1671 | ext4_es_remove_blks(inode, lblk, contiguous_blks); |
| 1675 | } | 1672 | } |
| 1676 | 1673 | ||
| 1677 | /* If we have released all the blocks belonging to a cluster, then we | ||
| 1678 | * need to release the reserved space for that cluster. */ | ||
| 1679 | num_clusters = EXT4_NUM_B2C(sbi, to_release); | ||
| 1680 | while (num_clusters > 0) { | ||
| 1681 | lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + | ||
| 1682 | ((num_clusters - 1) << sbi->s_cluster_bits); | ||
| 1683 | if (sbi->s_cluster_ratio == 1 || | ||
| 1684 | !ext4_find_delalloc_cluster(inode, lblk)) | ||
| 1685 | ext4_da_release_space(inode, 1); | ||
| 1686 | |||
| 1687 | num_clusters--; | ||
| 1688 | } | ||
| 1689 | } | 1674 | } |
| 1690 | 1675 | ||
| 1691 | /* | 1676 | /* |
| @@ -1781,6 +1766,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | |||
| 1781 | } | 1766 | } |
| 1782 | 1767 | ||
| 1783 | /* | 1768 | /* |
| 1769 | * ext4_insert_delayed_block - adds a delayed block to the extents status | ||
| 1770 | * tree, incrementing the reserved cluster/block | ||
| 1771 | * count or making a pending reservation | ||
| 1772 | * where needed | ||
| 1773 | * | ||
| 1774 | * @inode - file containing the newly added block | ||
| 1775 | * @lblk - logical block to be added | ||
| 1776 | * | ||
| 1777 | * Returns 0 on success, negative error code on failure. | ||
| 1778 | */ | ||
| 1779 | static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) | ||
| 1780 | { | ||
| 1781 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 1782 | int ret; | ||
| 1783 | bool allocated = false; | ||
| 1784 | |||
| 1785 | /* | ||
| 1786 | * If the cluster containing lblk is shared with a delayed, | ||
| 1787 | * written, or unwritten extent in a bigalloc file system, it's | ||
| 1788 | * already been accounted for and does not need to be reserved. | ||
| 1789 | * A pending reservation must be made for the cluster if it's | ||
| 1790 | * shared with a written or unwritten extent and doesn't already | ||
| 1791 | * have one. Written and unwritten extents can be purged from the | ||
| 1792 | * extents status tree if the system is under memory pressure, so | ||
| 1793 | * it's necessary to examine the extent tree if a search of the | ||
| 1794 | * extents status tree doesn't get a match. | ||
| 1795 | */ | ||
| 1796 | if (sbi->s_cluster_ratio == 1) { | ||
| 1797 | ret = ext4_da_reserve_space(inode); | ||
| 1798 | if (ret != 0) /* ENOSPC */ | ||
| 1799 | goto errout; | ||
| 1800 | } else { /* bigalloc */ | ||
| 1801 | if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { | ||
| 1802 | if (!ext4_es_scan_clu(inode, | ||
| 1803 | &ext4_es_is_mapped, lblk)) { | ||
| 1804 | ret = ext4_clu_mapped(inode, | ||
| 1805 | EXT4_B2C(sbi, lblk)); | ||
| 1806 | if (ret < 0) | ||
| 1807 | goto errout; | ||
| 1808 | if (ret == 0) { | ||
| 1809 | ret = ext4_da_reserve_space(inode); | ||
| 1810 | if (ret != 0) /* ENOSPC */ | ||
| 1811 | goto errout; | ||
| 1812 | } else { | ||
| 1813 | allocated = true; | ||
| 1814 | } | ||
| 1815 | } else { | ||
| 1816 | allocated = true; | ||
| 1817 | } | ||
| 1818 | } | ||
| 1819 | } | ||
| 1820 | |||
| 1821 | ret = ext4_es_insert_delayed_block(inode, lblk, allocated); | ||
| 1822 | |||
| 1823 | errout: | ||
| 1824 | return ret; | ||
| 1825 | } | ||
| 1826 | |||
| 1827 | /* | ||
| 1784 | * This function is grabs code from the very beginning of | 1828 | * This function is grabs code from the very beginning of |
| 1785 | * ext4_map_blocks, but assumes that the caller is from delayed write | 1829 | * ext4_map_blocks, but assumes that the caller is from delayed write |
| 1786 | * time. This function looks up the requested blocks and sets the | 1830 | * time. This function looks up the requested blocks and sets the |
| @@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, | |||
| 1859 | add_delayed: | 1903 | add_delayed: |
| 1860 | if (retval == 0) { | 1904 | if (retval == 0) { |
| 1861 | int ret; | 1905 | int ret; |
| 1906 | |||
| 1862 | /* | 1907 | /* |
| 1863 | * XXX: __block_prepare_write() unmaps passed block, | 1908 | * XXX: __block_prepare_write() unmaps passed block, |
| 1864 | * is it OK? | 1909 | * is it OK? |
| 1865 | */ | 1910 | */ |
| 1866 | /* | ||
| 1867 | * If the block was allocated from previously allocated cluster, | ||
| 1868 | * then we don't need to reserve it again. However we still need | ||
| 1869 | * to reserve metadata for every block we're going to write. | ||
| 1870 | */ | ||
| 1871 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || | ||
| 1872 | !ext4_find_delalloc_cluster(inode, map->m_lblk)) { | ||
| 1873 | ret = ext4_da_reserve_space(inode); | ||
| 1874 | if (ret) { | ||
| 1875 | /* not enough space to reserve */ | ||
| 1876 | retval = ret; | ||
| 1877 | goto out_unlock; | ||
| 1878 | } | ||
| 1879 | } | ||
| 1880 | 1911 | ||
| 1881 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, | 1912 | ret = ext4_insert_delayed_block(inode, map->m_lblk); |
| 1882 | ~0, EXTENT_STATUS_DELAYED); | 1913 | if (ret != 0) { |
| 1883 | if (ret) { | ||
| 1884 | retval = ret; | 1914 | retval = ret; |
| 1885 | goto out_unlock; | 1915 | goto out_unlock; |
| 1886 | } | 1916 | } |
| @@ -3450,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, | |||
| 3450 | ext4_lblk_t end = map.m_lblk + map.m_len - 1; | 3480 | ext4_lblk_t end = map.m_lblk + map.m_len - 1; |
| 3451 | struct extent_status es; | 3481 | struct extent_status es; |
| 3452 | 3482 | ||
| 3453 | ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); | 3483 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, |
| 3484 | map.m_lblk, end, &es); | ||
| 3454 | 3485 | ||
| 3455 | if (!es.es_len || es.es_lblk > end) { | 3486 | if (!es.es_len || es.es_lblk > end) { |
| 3456 | /* entire range is a hole */ | 3487 | /* entire range is a hole */ |
| @@ -6153,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) | |||
| 6153 | return !buffer_mapped(bh); | 6184 | return !buffer_mapped(bh); |
| 6154 | } | 6185 | } |
| 6155 | 6186 | ||
| 6156 | int ext4_page_mkwrite(struct vm_fault *vmf) | 6187 | vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) |
| 6157 | { | 6188 | { |
| 6158 | struct vm_area_struct *vma = vmf->vma; | 6189 | struct vm_area_struct *vma = vmf->vma; |
| 6159 | struct page *page = vmf->page; | 6190 | struct page *page = vmf->page; |
| 6160 | loff_t size; | 6191 | loff_t size; |
| 6161 | unsigned long len; | 6192 | unsigned long len; |
| 6162 | int ret; | 6193 | int err; |
| 6194 | vm_fault_t ret; | ||
| 6163 | struct file *file = vma->vm_file; | 6195 | struct file *file = vma->vm_file; |
| 6164 | struct inode *inode = file_inode(file); | 6196 | struct inode *inode = file_inode(file); |
| 6165 | struct address_space *mapping = inode->i_mapping; | 6197 | struct address_space *mapping = inode->i_mapping; |
| @@ -6172,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf) | |||
| 6172 | 6204 | ||
| 6173 | down_read(&EXT4_I(inode)->i_mmap_sem); | 6205 | down_read(&EXT4_I(inode)->i_mmap_sem); |
| 6174 | 6206 | ||
| 6175 | ret = ext4_convert_inline_data(inode); | 6207 | err = ext4_convert_inline_data(inode); |
| 6176 | if (ret) | 6208 | if (err) |
| 6177 | goto out_ret; | 6209 | goto out_ret; |
| 6178 | 6210 | ||
| 6179 | /* Delalloc case is easy... */ | 6211 | /* Delalloc case is easy... */ |
| @@ -6181,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf) | |||
| 6181 | !ext4_should_journal_data(inode) && | 6213 | !ext4_should_journal_data(inode) && |
| 6182 | !ext4_nonda_switch(inode->i_sb)) { | 6214 | !ext4_nonda_switch(inode->i_sb)) { |
| 6183 | do { | 6215 | do { |
| 6184 | ret = block_page_mkwrite(vma, vmf, | 6216 | err = block_page_mkwrite(vma, vmf, |
| 6185 | ext4_da_get_block_prep); | 6217 | ext4_da_get_block_prep); |
| 6186 | } while (ret == -ENOSPC && | 6218 | } while (err == -ENOSPC && |
| 6187 | ext4_should_retry_alloc(inode->i_sb, &retries)); | 6219 | ext4_should_retry_alloc(inode->i_sb, &retries)); |
| 6188 | goto out_ret; | 6220 | goto out_ret; |
| 6189 | } | 6221 | } |
| @@ -6228,8 +6260,8 @@ retry_alloc: | |||
| 6228 | ret = VM_FAULT_SIGBUS; | 6260 | ret = VM_FAULT_SIGBUS; |
| 6229 | goto out; | 6261 | goto out; |
| 6230 | } | 6262 | } |
| 6231 | ret = block_page_mkwrite(vma, vmf, get_block); | 6263 | err = block_page_mkwrite(vma, vmf, get_block); |
| 6232 | if (!ret && ext4_should_journal_data(inode)) { | 6264 | if (!err && ext4_should_journal_data(inode)) { |
| 6233 | if (ext4_walk_page_buffers(handle, page_buffers(page), 0, | 6265 | if (ext4_walk_page_buffers(handle, page_buffers(page), 0, |
| 6234 | PAGE_SIZE, NULL, do_journal_get_write_access)) { | 6266 | PAGE_SIZE, NULL, do_journal_get_write_access)) { |
| 6235 | unlock_page(page); | 6267 | unlock_page(page); |
| @@ -6240,24 +6272,24 @@ retry_alloc: | |||
| 6240 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 6272 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
| 6241 | } | 6273 | } |
| 6242 | ext4_journal_stop(handle); | 6274 | ext4_journal_stop(handle); |
| 6243 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 6275 | if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
| 6244 | goto retry_alloc; | 6276 | goto retry_alloc; |
| 6245 | out_ret: | 6277 | out_ret: |
| 6246 | ret = block_page_mkwrite_return(ret); | 6278 | ret = block_page_mkwrite_return(err); |
| 6247 | out: | 6279 | out: |
| 6248 | up_read(&EXT4_I(inode)->i_mmap_sem); | 6280 | up_read(&EXT4_I(inode)->i_mmap_sem); |
| 6249 | sb_end_pagefault(inode->i_sb); | 6281 | sb_end_pagefault(inode->i_sb); |
| 6250 | return ret; | 6282 | return ret; |
| 6251 | } | 6283 | } |
| 6252 | 6284 | ||
| 6253 | int ext4_filemap_fault(struct vm_fault *vmf) | 6285 | vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) |
| 6254 | { | 6286 | { |
| 6255 | struct inode *inode = file_inode(vmf->vma->vm_file); | 6287 | struct inode *inode = file_inode(vmf->vma->vm_file); |
| 6256 | int err; | 6288 | vm_fault_t ret; |
| 6257 | 6289 | ||
| 6258 | down_read(&EXT4_I(inode)->i_mmap_sem); | 6290 | down_read(&EXT4_I(inode)->i_mmap_sem); |
| 6259 | err = filemap_fault(vmf); | 6291 | ret = filemap_fault(vmf); |
| 6260 | up_read(&EXT4_I(inode)->i_mmap_sem); | 6292 | up_read(&EXT4_I(inode)->i_mmap_sem); |
| 6261 | 6293 | ||
| 6262 | return err; | 6294 | return ret; |
| 6263 | } | 6295 | } |
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a7074115d6f6..0edee31913d1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
| @@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) | |||
| 67 | ei1 = EXT4_I(inode1); | 67 | ei1 = EXT4_I(inode1); |
| 68 | ei2 = EXT4_I(inode2); | 68 | ei2 = EXT4_I(inode2); |
| 69 | 69 | ||
| 70 | swap(inode1->i_flags, inode2->i_flags); | ||
| 71 | swap(inode1->i_version, inode2->i_version); | 70 | swap(inode1->i_version, inode2->i_version); |
| 72 | swap(inode1->i_blocks, inode2->i_blocks); | 71 | swap(inode1->i_blocks, inode2->i_blocks); |
| 73 | swap(inode1->i_bytes, inode2->i_bytes); | 72 | swap(inode1->i_bytes, inode2->i_bytes); |
| @@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) | |||
| 85 | i_size_write(inode2, isize); | 84 | i_size_write(inode2, isize); |
| 86 | } | 85 | } |
| 87 | 86 | ||
| 87 | static void reset_inode_seed(struct inode *inode) | ||
| 88 | { | ||
| 89 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
| 90 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
| 91 | __le32 inum = cpu_to_le32(inode->i_ino); | ||
| 92 | __le32 gen = cpu_to_le32(inode->i_generation); | ||
| 93 | __u32 csum; | ||
| 94 | |||
| 95 | if (!ext4_has_metadata_csum(inode->i_sb)) | ||
| 96 | return; | ||
| 97 | |||
| 98 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); | ||
| 99 | ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); | ||
| 100 | } | ||
| 101 | |||
| 88 | /** | 102 | /** |
| 89 | * Swap the information from the given @inode and the inode | 103 | * Swap the information from the given @inode and the inode |
| 90 | * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other | 104 | * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other |
| @@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
| 102 | struct inode *inode_bl; | 116 | struct inode *inode_bl; |
| 103 | struct ext4_inode_info *ei_bl; | 117 | struct ext4_inode_info *ei_bl; |
| 104 | 118 | ||
| 105 | if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) | 119 | if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || |
| 120 | IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || | ||
| 121 | ext4_has_inline_data(inode)) | ||
| 106 | return -EINVAL; | 122 | return -EINVAL; |
| 107 | 123 | ||
| 108 | if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) | 124 | if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || |
| 125 | !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) | ||
| 109 | return -EPERM; | 126 | return -EPERM; |
| 110 | 127 | ||
| 111 | inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); | 128 | inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); |
| @@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
| 120 | * that only 1 swap_inode_boot_loader is running. */ | 137 | * that only 1 swap_inode_boot_loader is running. */ |
| 121 | lock_two_nondirectories(inode, inode_bl); | 138 | lock_two_nondirectories(inode, inode_bl); |
| 122 | 139 | ||
| 123 | truncate_inode_pages(&inode->i_data, 0); | ||
| 124 | truncate_inode_pages(&inode_bl->i_data, 0); | ||
| 125 | |||
| 126 | /* Wait for all existing dio workers */ | 140 | /* Wait for all existing dio workers */ |
| 127 | inode_dio_wait(inode); | 141 | inode_dio_wait(inode); |
| 128 | inode_dio_wait(inode_bl); | 142 | inode_dio_wait(inode_bl); |
| 129 | 143 | ||
| 144 | truncate_inode_pages(&inode->i_data, 0); | ||
| 145 | truncate_inode_pages(&inode_bl->i_data, 0); | ||
| 146 | |||
| 130 | handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); | 147 | handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); |
| 131 | if (IS_ERR(handle)) { | 148 | if (IS_ERR(handle)) { |
| 132 | err = -EINVAL; | 149 | err = -EINVAL; |
| @@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
| 159 | 176 | ||
| 160 | inode->i_generation = prandom_u32(); | 177 | inode->i_generation = prandom_u32(); |
| 161 | inode_bl->i_generation = prandom_u32(); | 178 | inode_bl->i_generation = prandom_u32(); |
| 179 | reset_inode_seed(inode); | ||
| 180 | reset_inode_seed(inode_bl); | ||
| 162 | 181 | ||
| 163 | ext4_discard_preallocations(inode); | 182 | ext4_discard_preallocations(inode); |
| 164 | 183 | ||
| @@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
| 169 | inode->i_ino, err); | 188 | inode->i_ino, err); |
| 170 | /* Revert all changes: */ | 189 | /* Revert all changes: */ |
| 171 | swap_inode_data(inode, inode_bl); | 190 | swap_inode_data(inode, inode_bl); |
| 191 | ext4_mark_inode_dirty(handle, inode); | ||
| 172 | } else { | 192 | } else { |
| 173 | err = ext4_mark_inode_dirty(handle, inode_bl); | 193 | err = ext4_mark_inode_dirty(handle, inode_bl); |
| 174 | if (err < 0) { | 194 | if (err < 0) { |
| @@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
| 178 | /* Revert all changes: */ | 198 | /* Revert all changes: */ |
| 179 | swap_inode_data(inode, inode_bl); | 199 | swap_inode_data(inode, inode_bl); |
| 180 | ext4_mark_inode_dirty(handle, inode); | 200 | ext4_mark_inode_dirty(handle, inode); |
| 201 | ext4_mark_inode_dirty(handle, inode_bl); | ||
| 181 | } | 202 | } |
| 182 | } | 203 | } |
| 183 | ext4_journal_stop(handle); | 204 | ext4_journal_stop(handle); |
| @@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) | |||
| 339 | if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) | 360 | if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) |
| 340 | return 0; | 361 | return 0; |
| 341 | 362 | ||
| 342 | err = mnt_want_write_file(filp); | ||
| 343 | if (err) | ||
| 344 | return err; | ||
| 345 | |||
| 346 | err = -EPERM; | 363 | err = -EPERM; |
| 347 | inode_lock(inode); | ||
| 348 | /* Is it quota file? Do not allow user to mess with it */ | 364 | /* Is it quota file? Do not allow user to mess with it */ |
| 349 | if (ext4_is_quota_file(inode)) | 365 | if (ext4_is_quota_file(inode)) |
| 350 | goto out_unlock; | 366 | return err; |
| 351 | 367 | ||
| 352 | err = ext4_get_inode_loc(inode, &iloc); | 368 | err = ext4_get_inode_loc(inode, &iloc); |
| 353 | if (err) | 369 | if (err) |
| 354 | goto out_unlock; | 370 | return err; |
| 355 | 371 | ||
| 356 | raw_inode = ext4_raw_inode(&iloc); | 372 | raw_inode = ext4_raw_inode(&iloc); |
| 357 | if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { | 373 | if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { |
| @@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) | |||
| 359 | EXT4_SB(sb)->s_want_extra_isize, | 375 | EXT4_SB(sb)->s_want_extra_isize, |
| 360 | &iloc); | 376 | &iloc); |
| 361 | if (err) | 377 | if (err) |
| 362 | goto out_unlock; | 378 | return err; |
| 363 | } else { | 379 | } else { |
| 364 | brelse(iloc.bh); | 380 | brelse(iloc.bh); |
| 365 | } | 381 | } |
| 366 | 382 | ||
| 367 | dquot_initialize(inode); | 383 | err = dquot_initialize(inode); |
| 384 | if (err) | ||
| 385 | return err; | ||
| 368 | 386 | ||
| 369 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, | 387 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, |
| 370 | EXT4_QUOTA_INIT_BLOCKS(sb) + | 388 | EXT4_QUOTA_INIT_BLOCKS(sb) + |
| 371 | EXT4_QUOTA_DEL_BLOCKS(sb) + 3); | 389 | EXT4_QUOTA_DEL_BLOCKS(sb) + 3); |
| 372 | if (IS_ERR(handle)) { | 390 | if (IS_ERR(handle)) |
| 373 | err = PTR_ERR(handle); | 391 | return PTR_ERR(handle); |
| 374 | goto out_unlock; | ||
| 375 | } | ||
| 376 | 392 | ||
| 377 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 393 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
| 378 | if (err) | 394 | if (err) |
| @@ -400,9 +416,6 @@ out_dirty: | |||
| 400 | err = rc; | 416 | err = rc; |
| 401 | out_stop: | 417 | out_stop: |
| 402 | ext4_journal_stop(handle); | 418 | ext4_journal_stop(handle); |
| 403 | out_unlock: | ||
| 404 | inode_unlock(inode); | ||
| 405 | mnt_drop_write_file(filp); | ||
| 406 | return err; | 419 | return err; |
| 407 | } | 420 | } |
| 408 | #else | 421 | #else |
| @@ -626,6 +639,30 @@ group_add_out: | |||
| 626 | return err; | 639 | return err; |
| 627 | } | 640 | } |
| 628 | 641 | ||
| 642 | static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa) | ||
| 643 | { | ||
| 644 | /* | ||
| 645 | * Project Quota ID state is only allowed to change from within the init | ||
| 646 | * namespace. Enforce that restriction only if we are trying to change | ||
| 647 | * the quota ID state. Everything else is allowed in user namespaces. | ||
| 648 | */ | ||
| 649 | if (current_user_ns() == &init_user_ns) | ||
| 650 | return 0; | ||
| 651 | |||
| 652 | if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid) | ||
| 653 | return -EINVAL; | ||
| 654 | |||
| 655 | if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) { | ||
| 656 | if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) | ||
| 657 | return -EINVAL; | ||
| 658 | } else { | ||
| 659 | if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) | ||
| 660 | return -EINVAL; | ||
| 661 | } | ||
| 662 | |||
| 663 | return 0; | ||
| 664 | } | ||
| 665 | |||
| 629 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | 666 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
| 630 | { | 667 | { |
| 631 | struct inode *inode = file_inode(filp); | 668 | struct inode *inode = file_inode(filp); |
| @@ -1025,19 +1062,19 @@ resizefs_out: | |||
| 1025 | return err; | 1062 | return err; |
| 1026 | 1063 | ||
| 1027 | inode_lock(inode); | 1064 | inode_lock(inode); |
| 1065 | err = ext4_ioctl_check_project(inode, &fa); | ||
| 1066 | if (err) | ||
| 1067 | goto out; | ||
| 1028 | flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | | 1068 | flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | |
| 1029 | (flags & EXT4_FL_XFLAG_VISIBLE); | 1069 | (flags & EXT4_FL_XFLAG_VISIBLE); |
| 1030 | err = ext4_ioctl_setflags(inode, flags); | 1070 | err = ext4_ioctl_setflags(inode, flags); |
| 1031 | inode_unlock(inode); | ||
| 1032 | mnt_drop_write_file(filp); | ||
| 1033 | if (err) | 1071 | if (err) |
| 1034 | return err; | 1072 | goto out; |
| 1035 | |||
| 1036 | err = ext4_ioctl_setproject(filp, fa.fsx_projid); | 1073 | err = ext4_ioctl_setproject(filp, fa.fsx_projid); |
| 1037 | if (err) | 1074 | out: |
| 1038 | return err; | 1075 | inode_unlock(inode); |
| 1039 | 1076 | mnt_drop_write_file(filp); | |
| 1040 | return 0; | 1077 | return err; |
| 1041 | } | 1078 | } |
| 1042 | case EXT4_IOC_SHUTDOWN: | 1079 | case EXT4_IOC_SHUTDOWN: |
| 1043 | return ext4_shutdown(sb, arg); | 1080 | return ext4_shutdown(sb, arg); |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..e2248083cdca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
| @@ -4915,9 +4915,17 @@ do_more: | |||
| 4915 | &sbi->s_flex_groups[flex_group].free_clusters); | 4915 | &sbi->s_flex_groups[flex_group].free_clusters); |
| 4916 | } | 4916 | } |
| 4917 | 4917 | ||
| 4918 | if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) | 4918 | /* |
| 4919 | dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); | 4919 | * on a bigalloc file system, defer the s_freeclusters_counter |
| 4920 | percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); | 4920 | * update to the caller (ext4_remove_space and friends) so they |
| 4921 | * can determine if a cluster freed here should be rereserved | ||
| 4922 | */ | ||
| 4923 | if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { | ||
| 4924 | if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) | ||
| 4925 | dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); | ||
| 4926 | percpu_counter_add(&sbi->s_freeclusters_counter, | ||
| 4927 | count_clusters); | ||
| 4928 | } | ||
| 4921 | 4929 | ||
| 4922 | ext4_mb_unload_buddy(&e4b); | 4930 | ext4_mb_unload_buddy(&e4b); |
| 4923 | 4931 | ||
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a409ff70d67b..2f5be02fc6f6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
| @@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode, | |||
| 516 | orig_inode->i_ino, donor_inode->i_ino); | 516 | orig_inode->i_ino, donor_inode->i_ino); |
| 517 | return -EINVAL; | 517 | return -EINVAL; |
| 518 | } | 518 | } |
| 519 | if (orig_eof < orig_start + *len - 1) | 519 | if (orig_eof <= orig_start) |
| 520 | *len = 0; | ||
| 521 | else if (orig_eof < orig_start + *len - 1) | ||
| 520 | *len = orig_eof - orig_start; | 522 | *len = orig_eof - orig_start; |
| 521 | if (donor_eof < donor_start + *len - 1) | 523 | if (donor_eof <= donor_start) |
| 524 | *len = 0; | ||
| 525 | else if (donor_eof < donor_start + *len - 1) | ||
| 522 | *len = donor_eof - donor_start; | 526 | *len = donor_eof - donor_start; |
| 523 | if (!*len) { | 527 | if (!*len) { |
| 524 | ext4_debug("ext4 move extent: len should not be 0 " | 528 | ext4_debug("ext4 move extent: len should not be 0 " |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 377d516c475f..67a38532032a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
| @@ -2261,7 +2261,7 @@ again: | |||
| 2261 | dxroot->info.indirect_levels += 1; | 2261 | dxroot->info.indirect_levels += 1; |
| 2262 | dxtrace(printk(KERN_DEBUG | 2262 | dxtrace(printk(KERN_DEBUG |
| 2263 | "Creating %d level index...\n", | 2263 | "Creating %d level index...\n", |
| 2264 | info->indirect_levels)); | 2264 | dxroot->info.indirect_levels)); |
| 2265 | err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); | 2265 | err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); |
| 2266 | if (err) | 2266 | if (err) |
| 2267 | goto journal_error; | 2267 | goto journal_error; |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1145109968ef..a221f1cdf704 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
| @@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb) | |||
| 914 | for (type = 0; type < EXT4_MAXQUOTAS; type++) | 914 | for (type = 0; type < EXT4_MAXQUOTAS; type++) |
| 915 | ext4_quota_off(sb, type); | 915 | ext4_quota_off(sb, type); |
| 916 | } | 916 | } |
| 917 | |||
| 918 | /* | ||
| 919 | * This is a helper function which is used in the mount/remount | ||
| 920 | * codepaths (which holds s_umount) to fetch the quota file name. | ||
| 921 | */ | ||
| 922 | static inline char *get_qf_name(struct super_block *sb, | ||
| 923 | struct ext4_sb_info *sbi, | ||
| 924 | int type) | ||
| 925 | { | ||
| 926 | return rcu_dereference_protected(sbi->s_qf_names[type], | ||
| 927 | lockdep_is_held(&sb->s_umount)); | ||
| 928 | } | ||
| 917 | #else | 929 | #else |
| 918 | static inline void ext4_quota_off_umount(struct super_block *sb) | 930 | static inline void ext4_quota_off_umount(struct super_block *sb) |
| 919 | { | 931 | { |
| @@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb) | |||
| 965 | percpu_free_rwsem(&sbi->s_journal_flag_rwsem); | 977 | percpu_free_rwsem(&sbi->s_journal_flag_rwsem); |
| 966 | #ifdef CONFIG_QUOTA | 978 | #ifdef CONFIG_QUOTA |
| 967 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 979 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
| 968 | kfree(sbi->s_qf_names[i]); | 980 | kfree(get_qf_name(sb, sbi, i)); |
| 969 | #endif | 981 | #endif |
| 970 | 982 | ||
| 971 | /* Debugging code just in case the in-memory inode orphan list | 983 | /* Debugging code just in case the in-memory inode orphan list |
| @@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
| 1040 | ei->i_da_metadata_calc_len = 0; | 1052 | ei->i_da_metadata_calc_len = 0; |
| 1041 | ei->i_da_metadata_calc_last_lblock = 0; | 1053 | ei->i_da_metadata_calc_last_lblock = 0; |
| 1042 | spin_lock_init(&(ei->i_block_reservation_lock)); | 1054 | spin_lock_init(&(ei->i_block_reservation_lock)); |
| 1055 | ext4_init_pending_tree(&ei->i_pending_tree); | ||
| 1043 | #ifdef CONFIG_QUOTA | 1056 | #ifdef CONFIG_QUOTA |
| 1044 | ei->i_reserved_quota = 0; | 1057 | ei->i_reserved_quota = 0; |
| 1045 | memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); | 1058 | memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); |
| @@ -1530,11 +1543,10 @@ static const char deprecated_msg[] = | |||
| 1530 | static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) | 1543 | static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) |
| 1531 | { | 1544 | { |
| 1532 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1545 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 1533 | char *qname; | 1546 | char *qname, *old_qname = get_qf_name(sb, sbi, qtype); |
| 1534 | int ret = -1; | 1547 | int ret = -1; |
| 1535 | 1548 | ||
| 1536 | if (sb_any_quota_loaded(sb) && | 1549 | if (sb_any_quota_loaded(sb) && !old_qname) { |
| 1537 | !sbi->s_qf_names[qtype]) { | ||
| 1538 | ext4_msg(sb, KERN_ERR, | 1550 | ext4_msg(sb, KERN_ERR, |
| 1539 | "Cannot change journaled " | 1551 | "Cannot change journaled " |
| 1540 | "quota options when quota turned on"); | 1552 | "quota options when quota turned on"); |
| @@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) | |||
| 1551 | "Not enough memory for storing quotafile name"); | 1563 | "Not enough memory for storing quotafile name"); |
| 1552 | return -1; | 1564 | return -1; |
| 1553 | } | 1565 | } |
| 1554 | if (sbi->s_qf_names[qtype]) { | 1566 | if (old_qname) { |
| 1555 | if (strcmp(sbi->s_qf_names[qtype], qname) == 0) | 1567 | if (strcmp(old_qname, qname) == 0) |
| 1556 | ret = 1; | 1568 | ret = 1; |
| 1557 | else | 1569 | else |
| 1558 | ext4_msg(sb, KERN_ERR, | 1570 | ext4_msg(sb, KERN_ERR, |
| @@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) | |||
| 1565 | "quotafile must be on filesystem root"); | 1577 | "quotafile must be on filesystem root"); |
| 1566 | goto errout; | 1578 | goto errout; |
| 1567 | } | 1579 | } |
| 1568 | sbi->s_qf_names[qtype] = qname; | 1580 | rcu_assign_pointer(sbi->s_qf_names[qtype], qname); |
| 1569 | set_opt(sb, QUOTA); | 1581 | set_opt(sb, QUOTA); |
| 1570 | return 1; | 1582 | return 1; |
| 1571 | errout: | 1583 | errout: |
| @@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype) | |||
| 1577 | { | 1589 | { |
| 1578 | 1590 | ||
| 1579 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1591 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 1592 | char *old_qname = get_qf_name(sb, sbi, qtype); | ||
| 1580 | 1593 | ||
| 1581 | if (sb_any_quota_loaded(sb) && | 1594 | if (sb_any_quota_loaded(sb) && old_qname) { |
| 1582 | sbi->s_qf_names[qtype]) { | ||
| 1583 | ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" | 1595 | ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" |
| 1584 | " when quota turned on"); | 1596 | " when quota turned on"); |
| 1585 | return -1; | 1597 | return -1; |
| 1586 | } | 1598 | } |
| 1587 | kfree(sbi->s_qf_names[qtype]); | 1599 | rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); |
| 1588 | sbi->s_qf_names[qtype] = NULL; | 1600 | synchronize_rcu(); |
| 1601 | kfree(old_qname); | ||
| 1589 | return 1; | 1602 | return 1; |
| 1590 | } | 1603 | } |
| 1591 | #endif | 1604 | #endif |
| @@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1960 | int is_remount) | 1973 | int is_remount) |
| 1961 | { | 1974 | { |
| 1962 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1975 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 1963 | char *p; | 1976 | char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; |
| 1964 | substring_t args[MAX_OPT_ARGS]; | 1977 | substring_t args[MAX_OPT_ARGS]; |
| 1965 | int token; | 1978 | int token; |
| 1966 | 1979 | ||
| @@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb, | |||
| 1991 | "Cannot enable project quota enforcement."); | 2004 | "Cannot enable project quota enforcement."); |
| 1992 | return 0; | 2005 | return 0; |
| 1993 | } | 2006 | } |
| 1994 | if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { | 2007 | usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); |
| 1995 | if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) | 2008 | grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); |
| 2009 | if (usr_qf_name || grp_qf_name) { | ||
| 2010 | if (test_opt(sb, USRQUOTA) && usr_qf_name) | ||
| 1996 | clear_opt(sb, USRQUOTA); | 2011 | clear_opt(sb, USRQUOTA); |
| 1997 | 2012 | ||
| 1998 | if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) | 2013 | if (test_opt(sb, GRPQUOTA) && grp_qf_name) |
| 1999 | clear_opt(sb, GRPQUOTA); | 2014 | clear_opt(sb, GRPQUOTA); |
| 2000 | 2015 | ||
| 2001 | if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { | 2016 | if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { |
| @@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, | |||
| 2029 | { | 2044 | { |
| 2030 | #if defined(CONFIG_QUOTA) | 2045 | #if defined(CONFIG_QUOTA) |
| 2031 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2046 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
| 2047 | char *usr_qf_name, *grp_qf_name; | ||
| 2032 | 2048 | ||
| 2033 | if (sbi->s_jquota_fmt) { | 2049 | if (sbi->s_jquota_fmt) { |
| 2034 | char *fmtname = ""; | 2050 | char *fmtname = ""; |
| @@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq, | |||
| 2047 | seq_printf(seq, ",jqfmt=%s", fmtname); | 2063 | seq_printf(seq, ",jqfmt=%s", fmtname); |
| 2048 | } | 2064 | } |
| 2049 | 2065 | ||
| 2050 | if (sbi->s_qf_names[USRQUOTA]) | 2066 | rcu_read_lock(); |
| 2051 | seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); | 2067 | usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); |
| 2052 | 2068 | grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); | |
| 2053 | if (sbi->s_qf_names[GRPQUOTA]) | 2069 | if (usr_qf_name) |
| 2054 | seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); | 2070 | seq_show_option(seq, "usrjquota", usr_qf_name); |
| 2071 | if (grp_qf_name) | ||
| 2072 | seq_show_option(seq, "grpjquota", grp_qf_name); | ||
| 2073 | rcu_read_unlock(); | ||
| 2055 | #endif | 2074 | #endif |
| 2056 | } | 2075 | } |
| 2057 | 2076 | ||
| @@ -5103,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 5103 | int err = 0; | 5122 | int err = 0; |
| 5104 | #ifdef CONFIG_QUOTA | 5123 | #ifdef CONFIG_QUOTA |
| 5105 | int i, j; | 5124 | int i, j; |
| 5125 | char *to_free[EXT4_MAXQUOTAS]; | ||
| 5106 | #endif | 5126 | #endif |
| 5107 | char *orig_data = kstrdup(data, GFP_KERNEL); | 5127 | char *orig_data = kstrdup(data, GFP_KERNEL); |
| 5108 | 5128 | ||
| @@ -5122,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
| 5122 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; | 5142 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; |
| 5123 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 5143 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
| 5124 | if (sbi->s_qf_names[i]) { | 5144 | if (sbi->s_qf_names[i]) { |
| 5125 | old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], | 5145 | char *qf_name = get_qf_name(sb, sbi, i); |
| 5126 | GFP_KERNEL); | 5146 | |
| 5147 | old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); | ||
| 5127 | if (!old_opts.s_qf_names[i]) { | 5148 | if (!old_opts.s_qf_names[i]) { |
| 5128 | for (j = 0; j < i; j++) | 5149 | for (j = 0; j < i; j++) |
| 5129 | kfree(old_opts.s_qf_names[j]); | 5150 | kfree(old_opts.s_qf_names[j]); |
| @@ -5352,9 +5373,12 @@ restore_opts: | |||
| 5352 | #ifdef CONFIG_QUOTA | 5373 | #ifdef CONFIG_QUOTA |
| 5353 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; | 5374 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; |
| 5354 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { | 5375 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
| 5355 | kfree(sbi->s_qf_names[i]); | 5376 | to_free[i] = get_qf_name(sb, sbi, i); |
| 5356 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; | 5377 | rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); |
| 5357 | } | 5378 | } |
| 5379 | synchronize_rcu(); | ||
| 5380 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | ||
| 5381 | kfree(to_free[i]); | ||
| 5358 | #endif | 5382 | #endif |
| 5359 | kfree(orig_data); | 5383 | kfree(orig_data); |
| 5360 | return err; | 5384 | return err; |
| @@ -5545,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type) | |||
| 5545 | */ | 5569 | */ |
| 5546 | static int ext4_quota_on_mount(struct super_block *sb, int type) | 5570 | static int ext4_quota_on_mount(struct super_block *sb, int type) |
| 5547 | { | 5571 | { |
| 5548 | return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], | 5572 | return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), |
| 5549 | EXT4_SB(sb)->s_jquota_fmt, type); | 5573 | EXT4_SB(sb)->s_jquota_fmt, type); |
| 5550 | } | 5574 | } |
| 5551 | 5575 | ||
| @@ -5954,6 +5978,10 @@ static int __init ext4_init_fs(void) | |||
| 5954 | if (err) | 5978 | if (err) |
| 5955 | return err; | 5979 | return err; |
| 5956 | 5980 | ||
| 5981 | err = ext4_init_pending(); | ||
| 5982 | if (err) | ||
| 5983 | goto out6; | ||
| 5984 | |||
| 5957 | err = ext4_init_pageio(); | 5985 | err = ext4_init_pageio(); |
| 5958 | if (err) | 5986 | if (err) |
| 5959 | goto out5; | 5987 | goto out5; |
| @@ -5992,6 +6020,8 @@ out3: | |||
| 5992 | out4: | 6020 | out4: |
| 5993 | ext4_exit_pageio(); | 6021 | ext4_exit_pageio(); |
| 5994 | out5: | 6022 | out5: |
| 6023 | ext4_exit_pending(); | ||
| 6024 | out6: | ||
| 5995 | ext4_exit_es(); | 6025 | ext4_exit_es(); |
| 5996 | 6026 | ||
| 5997 | return err; | 6027 | return err; |
| @@ -6009,6 +6039,7 @@ static void __exit ext4_exit_fs(void) | |||
| 6009 | ext4_exit_system_zone(); | 6039 | ext4_exit_system_zone(); |
| 6010 | ext4_exit_pageio(); | 6040 | ext4_exit_pageio(); |
| 6011 | ext4_exit_es(); | 6041 | ext4_exit_es(); |
| 6042 | ext4_exit_pending(); | ||
| 6012 | } | 6043 | } |
| 6013 | 6044 | ||
| 6014 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); | 6045 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c125d662777c..26f8d7e46462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
| @@ -251,8 +251,8 @@ restart: | |||
| 251 | bh = jh2bh(jh); | 251 | bh = jh2bh(jh); |
| 252 | 252 | ||
| 253 | if (buffer_locked(bh)) { | 253 | if (buffer_locked(bh)) { |
| 254 | spin_unlock(&journal->j_list_lock); | ||
| 255 | get_bh(bh); | 254 | get_bh(bh); |
| 255 | spin_unlock(&journal->j_list_lock); | ||
| 256 | wait_on_buffer(bh); | 256 | wait_on_buffer(bh); |
| 257 | /* the journal_head may have gone by now */ | 257 | /* the journal_head may have gone by now */ |
| 258 | BUFFER_TRACE(bh, "brelse"); | 258 | BUFFER_TRACE(bh, "brelse"); |
| @@ -333,8 +333,8 @@ restart2: | |||
| 333 | jh = transaction->t_checkpoint_io_list; | 333 | jh = transaction->t_checkpoint_io_list; |
| 334 | bh = jh2bh(jh); | 334 | bh = jh2bh(jh); |
| 335 | if (buffer_locked(bh)) { | 335 | if (buffer_locked(bh)) { |
| 336 | spin_unlock(&journal->j_list_lock); | ||
| 337 | get_bh(bh); | 336 | get_bh(bh); |
| 337 | spin_unlock(&journal->j_list_lock); | ||
| 338 | wait_on_buffer(bh); | 338 | wait_on_buffer(bh); |
| 339 | /* the journal_head may have gone by now */ | 339 | /* the journal_head may have gone by now */ |
| 340 | BUFFER_TRACE(bh, "brelse"); | 340 | BUFFER_TRACE(bh, "brelse"); |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 96225a77c112..7b73ef7f902d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
| @@ -242,7 +242,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to); | |||
| 242 | int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, | 242 | int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, |
| 243 | get_block_t get_block); | 243 | get_block_t get_block); |
| 244 | /* Convert errno to return value from ->page_mkwrite() call */ | 244 | /* Convert errno to return value from ->page_mkwrite() call */ |
| 245 | static inline int block_page_mkwrite_return(int err) | 245 | static inline vm_fault_t block_page_mkwrite_return(int err) |
| 246 | { | 246 | { |
| 247 | if (err == 0) | 247 | if (err == 0) |
| 248 | return VM_FAULT_LOCKED; | 248 | return VM_FAULT_LOCKED; |
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0e31eb136c57..698e0d8a5ca4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h | |||
| @@ -17,6 +17,7 @@ struct mpage_da_data; | |||
| 17 | struct ext4_map_blocks; | 17 | struct ext4_map_blocks; |
| 18 | struct extent_status; | 18 | struct extent_status; |
| 19 | struct ext4_fsmap; | 19 | struct ext4_fsmap; |
| 20 | struct partial_cluster; | ||
| 20 | 21 | ||
| 21 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) | 22 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) |
| 22 | 23 | ||
| @@ -2035,21 +2036,23 @@ TRACE_EVENT(ext4_ext_show_extent, | |||
| 2035 | ); | 2036 | ); |
| 2036 | 2037 | ||
| 2037 | TRACE_EVENT(ext4_remove_blocks, | 2038 | TRACE_EVENT(ext4_remove_blocks, |
| 2038 | TP_PROTO(struct inode *inode, struct ext4_extent *ex, | 2039 | TP_PROTO(struct inode *inode, struct ext4_extent *ex, |
| 2039 | ext4_lblk_t from, ext4_fsblk_t to, | 2040 | ext4_lblk_t from, ext4_fsblk_t to, |
| 2040 | long long partial_cluster), | 2041 | struct partial_cluster *pc), |
| 2041 | 2042 | ||
| 2042 | TP_ARGS(inode, ex, from, to, partial_cluster), | 2043 | TP_ARGS(inode, ex, from, to, pc), |
| 2043 | 2044 | ||
| 2044 | TP_STRUCT__entry( | 2045 | TP_STRUCT__entry( |
| 2045 | __field( dev_t, dev ) | 2046 | __field( dev_t, dev ) |
| 2046 | __field( ino_t, ino ) | 2047 | __field( ino_t, ino ) |
| 2047 | __field( ext4_lblk_t, from ) | 2048 | __field( ext4_lblk_t, from ) |
| 2048 | __field( ext4_lblk_t, to ) | 2049 | __field( ext4_lblk_t, to ) |
| 2049 | __field( long long, partial ) | ||
| 2050 | __field( ext4_fsblk_t, ee_pblk ) | 2050 | __field( ext4_fsblk_t, ee_pblk ) |
| 2051 | __field( ext4_lblk_t, ee_lblk ) | 2051 | __field( ext4_lblk_t, ee_lblk ) |
| 2052 | __field( unsigned short, ee_len ) | 2052 | __field( unsigned short, ee_len ) |
| 2053 | __field( ext4_fsblk_t, pc_pclu ) | ||
| 2054 | __field( ext4_lblk_t, pc_lblk ) | ||
| 2055 | __field( int, pc_state) | ||
| 2053 | ), | 2056 | ), |
| 2054 | 2057 | ||
| 2055 | TP_fast_assign( | 2058 | TP_fast_assign( |
| @@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks, | |||
| 2057 | __entry->ino = inode->i_ino; | 2060 | __entry->ino = inode->i_ino; |
| 2058 | __entry->from = from; | 2061 | __entry->from = from; |
| 2059 | __entry->to = to; | 2062 | __entry->to = to; |
| 2060 | __entry->partial = partial_cluster; | ||
| 2061 | __entry->ee_pblk = ext4_ext_pblock(ex); | 2063 | __entry->ee_pblk = ext4_ext_pblock(ex); |
| 2062 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); | 2064 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); |
| 2063 | __entry->ee_len = ext4_ext_get_actual_len(ex); | 2065 | __entry->ee_len = ext4_ext_get_actual_len(ex); |
| 2066 | __entry->pc_pclu = pc->pclu; | ||
| 2067 | __entry->pc_lblk = pc->lblk; | ||
| 2068 | __entry->pc_state = pc->state; | ||
| 2064 | ), | 2069 | ), |
| 2065 | 2070 | ||
| 2066 | TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" | 2071 | TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" |
| 2067 | "from %u to %u partial_cluster %lld", | 2072 | "from %u to %u partial [pclu %lld lblk %u state %d]", |
| 2068 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2073 | MAJOR(__entry->dev), MINOR(__entry->dev), |
| 2069 | (unsigned long) __entry->ino, | 2074 | (unsigned long) __entry->ino, |
| 2070 | (unsigned) __entry->ee_lblk, | 2075 | (unsigned) __entry->ee_lblk, |
| @@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks, | |||
| 2072 | (unsigned short) __entry->ee_len, | 2077 | (unsigned short) __entry->ee_len, |
| 2073 | (unsigned) __entry->from, | 2078 | (unsigned) __entry->from, |
| 2074 | (unsigned) __entry->to, | 2079 | (unsigned) __entry->to, |
| 2075 | (long long) __entry->partial) | 2080 | (long long) __entry->pc_pclu, |
| 2081 | (unsigned int) __entry->pc_lblk, | ||
| 2082 | (int) __entry->pc_state) | ||
| 2076 | ); | 2083 | ); |
| 2077 | 2084 | ||
| 2078 | TRACE_EVENT(ext4_ext_rm_leaf, | 2085 | TRACE_EVENT(ext4_ext_rm_leaf, |
| 2079 | TP_PROTO(struct inode *inode, ext4_lblk_t start, | 2086 | TP_PROTO(struct inode *inode, ext4_lblk_t start, |
| 2080 | struct ext4_extent *ex, | 2087 | struct ext4_extent *ex, |
| 2081 | long long partial_cluster), | 2088 | struct partial_cluster *pc), |
| 2082 | 2089 | ||
| 2083 | TP_ARGS(inode, start, ex, partial_cluster), | 2090 | TP_ARGS(inode, start, ex, pc), |
| 2084 | 2091 | ||
| 2085 | TP_STRUCT__entry( | 2092 | TP_STRUCT__entry( |
| 2086 | __field( dev_t, dev ) | 2093 | __field( dev_t, dev ) |
| 2087 | __field( ino_t, ino ) | 2094 | __field( ino_t, ino ) |
| 2088 | __field( long long, partial ) | ||
| 2089 | __field( ext4_lblk_t, start ) | 2095 | __field( ext4_lblk_t, start ) |
| 2090 | __field( ext4_lblk_t, ee_lblk ) | 2096 | __field( ext4_lblk_t, ee_lblk ) |
| 2091 | __field( ext4_fsblk_t, ee_pblk ) | 2097 | __field( ext4_fsblk_t, ee_pblk ) |
| 2092 | __field( short, ee_len ) | 2098 | __field( short, ee_len ) |
| 2099 | __field( ext4_fsblk_t, pc_pclu ) | ||
| 2100 | __field( ext4_lblk_t, pc_lblk ) | ||
| 2101 | __field( int, pc_state) | ||
| 2093 | ), | 2102 | ), |
| 2094 | 2103 | ||
| 2095 | TP_fast_assign( | 2104 | TP_fast_assign( |
| 2096 | __entry->dev = inode->i_sb->s_dev; | 2105 | __entry->dev = inode->i_sb->s_dev; |
| 2097 | __entry->ino = inode->i_ino; | 2106 | __entry->ino = inode->i_ino; |
| 2098 | __entry->partial = partial_cluster; | ||
| 2099 | __entry->start = start; | 2107 | __entry->start = start; |
| 2100 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); | 2108 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); |
| 2101 | __entry->ee_pblk = ext4_ext_pblock(ex); | 2109 | __entry->ee_pblk = ext4_ext_pblock(ex); |
| 2102 | __entry->ee_len = ext4_ext_get_actual_len(ex); | 2110 | __entry->ee_len = ext4_ext_get_actual_len(ex); |
| 2111 | __entry->pc_pclu = pc->pclu; | ||
| 2112 | __entry->pc_lblk = pc->lblk; | ||
| 2113 | __entry->pc_state = pc->state; | ||
| 2103 | ), | 2114 | ), |
| 2104 | 2115 | ||
| 2105 | TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" | 2116 | TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" |
| 2106 | "partial_cluster %lld", | 2117 | "partial [pclu %lld lblk %u state %d]", |
| 2107 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2118 | MAJOR(__entry->dev), MINOR(__entry->dev), |
| 2108 | (unsigned long) __entry->ino, | 2119 | (unsigned long) __entry->ino, |
| 2109 | (unsigned) __entry->start, | 2120 | (unsigned) __entry->start, |
| 2110 | (unsigned) __entry->ee_lblk, | 2121 | (unsigned) __entry->ee_lblk, |
| 2111 | (unsigned long long) __entry->ee_pblk, | 2122 | (unsigned long long) __entry->ee_pblk, |
| 2112 | (unsigned short) __entry->ee_len, | 2123 | (unsigned short) __entry->ee_len, |
| 2113 | (long long) __entry->partial) | 2124 | (long long) __entry->pc_pclu, |
| 2125 | (unsigned int) __entry->pc_lblk, | ||
| 2126 | (int) __entry->pc_state) | ||
| 2114 | ); | 2127 | ); |
| 2115 | 2128 | ||
| 2116 | TRACE_EVENT(ext4_ext_rm_idx, | 2129 | TRACE_EVENT(ext4_ext_rm_idx, |
| @@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space, | |||
| 2168 | 2181 | ||
| 2169 | TRACE_EVENT(ext4_ext_remove_space_done, | 2182 | TRACE_EVENT(ext4_ext_remove_space_done, |
| 2170 | TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, | 2183 | TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, |
| 2171 | int depth, long long partial, __le16 eh_entries), | 2184 | int depth, struct partial_cluster *pc, __le16 eh_entries), |
| 2172 | 2185 | ||
| 2173 | TP_ARGS(inode, start, end, depth, partial, eh_entries), | 2186 | TP_ARGS(inode, start, end, depth, pc, eh_entries), |
| 2174 | 2187 | ||
| 2175 | TP_STRUCT__entry( | 2188 | TP_STRUCT__entry( |
| 2176 | __field( dev_t, dev ) | 2189 | __field( dev_t, dev ) |
| @@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done, | |||
| 2178 | __field( ext4_lblk_t, start ) | 2191 | __field( ext4_lblk_t, start ) |
| 2179 | __field( ext4_lblk_t, end ) | 2192 | __field( ext4_lblk_t, end ) |
| 2180 | __field( int, depth ) | 2193 | __field( int, depth ) |
| 2181 | __field( long long, partial ) | 2194 | __field( ext4_fsblk_t, pc_pclu ) |
| 2195 | __field( ext4_lblk_t, pc_lblk ) | ||
| 2196 | __field( int, pc_state ) | ||
| 2182 | __field( unsigned short, eh_entries ) | 2197 | __field( unsigned short, eh_entries ) |
| 2183 | ), | 2198 | ), |
| 2184 | 2199 | ||
| @@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done, | |||
| 2188 | __entry->start = start; | 2203 | __entry->start = start; |
| 2189 | __entry->end = end; | 2204 | __entry->end = end; |
| 2190 | __entry->depth = depth; | 2205 | __entry->depth = depth; |
| 2191 | __entry->partial = partial; | 2206 | __entry->pc_pclu = pc->pclu; |
| 2207 | __entry->pc_lblk = pc->lblk; | ||
| 2208 | __entry->pc_state = pc->state; | ||
| 2192 | __entry->eh_entries = le16_to_cpu(eh_entries); | 2209 | __entry->eh_entries = le16_to_cpu(eh_entries); |
| 2193 | ), | 2210 | ), |
| 2194 | 2211 | ||
| 2195 | TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " | 2212 | TP_printk("dev %d,%d ino %lu since %u end %u depth %d " |
| 2213 | "partial [pclu %lld lblk %u state %d] " | ||
| 2196 | "remaining_entries %u", | 2214 | "remaining_entries %u", |
| 2197 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2215 | MAJOR(__entry->dev), MINOR(__entry->dev), |
| 2198 | (unsigned long) __entry->ino, | 2216 | (unsigned long) __entry->ino, |
| 2199 | (unsigned) __entry->start, | 2217 | (unsigned) __entry->start, |
| 2200 | (unsigned) __entry->end, | 2218 | (unsigned) __entry->end, |
| 2201 | __entry->depth, | 2219 | __entry->depth, |
| 2202 | (long long) __entry->partial, | 2220 | (long long) __entry->pc_pclu, |
| 2221 | (unsigned int) __entry->pc_lblk, | ||
| 2222 | (int) __entry->pc_state, | ||
| 2203 | (unsigned short) __entry->eh_entries) | 2223 | (unsigned short) __entry->eh_entries) |
| 2204 | ); | 2224 | ); |
| 2205 | 2225 | ||
| @@ -2270,7 +2290,7 @@ TRACE_EVENT(ext4_es_remove_extent, | |||
| 2270 | __entry->lblk, __entry->len) | 2290 | __entry->lblk, __entry->len) |
| 2271 | ); | 2291 | ); |
| 2272 | 2292 | ||
| 2273 | TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, | 2293 | TRACE_EVENT(ext4_es_find_extent_range_enter, |
| 2274 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk), | 2294 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk), |
| 2275 | 2295 | ||
| 2276 | TP_ARGS(inode, lblk), | 2296 | TP_ARGS(inode, lblk), |
| @@ -2292,7 +2312,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, | |||
| 2292 | (unsigned long) __entry->ino, __entry->lblk) | 2312 | (unsigned long) __entry->ino, __entry->lblk) |
| 2293 | ); | 2313 | ); |
| 2294 | 2314 | ||
| 2295 | TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, | 2315 | TRACE_EVENT(ext4_es_find_extent_range_exit, |
| 2296 | TP_PROTO(struct inode *inode, struct extent_status *es), | 2316 | TP_PROTO(struct inode *inode, struct extent_status *es), |
| 2297 | 2317 | ||
| 2298 | TP_ARGS(inode, es), | 2318 | TP_ARGS(inode, es), |
| @@ -2512,6 +2532,41 @@ TRACE_EVENT(ext4_es_shrink, | |||
| 2512 | __entry->scan_time, __entry->nr_skipped, __entry->retried) | 2532 | __entry->scan_time, __entry->nr_skipped, __entry->retried) |
| 2513 | ); | 2533 | ); |
| 2514 | 2534 | ||
| 2535 | TRACE_EVENT(ext4_es_insert_delayed_block, | ||
| 2536 | TP_PROTO(struct inode *inode, struct extent_status *es, | ||
| 2537 | bool allocated), | ||
| 2538 | |||
| 2539 | TP_ARGS(inode, es, allocated), | ||
| 2540 | |||
| 2541 | TP_STRUCT__entry( | ||
| 2542 | __field( dev_t, dev ) | ||
| 2543 | __field( ino_t, ino ) | ||
| 2544 | __field( ext4_lblk_t, lblk ) | ||
| 2545 | __field( ext4_lblk_t, len ) | ||
| 2546 | __field( ext4_fsblk_t, pblk ) | ||
| 2547 | __field( char, status ) | ||
| 2548 | __field( bool, allocated ) | ||
| 2549 | ), | ||
| 2550 | |||
| 2551 | TP_fast_assign( | ||
| 2552 | __entry->dev = inode->i_sb->s_dev; | ||
| 2553 | __entry->ino = inode->i_ino; | ||
| 2554 | __entry->lblk = es->es_lblk; | ||
| 2555 | __entry->len = es->es_len; | ||
| 2556 | __entry->pblk = ext4_es_pblock(es); | ||
| 2557 | __entry->status = ext4_es_status(es); | ||
| 2558 | __entry->allocated = allocated; | ||
| 2559 | ), | ||
| 2560 | |||
| 2561 | TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " | ||
| 2562 | "allocated %d", | ||
| 2563 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
| 2564 | (unsigned long) __entry->ino, | ||
| 2565 | __entry->lblk, __entry->len, | ||
| 2566 | __entry->pblk, show_extent_status(__entry->status), | ||
| 2567 | __entry->allocated) | ||
| 2568 | ); | ||
| 2569 | |||
| 2515 | /* fsmap traces */ | 2570 | /* fsmap traces */ |
| 2516 | DECLARE_EVENT_CLASS(ext4_fsmap_class, | 2571 | DECLARE_EVENT_CLASS(ext4_fsmap_class, |
| 2517 | TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, | 2572 | TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, |
