aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-10-24 12:42:24 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2018-10-24 12:42:24 -0400
commit5993692f09582accb4cb7af11d344598af43c3b8 (patch)
tree062447eb44769d6da6e50302853eac1bb1d6e5d3
parentd6edff78fe9e34dbea1bec7dc26cfce92c6d96d5 (diff)
parent33458eaba4dfe778a426df6a19b7aad2ff9f7eec (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: - further restructure ext4 documentation - fix up ext4's delayed allocation for bigalloc file systems - fix up some syzbot-detected races in EXT4_IOC_MOVE_EXT, EXT4_IOC_SWAP_BOOT, and ext4_remount - ... and a few other miscellaneous bugs and optimizations. * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits) ext4: fix use-after-free race in ext4_remount()'s error path ext4: cache NULL when both default_acl and acl are NULL docs: promote the ext4 data structures book to top level docs: move ext4 administrative docs to admin-guide/ jbd2: fix use after free in jbd2_log_do_checkpoint() ext4: propagate error from dquot_initialize() in EXT4_IOC_FSSETXATTR ext4: fix setattr project check in fssetxattr ioctl docs: make ext4 readme tables readable docs: fix ext4 documentation table formatting problems docs: generate a separate ext4 pdf file from the documentation ext4: convert fault handler to use vm_fault_t type ext4: initialize retries variable in ext4_da_write_inline_data_begin() ext4: fix EXT4_IOC_SWAP_BOOT ext4: fix build error when DX_DEBUG is defined ext4: fix argument checking in EXT4_IOC_MOVE_EXT ext4: fix reserved cluster accounting at page invalidation time ext4: adjust reserved cluster count when removing extents ext4: reduce reserved cluster count by number of allocated clusters ext4: fix reserved cluster accounting at delayed write time ext4: add new pending reservation mechanism ...
-rw-r--r--Documentation/admin-guide/ext4.rst574
-rw-r--r--Documentation/admin-guide/index.rst1
-rw-r--r--Documentation/conf.py4
-rw-r--r--Documentation/filesystems/ext4/about.rst (renamed from Documentation/filesystems/ext4/ondisk/about.rst)0
-rw-r--r--Documentation/filesystems/ext4/allocators.rst (renamed from Documentation/filesystems/ext4/ondisk/allocators.rst)0
-rw-r--r--Documentation/filesystems/ext4/attributes.rst (renamed from Documentation/filesystems/ext4/ondisk/attributes.rst)8
-rw-r--r--Documentation/filesystems/ext4/bigalloc.rst (renamed from Documentation/filesystems/ext4/ondisk/bigalloc.rst)0
-rw-r--r--Documentation/filesystems/ext4/bitmaps.rst (renamed from Documentation/filesystems/ext4/ondisk/bitmaps.rst)0
-rw-r--r--Documentation/filesystems/ext4/blockgroup.rst (renamed from Documentation/filesystems/ext4/ondisk/blockgroup.rst)0
-rw-r--r--Documentation/filesystems/ext4/blockmap.rst (renamed from Documentation/filesystems/ext4/ondisk/blockmap.rst)0
-rw-r--r--Documentation/filesystems/ext4/blocks.rst (renamed from Documentation/filesystems/ext4/ondisk/blocks.rst)0
-rw-r--r--Documentation/filesystems/ext4/checksums.rst (renamed from Documentation/filesystems/ext4/ondisk/checksums.rst)2
-rw-r--r--Documentation/filesystems/ext4/directory.rst (renamed from Documentation/filesystems/ext4/ondisk/directory.rst)18
-rw-r--r--Documentation/filesystems/ext4/dynamic.rst (renamed from Documentation/filesystems/ext4/ondisk/dynamic.rst)0
-rw-r--r--Documentation/filesystems/ext4/eainode.rst (renamed from Documentation/filesystems/ext4/ondisk/eainode.rst)0
-rw-r--r--Documentation/filesystems/ext4/ext4.rst613
-rw-r--r--Documentation/filesystems/ext4/globals.rst (renamed from Documentation/filesystems/ext4/ondisk/globals.rst)0
-rw-r--r--Documentation/filesystems/ext4/group_descr.rst (renamed from Documentation/filesystems/ext4/ondisk/group_descr.rst)4
-rw-r--r--Documentation/filesystems/ext4/ifork.rst (renamed from Documentation/filesystems/ext4/ondisk/ifork.rst)8
-rw-r--r--Documentation/filesystems/ext4/index.rst19
-rw-r--r--Documentation/filesystems/ext4/inlinedata.rst (renamed from Documentation/filesystems/ext4/ondisk/inlinedata.rst)0
-rw-r--r--Documentation/filesystems/ext4/inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/inodes.rst)19
-rw-r--r--Documentation/filesystems/ext4/journal.rst (renamed from Documentation/filesystems/ext4/ondisk/journal.rst)32
-rw-r--r--Documentation/filesystems/ext4/mmp.rst (renamed from Documentation/filesystems/ext4/ondisk/mmp.rst)2
-rw-r--r--Documentation/filesystems/ext4/ondisk/index.rst9
-rw-r--r--Documentation/filesystems/ext4/overview.rst (renamed from Documentation/filesystems/ext4/ondisk/overview.rst)0
-rw-r--r--Documentation/filesystems/ext4/special_inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/special_inodes.rst)2
-rw-r--r--Documentation/filesystems/ext4/super.rst (renamed from Documentation/filesystems/ext4/ondisk/super.rst)24
-rw-r--r--fs/ext4/acl.c4
-rw-r--r--fs/ext4/ext4.h17
-rw-r--r--fs/ext4/ext4_extents.h13
-rw-r--r--fs/ext4/extents.c595
-rw-r--r--fs/ext4/extents_status.c654
-rw-r--r--fs/ext4/extents_status.h80
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c142
-rw-r--r--fs/ext4/ioctl.c97
-rw-r--r--fs/ext4/mballoc.c14
-rw-r--r--fs/ext4/move_extent.c8
-rw-r--r--fs/ext4/namei.c2
-rw-r--r--fs/ext4/super.c81
-rw-r--r--fs/jbd2/checkpoint.c4
-rw-r--r--include/linux/buffer_head.h2
-rw-r--r--include/trace/events/ext4.h99
44 files changed, 1984 insertions, 1169 deletions
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
new file mode 100644
index 000000000000..e506d3dae510
--- /dev/null
+++ b/Documentation/admin-guide/ext4.rst
@@ -0,0 +1,574 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3========================
4ext4 General Information
5========================
6
7Ext4 is an advanced level of the ext3 filesystem which incorporates
8scalability and reliability enhancements for supporting large filesystems
9(64 bit) in keeping with increasing disk capacities and state-of-the-art
10feature requirements.
11
12Mailing list: linux-ext4@vger.kernel.org
13Web site: http://ext4.wiki.kernel.org
14
15
16Quick usage instructions
17========================
18
19Note: More extensive information for getting started with ext4 can be
20found at the ext4 wiki site at the URL:
21http://ext4.wiki.kernel.org/index.php/Ext4_Howto
22
23 - The latest version of e2fsprogs can be found at:
24
25 https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
26
27 or
28
29 http://sourceforge.net/project/showfiles.php?group_id=2406
30
31 or grab the latest git repository from:
32
33 https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
34
35 - Create a new filesystem using the ext4 filesystem type:
36
37 # mke2fs -t ext4 /dev/hda1
38
39 Or to configure an existing ext3 filesystem to support extents:
40
41 # tune2fs -O extents /dev/hda1
42
43 If the filesystem was created with 128 byte inodes, it can be
44 converted to use 256 byte for greater efficiency via:
45
46 # tune2fs -I 256 /dev/hda1
47
48 - Mounting:
49
50 # mount -t ext4 /dev/hda1 /wherever
51
52 - When comparing performance with other filesystems, it's always
53 important to try multiple workloads; very often a subtle change in a
54 workload parameter can completely change the ranking of which
55 filesystems do well compared to others. When comparing versus ext3,
56 note that ext4 enables write barriers by default, while ext3 does
57 not enable write barriers by default. So it is useful to
58 explicitly specify whether barriers are enabled or not via the
59 '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems
60 for a fair comparison. When tuning ext3 for best benchmark numbers,
61 it is often worthwhile to try changing the data journaling mode; '-o
62 data=writeback' can be faster for some workloads. (Note however that
63 running mounted with data=writeback can potentially leave stale data
64 exposed in recently written files in case of an unclean shutdown,
65 which could be a security exposure in some situations.) Configuring
66 the filesystem with a large journal can also be helpful for
67 metadata-intensive workloads.
68
69Features
70========
71
72Currently Available
73-------------------
74
75* ability to use filesystems > 16TB (e2fsprogs support not available yet)
76* extent format reduces metadata overhead (RAM, IO for access, transactions)
77* extent format more robust in face of on-disk corruption due to magics,
78 internal redundancy in tree
79* improved file allocation (multi-block alloc)
80* lift 32000 subdirectory limit imposed by i_links_count[1]
81* nsec timestamps for mtime, atime, ctime, create time
82* inode version field on disk (NFSv4, Lustre)
83* reduced e2fsck time via uninit_bg feature
84* journal checksumming for robustness, performance
85* persistent file preallocation (e.g. for streaming media, databases)
86* ability to pack bitmaps and inode tables into larger virtual groups via the
87 flex_bg feature
88* large file support
89* inode allocation using large virtual block groups via flex_bg
90* delayed allocation
91* large block (up to pagesize) support
92* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
93 the ordering)
94
95[1] Filesystems with a block size of 1k may see a limit imposed by the
96directory hash tree having a maximum depth of two.
97
98Options
99=======
100
101When mounting an ext4 filesystem, the following options are accepted:
102(*) == default
103
104 ro
105 Mount filesystem read only. Note that ext4 will replay the journal (and
106 thus write to the partition) even when mounted "read only". The mount
107 options "ro,noload" can be used to prevent writes to the filesystem.
108
109 journal_checksum
110 Enable checksumming of the journal transactions. This will allow the
111 recovery code in e2fsck and the kernel to detect corruption in the
112 journal. It is a compatible change and will be ignored by older
113 kernels.
114
115 journal_async_commit
116 Commit block can be written to disk without waiting for descriptor
117 blocks. If enabled older kernels cannot mount the device. This will
118 enable 'journal_checksum' internally.
119
120 journal_path=path, journal_dev=devnum
121 When the external journal device's major/minor numbers have changed,
122 these options allow the user to specify the new journal location. The
123 journal device is identified through either its new major/minor numbers
124 encoded in devnum, or via a path to the device.
125
126 norecovery, noload
127 Don't load the journal on mounting. Note that if the filesystem was
128 not unmounted cleanly, skipping the journal replay will lead to the
129 filesystem containing inconsistencies that can lead to any number of
130 problems.
131
132 data=journal
133 All data are committed into the journal prior to being written into the
134 main file system. Enabling this mode will disable delayed allocation
135 and O_DIRECT support.
136
137 data=ordered (*)
138 All data are forced directly out to the main file system prior to its
139 metadata being committed to the journal.
140
141 data=writeback
142 Data ordering is not preserved, data may be written into the main file
143 system after its metadata has been committed to the journal.
144
145 commit=nrsec (*)
146 Ext4 can be told to sync all its data and metadata every 'nrsec'
147 seconds. The default value is 5 seconds. This means that if you lose
148 your power, you will lose as much as the latest 5 seconds of work (your
149 filesystem will not be damaged though, thanks to the journaling). This
150 default value (or any low value) will hurt performance, but it's good
151 for data-safety. Setting it to 0 will have the same effect as leaving
152 it at the default (5 seconds). Setting it to very large values will
153 improve performance.
154
155 barrier=<0|1(*)>, barrier(*), nobarrier
156 This enables/disables the use of write barriers in the jbd code.
157 barrier=0 disables, barrier=1 enables. This also requires an IO stack
158 which can support barriers, and if jbd gets an error on a barrier
159 write, it will disable again with a warning. Write barriers enforce
160 proper on-disk ordering of journal commits, making volatile disk write
161 caches safe to use, at some performance penalty. If your disks are
162 battery-backed in one way or another, disabling barriers may safely
163 improve performance. The mount options "barrier" and "nobarrier" can
164 also be used to enable or disable barriers, for consistency with other
165 ext4 mount options.
166
167 inode_readahead_blks=n
168 This tuning parameter controls the maximum number of inode table blocks
169 that ext4's inode table readahead algorithm will pre-read into the
170 buffer cache. The default value is 32 blocks.
171
172 nouser_xattr
173 Disables Extended User Attributes. See the attr(5) manual page for
174 more information about extended attributes.
175
176 noacl
177 This option disables POSIX Access Control List support. If ACL support
178 is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL
179 is enabled by default on mount. See the acl(5) manual page for more
180 information about acl.
181
182 bsddf (*)
183 Make 'df' act like BSD.
184
185 minixdf
186 Make 'df' act like Minix.
187
188 debug
189 Extra debugging information is sent to syslog.
190
191 abort
192 Simulate the effects of calling ext4_abort() for debugging purposes.
193 This is normally used while remounting a filesystem which is already
194 mounted.
195
196 errors=remount-ro
197 Remount the filesystem read-only on an error.
198
199 errors=continue
200 Keep going on a filesystem error.
201
202 errors=panic
203 Panic and halt the machine if an error occurs. (These mount options
204 override the errors behavior specified in the superblock, which can be
205 configured using tune2fs)
206
207 data_err=ignore(*)
208 Just print an error message if an error occurs in a file data buffer in
209 ordered mode.
210 data_err=abort
211 Abort the journal if an error occurs in a file data buffer in ordered
212 mode.
213
214 grpid | bsdgroups
215 New objects have the group ID of their parent.
216
217 nogrpid (*) | sysvgroups
218 New objects have the group ID of their creator.
219
220 resgid=n
221 The group ID which may use the reserved blocks.
222
223 resuid=n
224 The user ID which may use the reserved blocks.
225
226 sb=
227 Use alternate superblock at this location.
228
229 quota, noquota, grpquota, usrquota
230 These options are ignored by the filesystem. They are used only by
231 quota tools to recognize volumes where quota should be turned on. See
232 documentation in the quota-tools package for more details
233 (http://sourceforge.net/projects/linuxquota).
234
235 jqfmt=<quota type>, usrjquota=<file>, grpjquota=<file>
236 These options tell filesystem details about quota so that quota
237 information can be properly updated during journal replay. They replace
238 the above quota options. See documentation in the quota-tools package
239 for more details (http://sourceforge.net/projects/linuxquota).
240
241 stripe=n
242 Number of filesystem blocks that mballoc will try to use for allocation
243 size and alignment. For RAID5/6 systems this should be the number of
244 data disks * RAID chunk size in file system blocks.
245
246 delalloc (*)
247 Defer block allocation until just before ext4 writes out the block(s)
248 in question. This allows ext4 to make better allocation decisions more
249 efficiently.
250
251 nodelalloc
252 Disable delayed allocation. Blocks are allocated when the data is
253 copied from userspace to the page cache, either via the write(2) system
254 call or when an mmap'ed page which was previously unallocated is
255 written for the first time.
256
257 max_batch_time=usec
258 Maximum amount of time ext4 should wait for additional filesystem
259 operations to be batched together with a synchronous write operation.
260 Since a synchronous write operation is going to force a commit and then
261 a wait for the I/O to complete, it doesn't cost much, and can be a huge
262 throughput win, we wait for a small amount of time to see if any other
263 transactions can piggyback on the synchronous write. The algorithm
264 used is designed to automatically tune for the speed of the disk, by
265 measuring the amount of time (on average) that it takes to finish
266 committing a transaction. Call this time the "commit time". If the
267 time that the transaction has been running is less than the commit
268 time, ext4 will try sleeping for the commit time to see if other
269 operations will join the transaction. The commit time is capped by
270 the max_batch_time, which defaults to 15000us (15ms). This
271 optimization can be turned off entirely by setting max_batch_time to 0.
272
273 min_batch_time=usec
274 This parameter sets the commit time (as described above) to be at least
275 min_batch_time. It defaults to zero microseconds. Increasing this
276 parameter may improve the throughput of multi-threaded, synchronous
277 workloads on very fast disks, at the cost of increasing latency.
278
279 journal_ioprio=prio
280 The I/O priority (from 0 to 7, where 0 is the highest priority) which
281 should be used for I/O operations submitted by kjournald2 during a
282 commit operation. This defaults to 3, which is a slightly higher
283 priority than the default I/O priority.
284
285 auto_da_alloc(*), noauto_da_alloc
286 Many broken applications don't use fsync() when replacing existing
287 files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/
288 rename("foo.new", "foo"), or worse yet, fd = open("foo",
289 O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4
290 will detect the replace-via-rename and replace-via-truncate patterns
291 and force that any delayed allocation blocks are allocated such that at
292 the next journal commit, in the default data=ordered mode, the data
293 blocks of the new file are forced to disk before the rename() operation
294 is committed. This provides roughly the same level of guarantees as
295 ext3, and avoids the "zero-length" problem that can happen when a
296 system crashes before the delayed allocation blocks are forced to disk.
297
298 noinit_itable
299 Do not initialize any uninitialized inode table blocks in the
300 background. This feature may be used by installation CD's so that the
301 install process can complete as quickly as possible; the inode table
302 initialization process would then be deferred until the next time the
303 file system is unmounted.
304
305 init_itable=n
306 The lazy itable init code will wait n times the number of milliseconds
307 it took to zero out the previous block group's inode table. This
308 minimizes the impact on the system performance while file system's
309 inode table is being initialized.
310
311 discard, nodiscard(*)
312 Controls whether ext4 should issue discard/TRIM commands to the
313 underlying block device when blocks are freed. This is useful for SSD
314 devices and sparse/thinly-provisioned LUNs, but it is off by default
315 until sufficient testing has been done.
316
317 nouid32
318 Disables 32-bit UIDs and GIDs. This is for interoperability with
319 older kernels which only store and expect 16-bit values.
320
321 block_validity(*), noblock_validity
322 These options enable or disable the in-kernel facility for tracking
323 filesystem metadata blocks within internal data structures. This
324 allows the multi-block allocator and other routines to notice bugs or
325 corrupted allocation bitmaps which cause blocks to be allocated which
326 overlap with filesystem metadata blocks.
327
328 dioread_lock, dioread_nolock
329 Controls whether or not ext4 should use the DIO read locking. If the
330 dioread_nolock option is specified ext4 will allocate uninitialized
331 extent before buffer write and convert the extent to initialized after
332 IO completes. This approach allows ext4 code to avoid using inode
333 mutex, which improves scalability on high speed storages. However this
334 does not work with data journaling and dioread_nolock option will be
335 ignored with kernel warning. Note that dioread_nolock code path is only
336 used for extent-based files. Because of the restrictions this option
337 comprises, it is off by default (i.e. dioread_lock).
338
339 max_dir_size_kb=n
340 This limits the size of directories so that any attempt to expand them
341 beyond the specified limit in kilobytes will cause an ENOSPC error.
342 This is useful in memory constrained environments, where a very large
343 directory can cause severe performance problems or even provoke the Out
344 Of Memory killer. (For example, if there is only 512mb memory
345 available, a 176mb directory may seriously cramp the system's style.)
346
347 i_version
348 Enable 64-bit inode version support. This option is off by default.
349
350 dax
351 Use direct access (no page cache). See
352 Documentation/filesystems/dax.txt. Note that this option is
353 incompatible with data=journal.
354
355Data Mode
356=========
357There are 3 different data modes:
358
359* writeback mode
360
361 In data=writeback mode, ext4 does not journal data at all. This mode provides
362 a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
363 mode - metadata journaling. A crash+recovery can cause incorrect data to
364 appear in files which were written shortly before the crash. This mode will
365 typically provide the best ext4 performance.
366
367* ordered mode
368
369 In data=ordered mode, ext4 only officially journals metadata, but it logically
370 groups metadata information related to data changes with the data blocks into
371 a single unit called a transaction. When it's time to write the new metadata
372 out to disk, the associated data blocks are written first. In general, this
373 mode performs slightly slower than writeback but significantly faster than
374 journal mode.
375
376* journal mode
377
378 data=journal mode provides full data and metadata journaling. All new data is
379 written to the journal first, and then to its final location. In the event of
380 a crash, the journal can be replayed, bringing both data and metadata into a
381 consistent state. This mode is the slowest except when data needs to be read
382 from and written to disk at the same time where it outperforms all other
383 modes. Enabling this mode will disable delayed allocation and O_DIRECT
384 support.
385
386/proc entries
387=============
388
389Information about mounted ext4 file systems can be found in
390/proc/fs/ext4. Each mounted filesystem will have a directory in
391/proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or
392/proc/fs/ext4/dm-0). The files in each per-device directory are shown
393in the table below.
394
395Files in /proc/fs/ext4/<devname>
396
397 mb_groups
398 details of multiblock allocator buddy cache of free blocks
399
400/sys entries
401============
402
403Information about mounted ext4 file systems can be found in
404/sys/fs/ext4. Each mounted filesystem will have a directory in
405/sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or
406/sys/fs/ext4/dm-0). The files in each per-device directory are shown
407in the table below.
408
409Files in /sys/fs/ext4/<devname>:
410
411(see also Documentation/ABI/testing/sysfs-fs-ext4)
412
413 delayed_allocation_blocks
414 This file is read-only and shows the number of blocks that are dirty in
415 the page cache, but which do not have their location in the filesystem
416 allocated yet.
417
418 inode_goal
419 Tuning parameter which (if non-zero) controls the goal inode used by
420 the inode allocator in preference to all other allocation heuristics.
421 This is intended for debugging use only, and should be 0 on production
422 systems.
423
424 inode_readahead_blks
425 Tuning parameter which controls the maximum number of inode table
426 blocks that ext4's inode table readahead algorithm will pre-read into
427 the buffer cache.
428
429 lifetime_write_kbytes
430 This file is read-only and shows the number of kilobytes of data that
431 have been written to this filesystem since it was created.
432
433 max_writeback_mb_bump
434 The maximum number of megabytes the writeback code will try to write
435 out before moving on to another inode.
436
437 mb_group_prealloc
438 The multiblock allocator will round up allocation requests to a
439 multiple of this tuning parameter if the stripe size is not set in the
440 ext4 superblock
441
442 mb_max_to_scan
443 The maximum number of extents the multiblock allocator will search to
444 find the best extent.
445
446 mb_min_to_scan
447 The minimum number of extents the multiblock allocator will search to
448 find the best extent.
449
450 mb_order2_req
451 Tuning parameter which controls the minimum size for requests (as a
452 power of 2) where the buddy cache is used.
453
454 mb_stats
455 Controls whether the multiblock allocator should collect statistics,
456 which are shown during the unmount. 1 means to collect statistics, 0
457 means not to collect statistics.
458
459 mb_stream_req
460 Files which have fewer blocks than this tunable parameter will have
461 their blocks allocated out of a block group specific preallocation
462 pool, so that small files are packed closely together. Each large file
463 will have its blocks allocated out of its own unique preallocation
464 pool.
465
466 session_write_kbytes
467 This file is read-only and shows the number of kilobytes of data that
468 have been written to this filesystem since it was mounted.
469
470 reserved_clusters
471 This is a RW file and contains the number of reserved clusters in the
472 file system which will be used in specific situations to avoid costly
473 zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or
474 4096 clusters, whichever is smaller, and this can be changed; however, it
475 can never exceed the number of clusters in the file system. If there is
476 not enough space for the reserved clusters when mounting the file
477 system, the mount will _not_ fail.
478
479Ioctls
480======
481
482There is some Ext4 specific functionality which can be accessed by applications
483through the system call interfaces. The list of all Ext4 specific ioctls is
484shown in the table below.
485
486Table of Ext4 specific ioctls
487
488 EXT4_IOC_GETFLAGS
489 Get additional attributes associated with inode. The ioctl argument is
490 an integer bitfield, with bit values described in ext4.h. This ioctl is
491 an alias for FS_IOC_GETFLAGS.
492
493 EXT4_IOC_SETFLAGS
494 Set additional attributes associated with inode. The ioctl argument is
495 an integer bitfield, with bit values described in ext4.h. This ioctl is
496 an alias for FS_IOC_SETFLAGS.
497
498 EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD
499 Get the inode i_generation number stored for each inode. The
500 i_generation number is normally changed only when a new inode is created
501 and it is particularly useful for network filesystems. The '_OLD'
502 version of this ioctl is an alias for FS_IOC_GETVERSION.
503
504 EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD
505 Set the inode i_generation number stored for each inode. The '_OLD'
506 version of this ioctl is an alias for FS_IOC_SETVERSION.
507
508 EXT4_IOC_GROUP_EXTEND
509 This ioctl has the same purpose as the resize mount option. It allows
510 resizing the filesystem to the end of the last existing block group;
511 further resizing has to be done with resize2fs, either online or
512 offline. The argument points to the unsigned long number representing
513 the filesystem's new block count.
514
515 EXT4_IOC_MOVE_EXT
516 Move the block extents from orig_fd (the one this ioctl is pointing to)
517 to the donor_fd (the one specified in move_extent structure passed as
518 an argument to this ioctl). Then, exchange inode metadata between
519 orig_fd and donor_fd. This is especially useful for online
520 defragmentation, because the allocator has the opportunity to allocate
521 moved blocks better, ideally into one contiguous extent.
522
523 EXT4_IOC_GROUP_ADD
524 Add a new group descriptor to an existing or new group descriptor
525 block. The new group descriptor is described by ext4_new_group_input
526 structure, which is passed as an argument to this ioctl. This is
527 especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which
528 allows online resize of the filesystem to the end of the last existing
529 block group. Those two ioctls combined are used in the userspace online
530 resize tool (e.g. resize2fs).
531
532 EXT4_IOC_MIGRATE
533 This ioctl operates on the filesystem itself. It converts (migrates)
534 ext3 indirect block mapped inode to ext4 extent mapped inode by walking
535 through indirect block mapping of the original inode and converting
536 contiguous block ranges into ext4 extents of the temporary inode. Then,
537 inodes are swapped. This ioctl might help when migrating from an ext3 to
538 an ext4 filesystem; however, the suggestion is to create a fresh ext4
539 filesystem and copy data from the backup. Note that the filesystem has to
540 support extents for this ioctl to work.
541
542 EXT4_IOC_ALLOC_DA_BLKS
543 Force all of the delay allocated blocks to be allocated to preserve
544 application-expected ext3 behaviour. Note that this will also start
545 triggering a write of the data blocks, but this behaviour may change in
546 the future as it is not necessary and has been done this way only for
547 sake of simplicity.
548
549 EXT4_IOC_RESIZE_FS
550 Resize the filesystem to a new size. The number of blocks of resized
551 filesystem is passed in via 64 bit integer argument. The kernel
552 allocates bitmaps and inode table, the userspace tool thus just passes
553 the new number of blocks.
554
555 EXT4_IOC_SWAP_BOOT
556 Swap i_blocks and associated attributes (like i_blocks, i_size,
557 i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO
558 (#5). This is typically used to store a boot loader in a secure part of
559 the filesystem, where it can't be changed by a normal user by accident.
560 The data blocks of the previous boot loader will be associated with the
561 given inode.
562
563References
564==========
565
566kernel source: <file:fs/ext4/>
567 <file:fs/jbd2/>
568
569programs: http://e2fsprogs.sourceforge.net/
570
571useful links: http://fedoraproject.org/wiki/ext3-devel
572 http://www.bullopensource.org/ext4/
573 http://ext4.wiki.kernel.org/index.php/Main_Page
574 http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 0873685bab0f..965745d5fb9a 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -71,6 +71,7 @@ configure specific aspects of kernel behavior to your liking.
71 java 71 java
72 ras 72 ras
73 bcache 73 bcache
74 ext4
74 pm/index 75 pm/index
75 thunderbolt 76 thunderbolt
76 LSM/index 77 LSM/index
diff --git a/Documentation/conf.py b/Documentation/conf.py
index b691af4831fa..ede67ccafc29 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -383,6 +383,10 @@ latex_documents = [
383 'The kernel development community', 'manual'), 383 'The kernel development community', 'manual'),
384 ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', 384 ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API',
385 'The kernel development community', 'manual'), 385 'The kernel development community', 'manual'),
386 ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide',
387 'ext4 Community', 'manual'),
388 ('filesystems/ext4/index', 'ext4-data-structures.tex',
389 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'),
386 ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', 390 ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide',
387 'The kernel development community', 'manual'), 391 'The kernel development community', 'manual'),
388 ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', 392 ('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/about.rst
index 0aadba052264..0aadba052264 100644
--- a/Documentation/filesystems/ext4/ondisk/about.rst
+++ b/Documentation/filesystems/ext4/about.rst
diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/allocators.rst
index 7aa85152ace3..7aa85152ace3 100644
--- a/Documentation/filesystems/ext4/ondisk/allocators.rst
+++ b/Documentation/filesystems/ext4/allocators.rst
diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/attributes.rst
index 0b01b67b81fe..54386a010a8d 100644
--- a/Documentation/filesystems/ext4/ondisk/attributes.rst
+++ b/Documentation/filesystems/ext4/attributes.rst
@@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header
30``ext4_xattr_ibody_header`` that is 4 bytes long: 30``ext4_xattr_ibody_header`` that is 4 bytes long:
31 31
32.. list-table:: 32.. list-table::
33 :widths: 1 1 1 77 33 :widths: 8 8 24 40
34 :header-rows: 1 34 :header-rows: 1
35 35
36 * - Offset 36 * - Offset
@@ -47,7 +47,7 @@ The beginning of an extended attribute block is in
47``struct ext4_xattr_header``, which is 32 bytes long: 47``struct ext4_xattr_header``, which is 32 bytes long:
48 48
49.. list-table:: 49.. list-table::
50 :widths: 1 1 1 77 50 :widths: 8 8 24 40
51 :header-rows: 1 51 :header-rows: 1
52 52
53 * - Offset 53 * - Offset
@@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is
92Attributes stored inside an inode do not need be stored in sorted order. 92Attributes stored inside an inode do not need be stored in sorted order.
93 93
94.. list-table:: 94.. list-table::
95 :widths: 1 1 1 77 95 :widths: 8 8 24 40
96 :header-rows: 1 96 :header-rows: 1
97 97
98 * - Offset 98 * - Offset
@@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from
157the key name. Here is a map of name index values to key prefixes: 157the key name. Here is a map of name index values to key prefixes:
158 158
159.. list-table:: 159.. list-table::
160 :widths: 1 79 160 :widths: 16 64
161 :header-rows: 1 161 :header-rows: 1
162 162
163 * - Name Index 163 * - Name Index
diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst
index c6d88557553c..c6d88557553c 100644
--- a/Documentation/filesystems/ext4/ondisk/bigalloc.rst
+++ b/Documentation/filesystems/ext4/bigalloc.rst
diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst
index c7546dbc197a..c7546dbc197a 100644
--- a/Documentation/filesystems/ext4/ondisk/bitmaps.rst
+++ b/Documentation/filesystems/ext4/bitmaps.rst
diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst
index baf888e4c06a..baf888e4c06a 100644
--- a/Documentation/filesystems/ext4/ondisk/blockgroup.rst
+++ b/Documentation/filesystems/ext4/blockgroup.rst
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst
index 30e25750d88a..30e25750d88a 100644
--- a/Documentation/filesystems/ext4/ondisk/blockmap.rst
+++ b/Documentation/filesystems/ext4/blockmap.rst
diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/blocks.rst
index 73d4dc0f7bda..73d4dc0f7bda 100644
--- a/Documentation/filesystems/ext4/ondisk/blocks.rst
+++ b/Documentation/filesystems/ext4/blocks.rst
diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/checksums.rst
index 9d6a793b2e03..5519e253810d 100644
--- a/Documentation/filesystems/ext4/ondisk/checksums.rst
+++ b/Documentation/filesystems/ext4/checksums.rst
@@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes
28(crc32c as of October 2013) unless noted otherwise. 28(crc32c as of October 2013) unless noted otherwise.
29 29
30.. list-table:: 30.. list-table::
31 :widths: 1 1 4 31 :widths: 20 8 50
32 :header-rows: 1 32 :header-rows: 1
33 33
34 * - Metadata 34 * - Metadata
diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/directory.rst
index 8fcba68c2884..614034e24669 100644
--- a/Documentation/filesystems/ext4/ondisk/directory.rst
+++ b/Documentation/filesystems/ext4/directory.rst
@@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference
34``dirent.rec_len`` to know for sure. 34``dirent.rec_len`` to know for sure.
35 35
36.. list-table:: 36.. list-table::
37 :widths: 1 1 1 77 37 :widths: 8 8 24 40
38 :header-rows: 1 38 :header-rows: 1
39 39
40 * - Offset 40 * - Offset
@@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most
66``dirent.rec_len`` to know for sure. 66``dirent.rec_len`` to know for sure.
67 67
68.. list-table:: 68.. list-table::
69 :widths: 1 1 1 77 69 :widths: 8 8 24 40
70 :header-rows: 1 70 :header-rows: 1
71 71
72 * - Offset 72 * - Offset
@@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most
99The directory file type is one of the following values: 99The directory file type is one of the following values:
100 100
101.. list-table:: 101.. list-table::
102 :widths: 1 79 102 :widths: 16 64
103 :header-rows: 1 103 :header-rows: 1
104 104
105 * - Value 105 * - Value
@@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is
130``struct ext4_dir_entry_tail``: 130``struct ext4_dir_entry_tail``:
131 131
132.. list-table:: 132.. list-table::
133 :widths: 1 1 1 77 133 :widths: 8 8 24 40
134 :header-rows: 1 134 :header-rows: 1
135 135
136 * - Offset 136 * - Offset
@@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length
212of a data block: 212of a data block:
213 213
214.. list-table:: 214.. list-table::
215 :widths: 1 1 1 77 215 :widths: 8 8 24 40
216 :header-rows: 1 216 :header-rows: 1
217 217
218 * - Offset 218 * - Offset
@@ -305,7 +305,7 @@ of a data block:
305The directory hash is one of the following values: 305The directory hash is one of the following values:
306 306
307.. list-table:: 307.. list-table::
308 :widths: 1 79 308 :widths: 16 64
309 :header-rows: 1 309 :header-rows: 1
310 310
311 * - Value 311 * - Value
@@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is
327also the full length of a data block: 327also the full length of a data block:
328 328
329.. list-table:: 329.. list-table::
330 :widths: 1 1 1 77 330 :widths: 8 8 24 40
331 :header-rows: 1 331 :header-rows: 1
332 332
333 * - Offset 333 * - Offset
@@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and
375long: 375long:
376 376
377.. list-table:: 377.. list-table::
378 :widths: 1 1 1 77 378 :widths: 8 8 24 40
379 :header-rows: 1 379 :header-rows: 1
380 380
381 * - Offset 381 * - Offset
@@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum.
405The dx\_tail structure is 8 bytes long and looks like this: 405The dx\_tail structure is 8 bytes long and looks like this:
406 406
407.. list-table:: 407.. list-table::
408 :widths: 1 1 1 77 408 :widths: 8 8 24 40
409 :header-rows: 1 409 :header-rows: 1
410 410
411 * - Offset 411 * - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/dynamic.rst
index bb0c84333341..bb0c84333341 100644
--- a/Documentation/filesystems/ext4/ondisk/dynamic.rst
+++ b/Documentation/filesystems/ext4/dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/eainode.rst
index ecc0d01a0a72..ecc0d01a0a72 100644
--- a/Documentation/filesystems/ext4/ondisk/eainode.rst
+++ b/Documentation/filesystems/ext4/eainode.rst
diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst
deleted file mode 100644
index 9d4368d591fa..000000000000
--- a/Documentation/filesystems/ext4/ext4.rst
+++ /dev/null
@@ -1,613 +0,0 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3========================
4General Information
5========================
6
7Ext4 is an advanced level of the ext3 filesystem which incorporates
8scalability and reliability enhancements for supporting large filesystems
9(64 bit) in keeping with increasing disk capacities and state-of-the-art
10feature requirements.
11
12Mailing list: linux-ext4@vger.kernel.org
13Web site: http://ext4.wiki.kernel.org
14
15
16Quick usage instructions
17========================
18
19Note: More extensive information for getting started with ext4 can be
20found at the ext4 wiki site at the URL:
21http://ext4.wiki.kernel.org/index.php/Ext4_Howto
22
23 - The latest version of e2fsprogs can be found at:
24
25 https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
26
27 or
28
29 http://sourceforge.net/project/showfiles.php?group_id=2406
30
31 or grab the latest git repository from:
32
33 https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
34
35 - Create a new filesystem using the ext4 filesystem type:
36
37 # mke2fs -t ext4 /dev/hda1
38
39 Or to configure an existing ext3 filesystem to support extents:
40
41 # tune2fs -O extents /dev/hda1
42
43 If the filesystem was created with 128 byte inodes, it can be
44 converted to use 256 byte for greater efficiency via:
45
46 # tune2fs -I 256 /dev/hda1
47
48 - Mounting:
49
50 # mount -t ext4 /dev/hda1 /wherever
51
52 - When comparing performance with other filesystems, it's always
53 important to try multiple workloads; very often a subtle change in a
54 workload parameter can completely change the ranking of which
55 filesystems do well compared to others. When comparing versus ext3,
56 note that ext4 enables write barriers by default, while ext3 does
57 not enable write barriers by default. So it is useful to use
58 explicitly specify whether barriers are enabled or not when via the
59 '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems
60 for a fair comparison. When tuning ext3 for best benchmark numbers,
61 it is often worthwhile to try changing the data journaling mode; '-o
62 data=writeback' can be faster for some workloads. (Note however that
63 running mounted with data=writeback can potentially leave stale data
64 exposed in recently written files in case of an unclean shutdown,
65 which could be a security exposure in some situations.) Configuring
66 the filesystem with a large journal can also be helpful for
67 metadata-intensive workloads.
68
69Features
70========
71
72Currently Available
73-------------------
74
75* ability to use filesystems > 16TB (e2fsprogs support not available yet)
76* extent format reduces metadata overhead (RAM, IO for access, transactions)
77* extent format more robust in face of on-disk corruption due to magics,
78* internal redundancy in tree
79* improved file allocation (multi-block alloc)
80* lift 32000 subdirectory limit imposed by i_links_count[1]
81* nsec timestamps for mtime, atime, ctime, create time
82* inode version field on disk (NFSv4, Lustre)
83* reduced e2fsck time via uninit_bg feature
84* journal checksumming for robustness, performance
85* persistent file preallocation (e.g for streaming media, databases)
86* ability to pack bitmaps and inode tables into larger virtual groups via the
87 flex_bg feature
88* large file support
89* inode allocation using large virtual block groups via flex_bg
90* delayed allocation
91* large block (up to pagesize) support
92* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
93 the ordering)
94
95[1] Filesystems with a block size of 1k may see a limit imposed by the
96directory hash tree having a maximum depth of two.
97
98Options
99=======
100
101When mounting an ext4 filesystem, the following options are accepted:
102(*) == default
103
104======================= =======================================================
105Mount Option Description
106======================= =======================================================
107ro Mount filesystem read only. Note that ext4 will
108 replay the journal (and thus write to the
109 partition) even when mounted "read only". The
110 mount options "ro,noload" can be used to prevent
111 writes to the filesystem.
112
113journal_checksum Enable checksumming of the journal transactions.
114 This will allow the recovery code in e2fsck and the
115 kernel to detect corruption in the kernel. It is a
116 compatible change and will be ignored by older kernels.
117
118journal_async_commit Commit block can be written to disk without waiting
119 for descriptor blocks. If enabled older kernels cannot
120 mount the device. This will enable 'journal_checksum'
121 internally.
122
123journal_path=path
124journal_dev=devnum When the external journal device's major/minor numbers
125 have changed, these options allow the user to specify
126 the new journal location. The journal device is
127 identified through either its new major/minor numbers
128 encoded in devnum, or via a path to the device.
129
130norecovery Don't load the journal on mounting. Note that
131noload if the filesystem was not unmounted cleanly,
132 skipping the journal replay will lead to the
133 filesystem containing inconsistencies that can
134 lead to any number of problems.
135
136data=journal All data are committed into the journal prior to being
137 written into the main file system. Enabling
138 this mode will disable delayed allocation and
139 O_DIRECT support.
140
141data=ordered (*) All data are forced directly out to the main file
142 system prior to its metadata being committed to the
143 journal.
144
145data=writeback Data ordering is not preserved, data may be written
146 into the main file system after its metadata has been
147 committed to the journal.
148
149commit=nrsec (*) Ext4 can be told to sync all its data and metadata
150 every 'nrsec' seconds. The default value is 5 seconds.
151 This means that if you lose your power, you will lose
152 as much as the latest 5 seconds of work (your
153 filesystem will not be damaged though, thanks to the
154 journaling). This default value (or any low value)
155 will hurt performance, but it's good for data-safety.
156 Setting it to 0 will have the same effect as leaving
157 it at the default (5 seconds).
158 Setting it to very large values will improve
159 performance.
160
161barrier=<0|1(*)> This enables/disables the use of write barriers in
162barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
163nobarrier This also requires an IO stack which can support
164 barriers, and if jbd gets an error on a barrier
165 write, it will disable again with a warning.
166 Write barriers enforce proper on-disk ordering
167 of journal commits, making volatile disk write caches
168 safe to use, at some performance penalty. If
169 your disks are battery-backed in one way or another,
170 disabling barriers may safely improve performance.
171 The mount options "barrier" and "nobarrier" can
172 also be used to enable or disable barriers, for
173 consistency with other ext4 mount options.
174
175inode_readahead_blks=n This tuning parameter controls the maximum
176 number of inode table blocks that ext4's inode
177 table readahead algorithm will pre-read into
178 the buffer cache. The default value is 32 blocks.
179
180nouser_xattr Disables Extended User Attributes. See the
181 attr(5) manual page for more information about
182 extended attributes.
183
184noacl This option disables POSIX Access Control List
185 support. If ACL support is enabled in the kernel
186 configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is
187 enabled by default on mount. See the acl(5) manual
188 page for more information about acl.
189
190bsddf (*) Make 'df' act like BSD.
191minixdf Make 'df' act like Minix.
192
193debug Extra debugging information is sent to syslog.
194
195abort Simulate the effects of calling ext4_abort() for
196 debugging purposes. This is normally used while
197 remounting a filesystem which is already mounted.
198
199errors=remount-ro Remount the filesystem read-only on an error.
200errors=continue Keep going on a filesystem error.
201errors=panic Panic and halt the machine if an error occurs.
202 (These mount options override the errors behavior
203 specified in the superblock, which can be configured
204 using tune2fs)
205
206data_err=ignore(*) Just print an error message if an error occurs
207 in a file data buffer in ordered mode.
208data_err=abort Abort the journal if an error occurs in a file
209 data buffer in ordered mode.
210
211grpid New objects have the group ID of their parent.
212bsdgroups
213
214nogrpid (*) New objects have the group ID of their creator.
215sysvgroups
216
217resgid=n The group ID which may use the reserved blocks.
218
219resuid=n The user ID which may use the reserved blocks.
220
221sb=n Use alternate superblock at this location.
222
223quota These options are ignored by the filesystem. They
224noquota are used only by quota tools to recognize volumes
225grpquota where quota should be turned on. See documentation
226usrquota in the quota-tools package for more details
227 (http://sourceforge.net/projects/linuxquota).
228
229jqfmt=<quota type> These options tell filesystem details about quota
230usrjquota=<file> so that quota information can be properly updated
231grpjquota=<file> during journal replay. They replace the above
232 quota options. See documentation in the quota-tools
233 package for more details
234 (http://sourceforge.net/projects/linuxquota).
235
236stripe=n Number of filesystem blocks that mballoc will try
237 to use for allocation size and alignment. For RAID5/6
238 systems this should be the number of data
239 disks * RAID chunk size in file system blocks.
240
241delalloc (*) Defer block allocation until just before ext4
242 writes out the block(s) in question. This
243 allows ext4 to make better allocation decisions
244 more efficiently.
245nodelalloc Disable delayed allocation. Blocks are allocated
246 when the data is copied from userspace to the
247 page cache, either via the write(2) system call
248 or when an mmap'ed page which was previously
249 unallocated is written for the first time.
250
251max_batch_time=usec Maximum amount of time ext4 should wait for
252 additional filesystem operations to be batched
253 together with a synchronous write operation.
254 Since a synchronous write operation is going to
255 force a commit and then a wait for the I/O
256 complete, it doesn't cost much, and can be a
257 huge throughput win, we wait for a small amount
258 of time to see if any other transactions can
259 piggyback on the synchronous write. The
260 algorithm used is designed to automatically tune
261 for the speed of the disk, by measuring the
262 amount of time (on average) that it takes to
263 finish committing a transaction. Call this time
264 the "commit time". If the time that the
265 transaction has been running is less than the
266 commit time, ext4 will try sleeping for the
267 commit time to see if other operations will join
268 the transaction. The commit time is capped by
269 the max_batch_time, which defaults to 15000us
270 (15ms). This optimization can be turned off
271 entirely by setting max_batch_time to 0.
272
273min_batch_time=usec This parameter sets the commit time (as
274 described above) to be at least min_batch_time.
275 It defaults to zero microseconds. Increasing
276 this parameter may improve the throughput of
277 multi-threaded, synchronous workloads on very
278 fast disks, at the cost of increasing latency.
279
280journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
281 highest priority) which should be used for I/O
282 operations submitted by kjournald2 during a
283 commit operation. This defaults to 3, which is
284 a slightly higher priority than the default I/O
285 priority.
286
287auto_da_alloc(*) Many broken applications don't use fsync() when
288noauto_da_alloc replacing existing files via patterns such as
289 fd = open("foo.new")/write(fd,..)/close(fd)/
290 rename("foo.new", "foo"), or worse yet,
291 fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
292 If auto_da_alloc is enabled, ext4 will detect
293 the replace-via-rename and replace-via-truncate
294 patterns and force that any delayed allocation
295 blocks are allocated such that at the next
296 journal commit, in the default data=ordered
297 mode, the data blocks of the new file are forced
298 to disk before the rename() operation is
299 committed. This provides roughly the same level
300 of guarantees as ext3, and avoids the
301 "zero-length" problem that can happen when a
302 system crashes before the delayed allocation
303 blocks are forced to disk.
304
305noinit_itable Do not initialize any uninitialized inode table
306 blocks in the background. This feature may be
307 used by installation CD's so that the install
308 process can complete as quickly as possible; the
309 inode table initialization process would then be
310 deferred until the next time the file system
311 is unmounted.
312
313init_itable=n The lazy itable init code will wait n times the
314 number of milliseconds it took to zero out the
315 previous block group's inode table. This
316 minimizes the impact on the system performance
317 while file system's inode table is being initialized.
318
319discard Controls whether ext4 should issue discard/TRIM
320nodiscard(*) commands to the underlying block device when
321 blocks are freed. This is useful for SSD devices
322 and sparse/thinly-provisioned LUNs, but it is off
323 by default until sufficient testing has been done.
324
325nouid32 Disables 32-bit UIDs and GIDs. This is for
326 interoperability with older kernels which only
327 store and expect 16-bit values.
328
329block_validity(*) These options enable or disable the in-kernel
330noblock_validity facility for tracking filesystem metadata blocks
331 within internal data structures. This allows multi-
332 block allocator and other routines to notice
333 bugs or corrupted allocation bitmaps which cause
334 blocks to be allocated which overlap with
335 filesystem metadata blocks.
336
337dioread_lock Controls whether or not ext4 should use the DIO read
338dioread_nolock locking. If the dioread_nolock option is specified
339 ext4 will allocate uninitialized extent before buffer
340 write and convert the extent to initialized after IO
341 completes. This approach allows ext4 code to avoid
342 using inode mutex, which improves scalability on high
343 speed storages. However this does not work with
344 data journaling and dioread_nolock option will be
345 ignored with kernel warning. Note that dioread_nolock
346 code path is only used for extent-based files.
347 Because of the restrictions this option comprises
348 it is off by default (i.e. dioread_lock).
349
350max_dir_size_kb=n This limits the size of directories so that any
351 attempt to expand them beyond the specified
352 limit in kilobytes will cause an ENOSPC error.
353 This is useful in memory constrained
354 environments, where a very large directory can
355 cause severe performance problems or even
356 provoke the Out Of Memory killer. (For example,
357 if there is only 512mb memory available, a 176mb
358 directory may seriously cramp the system's style.)
359
360i_version Enable 64-bit inode version support. This option is
361 off by default.
362
363dax Use direct access (no page cache). See
364 Documentation/filesystems/dax.txt. Note that
365 this option is incompatible with data=journal.
366======================= =======================================================
367
368Data Mode
369=========
370There are 3 different data modes:
371
372* writeback mode
373
374 In data=writeback mode, ext4 does not journal data at all. This mode provides
375 a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
376 mode - metadata journaling. A crash+recovery can cause incorrect data to
377 appear in files which were written shortly before the crash. This mode will
378 typically provide the best ext4 performance.
379
380* ordered mode
381
382 In data=ordered mode, ext4 only officially journals metadata, but it logically
383 groups metadata information related to data changes with the data blocks into
384 a single unit called a transaction. When it's time to write the new metadata
385 out to disk, the associated data blocks are written first. In general, this
386 mode performs slightly slower than writeback but significantly faster than
387 journal mode.
388
389* journal mode
390
391 data=journal mode provides full data and metadata journaling. All new data is
392 written to the journal first, and then to its final location. In the event of
393 a crash, the journal can be replayed, bringing both data and metadata into a
394 consistent state. This mode is the slowest except when data needs to be read
395 from and written to disk at the same time where it outperforms all other
396 modes. Enabling this mode will disable delayed allocation and O_DIRECT
397 support.
398
399/proc entries
400=============
401
402Information about mounted ext4 file systems can be found in
403/proc/fs/ext4. Each mounted filesystem will have a directory in
404/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
405/proc/fs/ext4/dm-0). The files in each per-device directory are shown
406in table below.
407
408Files in /proc/fs/ext4/<devname>
409
410================ =======
411 File Content
412================ =======
413 mb_groups details of multiblock allocator buddy cache of free blocks
414================ =======
415
416/sys entries
417============
418
419Information about mounted ext4 file systems can be found in
420/sys/fs/ext4. Each mounted filesystem will have a directory in
421/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or
422/sys/fs/ext4/dm-0). The files in each per-device directory are shown
423in table below.
424
425Files in /sys/fs/ext4/<devname>:
426
427(see also Documentation/ABI/testing/sysfs-fs-ext4)
428
429============================= =================================================
430File Content
431============================= =================================================
432 delayed_allocation_blocks This file is read-only and shows the number of
433 blocks that are dirty in the page cache, but
434 which do not have their location in the
435 filesystem allocated yet.
436
437inode_goal Tuning parameter which (if non-zero) controls
438 the goal inode used by the inode allocator in
439 preference to all other allocation heuristics.
440 This is intended for debugging use only, and
441 should be 0 on production systems.
442
443inode_readahead_blks Tuning parameter which controls the maximum
444 number of inode table blocks that ext4's inode
445 table readahead algorithm will pre-read into
446 the buffer cache
447
448lifetime_write_kbytes This file is read-only and shows the number of
449 kilobytes of data that have been written to this
450 filesystem since it was created.
451
452 max_writeback_mb_bump The maximum number of megabytes the writeback
453 code will try to write out before moving on to
454 another inode.
455
456 mb_group_prealloc The multiblock allocator will round up allocation
457 requests to a multiple of this tuning parameter if
458 the stripe size is not set in the ext4 superblock
459
460 mb_max_to_scan The maximum number of extents the multiblock
461 allocator will search to find the best extent
462
463 mb_min_to_scan The minimum number of extents the multiblock
464 allocator will search to find the best extent
465
466 mb_order2_req Tuning parameter which controls the minimum size
467 for requests (as a power of 2) where the buddy
468 cache is used
469
470 mb_stats Controls whether the multiblock allocator should
471 collect statistics, which are shown during the
472 unmount. 1 means to collect statistics, 0 means
473 not to collect statistics
474
475 mb_stream_req Files which have fewer blocks than this tunable
476 parameter will have their blocks allocated out
477 of a block group specific preallocation pool, so
478 that small files are packed closely together.
479 Each large file will have its blocks allocated
480 out of its own unique preallocation pool.
481
482 session_write_kbytes This file is read-only and shows the number of
483 kilobytes of data that have been written to this
484 filesystem since it was mounted.
485
486 reserved_clusters This is a RW file and contains the number of reserved
487 clusters in the file system which will be used
488 in the specific situations to avoid costly
489 zeroout, unexpected ENOSPC, or possible data
490 loss. The default is 2% or 4096 clusters,
491 whichever is smaller and this can be changed
492 however it can never exceed number of clusters
493 in the file system. If there is not enough space
494 for the reserved space when mounting the file
495 system, the mount will _not_ fail.
496============================= =================================================
497
498Ioctls
499======
500
501There is some Ext4 specific functionality which can be accessed by applications
502through the system call interfaces. The list of all Ext4 specific ioctls is
503shown in the table below.
504
505Table of Ext4 specific ioctls
506
507============================= =================================================
508Ioctl Description
509============================= =================================================
510 EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
511 The ioctl argument is an integer bitfield, with
512 bit values described in ext4.h. This ioctl is an
513 alias for FS_IOC_GETFLAGS.
514
515 EXT4_IOC_SETFLAGS Set additional attributes associated with inode.
516 The ioctl argument is an integer bitfield, with
517 bit values described in ext4.h. This ioctl is an
518 alias for FS_IOC_SETFLAGS.
519
520 EXT4_IOC_GETVERSION
521 EXT4_IOC_GETVERSION_OLD
522 Get the inode i_generation number stored for
523 each inode. The i_generation number is normally
524 changed only when new inode is created and it is
525 particularly useful for network filesystems. The
526 '_OLD' version of this ioctl is an alias for
527 FS_IOC_GETVERSION.
528
529 EXT4_IOC_SETVERSION
530 EXT4_IOC_SETVERSION_OLD
531 Set the inode i_generation number stored for
532 each inode. The '_OLD' version of this ioctl
533 is an alias for FS_IOC_SETVERSION.
534
535 EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize
536 mount option. It allows to resize filesystem
537 to the end of the last existing block group,
538 further resize has to be done with resize2fs,
539 either online, or offline. The argument points
540 to the unsigned long number representing the
541 filesystem new block count.
542
543 EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one
544 this ioctl is pointing to) to the donor_fd (the
545 one specified in move_extent structure passed
546 as an argument to this ioctl). Then, exchange
547 inode metadata between orig_fd and donor_fd.
548 This is especially useful for online
549 defragmentation, because the allocator has the
550 opportunity to allocate moved blocks better,
551 ideally into one contiguous extent.
552
553 EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or
554 new group descriptor block. The new group
555 descriptor is described by ext4_new_group_input
556 structure, which is passed as an argument to
557 this ioctl. This is especially useful in
558 conjunction with EXT4_IOC_GROUP_EXTEND,
559 which allows online resize of the filesystem
560 to the end of the last existing block group.
561 Those two ioctls combined are used in userspace
562 online resize tool (e.g. resize2fs).
563
564 EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself.
565 It converts (migrates) ext3 indirect block mapped
566 inode to ext4 extent mapped inode by walking
567 through indirect block mapping of the original
568 inode and converting contiguous block ranges
569 into ext4 extents of the temporary inode. Then,
570 inodes are swapped. This ioctl might help, when
571 migrating from ext3 to ext4 filesystem, however
572 suggestion is to create fresh ext4 filesystem
573 and copy data from the backup. Note, that
574 filesystem has to support extents for this ioctl
575 to work.
576
577 EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be
578 allocated to preserve application-expected ext3
579 behaviour. Note that this will also start
580 triggering a write of the data blocks, but this
581 behaviour may change in the future as it is
582 not necessary and has been done this way only
583 for sake of simplicity.
584
585 EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number
586 of blocks of resized filesystem is passed in via
587 64 bit integer argument. The kernel allocates
588 bitmaps and inode table, the userspace tool thus
589 just passes the new number of blocks.
590
591 EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
592 (like i_blocks, i_size, i_flags, ...) from
593 the specified inode with inode
594 EXT4_BOOT_LOADER_INO (#5). This is typically
595 used to store a boot loader in a secure part of
596 the filesystem, where it can't be changed by a
597 normal user by accident.
598 The data blocks of the previous boot loader
599 will be associated with the given inode.
600============================= =================================================
601
602References
603==========
604
605kernel source: <file:fs/ext4/>
606 <file:fs/jbd2/>
607
608programs: http://e2fsprogs.sourceforge.net/
609
610useful links: http://fedoraproject.org/wiki/ext3-devel
611 http://www.bullopensource.org/ext4/
612 http://ext4.wiki.kernel.org/index.php/Main_Page
613 http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/globals.rst
index 368bf7662b96..368bf7662b96 100644
--- a/Documentation/filesystems/ext4/ondisk/globals.rst
+++ b/Documentation/filesystems/ext4/globals.rst
diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst
index 759827e5d2cf..0f783ed88592 100644
--- a/Documentation/filesystems/ext4/ondisk/group_descr.rst
+++ b/Documentation/filesystems/ext4/group_descr.rst
@@ -43,7 +43,7 @@ entire bitmap.
43The block group descriptor is laid out in ``struct ext4_group_desc``. 43The block group descriptor is laid out in ``struct ext4_group_desc``.
44 44
45.. list-table:: 45.. list-table::
46 :widths: 1 1 1 77 46 :widths: 8 8 24 40
47 :header-rows: 1 47 :header-rows: 1
48 48
49 * - Offset 49 * - Offset
@@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``.
157Block group flags can be any combination of the following: 157Block group flags can be any combination of the following:
158 158
159.. list-table:: 159.. list-table::
160 :widths: 1 79 160 :widths: 16 64
161 :header-rows: 1 161 :header-rows: 1
162 162
163 * - Value 163 * - Value
diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ifork.rst
index 5dbe3b2b121a..b9816d5a896b 100644
--- a/Documentation/filesystems/ext4/ondisk/ifork.rst
+++ b/Documentation/filesystems/ext4/ifork.rst
@@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``,
68which is 12 bytes long: 68which is 12 bytes long:
69 69
70.. list-table:: 70.. list-table::
71 :widths: 1 1 1 77 71 :widths: 8 8 24 40
72 :header-rows: 1 72 :header-rows: 1
73 73
74 * - Offset 74 * - Offset
@@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are
104recorded as ``struct ext4_extent_idx``, and are 12 bytes long: 104recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
105 105
106.. list-table:: 106.. list-table::
107 :widths: 1 1 1 77 107 :widths: 8 8 24 40
108 :header-rows: 1 108 :header-rows: 1
109 109
110 * - Offset 110 * - Offset
@@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``,
134and are also 12 bytes long: 134and are also 12 bytes long:
135 135
136.. list-table:: 136.. list-table::
137 :widths: 1 1 1 77 137 :widths: 8 8 24 40
138 :header-rows: 1 138 :header-rows: 1
139 139
140 * - Offset 140 * - Offset
@@ -174,7 +174,7 @@ including) the checksum itself.
174``struct ext4_extent_tail`` is 4 bytes long: 174``struct ext4_extent_tail`` is 4 bytes long:
175 175
176.. list-table:: 176.. list-table::
177 :widths: 1 1 1 77 177 :widths: 8 8 24 40
178 :header-rows: 1 178 :header-rows: 1
179 179
180 * - Offset 180 * - Offset
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst
index 71121605558c..3be3e54d480d 100644
--- a/Documentation/filesystems/ext4/index.rst
+++ b/Documentation/filesystems/ext4/index.rst
@@ -1,17 +1,14 @@
1.. SPDX-License-Identifier: GPL-2.0 1.. SPDX-License-Identifier: GPL-2.0
2 2
3=============== 3===================================
4ext4 Filesystem 4ext4 Data Structures and Algorithms
5=============== 5===================================
6
7General usage and on-disk artifacts writen by ext4. More documentation may
8be ported from the wiki as time permits. This should be considered the
9canonical source of information as the details here have been reviewed by
10the ext4 community.
11 6
12.. toctree:: 7.. toctree::
13 :maxdepth: 5 8 :maxdepth: 6
14 :numbered: 9 :numbered:
15 10
16 ext4 11 about.rst
17 ondisk/index 12 overview.rst
13 globals.rst
14 dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst
index d1075178ce0b..d1075178ce0b 100644
--- a/Documentation/filesystems/ext4/ondisk/inlinedata.rst
+++ b/Documentation/filesystems/ext4/inlinedata.rst
diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/inodes.rst
index 655ce898f3f5..6bd35e506b6f 100644
--- a/Documentation/filesystems/ext4/ondisk/inodes.rst
+++ b/Documentation/filesystems/ext4/inodes.rst
@@ -29,8 +29,9 @@ and the inode structure itself.
29The inode table entry is laid out in ``struct ext4_inode``. 29The inode table entry is laid out in ``struct ext4_inode``.
30 30
31.. list-table:: 31.. list-table::
32 :widths: 1 1 1 77 32 :widths: 8 8 24 40
33 :header-rows: 1 33 :header-rows: 1
34 :class: longtable
34 35
35 * - Offset 36 * - Offset
36 - Size 37 - Size
@@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``.
176The ``i_mode`` value is a combination of the following flags: 177The ``i_mode`` value is a combination of the following flags:
177 178
178.. list-table:: 179.. list-table::
179 :widths: 1 79 180 :widths: 16 64
180 :header-rows: 1 181 :header-rows: 1
181 182
182 * - Value 183 * - Value
@@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags:
227The ``i_flags`` field is a combination of these values: 228The ``i_flags`` field is a combination of these values:
228 229
229.. list-table:: 230.. list-table::
230 :widths: 1 79 231 :widths: 16 64
231 :header-rows: 1 232 :header-rows: 1
232 233
233 * - Value 234 * - Value
@@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator:
314Linux: 315Linux:
315 316
316.. list-table:: 317.. list-table::
317 :widths: 1 1 1 77 318 :widths: 8 8 24 40
318 :header-rows: 1 319 :header-rows: 1
319 320
320 * - Offset 321 * - Offset
@@ -331,7 +332,7 @@ Linux:
331Hurd: 332Hurd:
332 333
333.. list-table:: 334.. list-table::
334 :widths: 1 1 1 77 335 :widths: 8 8 24 40
335 :header-rows: 1 336 :header-rows: 1
336 337
337 * - Offset 338 * - Offset
@@ -346,7 +347,7 @@ Hurd:
346Masix: 347Masix:
347 348
348.. list-table:: 349.. list-table::
349 :widths: 1 1 1 77 350 :widths: 8 8 24 40
350 :header-rows: 1 351 :header-rows: 1
351 352
352 * - Offset 353 * - Offset
@@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator:
365Linux: 366Linux:
366 367
367.. list-table:: 368.. list-table::
368 :widths: 1 1 1 77 369 :widths: 8 8 24 40
369 :header-rows: 1 370 :header-rows: 1
370 371
371 * - Offset 372 * - Offset
@@ -402,7 +403,7 @@ Linux:
402Hurd: 403Hurd:
403 404
404.. list-table:: 405.. list-table::
405 :widths: 1 1 1 77 406 :widths: 8 8 24 40
406 :header-rows: 1 407 :header-rows: 1
407 408
408 * - Offset 409 * - Offset
@@ -433,7 +434,7 @@ Hurd:
433Masix: 434Masix:
434 435
435.. list-table:: 436.. list-table::
436 :widths: 1 1 1 77 437 :widths: 8 8 24 40
437 :header-rows: 1 438 :header-rows: 1
438 439
439 * - Offset 440 * - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/journal.rst
index e7031af86876..ea613ee701f5 100644
--- a/Documentation/filesystems/ext4/ondisk/journal.rst
+++ b/Documentation/filesystems/ext4/journal.rst
@@ -48,7 +48,7 @@ Layout
48Generally speaking, the journal has this format: 48Generally speaking, the journal has this format:
49 49
50.. list-table:: 50.. list-table::
51 :widths: 1 1 78 51 :widths: 16 48 16
52 :header-rows: 1 52 :header-rows: 1
53 53
54 * - Superblock 54 * - Superblock
@@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the
76superblock. 76superblock.
77 77
78.. list-table:: 78.. list-table::
79 :widths: 1 1 1 1 76 79 :widths: 12 12 12 32 12
80 :header-rows: 1 80 :header-rows: 1
81 81
82 * - 1024 bytes of padding 82 * - 1024 bytes of padding
@@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header
98``struct journal_header_s``: 98``struct journal_header_s``:
99 99
100.. list-table:: 100.. list-table::
101 :widths: 1 1 1 77 101 :widths: 8 8 24 40
102 :header-rows: 1 102 :header-rows: 1
103 103
104 * - Offset 104 * - Offset
@@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header
124The journal block type can be any one of: 124The journal block type can be any one of:
125 125
126.. list-table:: 126.. list-table::
127 :widths: 1 79 127 :widths: 16 64
128 :header-rows: 1 128 :header-rows: 1
129 129
130 * - Value 130 * - Value
@@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``,
154which is 1024 bytes long: 154which is 1024 bytes long:
155 155
156.. list-table:: 156.. list-table::
157 :widths: 1 1 1 77 157 :widths: 8 8 24 40
158 :header-rows: 1 158 :header-rows: 1
159 159
160 * - Offset 160 * - Offset
@@ -264,7 +264,7 @@ which is 1024 bytes long:
264The journal compat features are any combination of the following: 264The journal compat features are any combination of the following:
265 265
266.. list-table:: 266.. list-table::
267 :widths: 1 79 267 :widths: 16 64
268 :header-rows: 1 268 :header-rows: 1
269 269
270 * - Value 270 * - Value
@@ -278,7 +278,7 @@ The journal compat features are any combination of the following:
278The journal incompat features are any combination of the following: 278The journal incompat features are any combination of the following:
279 279
280.. list-table:: 280.. list-table::
281 :widths: 1 79 281 :widths: 16 64
282 :header-rows: 1 282 :header-rows: 1
283 283
284 * - Value 284 * - Value
@@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the
306most likely choices. 306most likely choices.
307 307
308.. list-table:: 308.. list-table::
309 :widths: 1 79 309 :widths: 16 64
310 :header-rows: 1 310 :header-rows: 1
311 311
312 * - Value 312 * - Value
@@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway.
330Descriptor blocks consume at least 36 bytes, but use a full block: 330Descriptor blocks consume at least 36 bytes, but use a full block:
331 331
332.. list-table:: 332.. list-table::
333 :widths: 1 1 1 77 333 :widths: 8 8 24 40
334 :header-rows: 1 334 :header-rows: 1
335 335
336 * - Offset 336 * - Offset
@@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the
355following. The size is 16 or 32 bytes. 355following. The size is 16 or 32 bytes.
356 356
357.. list-table:: 357.. list-table::
358 :widths: 1 1 1 77 358 :widths: 8 8 24 40
359 :header-rows: 1 359 :header-rows: 1
360 360
361 * - Offset 361 * - Offset
@@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes.
400The journal tag flags are any combination of the following: 400The journal tag flags are any combination of the following:
401 401
402.. list-table:: 402.. list-table::
403 :widths: 1 79 403 :widths: 16 64
404 :header-rows: 1 404 :header-rows: 1
405 405
406 * - Value 406 * - Value
@@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the
421following. The size is 8, 12, 24, or 28 bytes: 421following. The size is 8, 12, 24, or 28 bytes:
422 422
423.. list-table:: 423.. list-table::
424 :widths: 1 1 1 77 424 :widths: 8 8 24 40
425 :header-rows: 1 425 :header-rows: 1
426 426
427 * - Offset 427 * - Offset
@@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
471``struct jbd2_journal_block_tail``, which looks like this: 471``struct jbd2_journal_block_tail``, which looks like this:
472 472
473.. list-table:: 473.. list-table::
474 :widths: 1 1 1 77 474 :widths: 8 8 24 40
475 :header-rows: 1 475 :header-rows: 1
476 476
477 * - Offset 477 * - Offset
@@ -513,7 +513,7 @@ Revocation blocks are described in
513length, but use a full block: 513length, but use a full block:
514 514
515.. list-table:: 515.. list-table::
516 :widths: 1 1 1 77 516 :widths: 8 8 24 40
517 :header-rows: 1 517 :header-rows: 1
518 518
519 * - Offset 519 * - Offset
@@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation
543block is a ``struct jbd2_journal_revoke_tail``, which has this format: 543block is a ``struct jbd2_journal_revoke_tail``, which has this format:
544 544
545.. list-table:: 545.. list-table::
546 :widths: 1 1 1 77 546 :widths: 8 8 24 40
547 :header-rows: 1 547 :header-rows: 1
548 548
549 * - Offset 549 * - Offset
@@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32
567bytes long (but uses a full block): 567bytes long (but uses a full block):
568 568
569.. list-table:: 569.. list-table::
570 :widths: 1 1 1 77 570 :widths: 8 8 24 40
571 :header-rows: 1 571 :header-rows: 1
572 572
573 * - Offset 573 * - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/mmp.rst
index b7d7a3137f80..25660981d93c 100644
--- a/Documentation/filesystems/ext4/ondisk/mmp.rst
+++ b/Documentation/filesystems/ext4/mmp.rst
@@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure.
32The MMP structure (``struct mmp_struct``) is as follows: 32The MMP structure (``struct mmp_struct``) is as follows:
33 33
34.. list-table:: 34.. list-table::
35 :widths: 1 1 1 77 35 :widths: 8 12 20 40
36 :header-rows: 1 36 :header-rows: 1
37 37
38 * - Offset 38 * - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst
deleted file mode 100644
index f7d082c3a435..000000000000
--- a/Documentation/filesystems/ext4/ondisk/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3==============================
4Data Structures and Algorithms
5==============================
6.. include:: about.rst
7.. include:: overview.rst
8.. include:: globals.rst
9.. include:: dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/overview.rst
index cbab18baba12..cbab18baba12 100644
--- a/Documentation/filesystems/ext4/ondisk/overview.rst
+++ b/Documentation/filesystems/ext4/overview.rst
diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst
index a82f70c9baeb..9061aabba827 100644
--- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst
+++ b/Documentation/filesystems/ext4/special_inodes.rst
@@ -6,7 +6,7 @@ Special inodes
6ext4 reserves some inode for special features, as follows: 6ext4 reserves some inode for special features, as follows:
7 7
8.. list-table:: 8.. list-table::
9 :widths: 1 79 9 :widths: 6 70
10 :header-rows: 1 10 :header-rows: 1
11 11
12 * - inode Number 12 * - inode Number
diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/super.rst
index 5f81dd87e0b9..04ff079a2acf 100644
--- a/Documentation/filesystems/ext4/ondisk/super.rst
+++ b/Documentation/filesystems/ext4/super.rst
@@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in
19``struct ext4_super_block``: 19``struct ext4_super_block``:
20 20
21.. list-table:: 21.. list-table::
22 :widths: 1 1 1 77 22 :widths: 8 8 24 40
23 :header-rows: 1 23 :header-rows: 1
24 24
25 * - Offset 25 * - Offset
@@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in
483The superblock state is some combination of the following: 483The superblock state is some combination of the following:
484 484
485.. list-table:: 485.. list-table::
486 :widths: 1 79 486 :widths: 8 72
487 :header-rows: 1 487 :header-rows: 1
488 488
489 * - Value 489 * - Value
@@ -500,7 +500,7 @@ The superblock state is some combination of the following:
500The superblock error policy is one of the following: 500The superblock error policy is one of the following:
501 501
502.. list-table:: 502.. list-table::
503 :widths: 1 79 503 :widths: 8 72
504 :header-rows: 1 504 :header-rows: 1
505 505
506 * - Value 506 * - Value
@@ -517,7 +517,7 @@ The superblock error policy is one of the following:
517The filesystem creator is one of the following: 517The filesystem creator is one of the following:
518 518
519.. list-table:: 519.. list-table::
520 :widths: 1 79 520 :widths: 8 72
521 :header-rows: 1 521 :header-rows: 1
522 522
523 * - Value 523 * - Value
@@ -538,7 +538,7 @@ The filesystem creator is one of the following:
538The superblock revision is one of the following: 538The superblock revision is one of the following:
539 539
540.. list-table:: 540.. list-table::
541 :widths: 1 79 541 :widths: 8 72
542 :header-rows: 1 542 :header-rows: 1
543 543
544 * - Value 544 * - Value
@@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the
556following: 556following:
557 557
558.. list-table:: 558.. list-table::
559 :widths: 1 79 559 :widths: 16 64
560 :header-rows: 1 560 :header-rows: 1
561 561
562 * - Value 562 * - Value
@@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the
595following: 595following:
596 596
597.. list-table:: 597.. list-table::
598 :widths: 1 79 598 :widths: 16 64
599 :header-rows: 1 599 :header-rows: 1
600 600
601 * - Value 601 * - Value
@@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of
647the following: 647the following:
648 648
649.. list-table:: 649.. list-table::
650 :widths: 1 79 650 :widths: 16 64
651 :header-rows: 1 651 :header-rows: 1
652 652
653 * - Value 653 * - Value
@@ -702,7 +702,7 @@ the following:
702The ``s_def_hash_version`` field is one of the following: 702The ``s_def_hash_version`` field is one of the following:
703 703
704.. list-table:: 704.. list-table::
705 :widths: 1 79 705 :widths: 8 72
706 :header-rows: 1 706 :header-rows: 1
707 707
708 * - Value 708 * - Value
@@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following:
725The ``s_default_mount_opts`` field is any combination of the following: 725The ``s_default_mount_opts`` field is any combination of the following:
726 726
727.. list-table:: 727.. list-table::
728 :widths: 1 79 728 :widths: 8 72
729 :header-rows: 1 729 :header-rows: 1
730 730
731 * - Value 731 * - Value
@@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following:
767The ``s_flags`` field is any combination of the following: 767The ``s_flags`` field is any combination of the following:
768 768
769.. list-table:: 769.. list-table::
770 :widths: 1 79 770 :widths: 8 72
771 :header-rows: 1 771 :header-rows: 1
772 772
773 * - Value 773 * - Value
@@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following:
784The ``s_encrypt_algos`` list can contain any of the following: 784The ``s_encrypt_algos`` list can contain any of the following:
785 785
786.. list-table:: 786.. list-table::
787 :widths: 1 79 787 :widths: 8 72
788 :header-rows: 1 788 :header-rows: 1
789 789
790 * - Value 790 * - Value
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index fb50f9aa6ead..c1d570ee1d9f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
284 error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, 284 error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
285 default_acl, XATTR_CREATE); 285 default_acl, XATTR_CREATE);
286 posix_acl_release(default_acl); 286 posix_acl_release(default_acl);
287 } else {
288 inode->i_default_acl = NULL;
287 } 289 }
288 if (acl) { 290 if (acl) {
289 if (!error) 291 if (!error)
290 error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, 292 error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
291 acl, XATTR_CREATE); 293 acl, XATTR_CREATE);
292 posix_acl_release(acl); 294 posix_acl_release(acl);
295 } else {
296 inode->i_acl = NULL;
293 } 297 }
294 return error; 298 return error;
295} 299}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index caff935fbeb8..12f90d48ba61 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -628,6 +628,7 @@ enum {
628#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 628#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
629#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 629#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
630#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 630#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
631#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040
631 632
632/* 633/*
633 * ioctl commands 634 * ioctl commands
@@ -1030,6 +1031,9 @@ struct ext4_inode_info {
1030 ext4_lblk_t i_da_metadata_calc_last_lblock; 1031 ext4_lblk_t i_da_metadata_calc_last_lblock;
1031 int i_da_metadata_calc_len; 1032 int i_da_metadata_calc_len;
1032 1033
1034 /* pending cluster reservations for bigalloc file systems */
1035 struct ext4_pending_tree i_pending_tree;
1036
1033 /* on-disk additional length */ 1037 /* on-disk additional length */
1034 __u16 i_extra_isize; 1038 __u16 i_extra_isize;
1035 1039
@@ -1401,7 +1405,8 @@ struct ext4_sb_info {
1401 u32 s_min_batch_time; 1405 u32 s_min_batch_time;
1402 struct block_device *journal_bdev; 1406 struct block_device *journal_bdev;
1403#ifdef CONFIG_QUOTA 1407#ifdef CONFIG_QUOTA
1404 char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ 1408 /* Names of quota files with journalled quota */
1409 char __rcu *s_qf_names[EXT4_MAXQUOTAS];
1405 int s_jquota_fmt; /* Format of quota to use */ 1410 int s_jquota_fmt; /* Format of quota to use */
1406#endif 1411#endif
1407 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ 1412 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
@@ -2483,10 +2488,11 @@ extern int ext4_writepage_trans_blocks(struct inode *);
2483extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2488extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
2484extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2489extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
2485 loff_t lstart, loff_t lend); 2490 loff_t lstart, loff_t lend);
2486extern int ext4_page_mkwrite(struct vm_fault *vmf); 2491extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
2487extern int ext4_filemap_fault(struct vm_fault *vmf); 2492extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
2488extern qsize_t *ext4_get_reserved_space(struct inode *inode); 2493extern qsize_t *ext4_get_reserved_space(struct inode *inode);
2489extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); 2494extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
2495extern void ext4_da_release_space(struct inode *inode, int to_free);
2490extern void ext4_da_update_reserve_space(struct inode *inode, 2496extern void ext4_da_update_reserve_space(struct inode *inode,
2491 int used, int quota_claim); 2497 int used, int quota_claim);
2492extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, 2498extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
@@ -3142,10 +3148,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
3142 int flags); 3148 int flags);
3143extern void ext4_ext_drop_refs(struct ext4_ext_path *); 3149extern void ext4_ext_drop_refs(struct ext4_ext_path *);
3144extern int ext4_ext_check_inode(struct inode *inode); 3150extern int ext4_ext_check_inode(struct inode *inode);
3145extern int ext4_find_delalloc_range(struct inode *inode,
3146 ext4_lblk_t lblk_start,
3147 ext4_lblk_t lblk_end);
3148extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
3149extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); 3151extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
3150extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3152extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3151 __u64 start, __u64 len); 3153 __u64 start, __u64 len);
@@ -3156,6 +3158,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
3156 struct inode *inode2, ext4_lblk_t lblk1, 3158 struct inode *inode2, ext4_lblk_t lblk1,
3157 ext4_lblk_t lblk2, ext4_lblk_t count, 3159 ext4_lblk_t lblk2, ext4_lblk_t count,
3158 int mark_unwritten,int *err); 3160 int mark_unwritten,int *err);
3161extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
3159 3162
3160/* move_extent.c */ 3163/* move_extent.c */
3161extern void ext4_double_down_write_data_sem(struct inode *first, 3164extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index adf6668b596f..98bd0e9ee7df 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -120,6 +120,19 @@ struct ext4_ext_path {
120}; 120};
121 121
122/* 122/*
123 * Used to record a portion of a cluster found at the beginning or end
124 * of an extent while traversing the extent tree during space removal.
125 * A partial cluster may be removed if it does not contain blocks shared
126 * with extents that aren't being deleted (tofree state). Otherwise,
127 * it cannot be removed (nofree state).
128 */
129struct partial_cluster {
130 ext4_fsblk_t pclu; /* physical cluster number */
131 ext4_lblk_t lblk; /* logical block number within logical cluster */
132 enum {initial, tofree, nofree} state;
133};
134
135/*
123 * structure for external API 136 * structure for external API
124 */ 137 */
125 138
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 72a361d5ef74..240b6dea5441 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2351{ 2351{
2352 struct extent_status es; 2352 struct extent_status es;
2353 2353
2354 ext4_es_find_delayed_extent_range(inode, hole_start, 2354 ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2355 hole_start + hole_len - 1, &es); 2355 hole_start + hole_len - 1, &es);
2356 if (es.es_len) { 2356 if (es.es_len) {
2357 /* There's delayed extent containing lblock? */ 2357 /* There's delayed extent containing lblock? */
2358 if (es.es_lblk <= hole_start) 2358 if (es.es_lblk <= hole_start)
@@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode)
2490 return 0; 2490 return 0;
2491} 2491}
2492 2492
2493/*
2494 * ext4_rereserve_cluster - increment the reserved cluster count when
2495 * freeing a cluster with a pending reservation
2496 *
2497 * @inode - file containing the cluster
2498 * @lblk - logical block in cluster to be reserved
2499 *
2500 * Increments the reserved cluster count and adjusts quota in a bigalloc
2501 * file system when freeing a partial cluster containing at least one
2502 * delayed and unwritten block. A partial cluster meeting that
2503 * requirement will have a pending reservation. If so, the
2504 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2505 * defer reserved and allocated space accounting to a subsequent call
2506 * to this function.
2507 */
2508static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2509{
2510 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2511 struct ext4_inode_info *ei = EXT4_I(inode);
2512
2513 dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2514
2515 spin_lock(&ei->i_block_reservation_lock);
2516 ei->i_reserved_data_blocks++;
2517 percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2518 spin_unlock(&ei->i_block_reservation_lock);
2519
2520 percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2521 ext4_remove_pending(inode, lblk);
2522}
2523
2493static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2524static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2494 struct ext4_extent *ex, 2525 struct ext4_extent *ex,
2495 long long *partial_cluster, 2526 struct partial_cluster *partial,
2496 ext4_lblk_t from, ext4_lblk_t to) 2527 ext4_lblk_t from, ext4_lblk_t to)
2497{ 2528{
2498 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2529 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2499 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2530 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2500 ext4_fsblk_t pblk; 2531 ext4_fsblk_t last_pblk, pblk;
2501 int flags = get_default_free_blocks_flags(inode); 2532 ext4_lblk_t num;
2533 int flags;
2534
2535 /* only extent tail removal is allowed */
2536 if (from < le32_to_cpu(ex->ee_block) ||
2537 to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2538 ext4_error(sbi->s_sb,
2539 "strange request: removal(2) %u-%u from %u:%u",
2540 from, to, le32_to_cpu(ex->ee_block), ee_len);
2541 return 0;
2542 }
2543
2544#ifdef EXTENTS_STATS
2545 spin_lock(&sbi->s_ext_stats_lock);
2546 sbi->s_ext_blocks += ee_len;
2547 sbi->s_ext_extents++;
2548 if (ee_len < sbi->s_ext_min)
2549 sbi->s_ext_min = ee_len;
2550 if (ee_len > sbi->s_ext_max)
2551 sbi->s_ext_max = ee_len;
2552 if (ext_depth(inode) > sbi->s_depth_max)
2553 sbi->s_depth_max = ext_depth(inode);
2554 spin_unlock(&sbi->s_ext_stats_lock);
2555#endif
2556
2557 trace_ext4_remove_blocks(inode, ex, from, to, partial);
2502 2558
2503 /* 2559 /*
2504 * For bigalloc file systems, we never free a partial cluster 2560 * if we have a partial cluster, and it's different from the
2505 * at the beginning of the extent. Instead, we make a note 2561 * cluster of the last block in the extent, we free it
2506 * that we tried freeing the cluster, and check to see if we
2507 * need to free it on a subsequent call to ext4_remove_blocks,
2508 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2509 */ 2562 */
2510 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2563 last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2564
2565 if (partial->state != initial &&
2566 partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2567 if (partial->state == tofree) {
2568 flags = get_default_free_blocks_flags(inode);
2569 if (ext4_is_pending(inode, partial->lblk))
2570 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2571 ext4_free_blocks(handle, inode, NULL,
2572 EXT4_C2B(sbi, partial->pclu),
2573 sbi->s_cluster_ratio, flags);
2574 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2575 ext4_rereserve_cluster(inode, partial->lblk);
2576 }
2577 partial->state = initial;
2578 }
2579
2580 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2581 pblk = ext4_ext_pblock(ex) + ee_len - num;
2511 2582
2512 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
2513 /* 2583 /*
2514 * If we have a partial cluster, and it's different from the 2584 * We free the partial cluster at the end of the extent (if any),
2515 * cluster of the last block, we need to explicitly free the 2585 * unless the cluster is used by another extent (partial_cluster
2516 * partial cluster here. 2586 * state is nofree). If a partial cluster exists here, it must be
2587 * shared with the last block in the extent.
2517 */ 2588 */
2518 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2589 flags = get_default_free_blocks_flags(inode);
2519 if (*partial_cluster > 0 && 2590
2520 *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { 2591 /* partial, left end cluster aligned, right end unaligned */
2592 if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2593 (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2594 (partial->state != nofree)) {
2595 if (ext4_is_pending(inode, to))
2596 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2521 ext4_free_blocks(handle, inode, NULL, 2597 ext4_free_blocks(handle, inode, NULL,
2522 EXT4_C2B(sbi, *partial_cluster), 2598 EXT4_PBLK_CMASK(sbi, last_pblk),
2523 sbi->s_cluster_ratio, flags); 2599 sbi->s_cluster_ratio, flags);
2524 *partial_cluster = 0; 2600 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2601 ext4_rereserve_cluster(inode, to);
2602 partial->state = initial;
2603 flags = get_default_free_blocks_flags(inode);
2525 } 2604 }
2526 2605
2527#ifdef EXTENTS_STATS 2606 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2528 {
2529 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2530 spin_lock(&sbi->s_ext_stats_lock);
2531 sbi->s_ext_blocks += ee_len;
2532 sbi->s_ext_extents++;
2533 if (ee_len < sbi->s_ext_min)
2534 sbi->s_ext_min = ee_len;
2535 if (ee_len > sbi->s_ext_max)
2536 sbi->s_ext_max = ee_len;
2537 if (ext_depth(inode) > sbi->s_depth_max)
2538 sbi->s_depth_max = ext_depth(inode);
2539 spin_unlock(&sbi->s_ext_stats_lock);
2540 }
2541#endif
2542 if (from >= le32_to_cpu(ex->ee_block)
2543 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2544 /* tail removal */
2545 ext4_lblk_t num;
2546 long long first_cluster;
2547
2548 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2549 pblk = ext4_ext_pblock(ex) + ee_len - num;
2550 /*
2551 * Usually we want to free partial cluster at the end of the
2552 * extent, except for the situation when the cluster is still
2553 * used by any other extent (partial_cluster is negative).
2554 */
2555 if (*partial_cluster < 0 &&
2556 *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
2557 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2558 2607
2559 ext_debug("free last %u blocks starting %llu partial %lld\n", 2608 /*
2560 num, pblk, *partial_cluster); 2609 * For bigalloc file systems, we never free a partial cluster
2561 ext4_free_blocks(handle, inode, NULL, pblk, num, flags); 2610 * at the beginning of the extent. Instead, we check to see if we
2562 /* 2611 * need to free it on a subsequent call to ext4_remove_blocks,
2563 * If the block range to be freed didn't start at the 2612 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2564 * beginning of a cluster, and we removed the entire 2613 */
2565 * extent and the cluster is not used by any other extent, 2614 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2566 * save the partial cluster here, since we might need to 2615 ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2567 * delete if we determine that the truncate or punch hole 2616
2568 * operation has removed all of the blocks in the cluster. 2617 /* reset the partial cluster if we've freed past it */
2569 * If that cluster is used by another extent, preserve its 2618 if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2570 * negative value so it isn't freed later on. 2619 partial->state = initial;
2571 * 2620
2572 * If the whole extent wasn't freed, we've reached the 2621 /*
2573 * start of the truncated/punched region and have finished 2622 * If we've freed the entire extent but the beginning is not left
2574 * removing blocks. If there's a partial cluster here it's 2623 * cluster aligned and is not marked as ineligible for freeing we
2575 * shared with the remainder of the extent and is no longer 2624 * record the partial cluster at the beginning of the extent. It
2576 * a candidate for removal. 2625 * wasn't freed by the preceding ext4_free_blocks() call, and we
2577 */ 2626 * need to look farther to the left to determine if it's to be freed
2578 if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { 2627 * (not shared with another extent). Else, reset the partial
2579 first_cluster = (long long) EXT4_B2C(sbi, pblk); 2628 * cluster - we're either done freeing or the beginning of the
2580 if (first_cluster != -*partial_cluster) 2629 * extent is left cluster aligned.
2581 *partial_cluster = first_cluster; 2630 */
2582 } else { 2631 if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2583 *partial_cluster = 0; 2632 if (partial->state == initial) {
2633 partial->pclu = EXT4_B2C(sbi, pblk);
2634 partial->lblk = from;
2635 partial->state = tofree;
2584 } 2636 }
2585 } else 2637 } else {
2586 ext4_error(sbi->s_sb, "strange request: removal(2) " 2638 partial->state = initial;
2587 "%u-%u from %u:%u", 2639 }
2588 from, to, le32_to_cpu(ex->ee_block), ee_len); 2640
2589 return 0; 2641 return 0;
2590} 2642}
2591 2643
2592
2593/* 2644/*
2594 * ext4_ext_rm_leaf() Removes the extents associated with the 2645 * ext4_ext_rm_leaf() Removes the extents associated with the
2595 * blocks appearing between "start" and "end". Both "start" 2646 * blocks appearing between "start" and "end". Both "start"
@@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2608static int 2659static int
2609ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2660ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2610 struct ext4_ext_path *path, 2661 struct ext4_ext_path *path,
2611 long long *partial_cluster, 2662 struct partial_cluster *partial,
2612 ext4_lblk_t start, ext4_lblk_t end) 2663 ext4_lblk_t start, ext4_lblk_t end)
2613{ 2664{
2614 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2665 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2640 ex_ee_block = le32_to_cpu(ex->ee_block); 2691 ex_ee_block = le32_to_cpu(ex->ee_block);
2641 ex_ee_len = ext4_ext_get_actual_len(ex); 2692 ex_ee_len = ext4_ext_get_actual_len(ex);
2642 2693
2643 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2694 trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2644 2695
2645 while (ex >= EXT_FIRST_EXTENT(eh) && 2696 while (ex >= EXT_FIRST_EXTENT(eh) &&
2646 ex_ee_block + ex_ee_len > start) { 2697 ex_ee_block + ex_ee_len > start) {
@@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2671 */ 2722 */
2672 if (sbi->s_cluster_ratio > 1) { 2723 if (sbi->s_cluster_ratio > 1) {
2673 pblk = ext4_ext_pblock(ex); 2724 pblk = ext4_ext_pblock(ex);
2674 *partial_cluster = 2725 partial->pclu = EXT4_B2C(sbi, pblk);
2675 -(long long) EXT4_B2C(sbi, pblk); 2726 partial->state = nofree;
2676 } 2727 }
2677 ex--; 2728 ex--;
2678 ex_ee_block = le32_to_cpu(ex->ee_block); 2729 ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2714 if (err) 2765 if (err)
2715 goto out; 2766 goto out;
2716 2767
2717 err = ext4_remove_blocks(handle, inode, ex, partial_cluster, 2768 err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2718 a, b);
2719 if (err) 2769 if (err)
2720 goto out; 2770 goto out;
2721 2771
@@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2769 * If there's a partial cluster and at least one extent remains in 2819 * If there's a partial cluster and at least one extent remains in
2770 * the leaf, free the partial cluster if it isn't shared with the 2820 * the leaf, free the partial cluster if it isn't shared with the
2771 * current extent. If it is shared with the current extent 2821 * current extent. If it is shared with the current extent
2772 * we zero partial_cluster because we've reached the start of the 2822 * we reset the partial cluster because we've reached the start of the
2773 * truncated/punched region and we're done removing blocks. 2823 * truncated/punched region and we're done removing blocks.
2774 */ 2824 */
2775 if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { 2825 if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2776 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; 2826 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2777 if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { 2827 if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2828 int flags = get_default_free_blocks_flags(inode);
2829
2830 if (ext4_is_pending(inode, partial->lblk))
2831 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2778 ext4_free_blocks(handle, inode, NULL, 2832 ext4_free_blocks(handle, inode, NULL,
2779 EXT4_C2B(sbi, *partial_cluster), 2833 EXT4_C2B(sbi, partial->pclu),
2780 sbi->s_cluster_ratio, 2834 sbi->s_cluster_ratio, flags);
2781 get_default_free_blocks_flags(inode)); 2835 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2836 ext4_rereserve_cluster(inode, partial->lblk);
2782 } 2837 }
2783 *partial_cluster = 0; 2838 partial->state = initial;
2784 } 2839 }
2785 2840
2786 /* if this leaf is free, then we should 2841 /* if this leaf is free, then we should
@@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2819 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2874 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2820 int depth = ext_depth(inode); 2875 int depth = ext_depth(inode);
2821 struct ext4_ext_path *path = NULL; 2876 struct ext4_ext_path *path = NULL;
2822 long long partial_cluster = 0; 2877 struct partial_cluster partial;
2823 handle_t *handle; 2878 handle_t *handle;
2824 int i = 0, err = 0; 2879 int i = 0, err = 0;
2825 2880
2881 partial.pclu = 0;
2882 partial.lblk = 0;
2883 partial.state = initial;
2884
2826 ext_debug("truncate since %u to %u\n", start, end); 2885 ext_debug("truncate since %u to %u\n", start, end);
2827 2886
2828 /* probably first extent we're gonna free will be last in block */ 2887 /* probably first extent we're gonna free will be last in block */
@@ -2882,8 +2941,8 @@ again:
2882 */ 2941 */
2883 if (sbi->s_cluster_ratio > 1) { 2942 if (sbi->s_cluster_ratio > 1) {
2884 pblk = ext4_ext_pblock(ex) + end - ee_block + 2; 2943 pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
2885 partial_cluster = 2944 partial.pclu = EXT4_B2C(sbi, pblk);
2886 -(long long) EXT4_B2C(sbi, pblk); 2945 partial.state = nofree;
2887 } 2946 }
2888 2947
2889 /* 2948 /*
@@ -2911,9 +2970,10 @@ again:
2911 &ex); 2970 &ex);
2912 if (err) 2971 if (err)
2913 goto out; 2972 goto out;
2914 if (pblk) 2973 if (pblk) {
2915 partial_cluster = 2974 partial.pclu = EXT4_B2C(sbi, pblk);
2916 -(long long) EXT4_B2C(sbi, pblk); 2975 partial.state = nofree;
2976 }
2917 } 2977 }
2918 } 2978 }
2919 /* 2979 /*
@@ -2948,8 +3008,7 @@ again:
2948 if (i == depth) { 3008 if (i == depth) {
2949 /* this is leaf block */ 3009 /* this is leaf block */
2950 err = ext4_ext_rm_leaf(handle, inode, path, 3010 err = ext4_ext_rm_leaf(handle, inode, path,
2951 &partial_cluster, start, 3011 &partial, start, end);
2952 end);
2953 /* root level has p_bh == NULL, brelse() eats this */ 3012 /* root level has p_bh == NULL, brelse() eats this */
2954 brelse(path[i].p_bh); 3013 brelse(path[i].p_bh);
2955 path[i].p_bh = NULL; 3014 path[i].p_bh = NULL;
@@ -3021,21 +3080,24 @@ again:
3021 } 3080 }
3022 } 3081 }
3023 3082
3024 trace_ext4_ext_remove_space_done(inode, start, end, depth, 3083 trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
3025 partial_cluster, path->p_hdr->eh_entries); 3084 path->p_hdr->eh_entries);
3026 3085
3027 /* 3086 /*
3028 * If we still have something in the partial cluster and we have removed 3087 * if there's a partial cluster and we have removed the first extent
3029 * even the first extent, then we should free the blocks in the partial 3088 * in the file, then we also free the partial cluster, if any
3030 * cluster as well. (This code will only run when there are no leaves
3031 * to the immediate left of the truncated/punched region.)
3032 */ 3089 */
3033 if (partial_cluster > 0 && err == 0) { 3090 if (partial.state == tofree && err == 0) {
3034 /* don't zero partial_cluster since it's not used afterwards */ 3091 int flags = get_default_free_blocks_flags(inode);
3092
3093 if (ext4_is_pending(inode, partial.lblk))
3094 flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3035 ext4_free_blocks(handle, inode, NULL, 3095 ext4_free_blocks(handle, inode, NULL,
3036 EXT4_C2B(sbi, partial_cluster), 3096 EXT4_C2B(sbi, partial.pclu),
3037 sbi->s_cluster_ratio, 3097 sbi->s_cluster_ratio, flags);
3038 get_default_free_blocks_flags(inode)); 3098 if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3099 ext4_rereserve_cluster(inode, partial.lblk);
3100 partial.state = initial;
3039 } 3101 }
3040 3102
3041 /* TODO: flexible tree reduction should be here */ 3103 /* TODO: flexible tree reduction should be here */
@@ -3819,114 +3881,6 @@ out:
3819 return ext4_mark_inode_dirty(handle, inode); 3881 return ext4_mark_inode_dirty(handle, inode);
3820} 3882}
3821 3883
3822/**
3823 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3824 *
3825 * Return 1 if there is a delalloc block in the range, otherwise 0.
3826 */
3827int ext4_find_delalloc_range(struct inode *inode,
3828 ext4_lblk_t lblk_start,
3829 ext4_lblk_t lblk_end)
3830{
3831 struct extent_status es;
3832
3833 ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
3834 if (es.es_len == 0)
3835 return 0; /* there is no delay extent in this tree */
3836 else if (es.es_lblk <= lblk_start &&
3837 lblk_start < es.es_lblk + es.es_len)
3838 return 1;
3839 else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
3840 return 1;
3841 else
3842 return 0;
3843}
3844
3845int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3846{
3847 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3848 ext4_lblk_t lblk_start, lblk_end;
3849 lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
3850 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3851
3852 return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3853}
3854
3855/**
3856 * Determines how many complete clusters (out of those specified by the 'map')
3857 * are under delalloc and were reserved quota for.
3858 * This function is called when we are writing out the blocks that were
3859 * originally written with their allocation delayed, but then the space was
3860 * allocated using fallocate() before the delayed allocation could be resolved.
3861 * The cases to look for are:
3862 * ('=' indicated delayed allocated blocks
3863 * '-' indicates non-delayed allocated blocks)
3864 * (a) partial clusters towards beginning and/or end outside of allocated range
3865 * are not delalloc'ed.
3866 * Ex:
3867 * |----c---=|====c====|====c====|===-c----|
3868 * |++++++ allocated ++++++|
3869 * ==> 4 complete clusters in above example
3870 *
3871 * (b) partial cluster (outside of allocated range) towards either end is
3872 * marked for delayed allocation. In this case, we will exclude that
3873 * cluster.
3874 * Ex:
3875 * |----====c========|========c========|
3876 * |++++++ allocated ++++++|
3877 * ==> 1 complete clusters in above example
3878 *
3879 * Ex:
3880 * |================c================|
3881 * |++++++ allocated ++++++|
3882 * ==> 0 complete clusters in above example
3883 *
3884 * The ext4_da_update_reserve_space will be called only if we
3885 * determine here that there were some "entire" clusters that span
3886 * this 'allocated' range.
3887 * In the non-bigalloc case, this function will just end up returning num_blks
3888 * without ever calling ext4_find_delalloc_range.
3889 */
3890static unsigned int
3891get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3892 unsigned int num_blks)
3893{
3894 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3895 ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
3896 ext4_lblk_t lblk_from, lblk_to, c_offset;
3897 unsigned int allocated_clusters = 0;
3898
3899 alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
3900 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
3901
3902 /* max possible clusters for this allocation */
3903 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
3904
3905 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
3906
3907 /* Check towards left side */
3908 c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
3909 if (c_offset) {
3910 lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
3911 lblk_to = lblk_from + c_offset - 1;
3912
3913 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3914 allocated_clusters--;
3915 }
3916
3917 /* Now check towards right. */
3918 c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
3919 if (allocated_clusters && c_offset) {
3920 lblk_from = lblk_start + num_blks;
3921 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3922
3923 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3924 allocated_clusters--;
3925 }
3926
3927 return allocated_clusters;
3928}
3929
3930static int 3884static int
3931convert_initialized_extent(handle_t *handle, struct inode *inode, 3885convert_initialized_extent(handle_t *handle, struct inode *inode,
3932 struct ext4_map_blocks *map, 3886 struct ext4_map_blocks *map,
@@ -4108,23 +4062,6 @@ out:
4108 } 4062 }
4109 map->m_len = allocated; 4063 map->m_len = allocated;
4110 4064
4111 /*
4112 * If we have done fallocate with the offset that is already
4113 * delayed allocated, we would have block reservation
4114 * and quota reservation done in the delayed write path.
4115 * But fallocate would have already updated quota and block
4116 * count for this offset. So cancel these reservation
4117 */
4118 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4119 unsigned int reserved_clusters;
4120 reserved_clusters = get_reserved_cluster_alloc(inode,
4121 map->m_lblk, map->m_len);
4122 if (reserved_clusters)
4123 ext4_da_update_reserve_space(inode,
4124 reserved_clusters,
4125 0);
4126 }
4127
4128map_out: 4065map_out:
4129 map->m_flags |= EXT4_MAP_MAPPED; 4066 map->m_flags |= EXT4_MAP_MAPPED;
4130 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { 4067 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4513,77 +4450,39 @@ got_allocated_blocks:
4513 map->m_flags |= EXT4_MAP_NEW; 4450 map->m_flags |= EXT4_MAP_NEW;
4514 4451
4515 /* 4452 /*
4516 * Update reserved blocks/metadata blocks after successful 4453 * Reduce the reserved cluster count to reflect successful deferred
4517 * block allocation which had been deferred till now. 4454 * allocation of delayed allocated clusters or direct allocation of
4455 * clusters discovered to be delayed allocated. Once allocated, a
4456 * cluster is not included in the reserved count.
4518 */ 4457 */
4519 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 4458 if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
4520 unsigned int reserved_clusters; 4459 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4521 /*
4522 * Check how many clusters we had reserved this allocated range
4523 */
4524 reserved_clusters = get_reserved_cluster_alloc(inode,
4525 map->m_lblk, allocated);
4526 if (!map_from_cluster) {
4527 BUG_ON(allocated_clusters < reserved_clusters);
4528 if (reserved_clusters < allocated_clusters) {
4529 struct ext4_inode_info *ei = EXT4_I(inode);
4530 int reservation = allocated_clusters -
4531 reserved_clusters;
4532 /*
4533 * It seems we claimed few clusters outside of
4534 * the range of this allocation. We should give
4535 * it back to the reservation pool. This can
4536 * happen in the following case:
4537 *
4538 * * Suppose s_cluster_ratio is 4 (i.e., each
4539 * cluster has 4 blocks. Thus, the clusters
4540 * are [0-3],[4-7],[8-11]...
4541 * * First comes delayed allocation write for
4542 * logical blocks 10 & 11. Since there were no
4543 * previous delayed allocated blocks in the
4544 * range [8-11], we would reserve 1 cluster
4545 * for this write.
4546 * * Next comes write for logical blocks 3 to 8.
4547 * In this case, we will reserve 2 clusters
4548 * (for [0-3] and [4-7]; and not for [8-11] as
4549 * that range has a delayed allocated blocks.
4550 * Thus total reserved clusters now becomes 3.
4551 * * Now, during the delayed allocation writeout
4552 * time, we will first write blocks [3-8] and
4553 * allocate 3 clusters for writing these
4554 * blocks. Also, we would claim all these
4555 * three clusters above.
4556 * * Now when we come here to writeout the
4557 * blocks [10-11], we would expect to claim
4558 * the reservation of 1 cluster we had made
4559 * (and we would claim it since there are no
4560 * more delayed allocated blocks in the range
4561 * [8-11]. But our reserved cluster count had
4562 * already gone to 0.
4563 *
4564 * Thus, at the step 4 above when we determine
4565 * that there are still some unwritten delayed
4566 * allocated blocks outside of our current
4567 * block range, we should increment the
4568 * reserved clusters count so that when the
4569 * remaining blocks finally gets written, we
4570 * could claim them.
4571 */
4572 dquot_reserve_block(inode,
4573 EXT4_C2B(sbi, reservation));
4574 spin_lock(&ei->i_block_reservation_lock);
4575 ei->i_reserved_data_blocks += reservation;
4576 spin_unlock(&ei->i_block_reservation_lock);
4577 }
4578 /* 4460 /*
4579 * We will claim quota for all newly allocated blocks. 4461 * When allocating delayed allocated clusters, simply
4580 * We're updating the reserved space *after* the 4462 * reduce the reserved cluster count and claim quota
4581 * correction above so we do not accidentally free
4582 * all the metadata reservation because we might
4583 * actually need it later on.
4584 */ 4463 */
4585 ext4_da_update_reserve_space(inode, allocated_clusters, 4464 ext4_da_update_reserve_space(inode, allocated_clusters,
4586 1); 4465 1);
4466 } else {
4467 ext4_lblk_t lblk, len;
4468 unsigned int n;
4469
4470 /*
4471 * When allocating non-delayed allocated clusters
4472 * (from fallocate, filemap, DIO, or clusters
4473 * allocated when delalloc has been disabled by
4474 * ext4_nonda_switch), reduce the reserved cluster
4475 * count by the number of allocated clusters that
4476 * have previously been delayed allocated. Quota
4477 * has been claimed by ext4_mb_new_blocks() above,
4478 * so release the quota reservations made for any
4479 * previously delayed allocated clusters.
4480 */
4481 lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4482 len = allocated_clusters << sbi->s_cluster_bits;
4483 n = ext4_es_delayed_clu(inode, lblk, len);
4484 if (n > 0)
4485 ext4_da_update_reserve_space(inode, (int) n, 0);
4587 } 4486 }
4588 } 4487 }
4589 4488
@@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
5075 ext4_lblk_t block, next_del; 4974 ext4_lblk_t block, next_del;
5076 4975
5077 if (newes->es_pblk == 0) { 4976 if (newes->es_pblk == 0) {
5078 ext4_es_find_delayed_extent_range(inode, newes->es_lblk, 4977 ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
5079 newes->es_lblk + newes->es_len - 1, &es); 4978 newes->es_lblk,
4979 newes->es_lblk + newes->es_len - 1,
4980 &es);
5080 4981
5081 /* 4982 /*
5082 * No extent in extent-tree contains block @newes->es_pblk, 4983 * No extent in extent-tree contains block @newes->es_pblk,
@@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode,
5097 } 4998 }
5098 4999
5099 block = newes->es_lblk + newes->es_len; 5000 block = newes->es_lblk + newes->es_len;
5100 ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); 5001 ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
5002 EXT_MAX_BLOCKS, &es);
5101 if (es.es_len == 0) 5003 if (es.es_len == 0)
5102 next_del = EXT_MAX_BLOCKS; 5004 next_del = EXT_MAX_BLOCKS;
5103 else 5005 else
@@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
5958 } 5860 }
5959 return replaced_count; 5861 return replaced_count;
5960} 5862}
5863
5864/*
5865 * ext4_clu_mapped - determine whether any block in a logical cluster has
5866 * been mapped to a physical cluster
5867 *
5868 * @inode - file containing the logical cluster
5869 * @lclu - logical cluster of interest
5870 *
5871 * Returns 1 if any block in the logical cluster is mapped, signifying
5872 * that a physical cluster has been allocated for it. Otherwise,
5873 * returns 0. Can also return negative error codes. Derived from
5874 * ext4_ext_map_blocks().
5875 */
5876int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5877{
5878 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5879 struct ext4_ext_path *path;
5880 int depth, mapped = 0, err = 0;
5881 struct ext4_extent *extent;
5882 ext4_lblk_t first_lblk, first_lclu, last_lclu;
5883
5884 /* search for the extent closest to the first block in the cluster */
5885 path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5886 if (IS_ERR(path)) {
5887 err = PTR_ERR(path);
5888 path = NULL;
5889 goto out;
5890 }
5891
5892 depth = ext_depth(inode);
5893
5894 /*
5895 * A consistent leaf must not be empty. This situation is possible,
5896 * though, _during_ tree modification, and it's why an assert can't
5897 * be put in ext4_find_extent().
5898 */
5899 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5900 EXT4_ERROR_INODE(inode,
5901 "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5902 (unsigned long) EXT4_C2B(sbi, lclu),
5903 depth, path[depth].p_block);
5904 err = -EFSCORRUPTED;
5905 goto out;
5906 }
5907
5908 extent = path[depth].p_ext;
5909
5910 /* can't be mapped if the extent tree is empty */
5911 if (extent == NULL)
5912 goto out;
5913
5914 first_lblk = le32_to_cpu(extent->ee_block);
5915 first_lclu = EXT4_B2C(sbi, first_lblk);
5916
5917 /*
5918 * Three possible outcomes at this point - found extent spanning
5919 * the target cluster, to the left of the target cluster, or to the
5920 * right of the target cluster. The first two cases are handled here.
5921 * The last case indicates the target cluster is not mapped.
5922 */
5923 if (lclu >= first_lclu) {
5924 last_lclu = EXT4_B2C(sbi, first_lblk +
5925 ext4_ext_get_actual_len(extent) - 1);
5926 if (lclu <= last_lclu) {
5927 mapped = 1;
5928 } else {
5929 first_lblk = ext4_ext_next_allocated_block(path);
5930 first_lclu = EXT4_B2C(sbi, first_lblk);
5931 if (lclu == first_lclu)
5932 mapped = 1;
5933 }
5934 }
5935
5936out:
5937 ext4_ext_drop_refs(path);
5938 kfree(path);
5939
5940 return err ? err : mapped;
5941}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c4e6fb15101b..2b439afafe13 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -142,6 +142,7 @@
142 */ 142 */
143 143
144static struct kmem_cache *ext4_es_cachep; 144static struct kmem_cache *ext4_es_cachep;
145static struct kmem_cache *ext4_pending_cachep;
145 146
146static int __es_insert_extent(struct inode *inode, struct extent_status *newes); 147static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
147static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
@@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
149static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); 150static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
150static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 151static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
151 struct ext4_inode_info *locked_ei); 152 struct ext4_inode_info *locked_ei);
153static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
154 ext4_lblk_t len);
152 155
153int __init ext4_init_es(void) 156int __init ext4_init_es(void)
154{ 157{
@@ -233,30 +236,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
233} 236}
234 237
235/* 238/*
236 * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering 239 * ext4_es_find_extent_range - find extent with specified status within block
237 * @es->lblk if it exists, otherwise, the next extent after @es->lblk. 240 * range or next extent following block range in
241 * extents status tree
238 * 242 *
239 * @inode: the inode which owns delayed extents 243 * @inode - file containing the range
240 * @lblk: the offset where we start to search 244 * @matching_fn - pointer to function that matches extents with desired status
241 * @end: the offset where we stop to search 245 * @lblk - logical block defining start of range
242 * @es: delayed extent that we found 246 * @end - logical block defining end of range
247 * @es - extent found, if any
248 *
249 * Find the first extent within the block range specified by @lblk and @end
250 * in the extents status tree that satisfies @matching_fn. If a match
251 * is found, it's returned in @es. If not, and a matching extent is found
252 * beyond the block range, it's returned in @es. If no match is found, an
253 * extent is returned in @es whose es_lblk, es_len, and es_pblk components
254 * are 0.
243 */ 255 */
244void ext4_es_find_delayed_extent_range(struct inode *inode, 256static void __es_find_extent_range(struct inode *inode,
245 ext4_lblk_t lblk, ext4_lblk_t end, 257 int (*matching_fn)(struct extent_status *es),
246 struct extent_status *es) 258 ext4_lblk_t lblk, ext4_lblk_t end,
259 struct extent_status *es)
247{ 260{
248 struct ext4_es_tree *tree = NULL; 261 struct ext4_es_tree *tree = NULL;
249 struct extent_status *es1 = NULL; 262 struct extent_status *es1 = NULL;
250 struct rb_node *node; 263 struct rb_node *node;
251 264
252 BUG_ON(es == NULL); 265 WARN_ON(es == NULL);
253 BUG_ON(end < lblk); 266 WARN_ON(end < lblk);
254 trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
255 267
256 read_lock(&EXT4_I(inode)->i_es_lock);
257 tree = &EXT4_I(inode)->i_es_tree; 268 tree = &EXT4_I(inode)->i_es_tree;
258 269
259 /* find extent in cache firstly */ 270 /* see if the extent has been cached */
260 es->es_lblk = es->es_len = es->es_pblk = 0; 271 es->es_lblk = es->es_len = es->es_pblk = 0;
261 if (tree->cache_es) { 272 if (tree->cache_es) {
262 es1 = tree->cache_es; 273 es1 = tree->cache_es;
@@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
271 es1 = __es_tree_search(&tree->root, lblk); 282 es1 = __es_tree_search(&tree->root, lblk);
272 283
273out: 284out:
274 if (es1 && !ext4_es_is_delayed(es1)) { 285 if (es1 && !matching_fn(es1)) {
275 while ((node = rb_next(&es1->rb_node)) != NULL) { 286 while ((node = rb_next(&es1->rb_node)) != NULL) {
276 es1 = rb_entry(node, struct extent_status, rb_node); 287 es1 = rb_entry(node, struct extent_status, rb_node);
277 if (es1->es_lblk > end) { 288 if (es1->es_lblk > end) {
278 es1 = NULL; 289 es1 = NULL;
279 break; 290 break;
280 } 291 }
281 if (ext4_es_is_delayed(es1)) 292 if (matching_fn(es1))
282 break; 293 break;
283 } 294 }
284 } 295 }
285 296
286 if (es1 && ext4_es_is_delayed(es1)) { 297 if (es1 && matching_fn(es1)) {
287 tree->cache_es = es1; 298 tree->cache_es = es1;
288 es->es_lblk = es1->es_lblk; 299 es->es_lblk = es1->es_lblk;
289 es->es_len = es1->es_len; 300 es->es_len = es1->es_len;
290 es->es_pblk = es1->es_pblk; 301 es->es_pblk = es1->es_pblk;
291 } 302 }
292 303
304}
305
306/*
307 * Locking for __es_find_extent_range() for external use
308 */
309void ext4_es_find_extent_range(struct inode *inode,
310 int (*matching_fn)(struct extent_status *es),
311 ext4_lblk_t lblk, ext4_lblk_t end,
312 struct extent_status *es)
313{
314 trace_ext4_es_find_extent_range_enter(inode, lblk);
315
316 read_lock(&EXT4_I(inode)->i_es_lock);
317 __es_find_extent_range(inode, matching_fn, lblk, end, es);
318 read_unlock(&EXT4_I(inode)->i_es_lock);
319
320 trace_ext4_es_find_extent_range_exit(inode, es);
321}
322
323/*
324 * __es_scan_range - search block range for block with specified status
325 * in extents status tree
326 *
327 * @inode - file containing the range
328 * @matching_fn - pointer to function that matches extents with desired status
329 * @lblk - logical block defining start of range
330 * @end - logical block defining end of range
331 *
332 * Returns true if at least one block in the specified block range satisfies
333 * the criterion specified by @matching_fn, and false if not. If at least
334 * one extent has the specified status, then there is at least one block
335 * in the cluster with that status. Should only be called by code that has
336 * taken i_es_lock.
337 */
338static bool __es_scan_range(struct inode *inode,
339 int (*matching_fn)(struct extent_status *es),
340 ext4_lblk_t start, ext4_lblk_t end)
341{
342 struct extent_status es;
343
344 __es_find_extent_range(inode, matching_fn, start, end, &es);
345 if (es.es_len == 0)
346 return false; /* no matching extent in the tree */
347 else if (es.es_lblk <= start &&
348 start < es.es_lblk + es.es_len)
349 return true;
350 else if (start <= es.es_lblk && es.es_lblk <= end)
351 return true;
352 else
353 return false;
354}
355/*
356 * Locking for __es_scan_range() for external use
357 */
358bool ext4_es_scan_range(struct inode *inode,
359 int (*matching_fn)(struct extent_status *es),
360 ext4_lblk_t lblk, ext4_lblk_t end)
361{
362 bool ret;
363
364 read_lock(&EXT4_I(inode)->i_es_lock);
365 ret = __es_scan_range(inode, matching_fn, lblk, end);
293 read_unlock(&EXT4_I(inode)->i_es_lock); 366 read_unlock(&EXT4_I(inode)->i_es_lock);
294 367
295 trace_ext4_es_find_delayed_extent_range_exit(inode, es); 368 return ret;
369}
370
371/*
372 * __es_scan_clu - search cluster for block with specified status in
373 * extents status tree
374 *
375 * @inode - file containing the cluster
376 * @matching_fn - pointer to function that matches extents with desired status
377 * @lblk - logical block in cluster to be searched
378 *
379 * Returns true if at least one extent in the cluster containing @lblk
380 * satisfies the criterion specified by @matching_fn, and false if not. If at
381 * least one extent has the specified status, then there is at least one block
382 * in the cluster with that status. Should only be called by code that has
383 * taken i_es_lock.
384 */
385static bool __es_scan_clu(struct inode *inode,
386 int (*matching_fn)(struct extent_status *es),
387 ext4_lblk_t lblk)
388{
389 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
390 ext4_lblk_t lblk_start, lblk_end;
391
392 lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
393 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
394
395 return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
396}
397
398/*
399 * Locking for __es_scan_clu() for external use
400 */
401bool ext4_es_scan_clu(struct inode *inode,
402 int (*matching_fn)(struct extent_status *es),
403 ext4_lblk_t lblk)
404{
405 bool ret;
406
407 read_lock(&EXT4_I(inode)->i_es_lock);
408 ret = __es_scan_clu(inode, matching_fn, lblk);
409 read_unlock(&EXT4_I(inode)->i_es_lock);
410
411 return ret;
296} 412}
297 413
298static void ext4_es_list_add(struct inode *inode) 414static void ext4_es_list_add(struct inode *inode)
@@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
694 struct extent_status newes; 810 struct extent_status newes;
695 ext4_lblk_t end = lblk + len - 1; 811 ext4_lblk_t end = lblk + len - 1;
696 int err = 0; 812 int err = 0;
813 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
697 814
698 es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", 815 es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
699 lblk, len, pblk, status, inode->i_ino); 816 lblk, len, pblk, status, inode->i_ino);
@@ -730,6 +847,11 @@ retry:
730 if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) 847 if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
731 err = 0; 848 err = 0;
732 849
850 if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
851 (status & EXTENT_STATUS_WRITTEN ||
852 status & EXTENT_STATUS_UNWRITTEN))
853 __revise_pending(inode, lblk, len);
854
733error: 855error:
734 write_unlock(&EXT4_I(inode)->i_es_lock); 856 write_unlock(&EXT4_I(inode)->i_es_lock);
735 857
@@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1252 ei->i_es_tree.cache_es = NULL; 1374 ei->i_es_tree.cache_es = NULL;
1253 return nr_shrunk; 1375 return nr_shrunk;
1254} 1376}
1377
#ifdef ES_DEBUG__
/*
 * Debug aid: print the logical cluster number of every pending reservation
 * attached to @inode, walking the tree from rb_first() via rb_next().
 * Compiles away to nothing unless ES_DEBUG__ is defined.
 */
static void ext4_print_pending_tree(struct inode *inode)
{
	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
	struct pending_reservation *pr;
	struct rb_node *node;

	printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
	for (node = rb_first(&tree->root); node; node = rb_next(node)) {
		pr = rb_entry(node, struct pending_reservation, rb_node);
		printk(KERN_DEBUG " %u", pr->lclu);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_print_pending_tree(inode)
#endif
1398
1399int __init ext4_init_pending(void)
1400{
1401 ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
1402 sizeof(struct pending_reservation),
1403 0, (SLAB_RECLAIM_ACCOUNT), NULL);
1404 if (ext4_pending_cachep == NULL)
1405 return -ENOMEM;
1406 return 0;
1407}
1408
1409void ext4_exit_pending(void)
1410{
1411 kmem_cache_destroy(ext4_pending_cachep);
1412}
1413
1414void ext4_init_pending_tree(struct ext4_pending_tree *tree)
1415{
1416 tree->root = RB_ROOT;
1417}
1418
1419/*
1420 * __get_pending - retrieve a pointer to a pending reservation
1421 *
1422 * @inode - file containing the pending cluster reservation
1423 * @lclu - logical cluster of interest
1424 *
1425 * Returns a pointer to a pending reservation if it's a member of
1426 * the set, and NULL if not. Must be called holding i_es_lock.
1427 */
1428static struct pending_reservation *__get_pending(struct inode *inode,
1429 ext4_lblk_t lclu)
1430{
1431 struct ext4_pending_tree *tree;
1432 struct rb_node *node;
1433 struct pending_reservation *pr = NULL;
1434
1435 tree = &EXT4_I(inode)->i_pending_tree;
1436 node = (&tree->root)->rb_node;
1437
1438 while (node) {
1439 pr = rb_entry(node, struct pending_reservation, rb_node);
1440 if (lclu < pr->lclu)
1441 node = node->rb_left;
1442 else if (lclu > pr->lclu)
1443 node = node->rb_right;
1444 else if (lclu == pr->lclu)
1445 return pr;
1446 }
1447 return NULL;
1448}
1449
1450/*
1451 * __insert_pending - adds a pending cluster reservation to the set of
1452 * pending reservations
1453 *
1454 * @inode - file containing the cluster
1455 * @lblk - logical block in the cluster to be added
1456 *
1457 * Returns 0 on successful insertion and -ENOMEM on failure. If the
1458 * pending reservation is already in the set, returns successfully.
1459 */
1460static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
1461{
1462 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1463 struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1464 struct rb_node **p = &tree->root.rb_node;
1465 struct rb_node *parent = NULL;
1466 struct pending_reservation *pr;
1467 ext4_lblk_t lclu;
1468 int ret = 0;
1469
1470 lclu = EXT4_B2C(sbi, lblk);
1471 /* search to find parent for insertion */
1472 while (*p) {
1473 parent = *p;
1474 pr = rb_entry(parent, struct pending_reservation, rb_node);
1475
1476 if (lclu < pr->lclu) {
1477 p = &(*p)->rb_left;
1478 } else if (lclu > pr->lclu) {
1479 p = &(*p)->rb_right;
1480 } else {
1481 /* pending reservation already inserted */
1482 goto out;
1483 }
1484 }
1485
1486 pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
1487 if (pr == NULL) {
1488 ret = -ENOMEM;
1489 goto out;
1490 }
1491 pr->lclu = lclu;
1492
1493 rb_link_node(&pr->rb_node, parent, p);
1494 rb_insert_color(&pr->rb_node, &tree->root);
1495
1496out:
1497 return ret;
1498}
1499
1500/*
1501 * __remove_pending - removes a pending cluster reservation from the set
1502 * of pending reservations
1503 *
1504 * @inode - file containing the cluster
1505 * @lblk - logical block in the pending cluster reservation to be removed
1506 *
1507 * Returns successfully if pending reservation is not a member of the set.
1508 */
1509static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
1510{
1511 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1512 struct pending_reservation *pr;
1513 struct ext4_pending_tree *tree;
1514
1515 pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
1516 if (pr != NULL) {
1517 tree = &EXT4_I(inode)->i_pending_tree;
1518 rb_erase(&pr->rb_node, &tree->root);
1519 kmem_cache_free(ext4_pending_cachep, pr);
1520 }
1521}
1522
1523/*
1524 * ext4_remove_pending - removes a pending cluster reservation from the set
1525 * of pending reservations
1526 *
1527 * @inode - file containing the cluster
1528 * @lblk - logical block in the pending cluster reservation to be removed
1529 *
1530 * Locking for external use of __remove_pending.
1531 */
1532void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
1533{
1534 struct ext4_inode_info *ei = EXT4_I(inode);
1535
1536 write_lock(&ei->i_es_lock);
1537 __remove_pending(inode, lblk);
1538 write_unlock(&ei->i_es_lock);
1539}
1540
1541/*
1542 * ext4_is_pending - determine whether a cluster has a pending reservation
1543 * on it
1544 *
1545 * @inode - file containing the cluster
1546 * @lblk - logical block in the cluster
1547 *
1548 * Returns true if there's a pending reservation for the cluster in the
1549 * set of pending reservations, and false if not.
1550 */
1551bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
1552{
1553 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1554 struct ext4_inode_info *ei = EXT4_I(inode);
1555 bool ret;
1556
1557 read_lock(&ei->i_es_lock);
1558 ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
1559 read_unlock(&ei->i_es_lock);
1560
1561 return ret;
1562}
1563
1564/*
1565 * ext4_es_insert_delayed_block - adds a delayed block to the extents status
1566 * tree, adding a pending reservation where
1567 * needed
1568 *
1569 * @inode - file containing the newly added block
1570 * @lblk - logical block to be added
1571 * @allocated - indicates whether a physical cluster has been allocated for
1572 * the logical cluster that contains the block
1573 *
1574 * Returns 0 on success, negative error code on failure.
1575 */
1576int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
1577 bool allocated)
1578{
1579 struct extent_status newes;
1580 int err = 0;
1581
1582 es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
1583 lblk, inode->i_ino);
1584
1585 newes.es_lblk = lblk;
1586 newes.es_len = 1;
1587 ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
1588 trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
1589
1590 ext4_es_insert_extent_check(inode, &newes);
1591
1592 write_lock(&EXT4_I(inode)->i_es_lock);
1593
1594 err = __es_remove_extent(inode, lblk, lblk);
1595 if (err != 0)
1596 goto error;
1597retry:
1598 err = __es_insert_extent(inode, &newes);
1599 if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
1600 128, EXT4_I(inode)))
1601 goto retry;
1602 if (err != 0)
1603 goto error;
1604
1605 if (allocated)
1606 __insert_pending(inode, lblk);
1607
1608error:
1609 write_unlock(&EXT4_I(inode)->i_es_lock);
1610
1611 ext4_es_print_tree(inode);
1612 ext4_print_pending_tree(inode);
1613
1614 return err;
1615}
1616
1617/*
1618 * __es_delayed_clu - count number of clusters containing blocks that
1619 * are delayed only
1620 *
1621 * @inode - file containing block range
1622 * @start - logical block defining start of range
1623 * @end - logical block defining end of range
1624 *
1625 * Returns the number of clusters containing only delayed (not delayed
1626 * and unwritten) blocks in the range specified by @start and @end. Any
1627 * cluster or part of a cluster within the range and containing a delayed
1628 * and not unwritten block within the range is counted as a whole cluster.
1629 */
1630static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
1631 ext4_lblk_t end)
1632{
1633 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
1634 struct extent_status *es;
1635 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1636 struct rb_node *node;
1637 ext4_lblk_t first_lclu, last_lclu;
1638 unsigned long long last_counted_lclu;
1639 unsigned int n = 0;
1640
1641 /* guaranteed to be unequal to any ext4_lblk_t value */
1642 last_counted_lclu = ~0ULL;
1643
1644 es = __es_tree_search(&tree->root, start);
1645
1646 while (es && (es->es_lblk <= end)) {
1647 if (ext4_es_is_delonly(es)) {
1648 if (es->es_lblk <= start)
1649 first_lclu = EXT4_B2C(sbi, start);
1650 else
1651 first_lclu = EXT4_B2C(sbi, es->es_lblk);
1652
1653 if (ext4_es_end(es) >= end)
1654 last_lclu = EXT4_B2C(sbi, end);
1655 else
1656 last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
1657
1658 if (first_lclu == last_counted_lclu)
1659 n += last_lclu - first_lclu;
1660 else
1661 n += last_lclu - first_lclu + 1;
1662 last_counted_lclu = last_lclu;
1663 }
1664 node = rb_next(&es->rb_node);
1665 if (!node)
1666 break;
1667 es = rb_entry(node, struct extent_status, rb_node);
1668 }
1669
1670 return n;
1671}
1672
1673/*
1674 * ext4_es_delayed_clu - count number of clusters containing blocks that
1675 * are both delayed and unwritten
1676 *
1677 * @inode - file containing block range
1678 * @lblk - logical block defining start of range
1679 * @len - number of blocks in range
1680 *
1681 * Locking for external use of __es_delayed_clu().
1682 */
1683unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
1684 ext4_lblk_t len)
1685{
1686 struct ext4_inode_info *ei = EXT4_I(inode);
1687 ext4_lblk_t end;
1688 unsigned int n;
1689
1690 if (len == 0)
1691 return 0;
1692
1693 end = lblk + len - 1;
1694 WARN_ON(end < lblk);
1695
1696 read_lock(&ei->i_es_lock);
1697
1698 n = __es_delayed_clu(inode, lblk, end);
1699
1700 read_unlock(&ei->i_es_lock);
1701
1702 return n;
1703}
1704
1705/*
1706 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
1707 * reservations for a specified block range depending
1708 * upon the presence or absence of delayed blocks
1709 * outside the range within clusters at the ends of the
1710 * range
1711 *
1712 * @inode - file containing the range
1713 * @lblk - logical block defining the start of range
1714 * @len - length of range in blocks
1715 *
1716 * Used after a newly allocated extent is added to the extents status tree.
1717 * Requires that the extents in the range have either written or unwritten
1718 * status. Must be called while holding i_es_lock.
1719 */
1720static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
1721 ext4_lblk_t len)
1722{
1723 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1724 ext4_lblk_t end = lblk + len - 1;
1725 ext4_lblk_t first, last;
1726 bool f_del = false, l_del = false;
1727
1728 if (len == 0)
1729 return;
1730
1731 /*
1732 * Two cases - block range within single cluster and block range
1733 * spanning two or more clusters. Note that a cluster belonging
1734 * to a range starting and/or ending on a cluster boundary is treated
1735 * as if it does not contain a delayed extent. The new range may
1736 * have allocated space for previously delayed blocks out to the
1737 * cluster boundary, requiring that any pre-existing pending
1738 * reservation be canceled. Because this code only looks at blocks
1739 * outside the range, it should revise pending reservations
1740 * correctly even if the extent represented by the range can't be
1741 * inserted in the extents status tree due to ENOSPC.
1742 */
1743
1744 if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
1745 first = EXT4_LBLK_CMASK(sbi, lblk);
1746 if (first != lblk)
1747 f_del = __es_scan_range(inode, &ext4_es_is_delonly,
1748 first, lblk - 1);
1749 if (f_del) {
1750 __insert_pending(inode, first);
1751 } else {
1752 last = EXT4_LBLK_CMASK(sbi, end) +
1753 sbi->s_cluster_ratio - 1;
1754 if (last != end)
1755 l_del = __es_scan_range(inode,
1756 &ext4_es_is_delonly,
1757 end + 1, last);
1758 if (l_del)
1759 __insert_pending(inode, last);
1760 else
1761 __remove_pending(inode, last);
1762 }
1763 } else {
1764 first = EXT4_LBLK_CMASK(sbi, lblk);
1765 if (first != lblk)
1766 f_del = __es_scan_range(inode, &ext4_es_is_delonly,
1767 first, lblk - 1);
1768 if (f_del)
1769 __insert_pending(inode, first);
1770 else
1771 __remove_pending(inode, first);
1772
1773 last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
1774 if (last != end)
1775 l_del = __es_scan_range(inode, &ext4_es_is_delonly,
1776 end + 1, last);
1777 if (l_del)
1778 __insert_pending(inode, last);
1779 else
1780 __remove_pending(inode, last);
1781 }
1782}
1783
1784/*
1785 * ext4_es_remove_blks - remove block range from extents status tree and
1786 * reduce reservation count or cancel pending
1787 * reservation as needed
1788 *
1789 * @inode - file containing range
1790 * @lblk - first block in range
1791 * @len - number of blocks to remove
1792 *
1793 */
1794void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
1795 ext4_lblk_t len)
1796{
1797 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1798 unsigned int clu_size, reserved = 0;
1799 ext4_lblk_t last_lclu, first, length, remainder, last;
1800 bool delonly;
1801 int err = 0;
1802 struct pending_reservation *pr;
1803 struct ext4_pending_tree *tree;
1804
1805 /*
1806 * Process cluster by cluster for bigalloc - there may be up to
1807 * two clusters in a 4k page with a 1k block size and two blocks
1808 * per cluster. Also necessary for systems with larger page sizes
1809 * and potentially larger block sizes.
1810 */
1811 clu_size = sbi->s_cluster_ratio;
1812 last_lclu = EXT4_B2C(sbi, lblk + len - 1);
1813
1814 write_lock(&EXT4_I(inode)->i_es_lock);
1815
1816 for (first = lblk, remainder = len;
1817 remainder > 0;
1818 first += length, remainder -= length) {
1819
1820 if (EXT4_B2C(sbi, first) == last_lclu)
1821 length = remainder;
1822 else
1823 length = clu_size - EXT4_LBLK_COFF(sbi, first);
1824
1825 /*
1826 * The BH_Delay flag, which triggers calls to this function,
1827 * and the contents of the extents status tree can be
1828 * inconsistent due to writepages activity. So, note whether
1829 * the blocks to be removed actually belong to an extent with
1830 * delayed only status.
1831 */
1832 delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
1833
1834 /*
1835 * because of the writepages effect, written and unwritten
1836 * blocks could be removed here
1837 */
1838 last = first + length - 1;
1839 err = __es_remove_extent(inode, first, last);
1840 if (err)
1841 ext4_warning(inode->i_sb,
1842 "%s: couldn't remove page (err = %d)",
1843 __func__, err);
1844
1845 /* non-bigalloc case: simply count the cluster for release */
1846 if (sbi->s_cluster_ratio == 1 && delonly) {
1847 reserved++;
1848 continue;
1849 }
1850
1851 /*
1852 * bigalloc case: if all delayed allocated only blocks have
1853 * just been removed from a cluster, either cancel a pending
1854 * reservation if it exists or count a cluster for release
1855 */
1856 if (delonly &&
1857 !__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
1858 pr = __get_pending(inode, EXT4_B2C(sbi, first));
1859 if (pr != NULL) {
1860 tree = &EXT4_I(inode)->i_pending_tree;
1861 rb_erase(&pr->rb_node, &tree->root);
1862 kmem_cache_free(ext4_pending_cachep, pr);
1863 } else {
1864 reserved++;
1865 }
1866 }
1867 }
1868
1869 write_unlock(&EXT4_I(inode)->i_es_lock);
1870
1871 ext4_da_release_space(inode, reserved);
1872}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8efdeb903d6b..131a8b7df265 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -78,6 +78,51 @@ struct ext4_es_stats {
78 struct percpu_counter es_stats_shk_cnt; 78 struct percpu_counter es_stats_shk_cnt;
79}; 79};
80 80
81/*
82 * Pending cluster reservations for bigalloc file systems
83 *
84 * A cluster with a pending reservation is a logical cluster shared by at
85 * least one extent in the extents status tree with delayed and unwritten
86 * status and at least one other written or unwritten extent. The
87 * reservation is said to be pending because a cluster reservation would
88 * have to be taken in the event all blocks in the cluster shared with
89 * written or unwritten extents were deleted while the delayed and
90 * unwritten blocks remained.
91 *
92 * The set of pending cluster reservations is an auxiliary data structure
93 * used with the extents status tree to implement reserved cluster/block
94 * accounting for bigalloc file systems. The set is kept in memory and
95 * records all pending cluster reservations.
96 *
97 * Its primary function is to avoid the need to read extents from the
98 * disk when invalidating pages as a result of a truncate, punch hole, or
99 * collapse range operation. Page invalidation requires a decrease in the
100 * reserved cluster count if it results in the removal of all delayed
101 * and unwritten extents (blocks) from a cluster that is not shared with a
102 * written or unwritten extent, and no decrease otherwise. Determining
103 * whether the cluster is shared can be done by searching for a pending
104 * reservation on it.
105 *
106 * Secondarily, it provides a potentially faster method for determining
107 * whether the reserved cluster count should be increased when a physical
108 * cluster is deallocated as a result of a truncate, punch hole, or
109 * collapse range operation. The necessary information is also present
110 * in the extents status tree, but might be more rapidly accessed in
111 * the pending reservation set in many cases due to smaller size.
112 *
113 * The pending cluster reservation set is implemented as a red-black tree
114 * with the goal of minimizing per page search time overhead.
115 */
116
117struct pending_reservation {
118 struct rb_node rb_node;
119 ext4_lblk_t lclu;
120};
121
122struct ext4_pending_tree {
123 struct rb_root root;
124};
125
81extern int __init ext4_init_es(void); 126extern int __init ext4_init_es(void);
82extern void ext4_exit_es(void); 127extern void ext4_exit_es(void);
83extern void ext4_es_init_tree(struct ext4_es_tree *tree); 128extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
90 unsigned int status); 135 unsigned int status);
91extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 136extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
92 ext4_lblk_t len); 137 ext4_lblk_t len);
93extern void ext4_es_find_delayed_extent_range(struct inode *inode, 138extern void ext4_es_find_extent_range(struct inode *inode,
94 ext4_lblk_t lblk, ext4_lblk_t end, 139 int (*match_fn)(struct extent_status *es),
95 struct extent_status *es); 140 ext4_lblk_t lblk, ext4_lblk_t end,
141 struct extent_status *es);
96extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 142extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
97 struct extent_status *es); 143 struct extent_status *es);
144extern bool ext4_es_scan_range(struct inode *inode,
145 int (*matching_fn)(struct extent_status *es),
146 ext4_lblk_t lblk, ext4_lblk_t end);
147extern bool ext4_es_scan_clu(struct inode *inode,
148 int (*matching_fn)(struct extent_status *es),
149 ext4_lblk_t lblk);
98 150
99static inline unsigned int ext4_es_status(struct extent_status *es) 151static inline unsigned int ext4_es_status(struct extent_status *es)
100{ 152{
@@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es)
126 return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; 178 return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
127} 179}
128 180
/* Returns 1 if @es has written or unwritten status, 0 otherwise. */
static inline int ext4_es_is_mapped(struct extent_status *es)
{
	if (ext4_es_is_written(es))
		return 1;

	return ext4_es_is_unwritten(es) ? 1 : 0;
}
185
/* Returns 1 if @es is delayed and not also unwritten, 0 otherwise. */
static inline int ext4_es_is_delonly(struct extent_status *es)
{
	if (!ext4_es_is_delayed(es))
		return 0;

	return !ext4_es_is_unwritten(es);
}
190
129static inline void ext4_es_set_referenced(struct extent_status *es) 191static inline void ext4_es_set_referenced(struct extent_status *es)
130{ 192{
131 es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; 193 es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
175 237
176extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); 238extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
177 239
240extern int __init ext4_init_pending(void);
241extern void ext4_exit_pending(void);
242extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
243extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
244extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
245extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
246 bool allocated);
247extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
248 ext4_lblk_t len);
249extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
250 ext4_lblk_t len);
251
178#endif /* _EXT4_EXTENTS_STATUS_H */ 252#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 7b4736022761..9c4bac18cc6c 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
863 handle_t *handle; 863 handle_t *handle;
864 struct page *page; 864 struct page *page;
865 struct ext4_iloc iloc; 865 struct ext4_iloc iloc;
866 int retries; 866 int retries = 0;
867 867
868 ret = ext4_get_inode_loc(inode, &iloc); 868 ret = ext4_get_inode_loc(inode, &iloc);
869 if (ret) 869 if (ret)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d767e993591d..c3d9a42c561e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
577 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 577 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
578 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && 578 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
579 !(status & EXTENT_STATUS_WRITTEN) && 579 !(status & EXTENT_STATUS_WRITTEN) &&
580 ext4_find_delalloc_range(inode, map->m_lblk, 580 ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
581 map->m_lblk + map->m_len - 1)) 581 map->m_lblk + map->m_len - 1))
582 status |= EXTENT_STATUS_DELAYED; 582 status |= EXTENT_STATUS_DELAYED;
583 ret = ext4_es_insert_extent(inode, map->m_lblk, 583 ret = ext4_es_insert_extent(inode, map->m_lblk,
584 map->m_len, map->m_pblk, status); 584 map->m_len, map->m_pblk, status);
@@ -701,8 +701,8 @@ found:
701 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 701 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
702 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && 702 if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
703 !(status & EXTENT_STATUS_WRITTEN) && 703 !(status & EXTENT_STATUS_WRITTEN) &&
704 ext4_find_delalloc_range(inode, map->m_lblk, 704 ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
705 map->m_lblk + map->m_len - 1)) 705 map->m_lblk + map->m_len - 1))
706 status |= EXTENT_STATUS_DELAYED; 706 status |= EXTENT_STATUS_DELAYED;
707 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 707 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
708 map->m_pblk, status); 708 map->m_pblk, status);
@@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode)
1595 return 0; /* success */ 1595 return 0; /* success */
1596} 1596}
1597 1597
1598static void ext4_da_release_space(struct inode *inode, int to_free) 1598void ext4_da_release_space(struct inode *inode, int to_free)
1599{ 1599{
1600 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1600 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1601 struct ext4_inode_info *ei = EXT4_I(inode); 1601 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page,
1634 unsigned int offset, 1634 unsigned int offset,
1635 unsigned int length) 1635 unsigned int length)
1636{ 1636{
1637 int to_release = 0, contiguous_blks = 0; 1637 int contiguous_blks = 0;
1638 struct buffer_head *head, *bh; 1638 struct buffer_head *head, *bh;
1639 unsigned int curr_off = 0; 1639 unsigned int curr_off = 0;
1640 struct inode *inode = page->mapping->host; 1640 struct inode *inode = page->mapping->host;
1641 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1642 unsigned int stop = offset + length; 1641 unsigned int stop = offset + length;
1643 int num_clusters;
1644 ext4_fsblk_t lblk; 1642 ext4_fsblk_t lblk;
1645 1643
1646 BUG_ON(stop > PAGE_SIZE || stop < length); 1644 BUG_ON(stop > PAGE_SIZE || stop < length);
@@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1654 break; 1652 break;
1655 1653
1656 if ((offset <= curr_off) && (buffer_delay(bh))) { 1654 if ((offset <= curr_off) && (buffer_delay(bh))) {
1657 to_release++;
1658 contiguous_blks++; 1655 contiguous_blks++;
1659 clear_buffer_delay(bh); 1656 clear_buffer_delay(bh);
1660 } else if (contiguous_blks) { 1657 } else if (contiguous_blks) {
@@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page,
1662 (PAGE_SHIFT - inode->i_blkbits); 1659 (PAGE_SHIFT - inode->i_blkbits);
1663 lblk += (curr_off >> inode->i_blkbits) - 1660 lblk += (curr_off >> inode->i_blkbits) -
1664 contiguous_blks; 1661 contiguous_blks;
1665 ext4_es_remove_extent(inode, lblk, contiguous_blks); 1662 ext4_es_remove_blks(inode, lblk, contiguous_blks);
1666 contiguous_blks = 0; 1663 contiguous_blks = 0;
1667 } 1664 }
1668 curr_off = next_off; 1665 curr_off = next_off;
@@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page,
1671 if (contiguous_blks) { 1668 if (contiguous_blks) {
1672 lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); 1669 lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
1673 lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; 1670 lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
1674 ext4_es_remove_extent(inode, lblk, contiguous_blks); 1671 ext4_es_remove_blks(inode, lblk, contiguous_blks);
1675 } 1672 }
1676 1673
1677 /* If we have released all the blocks belonging to a cluster, then we
1678 * need to release the reserved space for that cluster. */
1679 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1680 while (num_clusters > 0) {
1681 lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
1682 ((num_clusters - 1) << sbi->s_cluster_bits);
1683 if (sbi->s_cluster_ratio == 1 ||
1684 !ext4_find_delalloc_cluster(inode, lblk))
1685 ext4_da_release_space(inode, 1);
1686
1687 num_clusters--;
1688 }
1689} 1674}
1690 1675
1691/* 1676/*
@@ -1781,6 +1766,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1781} 1766}
1782 1767
1783/* 1768/*
1769 * ext4_insert_delayed_block - adds a delayed block to the extents status
1770 * tree, incrementing the reserved cluster/block
1771 * count or making a pending reservation
1772 * where needed
1773 *
1774 * @inode - file containing the newly added block
1775 * @lblk - logical block to be added
1776 *
1777 * Returns 0 on success, negative error code on failure.
1778 */
1779static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
1780{
1781 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1782 int ret;
1783 bool allocated = false;
1784
1785 /*
1786 * If the cluster containing lblk is shared with a delayed,
1787 * written, or unwritten extent in a bigalloc file system, it's
1788 * already been accounted for and does not need to be reserved.
1789 * A pending reservation must be made for the cluster if it's
1790 * shared with a written or unwritten extent and doesn't already
1791 * have one. Written and unwritten extents can be purged from the
1792 * extents status tree if the system is under memory pressure, so
1793 * it's necessary to examine the extent tree if a search of the
1794 * extents status tree doesn't get a match.
1795 */
1796 if (sbi->s_cluster_ratio == 1) {
1797 ret = ext4_da_reserve_space(inode);
1798 if (ret != 0) /* ENOSPC */
1799 goto errout;
1800 } else { /* bigalloc */
1801 if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
1802 if (!ext4_es_scan_clu(inode,
1803 &ext4_es_is_mapped, lblk)) {
1804 ret = ext4_clu_mapped(inode,
1805 EXT4_B2C(sbi, lblk));
1806 if (ret < 0)
1807 goto errout;
1808 if (ret == 0) {
1809 ret = ext4_da_reserve_space(inode);
1810 if (ret != 0) /* ENOSPC */
1811 goto errout;
1812 } else {
1813 allocated = true;
1814 }
1815 } else {
1816 allocated = true;
1817 }
1818 }
1819 }
1820
1821 ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
1822
1823errout:
1824 return ret;
1825}
1826
1827/*
1784 * This function grabs code from the very beginning of 1828 * This function grabs code from the very beginning of
1785 * ext4_map_blocks, but assumes that the caller is from delayed write 1829 * ext4_map_blocks, but assumes that the caller is from delayed write
1786 * time. This function looks up the requested blocks and sets the 1830 * time. This function looks up the requested blocks and sets the
@@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1859add_delayed: 1903add_delayed:
1860 if (retval == 0) { 1904 if (retval == 0) {
1861 int ret; 1905 int ret;
1906
1862 /* 1907 /*
1863 * XXX: __block_prepare_write() unmaps passed block, 1908 * XXX: __block_prepare_write() unmaps passed block,
1864 * is it OK? 1909 * is it OK?
1865 */ 1910 */
1866 /*
1867 * If the block was allocated from previously allocated cluster,
1868 * then we don't need to reserve it again. However we still need
1869 * to reserve metadata for every block we're going to write.
1870 */
1871 if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
1872 !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
1873 ret = ext4_da_reserve_space(inode);
1874 if (ret) {
1875 /* not enough space to reserve */
1876 retval = ret;
1877 goto out_unlock;
1878 }
1879 }
1880 1911
1881 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1912 ret = ext4_insert_delayed_block(inode, map->m_lblk);
1882 ~0, EXTENT_STATUS_DELAYED); 1913 if (ret != 0) {
1883 if (ret) {
1884 retval = ret; 1914 retval = ret;
1885 goto out_unlock; 1915 goto out_unlock;
1886 } 1916 }
@@ -3450,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
3450 ext4_lblk_t end = map.m_lblk + map.m_len - 1; 3480 ext4_lblk_t end = map.m_lblk + map.m_len - 1;
3451 struct extent_status es; 3481 struct extent_status es;
3452 3482
3453 ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); 3483 ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
3484 map.m_lblk, end, &es);
3454 3485
3455 if (!es.es_len || es.es_lblk > end) { 3486 if (!es.es_len || es.es_lblk > end) {
3456 /* entire range is a hole */ 3487 /* entire range is a hole */
@@ -6153,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
6153 return !buffer_mapped(bh); 6184 return !buffer_mapped(bh);
6154} 6185}
6155 6186
6156int ext4_page_mkwrite(struct vm_fault *vmf) 6187vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
6157{ 6188{
6158 struct vm_area_struct *vma = vmf->vma; 6189 struct vm_area_struct *vma = vmf->vma;
6159 struct page *page = vmf->page; 6190 struct page *page = vmf->page;
6160 loff_t size; 6191 loff_t size;
6161 unsigned long len; 6192 unsigned long len;
6162 int ret; 6193 int err;
6194 vm_fault_t ret;
6163 struct file *file = vma->vm_file; 6195 struct file *file = vma->vm_file;
6164 struct inode *inode = file_inode(file); 6196 struct inode *inode = file_inode(file);
6165 struct address_space *mapping = inode->i_mapping; 6197 struct address_space *mapping = inode->i_mapping;
@@ -6172,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
6172 6204
6173 down_read(&EXT4_I(inode)->i_mmap_sem); 6205 down_read(&EXT4_I(inode)->i_mmap_sem);
6174 6206
6175 ret = ext4_convert_inline_data(inode); 6207 err = ext4_convert_inline_data(inode);
6176 if (ret) 6208 if (err)
6177 goto out_ret; 6209 goto out_ret;
6178 6210
6179 /* Delalloc case is easy... */ 6211 /* Delalloc case is easy... */
@@ -6181,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
6181 !ext4_should_journal_data(inode) && 6213 !ext4_should_journal_data(inode) &&
6182 !ext4_nonda_switch(inode->i_sb)) { 6214 !ext4_nonda_switch(inode->i_sb)) {
6183 do { 6215 do {
6184 ret = block_page_mkwrite(vma, vmf, 6216 err = block_page_mkwrite(vma, vmf,
6185 ext4_da_get_block_prep); 6217 ext4_da_get_block_prep);
6186 } while (ret == -ENOSPC && 6218 } while (err == -ENOSPC &&
6187 ext4_should_retry_alloc(inode->i_sb, &retries)); 6219 ext4_should_retry_alloc(inode->i_sb, &retries));
6188 goto out_ret; 6220 goto out_ret;
6189 } 6221 }
@@ -6228,8 +6260,8 @@ retry_alloc:
6228 ret = VM_FAULT_SIGBUS; 6260 ret = VM_FAULT_SIGBUS;
6229 goto out; 6261 goto out;
6230 } 6262 }
6231 ret = block_page_mkwrite(vma, vmf, get_block); 6263 err = block_page_mkwrite(vma, vmf, get_block);
6232 if (!ret && ext4_should_journal_data(inode)) { 6264 if (!err && ext4_should_journal_data(inode)) {
6233 if (ext4_walk_page_buffers(handle, page_buffers(page), 0, 6265 if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
6234 PAGE_SIZE, NULL, do_journal_get_write_access)) { 6266 PAGE_SIZE, NULL, do_journal_get_write_access)) {
6235 unlock_page(page); 6267 unlock_page(page);
@@ -6240,24 +6272,24 @@ retry_alloc:
6240 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 6272 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
6241 } 6273 }
6242 ext4_journal_stop(handle); 6274 ext4_journal_stop(handle);
6243 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 6275 if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
6244 goto retry_alloc; 6276 goto retry_alloc;
6245out_ret: 6277out_ret:
6246 ret = block_page_mkwrite_return(ret); 6278 ret = block_page_mkwrite_return(err);
6247out: 6279out:
6248 up_read(&EXT4_I(inode)->i_mmap_sem); 6280 up_read(&EXT4_I(inode)->i_mmap_sem);
6249 sb_end_pagefault(inode->i_sb); 6281 sb_end_pagefault(inode->i_sb);
6250 return ret; 6282 return ret;
6251} 6283}
6252 6284
6253int ext4_filemap_fault(struct vm_fault *vmf) 6285vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
6254{ 6286{
6255 struct inode *inode = file_inode(vmf->vma->vm_file); 6287 struct inode *inode = file_inode(vmf->vma->vm_file);
6256 int err; 6288 vm_fault_t ret;
6257 6289
6258 down_read(&EXT4_I(inode)->i_mmap_sem); 6290 down_read(&EXT4_I(inode)->i_mmap_sem);
6259 err = filemap_fault(vmf); 6291 ret = filemap_fault(vmf);
6260 up_read(&EXT4_I(inode)->i_mmap_sem); 6292 up_read(&EXT4_I(inode)->i_mmap_sem);
6261 6293
6262 return err; 6294 return ret;
6263} 6295}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a7074115d6f6..0edee31913d1 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
67 ei1 = EXT4_I(inode1); 67 ei1 = EXT4_I(inode1);
68 ei2 = EXT4_I(inode2); 68 ei2 = EXT4_I(inode2);
69 69
70 swap(inode1->i_flags, inode2->i_flags);
71 swap(inode1->i_version, inode2->i_version); 70 swap(inode1->i_version, inode2->i_version);
72 swap(inode1->i_blocks, inode2->i_blocks); 71 swap(inode1->i_blocks, inode2->i_blocks);
73 swap(inode1->i_bytes, inode2->i_bytes); 72 swap(inode1->i_bytes, inode2->i_bytes);
@@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
85 i_size_write(inode2, isize); 84 i_size_write(inode2, isize);
86} 85}
87 86
87static void reset_inode_seed(struct inode *inode)
88{
89 struct ext4_inode_info *ei = EXT4_I(inode);
90 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
91 __le32 inum = cpu_to_le32(inode->i_ino);
92 __le32 gen = cpu_to_le32(inode->i_generation);
93 __u32 csum;
94
95 if (!ext4_has_metadata_csum(inode->i_sb))
96 return;
97
98 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
99 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
100}
101
88/** 102/**
89 * Swap the information from the given @inode and the inode 103 * Swap the information from the given @inode and the inode
90 * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other 104 * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
@@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
102 struct inode *inode_bl; 116 struct inode *inode_bl;
103 struct ext4_inode_info *ei_bl; 117 struct ext4_inode_info *ei_bl;
104 118
105 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) 119 if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
120 IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
121 ext4_has_inline_data(inode))
106 return -EINVAL; 122 return -EINVAL;
107 123
108 if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) 124 if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
125 !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
109 return -EPERM; 126 return -EPERM;
110 127
111 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); 128 inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
@@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
120 * that only 1 swap_inode_boot_loader is running. */ 137 * that only 1 swap_inode_boot_loader is running. */
121 lock_two_nondirectories(inode, inode_bl); 138 lock_two_nondirectories(inode, inode_bl);
122 139
123 truncate_inode_pages(&inode->i_data, 0);
124 truncate_inode_pages(&inode_bl->i_data, 0);
125
126 /* Wait for all existing dio workers */ 140 /* Wait for all existing dio workers */
127 inode_dio_wait(inode); 141 inode_dio_wait(inode);
128 inode_dio_wait(inode_bl); 142 inode_dio_wait(inode_bl);
129 143
144 truncate_inode_pages(&inode->i_data, 0);
145 truncate_inode_pages(&inode_bl->i_data, 0);
146
130 handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); 147 handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
131 if (IS_ERR(handle)) { 148 if (IS_ERR(handle)) {
132 err = -EINVAL; 149 err = -EINVAL;
@@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
159 176
160 inode->i_generation = prandom_u32(); 177 inode->i_generation = prandom_u32();
161 inode_bl->i_generation = prandom_u32(); 178 inode_bl->i_generation = prandom_u32();
179 reset_inode_seed(inode);
180 reset_inode_seed(inode_bl);
162 181
163 ext4_discard_preallocations(inode); 182 ext4_discard_preallocations(inode);
164 183
@@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
169 inode->i_ino, err); 188 inode->i_ino, err);
170 /* Revert all changes: */ 189 /* Revert all changes: */
171 swap_inode_data(inode, inode_bl); 190 swap_inode_data(inode, inode_bl);
191 ext4_mark_inode_dirty(handle, inode);
172 } else { 192 } else {
173 err = ext4_mark_inode_dirty(handle, inode_bl); 193 err = ext4_mark_inode_dirty(handle, inode_bl);
174 if (err < 0) { 194 if (err < 0) {
@@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
178 /* Revert all changes: */ 198 /* Revert all changes: */
179 swap_inode_data(inode, inode_bl); 199 swap_inode_data(inode, inode_bl);
180 ext4_mark_inode_dirty(handle, inode); 200 ext4_mark_inode_dirty(handle, inode);
201 ext4_mark_inode_dirty(handle, inode_bl);
181 } 202 }
182 } 203 }
183 ext4_journal_stop(handle); 204 ext4_journal_stop(handle);
@@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
339 if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) 360 if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
340 return 0; 361 return 0;
341 362
342 err = mnt_want_write_file(filp);
343 if (err)
344 return err;
345
346 err = -EPERM; 363 err = -EPERM;
347 inode_lock(inode);
348 /* Is it quota file? Do not allow user to mess with it */ 364 /* Is it quota file? Do not allow user to mess with it */
349 if (ext4_is_quota_file(inode)) 365 if (ext4_is_quota_file(inode))
350 goto out_unlock; 366 return err;
351 367
352 err = ext4_get_inode_loc(inode, &iloc); 368 err = ext4_get_inode_loc(inode, &iloc);
353 if (err) 369 if (err)
354 goto out_unlock; 370 return err;
355 371
356 raw_inode = ext4_raw_inode(&iloc); 372 raw_inode = ext4_raw_inode(&iloc);
357 if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { 373 if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
@@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
359 EXT4_SB(sb)->s_want_extra_isize, 375 EXT4_SB(sb)->s_want_extra_isize,
360 &iloc); 376 &iloc);
361 if (err) 377 if (err)
362 goto out_unlock; 378 return err;
363 } else { 379 } else {
364 brelse(iloc.bh); 380 brelse(iloc.bh);
365 } 381 }
366 382
367 dquot_initialize(inode); 383 err = dquot_initialize(inode);
384 if (err)
385 return err;
368 386
369 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 387 handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
370 EXT4_QUOTA_INIT_BLOCKS(sb) + 388 EXT4_QUOTA_INIT_BLOCKS(sb) +
371 EXT4_QUOTA_DEL_BLOCKS(sb) + 3); 389 EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
372 if (IS_ERR(handle)) { 390 if (IS_ERR(handle))
373 err = PTR_ERR(handle); 391 return PTR_ERR(handle);
374 goto out_unlock;
375 }
376 392
377 err = ext4_reserve_inode_write(handle, inode, &iloc); 393 err = ext4_reserve_inode_write(handle, inode, &iloc);
378 if (err) 394 if (err)
@@ -400,9 +416,6 @@ out_dirty:
400 err = rc; 416 err = rc;
401out_stop: 417out_stop:
402 ext4_journal_stop(handle); 418 ext4_journal_stop(handle);
403out_unlock:
404 inode_unlock(inode);
405 mnt_drop_write_file(filp);
406 return err; 419 return err;
407} 420}
408#else 421#else
@@ -626,6 +639,30 @@ group_add_out:
626 return err; 639 return err;
627} 640}
628 641
642static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
643{
644 /*
645 * Project Quota ID state is only allowed to change from within the init
646 * namespace. Enforce that restriction only if we are trying to change
647 * the quota ID state. Everything else is allowed in user namespaces.
648 */
649 if (current_user_ns() == &init_user_ns)
650 return 0;
651
652 if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid)
653 return -EINVAL;
654
655 if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) {
656 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
657 return -EINVAL;
658 } else {
659 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
660 return -EINVAL;
661 }
662
663 return 0;
664}
665
629long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 666long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
630{ 667{
631 struct inode *inode = file_inode(filp); 668 struct inode *inode = file_inode(filp);
@@ -1025,19 +1062,19 @@ resizefs_out:
1025 return err; 1062 return err;
1026 1063
1027 inode_lock(inode); 1064 inode_lock(inode);
1065 err = ext4_ioctl_check_project(inode, &fa);
1066 if (err)
1067 goto out;
1028 flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | 1068 flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
1029 (flags & EXT4_FL_XFLAG_VISIBLE); 1069 (flags & EXT4_FL_XFLAG_VISIBLE);
1030 err = ext4_ioctl_setflags(inode, flags); 1070 err = ext4_ioctl_setflags(inode, flags);
1031 inode_unlock(inode);
1032 mnt_drop_write_file(filp);
1033 if (err) 1071 if (err)
1034 return err; 1072 goto out;
1035
1036 err = ext4_ioctl_setproject(filp, fa.fsx_projid); 1073 err = ext4_ioctl_setproject(filp, fa.fsx_projid);
1037 if (err) 1074out:
1038 return err; 1075 inode_unlock(inode);
1039 1076 mnt_drop_write_file(filp);
1040 return 0; 1077 return err;
1041 } 1078 }
1042 case EXT4_IOC_SHUTDOWN: 1079 case EXT4_IOC_SHUTDOWN:
1043 return ext4_shutdown(sb, arg); 1080 return ext4_shutdown(sb, arg);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e29fce2fbf25..e2248083cdca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4915,9 +4915,17 @@ do_more:
4915 &sbi->s_flex_groups[flex_group].free_clusters); 4915 &sbi->s_flex_groups[flex_group].free_clusters);
4916 } 4916 }
4917 4917
4918 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) 4918 /*
4919 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 4919 * on a bigalloc file system, defer the s_freeclusters_counter
4920 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); 4920 * update to the caller (ext4_remove_space and friends) so they
4921 * can determine if a cluster freed here should be rereserved
4922 */
4923 if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
4924 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4925 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4926 percpu_counter_add(&sbi->s_freeclusters_counter,
4927 count_clusters);
4928 }
4921 4929
4922 ext4_mb_unload_buddy(&e4b); 4930 ext4_mb_unload_buddy(&e4b);
4923 4931
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index a409ff70d67b..2f5be02fc6f6 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode,
516 orig_inode->i_ino, donor_inode->i_ino); 516 orig_inode->i_ino, donor_inode->i_ino);
517 return -EINVAL; 517 return -EINVAL;
518 } 518 }
519 if (orig_eof < orig_start + *len - 1) 519 if (orig_eof <= orig_start)
520 *len = 0;
521 else if (orig_eof < orig_start + *len - 1)
520 *len = orig_eof - orig_start; 522 *len = orig_eof - orig_start;
521 if (donor_eof < donor_start + *len - 1) 523 if (donor_eof <= donor_start)
524 *len = 0;
525 else if (donor_eof < donor_start + *len - 1)
522 *len = donor_eof - donor_start; 526 *len = donor_eof - donor_start;
523 if (!*len) { 527 if (!*len) {
524 ext4_debug("ext4 move extent: len should not be 0 " 528 ext4_debug("ext4 move extent: len should not be 0 "
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 377d516c475f..67a38532032a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2261,7 +2261,7 @@ again:
2261 dxroot->info.indirect_levels += 1; 2261 dxroot->info.indirect_levels += 1;
2262 dxtrace(printk(KERN_DEBUG 2262 dxtrace(printk(KERN_DEBUG
2263 "Creating %d level index...\n", 2263 "Creating %d level index...\n",
2264 info->indirect_levels)); 2264 dxroot->info.indirect_levels));
2265 err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); 2265 err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
2266 if (err) 2266 if (err)
2267 goto journal_error; 2267 goto journal_error;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1145109968ef..a221f1cdf704 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb)
914 for (type = 0; type < EXT4_MAXQUOTAS; type++) 914 for (type = 0; type < EXT4_MAXQUOTAS; type++)
915 ext4_quota_off(sb, type); 915 ext4_quota_off(sb, type);
916} 916}
917
918/*
919 * This is a helper function which is used in the mount/remount
920 * codepaths (which holds s_umount) to fetch the quota file name.
921 */
922static inline char *get_qf_name(struct super_block *sb,
923 struct ext4_sb_info *sbi,
924 int type)
925{
926 return rcu_dereference_protected(sbi->s_qf_names[type],
927 lockdep_is_held(&sb->s_umount));
928}
917#else 929#else
918static inline void ext4_quota_off_umount(struct super_block *sb) 930static inline void ext4_quota_off_umount(struct super_block *sb)
919{ 931{
@@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb)
965 percpu_free_rwsem(&sbi->s_journal_flag_rwsem); 977 percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
966#ifdef CONFIG_QUOTA 978#ifdef CONFIG_QUOTA
967 for (i = 0; i < EXT4_MAXQUOTAS; i++) 979 for (i = 0; i < EXT4_MAXQUOTAS; i++)
968 kfree(sbi->s_qf_names[i]); 980 kfree(get_qf_name(sb, sbi, i));
969#endif 981#endif
970 982
971 /* Debugging code just in case the in-memory inode orphan list 983 /* Debugging code just in case the in-memory inode orphan list
@@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
1040 ei->i_da_metadata_calc_len = 0; 1052 ei->i_da_metadata_calc_len = 0;
1041 ei->i_da_metadata_calc_last_lblock = 0; 1053 ei->i_da_metadata_calc_last_lblock = 0;
1042 spin_lock_init(&(ei->i_block_reservation_lock)); 1054 spin_lock_init(&(ei->i_block_reservation_lock));
1055 ext4_init_pending_tree(&ei->i_pending_tree);
1043#ifdef CONFIG_QUOTA 1056#ifdef CONFIG_QUOTA
1044 ei->i_reserved_quota = 0; 1057 ei->i_reserved_quota = 0;
1045 memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); 1058 memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
@@ -1530,11 +1543,10 @@ static const char deprecated_msg[] =
1530static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) 1543static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1531{ 1544{
1532 struct ext4_sb_info *sbi = EXT4_SB(sb); 1545 struct ext4_sb_info *sbi = EXT4_SB(sb);
1533 char *qname; 1546 char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
1534 int ret = -1; 1547 int ret = -1;
1535 1548
1536 if (sb_any_quota_loaded(sb) && 1549 if (sb_any_quota_loaded(sb) && !old_qname) {
1537 !sbi->s_qf_names[qtype]) {
1538 ext4_msg(sb, KERN_ERR, 1550 ext4_msg(sb, KERN_ERR,
1539 "Cannot change journaled " 1551 "Cannot change journaled "
1540 "quota options when quota turned on"); 1552 "quota options when quota turned on");
@@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1551 "Not enough memory for storing quotafile name"); 1563 "Not enough memory for storing quotafile name");
1552 return -1; 1564 return -1;
1553 } 1565 }
1554 if (sbi->s_qf_names[qtype]) { 1566 if (old_qname) {
1555 if (strcmp(sbi->s_qf_names[qtype], qname) == 0) 1567 if (strcmp(old_qname, qname) == 0)
1556 ret = 1; 1568 ret = 1;
1557 else 1569 else
1558 ext4_msg(sb, KERN_ERR, 1570 ext4_msg(sb, KERN_ERR,
@@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1565 "quotafile must be on filesystem root"); 1577 "quotafile must be on filesystem root");
1566 goto errout; 1578 goto errout;
1567 } 1579 }
1568 sbi->s_qf_names[qtype] = qname; 1580 rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
1569 set_opt(sb, QUOTA); 1581 set_opt(sb, QUOTA);
1570 return 1; 1582 return 1;
1571errout: 1583errout:
@@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype)
1577{ 1589{
1578 1590
1579 struct ext4_sb_info *sbi = EXT4_SB(sb); 1591 struct ext4_sb_info *sbi = EXT4_SB(sb);
1592 char *old_qname = get_qf_name(sb, sbi, qtype);
1580 1593
1581 if (sb_any_quota_loaded(sb) && 1594 if (sb_any_quota_loaded(sb) && old_qname) {
1582 sbi->s_qf_names[qtype]) {
1583 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" 1595 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1584 " when quota turned on"); 1596 " when quota turned on");
1585 return -1; 1597 return -1;
1586 } 1598 }
1587 kfree(sbi->s_qf_names[qtype]); 1599 rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
1588 sbi->s_qf_names[qtype] = NULL; 1600 synchronize_rcu();
1601 kfree(old_qname);
1589 return 1; 1602 return 1;
1590} 1603}
1591#endif 1604#endif
@@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb,
1960 int is_remount) 1973 int is_remount)
1961{ 1974{
1962 struct ext4_sb_info *sbi = EXT4_SB(sb); 1975 struct ext4_sb_info *sbi = EXT4_SB(sb);
1963 char *p; 1976 char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
1964 substring_t args[MAX_OPT_ARGS]; 1977 substring_t args[MAX_OPT_ARGS];
1965 int token; 1978 int token;
1966 1979
@@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb,
1991 "Cannot enable project quota enforcement."); 2004 "Cannot enable project quota enforcement.");
1992 return 0; 2005 return 0;
1993 } 2006 }
1994 if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { 2007 usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
1995 if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) 2008 grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
2009 if (usr_qf_name || grp_qf_name) {
2010 if (test_opt(sb, USRQUOTA) && usr_qf_name)
1996 clear_opt(sb, USRQUOTA); 2011 clear_opt(sb, USRQUOTA);
1997 2012
1998 if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) 2013 if (test_opt(sb, GRPQUOTA) && grp_qf_name)
1999 clear_opt(sb, GRPQUOTA); 2014 clear_opt(sb, GRPQUOTA);
2000 2015
2001 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { 2016 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
@@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
2029{ 2044{
2030#if defined(CONFIG_QUOTA) 2045#if defined(CONFIG_QUOTA)
2031 struct ext4_sb_info *sbi = EXT4_SB(sb); 2046 struct ext4_sb_info *sbi = EXT4_SB(sb);
2047 char *usr_qf_name, *grp_qf_name;
2032 2048
2033 if (sbi->s_jquota_fmt) { 2049 if (sbi->s_jquota_fmt) {
2034 char *fmtname = ""; 2050 char *fmtname = "";
@@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
2047 seq_printf(seq, ",jqfmt=%s", fmtname); 2063 seq_printf(seq, ",jqfmt=%s", fmtname);
2048 } 2064 }
2049 2065
2050 if (sbi->s_qf_names[USRQUOTA]) 2066 rcu_read_lock();
2051 seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); 2067 usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
2052 2068 grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
2053 if (sbi->s_qf_names[GRPQUOTA]) 2069 if (usr_qf_name)
2054 seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); 2070 seq_show_option(seq, "usrjquota", usr_qf_name);
2071 if (grp_qf_name)
2072 seq_show_option(seq, "grpjquota", grp_qf_name);
2073 rcu_read_unlock();
2055#endif 2074#endif
2056} 2075}
2057 2076
@@ -5103,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
5103 int err = 0; 5122 int err = 0;
5104#ifdef CONFIG_QUOTA 5123#ifdef CONFIG_QUOTA
5105 int i, j; 5124 int i, j;
5125 char *to_free[EXT4_MAXQUOTAS];
5106#endif 5126#endif
5107 char *orig_data = kstrdup(data, GFP_KERNEL); 5127 char *orig_data = kstrdup(data, GFP_KERNEL);
5108 5128
@@ -5122,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
5122 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 5142 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
5123 for (i = 0; i < EXT4_MAXQUOTAS; i++) 5143 for (i = 0; i < EXT4_MAXQUOTAS; i++)
5124 if (sbi->s_qf_names[i]) { 5144 if (sbi->s_qf_names[i]) {
5125 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], 5145 char *qf_name = get_qf_name(sb, sbi, i);
5126 GFP_KERNEL); 5146
5147 old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
5127 if (!old_opts.s_qf_names[i]) { 5148 if (!old_opts.s_qf_names[i]) {
5128 for (j = 0; j < i; j++) 5149 for (j = 0; j < i; j++)
5129 kfree(old_opts.s_qf_names[j]); 5150 kfree(old_opts.s_qf_names[j]);
@@ -5352,9 +5373,12 @@ restore_opts:
5352#ifdef CONFIG_QUOTA 5373#ifdef CONFIG_QUOTA
5353 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 5374 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
5354 for (i = 0; i < EXT4_MAXQUOTAS; i++) { 5375 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
5355 kfree(sbi->s_qf_names[i]); 5376 to_free[i] = get_qf_name(sb, sbi, i);
5356 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 5377 rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
5357 } 5378 }
5379 synchronize_rcu();
5380 for (i = 0; i < EXT4_MAXQUOTAS; i++)
5381 kfree(to_free[i]);
5358#endif 5382#endif
5359 kfree(orig_data); 5383 kfree(orig_data);
5360 return err; 5384 return err;
@@ -5545,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type)
5545 */ 5569 */
5546static int ext4_quota_on_mount(struct super_block *sb, int type) 5570static int ext4_quota_on_mount(struct super_block *sb, int type)
5547{ 5571{
5548 return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 5572 return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
5549 EXT4_SB(sb)->s_jquota_fmt, type); 5573 EXT4_SB(sb)->s_jquota_fmt, type);
5550} 5574}
5551 5575
@@ -5954,6 +5978,10 @@ static int __init ext4_init_fs(void)
5954 if (err) 5978 if (err)
5955 return err; 5979 return err;
5956 5980
5981 err = ext4_init_pending();
5982 if (err)
5983 goto out6;
5984
5957 err = ext4_init_pageio(); 5985 err = ext4_init_pageio();
5958 if (err) 5986 if (err)
5959 goto out5; 5987 goto out5;
@@ -5992,6 +6020,8 @@ out3:
5992out4: 6020out4:
5993 ext4_exit_pageio(); 6021 ext4_exit_pageio();
5994out5: 6022out5:
6023 ext4_exit_pending();
6024out6:
5995 ext4_exit_es(); 6025 ext4_exit_es();
5996 6026
5997 return err; 6027 return err;
@@ -6009,6 +6039,7 @@ static void __exit ext4_exit_fs(void)
6009 ext4_exit_system_zone(); 6039 ext4_exit_system_zone();
6010 ext4_exit_pageio(); 6040 ext4_exit_pageio();
6011 ext4_exit_es(); 6041 ext4_exit_es();
6042 ext4_exit_pending();
6012} 6043}
6013 6044
6014MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 6045MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c125d662777c..26f8d7e46462 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -251,8 +251,8 @@ restart:
251 bh = jh2bh(jh); 251 bh = jh2bh(jh);
252 252
253 if (buffer_locked(bh)) { 253 if (buffer_locked(bh)) {
254 spin_unlock(&journal->j_list_lock);
255 get_bh(bh); 254 get_bh(bh);
255 spin_unlock(&journal->j_list_lock);
256 wait_on_buffer(bh); 256 wait_on_buffer(bh);
257 /* the journal_head may have gone by now */ 257 /* the journal_head may have gone by now */
258 BUFFER_TRACE(bh, "brelse"); 258 BUFFER_TRACE(bh, "brelse");
@@ -333,8 +333,8 @@ restart2:
333 jh = transaction->t_checkpoint_io_list; 333 jh = transaction->t_checkpoint_io_list;
334 bh = jh2bh(jh); 334 bh = jh2bh(jh);
335 if (buffer_locked(bh)) { 335 if (buffer_locked(bh)) {
336 spin_unlock(&journal->j_list_lock);
337 get_bh(bh); 336 get_bh(bh);
337 spin_unlock(&journal->j_list_lock);
338 wait_on_buffer(bh); 338 wait_on_buffer(bh);
339 /* the journal_head may have gone by now */ 339 /* the journal_head may have gone by now */
340 BUFFER_TRACE(bh, "brelse"); 340 BUFFER_TRACE(bh, "brelse");
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 96225a77c112..7b73ef7f902d 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -242,7 +242,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to);
242int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 242int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
243 get_block_t get_block); 243 get_block_t get_block);
244/* Convert errno to return value from ->page_mkwrite() call */ 244/* Convert errno to return value from ->page_mkwrite() call */
245static inline int block_page_mkwrite_return(int err) 245static inline vm_fault_t block_page_mkwrite_return(int err)
246{ 246{
247 if (err == 0) 247 if (err == 0)
248 return VM_FAULT_LOCKED; 248 return VM_FAULT_LOCKED;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 0e31eb136c57..698e0d8a5ca4 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -17,6 +17,7 @@ struct mpage_da_data;
17struct ext4_map_blocks; 17struct ext4_map_blocks;
18struct extent_status; 18struct extent_status;
19struct ext4_fsmap; 19struct ext4_fsmap;
20struct partial_cluster;
20 21
21#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) 22#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
22 23
@@ -2035,21 +2036,23 @@ TRACE_EVENT(ext4_ext_show_extent,
2035); 2036);
2036 2037
2037TRACE_EVENT(ext4_remove_blocks, 2038TRACE_EVENT(ext4_remove_blocks,
2038 TP_PROTO(struct inode *inode, struct ext4_extent *ex, 2039 TP_PROTO(struct inode *inode, struct ext4_extent *ex,
2039 ext4_lblk_t from, ext4_fsblk_t to, 2040 ext4_lblk_t from, ext4_fsblk_t to,
2040 long long partial_cluster), 2041 struct partial_cluster *pc),
2041 2042
2042 TP_ARGS(inode, ex, from, to, partial_cluster), 2043 TP_ARGS(inode, ex, from, to, pc),
2043 2044
2044 TP_STRUCT__entry( 2045 TP_STRUCT__entry(
2045 __field( dev_t, dev ) 2046 __field( dev_t, dev )
2046 __field( ino_t, ino ) 2047 __field( ino_t, ino )
2047 __field( ext4_lblk_t, from ) 2048 __field( ext4_lblk_t, from )
2048 __field( ext4_lblk_t, to ) 2049 __field( ext4_lblk_t, to )
2049 __field( long long, partial )
2050 __field( ext4_fsblk_t, ee_pblk ) 2050 __field( ext4_fsblk_t, ee_pblk )
2051 __field( ext4_lblk_t, ee_lblk ) 2051 __field( ext4_lblk_t, ee_lblk )
2052 __field( unsigned short, ee_len ) 2052 __field( unsigned short, ee_len )
2053 __field( ext4_fsblk_t, pc_pclu )
2054 __field( ext4_lblk_t, pc_lblk )
2055 __field( int, pc_state)
2053 ), 2056 ),
2054 2057
2055 TP_fast_assign( 2058 TP_fast_assign(
@@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks,
2057 __entry->ino = inode->i_ino; 2060 __entry->ino = inode->i_ino;
2058 __entry->from = from; 2061 __entry->from = from;
2059 __entry->to = to; 2062 __entry->to = to;
2060 __entry->partial = partial_cluster;
2061 __entry->ee_pblk = ext4_ext_pblock(ex); 2063 __entry->ee_pblk = ext4_ext_pblock(ex);
2062 __entry->ee_lblk = le32_to_cpu(ex->ee_block); 2064 __entry->ee_lblk = le32_to_cpu(ex->ee_block);
2063 __entry->ee_len = ext4_ext_get_actual_len(ex); 2065 __entry->ee_len = ext4_ext_get_actual_len(ex);
2066 __entry->pc_pclu = pc->pclu;
2067 __entry->pc_lblk = pc->lblk;
2068 __entry->pc_state = pc->state;
2064 ), 2069 ),
2065 2070
2066 TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" 2071 TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]"
2067 "from %u to %u partial_cluster %lld", 2072 "from %u to %u partial [pclu %lld lblk %u state %d]",
2068 MAJOR(__entry->dev), MINOR(__entry->dev), 2073 MAJOR(__entry->dev), MINOR(__entry->dev),
2069 (unsigned long) __entry->ino, 2074 (unsigned long) __entry->ino,
2070 (unsigned) __entry->ee_lblk, 2075 (unsigned) __entry->ee_lblk,
@@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks,
2072 (unsigned short) __entry->ee_len, 2077 (unsigned short) __entry->ee_len,
2073 (unsigned) __entry->from, 2078 (unsigned) __entry->from,
2074 (unsigned) __entry->to, 2079 (unsigned) __entry->to,
2075 (long long) __entry->partial) 2080 (long long) __entry->pc_pclu,
2081 (unsigned int) __entry->pc_lblk,
2082 (int) __entry->pc_state)
2076); 2083);
2077 2084
2078TRACE_EVENT(ext4_ext_rm_leaf, 2085TRACE_EVENT(ext4_ext_rm_leaf,
2079 TP_PROTO(struct inode *inode, ext4_lblk_t start, 2086 TP_PROTO(struct inode *inode, ext4_lblk_t start,
2080 struct ext4_extent *ex, 2087 struct ext4_extent *ex,
2081 long long partial_cluster), 2088 struct partial_cluster *pc),
2082 2089
2083 TP_ARGS(inode, start, ex, partial_cluster), 2090 TP_ARGS(inode, start, ex, pc),
2084 2091
2085 TP_STRUCT__entry( 2092 TP_STRUCT__entry(
2086 __field( dev_t, dev ) 2093 __field( dev_t, dev )
2087 __field( ino_t, ino ) 2094 __field( ino_t, ino )
2088 __field( long long, partial )
2089 __field( ext4_lblk_t, start ) 2095 __field( ext4_lblk_t, start )
2090 __field( ext4_lblk_t, ee_lblk ) 2096 __field( ext4_lblk_t, ee_lblk )
2091 __field( ext4_fsblk_t, ee_pblk ) 2097 __field( ext4_fsblk_t, ee_pblk )
2092 __field( short, ee_len ) 2098 __field( short, ee_len )
2099 __field( ext4_fsblk_t, pc_pclu )
2100 __field( ext4_lblk_t, pc_lblk )
2101 __field( int, pc_state)
2093 ), 2102 ),
2094 2103
2095 TP_fast_assign( 2104 TP_fast_assign(
2096 __entry->dev = inode->i_sb->s_dev; 2105 __entry->dev = inode->i_sb->s_dev;
2097 __entry->ino = inode->i_ino; 2106 __entry->ino = inode->i_ino;
2098 __entry->partial = partial_cluster;
2099 __entry->start = start; 2107 __entry->start = start;
2100 __entry->ee_lblk = le32_to_cpu(ex->ee_block); 2108 __entry->ee_lblk = le32_to_cpu(ex->ee_block);
2101 __entry->ee_pblk = ext4_ext_pblock(ex); 2109 __entry->ee_pblk = ext4_ext_pblock(ex);
2102 __entry->ee_len = ext4_ext_get_actual_len(ex); 2110 __entry->ee_len = ext4_ext_get_actual_len(ex);
2111 __entry->pc_pclu = pc->pclu;
2112 __entry->pc_lblk = pc->lblk;
2113 __entry->pc_state = pc->state;
2103 ), 2114 ),
2104 2115
2105 TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" 2116 TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]"
2106 "partial_cluster %lld", 2117 "partial [pclu %lld lblk %u state %d]",
2107 MAJOR(__entry->dev), MINOR(__entry->dev), 2118 MAJOR(__entry->dev), MINOR(__entry->dev),
2108 (unsigned long) __entry->ino, 2119 (unsigned long) __entry->ino,
2109 (unsigned) __entry->start, 2120 (unsigned) __entry->start,
2110 (unsigned) __entry->ee_lblk, 2121 (unsigned) __entry->ee_lblk,
2111 (unsigned long long) __entry->ee_pblk, 2122 (unsigned long long) __entry->ee_pblk,
2112 (unsigned short) __entry->ee_len, 2123 (unsigned short) __entry->ee_len,
2113 (long long) __entry->partial) 2124 (long long) __entry->pc_pclu,
2125 (unsigned int) __entry->pc_lblk,
2126 (int) __entry->pc_state)
2114); 2127);
2115 2128
2116TRACE_EVENT(ext4_ext_rm_idx, 2129TRACE_EVENT(ext4_ext_rm_idx,
@@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space,
2168 2181
2169TRACE_EVENT(ext4_ext_remove_space_done, 2182TRACE_EVENT(ext4_ext_remove_space_done,
2170 TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, 2183 TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
2171 int depth, long long partial, __le16 eh_entries), 2184 int depth, struct partial_cluster *pc, __le16 eh_entries),
2172 2185
2173 TP_ARGS(inode, start, end, depth, partial, eh_entries), 2186 TP_ARGS(inode, start, end, depth, pc, eh_entries),
2174 2187
2175 TP_STRUCT__entry( 2188 TP_STRUCT__entry(
2176 __field( dev_t, dev ) 2189 __field( dev_t, dev )
@@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done,
2178 __field( ext4_lblk_t, start ) 2191 __field( ext4_lblk_t, start )
2179 __field( ext4_lblk_t, end ) 2192 __field( ext4_lblk_t, end )
2180 __field( int, depth ) 2193 __field( int, depth )
2181 __field( long long, partial ) 2194 __field( ext4_fsblk_t, pc_pclu )
2195 __field( ext4_lblk_t, pc_lblk )
2196 __field( int, pc_state )
2182 __field( unsigned short, eh_entries ) 2197 __field( unsigned short, eh_entries )
2183 ), 2198 ),
2184 2199
@@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done,
2188 __entry->start = start; 2203 __entry->start = start;
2189 __entry->end = end; 2204 __entry->end = end;
2190 __entry->depth = depth; 2205 __entry->depth = depth;
2191 __entry->partial = partial; 2206 __entry->pc_pclu = pc->pclu;
2207 __entry->pc_lblk = pc->lblk;
2208 __entry->pc_state = pc->state;
2192 __entry->eh_entries = le16_to_cpu(eh_entries); 2209 __entry->eh_entries = le16_to_cpu(eh_entries);
2193 ), 2210 ),
2194 2211
2195 TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " 2212 TP_printk("dev %d,%d ino %lu since %u end %u depth %d "
2213 "partial [pclu %lld lblk %u state %d] "
2196 "remaining_entries %u", 2214 "remaining_entries %u",
2197 MAJOR(__entry->dev), MINOR(__entry->dev), 2215 MAJOR(__entry->dev), MINOR(__entry->dev),
2198 (unsigned long) __entry->ino, 2216 (unsigned long) __entry->ino,
2199 (unsigned) __entry->start, 2217 (unsigned) __entry->start,
2200 (unsigned) __entry->end, 2218 (unsigned) __entry->end,
2201 __entry->depth, 2219 __entry->depth,
2202 (long long) __entry->partial, 2220 (long long) __entry->pc_pclu,
2221 (unsigned int) __entry->pc_lblk,
2222 (int) __entry->pc_state,
2203 (unsigned short) __entry->eh_entries) 2223 (unsigned short) __entry->eh_entries)
2204); 2224);
2205 2225
@@ -2270,7 +2290,7 @@ TRACE_EVENT(ext4_es_remove_extent,
2270 __entry->lblk, __entry->len) 2290 __entry->lblk, __entry->len)
2271); 2291);
2272 2292
2273TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, 2293TRACE_EVENT(ext4_es_find_extent_range_enter,
2274 TP_PROTO(struct inode *inode, ext4_lblk_t lblk), 2294 TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
2275 2295
2276 TP_ARGS(inode, lblk), 2296 TP_ARGS(inode, lblk),
@@ -2292,7 +2312,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter,
2292 (unsigned long) __entry->ino, __entry->lblk) 2312 (unsigned long) __entry->ino, __entry->lblk)
2293); 2313);
2294 2314
2295TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, 2315TRACE_EVENT(ext4_es_find_extent_range_exit,
2296 TP_PROTO(struct inode *inode, struct extent_status *es), 2316 TP_PROTO(struct inode *inode, struct extent_status *es),
2297 2317
2298 TP_ARGS(inode, es), 2318 TP_ARGS(inode, es),
@@ -2512,6 +2532,41 @@ TRACE_EVENT(ext4_es_shrink,
2512 __entry->scan_time, __entry->nr_skipped, __entry->retried) 2532 __entry->scan_time, __entry->nr_skipped, __entry->retried)
2513); 2533);
2514 2534
2535TRACE_EVENT(ext4_es_insert_delayed_block,
2536 TP_PROTO(struct inode *inode, struct extent_status *es,
2537 bool allocated),
2538
2539 TP_ARGS(inode, es, allocated),
2540
2541 TP_STRUCT__entry(
2542 __field( dev_t, dev )
2543 __field( ino_t, ino )
2544 __field( ext4_lblk_t, lblk )
2545 __field( ext4_lblk_t, len )
2546 __field( ext4_fsblk_t, pblk )
2547 __field( char, status )
2548 __field( bool, allocated )
2549 ),
2550
2551 TP_fast_assign(
2552 __entry->dev = inode->i_sb->s_dev;
2553 __entry->ino = inode->i_ino;
2554 __entry->lblk = es->es_lblk;
2555 __entry->len = es->es_len;
2556 __entry->pblk = ext4_es_pblock(es);
2557 __entry->status = ext4_es_status(es);
2558 __entry->allocated = allocated;
2559 ),
2560
2561 TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
2562 "allocated %d",
2563 MAJOR(__entry->dev), MINOR(__entry->dev),
2564 (unsigned long) __entry->ino,
2565 __entry->lblk, __entry->len,
2566 __entry->pblk, show_extent_status(__entry->status),
2567 __entry->allocated)
2568);
2569
2515/* fsmap traces */ 2570/* fsmap traces */
2516DECLARE_EVENT_CLASS(ext4_fsmap_class, 2571DECLARE_EVENT_CLASS(ext4_fsmap_class,
2517 TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, 2572 TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,