diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-24 12:42:24 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-24 12:42:24 -0400 |
commit | 5993692f09582accb4cb7af11d344598af43c3b8 (patch) | |
tree | 062447eb44769d6da6e50302853eac1bb1d6e5d3 | |
parent | d6edff78fe9e34dbea1bec7dc26cfce92c6d96d5 (diff) | |
parent | 33458eaba4dfe778a426df6a19b7aad2ff9f7eec (diff) |
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
- further restructure ext4 documentation
- fix up ext4's delayed allocation for bigalloc file systems
- fix up some syzbot-detected races in EXT4_IOC_MOVE_EXT,
EXT4_IOC_SWAP_BOOT, and ext4_remount
- ... and a few other miscellaneous bugs and optimizations.
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits)
ext4: fix use-after-free race in ext4_remount()'s error path
ext4: cache NULL when both default_acl and acl are NULL
docs: promote the ext4 data structures book to top level
docs: move ext4 administrative docs to admin-guide/
jbd2: fix use after free in jbd2_log_do_checkpoint()
ext4: propagate error from dquot_initialize() in EXT4_IOC_FSSETXATTR
ext4: fix setattr project check in fssetxattr ioctl
docs: make ext4 readme tables readable
docs: fix ext4 documentation table formatting problems
docs: generate a separate ext4 pdf file from the documentation
ext4: convert fault handler to use vm_fault_t type
ext4: initialize retries variable in ext4_da_write_inline_data_begin()
ext4: fix EXT4_IOC_SWAP_BOOT
ext4: fix build error when DX_DEBUG is defined
ext4: fix argument checking in EXT4_IOC_MOVE_EXT
ext4: fix reserved cluster accounting at page invalidation time
ext4: adjust reserved cluster count when removing extents
ext4: reduce reserved cluster count by number of allocated clusters
ext4: fix reserved cluster accounting at delayed write time
ext4: add new pending reservation mechanism
...
-rw-r--r-- | Documentation/admin-guide/ext4.rst | 574 | ||||
-rw-r--r-- | Documentation/admin-guide/index.rst | 1 | ||||
-rw-r--r-- | Documentation/conf.py | 4 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/about.rst (renamed from Documentation/filesystems/ext4/ondisk/about.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/allocators.rst (renamed from Documentation/filesystems/ext4/ondisk/allocators.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/attributes.rst (renamed from Documentation/filesystems/ext4/ondisk/attributes.rst) | 8 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/bigalloc.rst (renamed from Documentation/filesystems/ext4/ondisk/bigalloc.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/bitmaps.rst (renamed from Documentation/filesystems/ext4/ondisk/bitmaps.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blockgroup.rst (renamed from Documentation/filesystems/ext4/ondisk/blockgroup.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blockmap.rst (renamed from Documentation/filesystems/ext4/ondisk/blockmap.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/blocks.rst (renamed from Documentation/filesystems/ext4/ondisk/blocks.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/checksums.rst (renamed from Documentation/filesystems/ext4/ondisk/checksums.rst) | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/directory.rst (renamed from Documentation/filesystems/ext4/ondisk/directory.rst) | 18 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/dynamic.rst (renamed from Documentation/filesystems/ext4/ondisk/dynamic.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/eainode.rst (renamed from Documentation/filesystems/ext4/ondisk/eainode.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/ext4.rst | 613 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/globals.rst (renamed from Documentation/filesystems/ext4/ondisk/globals.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/group_descr.rst (renamed from Documentation/filesystems/ext4/ondisk/group_descr.rst) | 4 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/ifork.rst (renamed from Documentation/filesystems/ext4/ondisk/ifork.rst) | 8 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/index.rst | 19 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/inlinedata.rst (renamed from Documentation/filesystems/ext4/ondisk/inlinedata.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/inodes.rst) | 19 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/journal.rst (renamed from Documentation/filesystems/ext4/ondisk/journal.rst) | 32 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/mmp.rst (renamed from Documentation/filesystems/ext4/ondisk/mmp.rst) | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/ondisk/index.rst | 9 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/overview.rst (renamed from Documentation/filesystems/ext4/ondisk/overview.rst) | 0 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/special_inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/special_inodes.rst) | 2 | ||||
-rw-r--r-- | Documentation/filesystems/ext4/super.rst (renamed from Documentation/filesystems/ext4/ondisk/super.rst) | 24 | ||||
-rw-r--r-- | fs/ext4/acl.c | 4 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 17 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 13 | ||||
-rw-r--r-- | fs/ext4/extents.c | 595 | ||||
-rw-r--r-- | fs/ext4/extents_status.c | 654 | ||||
-rw-r--r-- | fs/ext4/extents_status.h | 80 | ||||
-rw-r--r-- | fs/ext4/inline.c | 2 | ||||
-rw-r--r-- | fs/ext4/inode.c | 142 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 97 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 14 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 8 | ||||
-rw-r--r-- | fs/ext4/namei.c | 2 | ||||
-rw-r--r-- | fs/ext4/super.c | 81 | ||||
-rw-r--r-- | fs/jbd2/checkpoint.c | 4 | ||||
-rw-r--r-- | include/linux/buffer_head.h | 2 | ||||
-rw-r--r-- | include/trace/events/ext4.h | 99 |
44 files changed, 1984 insertions, 1169 deletions
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst new file mode 100644 index 000000000000..e506d3dae510 --- /dev/null +++ b/Documentation/admin-guide/ext4.rst | |||
@@ -0,0 +1,574 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | ======================== | ||
4 | ext4 General Information | ||
5 | ======================== | ||
6 | |||
7 | Ext4 is an advanced level of the ext3 filesystem which incorporates | ||
8 | scalability and reliability enhancements for supporting large filesystems | ||
9 | (64 bit) in keeping with increasing disk capacities and state-of-the-art | ||
10 | feature requirements. | ||
11 | |||
12 | Mailing list: linux-ext4@vger.kernel.org | ||
13 | Web site: http://ext4.wiki.kernel.org | ||
14 | |||
15 | |||
16 | Quick usage instructions | ||
17 | ======================== | ||
18 | |||
19 | Note: More extensive information for getting started with ext4 can be | ||
20 | found at the ext4 wiki site at the URL: | ||
21 | http://ext4.wiki.kernel.org/index.php/Ext4_Howto | ||
22 | |||
23 | - The latest version of e2fsprogs can be found at: | ||
24 | |||
25 | https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | ||
26 | |||
27 | or | ||
28 | |||
29 | http://sourceforge.net/project/showfiles.php?group_id=2406 | ||
30 | |||
31 | or grab the latest git repository from: | ||
32 | |||
33 | https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git | ||
34 | |||
35 | - Create a new filesystem using the ext4 filesystem type: | ||
36 | |||
37 | # mke2fs -t ext4 /dev/hda1 | ||
38 | |||
39 | Or to configure an existing ext3 filesystem to support extents: | ||
40 | |||
41 | # tune2fs -O extents /dev/hda1 | ||
42 | |||
43 | If the filesystem was created with 128 byte inodes, it can be | ||
44 | converted to use 256 byte for greater efficiency via: | ||
45 | |||
46 | # tune2fs -I 256 /dev/hda1 | ||
47 | |||
48 | - Mounting: | ||
49 | |||
50 | # mount -t ext4 /dev/hda1 /wherever | ||
51 | |||
52 | - When comparing performance with other filesystems, it's always | ||
53 | important to try multiple workloads; very often a subtle change in a | ||
54 | workload parameter can completely change the ranking of which | ||
55 | filesystems do well compared to others. When comparing versus ext3, | ||
56 | note that ext4 enables write barriers by default, while ext3 does | ||
57 | not enable write barriers by default. So it is useful to | ||
58 | explicitly specify whether barriers are enabled or not via the | ||
59 | '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems | ||
60 | for a fair comparison. When tuning ext3 for best benchmark numbers, | ||
61 | it is often worthwhile to try changing the data journaling mode; '-o | ||
62 | data=writeback' can be faster for some workloads. (Note however that | ||
63 | running mounted with data=writeback can potentially leave stale data | ||
64 | exposed in recently written files in case of an unclean shutdown, | ||
65 | which could be a security exposure in some situations.) Configuring | ||
66 | the filesystem with a large journal can also be helpful for | ||
67 | metadata-intensive workloads. | ||
68 | |||
69 | Features | ||
70 | ======== | ||
71 | |||
72 | Currently Available | ||
73 | ------------------- | ||
74 | |||
75 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) | ||
76 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | ||
77 | * extent format more robust in face of on-disk corruption due to magics, | ||
78 |   internal redundancy in tree | ||
79 | * improved file allocation (multi-block alloc) | ||
80 | * lift 32000 subdirectory limit imposed by i_links_count[1] | ||
81 | * nsec timestamps for mtime, atime, ctime, create time | ||
82 | * inode version field on disk (NFSv4, Lustre) | ||
83 | * reduced e2fsck time via uninit_bg feature | ||
84 | * journal checksumming for robustness, performance | ||
85 | * persistent file preallocation (e.g. for streaming media, databases) | ||
86 | * ability to pack bitmaps and inode tables into larger virtual groups via the | ||
87 | flex_bg feature | ||
88 | * large file support | ||
89 | * inode allocation using large virtual block groups via flex_bg | ||
90 | * delayed allocation | ||
91 | * large block (up to pagesize) support | ||
92 | * efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force | ||
93 | the ordering) | ||
94 | |||
95 | [1] Filesystems with a block size of 1k may see a limit imposed by the | ||
96 | directory hash tree having a maximum depth of two. | ||
97 | |||
98 | Options | ||
99 | ======= | ||
100 | |||
101 | When mounting an ext4 filesystem, the following options are accepted: | ||
102 | (*) == default | ||
103 | |||
104 | ro | ||
105 | Mount filesystem read only. Note that ext4 will replay the journal (and | ||
106 | thus write to the partition) even when mounted "read only". The mount | ||
107 | options "ro,noload" can be used to prevent writes to the filesystem. | ||
108 | |||
109 | journal_checksum | ||
110 | Enable checksumming of the journal transactions. This will allow the | ||
111 | recovery code in e2fsck and the kernel to detect corruption in the | ||
112 | journal. It is a compatible change and will be ignored by older | ||
113 | kernels. | ||
114 | |||
115 | journal_async_commit | ||
116 | Commit block can be written to disk without waiting for descriptor | ||
117 | blocks. If enabled older kernels cannot mount the device. This will | ||
118 | enable 'journal_checksum' internally. | ||
119 | |||
120 | journal_path=path, journal_dev=devnum | ||
121 | When the external journal device's major/minor numbers have changed, | ||
122 | these options allow the user to specify the new journal location. The | ||
123 | journal device is identified through either its new major/minor numbers | ||
124 | encoded in devnum, or via a path to the device. | ||
125 | |||
126 | norecovery, noload | ||
127 | Don't load the journal on mounting. Note that if the filesystem was | ||
128 | not unmounted cleanly, skipping the journal replay will lead to the | ||
129 | filesystem containing inconsistencies that can lead to any number of | ||
130 | problems. | ||
131 | |||
132 | data=journal | ||
133 | All data are committed into the journal prior to being written into the | ||
134 | main file system. Enabling this mode will disable delayed allocation | ||
135 | and O_DIRECT support. | ||
136 | |||
137 | data=ordered (*) | ||
138 | All data are forced directly out to the main file system prior to its | ||
139 | metadata being committed to the journal. | ||
140 | |||
141 | data=writeback | ||
142 | Data ordering is not preserved, data may be written into the main file | ||
143 | system after its metadata has been committed to the journal. | ||
144 | |||
145 | commit=nrsec (*) | ||
146 | Ext4 can be told to sync all its data and metadata every 'nrsec' | ||
147 | seconds. The default value is 5 seconds. This means that if you lose | ||
148 | your power, you will lose as much as the latest 5 seconds of work (your | ||
149 | filesystem will not be damaged though, thanks to the journaling). This | ||
150 | default value (or any low value) will hurt performance, but it's good | ||
151 | for data-safety. Setting it to 0 will have the same effect as leaving | ||
152 | it at the default (5 seconds). Setting it to very large values will | ||
153 | improve performance. | ||
154 | |||
155 | barrier=<0|1(*)>, barrier(*), nobarrier | ||
156 | This enables/disables the use of write barriers in the jbd code. | ||
157 | barrier=0 disables, barrier=1 enables. This also requires an IO stack | ||
158 | which can support barriers, and if jbd gets an error on a barrier | ||
159 | write, it will disable again with a warning. Write barriers enforce | ||
160 | proper on-disk ordering of journal commits, making volatile disk write | ||
161 | caches safe to use, at some performance penalty. If your disks are | ||
162 | battery-backed in one way or another, disabling barriers may safely | ||
163 | improve performance. The mount options "barrier" and "nobarrier" can | ||
164 | also be used to enable or disable barriers, for consistency with other | ||
165 | ext4 mount options. | ||
166 | |||
167 | inode_readahead_blks=n | ||
168 | This tuning parameter controls the maximum number of inode table blocks | ||
169 | that ext4's inode table readahead algorithm will pre-read into the | ||
170 | buffer cache. The default value is 32 blocks. | ||
171 | |||
172 | nouser_xattr | ||
173 | Disables Extended User Attributes. See the attr(5) manual page for | ||
174 | more information about extended attributes. | ||
175 | |||
176 | noacl | ||
177 | This option disables POSIX Access Control List support. If ACL support | ||
178 | is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL | ||
179 | is enabled by default on mount. See the acl(5) manual page for more | ||
180 | information about acl. | ||
181 | |||
182 | bsddf (*) | ||
183 | Make 'df' act like BSD. | ||
184 | |||
185 | minixdf | ||
186 | Make 'df' act like Minix. | ||
187 | |||
188 | debug | ||
189 | Extra debugging information is sent to syslog. | ||
190 | |||
191 | abort | ||
192 | Simulate the effects of calling ext4_abort() for debugging purposes. | ||
193 | This is normally used while remounting a filesystem which is already | ||
194 | mounted. | ||
195 | |||
196 | errors=remount-ro | ||
197 | Remount the filesystem read-only on an error. | ||
198 | |||
199 | errors=continue | ||
200 | Keep going on a filesystem error. | ||
201 | |||
202 | errors=panic | ||
203 | Panic and halt the machine if an error occurs. (These mount options | ||
204 | override the errors behavior specified in the superblock, which can be | ||
205 | configured using tune2fs) | ||
206 | |||
207 | data_err=ignore(*) | ||
208 | Just print an error message if an error occurs in a file data buffer in | ||
209 | ordered mode. | ||
210 | data_err=abort | ||
211 | Abort the journal if an error occurs in a file data buffer in ordered | ||
212 | mode. | ||
213 | |||
214 | grpid | bsdgroups | ||
215 | New objects have the group ID of their parent. | ||
216 | |||
217 | nogrpid (*) | sysvgroups | ||
218 | New objects have the group ID of their creator. | ||
219 | |||
220 | resgid=n | ||
221 | The group ID which may use the reserved blocks. | ||
222 | |||
223 | resuid=n | ||
224 | The user ID which may use the reserved blocks. | ||
225 | |||
226 | sb= | ||
227 | Use alternate superblock at this location. | ||
228 | |||
229 | quota, noquota, grpquota, usrquota | ||
230 | These options are ignored by the filesystem. They are used only by | ||
231 | quota tools to recognize volumes where quota should be turned on. See | ||
232 | documentation in the quota-tools package for more details | ||
233 | (http://sourceforge.net/projects/linuxquota). | ||
234 | |||
235 | jqfmt=<quota type>, usrjquota=<file>, grpjquota=<file> | ||
236 | These options tell filesystem details about quota so that quota | ||
237 | information can be properly updated during journal replay. They replace | ||
238 | the above quota options. See documentation in the quota-tools package | ||
239 | for more details (http://sourceforge.net/projects/linuxquota). | ||
240 | |||
241 | stripe=n | ||
242 | Number of filesystem blocks that mballoc will try to use for allocation | ||
243 | size and alignment. For RAID5/6 systems this should be the number of | ||
244 | data disks * RAID chunk size in file system blocks. | ||
245 | |||
246 | delalloc (*) | ||
247 | Defer block allocation until just before ext4 writes out the block(s) | ||
248 | in question. This allows ext4 to make better allocation decisions more | ||
249 | efficiently. | ||
250 | |||
251 | nodelalloc | ||
252 | Disable delayed allocation. Blocks are allocated when the data is | ||
253 | copied from userspace to the page cache, either via the write(2) system | ||
254 | call or when an mmap'ed page which was previously unallocated is | ||
255 | written for the first time. | ||
256 | |||
257 | max_batch_time=usec | ||
258 | Maximum amount of time ext4 should wait for additional filesystem | ||
259 | operations to be batched together with a synchronous write operation. | ||
260 | Since a synchronous write operation is going to force a commit and then | ||
261 | a wait for the I/O to complete, it doesn't cost much, and can be a huge | ||
262 | throughput win, we wait for a small amount of time to see if any other | ||
263 | transactions can piggyback on the synchronous write. The algorithm | ||
264 | used is designed to automatically tune for the speed of the disk, by | ||
265 | measuring the amount of time (on average) that it takes to finish | ||
266 | committing a transaction. Call this time the "commit time". If the | ||
267 | time that the transaction has been running is less than the commit | ||
268 | time, ext4 will try sleeping for the commit time to see if other | ||
269 | operations will join the transaction. The commit time is capped by | ||
270 | the max_batch_time, which defaults to 15000us (15ms). This | ||
271 | optimization can be turned off entirely by setting max_batch_time to 0. | ||
272 | |||
273 | min_batch_time=usec | ||
274 | This parameter sets the commit time (as described above) to be at least | ||
275 | min_batch_time. It defaults to zero microseconds. Increasing this | ||
276 | parameter may improve the throughput of multi-threaded, synchronous | ||
277 | workloads on very fast disks, at the cost of increasing latency. | ||
278 | |||
279 | journal_ioprio=prio | ||
280 | The I/O priority (from 0 to 7, where 0 is the highest priority) which | ||
281 | should be used for I/O operations submitted by kjournald2 during a | ||
282 | commit operation. This defaults to 3, which is a slightly higher | ||
283 | priority than the default I/O priority. | ||
284 | |||
285 | auto_da_alloc(*), noauto_da_alloc | ||
286 | Many broken applications don't use fsync() when replacing existing | ||
287 | files via patterns such as fd = open("foo.new")/write(fd,..)/close(fd)/ | ||
288 | rename("foo.new", "foo"), or worse yet, fd = open("foo", | ||
289 | O_TRUNC)/write(fd,..)/close(fd). If auto_da_alloc is enabled, ext4 | ||
290 | will detect the replace-via-rename and replace-via-truncate patterns | ||
291 | and force that any delayed allocation blocks are allocated such that at | ||
292 | the next journal commit, in the default data=ordered mode, the data | ||
293 | blocks of the new file are forced to disk before the rename() operation | ||
294 | is committed. This provides roughly the same level of guarantees as | ||
295 | ext3, and avoids the "zero-length" problem that can happen when a | ||
296 | system crashes before the delayed allocation blocks are forced to disk. | ||
297 | |||
298 | noinit_itable | ||
299 | Do not initialize any uninitialized inode table blocks in the | ||
300 | background. This feature may be used by installation CD's so that the | ||
301 | install process can complete as quickly as possible; the inode table | ||
302 | initialization process would then be deferred until the next time the | ||
303 | file system is unmounted. | ||
304 | |||
305 | init_itable=n | ||
306 | The lazy itable init code will wait n times the number of milliseconds | ||
307 | it took to zero out the previous block group's inode table. This | ||
308 | minimizes the impact on the system performance while file system's | ||
309 | inode table is being initialized. | ||
310 | |||
311 | discard, nodiscard(*) | ||
312 | Controls whether ext4 should issue discard/TRIM commands to the | ||
313 | underlying block device when blocks are freed. This is useful for SSD | ||
314 | devices and sparse/thinly-provisioned LUNs, but it is off by default | ||
315 | until sufficient testing has been done. | ||
316 | |||
317 | nouid32 | ||
318 | Disables 32-bit UIDs and GIDs. This is for interoperability with | ||
319 | older kernels which only store and expect 16-bit values. | ||
320 | |||
321 | block_validity(*), noblock_validity | ||
322 | These options enable or disable the in-kernel facility for tracking | ||
323 | filesystem metadata blocks within internal data structures. This | ||
324 | allows the multi-block allocator and other routines to notice bugs or | ||
325 | corrupted allocation bitmaps which cause blocks to be allocated which | ||
326 | overlap with filesystem metadata blocks. | ||
327 | |||
328 | dioread_lock, dioread_nolock | ||
329 | Controls whether or not ext4 should use the DIO read locking. If the | ||
330 | dioread_nolock option is specified ext4 will allocate uninitialized | ||
331 | extent before buffer write and convert the extent to initialized after | ||
332 | IO completes. This approach allows ext4 code to avoid using inode | ||
333 | mutex, which improves scalability on high speed storages. However this | ||
334 | does not work with data journaling and dioread_nolock option will be | ||
335 | ignored with kernel warning. Note that dioread_nolock code path is only | ||
336 | used for extent-based files. Because of the restrictions this option | ||
337 | comprises, it is off by default (i.e. dioread_lock). | ||
338 | |||
339 | max_dir_size_kb=n | ||
340 | This limits the size of directories so that any attempt to expand them | ||
341 | beyond the specified limit in kilobytes will cause an ENOSPC error. | ||
342 | This is useful in memory constrained environments, where a very large | ||
343 | directory can cause severe performance problems or even provoke the Out | ||
344 | Of Memory killer. (For example, if there is only 512mb memory | ||
345 | available, a 176mb directory may seriously cramp the system's style.) | ||
346 | |||
347 | i_version | ||
348 | Enable 64-bit inode version support. This option is off by default. | ||
349 | |||
350 | dax | ||
351 | Use direct access (no page cache). See | ||
352 | Documentation/filesystems/dax.txt. Note that this option is | ||
353 | incompatible with data=journal. | ||
354 | |||
355 | Data Mode | ||
356 | ========= | ||
357 | There are 3 different data modes: | ||
358 | |||
359 | * writeback mode | ||
360 | |||
361 | In data=writeback mode, ext4 does not journal data at all. This mode provides | ||
362 | a similar level of journaling as that of XFS, JFS, and ReiserFS in its default | ||
363 | mode - metadata journaling. A crash+recovery can cause incorrect data to | ||
364 | appear in files which were written shortly before the crash. This mode will | ||
365 | typically provide the best ext4 performance. | ||
366 | |||
367 | * ordered mode | ||
368 | |||
369 | In data=ordered mode, ext4 only officially journals metadata, but it logically | ||
370 | groups metadata information related to data changes with the data blocks into | ||
371 | a single unit called a transaction. When it's time to write the new metadata | ||
372 | out to disk, the associated data blocks are written first. In general, this | ||
373 | mode performs slightly slower than writeback but significantly faster than | ||
374 | journal mode. | ||
375 | |||
376 | * journal mode | ||
377 | |||
378 | data=journal mode provides full data and metadata journaling. All new data is | ||
379 | written to the journal first, and then to its final location. In the event of | ||
380 | a crash, the journal can be replayed, bringing both data and metadata into a | ||
381 | consistent state. This mode is the slowest except when data needs to be read | ||
382 | from and written to disk at the same time where it outperforms all other | ||
383 | modes. Enabling this mode will disable delayed allocation and O_DIRECT | ||
384 | support. | ||
385 | |||
386 | /proc entries | ||
387 | ============= | ||
388 | |||
389 | Information about mounted ext4 file systems can be found in | ||
390 | /proc/fs/ext4. Each mounted filesystem will have a directory in | ||
391 | /proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or | ||
392 | /proc/fs/ext4/dm-0). The files in each per-device directory are shown | ||
393 | in the table below. | ||
394 | |||
395 | Files in /proc/fs/ext4/<devname> | ||
396 | |||
397 | mb_groups | ||
398 | details of multiblock allocator buddy cache of free blocks | ||
399 | |||
400 | /sys entries | ||
401 | ============ | ||
402 | |||
403 | Information about mounted ext4 file systems can be found in | ||
404 | /sys/fs/ext4. Each mounted filesystem will have a directory in | ||
405 | /sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or | ||
406 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown | ||
407 | in the table below. | ||
408 | |||
409 | Files in /sys/fs/ext4/<devname>: | ||
410 | |||
411 | (see also Documentation/ABI/testing/sysfs-fs-ext4) | ||
412 | |||
413 | delayed_allocation_blocks | ||
414 | This file is read-only and shows the number of blocks that are dirty in | ||
415 | the page cache, but which do not have their location in the filesystem | ||
416 | allocated yet. | ||
417 | |||
418 | inode_goal | ||
419 | Tuning parameter which (if non-zero) controls the goal inode used by | ||
420 | the inode allocator in preference to all other allocation heuristics. | ||
421 | This is intended for debugging use only, and should be 0 on production | ||
422 | systems. | ||
423 | |||
424 | inode_readahead_blks | ||
425 | Tuning parameter which controls the maximum number of inode table | ||
426 | blocks that ext4's inode table readahead algorithm will pre-read into | ||
427 | the buffer cache. | ||
428 | |||
429 | lifetime_write_kbytes | ||
430 | This file is read-only and shows the number of kilobytes of data that | ||
431 | have been written to this filesystem since it was created. | ||
432 | |||
433 | max_writeback_mb_bump | ||
434 | The maximum number of megabytes the writeback code will try to write | ||
435 | out before moving on to another inode. | ||
436 | |||
437 | mb_group_prealloc | ||
438 | The multiblock allocator will round up allocation requests to a | ||
439 | multiple of this tuning parameter if the stripe size is not set in the | ||
440 | ext4 superblock | ||
441 | |||
442 | mb_max_to_scan | ||
443 | The maximum number of extents the multiblock allocator will search to | ||
444 | find the best extent. | ||
445 | |||
446 | mb_min_to_scan | ||
447 | The minimum number of extents the multiblock allocator will search to | ||
448 | find the best extent. | ||
449 | |||
450 | mb_order2_req | ||
451 | Tuning parameter which controls the minimum size for requests (as a | ||
452 | power of 2) where the buddy cache is used. | ||
453 | |||
454 | mb_stats | ||
455 | Controls whether the multiblock allocator should collect statistics, | ||
456 | which are shown during the unmount. 1 means to collect statistics, 0 | ||
457 | means not to collect statistics. | ||
458 | |||
459 | mb_stream_req | ||
460 | Files which have fewer blocks than this tunable parameter will have | ||
461 | their blocks allocated out of a block group specific preallocation | ||
462 | pool, so that small files are packed closely together. Each large file | ||
463 | will have its blocks allocated out of its own unique preallocation | ||
464 | pool. | ||
465 | |||
466 | session_write_kbytes | ||
467 | This file is read-only and shows the number of kilobytes of data that | ||
468 | have been written to this filesystem since it was mounted. | ||
469 | |||
470 | reserved_clusters | ||
471 | This is a RW file and contains the number of reserved clusters in the file | ||
472 | system which will be used in specific situations to avoid costly | ||
473 | zeroout, unexpected ENOSPC, or possible data loss. The default is 2% or | ||
474 | 4096 clusters, whichever is smaller. This can be changed; however, it | ||
475 | can never exceed the number of clusters in the file system. If there is not | ||
476 | enough space for the reserved clusters when mounting the file system, the | ||
477 | mount will _not_ fail. | ||
478 | |||
479 | Ioctls | ||
480 | ====== | ||
481 | |||
482 | There is some Ext4 specific functionality which can be accessed by applications | ||
483 | through the system call interfaces. The list of all Ext4 specific ioctls is | ||
484 | shown in the table below. | ||
485 | |||
486 | Table of Ext4 specific ioctls | ||
487 | |||
488 | EXT4_IOC_GETFLAGS | ||
489 | Get additional attributes associated with inode. The ioctl argument is | ||
490 | an integer bitfield, with bit values described in ext4.h. This ioctl is | ||
491 | an alias for FS_IOC_GETFLAGS. | ||
492 | |||
493 | EXT4_IOC_SETFLAGS | ||
494 | Set additional attributes associated with inode. The ioctl argument is | ||
495 | an integer bitfield, with bit values described in ext4.h. This ioctl is | ||
496 | an alias for FS_IOC_SETFLAGS. | ||
497 | |||
498 | EXT4_IOC_GETVERSION, EXT4_IOC_GETVERSION_OLD | ||
499 | Get the inode i_generation number stored for each inode. The | ||
500 | i_generation number is normally changed only when a new inode is created | ||
501 | and it is particularly useful for network filesystems. The '_OLD' | ||
502 | version of this ioctl is an alias for FS_IOC_GETVERSION. | ||
503 | |||
504 | EXT4_IOC_SETVERSION, EXT4_IOC_SETVERSION_OLD | ||
505 | Set the inode i_generation number stored for each inode. The '_OLD' | ||
506 | version of this ioctl is an alias for FS_IOC_SETVERSION. | ||
507 | |||
508 | EXT4_IOC_GROUP_EXTEND | ||
509 | This ioctl has the same purpose as the resize mount option. It allows | ||
510 | resizing the filesystem to the end of the last existing block group; | ||
511 | further resizing has to be done with resize2fs, either online or | ||
512 | offline. The argument points to the unsigned long number representing | ||
513 | the filesystem's new block count. | ||
514 | |||
515 | EXT4_IOC_MOVE_EXT | ||
516 | Move the block extents from orig_fd (the one this ioctl is pointing to) | ||
517 | to the donor_fd (the one specified in move_extent structure passed as | ||
518 | an argument to this ioctl). Then, exchange inode metadata between | ||
519 | orig_fd and donor_fd. This is especially useful for online | ||
520 | defragmentation, because the allocator has the opportunity to allocate | ||
521 | moved blocks better, ideally into one contiguous extent. | ||
522 | |||
523 | EXT4_IOC_GROUP_ADD | ||
524 | Add a new group descriptor to an existing or new group descriptor | ||
525 | block. The new group descriptor is described by ext4_new_group_input | ||
526 | structure, which is passed as an argument to this ioctl. This is | ||
527 | especially useful in conjunction with EXT4_IOC_GROUP_EXTEND, which | ||
528 | allows online resize of the filesystem to the end of the last existing | ||
529 | block group. Those two ioctls combined are used in the userspace online | ||
530 | resize tool (e.g. resize2fs). | ||
531 | |||
532 | EXT4_IOC_MIGRATE | ||
533 | This ioctl operates on the filesystem itself. It converts (migrates) | ||
534 | ext3 indirect block mapped inode to ext4 extent mapped inode by walking | ||
535 | through indirect block mapping of the original inode and converting | ||
536 | contiguous block ranges into ext4 extents of the temporary inode. Then, | ||
537 | inodes are swapped. This ioctl might help when migrating from ext3 to | ||
538 | ext4 filesystem; however, the suggestion is to create a fresh ext4 filesystem | ||
539 | and copy data from the backup. Note that the filesystem has to support | ||
540 | extents for this ioctl to work. | ||
541 | |||
542 | EXT4_IOC_ALLOC_DA_BLKS | ||
543 | Force all of the delay allocated blocks to be allocated to preserve | ||
544 | application-expected ext3 behaviour. Note that this will also start | ||
545 | triggering a write of the data blocks, but this behaviour may change in | ||
546 | the future as it is not necessary and has been done this way only for | ||
547 | sake of simplicity. | ||
548 | |||
549 | EXT4_IOC_RESIZE_FS | ||
550 | Resize the filesystem to a new size. The number of blocks of resized | ||
551 | filesystem is passed in via 64 bit integer argument. The kernel | ||
552 | allocates bitmaps and inode table, the userspace tool thus just passes | ||
553 | the new number of blocks. | ||
554 | |||
555 | EXT4_IOC_SWAP_BOOT | ||
556 | Swap i_data and associated attributes (like i_blocks, i_size, | ||
557 | i_flags, ...) from the specified inode with inode EXT4_BOOT_LOADER_INO | ||
558 | (#5). This is typically used to store a boot loader in a secure part of | ||
559 | the filesystem, where it can't be changed by a normal user by accident. | ||
560 | The data blocks of the previous boot loader will be associated with the | ||
561 | given inode. | ||
562 | |||
563 | References | ||
564 | ========== | ||
565 | |||
566 | kernel source: <file:fs/ext4/> | ||
567 | <file:fs/jbd2/> | ||
568 | |||
569 | programs: http://e2fsprogs.sourceforge.net/ | ||
570 | |||
571 | useful links: http://fedoraproject.org/wiki/ext3-devel | ||
572 | http://www.bullopensource.org/ext4/ | ||
573 | http://ext4.wiki.kernel.org/index.php/Main_Page | ||
574 | http://fedoraproject.org/wiki/Features/Ext4 | ||
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 0873685bab0f..965745d5fb9a 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst | |||
@@ -71,6 +71,7 @@ configure specific aspects of kernel behavior to your liking. | |||
71 | java | 71 | java |
72 | ras | 72 | ras |
73 | bcache | 73 | bcache |
74 | ext4 | ||
74 | pm/index | 75 | pm/index |
75 | thunderbolt | 76 | thunderbolt |
76 | LSM/index | 77 | LSM/index |
diff --git a/Documentation/conf.py b/Documentation/conf.py index b691af4831fa..ede67ccafc29 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py | |||
@@ -383,6 +383,10 @@ latex_documents = [ | |||
383 | 'The kernel development community', 'manual'), | 383 | 'The kernel development community', 'manual'), |
384 | ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', | 384 | ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API', |
385 | 'The kernel development community', 'manual'), | 385 | 'The kernel development community', 'manual'), |
386 | ('admin-guide/ext4', 'ext4-admin-guide.tex', 'ext4 Administration Guide', | ||
387 | 'ext4 Community', 'manual'), | ||
388 | ('filesystems/ext4/index', 'ext4-data-structures.tex', | ||
389 | 'ext4 Data Structures and Algorithms', 'ext4 Community', 'manual'), | ||
386 | ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', | 390 | ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide', |
387 | 'The kernel development community', 'manual'), | 391 | 'The kernel development community', 'manual'), |
388 | ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', | 392 | ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', |
diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/about.rst index 0aadba052264..0aadba052264 100644 --- a/Documentation/filesystems/ext4/ondisk/about.rst +++ b/Documentation/filesystems/ext4/about.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/allocators.rst index 7aa85152ace3..7aa85152ace3 100644 --- a/Documentation/filesystems/ext4/ondisk/allocators.rst +++ b/Documentation/filesystems/ext4/allocators.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/attributes.rst index 0b01b67b81fe..54386a010a8d 100644 --- a/Documentation/filesystems/ext4/ondisk/attributes.rst +++ b/Documentation/filesystems/ext4/attributes.rst | |||
@@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header | |||
30 | ``ext4_xattr_ibody_header`` that is 4 bytes long: | 30 | ``ext4_xattr_ibody_header`` that is 4 bytes long: |
31 | 31 | ||
32 | .. list-table:: | 32 | .. list-table:: |
33 | :widths: 1 1 1 77 | 33 | :widths: 8 8 24 40 |
34 | :header-rows: 1 | 34 | :header-rows: 1 |
35 | 35 | ||
36 | * - Offset | 36 | * - Offset |
@@ -47,7 +47,7 @@ The beginning of an extended attribute block is in | |||
47 | ``struct ext4_xattr_header``, which is 32 bytes long: | 47 | ``struct ext4_xattr_header``, which is 32 bytes long: |
48 | 48 | ||
49 | .. list-table:: | 49 | .. list-table:: |
50 | :widths: 1 1 1 77 | 50 | :widths: 8 8 24 40 |
51 | :header-rows: 1 | 51 | :header-rows: 1 |
52 | 52 | ||
53 | * - Offset | 53 | * - Offset |
@@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is | |||
92 | Attributes stored inside an inode do not need to be stored in sorted order. | 92 | Attributes stored inside an inode do not need to be stored in sorted order. |
93 | 93 | ||
94 | .. list-table:: | 94 | .. list-table:: |
95 | :widths: 1 1 1 77 | 95 | :widths: 8 8 24 40 |
96 | :header-rows: 1 | 96 | :header-rows: 1 |
97 | 97 | ||
98 | * - Offset | 98 | * - Offset |
@@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from | |||
157 | the key name. Here is a map of name index values to key prefixes: | 157 | the key name. Here is a map of name index values to key prefixes: |
158 | 158 | ||
159 | .. list-table:: | 159 | .. list-table:: |
160 | :widths: 1 79 | 160 | :widths: 16 64 |
161 | :header-rows: 1 | 161 | :header-rows: 1 |
162 | 162 | ||
163 | * - Name Index | 163 | * - Name Index |
diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst index c6d88557553c..c6d88557553c 100644 --- a/Documentation/filesystems/ext4/ondisk/bigalloc.rst +++ b/Documentation/filesystems/ext4/bigalloc.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst index c7546dbc197a..c7546dbc197a 100644 --- a/Documentation/filesystems/ext4/ondisk/bitmaps.rst +++ b/Documentation/filesystems/ext4/bitmaps.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst index baf888e4c06a..baf888e4c06a 100644 --- a/Documentation/filesystems/ext4/ondisk/blockgroup.rst +++ b/Documentation/filesystems/ext4/blockgroup.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst index 30e25750d88a..30e25750d88a 100644 --- a/Documentation/filesystems/ext4/ondisk/blockmap.rst +++ b/Documentation/filesystems/ext4/blockmap.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/blocks.rst index 73d4dc0f7bda..73d4dc0f7bda 100644 --- a/Documentation/filesystems/ext4/ondisk/blocks.rst +++ b/Documentation/filesystems/ext4/blocks.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/checksums.rst index 9d6a793b2e03..5519e253810d 100644 --- a/Documentation/filesystems/ext4/ondisk/checksums.rst +++ b/Documentation/filesystems/ext4/checksums.rst | |||
@@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes | |||
28 | (crc32c as of October 2013) unless noted otherwise. | 28 | (crc32c as of October 2013) unless noted otherwise. |
29 | 29 | ||
30 | .. list-table:: | 30 | .. list-table:: |
31 | :widths: 1 1 4 | 31 | :widths: 20 8 50 |
32 | :header-rows: 1 | 32 | :header-rows: 1 |
33 | 33 | ||
34 | * - Metadata | 34 | * - Metadata |
diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/directory.rst index 8fcba68c2884..614034e24669 100644 --- a/Documentation/filesystems/ext4/ondisk/directory.rst +++ b/Documentation/filesystems/ext4/directory.rst | |||
@@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference | |||
34 | ``dirent.rec_len`` to know for sure. | 34 | ``dirent.rec_len`` to know for sure. |
35 | 35 | ||
36 | .. list-table:: | 36 | .. list-table:: |
37 | :widths: 1 1 1 77 | 37 | :widths: 8 8 24 40 |
38 | :header-rows: 1 | 38 | :header-rows: 1 |
39 | 39 | ||
40 | * - Offset | 40 | * - Offset |
@@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most | |||
66 | ``dirent.rec_len`` to know for sure. | 66 | ``dirent.rec_len`` to know for sure. |
67 | 67 | ||
68 | .. list-table:: | 68 | .. list-table:: |
69 | :widths: 1 1 1 77 | 69 | :widths: 8 8 24 40 |
70 | :header-rows: 1 | 70 | :header-rows: 1 |
71 | 71 | ||
72 | * - Offset | 72 | * - Offset |
@@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most | |||
99 | The directory file type is one of the following values: | 99 | The directory file type is one of the following values: |
100 | 100 | ||
101 | .. list-table:: | 101 | .. list-table:: |
102 | :widths: 1 79 | 102 | :widths: 16 64 |
103 | :header-rows: 1 | 103 | :header-rows: 1 |
104 | 104 | ||
105 | * - Value | 105 | * - Value |
@@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is | |||
130 | ``struct ext4_dir_entry_tail``: | 130 | ``struct ext4_dir_entry_tail``: |
131 | 131 | ||
132 | .. list-table:: | 132 | .. list-table:: |
133 | :widths: 1 1 1 77 | 133 | :widths: 8 8 24 40 |
134 | :header-rows: 1 | 134 | :header-rows: 1 |
135 | 135 | ||
136 | * - Offset | 136 | * - Offset |
@@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length | |||
212 | of a data block: | 212 | of a data block: |
213 | 213 | ||
214 | .. list-table:: | 214 | .. list-table:: |
215 | :widths: 1 1 1 77 | 215 | :widths: 8 8 24 40 |
216 | :header-rows: 1 | 216 | :header-rows: 1 |
217 | 217 | ||
218 | * - Offset | 218 | * - Offset |
@@ -305,7 +305,7 @@ of a data block: | |||
305 | The directory hash is one of the following values: | 305 | The directory hash is one of the following values: |
306 | 306 | ||
307 | .. list-table:: | 307 | .. list-table:: |
308 | :widths: 1 79 | 308 | :widths: 16 64 |
309 | :header-rows: 1 | 309 | :header-rows: 1 |
310 | 310 | ||
311 | * - Value | 311 | * - Value |
@@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is | |||
327 | also the full length of a data block: | 327 | also the full length of a data block: |
328 | 328 | ||
329 | .. list-table:: | 329 | .. list-table:: |
330 | :widths: 1 1 1 77 | 330 | :widths: 8 8 24 40 |
331 | :header-rows: 1 | 331 | :header-rows: 1 |
332 | 332 | ||
333 | * - Offset | 333 | * - Offset |
@@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and | |||
375 | long: | 375 | long: |
376 | 376 | ||
377 | .. list-table:: | 377 | .. list-table:: |
378 | :widths: 1 1 1 77 | 378 | :widths: 8 8 24 40 |
379 | :header-rows: 1 | 379 | :header-rows: 1 |
380 | 380 | ||
381 | * - Offset | 381 | * - Offset |
@@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum. | |||
405 | The dx\_tail structure is 8 bytes long and looks like this: | 405 | The dx\_tail structure is 8 bytes long and looks like this: |
406 | 406 | ||
407 | .. list-table:: | 407 | .. list-table:: |
408 | :widths: 1 1 1 77 | 408 | :widths: 8 8 24 40 |
409 | :header-rows: 1 | 409 | :header-rows: 1 |
410 | 410 | ||
411 | * - Offset | 411 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/dynamic.rst index bb0c84333341..bb0c84333341 100644 --- a/Documentation/filesystems/ext4/ondisk/dynamic.rst +++ b/Documentation/filesystems/ext4/dynamic.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/eainode.rst index ecc0d01a0a72..ecc0d01a0a72 100644 --- a/Documentation/filesystems/ext4/ondisk/eainode.rst +++ b/Documentation/filesystems/ext4/eainode.rst | |||
diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst deleted file mode 100644 index 9d4368d591fa..000000000000 --- a/Documentation/filesystems/ext4/ext4.rst +++ /dev/null | |||
@@ -1,613 +0,0 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | ======================== | ||
4 | General Information | ||
5 | ======================== | ||
6 | |||
7 | Ext4 is an advanced level of the ext3 filesystem which incorporates | ||
8 | scalability and reliability enhancements for supporting large filesystems | ||
9 | (64 bit) in keeping with increasing disk capacities and state-of-the-art | ||
10 | feature requirements. | ||
11 | |||
12 | Mailing list: linux-ext4@vger.kernel.org | ||
13 | Web site: http://ext4.wiki.kernel.org | ||
14 | |||
15 | |||
16 | Quick usage instructions | ||
17 | ======================== | ||
18 | |||
19 | Note: More extensive information for getting started with ext4 can be | ||
20 | found at the ext4 wiki site at the URL: | ||
21 | http://ext4.wiki.kernel.org/index.php/Ext4_Howto | ||
22 | |||
23 | - The latest version of e2fsprogs can be found at: | ||
24 | |||
25 | https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | ||
26 | |||
27 | or | ||
28 | |||
29 | http://sourceforge.net/project/showfiles.php?group_id=2406 | ||
30 | |||
31 | or grab the latest git repository from: | ||
32 | |||
33 | https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git | ||
34 | |||
35 | - Create a new filesystem using the ext4 filesystem type: | ||
36 | |||
37 | # mke2fs -t ext4 /dev/hda1 | ||
38 | |||
39 | Or to configure an existing ext3 filesystem to support extents: | ||
40 | |||
41 | # tune2fs -O extents /dev/hda1 | ||
42 | |||
43 | If the filesystem was created with 128 byte inodes, it can be | ||
44 | converted to use 256 byte for greater efficiency via: | ||
45 | |||
46 | # tune2fs -I 256 /dev/hda1 | ||
47 | |||
48 | - Mounting: | ||
49 | |||
50 | # mount -t ext4 /dev/hda1 /wherever | ||
51 | |||
52 | - When comparing performance with other filesystems, it's always | ||
53 | important to try multiple workloads; very often a subtle change in a | ||
54 | workload parameter can completely change the ranking of which | ||
55 | filesystems do well compared to others. When comparing versus ext3, | ||
56 | note that ext4 enables write barriers by default, while ext3 does | ||
57 | not enable write barriers by default. So it is useful to | ||
58 | explicitly specify whether barriers are enabled or not via the | ||
59 | '-o barrier=[0|1]' mount option for both ext3 and ext4 filesystems | ||
60 | for a fair comparison. When tuning ext3 for best benchmark numbers, | ||
61 | it is often worthwhile to try changing the data journaling mode; '-o | ||
62 | data=writeback' can be faster for some workloads. (Note however that | ||
63 | running mounted with data=writeback can potentially leave stale data | ||
64 | exposed in recently written files in case of an unclean shutdown, | ||
65 | which could be a security exposure in some situations.) Configuring | ||
66 | the filesystem with a large journal can also be helpful for | ||
67 | metadata-intensive workloads. | ||
68 | |||
69 | Features | ||
70 | ======== | ||
71 | |||
72 | Currently Available | ||
73 | ------------------- | ||
74 | |||
75 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) | ||
76 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | ||
77 | * extent format more robust in face of on-disk corruption due to magics, | ||
78 | internal redundancy in tree | ||
79 | * improved file allocation (multi-block alloc) | ||
80 | * lift 32000 subdirectory limit imposed by i_links_count[1] | ||
81 | * nsec timestamps for mtime, atime, ctime, create time | ||
82 | * inode version field on disk (NFSv4, Lustre) | ||
83 | * reduced e2fsck time via uninit_bg feature | ||
84 | * journal checksumming for robustness, performance | ||
85 | * persistent file preallocation (e.g for streaming media, databases) | ||
86 | * ability to pack bitmaps and inode tables into larger virtual groups via the | ||
87 | flex_bg feature | ||
88 | * large file support | ||
89 | * inode allocation using large virtual block groups via flex_bg | ||
90 | * delayed allocation | ||
91 | * large block (up to pagesize) support | ||
92 | * efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force | ||
93 | the ordering) | ||
94 | |||
95 | [1] Filesystems with a block size of 1k may see a limit imposed by the | ||
96 | directory hash tree having a maximum depth of two. | ||
97 | |||
98 | Options | ||
99 | ======= | ||
100 | |||
101 | When mounting an ext4 filesystem, the following options are accepted: | ||
102 | (*) == default | ||
103 | |||
104 | ======================= ======================================================= | ||
105 | Mount Option Description | ||
106 | ======================= ======================================================= | ||
107 | ro Mount filesystem read only. Note that ext4 will | ||
108 | replay the journal (and thus write to the | ||
109 | partition) even when mounted "read only". The | ||
110 | mount options "ro,noload" can be used to prevent | ||
111 | writes to the filesystem. | ||
112 | |||
113 | journal_checksum Enable checksumming of the journal transactions. | ||
114 | This will allow the recovery code in e2fsck and the | ||
115 | kernel to detect corruption in the journal. It is a | ||
116 | compatible change and will be ignored by older kernels. | ||
117 | |||
118 | journal_async_commit Commit block can be written to disk without waiting | ||
119 | for descriptor blocks. If enabled older kernels cannot | ||
120 | mount the device. This will enable 'journal_checksum' | ||
121 | internally. | ||
122 | |||
123 | journal_path=path | ||
124 | journal_dev=devnum When the external journal device's major/minor numbers | ||
125 | have changed, these options allow the user to specify | ||
126 | the new journal location. The journal device is | ||
127 | identified through either its new major/minor numbers | ||
128 | encoded in devnum, or via a path to the device. | ||
129 | |||
130 | norecovery Don't load the journal on mounting. Note that | ||
131 | noload if the filesystem was not unmounted cleanly, | ||
132 | skipping the journal replay will lead to the | ||
133 | filesystem containing inconsistencies that can | ||
134 | lead to any number of problems. | ||
135 | |||
136 | data=journal All data are committed into the journal prior to being | ||
137 | written into the main file system. Enabling | ||
138 | this mode will disable delayed allocation and | ||
139 | O_DIRECT support. | ||
140 | |||
141 | data=ordered (*) All data are forced directly out to the main file | ||
142 | system prior to its metadata being committed to the | ||
143 | journal. | ||
144 | |||
145 | data=writeback Data ordering is not preserved, data may be written | ||
146 | into the main file system after its metadata has been | ||
147 | committed to the journal. | ||
148 | |||
149 | commit=nrsec (*) Ext4 can be told to sync all its data and metadata | ||
150 | every 'nrsec' seconds. The default value is 5 seconds. | ||
151 | This means that if you lose your power, you will lose | ||
152 | as much as the latest 5 seconds of work (your | ||
153 | filesystem will not be damaged though, thanks to the | ||
154 | journaling). This default value (or any low value) | ||
155 | will hurt performance, but it's good for data-safety. | ||
156 | Setting it to 0 will have the same effect as leaving | ||
157 | it at the default (5 seconds). | ||
158 | Setting it to very large values will improve | ||
159 | performance. | ||
160 | |||
161 | barrier=<0|1(*)> This enables/disables the use of write barriers in | ||
162 | barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. | ||
163 | nobarrier This also requires an IO stack which can support | ||
164 | barriers, and if jbd gets an error on a barrier | ||
165 | write, it will disable again with a warning. | ||
166 | Write barriers enforce proper on-disk ordering | ||
167 | of journal commits, making volatile disk write caches | ||
168 | safe to use, at some performance penalty. If | ||
169 | your disks are battery-backed in one way or another, | ||
170 | disabling barriers may safely improve performance. | ||
171 | The mount options "barrier" and "nobarrier" can | ||
172 | also be used to enable or disable barriers, for | ||
173 | consistency with other ext4 mount options. | ||
174 | |||
175 | inode_readahead_blks=n This tuning parameter controls the maximum | ||
176 | number of inode table blocks that ext4's inode | ||
177 | table readahead algorithm will pre-read into | ||
178 | the buffer cache. The default value is 32 blocks. | ||
179 | |||
180 | nouser_xattr Disables Extended User Attributes. See the | ||
181 | attr(5) manual page for more information about | ||
182 | extended attributes. | ||
183 | |||
184 | noacl This option disables POSIX Access Control List | ||
185 | support. If ACL support is enabled in the kernel | ||
186 | configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is | ||
187 | enabled by default on mount. See the acl(5) manual | ||
188 | page for more information about acl. | ||
189 | |||
190 | bsddf (*) Make 'df' act like BSD. | ||
191 | minixdf Make 'df' act like Minix. | ||
192 | |||
193 | debug Extra debugging information is sent to syslog. | ||
194 | |||
195 | abort Simulate the effects of calling ext4_abort() for | ||
196 | debugging purposes. This is normally used while | ||
197 | remounting a filesystem which is already mounted. | ||
198 | |||
199 | errors=remount-ro Remount the filesystem read-only on an error. | ||
200 | errors=continue Keep going on a filesystem error. | ||
201 | errors=panic Panic and halt the machine if an error occurs. | ||
202 | (These mount options override the errors behavior | ||
203 | specified in the superblock, which can be configured | ||
204 | using tune2fs) | ||
205 | |||
206 | data_err=ignore(*) Just print an error message if an error occurs | ||
207 | in a file data buffer in ordered mode. | ||
208 | data_err=abort Abort the journal if an error occurs in a file | ||
209 | data buffer in ordered mode. | ||
210 | |||
211 | grpid New objects have the group ID of their parent. | ||
212 | bsdgroups | ||
213 | |||
214 | nogrpid (*) New objects have the group ID of their creator. | ||
215 | sysvgroups | ||
216 | |||
217 | resgid=n The group ID which may use the reserved blocks. | ||
218 | |||
219 | resuid=n The user ID which may use the reserved blocks. | ||
220 | |||
221 | sb=n Use alternate superblock at this location. | ||
222 | |||
223 | quota These options are ignored by the filesystem. They | ||
224 | noquota are used only by quota tools to recognize volumes | ||
225 | grpquota where quota should be turned on. See documentation | ||
226 | usrquota in the quota-tools package for more details | ||
227 | (http://sourceforge.net/projects/linuxquota). | ||
228 | |||
229 | jqfmt=<quota type> These options tell filesystem details about quota | ||
230 | usrjquota=<file> so that quota information can be properly updated | ||
231 | grpjquota=<file> during journal replay. They replace the above | ||
232 | quota options. See documentation in the quota-tools | ||
233 | package for more details | ||
234 | (http://sourceforge.net/projects/linuxquota). | ||
235 | |||
236 | stripe=n Number of filesystem blocks that mballoc will try | ||
237 | to use for allocation size and alignment. For RAID5/6 | ||
238 | systems this should be the number of data | ||
239 | disks * RAID chunk size in file system blocks. | ||
240 | |||
241 | delalloc (*) Defer block allocation until just before ext4 | ||
242 | writes out the block(s) in question. This | ||
243 | allows ext4 to make better allocation decisions | ||
244 | more efficiently. | ||
245 | nodelalloc Disable delayed allocation. Blocks are allocated | ||
246 | when the data is copied from userspace to the | ||
247 | page cache, either via the write(2) system call | ||
248 | or when an mmap'ed page which was previously | ||
249 | unallocated is written for the first time. | ||
250 | |||
251 | max_batch_time=usec Maximum amount of time ext4 should wait for | ||
252 | additional filesystem operations to be batched | ||
253 | together with a synchronous write operation. | ||
254 | Since a synchronous write operation is going to | ||
255 | force a commit and then wait for the I/O to | ||
256 | complete, it doesn't cost much, and can be a | ||
257 | huge throughput win, we wait for a small amount | ||
258 | of time to see if any other transactions can | ||
259 | piggyback on the synchronous write. The | ||
260 | algorithm used is designed to automatically tune | ||
261 | for the speed of the disk, by measuring the | ||
262 | amount of time (on average) that it takes to | ||
263 | finish committing a transaction. Call this time | ||
264 | the "commit time". If the time that the | ||
265 | transaction has been running is less than the | ||
266 | commit time, ext4 will try sleeping for the | ||
267 | commit time to see if other operations will join | ||
268 | the transaction. The commit time is capped by | ||
269 | the max_batch_time, which defaults to 15000us | ||
270 | (15ms). This optimization can be turned off | ||
271 | entirely by setting max_batch_time to 0. | ||
272 | |||
273 | min_batch_time=usec This parameter sets the commit time (as | ||
274 | described above) to be at least min_batch_time. | ||
275 | It defaults to zero microseconds. Increasing | ||
276 | this parameter may improve the throughput of | ||
277 | multi-threaded, synchronous workloads on very | ||
278 | fast disks, at the cost of increasing latency. | ||
279 | |||
280 | journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the | ||
281 | highest priority) which should be used for I/O | ||
282 | operations submitted by kjournald2 during a | ||
283 | commit operation. This defaults to 3, which is | ||
284 | a slightly higher priority than the default I/O | ||
285 | priority. | ||
286 | |||
287 | auto_da_alloc(*) Many broken applications don't use fsync() when | ||
288 | noauto_da_alloc replacing existing files via patterns such as | ||
289 | fd = open("foo.new")/write(fd,..)/close(fd)/ | ||
290 | rename("foo.new", "foo"), or worse yet, | ||
291 | fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). | ||
292 | If auto_da_alloc is enabled, ext4 will detect | ||
293 | the replace-via-rename and replace-via-truncate | ||
294 | patterns and force that any delayed allocation | ||
295 | blocks are allocated such that at the next | ||
296 | journal commit, in the default data=ordered | ||
297 | mode, the data blocks of the new file are forced | ||
298 | to disk before the rename() operation is | ||
299 | committed. This provides roughly the same level | ||
300 | of guarantees as ext3, and avoids the | ||
301 | "zero-length" problem that can happen when a | ||
302 | system crashes before the delayed allocation | ||
303 | blocks are forced to disk. | ||
304 | |||
305 | noinit_itable Do not initialize any uninitialized inode table | ||
306 | blocks in the background. This feature may be | ||
307 | used by installation CD's so that the install | ||
308 | process can complete as quickly as possible; the | ||
309 | inode table initialization process would then be | ||
310 | deferred until the next time the file system | ||
311 | is unmounted. | ||
312 | |||
313 | init_itable=n The lazy itable init code will wait n times the | ||
314 | number of milliseconds it took to zero out the | ||
315 | previous block group's inode table. This | ||
316 | minimizes the impact on the system performance | ||
317 | while file system's inode table is being initialized. | ||
318 | |||
319 | discard Controls whether ext4 should issue discard/TRIM | ||
320 | nodiscard(*) commands to the underlying block device when | ||
321 | blocks are freed. This is useful for SSD devices | ||
322 | and sparse/thinly-provisioned LUNs, but it is off | ||
323 | by default until sufficient testing has been done. | ||
324 | |||
325 | nouid32 Disables 32-bit UIDs and GIDs. This is for | ||
326 | interoperability with older kernels which only | ||
327 | store and expect 16-bit values. | ||
328 | |||
329 | block_validity(*) These options enable or disable the in-kernel | ||
330 | noblock_validity facility for tracking filesystem metadata blocks | ||
331 | within internal data structures. This allows multi- | ||
332 | block allocator and other routines to notice | ||
333 | bugs or corrupted allocation bitmaps which cause | ||
334 | blocks to be allocated which overlap with | ||
335 | filesystem metadata blocks. | ||
336 | |||
337 | dioread_lock Controls whether or not ext4 should use the DIO read | ||
338 | dioread_nolock locking. If the dioread_nolock option is specified | ||
339 | ext4 will allocate uninitialized extent before buffer | ||
340 | write and convert the extent to initialized after IO | ||
341 | completes. This approach allows ext4 code to avoid | ||
342 | using inode mutex, which improves scalability on high | ||
343 | speed storages. However this does not work with | ||
344 | data journaling and dioread_nolock option will be | ||
345 | ignored with kernel warning. Note that dioread_nolock | ||
346 | code path is only used for extent-based files. | ||
347 | Because of the restrictions this option comprises | ||
348 | it is off by default (i.e. dioread_lock). | ||
349 | |||
350 | max_dir_size_kb=n This limits the size of directories so that any | ||
351 | attempt to expand them beyond the specified | ||
352 | limit in kilobytes will cause an ENOSPC error. | ||
353 | This is useful in memory constrained | ||
354 | environments, where a very large directory can | ||
355 | cause severe performance problems or even | ||
356 | provoke the Out Of Memory killer. (For example, | ||
357 | if there is only 512mb memory available, a 176mb | ||
358 | directory may seriously cramp the system's style.) | ||
359 | |||
360 | i_version Enable 64-bit inode version support. This option is | ||
361 | off by default. | ||
362 | |||
363 | dax Use direct access (no page cache). See | ||
364 | Documentation/filesystems/dax.txt. Note that | ||
365 | this option is incompatible with data=journal. | ||
366 | ======================= ======================================================= | ||
367 | |||
368 | Data Mode | ||
369 | ========= | ||
370 | There are 3 different data modes: | ||
371 | |||
372 | * writeback mode | ||
373 | |||
374 | In data=writeback mode, ext4 does not journal data at all. This mode provides | ||
375 | a similar level of journaling as that of XFS, JFS, and ReiserFS in its default | ||
376 | mode - metadata journaling. A crash+recovery can cause incorrect data to | ||
377 | appear in files which were written shortly before the crash. This mode will | ||
378 | typically provide the best ext4 performance. | ||
379 | |||
380 | * ordered mode | ||
381 | |||
382 | In data=ordered mode, ext4 only officially journals metadata, but it logically | ||
383 | groups metadata information related to data changes with the data blocks into | ||
384 | a single unit called a transaction. When it's time to write the new metadata | ||
385 | out to disk, the associated data blocks are written first. In general, this | ||
386 | mode performs slightly slower than writeback but significantly faster than | ||
387 | journal mode. | ||
388 | |||
389 | * journal mode | ||
390 | |||
391 | data=journal mode provides full data and metadata journaling. All new data is | ||
392 | written to the journal first, and then to its final location. In the event of | ||
393 | a crash, the journal can be replayed, bringing both data and metadata into a | ||
394 | consistent state. This mode is the slowest except when data needs to be read | ||
395 | from and written to disk at the same time where it outperforms all other | ||
396 | modes. Enabling this mode will disable delayed allocation and O_DIRECT | ||
397 | support. | ||
398 | |||
399 | /proc entries | ||
400 | ============= | ||
401 | |||
402 | Information about mounted ext4 file systems can be found in | ||
403 | /proc/fs/ext4. Each mounted filesystem will have a directory in | ||
404 | /proc/fs/ext4 based on its device name (e.g., /proc/fs/ext4/hdc or | ||
405 | /proc/fs/ext4/dm-0). The files in each per-device directory are shown | ||
406 | in the table below. | ||
407 | |||
408 | Files in /proc/fs/ext4/<devname> | ||
409 | |||
410 | ================ ======= | ||
411 | File Content | ||
412 | ================ ======= | ||
413 | mb_groups details of multiblock allocator buddy cache of free blocks | ||
414 | ================ ======= | ||
415 | |||
416 | /sys entries | ||
417 | ============ | ||
418 | |||
419 | Information about mounted ext4 file systems can be found in | ||
420 | /sys/fs/ext4. Each mounted filesystem will have a directory in | ||
421 | /sys/fs/ext4 based on its device name (e.g., /sys/fs/ext4/hdc or | ||
422 | /sys/fs/ext4/dm-0). The files in each per-device directory are shown | ||
423 | in the table below. | ||
424 | |||
425 | Files in /sys/fs/ext4/<devname>: | ||
426 | |||
427 | (see also Documentation/ABI/testing/sysfs-fs-ext4) | ||
428 | |||
429 | ============================= ================================================= | ||
430 | File Content | ||
431 | ============================= ================================================= | ||
432 | delayed_allocation_blocks This file is read-only and shows the number of | ||
433 | blocks that are dirty in the page cache, but | ||
434 | which do not have their location in the | ||
435 | filesystem allocated yet. | ||
436 | |||
437 | inode_goal Tuning parameter which (if non-zero) controls | ||
438 | the goal inode used by the inode allocator in | ||
439 | preference to all other allocation heuristics. | ||
440 | This is intended for debugging use only, and | ||
441 | should be 0 on production systems. | ||
442 | |||
443 | inode_readahead_blks Tuning parameter which controls the maximum | ||
444 | number of inode table blocks that ext4's inode | ||
445 | table readahead algorithm will pre-read into | ||
446 | the buffer cache | ||
447 | |||
448 | lifetime_write_kbytes This file is read-only and shows the number of | ||
449 | kilobytes of data that have been written to this | ||
450 | filesystem since it was created. | ||
451 | |||
452 | max_writeback_mb_bump The maximum number of megabytes the writeback | ||
453 | code will try to write out before moving on to | ||
454 | another inode. | ||
455 | |||
456 | mb_group_prealloc The multiblock allocator will round up allocation | ||
457 | requests to a multiple of this tuning parameter if | ||
458 | the stripe size is not set in the ext4 superblock | ||
459 | |||
460 | mb_max_to_scan The maximum number of extents the multiblock | ||
461 | allocator will search to find the best extent | ||
462 | |||
463 | mb_min_to_scan The minimum number of extents the multiblock | ||
464 | allocator will search to find the best extent | ||
465 | |||
466 | mb_order2_req Tuning parameter which controls the minimum size | ||
467 | for requests (as a power of 2) where the buddy | ||
468 | cache is used | ||
469 | |||
470 | mb_stats Controls whether the multiblock allocator should | ||
471 | collect statistics, which are shown during the | ||
472 | unmount. 1 means to collect statistics, 0 means | ||
473 | not to collect statistics | ||
474 | |||
475 | mb_stream_req Files which have fewer blocks than this tunable | ||
476 | parameter will have their blocks allocated out | ||
477 | of a block group specific preallocation pool, so | ||
478 | that small files are packed closely together. | ||
479 | Each large file will have its blocks allocated | ||
480 | out of its own unique preallocation pool. | ||
481 | |||
482 | session_write_kbytes This file is read-only and shows the number of | ||
483 | kilobytes of data that have been written to this | ||
484 | filesystem since it was mounted. | ||
485 | |||
486 | reserved_clusters This is an RW file and contains the number of reserved | ||
487 | clusters in the file system which will be used | ||
488 | in the specific situations to avoid costly | ||
489 | zeroout, unexpected ENOSPC, or possible data | ||
490 | loss. The default is 2% or 4096 clusters, | ||
491 | whichever is smaller and this can be changed | ||
492 | however it can never exceed number of clusters | ||
493 | in the file system. If there is not enough space | ||
494 | for the reserved space when mounting the file | ||
495 | system, the mount will _not_ fail. | ||
496 | ============================= ================================================= | ||
497 | |||
498 | Ioctls | ||
499 | ====== | ||
500 | |||
501 | There is some Ext4 specific functionality which can be accessed by applications | ||
502 | through the system call interfaces. The list of all Ext4 specific ioctls is | ||
503 | shown in the table below. | ||
504 | |||
505 | Table of Ext4 specific ioctls | ||
506 | |||
507 | ============================= ================================================= | ||
508 | Ioctl Description | ||
509 | ============================= ================================================= | ||
510 | EXT4_IOC_GETFLAGS Get additional attributes associated with inode. | ||
511 | The ioctl argument is an integer bitfield, with | ||
512 | bit values described in ext4.h. This ioctl is an | ||
513 | alias for FS_IOC_GETFLAGS. | ||
514 | |||
515 | EXT4_IOC_SETFLAGS Set additional attributes associated with inode. | ||
516 | The ioctl argument is an integer bitfield, with | ||
517 | bit values described in ext4.h. This ioctl is an | ||
518 | alias for FS_IOC_SETFLAGS. | ||
519 | |||
520 | EXT4_IOC_GETVERSION | ||
521 | EXT4_IOC_GETVERSION_OLD | ||
522 | Get the inode i_generation number stored for | ||
523 | each inode. The i_generation number is normally | ||
524 | changed only when a new inode is created and it is | ||
525 | particularly useful for network filesystems. The | ||
526 | '_OLD' version of this ioctl is an alias for | ||
527 | FS_IOC_GETVERSION. | ||
528 | |||
529 | EXT4_IOC_SETVERSION | ||
530 | EXT4_IOC_SETVERSION_OLD | ||
531 | Set the inode i_generation number stored for | ||
532 | each inode. The '_OLD' version of this ioctl | ||
533 | is an alias for FS_IOC_SETVERSION. | ||
534 | |||
535 | EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize | ||
536 | mount option. It allows resizing the filesystem | ||
537 | to the end of the last existing block group, | ||
538 | further resize has to be done with resize2fs, | ||
539 | either online, or offline. The argument points | ||
540 | to the unsigned long number representing the | ||
541 | filesystem new block count. | ||
542 | |||
543 | EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one | ||
544 | this ioctl is pointing to) to the donor_fd (the | ||
545 | one specified in move_extent structure passed | ||
546 | as an argument to this ioctl). Then, exchange | ||
547 | inode metadata between orig_fd and donor_fd. | ||
548 | This is especially useful for online | ||
549 | defragmentation, because the allocator has the | ||
550 | opportunity to allocate moved blocks better, | ||
551 | ideally into one contiguous extent. | ||
552 | |||
553 | EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or | ||
554 | new group descriptor block. The new group | ||
555 | descriptor is described by ext4_new_group_input | ||
556 | structure, which is passed as an argument to | ||
557 | this ioctl. This is especially useful in | ||
558 | conjunction with EXT4_IOC_GROUP_EXTEND, | ||
559 | which allows online resize of the filesystem | ||
560 | to the end of the last existing block group. | ||
561 | Those two ioctls combined are used in the userspace | ||
562 | online resize tool (e.g. resize2fs). | ||
563 | |||
564 | EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself. | ||
565 | It converts (migrates) ext3 indirect block mapped | ||
566 | inode to ext4 extent mapped inode by walking | ||
567 | through indirect block mapping of the original | ||
568 | inode and converting contiguous block ranges | ||
569 | into ext4 extents of the temporary inode. Then, | ||
570 | inodes are swapped. This ioctl might help, when | ||
571 | migrating from ext3 to ext4 filesystem, however | ||
572 | suggestion is to create fresh ext4 filesystem | ||
573 | and copy data from the backup. Note, that | ||
574 | filesystem has to support extents for this ioctl | ||
575 | to work. | ||
576 | |||
577 | EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be | ||
578 | allocated to preserve application-expected ext3 | ||
579 | behaviour. Note that this will also start | ||
580 | triggering a write of the data blocks, but this | ||
581 | behaviour may change in the future as it is | ||
582 | not necessary and has been done this way only | ||
583 | for sake of simplicity. | ||
584 | |||
585 | EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number | ||
586 | of blocks of resized filesystem is passed in via | ||
587 | 64 bit integer argument. The kernel allocates | ||
588 | bitmaps and inode table, the userspace tool thus | ||
589 | just passes the new number of blocks. | ||
590 | |||
591 | EXT4_IOC_SWAP_BOOT Swap i_data and associated attributes | ||
592 | (like i_blocks, i_size, i_flags, ...) from | ||
593 | the specified inode with inode | ||
594 | EXT4_BOOT_LOADER_INO (#5). This is typically | ||
595 | used to store a boot loader in a secure part of | ||
596 | the filesystem, where it can't be changed by a | ||
597 | normal user by accident. | ||
598 | The data blocks of the previous boot loader | ||
599 | will be associated with the given inode. | ||
600 | ============================= ================================================= | ||
601 | |||
602 | References | ||
603 | ========== | ||
604 | |||
605 | kernel source: <file:fs/ext4/> | ||
606 | <file:fs/jbd2/> | ||
607 | |||
608 | programs: http://e2fsprogs.sourceforge.net/ | ||
609 | |||
610 | useful links: http://fedoraproject.org/wiki/ext3-devel | ||
611 | http://www.bullopensource.org/ext4/ | ||
612 | http://ext4.wiki.kernel.org/index.php/Main_Page | ||
613 | http://fedoraproject.org/wiki/Features/Ext4 | ||
diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/globals.rst index 368bf7662b96..368bf7662b96 100644 --- a/Documentation/filesystems/ext4/ondisk/globals.rst +++ b/Documentation/filesystems/ext4/globals.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst index 759827e5d2cf..0f783ed88592 100644 --- a/Documentation/filesystems/ext4/ondisk/group_descr.rst +++ b/Documentation/filesystems/ext4/group_descr.rst | |||
@@ -43,7 +43,7 @@ entire bitmap. | |||
43 | The block group descriptor is laid out in ``struct ext4_group_desc``. | 43 | The block group descriptor is laid out in ``struct ext4_group_desc``. |
44 | 44 | ||
45 | .. list-table:: | 45 | .. list-table:: |
46 | :widths: 1 1 1 77 | 46 | :widths: 8 8 24 40 |
47 | :header-rows: 1 | 47 | :header-rows: 1 |
48 | 48 | ||
49 | * - Offset | 49 | * - Offset |
@@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``. | |||
157 | Block group flags can be any combination of the following: | 157 | Block group flags can be any combination of the following: |
158 | 158 | ||
159 | .. list-table:: | 159 | .. list-table:: |
160 | :widths: 1 79 | 160 | :widths: 16 64 |
161 | :header-rows: 1 | 161 | :header-rows: 1 |
162 | 162 | ||
163 | * - Value | 163 | * - Value |
diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ifork.rst index 5dbe3b2b121a..b9816d5a896b 100644 --- a/Documentation/filesystems/ext4/ondisk/ifork.rst +++ b/Documentation/filesystems/ext4/ifork.rst | |||
@@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``, | |||
68 | which is 12 bytes long: | 68 | which is 12 bytes long: |
69 | 69 | ||
70 | .. list-table:: | 70 | .. list-table:: |
71 | :widths: 1 1 1 77 | 71 | :widths: 8 8 24 40 |
72 | :header-rows: 1 | 72 | :header-rows: 1 |
73 | 73 | ||
74 | * - Offset | 74 | * - Offset |
@@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are | |||
104 | recorded as ``struct ext4_extent_idx``, and are 12 bytes long: | 104 | recorded as ``struct ext4_extent_idx``, and are 12 bytes long: |
105 | 105 | ||
106 | .. list-table:: | 106 | .. list-table:: |
107 | :widths: 1 1 1 77 | 107 | :widths: 8 8 24 40 |
108 | :header-rows: 1 | 108 | :header-rows: 1 |
109 | 109 | ||
110 | * - Offset | 110 | * - Offset |
@@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``, | |||
134 | and are also 12 bytes long: | 134 | and are also 12 bytes long: |
135 | 135 | ||
136 | .. list-table:: | 136 | .. list-table:: |
137 | :widths: 1 1 1 77 | 137 | :widths: 8 8 24 40 |
138 | :header-rows: 1 | 138 | :header-rows: 1 |
139 | 139 | ||
140 | * - Offset | 140 | * - Offset |
@@ -174,7 +174,7 @@ including) the checksum itself. | |||
174 | ``struct ext4_extent_tail`` is 4 bytes long: | 174 | ``struct ext4_extent_tail`` is 4 bytes long: |
175 | 175 | ||
176 | .. list-table:: | 176 | .. list-table:: |
177 | :widths: 1 1 1 77 | 177 | :widths: 8 8 24 40 |
178 | :header-rows: 1 | 178 | :header-rows: 1 |
179 | 179 | ||
180 | * - Offset | 180 | * - Offset |
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst index 71121605558c..3be3e54d480d 100644 --- a/Documentation/filesystems/ext4/index.rst +++ b/Documentation/filesystems/ext4/index.rst | |||
@@ -1,17 +1,14 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | 1 | .. SPDX-License-Identifier: GPL-2.0 |
2 | 2 | ||
3 | =============== | 3 | =================================== |
4 | ext4 Filesystem | 4 | ext4 Data Structures and Algorithms |
5 | =============== | 5 | =================================== |
6 | |||
7 | General usage and on-disk artifacts written by ext4. More documentation may | ||
8 | be ported from the wiki as time permits. This should be considered the | ||
9 | canonical source of information as the details here have been reviewed by | ||
10 | the ext4 community. | ||
11 | 6 | ||
12 | .. toctree:: | 7 | .. toctree:: |
13 | :maxdepth: 5 | 8 | :maxdepth: 6 |
14 | :numbered: | 9 | :numbered: |
15 | 10 | ||
16 | ext4 | 11 | about.rst |
17 | ondisk/index | 12 | overview.rst |
13 | globals.rst | ||
14 | dynamic.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst index d1075178ce0b..d1075178ce0b 100644 --- a/Documentation/filesystems/ext4/ondisk/inlinedata.rst +++ b/Documentation/filesystems/ext4/inlinedata.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/inodes.rst index 655ce898f3f5..6bd35e506b6f 100644 --- a/Documentation/filesystems/ext4/ondisk/inodes.rst +++ b/Documentation/filesystems/ext4/inodes.rst | |||
@@ -29,8 +29,9 @@ and the inode structure itself. | |||
29 | The inode table entry is laid out in ``struct ext4_inode``. | 29 | The inode table entry is laid out in ``struct ext4_inode``. |
30 | 30 | ||
31 | .. list-table:: | 31 | .. list-table:: |
32 | :widths: 1 1 1 77 | 32 | :widths: 8 8 24 40 |
33 | :header-rows: 1 | 33 | :header-rows: 1 |
34 | :class: longtable | ||
34 | 35 | ||
35 | * - Offset | 36 | * - Offset |
36 | - Size | 37 | - Size |
@@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``. | |||
176 | The ``i_mode`` value is a combination of the following flags: | 177 | The ``i_mode`` value is a combination of the following flags: |
177 | 178 | ||
178 | .. list-table:: | 179 | .. list-table:: |
179 | :widths: 1 79 | 180 | :widths: 16 64 |
180 | :header-rows: 1 | 181 | :header-rows: 1 |
181 | 182 | ||
182 | * - Value | 183 | * - Value |
@@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags: | |||
227 | The ``i_flags`` field is a combination of these values: | 228 | The ``i_flags`` field is a combination of these values: |
228 | 229 | ||
229 | .. list-table:: | 230 | .. list-table:: |
230 | :widths: 1 79 | 231 | :widths: 16 64 |
231 | :header-rows: 1 | 232 | :header-rows: 1 |
232 | 233 | ||
233 | * - Value | 234 | * - Value |
@@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator: | |||
314 | Linux: | 315 | Linux: |
315 | 316 | ||
316 | .. list-table:: | 317 | .. list-table:: |
317 | :widths: 1 1 1 77 | 318 | :widths: 8 8 24 40 |
318 | :header-rows: 1 | 319 | :header-rows: 1 |
319 | 320 | ||
320 | * - Offset | 321 | * - Offset |
@@ -331,7 +332,7 @@ Linux: | |||
331 | Hurd: | 332 | Hurd: |
332 | 333 | ||
333 | .. list-table:: | 334 | .. list-table:: |
334 | :widths: 1 1 1 77 | 335 | :widths: 8 8 24 40 |
335 | :header-rows: 1 | 336 | :header-rows: 1 |
336 | 337 | ||
337 | * - Offset | 338 | * - Offset |
@@ -346,7 +347,7 @@ Hurd: | |||
346 | Masix: | 347 | Masix: |
347 | 348 | ||
348 | .. list-table:: | 349 | .. list-table:: |
349 | :widths: 1 1 1 77 | 350 | :widths: 8 8 24 40 |
350 | :header-rows: 1 | 351 | :header-rows: 1 |
351 | 352 | ||
352 | * - Offset | 353 | * - Offset |
@@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator: | |||
365 | Linux: | 366 | Linux: |
366 | 367 | ||
367 | .. list-table:: | 368 | .. list-table:: |
368 | :widths: 1 1 1 77 | 369 | :widths: 8 8 24 40 |
369 | :header-rows: 1 | 370 | :header-rows: 1 |
370 | 371 | ||
371 | * - Offset | 372 | * - Offset |
@@ -402,7 +403,7 @@ Linux: | |||
402 | Hurd: | 403 | Hurd: |
403 | 404 | ||
404 | .. list-table:: | 405 | .. list-table:: |
405 | :widths: 1 1 1 77 | 406 | :widths: 8 8 24 40 |
406 | :header-rows: 1 | 407 | :header-rows: 1 |
407 | 408 | ||
408 | * - Offset | 409 | * - Offset |
@@ -433,7 +434,7 @@ Hurd: | |||
433 | Masix: | 434 | Masix: |
434 | 435 | ||
435 | .. list-table:: | 436 | .. list-table:: |
436 | :widths: 1 1 1 77 | 437 | :widths: 8 8 24 40 |
437 | :header-rows: 1 | 438 | :header-rows: 1 |
438 | 439 | ||
439 | * - Offset | 440 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/journal.rst index e7031af86876..ea613ee701f5 100644 --- a/Documentation/filesystems/ext4/ondisk/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst | |||
@@ -48,7 +48,7 @@ Layout | |||
48 | Generally speaking, the journal has this format: | 48 | Generally speaking, the journal has this format: |
49 | 49 | ||
50 | .. list-table:: | 50 | .. list-table:: |
51 | :widths: 1 1 78 | 51 | :widths: 16 48 16 |
52 | :header-rows: 1 | 52 | :header-rows: 1 |
53 | 53 | ||
54 | * - Superblock | 54 | * - Superblock |
@@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the | |||
76 | superblock. | 76 | superblock. |
77 | 77 | ||
78 | .. list-table:: | 78 | .. list-table:: |
79 | :widths: 1 1 1 1 76 | 79 | :widths: 12 12 12 32 12 |
80 | :header-rows: 1 | 80 | :header-rows: 1 |
81 | 81 | ||
82 | * - 1024 bytes of padding | 82 | * - 1024 bytes of padding |
@@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header | |||
98 | ``struct journal_header_s``: | 98 | ``struct journal_header_s``: |
99 | 99 | ||
100 | .. list-table:: | 100 | .. list-table:: |
101 | :widths: 1 1 1 77 | 101 | :widths: 8 8 24 40 |
102 | :header-rows: 1 | 102 | :header-rows: 1 |
103 | 103 | ||
104 | * - Offset | 104 | * - Offset |
@@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header | |||
124 | The journal block type can be any one of: | 124 | The journal block type can be any one of: |
125 | 125 | ||
126 | .. list-table:: | 126 | .. list-table:: |
127 | :widths: 1 79 | 127 | :widths: 16 64 |
128 | :header-rows: 1 | 128 | :header-rows: 1 |
129 | 129 | ||
130 | * - Value | 130 | * - Value |
@@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``, | |||
154 | which is 1024 bytes long: | 154 | which is 1024 bytes long: |
155 | 155 | ||
156 | .. list-table:: | 156 | .. list-table:: |
157 | :widths: 1 1 1 77 | 157 | :widths: 8 8 24 40 |
158 | :header-rows: 1 | 158 | :header-rows: 1 |
159 | 159 | ||
160 | * - Offset | 160 | * - Offset |
@@ -264,7 +264,7 @@ which is 1024 bytes long: | |||
264 | The journal compat features are any combination of the following: | 264 | The journal compat features are any combination of the following: |
265 | 265 | ||
266 | .. list-table:: | 266 | .. list-table:: |
267 | :widths: 1 79 | 267 | :widths: 16 64 |
268 | :header-rows: 1 | 268 | :header-rows: 1 |
269 | 269 | ||
270 | * - Value | 270 | * - Value |
@@ -278,7 +278,7 @@ The journal compat features are any combination of the following: | |||
278 | The journal incompat features are any combination of the following: | 278 | The journal incompat features are any combination of the following: |
279 | 279 | ||
280 | .. list-table:: | 280 | .. list-table:: |
281 | :widths: 1 79 | 281 | :widths: 16 64 |
282 | :header-rows: 1 | 282 | :header-rows: 1 |
283 | 283 | ||
284 | * - Value | 284 | * - Value |
@@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the | |||
306 | most likely choices. | 306 | most likely choices. |
307 | 307 | ||
308 | .. list-table:: | 308 | .. list-table:: |
309 | :widths: 1 79 | 309 | :widths: 16 64 |
310 | :header-rows: 1 | 310 | :header-rows: 1 |
311 | 311 | ||
312 | * - Value | 312 | * - Value |
@@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway. | |||
330 | Descriptor blocks consume at least 36 bytes, but use a full block: | 330 | Descriptor blocks consume at least 36 bytes, but use a full block: |
331 | 331 | ||
332 | .. list-table:: | 332 | .. list-table:: |
333 | :widths: 1 1 1 77 | 333 | :widths: 8 8 24 40 |
334 | :header-rows: 1 | 334 | :header-rows: 1 |
335 | 335 | ||
336 | * - Offset | 336 | * - Offset |
@@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the | |||
355 | following. The size is 16 or 32 bytes. | 355 | following. The size is 16 or 32 bytes. |
356 | 356 | ||
357 | .. list-table:: | 357 | .. list-table:: |
358 | :widths: 1 1 1 77 | 358 | :widths: 8 8 24 40 |
359 | :header-rows: 1 | 359 | :header-rows: 1 |
360 | 360 | ||
361 | * - Offset | 361 | * - Offset |
@@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes. | |||
400 | The journal tag flags are any combination of the following: | 400 | The journal tag flags are any combination of the following: |
401 | 401 | ||
402 | .. list-table:: | 402 | .. list-table:: |
403 | :widths: 1 79 | 403 | :widths: 16 64 |
404 | :header-rows: 1 | 404 | :header-rows: 1 |
405 | 405 | ||
406 | * - Value | 406 | * - Value |
@@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the | |||
421 | following. The size is 8, 12, 24, or 28 bytes: | 421 | following. The size is 8, 12, 24, or 28 bytes: |
422 | 422 | ||
423 | .. list-table:: | 423 | .. list-table:: |
424 | :widths: 1 1 1 77 | 424 | :widths: 8 8 24 40 |
425 | :header-rows: 1 | 425 | :header-rows: 1 |
426 | 426 | ||
427 | * - Offset | 427 | * - Offset |
@@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a | |||
471 | ``struct jbd2_journal_block_tail``, which looks like this: | 471 | ``struct jbd2_journal_block_tail``, which looks like this: |
472 | 472 | ||
473 | .. list-table:: | 473 | .. list-table:: |
474 | :widths: 1 1 1 77 | 474 | :widths: 8 8 24 40 |
475 | :header-rows: 1 | 475 | :header-rows: 1 |
476 | 476 | ||
477 | * - Offset | 477 | * - Offset |
@@ -513,7 +513,7 @@ Revocation blocks are described in | |||
513 | length, but use a full block: | 513 | length, but use a full block: |
514 | 514 | ||
515 | .. list-table:: | 515 | .. list-table:: |
516 | :widths: 1 1 1 77 | 516 | :widths: 8 8 24 40 |
517 | :header-rows: 1 | 517 | :header-rows: 1 |
518 | 518 | ||
519 | * - Offset | 519 | * - Offset |
@@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation | |||
543 | block is a ``struct jbd2_journal_revoke_tail``, which has this format: | 543 | block is a ``struct jbd2_journal_revoke_tail``, which has this format: |
544 | 544 | ||
545 | .. list-table:: | 545 | .. list-table:: |
546 | :widths: 1 1 1 77 | 546 | :widths: 8 8 24 40 |
547 | :header-rows: 1 | 547 | :header-rows: 1 |
548 | 548 | ||
549 | * - Offset | 549 | * - Offset |
@@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32 | |||
567 | bytes long (but uses a full block): | 567 | bytes long (but uses a full block): |
568 | 568 | ||
569 | .. list-table:: | 569 | .. list-table:: |
570 | :widths: 1 1 1 77 | 570 | :widths: 8 8 24 40 |
571 | :header-rows: 1 | 571 | :header-rows: 1 |
572 | 572 | ||
573 | * - Offset | 573 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/mmp.rst index b7d7a3137f80..25660981d93c 100644 --- a/Documentation/filesystems/ext4/ondisk/mmp.rst +++ b/Documentation/filesystems/ext4/mmp.rst | |||
@@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure. | |||
32 | The MMP structure (``struct mmp_struct``) is as follows: | 32 | The MMP structure (``struct mmp_struct``) is as follows: |
33 | 33 | ||
34 | .. list-table:: | 34 | .. list-table:: |
35 | :widths: 1 1 1 77 | 35 | :widths: 8 12 20 40 |
36 | :header-rows: 1 | 36 | :header-rows: 1 |
37 | 37 | ||
38 | * - Offset | 38 | * - Offset |
diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst deleted file mode 100644 index f7d082c3a435..000000000000 --- a/Documentation/filesystems/ext4/ondisk/index.rst +++ /dev/null | |||
@@ -1,9 +0,0 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | ============================== | ||
4 | Data Structures and Algorithms | ||
5 | ============================== | ||
6 | .. include:: about.rst | ||
7 | .. include:: overview.rst | ||
8 | .. include:: globals.rst | ||
9 | .. include:: dynamic.rst | ||
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/overview.rst index cbab18baba12..cbab18baba12 100644 --- a/Documentation/filesystems/ext4/ondisk/overview.rst +++ b/Documentation/filesystems/ext4/overview.rst | |||
diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst index a82f70c9baeb..9061aabba827 100644 --- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst +++ b/Documentation/filesystems/ext4/special_inodes.rst | |||
@@ -6,7 +6,7 @@ Special inodes | |||
6 | ext4 reserves some inodes for special features, as follows: | 6 | ext4 reserves some inodes for special features, as follows: |
7 | 7 | ||
8 | .. list-table:: | 8 | .. list-table:: |
9 | :widths: 1 79 | 9 | :widths: 6 70 |
10 | :header-rows: 1 | 10 | :header-rows: 1 |
11 | 11 | ||
12 | * - inode Number | 12 | * - inode Number |
diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/super.rst index 5f81dd87e0b9..04ff079a2acf 100644 --- a/Documentation/filesystems/ext4/ondisk/super.rst +++ b/Documentation/filesystems/ext4/super.rst | |||
@@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in | |||
19 | ``struct ext4_super_block``: | 19 | ``struct ext4_super_block``: |
20 | 20 | ||
21 | .. list-table:: | 21 | .. list-table:: |
22 | :widths: 1 1 1 77 | 22 | :widths: 8 8 24 40 |
23 | :header-rows: 1 | 23 | :header-rows: 1 |
24 | 24 | ||
25 | * - Offset | 25 | * - Offset |
@@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in | |||
483 | The superblock state is some combination of the following: | 483 | The superblock state is some combination of the following: |
484 | 484 | ||
485 | .. list-table:: | 485 | .. list-table:: |
486 | :widths: 1 79 | 486 | :widths: 8 72 |
487 | :header-rows: 1 | 487 | :header-rows: 1 |
488 | 488 | ||
489 | * - Value | 489 | * - Value |
@@ -500,7 +500,7 @@ The superblock state is some combination of the following: | |||
500 | The superblock error policy is one of the following: | 500 | The superblock error policy is one of the following: |
501 | 501 | ||
502 | .. list-table:: | 502 | .. list-table:: |
503 | :widths: 1 79 | 503 | :widths: 8 72 |
504 | :header-rows: 1 | 504 | :header-rows: 1 |
505 | 505 | ||
506 | * - Value | 506 | * - Value |
@@ -517,7 +517,7 @@ The superblock error policy is one of the following: | |||
517 | The filesystem creator is one of the following: | 517 | The filesystem creator is one of the following: |
518 | 518 | ||
519 | .. list-table:: | 519 | .. list-table:: |
520 | :widths: 1 79 | 520 | :widths: 8 72 |
521 | :header-rows: 1 | 521 | :header-rows: 1 |
522 | 522 | ||
523 | * - Value | 523 | * - Value |
@@ -538,7 +538,7 @@ The filesystem creator is one of the following: | |||
538 | The superblock revision is one of the following: | 538 | The superblock revision is one of the following: |
539 | 539 | ||
540 | .. list-table:: | 540 | .. list-table:: |
541 | :widths: 1 79 | 541 | :widths: 8 72 |
542 | :header-rows: 1 | 542 | :header-rows: 1 |
543 | 543 | ||
544 | * - Value | 544 | * - Value |
@@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the | |||
556 | following: | 556 | following: |
557 | 557 | ||
558 | .. list-table:: | 558 | .. list-table:: |
559 | :widths: 1 79 | 559 | :widths: 16 64 |
560 | :header-rows: 1 | 560 | :header-rows: 1 |
561 | 561 | ||
562 | * - Value | 562 | * - Value |
@@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the | |||
595 | following: | 595 | following: |
596 | 596 | ||
597 | .. list-table:: | 597 | .. list-table:: |
598 | :widths: 1 79 | 598 | :widths: 16 64 |
599 | :header-rows: 1 | 599 | :header-rows: 1 |
600 | 600 | ||
601 | * - Value | 601 | * - Value |
@@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of | |||
647 | the following: | 647 | the following: |
648 | 648 | ||
649 | .. list-table:: | 649 | .. list-table:: |
650 | :widths: 1 79 | 650 | :widths: 16 64 |
651 | :header-rows: 1 | 651 | :header-rows: 1 |
652 | 652 | ||
653 | * - Value | 653 | * - Value |
@@ -702,7 +702,7 @@ the following: | |||
702 | The ``s_def_hash_version`` field is one of the following: | 702 | The ``s_def_hash_version`` field is one of the following: |
703 | 703 | ||
704 | .. list-table:: | 704 | .. list-table:: |
705 | :widths: 1 79 | 705 | :widths: 8 72 |
706 | :header-rows: 1 | 706 | :header-rows: 1 |
707 | 707 | ||
708 | * - Value | 708 | * - Value |
@@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following: | |||
725 | The ``s_default_mount_opts`` field is any combination of the following: | 725 | The ``s_default_mount_opts`` field is any combination of the following: |
726 | 726 | ||
727 | .. list-table:: | 727 | .. list-table:: |
728 | :widths: 1 79 | 728 | :widths: 8 72 |
729 | :header-rows: 1 | 729 | :header-rows: 1 |
730 | 730 | ||
731 | * - Value | 731 | * - Value |
@@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following: | |||
767 | The ``s_flags`` field is any combination of the following: | 767 | The ``s_flags`` field is any combination of the following: |
768 | 768 | ||
769 | .. list-table:: | 769 | .. list-table:: |
770 | :widths: 1 79 | 770 | :widths: 8 72 |
771 | :header-rows: 1 | 771 | :header-rows: 1 |
772 | 772 | ||
773 | * - Value | 773 | * - Value |
@@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following: | |||
784 | The ``s_encrypt_algos`` list can contain any of the following: | 784 | The ``s_encrypt_algos`` list can contain any of the following: |
785 | 785 | ||
786 | .. list-table:: | 786 | .. list-table:: |
787 | :widths: 1 79 | 787 | :widths: 8 72 |
788 | :header-rows: 1 | 788 | :header-rows: 1 |
789 | 789 | ||
790 | * - Value | 790 | * - Value |
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index fb50f9aa6ead..c1d570ee1d9f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c | |||
@@ -284,12 +284,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) | |||
284 | error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, | 284 | error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, |
285 | default_acl, XATTR_CREATE); | 285 | default_acl, XATTR_CREATE); |
286 | posix_acl_release(default_acl); | 286 | posix_acl_release(default_acl); |
287 | } else { | ||
288 | inode->i_default_acl = NULL; | ||
287 | } | 289 | } |
288 | if (acl) { | 290 | if (acl) { |
289 | if (!error) | 291 | if (!error) |
290 | error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, | 292 | error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, |
291 | acl, XATTR_CREATE); | 293 | acl, XATTR_CREATE); |
292 | posix_acl_release(acl); | 294 | posix_acl_release(acl); |
295 | } else { | ||
296 | inode->i_acl = NULL; | ||
293 | } | 297 | } |
294 | return error; | 298 | return error; |
295 | } | 299 | } |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index caff935fbeb8..12f90d48ba61 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -628,6 +628,7 @@ enum { | |||
628 | #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 | 628 | #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 |
629 | #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 | 629 | #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 |
630 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 | 630 | #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 |
631 | #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 | ||
631 | 632 | ||
632 | /* | 633 | /* |
633 | * ioctl commands | 634 | * ioctl commands |
@@ -1030,6 +1031,9 @@ struct ext4_inode_info { | |||
1030 | ext4_lblk_t i_da_metadata_calc_last_lblock; | 1031 | ext4_lblk_t i_da_metadata_calc_last_lblock; |
1031 | int i_da_metadata_calc_len; | 1032 | int i_da_metadata_calc_len; |
1032 | 1033 | ||
1034 | /* pending cluster reservations for bigalloc file systems */ | ||
1035 | struct ext4_pending_tree i_pending_tree; | ||
1036 | |||
1033 | /* on-disk additional length */ | 1037 | /* on-disk additional length */ |
1034 | __u16 i_extra_isize; | 1038 | __u16 i_extra_isize; |
1035 | 1039 | ||
@@ -1401,7 +1405,8 @@ struct ext4_sb_info { | |||
1401 | u32 s_min_batch_time; | 1405 | u32 s_min_batch_time; |
1402 | struct block_device *journal_bdev; | 1406 | struct block_device *journal_bdev; |
1403 | #ifdef CONFIG_QUOTA | 1407 | #ifdef CONFIG_QUOTA |
1404 | char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ | 1408 | /* Names of quota files with journalled quota */ |
1409 | char __rcu *s_qf_names[EXT4_MAXQUOTAS]; | ||
1405 | int s_jquota_fmt; /* Format of quota to use */ | 1410 | int s_jquota_fmt; /* Format of quota to use */ |
1406 | #endif | 1411 | #endif |
1407 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ | 1412 | unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ |
@@ -2483,10 +2488,11 @@ extern int ext4_writepage_trans_blocks(struct inode *); | |||
2483 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); | 2488 | extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); |
2484 | extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, | 2489 | extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, |
2485 | loff_t lstart, loff_t lend); | 2490 | loff_t lstart, loff_t lend); |
2486 | extern int ext4_page_mkwrite(struct vm_fault *vmf); | 2491 | extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); |
2487 | extern int ext4_filemap_fault(struct vm_fault *vmf); | 2492 | extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); |
2488 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); | 2493 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); |
2489 | extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); | 2494 | extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); |
2495 | extern void ext4_da_release_space(struct inode *inode, int to_free); | ||
2490 | extern void ext4_da_update_reserve_space(struct inode *inode, | 2496 | extern void ext4_da_update_reserve_space(struct inode *inode, |
2491 | int used, int quota_claim); | 2497 | int used, int quota_claim); |
2492 | extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, | 2498 | extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, |
@@ -3142,10 +3148,6 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, | |||
3142 | int flags); | 3148 | int flags); |
3143 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); | 3149 | extern void ext4_ext_drop_refs(struct ext4_ext_path *); |
3144 | extern int ext4_ext_check_inode(struct inode *inode); | 3150 | extern int ext4_ext_check_inode(struct inode *inode); |
3145 | extern int ext4_find_delalloc_range(struct inode *inode, | ||
3146 | ext4_lblk_t lblk_start, | ||
3147 | ext4_lblk_t lblk_end); | ||
3148 | extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); | ||
3149 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); | 3151 | extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); |
3150 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 3152 | extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
3151 | __u64 start, __u64 len); | 3153 | __u64 start, __u64 len); |
@@ -3156,6 +3158,7 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, | |||
3156 | struct inode *inode2, ext4_lblk_t lblk1, | 3158 | struct inode *inode2, ext4_lblk_t lblk1, |
3157 | ext4_lblk_t lblk2, ext4_lblk_t count, | 3159 | ext4_lblk_t lblk2, ext4_lblk_t count, |
3158 | int mark_unwritten,int *err); | 3160 | int mark_unwritten,int *err); |
3161 | extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); | ||
3159 | 3162 | ||
3160 | /* move_extent.c */ | 3163 | /* move_extent.c */ |
3161 | extern void ext4_double_down_write_data_sem(struct inode *first, | 3164 | extern void ext4_double_down_write_data_sem(struct inode *first, |
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index adf6668b596f..98bd0e9ee7df 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
@@ -120,6 +120,19 @@ struct ext4_ext_path { | |||
120 | }; | 120 | }; |
121 | 121 | ||
122 | /* | 122 | /* |
123 | * Used to record a portion of a cluster found at the beginning or end | ||
124 | * of an extent while traversing the extent tree during space removal. | ||
125 | * A partial cluster may be removed if it does not contain blocks shared | ||
126 | * with extents that aren't being deleted (tofree state). Otherwise, | ||
127 | * it cannot be removed (nofree state). | ||
128 | */ | ||
129 | struct partial_cluster { | ||
130 | ext4_fsblk_t pclu; /* physical cluster number */ | ||
131 | ext4_lblk_t lblk; /* logical block number within logical cluster */ | ||
132 | enum {initial, tofree, nofree} state; | ||
133 | }; | ||
134 | |||
135 | /* | ||
123 | * structure for external API | 136 | * structure for external API |
124 | */ | 137 | */ |
125 | 138 | ||
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72a361d5ef74..240b6dea5441 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -2351,8 +2351,8 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, | |||
2351 | { | 2351 | { |
2352 | struct extent_status es; | 2352 | struct extent_status es; |
2353 | 2353 | ||
2354 | ext4_es_find_delayed_extent_range(inode, hole_start, | 2354 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, |
2355 | hole_start + hole_len - 1, &es); | 2355 | hole_start + hole_len - 1, &es); |
2356 | if (es.es_len) { | 2356 | if (es.es_len) { |
2357 | /* There's delayed extent containing lblock? */ | 2357 | /* There's delayed extent containing lblock? */ |
2358 | if (es.es_lblk <= hole_start) | 2358 | if (es.es_lblk <= hole_start) |
@@ -2490,106 +2490,157 @@ static inline int get_default_free_blocks_flags(struct inode *inode) | |||
2490 | return 0; | 2490 | return 0; |
2491 | } | 2491 | } |
2492 | 2492 | ||
2493 | /* | ||
2494 | * ext4_rereserve_cluster - increment the reserved cluster count when | ||
2495 | * freeing a cluster with a pending reservation | ||
2496 | * | ||
2497 | * @inode - file containing the cluster | ||
2498 | * @lblk - logical block in cluster to be reserved | ||
2499 | * | ||
2500 | * Increments the reserved cluster count and adjusts quota in a bigalloc | ||
2501 | * file system when freeing a partial cluster containing at least one | ||
2502 | * delayed and unwritten block. A partial cluster meeting that | ||
2503 | * requirement will have a pending reservation. If so, the | ||
2504 | * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to | ||
2505 | * defer reserved and allocated space accounting to a subsequent call | ||
2506 | * to this function. | ||
2507 | */ | ||
2508 | static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) | ||
2509 | { | ||
2510 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
2511 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
2512 | |||
2513 | dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); | ||
2514 | |||
2515 | spin_lock(&ei->i_block_reservation_lock); | ||
2516 | ei->i_reserved_data_blocks++; | ||
2517 | percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); | ||
2518 | spin_unlock(&ei->i_block_reservation_lock); | ||
2519 | |||
2520 | percpu_counter_add(&sbi->s_freeclusters_counter, 1); | ||
2521 | ext4_remove_pending(inode, lblk); | ||
2522 | } | ||
2523 | |||
2493 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | 2524 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
2494 | struct ext4_extent *ex, | 2525 | struct ext4_extent *ex, |
2495 | long long *partial_cluster, | 2526 | struct partial_cluster *partial, |
2496 | ext4_lblk_t from, ext4_lblk_t to) | 2527 | ext4_lblk_t from, ext4_lblk_t to) |
2497 | { | 2528 | { |
2498 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2529 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
2499 | unsigned short ee_len = ext4_ext_get_actual_len(ex); | 2530 | unsigned short ee_len = ext4_ext_get_actual_len(ex); |
2500 | ext4_fsblk_t pblk; | 2531 | ext4_fsblk_t last_pblk, pblk; |
2501 | int flags = get_default_free_blocks_flags(inode); | 2532 | ext4_lblk_t num; |
2533 | int flags; | ||
2534 | |||
2535 | /* only extent tail removal is allowed */ | ||
2536 | if (from < le32_to_cpu(ex->ee_block) || | ||
2537 | to != le32_to_cpu(ex->ee_block) + ee_len - 1) { | ||
2538 | ext4_error(sbi->s_sb, | ||
2539 | "strange request: removal(2) %u-%u from %u:%u", | ||
2540 | from, to, le32_to_cpu(ex->ee_block), ee_len); | ||
2541 | return 0; | ||
2542 | } | ||
2543 | |||
2544 | #ifdef EXTENTS_STATS | ||
2545 | spin_lock(&sbi->s_ext_stats_lock); | ||
2546 | sbi->s_ext_blocks += ee_len; | ||
2547 | sbi->s_ext_extents++; | ||
2548 | if (ee_len < sbi->s_ext_min) | ||
2549 | sbi->s_ext_min = ee_len; | ||
2550 | if (ee_len > sbi->s_ext_max) | ||
2551 | sbi->s_ext_max = ee_len; | ||
2552 | if (ext_depth(inode) > sbi->s_depth_max) | ||
2553 | sbi->s_depth_max = ext_depth(inode); | ||
2554 | spin_unlock(&sbi->s_ext_stats_lock); | ||
2555 | #endif | ||
2556 | |||
2557 | trace_ext4_remove_blocks(inode, ex, from, to, partial); | ||
2502 | 2558 | ||
2503 | /* | 2559 | /* |
2504 | * For bigalloc file systems, we never free a partial cluster | 2560 | * if we have a partial cluster, and it's different from the |
2505 | * at the beginning of the extent. Instead, we make a note | 2561 | * cluster of the last block in the extent, we free it |
2506 | * that we tried freeing the cluster, and check to see if we | ||
2507 | * need to free it on a subsequent call to ext4_remove_blocks, | ||
2508 | * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. | ||
2509 | */ | 2562 | */ |
2510 | flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; | 2563 | last_pblk = ext4_ext_pblock(ex) + ee_len - 1; |
2564 | |||
2565 | if (partial->state != initial && | ||
2566 | partial->pclu != EXT4_B2C(sbi, last_pblk)) { | ||
2567 | if (partial->state == tofree) { | ||
2568 | flags = get_default_free_blocks_flags(inode); | ||
2569 | if (ext4_is_pending(inode, partial->lblk)) | ||
2570 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
2571 | ext4_free_blocks(handle, inode, NULL, | ||
2572 | EXT4_C2B(sbi, partial->pclu), | ||
2573 | sbi->s_cluster_ratio, flags); | ||
2574 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) | ||
2575 | ext4_rereserve_cluster(inode, partial->lblk); | ||
2576 | } | ||
2577 | partial->state = initial; | ||
2578 | } | ||
2579 | |||
2580 | num = le32_to_cpu(ex->ee_block) + ee_len - from; | ||
2581 | pblk = ext4_ext_pblock(ex) + ee_len - num; | ||
2511 | 2582 | ||
2512 | trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); | ||
2513 | /* | 2583 | /* |
2514 | * If we have a partial cluster, and it's different from the | 2584 | * We free the partial cluster at the end of the extent (if any), |
2515 | * cluster of the last block, we need to explicitly free the | 2585 | * unless the cluster is used by another extent (partial_cluster |
2516 | * partial cluster here. | 2586 | * state is nofree). If a partial cluster exists here, it must be |
2587 | * shared with the last block in the extent. | ||
2517 | */ | 2588 | */ |
2518 | pblk = ext4_ext_pblock(ex) + ee_len - 1; | 2589 | flags = get_default_free_blocks_flags(inode); |
2519 | if (*partial_cluster > 0 && | 2590 | |
2520 | *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { | 2591 | /* partial, left end cluster aligned, right end unaligned */ |
2592 | if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && | ||
2593 | (EXT4_LBLK_CMASK(sbi, to) >= from) && | ||
2594 | (partial->state != nofree)) { | ||
2595 | if (ext4_is_pending(inode, to)) | ||
2596 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
2521 | ext4_free_blocks(handle, inode, NULL, | 2597 | ext4_free_blocks(handle, inode, NULL, |
2522 | EXT4_C2B(sbi, *partial_cluster), | 2598 | EXT4_PBLK_CMASK(sbi, last_pblk), |
2523 | sbi->s_cluster_ratio, flags); | 2599 | sbi->s_cluster_ratio, flags); |
2524 | *partial_cluster = 0; | 2600 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) |
2601 | ext4_rereserve_cluster(inode, to); | ||
2602 | partial->state = initial; | ||
2603 | flags = get_default_free_blocks_flags(inode); | ||
2525 | } | 2604 | } |
2526 | 2605 | ||
2527 | #ifdef EXTENTS_STATS | 2606 | flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; |
2528 | { | ||
2529 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
2530 | spin_lock(&sbi->s_ext_stats_lock); | ||
2531 | sbi->s_ext_blocks += ee_len; | ||
2532 | sbi->s_ext_extents++; | ||
2533 | if (ee_len < sbi->s_ext_min) | ||
2534 | sbi->s_ext_min = ee_len; | ||
2535 | if (ee_len > sbi->s_ext_max) | ||
2536 | sbi->s_ext_max = ee_len; | ||
2537 | if (ext_depth(inode) > sbi->s_depth_max) | ||
2538 | sbi->s_depth_max = ext_depth(inode); | ||
2539 | spin_unlock(&sbi->s_ext_stats_lock); | ||
2540 | } | ||
2541 | #endif | ||
2542 | if (from >= le32_to_cpu(ex->ee_block) | ||
2543 | && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { | ||
2544 | /* tail removal */ | ||
2545 | ext4_lblk_t num; | ||
2546 | long long first_cluster; | ||
2547 | |||
2548 | num = le32_to_cpu(ex->ee_block) + ee_len - from; | ||
2549 | pblk = ext4_ext_pblock(ex) + ee_len - num; | ||
2550 | /* | ||
2551 | * Usually we want to free partial cluster at the end of the | ||
2552 | * extent, except for the situation when the cluster is still | ||
2553 | * used by any other extent (partial_cluster is negative). | ||
2554 | */ | ||
2555 | if (*partial_cluster < 0 && | ||
2556 | *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) | ||
2557 | flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; | ||
2558 | 2607 | ||
2559 | ext_debug("free last %u blocks starting %llu partial %lld\n", | 2608 | /* |
2560 | num, pblk, *partial_cluster); | 2609 | * For bigalloc file systems, we never free a partial cluster |
2561 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); | 2610 | * at the beginning of the extent. Instead, we check to see if we |
2562 | /* | 2611 | * need to free it on a subsequent call to ext4_remove_blocks, |
2563 | * If the block range to be freed didn't start at the | 2612 | * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. |
2564 | * beginning of a cluster, and we removed the entire | 2613 | */ |
2565 | * extent and the cluster is not used by any other extent, | 2614 | flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; |
2566 | * save the partial cluster here, since we might need to | 2615 | ext4_free_blocks(handle, inode, NULL, pblk, num, flags); |
2567 | * delete if we determine that the truncate or punch hole | 2616 | |
2568 | * operation has removed all of the blocks in the cluster. | 2617 | /* reset the partial cluster if we've freed past it */ |
2569 | * If that cluster is used by another extent, preserve its | 2618 | if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) |
2570 | * negative value so it isn't freed later on. | 2619 | partial->state = initial; |
2571 | * | 2620 | |
2572 | * If the whole extent wasn't freed, we've reached the | 2621 | /* |
2573 | * start of the truncated/punched region and have finished | 2622 | * If we've freed the entire extent but the beginning is not left |
2574 | * removing blocks. If there's a partial cluster here it's | 2623 | * cluster aligned and is not marked as ineligible for freeing we |
2575 | * shared with the remainder of the extent and is no longer | 2624 | * record the partial cluster at the beginning of the extent. It |
2576 | * a candidate for removal. | 2625 | * wasn't freed by the preceding ext4_free_blocks() call, and we |
2577 | */ | 2626 | * need to look farther to the left to determine if it's to be freed |
2578 | if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { | 2627 | * (not shared with another extent). Else, reset the partial |
2579 | first_cluster = (long long) EXT4_B2C(sbi, pblk); | 2628 | * cluster - we're either done freeing or the beginning of the |
2580 | if (first_cluster != -*partial_cluster) | 2629 | * extent is left cluster aligned. |
2581 | *partial_cluster = first_cluster; | 2630 | */ |
2582 | } else { | 2631 | if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { |
2583 | *partial_cluster = 0; | 2632 | if (partial->state == initial) { |
2633 | partial->pclu = EXT4_B2C(sbi, pblk); | ||
2634 | partial->lblk = from; | ||
2635 | partial->state = tofree; | ||
2584 | } | 2636 | } |
2585 | } else | 2637 | } else { |
2586 | ext4_error(sbi->s_sb, "strange request: removal(2) " | 2638 | partial->state = initial; |
2587 | "%u-%u from %u:%u", | 2639 | } |
2588 | from, to, le32_to_cpu(ex->ee_block), ee_len); | 2640 | |
2589 | return 0; | 2641 | return 0; |
2590 | } | 2642 | } |
2591 | 2643 | ||
2592 | |||
2593 | /* | 2644 | /* |
2594 | * ext4_ext_rm_leaf() Removes the extents associated with the | 2645 | * ext4_ext_rm_leaf() Removes the extents associated with the |
2595 | * blocks appearing between "start" and "end". Both "start" | 2646 | * blocks appearing between "start" and "end". Both "start" |
@@ -2608,7 +2659,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |||
2608 | static int | 2659 | static int |
2609 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | 2660 | ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, |
2610 | struct ext4_ext_path *path, | 2661 | struct ext4_ext_path *path, |
2611 | long long *partial_cluster, | 2662 | struct partial_cluster *partial, |
2612 | ext4_lblk_t start, ext4_lblk_t end) | 2663 | ext4_lblk_t start, ext4_lblk_t end) |
2613 | { | 2664 | { |
2614 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2665 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
@@ -2640,7 +2691,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2640 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2691 | ex_ee_block = le32_to_cpu(ex->ee_block); |
2641 | ex_ee_len = ext4_ext_get_actual_len(ex); | 2692 | ex_ee_len = ext4_ext_get_actual_len(ex); |
2642 | 2693 | ||
2643 | trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); | 2694 | trace_ext4_ext_rm_leaf(inode, start, ex, partial); |
2644 | 2695 | ||
2645 | while (ex >= EXT_FIRST_EXTENT(eh) && | 2696 | while (ex >= EXT_FIRST_EXTENT(eh) && |
2646 | ex_ee_block + ex_ee_len > start) { | 2697 | ex_ee_block + ex_ee_len > start) { |
@@ -2671,8 +2722,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2671 | */ | 2722 | */ |
2672 | if (sbi->s_cluster_ratio > 1) { | 2723 | if (sbi->s_cluster_ratio > 1) { |
2673 | pblk = ext4_ext_pblock(ex); | 2724 | pblk = ext4_ext_pblock(ex); |
2674 | *partial_cluster = | 2725 | partial->pclu = EXT4_B2C(sbi, pblk); |
2675 | -(long long) EXT4_B2C(sbi, pblk); | 2726 | partial->state = nofree; |
2676 | } | 2727 | } |
2677 | ex--; | 2728 | ex--; |
2678 | ex_ee_block = le32_to_cpu(ex->ee_block); | 2729 | ex_ee_block = le32_to_cpu(ex->ee_block); |
@@ -2714,8 +2765,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2714 | if (err) | 2765 | if (err) |
2715 | goto out; | 2766 | goto out; |
2716 | 2767 | ||
2717 | err = ext4_remove_blocks(handle, inode, ex, partial_cluster, | 2768 | err = ext4_remove_blocks(handle, inode, ex, partial, a, b); |
2718 | a, b); | ||
2719 | if (err) | 2769 | if (err) |
2720 | goto out; | 2770 | goto out; |
2721 | 2771 | ||
@@ -2769,18 +2819,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2769 | * If there's a partial cluster and at least one extent remains in | 2819 | * If there's a partial cluster and at least one extent remains in |
2770 | * the leaf, free the partial cluster if it isn't shared with the | 2820 | * the leaf, free the partial cluster if it isn't shared with the |
2771 | * current extent. If it is shared with the current extent | 2821 | * current extent. If it is shared with the current extent |
2772 | * we zero partial_cluster because we've reached the start of the | 2822 | * we reset the partial cluster because we've reached the start of the |
2773 | * truncated/punched region and we're done removing blocks. | 2823 | * truncated/punched region and we're done removing blocks. |
2774 | */ | 2824 | */ |
2775 | if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { | 2825 | if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { |
2776 | pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; | 2826 | pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; |
2777 | if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { | 2827 | if (partial->pclu != EXT4_B2C(sbi, pblk)) { |
2828 | int flags = get_default_free_blocks_flags(inode); | ||
2829 | |||
2830 | if (ext4_is_pending(inode, partial->lblk)) | ||
2831 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
2778 | ext4_free_blocks(handle, inode, NULL, | 2832 | ext4_free_blocks(handle, inode, NULL, |
2779 | EXT4_C2B(sbi, *partial_cluster), | 2833 | EXT4_C2B(sbi, partial->pclu), |
2780 | sbi->s_cluster_ratio, | 2834 | sbi->s_cluster_ratio, flags); |
2781 | get_default_free_blocks_flags(inode)); | 2835 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) |
2836 | ext4_rereserve_cluster(inode, partial->lblk); | ||
2782 | } | 2837 | } |
2783 | *partial_cluster = 0; | 2838 | partial->state = initial; |
2784 | } | 2839 | } |
2785 | 2840 | ||
2786 | /* if this leaf is free, then we should | 2841 | /* if this leaf is free, then we should |
@@ -2819,10 +2874,14 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | |||
2819 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 2874 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
2820 | int depth = ext_depth(inode); | 2875 | int depth = ext_depth(inode); |
2821 | struct ext4_ext_path *path = NULL; | 2876 | struct ext4_ext_path *path = NULL; |
2822 | long long partial_cluster = 0; | 2877 | struct partial_cluster partial; |
2823 | handle_t *handle; | 2878 | handle_t *handle; |
2824 | int i = 0, err = 0; | 2879 | int i = 0, err = 0; |
2825 | 2880 | ||
2881 | partial.pclu = 0; | ||
2882 | partial.lblk = 0; | ||
2883 | partial.state = initial; | ||
2884 | |||
2826 | ext_debug("truncate since %u to %u\n", start, end); | 2885 | ext_debug("truncate since %u to %u\n", start, end); |
2827 | 2886 | ||
2828 | /* probably first extent we're gonna free will be last in block */ | 2887 | /* probably first extent we're gonna free will be last in block */ |
@@ -2882,8 +2941,8 @@ again: | |||
2882 | */ | 2941 | */ |
2883 | if (sbi->s_cluster_ratio > 1) { | 2942 | if (sbi->s_cluster_ratio > 1) { |
2884 | pblk = ext4_ext_pblock(ex) + end - ee_block + 2; | 2943 | pblk = ext4_ext_pblock(ex) + end - ee_block + 2; |
2885 | partial_cluster = | 2944 | partial.pclu = EXT4_B2C(sbi, pblk); |
2886 | -(long long) EXT4_B2C(sbi, pblk); | 2945 | partial.state = nofree; |
2887 | } | 2946 | } |
2888 | 2947 | ||
2889 | /* | 2948 | /* |
@@ -2911,9 +2970,10 @@ again: | |||
2911 | &ex); | 2970 | &ex); |
2912 | if (err) | 2971 | if (err) |
2913 | goto out; | 2972 | goto out; |
2914 | if (pblk) | 2973 | if (pblk) { |
2915 | partial_cluster = | 2974 | partial.pclu = EXT4_B2C(sbi, pblk); |
2916 | -(long long) EXT4_B2C(sbi, pblk); | 2975 | partial.state = nofree; |
2976 | } | ||
2917 | } | 2977 | } |
2918 | } | 2978 | } |
2919 | /* | 2979 | /* |
@@ -2948,8 +3008,7 @@ again: | |||
2948 | if (i == depth) { | 3008 | if (i == depth) { |
2949 | /* this is leaf block */ | 3009 | /* this is leaf block */ |
2950 | err = ext4_ext_rm_leaf(handle, inode, path, | 3010 | err = ext4_ext_rm_leaf(handle, inode, path, |
2951 | &partial_cluster, start, | 3011 | &partial, start, end); |
2952 | end); | ||
2953 | /* root level has p_bh == NULL, brelse() eats this */ | 3012 | /* root level has p_bh == NULL, brelse() eats this */ |
2954 | brelse(path[i].p_bh); | 3013 | brelse(path[i].p_bh); |
2955 | path[i].p_bh = NULL; | 3014 | path[i].p_bh = NULL; |
@@ -3021,21 +3080,24 @@ again: | |||
3021 | } | 3080 | } |
3022 | } | 3081 | } |
3023 | 3082 | ||
3024 | trace_ext4_ext_remove_space_done(inode, start, end, depth, | 3083 | trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, |
3025 | partial_cluster, path->p_hdr->eh_entries); | 3084 | path->p_hdr->eh_entries); |
3026 | 3085 | ||
3027 | /* | 3086 | /* |
3028 | * If we still have something in the partial cluster and we have removed | 3087 | * if there's a partial cluster and we have removed the first extent |
3029 | * even the first extent, then we should free the blocks in the partial | 3088 | * in the file, then we also free the partial cluster, if any |
3030 | * cluster as well. (This code will only run when there are no leaves | ||
3031 | * to the immediate left of the truncated/punched region.) | ||
3032 | */ | 3089 | */ |
3033 | if (partial_cluster > 0 && err == 0) { | 3090 | if (partial.state == tofree && err == 0) { |
3034 | /* don't zero partial_cluster since it's not used afterwards */ | 3091 | int flags = get_default_free_blocks_flags(inode); |
3092 | |||
3093 | if (ext4_is_pending(inode, partial.lblk)) | ||
3094 | flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; | ||
3035 | ext4_free_blocks(handle, inode, NULL, | 3095 | ext4_free_blocks(handle, inode, NULL, |
3036 | EXT4_C2B(sbi, partial_cluster), | 3096 | EXT4_C2B(sbi, partial.pclu), |
3037 | sbi->s_cluster_ratio, | 3097 | sbi->s_cluster_ratio, flags); |
3038 | get_default_free_blocks_flags(inode)); | 3098 | if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) |
3099 | ext4_rereserve_cluster(inode, partial.lblk); | ||
3100 | partial.state = initial; | ||
3039 | } | 3101 | } |
3040 | 3102 | ||
3041 | /* TODO: flexible tree reduction should be here */ | 3103 | /* TODO: flexible tree reduction should be here */ |
@@ -3819,114 +3881,6 @@ out: | |||
3819 | return ext4_mark_inode_dirty(handle, inode); | 3881 | return ext4_mark_inode_dirty(handle, inode); |
3820 | } | 3882 | } |
3821 | 3883 | ||
3822 | /** | ||
3823 | * ext4_find_delalloc_range: find delayed allocated block in the given range. | ||
3824 | * | ||
3825 | * Return 1 if there is a delalloc block in the range, otherwise 0. | ||
3826 | */ | ||
3827 | int ext4_find_delalloc_range(struct inode *inode, | ||
3828 | ext4_lblk_t lblk_start, | ||
3829 | ext4_lblk_t lblk_end) | ||
3830 | { | ||
3831 | struct extent_status es; | ||
3832 | |||
3833 | ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); | ||
3834 | if (es.es_len == 0) | ||
3835 | return 0; /* there is no delay extent in this tree */ | ||
3836 | else if (es.es_lblk <= lblk_start && | ||
3837 | lblk_start < es.es_lblk + es.es_len) | ||
3838 | return 1; | ||
3839 | else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) | ||
3840 | return 1; | ||
3841 | else | ||
3842 | return 0; | ||
3843 | } | ||
3844 | |||
3845 | int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) | ||
3846 | { | ||
3847 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
3848 | ext4_lblk_t lblk_start, lblk_end; | ||
3849 | lblk_start = EXT4_LBLK_CMASK(sbi, lblk); | ||
3850 | lblk_end = lblk_start + sbi->s_cluster_ratio - 1; | ||
3851 | |||
3852 | return ext4_find_delalloc_range(inode, lblk_start, lblk_end); | ||
3853 | } | ||
3854 | |||
3855 | /** | ||
3856 | * Determines how many complete clusters (out of those specified by the 'map') | ||
3857 | * are under delalloc and were reserved quota for. | ||
3858 | * This function is called when we are writing out the blocks that were | ||
3859 | * originally written with their allocation delayed, but then the space was | ||
3860 | * allocated using fallocate() before the delayed allocation could be resolved. | ||
3861 | * The cases to look for are: | ||
3862 | * ('=' indicated delayed allocated blocks | ||
3863 | * '-' indicates non-delayed allocated blocks) | ||
3864 | * (a) partial clusters towards beginning and/or end outside of allocated range | ||
3865 | * are not delalloc'ed. | ||
3866 | * Ex: | ||
3867 | * |----c---=|====c====|====c====|===-c----| | ||
3868 | * |++++++ allocated ++++++| | ||
3869 | * ==> 4 complete clusters in above example | ||
3870 | * | ||
3871 | * (b) partial cluster (outside of allocated range) towards either end is | ||
3872 | * marked for delayed allocation. In this case, we will exclude that | ||
3873 | * cluster. | ||
3874 | * Ex: | ||
3875 | * |----====c========|========c========| | ||
3876 | * |++++++ allocated ++++++| | ||
3877 | * ==> 1 complete clusters in above example | ||
3878 | * | ||
3879 | * Ex: | ||
3880 | * |================c================| | ||
3881 | * |++++++ allocated ++++++| | ||
3882 | * ==> 0 complete clusters in above example | ||
3883 | * | ||
3884 | * The ext4_da_update_reserve_space will be called only if we | ||
3885 | * determine here that there were some "entire" clusters that span | ||
3886 | * this 'allocated' range. | ||
3887 | * In the non-bigalloc case, this function will just end up returning num_blks | ||
3888 | * without ever calling ext4_find_delalloc_range. | ||
3889 | */ | ||
3890 | static unsigned int | ||
3891 | get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, | ||
3892 | unsigned int num_blks) | ||
3893 | { | ||
3894 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
3895 | ext4_lblk_t alloc_cluster_start, alloc_cluster_end; | ||
3896 | ext4_lblk_t lblk_from, lblk_to, c_offset; | ||
3897 | unsigned int allocated_clusters = 0; | ||
3898 | |||
3899 | alloc_cluster_start = EXT4_B2C(sbi, lblk_start); | ||
3900 | alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); | ||
3901 | |||
3902 | /* max possible clusters for this allocation */ | ||
3903 | allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; | ||
3904 | |||
3905 | trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); | ||
3906 | |||
3907 | /* Check towards left side */ | ||
3908 | c_offset = EXT4_LBLK_COFF(sbi, lblk_start); | ||
3909 | if (c_offset) { | ||
3910 | lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); | ||
3911 | lblk_to = lblk_from + c_offset - 1; | ||
3912 | |||
3913 | if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) | ||
3914 | allocated_clusters--; | ||
3915 | } | ||
3916 | |||
3917 | /* Now check towards right. */ | ||
3918 | c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); | ||
3919 | if (allocated_clusters && c_offset) { | ||
3920 | lblk_from = lblk_start + num_blks; | ||
3921 | lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; | ||
3922 | |||
3923 | if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) | ||
3924 | allocated_clusters--; | ||
3925 | } | ||
3926 | |||
3927 | return allocated_clusters; | ||
3928 | } | ||
3929 | |||
3930 | static int | 3884 | static int |
3931 | convert_initialized_extent(handle_t *handle, struct inode *inode, | 3885 | convert_initialized_extent(handle_t *handle, struct inode *inode, |
3932 | struct ext4_map_blocks *map, | 3886 | struct ext4_map_blocks *map, |
@@ -4108,23 +4062,6 @@ out: | |||
4108 | } | 4062 | } |
4109 | map->m_len = allocated; | 4063 | map->m_len = allocated; |
4110 | 4064 | ||
4111 | /* | ||
4112 | * If we have done fallocate with the offset that is already | ||
4113 | * delayed allocated, we would have block reservation | ||
4114 | * and quota reservation done in the delayed write path. | ||
4115 | * But fallocate would have already updated quota and block | ||
4116 | * count for this offset. So cancel these reservation | ||
4117 | */ | ||
4118 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { | ||
4119 | unsigned int reserved_clusters; | ||
4120 | reserved_clusters = get_reserved_cluster_alloc(inode, | ||
4121 | map->m_lblk, map->m_len); | ||
4122 | if (reserved_clusters) | ||
4123 | ext4_da_update_reserve_space(inode, | ||
4124 | reserved_clusters, | ||
4125 | 0); | ||
4126 | } | ||
4127 | |||
4128 | map_out: | 4065 | map_out: |
4129 | map->m_flags |= EXT4_MAP_MAPPED; | 4066 | map->m_flags |= EXT4_MAP_MAPPED; |
4130 | if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { | 4067 | if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { |
@@ -4513,77 +4450,39 @@ got_allocated_blocks: | |||
4513 | map->m_flags |= EXT4_MAP_NEW; | 4450 | map->m_flags |= EXT4_MAP_NEW; |
4514 | 4451 | ||
4515 | /* | 4452 | /* |
4516 | * Update reserved blocks/metadata blocks after successful | 4453 | * Reduce the reserved cluster count to reflect successful deferred |
4517 | * block allocation which had been deferred till now. | 4454 | * allocation of delayed allocated clusters or direct allocation of |
4455 | * clusters discovered to be delayed allocated. Once allocated, a | ||
4456 | * cluster is not included in the reserved count. | ||
4518 | */ | 4457 | */ |
4519 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { | 4458 | if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { |
4520 | unsigned int reserved_clusters; | 4459 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { |
4521 | /* | ||
4522 | * Check how many clusters we had reserved this allocated range | ||
4523 | */ | ||
4524 | reserved_clusters = get_reserved_cluster_alloc(inode, | ||
4525 | map->m_lblk, allocated); | ||
4526 | if (!map_from_cluster) { | ||
4527 | BUG_ON(allocated_clusters < reserved_clusters); | ||
4528 | if (reserved_clusters < allocated_clusters) { | ||
4529 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
4530 | int reservation = allocated_clusters - | ||
4531 | reserved_clusters; | ||
4532 | /* | ||
4533 | * It seems we claimed few clusters outside of | ||
4534 | * the range of this allocation. We should give | ||
4535 | * it back to the reservation pool. This can | ||
4536 | * happen in the following case: | ||
4537 | * | ||
4538 | * * Suppose s_cluster_ratio is 4 (i.e., each | ||
4539 | * cluster has 4 blocks. Thus, the clusters | ||
4540 | * are [0-3],[4-7],[8-11]... | ||
4541 | * * First comes delayed allocation write for | ||
4542 | * logical blocks 10 & 11. Since there were no | ||
4543 | * previous delayed allocated blocks in the | ||
4544 | * range [8-11], we would reserve 1 cluster | ||
4545 | * for this write. | ||
4546 | * * Next comes write for logical blocks 3 to 8. | ||
4547 | * In this case, we will reserve 2 clusters | ||
4548 | * (for [0-3] and [4-7]; and not for [8-11] as | ||
4549 | * that range has a delayed allocated blocks. | ||
4550 | * Thus total reserved clusters now becomes 3. | ||
4551 | * * Now, during the delayed allocation writeout | ||
4552 | * time, we will first write blocks [3-8] and | ||
4553 | * allocate 3 clusters for writing these | ||
4554 | * blocks. Also, we would claim all these | ||
4555 | * three clusters above. | ||
4556 | * * Now when we come here to writeout the | ||
4557 | * blocks [10-11], we would expect to claim | ||
4558 | * the reservation of 1 cluster we had made | ||
4559 | * (and we would claim it since there are no | ||
4560 | * more delayed allocated blocks in the range | ||
4561 | * [8-11]. But our reserved cluster count had | ||
4562 | * already gone to 0. | ||
4563 | * | ||
4564 | * Thus, at the step 4 above when we determine | ||
4565 | * that there are still some unwritten delayed | ||
4566 | * allocated blocks outside of our current | ||
4567 | * block range, we should increment the | ||
4568 | * reserved clusters count so that when the | ||
4569 | * remaining blocks finally gets written, we | ||
4570 | * could claim them. | ||
4571 | */ | ||
4572 | dquot_reserve_block(inode, | ||
4573 | EXT4_C2B(sbi, reservation)); | ||
4574 | spin_lock(&ei->i_block_reservation_lock); | ||
4575 | ei->i_reserved_data_blocks += reservation; | ||
4576 | spin_unlock(&ei->i_block_reservation_lock); | ||
4577 | } | ||
4578 | /* | 4460 | /* |
4579 | * We will claim quota for all newly allocated blocks. | 4461 | * When allocating delayed allocated clusters, simply |
4580 | * We're updating the reserved space *after* the | 4462 | * reduce the reserved cluster count and claim quota |
4581 | * correction above so we do not accidentally free | ||
4582 | * all the metadata reservation because we might | ||
4583 | * actually need it later on. | ||
4584 | */ | 4463 | */ |
4585 | ext4_da_update_reserve_space(inode, allocated_clusters, | 4464 | ext4_da_update_reserve_space(inode, allocated_clusters, |
4586 | 1); | 4465 | 1); |
4466 | } else { | ||
4467 | ext4_lblk_t lblk, len; | ||
4468 | unsigned int n; | ||
4469 | |||
4470 | /* | ||
4471 | * When allocating non-delayed allocated clusters | ||
4472 | * (from fallocate, filemap, DIO, or clusters | ||
4473 | * allocated when delalloc has been disabled by | ||
4474 | * ext4_nonda_switch), reduce the reserved cluster | ||
4475 | * count by the number of allocated clusters that | ||
4476 | * have previously been delayed allocated. Quota | ||
4477 | * has been claimed by ext4_mb_new_blocks() above, | ||
4478 | * so release the quota reservations made for any | ||
4479 | * previously delayed allocated clusters. | ||
4480 | */ | ||
4481 | lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); | ||
4482 | len = allocated_clusters << sbi->s_cluster_bits; | ||
4483 | n = ext4_es_delayed_clu(inode, lblk, len); | ||
4484 | if (n > 0) | ||
4485 | ext4_da_update_reserve_space(inode, (int) n, 0); | ||
4587 | } | 4486 | } |
4588 | } | 4487 | } |
4589 | 4488 | ||
@@ -5075,8 +4974,10 @@ static int ext4_find_delayed_extent(struct inode *inode, | |||
5075 | ext4_lblk_t block, next_del; | 4974 | ext4_lblk_t block, next_del; |
5076 | 4975 | ||
5077 | if (newes->es_pblk == 0) { | 4976 | if (newes->es_pblk == 0) { |
5078 | ext4_es_find_delayed_extent_range(inode, newes->es_lblk, | 4977 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, |
5079 | newes->es_lblk + newes->es_len - 1, &es); | 4978 | newes->es_lblk, |
4979 | newes->es_lblk + newes->es_len - 1, | ||
4980 | &es); | ||
5080 | 4981 | ||
5081 | /* | 4982 | /* |
5082 | * No extent in extent-tree contains block @newes->es_pblk, | 4983 | * No extent in extent-tree contains block @newes->es_pblk, |
@@ -5097,7 +4998,8 @@ static int ext4_find_delayed_extent(struct inode *inode, | |||
5097 | } | 4998 | } |
5098 | 4999 | ||
5099 | block = newes->es_lblk + newes->es_len; | 5000 | block = newes->es_lblk + newes->es_len; |
5100 | ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); | 5001 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, |
5002 | EXT_MAX_BLOCKS, &es); | ||
5101 | if (es.es_len == 0) | 5003 | if (es.es_len == 0) |
5102 | next_del = EXT_MAX_BLOCKS; | 5004 | next_del = EXT_MAX_BLOCKS; |
5103 | else | 5005 | else |
@@ -5958,3 +5860,82 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, | |||
5958 | } | 5860 | } |
5959 | return replaced_count; | 5861 | return replaced_count; |
5960 | } | 5862 | } |
5863 | |||
5864 | /* | ||
5865 | * ext4_clu_mapped - determine whether any block in a logical cluster has | ||
5866 | * been mapped to a physical cluster | ||
5867 | * | ||
5868 | * @inode - file containing the logical cluster | ||
5869 | * @lclu - logical cluster of interest | ||
5870 | * | ||
5871 | * Returns 1 if any block in the logical cluster is mapped, signifying | ||
5872 | * that a physical cluster has been allocated for it. Otherwise, | ||
5873 | * returns 0. Can also return negative error codes. Derived from | ||
5874 | * ext4_ext_map_blocks(). | ||
5875 | */ | ||
5876 | int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) | ||
5877 | { | ||
5878 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
5879 | struct ext4_ext_path *path; | ||
5880 | int depth, mapped = 0, err = 0; | ||
5881 | struct ext4_extent *extent; | ||
5882 | ext4_lblk_t first_lblk, first_lclu, last_lclu; | ||
5883 | |||
5884 | /* search for the extent closest to the first block in the cluster */ | ||
5885 | path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); | ||
5886 | if (IS_ERR(path)) { | ||
5887 | err = PTR_ERR(path); | ||
5888 | path = NULL; | ||
5889 | goto out; | ||
5890 | } | ||
5891 | |||
5892 | depth = ext_depth(inode); | ||
5893 | |||
5894 | /* | ||
5895 | * A consistent leaf must not be empty. This situation is possible, | ||
5896 | * though, _during_ tree modification, and it's why an assert can't | ||
5897 | * be put in ext4_find_extent(). | ||
5898 | */ | ||
5899 | if (unlikely(path[depth].p_ext == NULL && depth != 0)) { | ||
5900 | EXT4_ERROR_INODE(inode, | ||
5901 | "bad extent address - lblock: %lu, depth: %d, pblock: %lld", | ||
5902 | (unsigned long) EXT4_C2B(sbi, lclu), | ||
5903 | depth, path[depth].p_block); | ||
5904 | err = -EFSCORRUPTED; | ||
5905 | goto out; | ||
5906 | } | ||
5907 | |||
5908 | extent = path[depth].p_ext; | ||
5909 | |||
5910 | /* can't be mapped if the extent tree is empty */ | ||
5911 | if (extent == NULL) | ||
5912 | goto out; | ||
5913 | |||
5914 | first_lblk = le32_to_cpu(extent->ee_block); | ||
5915 | first_lclu = EXT4_B2C(sbi, first_lblk); | ||
5916 | |||
5917 | /* | ||
5918 | * Three possible outcomes at this point - found extent spanning | ||
5919 | * the target cluster, to the left of the target cluster, or to the | ||
5920 | * right of the target cluster. The first two cases are handled here. | ||
5921 | * The last case indicates the target cluster is not mapped. | ||
5922 | */ | ||
5923 | if (lclu >= first_lclu) { | ||
5924 | last_lclu = EXT4_B2C(sbi, first_lblk + | ||
5925 | ext4_ext_get_actual_len(extent) - 1); | ||
5926 | if (lclu <= last_lclu) { | ||
5927 | mapped = 1; | ||
5928 | } else { | ||
5929 | first_lblk = ext4_ext_next_allocated_block(path); | ||
5930 | first_lclu = EXT4_B2C(sbi, first_lblk); | ||
5931 | if (lclu == first_lclu) | ||
5932 | mapped = 1; | ||
5933 | } | ||
5934 | } | ||
5935 | |||
5936 | out: | ||
5937 | ext4_ext_drop_refs(path); | ||
5938 | kfree(path); | ||
5939 | |||
5940 | return err ? err : mapped; | ||
5941 | } | ||
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c4e6fb15101b..2b439afafe13 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c | |||
@@ -142,6 +142,7 @@ | |||
142 | */ | 142 | */ |
143 | 143 | ||
144 | static struct kmem_cache *ext4_es_cachep; | 144 | static struct kmem_cache *ext4_es_cachep; |
145 | static struct kmem_cache *ext4_pending_cachep; | ||
145 | 146 | ||
146 | static int __es_insert_extent(struct inode *inode, struct extent_status *newes); | 147 | static int __es_insert_extent(struct inode *inode, struct extent_status *newes); |
147 | static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, | 148 | static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, |
@@ -149,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, | |||
149 | static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); | 150 | static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); |
150 | static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, | 151 | static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, |
151 | struct ext4_inode_info *locked_ei); | 152 | struct ext4_inode_info *locked_ei); |
153 | static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, | ||
154 | ext4_lblk_t len); | ||
152 | 155 | ||
153 | int __init ext4_init_es(void) | 156 | int __init ext4_init_es(void) |
154 | { | 157 | { |
@@ -233,30 +236,38 @@ static struct extent_status *__es_tree_search(struct rb_root *root, | |||
233 | } | 236 | } |
234 | 237 | ||
235 | /* | 238 | /* |
236 | * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering | 239 | * ext4_es_find_extent_range - find extent with specified status within block |
237 | * @es->lblk if it exists, otherwise, the next extent after @es->lblk. | 240 | * range or next extent following block range in |
241 | * extents status tree | ||
238 | * | 242 | * |
239 | * @inode: the inode which owns delayed extents | 243 | * @inode - file containing the range |
240 | * @lblk: the offset where we start to search | 244 | * @matching_fn - pointer to function that matches extents with desired status |
241 | * @end: the offset where we stop to search | 245 | * @lblk - logical block defining start of range |
242 | * @es: delayed extent that we found | 246 | * @end - logical block defining end of range |
247 | * @es - extent found, if any | ||
248 | * | ||
249 | * Find the first extent within the block range specified by @lblk and @end | ||
250 | * in the extents status tree that satisfies @matching_fn. If a match | ||
251 | * is found, it's returned in @es. If not, and a matching extent is found | ||
252 | * beyond the block range, it's returned in @es. If no match is found, an | ||
253 | * extent is returned in @es whose es_lblk, es_len, and es_pblk components | ||
254 | * are 0. | ||
243 | */ | 255 | */ |
244 | void ext4_es_find_delayed_extent_range(struct inode *inode, | 256 | static void __es_find_extent_range(struct inode *inode, |
245 | ext4_lblk_t lblk, ext4_lblk_t end, | 257 | int (*matching_fn)(struct extent_status *es), |
246 | struct extent_status *es) | 258 | ext4_lblk_t lblk, ext4_lblk_t end, |
259 | struct extent_status *es) | ||
247 | { | 260 | { |
248 | struct ext4_es_tree *tree = NULL; | 261 | struct ext4_es_tree *tree = NULL; |
249 | struct extent_status *es1 = NULL; | 262 | struct extent_status *es1 = NULL; |
250 | struct rb_node *node; | 263 | struct rb_node *node; |
251 | 264 | ||
252 | BUG_ON(es == NULL); | 265 | WARN_ON(es == NULL); |
253 | BUG_ON(end < lblk); | 266 | WARN_ON(end < lblk); |
254 | trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); | ||
255 | 267 | ||
256 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
257 | tree = &EXT4_I(inode)->i_es_tree; | 268 | tree = &EXT4_I(inode)->i_es_tree; |
258 | 269 | ||
259 | /* find extent in cache firstly */ | 270 | /* see if the extent has been cached */ |
260 | es->es_lblk = es->es_len = es->es_pblk = 0; | 271 | es->es_lblk = es->es_len = es->es_pblk = 0; |
261 | if (tree->cache_es) { | 272 | if (tree->cache_es) { |
262 | es1 = tree->cache_es; | 273 | es1 = tree->cache_es; |
@@ -271,28 +282,133 @@ void ext4_es_find_delayed_extent_range(struct inode *inode, | |||
271 | es1 = __es_tree_search(&tree->root, lblk); | 282 | es1 = __es_tree_search(&tree->root, lblk); |
272 | 283 | ||
273 | out: | 284 | out: |
274 | if (es1 && !ext4_es_is_delayed(es1)) { | 285 | if (es1 && !matching_fn(es1)) { |
275 | while ((node = rb_next(&es1->rb_node)) != NULL) { | 286 | while ((node = rb_next(&es1->rb_node)) != NULL) { |
276 | es1 = rb_entry(node, struct extent_status, rb_node); | 287 | es1 = rb_entry(node, struct extent_status, rb_node); |
277 | if (es1->es_lblk > end) { | 288 | if (es1->es_lblk > end) { |
278 | es1 = NULL; | 289 | es1 = NULL; |
279 | break; | 290 | break; |
280 | } | 291 | } |
281 | if (ext4_es_is_delayed(es1)) | 292 | if (matching_fn(es1)) |
282 | break; | 293 | break; |
283 | } | 294 | } |
284 | } | 295 | } |
285 | 296 | ||
286 | if (es1 && ext4_es_is_delayed(es1)) { | 297 | if (es1 && matching_fn(es1)) { |
287 | tree->cache_es = es1; | 298 | tree->cache_es = es1; |
288 | es->es_lblk = es1->es_lblk; | 299 | es->es_lblk = es1->es_lblk; |
289 | es->es_len = es1->es_len; | 300 | es->es_len = es1->es_len; |
290 | es->es_pblk = es1->es_pblk; | 301 | es->es_pblk = es1->es_pblk; |
291 | } | 302 | } |
292 | 303 | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * Locking for __es_find_extent_range() for external use | ||
308 | */ | ||
309 | void ext4_es_find_extent_range(struct inode *inode, | ||
310 | int (*matching_fn)(struct extent_status *es), | ||
311 | ext4_lblk_t lblk, ext4_lblk_t end, | ||
312 | struct extent_status *es) | ||
313 | { | ||
314 | trace_ext4_es_find_extent_range_enter(inode, lblk); | ||
315 | |||
316 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
317 | __es_find_extent_range(inode, matching_fn, lblk, end, es); | ||
318 | read_unlock(&EXT4_I(inode)->i_es_lock); | ||
319 | |||
320 | trace_ext4_es_find_extent_range_exit(inode, es); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * __es_scan_range - search block range for block with specified status | ||
325 | * in extents status tree | ||
326 | * | ||
327 | * @inode - file containing the range | ||
328 | * @matching_fn - pointer to function that matches extents with desired status | ||
329 | * @start - logical block defining start of range | ||
330 | * @end - logical block defining end of range | ||
331 | * | ||
332 | * Returns true if at least one block in the specified block range satisfies | ||
333 | * the criterion specified by @matching_fn, and false if not. If at least | ||
334 | * one extent has the specified status, then there is at least one block | ||
335 | * in the range with that status. Should only be called by code that has | ||
336 | * taken i_es_lock. | ||
337 | */ | ||
338 | static bool __es_scan_range(struct inode *inode, | ||
339 | int (*matching_fn)(struct extent_status *es), | ||
340 | ext4_lblk_t start, ext4_lblk_t end) | ||
341 | { | ||
342 | struct extent_status es; | ||
343 | |||
344 | __es_find_extent_range(inode, matching_fn, start, end, &es); | ||
345 | if (es.es_len == 0) | ||
346 | return false; /* no matching extent in the tree */ | ||
347 | else if (es.es_lblk <= start && | ||
348 | start < es.es_lblk + es.es_len) | ||
349 | return true; | ||
350 | else if (start <= es.es_lblk && es.es_lblk <= end) | ||
351 | return true; | ||
352 | else | ||
353 | return false; | ||
354 | } | ||
355 | /* | ||
356 | * Locking for __es_scan_range() for external use | ||
357 | */ | ||
358 | bool ext4_es_scan_range(struct inode *inode, | ||
359 | int (*matching_fn)(struct extent_status *es), | ||
360 | ext4_lblk_t lblk, ext4_lblk_t end) | ||
361 | { | ||
362 | bool ret; | ||
363 | |||
364 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
365 | ret = __es_scan_range(inode, matching_fn, lblk, end); | ||
293 | read_unlock(&EXT4_I(inode)->i_es_lock); | 366 | read_unlock(&EXT4_I(inode)->i_es_lock); |
294 | 367 | ||
295 | trace_ext4_es_find_delayed_extent_range_exit(inode, es); | 368 | return ret; |
369 | } | ||
370 | |||
371 | /* | ||
372 | * __es_scan_clu - search cluster for block with specified status in | ||
373 | * extents status tree | ||
374 | * | ||
375 | * @inode - file containing the cluster | ||
376 | * @matching_fn - pointer to function that matches extents with desired status | ||
377 | * @lblk - logical block in cluster to be searched | ||
378 | * | ||
379 | * Returns true if at least one extent in the cluster containing @lblk | ||
380 | * satisfies the criterion specified by @matching_fn, and false if not. If at | ||
381 | * least one extent has the specified status, then there is at least one block | ||
382 | * in the cluster with that status. Should only be called by code that has | ||
383 | * taken i_es_lock. | ||
384 | */ | ||
385 | static bool __es_scan_clu(struct inode *inode, | ||
386 | int (*matching_fn)(struct extent_status *es), | ||
387 | ext4_lblk_t lblk) | ||
388 | { | ||
389 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
390 | ext4_lblk_t lblk_start, lblk_end; | ||
391 | |||
392 | lblk_start = EXT4_LBLK_CMASK(sbi, lblk); | ||
393 | lblk_end = lblk_start + sbi->s_cluster_ratio - 1; | ||
394 | |||
395 | return __es_scan_range(inode, matching_fn, lblk_start, lblk_end); | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Locking for __es_scan_clu() for external use | ||
400 | */ | ||
401 | bool ext4_es_scan_clu(struct inode *inode, | ||
402 | int (*matching_fn)(struct extent_status *es), | ||
403 | ext4_lblk_t lblk) | ||
404 | { | ||
405 | bool ret; | ||
406 | |||
407 | read_lock(&EXT4_I(inode)->i_es_lock); | ||
408 | ret = __es_scan_clu(inode, matching_fn, lblk); | ||
409 | read_unlock(&EXT4_I(inode)->i_es_lock); | ||
410 | |||
411 | return ret; | ||
296 | } | 412 | } |
297 | 413 | ||
298 | static void ext4_es_list_add(struct inode *inode) | 414 | static void ext4_es_list_add(struct inode *inode) |
@@ -694,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, | |||
694 | struct extent_status newes; | 810 | struct extent_status newes; |
695 | ext4_lblk_t end = lblk + len - 1; | 811 | ext4_lblk_t end = lblk + len - 1; |
696 | int err = 0; | 812 | int err = 0; |
813 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
697 | 814 | ||
698 | es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", | 815 | es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", |
699 | lblk, len, pblk, status, inode->i_ino); | 816 | lblk, len, pblk, status, inode->i_ino); |
@@ -730,6 +847,11 @@ retry: | |||
730 | if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) | 847 | if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) |
731 | err = 0; | 848 | err = 0; |
732 | 849 | ||
850 | if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && | ||
851 | (status & EXTENT_STATUS_WRITTEN || | ||
852 | status & EXTENT_STATUS_UNWRITTEN)) | ||
853 | __revise_pending(inode, lblk, len); | ||
854 | |||
733 | error: | 855 | error: |
734 | write_unlock(&EXT4_I(inode)->i_es_lock); | 856 | write_unlock(&EXT4_I(inode)->i_es_lock); |
735 | 857 | ||
@@ -1252,3 +1374,499 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) | |||
1252 | ei->i_es_tree.cache_es = NULL; | 1374 | ei->i_es_tree.cache_es = NULL; |
1253 | return nr_shrunk; | 1375 | return nr_shrunk; |
1254 | } | 1376 | } |
1377 | |||
1378 | #ifdef ES_DEBUG__ | ||
1379 | static void ext4_print_pending_tree(struct inode *inode) | ||
1380 | { | ||
1381 | struct ext4_pending_tree *tree; | ||
1382 | struct rb_node *node; | ||
1383 | struct pending_reservation *pr; | ||
1384 | |||
1385 | printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); | ||
1386 | tree = &EXT4_I(inode)->i_pending_tree; | ||
1387 | node = rb_first(&tree->root); | ||
1388 | while (node) { | ||
1389 | pr = rb_entry(node, struct pending_reservation, rb_node); | ||
1390 | printk(KERN_DEBUG " %u", pr->lclu); | ||
1391 | node = rb_next(node); | ||
1392 | } | ||
1393 | printk(KERN_DEBUG "\n"); | ||
1394 | } | ||
1395 | #else | ||
1396 | #define ext4_print_pending_tree(inode) | ||
1397 | #endif | ||
1398 | |||
1399 | int __init ext4_init_pending(void) | ||
1400 | { | ||
1401 | ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation", | ||
1402 | sizeof(struct pending_reservation), | ||
1403 | 0, (SLAB_RECLAIM_ACCOUNT), NULL); | ||
1404 | if (ext4_pending_cachep == NULL) | ||
1405 | return -ENOMEM; | ||
1406 | return 0; | ||
1407 | } | ||
1408 | |||
1409 | void ext4_exit_pending(void) | ||
1410 | { | ||
1411 | kmem_cache_destroy(ext4_pending_cachep); | ||
1412 | } | ||
1413 | |||
1414 | void ext4_init_pending_tree(struct ext4_pending_tree *tree) | ||
1415 | { | ||
1416 | tree->root = RB_ROOT; | ||
1417 | } | ||
1418 | |||
1419 | /* | ||
1420 | * __get_pending - retrieve a pointer to a pending reservation | ||
1421 | * | ||
1422 | * @inode - file containing the pending cluster reservation | ||
1423 | * @lclu - logical cluster of interest | ||
1424 | * | ||
1425 | * Returns a pointer to a pending reservation if it's a member of | ||
1426 | * the set, and NULL if not. Must be called holding i_es_lock. | ||
1427 | */ | ||
1428 | static struct pending_reservation *__get_pending(struct inode *inode, | ||
1429 | ext4_lblk_t lclu) | ||
1430 | { | ||
1431 | struct ext4_pending_tree *tree; | ||
1432 | struct rb_node *node; | ||
1433 | struct pending_reservation *pr = NULL; | ||
1434 | |||
1435 | tree = &EXT4_I(inode)->i_pending_tree; | ||
1436 | node = (&tree->root)->rb_node; | ||
1437 | |||
1438 | while (node) { | ||
1439 | pr = rb_entry(node, struct pending_reservation, rb_node); | ||
1440 | if (lclu < pr->lclu) | ||
1441 | node = node->rb_left; | ||
1442 | else if (lclu > pr->lclu) | ||
1443 | node = node->rb_right; | ||
1444 | else if (lclu == pr->lclu) | ||
1445 | return pr; | ||
1446 | } | ||
1447 | return NULL; | ||
1448 | } | ||
1449 | |||
1450 | /* | ||
1451 | * __insert_pending - adds a pending cluster reservation to the set of | ||
1452 | * pending reservations | ||
1453 | * | ||
1454 | * @inode - file containing the cluster | ||
1455 | * @lblk - logical block in the cluster to be added | ||
1456 | * | ||
1457 | * Returns 0 on successful insertion and -ENOMEM on failure. If the | ||
1458 | * pending reservation is already in the set, returns successfully. | ||
1459 | */ | ||
1460 | static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) | ||
1461 | { | ||
1462 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1463 | struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; | ||
1464 | struct rb_node **p = &tree->root.rb_node; | ||
1465 | struct rb_node *parent = NULL; | ||
1466 | struct pending_reservation *pr; | ||
1467 | ext4_lblk_t lclu; | ||
1468 | int ret = 0; | ||
1469 | |||
1470 | lclu = EXT4_B2C(sbi, lblk); | ||
1471 | /* search to find parent for insertion */ | ||
1472 | while (*p) { | ||
1473 | parent = *p; | ||
1474 | pr = rb_entry(parent, struct pending_reservation, rb_node); | ||
1475 | |||
1476 | if (lclu < pr->lclu) { | ||
1477 | p = &(*p)->rb_left; | ||
1478 | } else if (lclu > pr->lclu) { | ||
1479 | p = &(*p)->rb_right; | ||
1480 | } else { | ||
1481 | /* pending reservation already inserted */ | ||
1482 | goto out; | ||
1483 | } | ||
1484 | } | ||
1485 | |||
1486 | pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); | ||
1487 | if (pr == NULL) { | ||
1488 | ret = -ENOMEM; | ||
1489 | goto out; | ||
1490 | } | ||
1491 | pr->lclu = lclu; | ||
1492 | |||
1493 | rb_link_node(&pr->rb_node, parent, p); | ||
1494 | rb_insert_color(&pr->rb_node, &tree->root); | ||
1495 | |||
1496 | out: | ||
1497 | return ret; | ||
1498 | } | ||
1499 | |||
1500 | /* | ||
1501 | * __remove_pending - removes a pending cluster reservation from the set | ||
1502 | * of pending reservations | ||
1503 | * | ||
1504 | * @inode - file containing the cluster | ||
1505 | * @lblk - logical block in the pending cluster reservation to be removed | ||
1506 | * | ||
1507 | * Returns successfully if pending reservation is not a member of the set. | ||
1508 | */ | ||
1509 | static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) | ||
1510 | { | ||
1511 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1512 | struct pending_reservation *pr; | ||
1513 | struct ext4_pending_tree *tree; | ||
1514 | |||
1515 | pr = __get_pending(inode, EXT4_B2C(sbi, lblk)); | ||
1516 | if (pr != NULL) { | ||
1517 | tree = &EXT4_I(inode)->i_pending_tree; | ||
1518 | rb_erase(&pr->rb_node, &tree->root); | ||
1519 | kmem_cache_free(ext4_pending_cachep, pr); | ||
1520 | } | ||
1521 | } | ||
1522 | |||
1523 | /* | ||
1524 | * ext4_remove_pending - removes a pending cluster reservation from the set | ||
1525 | * of pending reservations | ||
1526 | * | ||
1527 | * @inode - file containing the cluster | ||
1528 | * @lblk - logical block in the pending cluster reservation to be removed | ||
1529 | * | ||
1530 | * Locking for external use of __remove_pending. | ||
1531 | */ | ||
1532 | void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk) | ||
1533 | { | ||
1534 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1535 | |||
1536 | write_lock(&ei->i_es_lock); | ||
1537 | __remove_pending(inode, lblk); | ||
1538 | write_unlock(&ei->i_es_lock); | ||
1539 | } | ||
1540 | |||
1541 | /* | ||
1542 | * ext4_is_pending - determine whether a cluster has a pending reservation | ||
1543 | * on it | ||
1544 | * | ||
1545 | * @inode - file containing the cluster | ||
1546 | * @lblk - logical block in the cluster | ||
1547 | * | ||
1548 | * Returns true if there's a pending reservation for the cluster in the | ||
1549 | * set of pending reservations, and false if not. | ||
1550 | */ | ||
1551 | bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) | ||
1552 | { | ||
1553 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1554 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1555 | bool ret; | ||
1556 | |||
1557 | read_lock(&ei->i_es_lock); | ||
1558 | ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL); | ||
1559 | read_unlock(&ei->i_es_lock); | ||
1560 | |||
1561 | return ret; | ||
1562 | } | ||
1563 | |||
1564 | /* | ||
1565 | * ext4_es_insert_delayed_block - adds a delayed block to the extents status | ||
1566 | * tree, adding a pending reservation where | ||
1567 | * needed | ||
1568 | * | ||
1569 | * @inode - file containing the newly added block | ||
1570 | * @lblk - logical block to be added | ||
1571 | * @allocated - indicates whether a physical cluster has been allocated for | ||
1572 | * the logical cluster that contains the block | ||
1573 | * | ||
1574 | * Returns 0 on success, negative error code on failure. | ||
1575 | */ | ||
1576 | int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, | ||
1577 | bool allocated) | ||
1578 | { | ||
1579 | struct extent_status newes; | ||
1580 | int err = 0; | ||
1581 | |||
1582 | es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", | ||
1583 | lblk, inode->i_ino); | ||
1584 | |||
1585 | newes.es_lblk = lblk; | ||
1586 | newes.es_len = 1; | ||
1587 | ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); | ||
1588 | trace_ext4_es_insert_delayed_block(inode, &newes, allocated); | ||
1589 | |||
1590 | ext4_es_insert_extent_check(inode, &newes); | ||
1591 | |||
1592 | write_lock(&EXT4_I(inode)->i_es_lock); | ||
1593 | |||
1594 | err = __es_remove_extent(inode, lblk, lblk); | ||
1595 | if (err != 0) | ||
1596 | goto error; | ||
1597 | retry: | ||
1598 | err = __es_insert_extent(inode, &newes); | ||
1599 | if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), | ||
1600 | 128, EXT4_I(inode))) | ||
1601 | goto retry; | ||
1602 | if (err != 0) | ||
1603 | goto error; | ||
1604 | |||
1605 | if (allocated) | ||
1606 | __insert_pending(inode, lblk); | ||
1607 | |||
1608 | error: | ||
1609 | write_unlock(&EXT4_I(inode)->i_es_lock); | ||
1610 | |||
1611 | ext4_es_print_tree(inode); | ||
1612 | ext4_print_pending_tree(inode); | ||
1613 | |||
1614 | return err; | ||
1615 | } | ||
1616 | |||
1617 | /* | ||
1618 | * __es_delayed_clu - count number of clusters containing blocks that | ||
1619 | * are delayed only | ||
1620 | * | ||
1621 | * @inode - file containing block range | ||
1622 | * @start - logical block defining start of range | ||
1623 | * @end - logical block defining end of range | ||
1624 | * | ||
1625 | * Returns the number of clusters containing only delayed (not delayed | ||
1626 | * and unwritten) blocks in the range specified by @start and @end. Any | ||
1627 | * cluster or part of a cluster within the range and containing a delayed | ||
1628 | * and not unwritten block within the range is counted as a whole cluster. | ||
1629 | */ | ||
1630 | static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, | ||
1631 | ext4_lblk_t end) | ||
1632 | { | ||
1633 | struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; | ||
1634 | struct extent_status *es; | ||
1635 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1636 | struct rb_node *node; | ||
1637 | ext4_lblk_t first_lclu, last_lclu; | ||
1638 | unsigned long long last_counted_lclu; | ||
1639 | unsigned int n = 0; | ||
1640 | |||
1641 | /* guaranteed to be unequal to any ext4_lblk_t value */ | ||
1642 | last_counted_lclu = ~0ULL; | ||
1643 | |||
1644 | es = __es_tree_search(&tree->root, start); | ||
1645 | |||
1646 | while (es && (es->es_lblk <= end)) { | ||
1647 | if (ext4_es_is_delonly(es)) { | ||
1648 | if (es->es_lblk <= start) | ||
1649 | first_lclu = EXT4_B2C(sbi, start); | ||
1650 | else | ||
1651 | first_lclu = EXT4_B2C(sbi, es->es_lblk); | ||
1652 | |||
1653 | if (ext4_es_end(es) >= end) | ||
1654 | last_lclu = EXT4_B2C(sbi, end); | ||
1655 | else | ||
1656 | last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); | ||
1657 | |||
1658 | if (first_lclu == last_counted_lclu) | ||
1659 | n += last_lclu - first_lclu; | ||
1660 | else | ||
1661 | n += last_lclu - first_lclu + 1; | ||
1662 | last_counted_lclu = last_lclu; | ||
1663 | } | ||
1664 | node = rb_next(&es->rb_node); | ||
1665 | if (!node) | ||
1666 | break; | ||
1667 | es = rb_entry(node, struct extent_status, rb_node); | ||
1668 | } | ||
1669 | |||
1670 | return n; | ||
1671 | } | ||
1672 | |||
1673 | /* | ||
1674 | * ext4_es_delayed_clu - count number of clusters containing blocks that | ||
1675 | * are delayed only (delayed and not unwritten) | ||
1676 | * | ||
1677 | * @inode - file containing block range | ||
1678 | * @lblk - logical block defining start of range | ||
1679 | * @len - number of blocks in range | ||
1680 | * | ||
1681 | * Locking for external use of __es_delayed_clu(). | ||
1682 | */ | ||
1683 | unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, | ||
1684 | ext4_lblk_t len) | ||
1685 | { | ||
1686 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1687 | ext4_lblk_t end; | ||
1688 | unsigned int n; | ||
1689 | |||
1690 | if (len == 0) | ||
1691 | return 0; | ||
1692 | |||
1693 | end = lblk + len - 1; | ||
1694 | WARN_ON(end < lblk); | ||
1695 | |||
1696 | read_lock(&ei->i_es_lock); | ||
1697 | |||
1698 | n = __es_delayed_clu(inode, lblk, end); | ||
1699 | |||
1700 | read_unlock(&ei->i_es_lock); | ||
1701 | |||
1702 | return n; | ||
1703 | } | ||
1704 | |||
1705 | /* | ||
1706 | * __revise_pending - makes, cancels, or leaves unchanged pending cluster | ||
1707 | * reservations for a specified block range depending | ||
1708 | * upon the presence or absence of delayed blocks | ||
1709 | * outside the range within clusters at the ends of the | ||
1710 | * range | ||
1711 | * | ||
1712 | * @inode - file containing the range | ||
1713 | * @lblk - logical block defining the start of range | ||
1714 | * @len - length of range in blocks | ||
1715 | * | ||
1716 | * Used after a newly allocated extent is added to the extents status tree. | ||
1717 | * Requires that the extents in the range have either written or unwritten | ||
1718 | * status. Must be called while holding i_es_lock. | ||
1719 | */ | ||
1720 | static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, | ||
1721 | ext4_lblk_t len) | ||
1722 | { | ||
1723 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1724 | ext4_lblk_t end = lblk + len - 1; | ||
1725 | ext4_lblk_t first, last; | ||
1726 | bool f_del = false, l_del = false; | ||
1727 | |||
1728 | if (len == 0) | ||
1729 | return; | ||
1730 | |||
1731 | /* | ||
1732 | * Two cases - block range within single cluster and block range | ||
1733 | * spanning two or more clusters. Note that a cluster belonging | ||
1734 | * to a range starting and/or ending on a cluster boundary is treated | ||
1735 | * as if it does not contain a delayed extent. The new range may | ||
1736 | * have allocated space for previously delayed blocks out to the | ||
1737 | * cluster boundary, requiring that any pre-existing pending | ||
1738 | * reservation be canceled. Because this code only looks at blocks | ||
1739 | * outside the range, it should revise pending reservations | ||
1740 | * correctly even if the extent represented by the range can't be | ||
1741 | * inserted in the extents status tree due to ENOSPC. | ||
1742 | */ | ||
1743 | |||
1744 | if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { | ||
1745 | first = EXT4_LBLK_CMASK(sbi, lblk); | ||
1746 | if (first != lblk) | ||
1747 | f_del = __es_scan_range(inode, &ext4_es_is_delonly, | ||
1748 | first, lblk - 1); | ||
1749 | if (f_del) { | ||
1750 | __insert_pending(inode, first); | ||
1751 | } else { | ||
1752 | last = EXT4_LBLK_CMASK(sbi, end) + | ||
1753 | sbi->s_cluster_ratio - 1; | ||
1754 | if (last != end) | ||
1755 | l_del = __es_scan_range(inode, | ||
1756 | &ext4_es_is_delonly, | ||
1757 | end + 1, last); | ||
1758 | if (l_del) | ||
1759 | __insert_pending(inode, last); | ||
1760 | else | ||
1761 | __remove_pending(inode, last); | ||
1762 | } | ||
1763 | } else { | ||
1764 | first = EXT4_LBLK_CMASK(sbi, lblk); | ||
1765 | if (first != lblk) | ||
1766 | f_del = __es_scan_range(inode, &ext4_es_is_delonly, | ||
1767 | first, lblk - 1); | ||
1768 | if (f_del) | ||
1769 | __insert_pending(inode, first); | ||
1770 | else | ||
1771 | __remove_pending(inode, first); | ||
1772 | |||
1773 | last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; | ||
1774 | if (last != end) | ||
1775 | l_del = __es_scan_range(inode, &ext4_es_is_delonly, | ||
1776 | end + 1, last); | ||
1777 | if (l_del) | ||
1778 | __insert_pending(inode, last); | ||
1779 | else | ||
1780 | __remove_pending(inode, last); | ||
1781 | } | ||
1782 | } | ||
1783 | |||
1784 | /* | ||
1785 | * ext4_es_remove_blks - remove block range from extents status tree and | ||
1786 | * reduce reservation count or cancel pending | ||
1787 | * reservation as needed | ||
1788 | * | ||
1789 | * @inode - file containing range | ||
1790 | * @lblk - first block in range | ||
1791 | * @len - number of blocks to remove | ||
1792 | * | ||
1793 | */ | ||
1794 | void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, | ||
1795 | ext4_lblk_t len) | ||
1796 | { | ||
1797 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1798 | unsigned int clu_size, reserved = 0; | ||
1799 | ext4_lblk_t last_lclu, first, length, remainder, last; | ||
1800 | bool delonly; | ||
1801 | int err = 0; | ||
1802 | struct pending_reservation *pr; | ||
1803 | struct ext4_pending_tree *tree; | ||
1804 | |||
1805 | /* | ||
1806 | * Process cluster by cluster for bigalloc - there may be up to | ||
1807 | * two clusters in a 4k page with a 1k block size and two blocks | ||
1808 | * per cluster. Also necessary for systems with larger page sizes | ||
1809 | * and potentially larger block sizes. | ||
1810 | */ | ||
1811 | clu_size = sbi->s_cluster_ratio; | ||
1812 | last_lclu = EXT4_B2C(sbi, lblk + len - 1); | ||
1813 | |||
1814 | write_lock(&EXT4_I(inode)->i_es_lock); | ||
1815 | |||
1816 | for (first = lblk, remainder = len; | ||
1817 | remainder > 0; | ||
1818 | first += length, remainder -= length) { | ||
1819 | |||
1820 | if (EXT4_B2C(sbi, first) == last_lclu) | ||
1821 | length = remainder; | ||
1822 | else | ||
1823 | length = clu_size - EXT4_LBLK_COFF(sbi, first); | ||
1824 | |||
1825 | /* | ||
1826 | * The BH_Delay flag, which triggers calls to this function, | ||
1827 | * and the contents of the extents status tree can be | ||
1828 | * inconsistent due to writepages activity. So, note whether | ||
1829 | * the blocks to be removed actually belong to an extent with | ||
1830 | * delayed only status. | ||
1831 | */ | ||
1832 | delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); | ||
1833 | |||
1834 | /* | ||
1835 | * because of the writepages effect, written and unwritten | ||
1836 | * blocks could be removed here | ||
1837 | */ | ||
1838 | last = first + length - 1; | ||
1839 | err = __es_remove_extent(inode, first, last); | ||
1840 | if (err) | ||
1841 | ext4_warning(inode->i_sb, | ||
1842 | "%s: couldn't remove page (err = %d)", | ||
1843 | __func__, err); | ||
1844 | |||
1845 | /* non-bigalloc case: simply count the cluster for release */ | ||
1846 | if (sbi->s_cluster_ratio == 1 && delonly) { | ||
1847 | reserved++; | ||
1848 | continue; | ||
1849 | } | ||
1850 | |||
1851 | /* | ||
1852 | * bigalloc case: if all delayed allocated only blocks have | ||
1853 | * just been removed from a cluster, either cancel a pending | ||
1854 | * reservation if it exists or count a cluster for release | ||
1855 | */ | ||
1856 | if (delonly && | ||
1857 | !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { | ||
1858 | pr = __get_pending(inode, EXT4_B2C(sbi, first)); | ||
1859 | if (pr != NULL) { | ||
1860 | tree = &EXT4_I(inode)->i_pending_tree; | ||
1861 | rb_erase(&pr->rb_node, &tree->root); | ||
1862 | kmem_cache_free(ext4_pending_cachep, pr); | ||
1863 | } else { | ||
1864 | reserved++; | ||
1865 | } | ||
1866 | } | ||
1867 | } | ||
1868 | |||
1869 | write_unlock(&EXT4_I(inode)->i_es_lock); | ||
1870 | |||
1871 | ext4_da_release_space(inode, reserved); | ||
1872 | } | ||
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8efdeb903d6b..131a8b7df265 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h | |||
@@ -78,6 +78,51 @@ struct ext4_es_stats { | |||
78 | struct percpu_counter es_stats_shk_cnt; | 78 | struct percpu_counter es_stats_shk_cnt; |
79 | }; | 79 | }; |
80 | 80 | ||
81 | /* | ||
82 | * Pending cluster reservations for bigalloc file systems | ||
83 | * | ||
84 | * A cluster with a pending reservation is a logical cluster shared by at | ||
85 | * least one extent in the extents status tree with delayed-only (delayed | ||
86 | * and not unwritten) status and at least one other written or unwritten | ||
87 | * extent. The reservation is said to be pending because a cluster | ||
88 | * reservation would have to be taken in the event all blocks in the | ||
89 | * cluster shared with written or unwritten extents were deleted while the | ||
90 | * delayed-only blocks remained. | ||
91 | * | ||
92 | * The set of pending cluster reservations is an auxiliary data structure | ||
93 | * used with the extents status tree to implement reserved cluster/block | ||
94 | * accounting for bigalloc file systems. The set is kept in memory and | ||
95 | * records all pending cluster reservations. | ||
96 | * | ||
97 | * Its primary function is to avoid the need to read extents from the | ||
98 | * disk when invalidating pages as a result of a truncate, punch hole, or | ||
99 | * collapse range operation. Page invalidation requires a decrease in the | ||
100 | * reserved cluster count if it results in the removal of all delayed-only | ||
101 | * (delayed and not unwritten) extents (blocks) from a cluster that is not | ||
102 | * shared with a written or unwritten extent, and no decrease otherwise. | ||
103 | * Determining whether the cluster is shared can be done by searching for | ||
104 | * a pending reservation on it. | ||
105 | * | ||
106 | * Secondarily, it provides a potentially faster method for determining | ||
107 | * whether the reserved cluster count should be increased when a physical | ||
108 | * cluster is deallocated as a result of a truncate, punch hole, or | ||
109 | * collapse range operation. The necessary information is also present | ||
110 | * in the extents status tree, but might be more rapidly accessed in | ||
111 | * the pending reservation set in many cases due to smaller size. | ||
112 | * | ||
113 | * The pending cluster reservation set is implemented as a red-black tree | ||
114 | * with the goal of minimizing per page search time overhead. | ||
115 | */ | ||
116 | |||
117 | struct pending_reservation { | ||
118 | struct rb_node rb_node; | ||
119 | ext4_lblk_t lclu; | ||
120 | }; | ||
121 | |||
122 | struct ext4_pending_tree { | ||
123 | struct rb_root root; | ||
124 | }; | ||
125 | |||
81 | extern int __init ext4_init_es(void); | 126 | extern int __init ext4_init_es(void); |
82 | extern void ext4_exit_es(void); | 127 | extern void ext4_exit_es(void); |
83 | extern void ext4_es_init_tree(struct ext4_es_tree *tree); | 128 | extern void ext4_es_init_tree(struct ext4_es_tree *tree); |
@@ -90,11 +135,18 @@ extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, | |||
90 | unsigned int status); | 135 | unsigned int status); |
91 | extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, | 136 | extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, |
92 | ext4_lblk_t len); | 137 | ext4_lblk_t len); |
93 | extern void ext4_es_find_delayed_extent_range(struct inode *inode, | 138 | extern void ext4_es_find_extent_range(struct inode *inode, |
94 | ext4_lblk_t lblk, ext4_lblk_t end, | 139 | int (*match_fn)(struct extent_status *es), |
95 | struct extent_status *es); | 140 | ext4_lblk_t lblk, ext4_lblk_t end, |
141 | struct extent_status *es); | ||
96 | extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, | 142 | extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, |
97 | struct extent_status *es); | 143 | struct extent_status *es); |
144 | extern bool ext4_es_scan_range(struct inode *inode, | ||
145 | int (*matching_fn)(struct extent_status *es), | ||
146 | ext4_lblk_t lblk, ext4_lblk_t end); | ||
147 | extern bool ext4_es_scan_clu(struct inode *inode, | ||
148 | int (*matching_fn)(struct extent_status *es), | ||
149 | ext4_lblk_t lblk); | ||
98 | 150 | ||
99 | static inline unsigned int ext4_es_status(struct extent_status *es) | 151 | static inline unsigned int ext4_es_status(struct extent_status *es) |
100 | { | 152 | { |
@@ -126,6 +178,16 @@ static inline int ext4_es_is_hole(struct extent_status *es) | |||
126 | return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; | 178 | return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; |
127 | } | 179 | } |
128 | 180 | ||
181 | static inline int ext4_es_is_mapped(struct extent_status *es) | ||
182 | { | ||
183 | return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); | ||
184 | } | ||
185 | |||
186 | static inline int ext4_es_is_delonly(struct extent_status *es) | ||
187 | { | ||
188 | return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); | ||
189 | } | ||
190 | |||
129 | static inline void ext4_es_set_referenced(struct extent_status *es) | 191 | static inline void ext4_es_set_referenced(struct extent_status *es) |
130 | { | 192 | { |
131 | es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; | 193 | es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; |
@@ -175,4 +237,16 @@ extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); | |||
175 | 237 | ||
176 | extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); | 238 | extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); |
177 | 239 | ||
240 | extern int __init ext4_init_pending(void); | ||
241 | extern void ext4_exit_pending(void); | ||
242 | extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); | ||
243 | extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); | ||
244 | extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); | ||
245 | extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, | ||
246 | bool allocated); | ||
247 | extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, | ||
248 | ext4_lblk_t len); | ||
249 | extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, | ||
250 | ext4_lblk_t len); | ||
251 | |||
178 | #endif /* _EXT4_EXTENTS_STATUS_H */ | 252 | #endif /* _EXT4_EXTENTS_STATUS_H */ |
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7b4736022761..9c4bac18cc6c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
@@ -863,7 +863,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, | |||
863 | handle_t *handle; | 863 | handle_t *handle; |
864 | struct page *page; | 864 | struct page *page; |
865 | struct ext4_iloc iloc; | 865 | struct ext4_iloc iloc; |
866 | int retries; | 866 | int retries = 0; |
867 | 867 | ||
868 | ret = ext4_get_inode_loc(inode, &iloc); | 868 | ret = ext4_get_inode_loc(inode, &iloc); |
869 | if (ret) | 869 | if (ret) |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d767e993591d..c3d9a42c561e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -577,8 +577,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
577 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; | 577 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
578 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && | 578 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
579 | !(status & EXTENT_STATUS_WRITTEN) && | 579 | !(status & EXTENT_STATUS_WRITTEN) && |
580 | ext4_find_delalloc_range(inode, map->m_lblk, | 580 | ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, |
581 | map->m_lblk + map->m_len - 1)) | 581 | map->m_lblk + map->m_len - 1)) |
582 | status |= EXTENT_STATUS_DELAYED; | 582 | status |= EXTENT_STATUS_DELAYED; |
583 | ret = ext4_es_insert_extent(inode, map->m_lblk, | 583 | ret = ext4_es_insert_extent(inode, map->m_lblk, |
584 | map->m_len, map->m_pblk, status); | 584 | map->m_len, map->m_pblk, status); |
@@ -701,8 +701,8 @@ found: | |||
701 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; | 701 | EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; |
702 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && | 702 | if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && |
703 | !(status & EXTENT_STATUS_WRITTEN) && | 703 | !(status & EXTENT_STATUS_WRITTEN) && |
704 | ext4_find_delalloc_range(inode, map->m_lblk, | 704 | ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, |
705 | map->m_lblk + map->m_len - 1)) | 705 | map->m_lblk + map->m_len - 1)) |
706 | status |= EXTENT_STATUS_DELAYED; | 706 | status |= EXTENT_STATUS_DELAYED; |
707 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, | 707 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, |
708 | map->m_pblk, status); | 708 | map->m_pblk, status); |
@@ -1595,7 +1595,7 @@ static int ext4_da_reserve_space(struct inode *inode) | |||
1595 | return 0; /* success */ | 1595 | return 0; /* success */ |
1596 | } | 1596 | } |
1597 | 1597 | ||
1598 | static void ext4_da_release_space(struct inode *inode, int to_free) | 1598 | void ext4_da_release_space(struct inode *inode, int to_free) |
1599 | { | 1599 | { |
1600 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1600 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1601 | struct ext4_inode_info *ei = EXT4_I(inode); | 1601 | struct ext4_inode_info *ei = EXT4_I(inode); |
@@ -1634,13 +1634,11 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1634 | unsigned int offset, | 1634 | unsigned int offset, |
1635 | unsigned int length) | 1635 | unsigned int length) |
1636 | { | 1636 | { |
1637 | int to_release = 0, contiguous_blks = 0; | 1637 | int contiguous_blks = 0; |
1638 | struct buffer_head *head, *bh; | 1638 | struct buffer_head *head, *bh; |
1639 | unsigned int curr_off = 0; | 1639 | unsigned int curr_off = 0; |
1640 | struct inode *inode = page->mapping->host; | 1640 | struct inode *inode = page->mapping->host; |
1641 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1642 | unsigned int stop = offset + length; | 1641 | unsigned int stop = offset + length; |
1643 | int num_clusters; | ||
1644 | ext4_fsblk_t lblk; | 1642 | ext4_fsblk_t lblk; |
1645 | 1643 | ||
1646 | BUG_ON(stop > PAGE_SIZE || stop < length); | 1644 | BUG_ON(stop > PAGE_SIZE || stop < length); |
@@ -1654,7 +1652,6 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1654 | break; | 1652 | break; |
1655 | 1653 | ||
1656 | if ((offset <= curr_off) && (buffer_delay(bh))) { | 1654 | if ((offset <= curr_off) && (buffer_delay(bh))) { |
1657 | to_release++; | ||
1658 | contiguous_blks++; | 1655 | contiguous_blks++; |
1659 | clear_buffer_delay(bh); | 1656 | clear_buffer_delay(bh); |
1660 | } else if (contiguous_blks) { | 1657 | } else if (contiguous_blks) { |
@@ -1662,7 +1659,7 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1662 | (PAGE_SHIFT - inode->i_blkbits); | 1659 | (PAGE_SHIFT - inode->i_blkbits); |
1663 | lblk += (curr_off >> inode->i_blkbits) - | 1660 | lblk += (curr_off >> inode->i_blkbits) - |
1664 | contiguous_blks; | 1661 | contiguous_blks; |
1665 | ext4_es_remove_extent(inode, lblk, contiguous_blks); | 1662 | ext4_es_remove_blks(inode, lblk, contiguous_blks); |
1666 | contiguous_blks = 0; | 1663 | contiguous_blks = 0; |
1667 | } | 1664 | } |
1668 | curr_off = next_off; | 1665 | curr_off = next_off; |
@@ -1671,21 +1668,9 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1671 | if (contiguous_blks) { | 1668 | if (contiguous_blks) { |
1672 | lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); | 1669 | lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); |
1673 | lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; | 1670 | lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; |
1674 | ext4_es_remove_extent(inode, lblk, contiguous_blks); | 1671 | ext4_es_remove_blks(inode, lblk, contiguous_blks); |
1675 | } | 1672 | } |
1676 | 1673 | ||
1677 | /* If we have released all the blocks belonging to a cluster, then we | ||
1678 | * need to release the reserved space for that cluster. */ | ||
1679 | num_clusters = EXT4_NUM_B2C(sbi, to_release); | ||
1680 | while (num_clusters > 0) { | ||
1681 | lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) + | ||
1682 | ((num_clusters - 1) << sbi->s_cluster_bits); | ||
1683 | if (sbi->s_cluster_ratio == 1 || | ||
1684 | !ext4_find_delalloc_cluster(inode, lblk)) | ||
1685 | ext4_da_release_space(inode, 1); | ||
1686 | |||
1687 | num_clusters--; | ||
1688 | } | ||
1689 | } | 1674 | } |
1690 | 1675 | ||
1691 | /* | 1676 | /* |
@@ -1781,6 +1766,65 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | |||
1781 | } | 1766 | } |
1782 | 1767 | ||
1783 | /* | 1768 | /* |
1769 | * ext4_insert_delayed_block - adds a delayed block to the extents status | ||
1770 | * tree, incrementing the reserved cluster/block | ||
1771 | * count or making a pending reservation | ||
1772 | * where needed | ||
1773 | * | ||
1774 | * @inode - file containing the newly added block | ||
1775 | * @lblk - logical block to be added | ||
1776 | * | ||
1777 | * Returns 0 on success, negative error code on failure. | ||
1778 | */ | ||
1779 | static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) | ||
1780 | { | ||
1781 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1782 | int ret; | ||
1783 | bool allocated = false; | ||
1784 | |||
1785 | /* | ||
1786 | * If the cluster containing lblk is shared with a delayed, | ||
1787 | * written, or unwritten extent in a bigalloc file system, it's | ||
1788 | * already been accounted for and does not need to be reserved. | ||
1789 | * A pending reservation must be made for the cluster if it's | ||
1790 | * shared with a written or unwritten extent and doesn't already | ||
1791 | * have one. Written and unwritten extents can be purged from the | ||
1792 | * extents status tree if the system is under memory pressure, so | ||
1793 | * it's necessary to examine the extent tree if a search of the | ||
1794 | * extents status tree doesn't get a match. | ||
1795 | */ | ||
1796 | if (sbi->s_cluster_ratio == 1) { | ||
1797 | ret = ext4_da_reserve_space(inode); | ||
1798 | if (ret != 0) /* ENOSPC */ | ||
1799 | goto errout; | ||
1800 | } else { /* bigalloc */ | ||
1801 | if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { | ||
1802 | if (!ext4_es_scan_clu(inode, | ||
1803 | &ext4_es_is_mapped, lblk)) { | ||
1804 | ret = ext4_clu_mapped(inode, | ||
1805 | EXT4_B2C(sbi, lblk)); | ||
1806 | if (ret < 0) | ||
1807 | goto errout; | ||
1808 | if (ret == 0) { | ||
1809 | ret = ext4_da_reserve_space(inode); | ||
1810 | if (ret != 0) /* ENOSPC */ | ||
1811 | goto errout; | ||
1812 | } else { | ||
1813 | allocated = true; | ||
1814 | } | ||
1815 | } else { | ||
1816 | allocated = true; | ||
1817 | } | ||
1818 | } | ||
1819 | } | ||
1820 | |||
1821 | ret = ext4_es_insert_delayed_block(inode, lblk, allocated); | ||
1822 | |||
1823 | errout: | ||
1824 | return ret; | ||
1825 | } | ||
1826 | |||
1827 | /* | ||
1784 | * This function grabs code from the very beginning of | 1828 | * This function grabs code from the very beginning of |
1785 | * ext4_map_blocks, but assumes that the caller is from delayed write | 1829 | * ext4_map_blocks, but assumes that the caller is from delayed write |
1786 | * time. This function looks up the requested blocks and sets the | 1830 | * time. This function looks up the requested blocks and sets the |
@@ -1859,28 +1903,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, | |||
1859 | add_delayed: | 1903 | add_delayed: |
1860 | if (retval == 0) { | 1904 | if (retval == 0) { |
1861 | int ret; | 1905 | int ret; |
1906 | |||
1862 | /* | 1907 | /* |
1863 | * XXX: __block_prepare_write() unmaps passed block, | 1908 | * XXX: __block_prepare_write() unmaps passed block, |
1864 | * is it OK? | 1909 | * is it OK? |
1865 | */ | 1910 | */ |
1866 | /* | ||
1867 | * If the block was allocated from previously allocated cluster, | ||
1868 | * then we don't need to reserve it again. However we still need | ||
1869 | * to reserve metadata for every block we're going to write. | ||
1870 | */ | ||
1871 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || | ||
1872 | !ext4_find_delalloc_cluster(inode, map->m_lblk)) { | ||
1873 | ret = ext4_da_reserve_space(inode); | ||
1874 | if (ret) { | ||
1875 | /* not enough space to reserve */ | ||
1876 | retval = ret; | ||
1877 | goto out_unlock; | ||
1878 | } | ||
1879 | } | ||
1880 | 1911 | ||
1881 | ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, | 1912 | ret = ext4_insert_delayed_block(inode, map->m_lblk); |
1882 | ~0, EXTENT_STATUS_DELAYED); | 1913 | if (ret != 0) { |
1883 | if (ret) { | ||
1884 | retval = ret; | 1914 | retval = ret; |
1885 | goto out_unlock; | 1915 | goto out_unlock; |
1886 | } | 1916 | } |
@@ -3450,7 +3480,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, | |||
3450 | ext4_lblk_t end = map.m_lblk + map.m_len - 1; | 3480 | ext4_lblk_t end = map.m_lblk + map.m_len - 1; |
3451 | struct extent_status es; | 3481 | struct extent_status es; |
3452 | 3482 | ||
3453 | ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es); | 3483 | ext4_es_find_extent_range(inode, &ext4_es_is_delayed, |
3484 | map.m_lblk, end, &es); | ||
3454 | 3485 | ||
3455 | if (!es.es_len || es.es_lblk > end) { | 3486 | if (!es.es_len || es.es_lblk > end) { |
3456 | /* entire range is a hole */ | 3487 | /* entire range is a hole */ |
@@ -6153,13 +6184,14 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) | |||
6153 | return !buffer_mapped(bh); | 6184 | return !buffer_mapped(bh); |
6154 | } | 6185 | } |
6155 | 6186 | ||
6156 | int ext4_page_mkwrite(struct vm_fault *vmf) | 6187 | vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) |
6157 | { | 6188 | { |
6158 | struct vm_area_struct *vma = vmf->vma; | 6189 | struct vm_area_struct *vma = vmf->vma; |
6159 | struct page *page = vmf->page; | 6190 | struct page *page = vmf->page; |
6160 | loff_t size; | 6191 | loff_t size; |
6161 | unsigned long len; | 6192 | unsigned long len; |
6162 | int ret; | 6193 | int err; |
6194 | vm_fault_t ret; | ||
6163 | struct file *file = vma->vm_file; | 6195 | struct file *file = vma->vm_file; |
6164 | struct inode *inode = file_inode(file); | 6196 | struct inode *inode = file_inode(file); |
6165 | struct address_space *mapping = inode->i_mapping; | 6197 | struct address_space *mapping = inode->i_mapping; |
@@ -6172,8 +6204,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf) | |||
6172 | 6204 | ||
6173 | down_read(&EXT4_I(inode)->i_mmap_sem); | 6205 | down_read(&EXT4_I(inode)->i_mmap_sem); |
6174 | 6206 | ||
6175 | ret = ext4_convert_inline_data(inode); | 6207 | err = ext4_convert_inline_data(inode); |
6176 | if (ret) | 6208 | if (err) |
6177 | goto out_ret; | 6209 | goto out_ret; |
6178 | 6210 | ||
6179 | /* Delalloc case is easy... */ | 6211 | /* Delalloc case is easy... */ |
@@ -6181,9 +6213,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf) | |||
6181 | !ext4_should_journal_data(inode) && | 6213 | !ext4_should_journal_data(inode) && |
6182 | !ext4_nonda_switch(inode->i_sb)) { | 6214 | !ext4_nonda_switch(inode->i_sb)) { |
6183 | do { | 6215 | do { |
6184 | ret = block_page_mkwrite(vma, vmf, | 6216 | err = block_page_mkwrite(vma, vmf, |
6185 | ext4_da_get_block_prep); | 6217 | ext4_da_get_block_prep); |
6186 | } while (ret == -ENOSPC && | 6218 | } while (err == -ENOSPC && |
6187 | ext4_should_retry_alloc(inode->i_sb, &retries)); | 6219 | ext4_should_retry_alloc(inode->i_sb, &retries)); |
6188 | goto out_ret; | 6220 | goto out_ret; |
6189 | } | 6221 | } |
@@ -6228,8 +6260,8 @@ retry_alloc: | |||
6228 | ret = VM_FAULT_SIGBUS; | 6260 | ret = VM_FAULT_SIGBUS; |
6229 | goto out; | 6261 | goto out; |
6230 | } | 6262 | } |
6231 | ret = block_page_mkwrite(vma, vmf, get_block); | 6263 | err = block_page_mkwrite(vma, vmf, get_block); |
6232 | if (!ret && ext4_should_journal_data(inode)) { | 6264 | if (!err && ext4_should_journal_data(inode)) { |
6233 | if (ext4_walk_page_buffers(handle, page_buffers(page), 0, | 6265 | if (ext4_walk_page_buffers(handle, page_buffers(page), 0, |
6234 | PAGE_SIZE, NULL, do_journal_get_write_access)) { | 6266 | PAGE_SIZE, NULL, do_journal_get_write_access)) { |
6235 | unlock_page(page); | 6267 | unlock_page(page); |
@@ -6240,24 +6272,24 @@ retry_alloc: | |||
6240 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 6272 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
6241 | } | 6273 | } |
6242 | ext4_journal_stop(handle); | 6274 | ext4_journal_stop(handle); |
6243 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 6275 | if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
6244 | goto retry_alloc; | 6276 | goto retry_alloc; |
6245 | out_ret: | 6277 | out_ret: |
6246 | ret = block_page_mkwrite_return(ret); | 6278 | ret = block_page_mkwrite_return(err); |
6247 | out: | 6279 | out: |
6248 | up_read(&EXT4_I(inode)->i_mmap_sem); | 6280 | up_read(&EXT4_I(inode)->i_mmap_sem); |
6249 | sb_end_pagefault(inode->i_sb); | 6281 | sb_end_pagefault(inode->i_sb); |
6250 | return ret; | 6282 | return ret; |
6251 | } | 6283 | } |
6252 | 6284 | ||
6253 | int ext4_filemap_fault(struct vm_fault *vmf) | 6285 | vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) |
6254 | { | 6286 | { |
6255 | struct inode *inode = file_inode(vmf->vma->vm_file); | 6287 | struct inode *inode = file_inode(vmf->vma->vm_file); |
6256 | int err; | 6288 | vm_fault_t ret; |
6257 | 6289 | ||
6258 | down_read(&EXT4_I(inode)->i_mmap_sem); | 6290 | down_read(&EXT4_I(inode)->i_mmap_sem); |
6259 | err = filemap_fault(vmf); | 6291 | ret = filemap_fault(vmf); |
6260 | up_read(&EXT4_I(inode)->i_mmap_sem); | 6292 | up_read(&EXT4_I(inode)->i_mmap_sem); |
6261 | 6293 | ||
6262 | return err; | 6294 | return ret; |
6263 | } | 6295 | } |
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a7074115d6f6..0edee31913d1 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -67,7 +67,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) | |||
67 | ei1 = EXT4_I(inode1); | 67 | ei1 = EXT4_I(inode1); |
68 | ei2 = EXT4_I(inode2); | 68 | ei2 = EXT4_I(inode2); |
69 | 69 | ||
70 | swap(inode1->i_flags, inode2->i_flags); | ||
71 | swap(inode1->i_version, inode2->i_version); | 70 | swap(inode1->i_version, inode2->i_version); |
72 | swap(inode1->i_blocks, inode2->i_blocks); | 71 | swap(inode1->i_blocks, inode2->i_blocks); |
73 | swap(inode1->i_bytes, inode2->i_bytes); | 72 | swap(inode1->i_bytes, inode2->i_bytes); |
@@ -85,6 +84,21 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) | |||
85 | i_size_write(inode2, isize); | 84 | i_size_write(inode2, isize); |
86 | } | 85 | } |
87 | 86 | ||
87 | static void reset_inode_seed(struct inode *inode) | ||
88 | { | ||
89 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
90 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
91 | __le32 inum = cpu_to_le32(inode->i_ino); | ||
92 | __le32 gen = cpu_to_le32(inode->i_generation); | ||
93 | __u32 csum; | ||
94 | |||
95 | if (!ext4_has_metadata_csum(inode->i_sb)) | ||
96 | return; | ||
97 | |||
98 | csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); | ||
99 | ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); | ||
100 | } | ||
101 | |||
88 | /** | 102 | /** |
89 | * Swap the information from the given @inode and the inode | 103 | * Swap the information from the given @inode and the inode |
90 | * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other | 104 | * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other |
@@ -102,10 +116,13 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
102 | struct inode *inode_bl; | 116 | struct inode *inode_bl; |
103 | struct ext4_inode_info *ei_bl; | 117 | struct ext4_inode_info *ei_bl; |
104 | 118 | ||
105 | if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) | 119 | if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || |
120 | IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || | ||
121 | ext4_has_inline_data(inode)) | ||
106 | return -EINVAL; | 122 | return -EINVAL; |
107 | 123 | ||
108 | if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) | 124 | if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || |
125 | !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) | ||
109 | return -EPERM; | 126 | return -EPERM; |
110 | 127 | ||
111 | inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); | 128 | inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); |
@@ -120,13 +137,13 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
120 | * that only 1 swap_inode_boot_loader is running. */ | 137 | * that only 1 swap_inode_boot_loader is running. */ |
121 | lock_two_nondirectories(inode, inode_bl); | 138 | lock_two_nondirectories(inode, inode_bl); |
122 | 139 | ||
123 | truncate_inode_pages(&inode->i_data, 0); | ||
124 | truncate_inode_pages(&inode_bl->i_data, 0); | ||
125 | |||
126 | /* Wait for all existing dio workers */ | 140 | /* Wait for all existing dio workers */ |
127 | inode_dio_wait(inode); | 141 | inode_dio_wait(inode); |
128 | inode_dio_wait(inode_bl); | 142 | inode_dio_wait(inode_bl); |
129 | 143 | ||
144 | truncate_inode_pages(&inode->i_data, 0); | ||
145 | truncate_inode_pages(&inode_bl->i_data, 0); | ||
146 | |||
130 | handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); | 147 | handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); |
131 | if (IS_ERR(handle)) { | 148 | if (IS_ERR(handle)) { |
132 | err = -EINVAL; | 149 | err = -EINVAL; |
@@ -159,6 +176,8 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
159 | 176 | ||
160 | inode->i_generation = prandom_u32(); | 177 | inode->i_generation = prandom_u32(); |
161 | inode_bl->i_generation = prandom_u32(); | 178 | inode_bl->i_generation = prandom_u32(); |
179 | reset_inode_seed(inode); | ||
180 | reset_inode_seed(inode_bl); | ||
162 | 181 | ||
163 | ext4_discard_preallocations(inode); | 182 | ext4_discard_preallocations(inode); |
164 | 183 | ||
@@ -169,6 +188,7 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
169 | inode->i_ino, err); | 188 | inode->i_ino, err); |
170 | /* Revert all changes: */ | 189 | /* Revert all changes: */ |
171 | swap_inode_data(inode, inode_bl); | 190 | swap_inode_data(inode, inode_bl); |
191 | ext4_mark_inode_dirty(handle, inode); | ||
172 | } else { | 192 | } else { |
173 | err = ext4_mark_inode_dirty(handle, inode_bl); | 193 | err = ext4_mark_inode_dirty(handle, inode_bl); |
174 | if (err < 0) { | 194 | if (err < 0) { |
@@ -178,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb, | |||
178 | /* Revert all changes: */ | 198 | /* Revert all changes: */ |
179 | swap_inode_data(inode, inode_bl); | 199 | swap_inode_data(inode, inode_bl); |
180 | ext4_mark_inode_dirty(handle, inode); | 200 | ext4_mark_inode_dirty(handle, inode); |
201 | ext4_mark_inode_dirty(handle, inode_bl); | ||
181 | } | 202 | } |
182 | } | 203 | } |
183 | ext4_journal_stop(handle); | 204 | ext4_journal_stop(handle); |
@@ -339,19 +360,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) | |||
339 | if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) | 360 | if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) |
340 | return 0; | 361 | return 0; |
341 | 362 | ||
342 | err = mnt_want_write_file(filp); | ||
343 | if (err) | ||
344 | return err; | ||
345 | |||
346 | err = -EPERM; | 363 | err = -EPERM; |
347 | inode_lock(inode); | ||
348 | /* Is it quota file? Do not allow user to mess with it */ | 364 | /* Is it quota file? Do not allow user to mess with it */ |
349 | if (ext4_is_quota_file(inode)) | 365 | if (ext4_is_quota_file(inode)) |
350 | goto out_unlock; | 366 | return err; |
351 | 367 | ||
352 | err = ext4_get_inode_loc(inode, &iloc); | 368 | err = ext4_get_inode_loc(inode, &iloc); |
353 | if (err) | 369 | if (err) |
354 | goto out_unlock; | 370 | return err; |
355 | 371 | ||
356 | raw_inode = ext4_raw_inode(&iloc); | 372 | raw_inode = ext4_raw_inode(&iloc); |
357 | if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { | 373 | if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { |
@@ -359,20 +375,20 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) | |||
359 | EXT4_SB(sb)->s_want_extra_isize, | 375 | EXT4_SB(sb)->s_want_extra_isize, |
360 | &iloc); | 376 | &iloc); |
361 | if (err) | 377 | if (err) |
362 | goto out_unlock; | 378 | return err; |
363 | } else { | 379 | } else { |
364 | brelse(iloc.bh); | 380 | brelse(iloc.bh); |
365 | } | 381 | } |
366 | 382 | ||
367 | dquot_initialize(inode); | 383 | err = dquot_initialize(inode); |
384 | if (err) | ||
385 | return err; | ||
368 | 386 | ||
369 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, | 387 | handle = ext4_journal_start(inode, EXT4_HT_QUOTA, |
370 | EXT4_QUOTA_INIT_BLOCKS(sb) + | 388 | EXT4_QUOTA_INIT_BLOCKS(sb) + |
371 | EXT4_QUOTA_DEL_BLOCKS(sb) + 3); | 389 | EXT4_QUOTA_DEL_BLOCKS(sb) + 3); |
372 | if (IS_ERR(handle)) { | 390 | if (IS_ERR(handle)) |
373 | err = PTR_ERR(handle); | 391 | return PTR_ERR(handle); |
374 | goto out_unlock; | ||
375 | } | ||
376 | 392 | ||
377 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 393 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
378 | if (err) | 394 | if (err) |
@@ -400,9 +416,6 @@ out_dirty: | |||
400 | err = rc; | 416 | err = rc; |
401 | out_stop: | 417 | out_stop: |
402 | ext4_journal_stop(handle); | 418 | ext4_journal_stop(handle); |
403 | out_unlock: | ||
404 | inode_unlock(inode); | ||
405 | mnt_drop_write_file(filp); | ||
406 | return err; | 419 | return err; |
407 | } | 420 | } |
408 | #else | 421 | #else |
@@ -626,6 +639,30 @@ group_add_out: | |||
626 | return err; | 639 | return err; |
627 | } | 640 | } |
628 | 641 | ||
642 | static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa) | ||
643 | { | ||
644 | /* | ||
645 | * Project Quota ID state is only allowed to change from within the init | ||
646 | * namespace. Enforce that restriction only if we are trying to change | ||
647 | * the quota ID state. Everything else is allowed in user namespaces. | ||
648 | */ | ||
649 | if (current_user_ns() == &init_user_ns) | ||
650 | return 0; | ||
651 | |||
652 | if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid) | ||
653 | return -EINVAL; | ||
654 | |||
655 | if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) { | ||
656 | if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) | ||
657 | return -EINVAL; | ||
658 | } else { | ||
659 | if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) | ||
660 | return -EINVAL; | ||
661 | } | ||
662 | |||
663 | return 0; | ||
664 | } | ||
665 | |||
629 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) | 666 | long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
630 | { | 667 | { |
631 | struct inode *inode = file_inode(filp); | 668 | struct inode *inode = file_inode(filp); |
@@ -1025,19 +1062,19 @@ resizefs_out: | |||
1025 | return err; | 1062 | return err; |
1026 | 1063 | ||
1027 | inode_lock(inode); | 1064 | inode_lock(inode); |
1065 | err = ext4_ioctl_check_project(inode, &fa); | ||
1066 | if (err) | ||
1067 | goto out; | ||
1028 | flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | | 1068 | flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | |
1029 | (flags & EXT4_FL_XFLAG_VISIBLE); | 1069 | (flags & EXT4_FL_XFLAG_VISIBLE); |
1030 | err = ext4_ioctl_setflags(inode, flags); | 1070 | err = ext4_ioctl_setflags(inode, flags); |
1031 | inode_unlock(inode); | ||
1032 | mnt_drop_write_file(filp); | ||
1033 | if (err) | 1071 | if (err) |
1034 | return err; | 1072 | goto out; |
1035 | |||
1036 | err = ext4_ioctl_setproject(filp, fa.fsx_projid); | 1073 | err = ext4_ioctl_setproject(filp, fa.fsx_projid); |
1037 | if (err) | 1074 | out: |
1038 | return err; | 1075 | inode_unlock(inode); |
1039 | 1076 | mnt_drop_write_file(filp); | |
1040 | return 0; | 1077 | return err; |
1041 | } | 1078 | } |
1042 | case EXT4_IOC_SHUTDOWN: | 1079 | case EXT4_IOC_SHUTDOWN: |
1043 | return ext4_shutdown(sb, arg); | 1080 | return ext4_shutdown(sb, arg); |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..e2248083cdca 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -4915,9 +4915,17 @@ do_more: | |||
4915 | &sbi->s_flex_groups[flex_group].free_clusters); | 4915 | &sbi->s_flex_groups[flex_group].free_clusters); |
4916 | } | 4916 | } |
4917 | 4917 | ||
4918 | if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) | 4918 | /* |
4919 | dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); | 4919 | * on a bigalloc file system, defer the s_freeclusters_counter |
4920 | percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); | 4920 | * update to the caller (ext4_remove_space and friends) so they |
4921 | * can determine if a cluster freed here should be rereserved | ||
4922 | */ | ||
4923 | if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { | ||
4924 | if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) | ||
4925 | dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); | ||
4926 | percpu_counter_add(&sbi->s_freeclusters_counter, | ||
4927 | count_clusters); | ||
4928 | } | ||
4921 | 4929 | ||
4922 | ext4_mb_unload_buddy(&e4b); | 4930 | ext4_mb_unload_buddy(&e4b); |
4923 | 4931 | ||
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a409ff70d67b..2f5be02fc6f6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -516,9 +516,13 @@ mext_check_arguments(struct inode *orig_inode, | |||
516 | orig_inode->i_ino, donor_inode->i_ino); | 516 | orig_inode->i_ino, donor_inode->i_ino); |
517 | return -EINVAL; | 517 | return -EINVAL; |
518 | } | 518 | } |
519 | if (orig_eof < orig_start + *len - 1) | 519 | if (orig_eof <= orig_start) |
520 | *len = 0; | ||
521 | else if (orig_eof < orig_start + *len - 1) | ||
520 | *len = orig_eof - orig_start; | 522 | *len = orig_eof - orig_start; |
521 | if (donor_eof < donor_start + *len - 1) | 523 | if (donor_eof <= donor_start) |
524 | *len = 0; | ||
525 | else if (donor_eof < donor_start + *len - 1) | ||
522 | *len = donor_eof - donor_start; | 526 | *len = donor_eof - donor_start; |
523 | if (!*len) { | 527 | if (!*len) { |
524 | ext4_debug("ext4 move extent: len should not be 0 " | 528 | ext4_debug("ext4 move extent: len should not be 0 " |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 377d516c475f..67a38532032a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -2261,7 +2261,7 @@ again: | |||
2261 | dxroot->info.indirect_levels += 1; | 2261 | dxroot->info.indirect_levels += 1; |
2262 | dxtrace(printk(KERN_DEBUG | 2262 | dxtrace(printk(KERN_DEBUG |
2263 | "Creating %d level index...\n", | 2263 | "Creating %d level index...\n", |
2264 | info->indirect_levels)); | 2264 | dxroot->info.indirect_levels)); |
2265 | err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); | 2265 | err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); |
2266 | if (err) | 2266 | if (err) |
2267 | goto journal_error; | 2267 | goto journal_error; |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1145109968ef..a221f1cdf704 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -914,6 +914,18 @@ static inline void ext4_quota_off_umount(struct super_block *sb) | |||
914 | for (type = 0; type < EXT4_MAXQUOTAS; type++) | 914 | for (type = 0; type < EXT4_MAXQUOTAS; type++) |
915 | ext4_quota_off(sb, type); | 915 | ext4_quota_off(sb, type); |
916 | } | 916 | } |
917 | |||
918 | /* | ||
919 | * This is a helper function which is used in the mount/remount | ||
920 | * codepaths (which holds s_umount) to fetch the quota file name. | ||
921 | */ | ||
922 | static inline char *get_qf_name(struct super_block *sb, | ||
923 | struct ext4_sb_info *sbi, | ||
924 | int type) | ||
925 | { | ||
926 | return rcu_dereference_protected(sbi->s_qf_names[type], | ||
927 | lockdep_is_held(&sb->s_umount)); | ||
928 | } | ||
917 | #else | 929 | #else |
918 | static inline void ext4_quota_off_umount(struct super_block *sb) | 930 | static inline void ext4_quota_off_umount(struct super_block *sb) |
919 | { | 931 | { |
@@ -965,7 +977,7 @@ static void ext4_put_super(struct super_block *sb) | |||
965 | percpu_free_rwsem(&sbi->s_journal_flag_rwsem); | 977 | percpu_free_rwsem(&sbi->s_journal_flag_rwsem); |
966 | #ifdef CONFIG_QUOTA | 978 | #ifdef CONFIG_QUOTA |
967 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 979 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
968 | kfree(sbi->s_qf_names[i]); | 980 | kfree(get_qf_name(sb, sbi, i)); |
969 | #endif | 981 | #endif |
970 | 982 | ||
971 | /* Debugging code just in case the in-memory inode orphan list | 983 | /* Debugging code just in case the in-memory inode orphan list |
@@ -1040,6 +1052,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
1040 | ei->i_da_metadata_calc_len = 0; | 1052 | ei->i_da_metadata_calc_len = 0; |
1041 | ei->i_da_metadata_calc_last_lblock = 0; | 1053 | ei->i_da_metadata_calc_last_lblock = 0; |
1042 | spin_lock_init(&(ei->i_block_reservation_lock)); | 1054 | spin_lock_init(&(ei->i_block_reservation_lock)); |
1055 | ext4_init_pending_tree(&ei->i_pending_tree); | ||
1043 | #ifdef CONFIG_QUOTA | 1056 | #ifdef CONFIG_QUOTA |
1044 | ei->i_reserved_quota = 0; | 1057 | ei->i_reserved_quota = 0; |
1045 | memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); | 1058 | memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); |
@@ -1530,11 +1543,10 @@ static const char deprecated_msg[] = | |||
1530 | static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) | 1543 | static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) |
1531 | { | 1544 | { |
1532 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1545 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1533 | char *qname; | 1546 | char *qname, *old_qname = get_qf_name(sb, sbi, qtype); |
1534 | int ret = -1; | 1547 | int ret = -1; |
1535 | 1548 | ||
1536 | if (sb_any_quota_loaded(sb) && | 1549 | if (sb_any_quota_loaded(sb) && !old_qname) { |
1537 | !sbi->s_qf_names[qtype]) { | ||
1538 | ext4_msg(sb, KERN_ERR, | 1550 | ext4_msg(sb, KERN_ERR, |
1539 | "Cannot change journaled " | 1551 | "Cannot change journaled " |
1540 | "quota options when quota turned on"); | 1552 | "quota options when quota turned on"); |
@@ -1551,8 +1563,8 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) | |||
1551 | "Not enough memory for storing quotafile name"); | 1563 | "Not enough memory for storing quotafile name"); |
1552 | return -1; | 1564 | return -1; |
1553 | } | 1565 | } |
1554 | if (sbi->s_qf_names[qtype]) { | 1566 | if (old_qname) { |
1555 | if (strcmp(sbi->s_qf_names[qtype], qname) == 0) | 1567 | if (strcmp(old_qname, qname) == 0) |
1556 | ret = 1; | 1568 | ret = 1; |
1557 | else | 1569 | else |
1558 | ext4_msg(sb, KERN_ERR, | 1570 | ext4_msg(sb, KERN_ERR, |
@@ -1565,7 +1577,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) | |||
1565 | "quotafile must be on filesystem root"); | 1577 | "quotafile must be on filesystem root"); |
1566 | goto errout; | 1578 | goto errout; |
1567 | } | 1579 | } |
1568 | sbi->s_qf_names[qtype] = qname; | 1580 | rcu_assign_pointer(sbi->s_qf_names[qtype], qname); |
1569 | set_opt(sb, QUOTA); | 1581 | set_opt(sb, QUOTA); |
1570 | return 1; | 1582 | return 1; |
1571 | errout: | 1583 | errout: |
@@ -1577,15 +1589,16 @@ static int clear_qf_name(struct super_block *sb, int qtype) | |||
1577 | { | 1589 | { |
1578 | 1590 | ||
1579 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1591 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1592 | char *old_qname = get_qf_name(sb, sbi, qtype); | ||
1580 | 1593 | ||
1581 | if (sb_any_quota_loaded(sb) && | 1594 | if (sb_any_quota_loaded(sb) && old_qname) { |
1582 | sbi->s_qf_names[qtype]) { | ||
1583 | ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" | 1595 | ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" |
1584 | " when quota turned on"); | 1596 | " when quota turned on"); |
1585 | return -1; | 1597 | return -1; |
1586 | } | 1598 | } |
1587 | kfree(sbi->s_qf_names[qtype]); | 1599 | rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); |
1588 | sbi->s_qf_names[qtype] = NULL; | 1600 | synchronize_rcu(); |
1601 | kfree(old_qname); | ||
1589 | return 1; | 1602 | return 1; |
1590 | } | 1603 | } |
1591 | #endif | 1604 | #endif |
@@ -1960,7 +1973,7 @@ static int parse_options(char *options, struct super_block *sb, | |||
1960 | int is_remount) | 1973 | int is_remount) |
1961 | { | 1974 | { |
1962 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1975 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1963 | char *p; | 1976 | char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name; |
1964 | substring_t args[MAX_OPT_ARGS]; | 1977 | substring_t args[MAX_OPT_ARGS]; |
1965 | int token; | 1978 | int token; |
1966 | 1979 | ||
@@ -1991,11 +2004,13 @@ static int parse_options(char *options, struct super_block *sb, | |||
1991 | "Cannot enable project quota enforcement."); | 2004 | "Cannot enable project quota enforcement."); |
1992 | return 0; | 2005 | return 0; |
1993 | } | 2006 | } |
1994 | if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { | 2007 | usr_qf_name = get_qf_name(sb, sbi, USRQUOTA); |
1995 | if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) | 2008 | grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA); |
2009 | if (usr_qf_name || grp_qf_name) { | ||
2010 | if (test_opt(sb, USRQUOTA) && usr_qf_name) | ||
1996 | clear_opt(sb, USRQUOTA); | 2011 | clear_opt(sb, USRQUOTA); |
1997 | 2012 | ||
1998 | if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) | 2013 | if (test_opt(sb, GRPQUOTA) && grp_qf_name) |
1999 | clear_opt(sb, GRPQUOTA); | 2014 | clear_opt(sb, GRPQUOTA); |
2000 | 2015 | ||
2001 | if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { | 2016 | if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { |
@@ -2029,6 +2044,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, | |||
2029 | { | 2044 | { |
2030 | #if defined(CONFIG_QUOTA) | 2045 | #if defined(CONFIG_QUOTA) |
2031 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2046 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2047 | char *usr_qf_name, *grp_qf_name; | ||
2032 | 2048 | ||
2033 | if (sbi->s_jquota_fmt) { | 2049 | if (sbi->s_jquota_fmt) { |
2034 | char *fmtname = ""; | 2050 | char *fmtname = ""; |
@@ -2047,11 +2063,14 @@ static inline void ext4_show_quota_options(struct seq_file *seq, | |||
2047 | seq_printf(seq, ",jqfmt=%s", fmtname); | 2063 | seq_printf(seq, ",jqfmt=%s", fmtname); |
2048 | } | 2064 | } |
2049 | 2065 | ||
2050 | if (sbi->s_qf_names[USRQUOTA]) | 2066 | rcu_read_lock(); |
2051 | seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); | 2067 | usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); |
2052 | 2068 | grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); | |
2053 | if (sbi->s_qf_names[GRPQUOTA]) | 2069 | if (usr_qf_name) |
2054 | seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); | 2070 | seq_show_option(seq, "usrjquota", usr_qf_name); |
2071 | if (grp_qf_name) | ||
2072 | seq_show_option(seq, "grpjquota", grp_qf_name); | ||
2073 | rcu_read_unlock(); | ||
2055 | #endif | 2074 | #endif |
2056 | } | 2075 | } |
2057 | 2076 | ||
@@ -5103,6 +5122,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
5103 | int err = 0; | 5122 | int err = 0; |
5104 | #ifdef CONFIG_QUOTA | 5123 | #ifdef CONFIG_QUOTA |
5105 | int i, j; | 5124 | int i, j; |
5125 | char *to_free[EXT4_MAXQUOTAS]; | ||
5106 | #endif | 5126 | #endif |
5107 | char *orig_data = kstrdup(data, GFP_KERNEL); | 5127 | char *orig_data = kstrdup(data, GFP_KERNEL); |
5108 | 5128 | ||
@@ -5122,8 +5142,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
5122 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; | 5142 | old_opts.s_jquota_fmt = sbi->s_jquota_fmt; |
5123 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | 5143 | for (i = 0; i < EXT4_MAXQUOTAS; i++) |
5124 | if (sbi->s_qf_names[i]) { | 5144 | if (sbi->s_qf_names[i]) { |
5125 | old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], | 5145 | char *qf_name = get_qf_name(sb, sbi, i); |
5126 | GFP_KERNEL); | 5146 | |
5147 | old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); | ||
5127 | if (!old_opts.s_qf_names[i]) { | 5148 | if (!old_opts.s_qf_names[i]) { |
5128 | for (j = 0; j < i; j++) | 5149 | for (j = 0; j < i; j++) |
5129 | kfree(old_opts.s_qf_names[j]); | 5150 | kfree(old_opts.s_qf_names[j]); |
@@ -5352,9 +5373,12 @@ restore_opts: | |||
5352 | #ifdef CONFIG_QUOTA | 5373 | #ifdef CONFIG_QUOTA |
5353 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; | 5374 | sbi->s_jquota_fmt = old_opts.s_jquota_fmt; |
5354 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { | 5375 | for (i = 0; i < EXT4_MAXQUOTAS; i++) { |
5355 | kfree(sbi->s_qf_names[i]); | 5376 | to_free[i] = get_qf_name(sb, sbi, i); |
5356 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; | 5377 | rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); |
5357 | } | 5378 | } |
5379 | synchronize_rcu(); | ||
5380 | for (i = 0; i < EXT4_MAXQUOTAS; i++) | ||
5381 | kfree(to_free[i]); | ||
5358 | #endif | 5382 | #endif |
5359 | kfree(orig_data); | 5383 | kfree(orig_data); |
5360 | return err; | 5384 | return err; |
@@ -5545,7 +5569,7 @@ static int ext4_write_info(struct super_block *sb, int type) | |||
5545 | */ | 5569 | */ |
5546 | static int ext4_quota_on_mount(struct super_block *sb, int type) | 5570 | static int ext4_quota_on_mount(struct super_block *sb, int type) |
5547 | { | 5571 | { |
5548 | return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], | 5572 | return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), |
5549 | EXT4_SB(sb)->s_jquota_fmt, type); | 5573 | EXT4_SB(sb)->s_jquota_fmt, type); |
5550 | } | 5574 | } |
5551 | 5575 | ||
@@ -5954,6 +5978,10 @@ static int __init ext4_init_fs(void) | |||
5954 | if (err) | 5978 | if (err) |
5955 | return err; | 5979 | return err; |
5956 | 5980 | ||
5981 | err = ext4_init_pending(); | ||
5982 | if (err) | ||
5983 | goto out6; | ||
5984 | |||
5957 | err = ext4_init_pageio(); | 5985 | err = ext4_init_pageio(); |
5958 | if (err) | 5986 | if (err) |
5959 | goto out5; | 5987 | goto out5; |
@@ -5992,6 +6020,8 @@ out3: | |||
5992 | out4: | 6020 | out4: |
5993 | ext4_exit_pageio(); | 6021 | ext4_exit_pageio(); |
5994 | out5: | 6022 | out5: |
6023 | ext4_exit_pending(); | ||
6024 | out6: | ||
5995 | ext4_exit_es(); | 6025 | ext4_exit_es(); |
5996 | 6026 | ||
5997 | return err; | 6027 | return err; |
@@ -6009,6 +6039,7 @@ static void __exit ext4_exit_fs(void) | |||
6009 | ext4_exit_system_zone(); | 6039 | ext4_exit_system_zone(); |
6010 | ext4_exit_pageio(); | 6040 | ext4_exit_pageio(); |
6011 | ext4_exit_es(); | 6041 | ext4_exit_es(); |
6042 | ext4_exit_pending(); | ||
6012 | } | 6043 | } |
6013 | 6044 | ||
6014 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); | 6045 | MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index c125d662777c..26f8d7e46462 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -251,8 +251,8 @@ restart: | |||
251 | bh = jh2bh(jh); | 251 | bh = jh2bh(jh); |
252 | 252 | ||
253 | if (buffer_locked(bh)) { | 253 | if (buffer_locked(bh)) { |
254 | spin_unlock(&journal->j_list_lock); | ||
255 | get_bh(bh); | 254 | get_bh(bh); |
255 | spin_unlock(&journal->j_list_lock); | ||
256 | wait_on_buffer(bh); | 256 | wait_on_buffer(bh); |
257 | /* the journal_head may have gone by now */ | 257 | /* the journal_head may have gone by now */ |
258 | BUFFER_TRACE(bh, "brelse"); | 258 | BUFFER_TRACE(bh, "brelse"); |
@@ -333,8 +333,8 @@ restart2: | |||
333 | jh = transaction->t_checkpoint_io_list; | 333 | jh = transaction->t_checkpoint_io_list; |
334 | bh = jh2bh(jh); | 334 | bh = jh2bh(jh); |
335 | if (buffer_locked(bh)) { | 335 | if (buffer_locked(bh)) { |
336 | spin_unlock(&journal->j_list_lock); | ||
337 | get_bh(bh); | 336 | get_bh(bh); |
337 | spin_unlock(&journal->j_list_lock); | ||
338 | wait_on_buffer(bh); | 338 | wait_on_buffer(bh); |
339 | /* the journal_head may have gone by now */ | 339 | /* the journal_head may have gone by now */ |
340 | BUFFER_TRACE(bh, "brelse"); | 340 | BUFFER_TRACE(bh, "brelse"); |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 96225a77c112..7b73ef7f902d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -242,7 +242,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to); | |||
242 | int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, | 242 | int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, |
243 | get_block_t get_block); | 243 | get_block_t get_block); |
244 | /* Convert errno to return value from ->page_mkwrite() call */ | 244 | /* Convert errno to return value from ->page_mkwrite() call */ |
245 | static inline int block_page_mkwrite_return(int err) | 245 | static inline vm_fault_t block_page_mkwrite_return(int err) |
246 | { | 246 | { |
247 | if (err == 0) | 247 | if (err == 0) |
248 | return VM_FAULT_LOCKED; | 248 | return VM_FAULT_LOCKED; |
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0e31eb136c57..698e0d8a5ca4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h | |||
@@ -17,6 +17,7 @@ struct mpage_da_data; | |||
17 | struct ext4_map_blocks; | 17 | struct ext4_map_blocks; |
18 | struct extent_status; | 18 | struct extent_status; |
19 | struct ext4_fsmap; | 19 | struct ext4_fsmap; |
20 | struct partial_cluster; | ||
20 | 21 | ||
21 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) | 22 | #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) |
22 | 23 | ||
@@ -2035,21 +2036,23 @@ TRACE_EVENT(ext4_ext_show_extent, | |||
2035 | ); | 2036 | ); |
2036 | 2037 | ||
2037 | TRACE_EVENT(ext4_remove_blocks, | 2038 | TRACE_EVENT(ext4_remove_blocks, |
2038 | TP_PROTO(struct inode *inode, struct ext4_extent *ex, | 2039 | TP_PROTO(struct inode *inode, struct ext4_extent *ex, |
2039 | ext4_lblk_t from, ext4_fsblk_t to, | 2040 | ext4_lblk_t from, ext4_fsblk_t to, |
2040 | long long partial_cluster), | 2041 | struct partial_cluster *pc), |
2041 | 2042 | ||
2042 | TP_ARGS(inode, ex, from, to, partial_cluster), | 2043 | TP_ARGS(inode, ex, from, to, pc), |
2043 | 2044 | ||
2044 | TP_STRUCT__entry( | 2045 | TP_STRUCT__entry( |
2045 | __field( dev_t, dev ) | 2046 | __field( dev_t, dev ) |
2046 | __field( ino_t, ino ) | 2047 | __field( ino_t, ino ) |
2047 | __field( ext4_lblk_t, from ) | 2048 | __field( ext4_lblk_t, from ) |
2048 | __field( ext4_lblk_t, to ) | 2049 | __field( ext4_lblk_t, to ) |
2049 | __field( long long, partial ) | ||
2050 | __field( ext4_fsblk_t, ee_pblk ) | 2050 | __field( ext4_fsblk_t, ee_pblk ) |
2051 | __field( ext4_lblk_t, ee_lblk ) | 2051 | __field( ext4_lblk_t, ee_lblk ) |
2052 | __field( unsigned short, ee_len ) | 2052 | __field( unsigned short, ee_len ) |
2053 | __field( ext4_fsblk_t, pc_pclu ) | ||
2054 | __field( ext4_lblk_t, pc_lblk ) | ||
2055 | __field( int, pc_state) | ||
2053 | ), | 2056 | ), |
2054 | 2057 | ||
2055 | TP_fast_assign( | 2058 | TP_fast_assign( |
@@ -2057,14 +2060,16 @@ TRACE_EVENT(ext4_remove_blocks, | |||
2057 | __entry->ino = inode->i_ino; | 2060 | __entry->ino = inode->i_ino; |
2058 | __entry->from = from; | 2061 | __entry->from = from; |
2059 | __entry->to = to; | 2062 | __entry->to = to; |
2060 | __entry->partial = partial_cluster; | ||
2061 | __entry->ee_pblk = ext4_ext_pblock(ex); | 2063 | __entry->ee_pblk = ext4_ext_pblock(ex); |
2062 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); | 2064 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); |
2063 | __entry->ee_len = ext4_ext_get_actual_len(ex); | 2065 | __entry->ee_len = ext4_ext_get_actual_len(ex); |
2066 | __entry->pc_pclu = pc->pclu; | ||
2067 | __entry->pc_lblk = pc->lblk; | ||
2068 | __entry->pc_state = pc->state; | ||
2064 | ), | 2069 | ), |
2065 | 2070 | ||
2066 | TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" | 2071 | TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" |
2067 | "from %u to %u partial_cluster %lld", | 2072 | "from %u to %u partial [pclu %lld lblk %u state %d]", |
2068 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2073 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2069 | (unsigned long) __entry->ino, | 2074 | (unsigned long) __entry->ino, |
2070 | (unsigned) __entry->ee_lblk, | 2075 | (unsigned) __entry->ee_lblk, |
@@ -2072,45 +2077,53 @@ TRACE_EVENT(ext4_remove_blocks, | |||
2072 | (unsigned short) __entry->ee_len, | 2077 | (unsigned short) __entry->ee_len, |
2073 | (unsigned) __entry->from, | 2078 | (unsigned) __entry->from, |
2074 | (unsigned) __entry->to, | 2079 | (unsigned) __entry->to, |
2075 | (long long) __entry->partial) | 2080 | (long long) __entry->pc_pclu, |
2081 | (unsigned int) __entry->pc_lblk, | ||
2082 | (int) __entry->pc_state) | ||
2076 | ); | 2083 | ); |
2077 | 2084 | ||
2078 | TRACE_EVENT(ext4_ext_rm_leaf, | 2085 | TRACE_EVENT(ext4_ext_rm_leaf, |
2079 | TP_PROTO(struct inode *inode, ext4_lblk_t start, | 2086 | TP_PROTO(struct inode *inode, ext4_lblk_t start, |
2080 | struct ext4_extent *ex, | 2087 | struct ext4_extent *ex, |
2081 | long long partial_cluster), | 2088 | struct partial_cluster *pc), |
2082 | 2089 | ||
2083 | TP_ARGS(inode, start, ex, partial_cluster), | 2090 | TP_ARGS(inode, start, ex, pc), |
2084 | 2091 | ||
2085 | TP_STRUCT__entry( | 2092 | TP_STRUCT__entry( |
2086 | __field( dev_t, dev ) | 2093 | __field( dev_t, dev ) |
2087 | __field( ino_t, ino ) | 2094 | __field( ino_t, ino ) |
2088 | __field( long long, partial ) | ||
2089 | __field( ext4_lblk_t, start ) | 2095 | __field( ext4_lblk_t, start ) |
2090 | __field( ext4_lblk_t, ee_lblk ) | 2096 | __field( ext4_lblk_t, ee_lblk ) |
2091 | __field( ext4_fsblk_t, ee_pblk ) | 2097 | __field( ext4_fsblk_t, ee_pblk ) |
2092 | __field( short, ee_len ) | 2098 | __field( short, ee_len ) |
2099 | __field( ext4_fsblk_t, pc_pclu ) | ||
2100 | __field( ext4_lblk_t, pc_lblk ) | ||
2101 | __field( int, pc_state) | ||
2093 | ), | 2102 | ), |
2094 | 2103 | ||
2095 | TP_fast_assign( | 2104 | TP_fast_assign( |
2096 | __entry->dev = inode->i_sb->s_dev; | 2105 | __entry->dev = inode->i_sb->s_dev; |
2097 | __entry->ino = inode->i_ino; | 2106 | __entry->ino = inode->i_ino; |
2098 | __entry->partial = partial_cluster; | ||
2099 | __entry->start = start; | 2107 | __entry->start = start; |
2100 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); | 2108 | __entry->ee_lblk = le32_to_cpu(ex->ee_block); |
2101 | __entry->ee_pblk = ext4_ext_pblock(ex); | 2109 | __entry->ee_pblk = ext4_ext_pblock(ex); |
2102 | __entry->ee_len = ext4_ext_get_actual_len(ex); | 2110 | __entry->ee_len = ext4_ext_get_actual_len(ex); |
2111 | __entry->pc_pclu = pc->pclu; | ||
2112 | __entry->pc_lblk = pc->lblk; | ||
2113 | __entry->pc_state = pc->state; | ||
2103 | ), | 2114 | ), |
2104 | 2115 | ||
2105 | TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" | 2116 | TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" |
2106 | "partial_cluster %lld", | 2117 | "partial [pclu %lld lblk %u state %d]", |
2107 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2118 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2108 | (unsigned long) __entry->ino, | 2119 | (unsigned long) __entry->ino, |
2109 | (unsigned) __entry->start, | 2120 | (unsigned) __entry->start, |
2110 | (unsigned) __entry->ee_lblk, | 2121 | (unsigned) __entry->ee_lblk, |
2111 | (unsigned long long) __entry->ee_pblk, | 2122 | (unsigned long long) __entry->ee_pblk, |
2112 | (unsigned short) __entry->ee_len, | 2123 | (unsigned short) __entry->ee_len, |
2113 | (long long) __entry->partial) | 2124 | (long long) __entry->pc_pclu, |
2125 | (unsigned int) __entry->pc_lblk, | ||
2126 | (int) __entry->pc_state) | ||
2114 | ); | 2127 | ); |
2115 | 2128 | ||
2116 | TRACE_EVENT(ext4_ext_rm_idx, | 2129 | TRACE_EVENT(ext4_ext_rm_idx, |
@@ -2168,9 +2181,9 @@ TRACE_EVENT(ext4_ext_remove_space, | |||
2168 | 2181 | ||
2169 | TRACE_EVENT(ext4_ext_remove_space_done, | 2182 | TRACE_EVENT(ext4_ext_remove_space_done, |
2170 | TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, | 2183 | TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, |
2171 | int depth, long long partial, __le16 eh_entries), | 2184 | int depth, struct partial_cluster *pc, __le16 eh_entries), |
2172 | 2185 | ||
2173 | TP_ARGS(inode, start, end, depth, partial, eh_entries), | 2186 | TP_ARGS(inode, start, end, depth, pc, eh_entries), |
2174 | 2187 | ||
2175 | TP_STRUCT__entry( | 2188 | TP_STRUCT__entry( |
2176 | __field( dev_t, dev ) | 2189 | __field( dev_t, dev ) |
@@ -2178,7 +2191,9 @@ TRACE_EVENT(ext4_ext_remove_space_done, | |||
2178 | __field( ext4_lblk_t, start ) | 2191 | __field( ext4_lblk_t, start ) |
2179 | __field( ext4_lblk_t, end ) | 2192 | __field( ext4_lblk_t, end ) |
2180 | __field( int, depth ) | 2193 | __field( int, depth ) |
2181 | __field( long long, partial ) | 2194 | __field( ext4_fsblk_t, pc_pclu ) |
2195 | __field( ext4_lblk_t, pc_lblk ) | ||
2196 | __field( int, pc_state ) | ||
2182 | __field( unsigned short, eh_entries ) | 2197 | __field( unsigned short, eh_entries ) |
2183 | ), | 2198 | ), |
2184 | 2199 | ||
@@ -2188,18 +2203,23 @@ TRACE_EVENT(ext4_ext_remove_space_done, | |||
2188 | __entry->start = start; | 2203 | __entry->start = start; |
2189 | __entry->end = end; | 2204 | __entry->end = end; |
2190 | __entry->depth = depth; | 2205 | __entry->depth = depth; |
2191 | __entry->partial = partial; | 2206 | __entry->pc_pclu = pc->pclu; |
2207 | __entry->pc_lblk = pc->lblk; | ||
2208 | __entry->pc_state = pc->state; | ||
2192 | __entry->eh_entries = le16_to_cpu(eh_entries); | 2209 | __entry->eh_entries = le16_to_cpu(eh_entries); |
2193 | ), | 2210 | ), |
2194 | 2211 | ||
2195 | TP_printk("dev %d,%d ino %lu since %u end %u depth %d partial %lld " | 2212 | TP_printk("dev %d,%d ino %lu since %u end %u depth %d " |
2213 | "partial [pclu %lld lblk %u state %d] " | ||
2196 | "remaining_entries %u", | 2214 | "remaining_entries %u", |
2197 | MAJOR(__entry->dev), MINOR(__entry->dev), | 2215 | MAJOR(__entry->dev), MINOR(__entry->dev), |
2198 | (unsigned long) __entry->ino, | 2216 | (unsigned long) __entry->ino, |
2199 | (unsigned) __entry->start, | 2217 | (unsigned) __entry->start, |
2200 | (unsigned) __entry->end, | 2218 | (unsigned) __entry->end, |
2201 | __entry->depth, | 2219 | __entry->depth, |
2202 | (long long) __entry->partial, | 2220 | (long long) __entry->pc_pclu, |
2221 | (unsigned int) __entry->pc_lblk, | ||
2222 | (int) __entry->pc_state, | ||
2203 | (unsigned short) __entry->eh_entries) | 2223 | (unsigned short) __entry->eh_entries) |
2204 | ); | 2224 | ); |
2205 | 2225 | ||
@@ -2270,7 +2290,7 @@ TRACE_EVENT(ext4_es_remove_extent, | |||
2270 | __entry->lblk, __entry->len) | 2290 | __entry->lblk, __entry->len) |
2271 | ); | 2291 | ); |
2272 | 2292 | ||
2273 | TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, | 2293 | TRACE_EVENT(ext4_es_find_extent_range_enter, |
2274 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk), | 2294 | TP_PROTO(struct inode *inode, ext4_lblk_t lblk), |
2275 | 2295 | ||
2276 | TP_ARGS(inode, lblk), | 2296 | TP_ARGS(inode, lblk), |
@@ -2292,7 +2312,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_range_enter, | |||
2292 | (unsigned long) __entry->ino, __entry->lblk) | 2312 | (unsigned long) __entry->ino, __entry->lblk) |
2293 | ); | 2313 | ); |
2294 | 2314 | ||
2295 | TRACE_EVENT(ext4_es_find_delayed_extent_range_exit, | 2315 | TRACE_EVENT(ext4_es_find_extent_range_exit, |
2296 | TP_PROTO(struct inode *inode, struct extent_status *es), | 2316 | TP_PROTO(struct inode *inode, struct extent_status *es), |
2297 | 2317 | ||
2298 | TP_ARGS(inode, es), | 2318 | TP_ARGS(inode, es), |
@@ -2512,6 +2532,41 @@ TRACE_EVENT(ext4_es_shrink, | |||
2512 | __entry->scan_time, __entry->nr_skipped, __entry->retried) | 2532 | __entry->scan_time, __entry->nr_skipped, __entry->retried) |
2513 | ); | 2533 | ); |
2514 | 2534 | ||
2535 | TRACE_EVENT(ext4_es_insert_delayed_block, | ||
2536 | TP_PROTO(struct inode *inode, struct extent_status *es, | ||
2537 | bool allocated), | ||
2538 | |||
2539 | TP_ARGS(inode, es, allocated), | ||
2540 | |||
2541 | TP_STRUCT__entry( | ||
2542 | __field( dev_t, dev ) | ||
2543 | __field( ino_t, ino ) | ||
2544 | __field( ext4_lblk_t, lblk ) | ||
2545 | __field( ext4_lblk_t, len ) | ||
2546 | __field( ext4_fsblk_t, pblk ) | ||
2547 | __field( char, status ) | ||
2548 | __field( bool, allocated ) | ||
2549 | ), | ||
2550 | |||
2551 | TP_fast_assign( | ||
2552 | __entry->dev = inode->i_sb->s_dev; | ||
2553 | __entry->ino = inode->i_ino; | ||
2554 | __entry->lblk = es->es_lblk; | ||
2555 | __entry->len = es->es_len; | ||
2556 | __entry->pblk = ext4_es_pblock(es); | ||
2557 | __entry->status = ext4_es_status(es); | ||
2558 | __entry->allocated = allocated; | ||
2559 | ), | ||
2560 | |||
2561 | TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " | ||
2562 | "allocated %d", | ||
2563 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
2564 | (unsigned long) __entry->ino, | ||
2565 | __entry->lblk, __entry->len, | ||
2566 | __entry->pblk, show_extent_status(__entry->status), | ||
2567 | __entry->allocated) | ||
2568 | ); | ||
2569 | |||
2515 | /* fsmap traces */ | 2570 | /* fsmap traces */ |
2516 | DECLARE_EVENT_CLASS(ext4_fsmap_class, | 2571 | DECLARE_EVENT_CLASS(ext4_fsmap_class, |
2517 | TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, | 2572 | TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, |