summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-21 16:37:39 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-21 16:37:39 -0400
commit70cb0d02b58128db07fc39b5e87a2873e2c16bde (patch)
tree43c0a4eb00f192ceb306b9c52503b2d54bc59660
parent104c0d6bc43e10ba84931c45b67e2c76c9c67f68 (diff)
parent040823b5372b445d1d9483811e85a24d71314d33 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "Added new ext4 debugging ioctls to allow userspace to get information about the state of the extent status cache. Dropped workaround for pre-1970 dates which were encoded incorrectly in pre-4.4 kernels. Since both the kernel correctly generates, and e2fsck detects and fixes this issue for the past four years, it's time to drop the workaround. (Also, it's not like files with dates in the distant past were all that common in the first place.) A lot of miscellaneous bug fixes and cleanups, including some ext4 Documentation fixes. Also included are two minor bug fixes in fs/unicode" * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (21 commits) unicode: make array 'token' static const, makes object smaller unicode: Move static keyword to the front of declarations ext4: add missing bigalloc documentation. ext4: fix kernel oops caused by spurious casefold flag ext4: fix integer overflow when calculating commit interval ext4: use percpu_counters for extent_status cache hits/misses ext4: fix potential use after free after remounting with noblock_validity jbd2: add missing tracepoint for reserved handle ext4: fix punch hole for inline_data file systems ext4: rework reserved cluster accounting when invalidating pages ext4: documentation fixes ext4: treat buffers with write errors as containing valid data ext4: fix warning inside ext4_convert_unwritten_extents_endio ext4: set error return correctly when ext4_htree_store_dirent fails ext4: drop legacy pre-1970 encoding workaround ext4: add new ioctl EXT4_IOC_GET_ES_CACHE ext4: add a new ioctl EXT4_IOC_GETSTATE ext4: add a new ioctl EXT4_IOC_CLEAR_ES_CACHE jbd2: flush_descriptor(): Do not decrease buffer head's ref count ext4: remove unnecessary error check ...
-rw-r--r--Documentation/filesystems/ext4/bigalloc.rst32
-rw-r--r--Documentation/filesystems/ext4/blockgroup.rst10
-rw-r--r--Documentation/filesystems/ext4/blocks.rst4
-rw-r--r--Documentation/filesystems/ext4/directory.rst2
-rw-r--r--Documentation/filesystems/ext4/group_descr.rst9
-rw-r--r--Documentation/filesystems/ext4/inodes.rst4
-rw-r--r--Documentation/filesystems/ext4/super.rst20
-rw-r--r--fs/ext4/block_validity.c189
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h64
-rw-r--r--fs/ext4/extents.c98
-rw-r--r--fs/ext4/extents_status.c521
-rw-r--r--fs/ext4/extents_status.h8
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c103
-rw-r--r--fs/ext4/ioctl.c98
-rw-r--r--fs/ext4/namei.c4
-rw-r--r--fs/ext4/super.c7
-rw-r--r--fs/jbd2/revoke.c4
-rw-r--r--fs/jbd2/transaction.c3
-rw-r--r--fs/unicode/utf8-core.c2
-rw-r--r--fs/unicode/utf8-selftest.c4
24 files changed, 890 insertions, 309 deletions
diff --git a/Documentation/filesystems/ext4/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst
index c6d88557553c..72075aa608e4 100644
--- a/Documentation/filesystems/ext4/bigalloc.rst
+++ b/Documentation/filesystems/ext4/bigalloc.rst
@@ -9,14 +9,26 @@ ext4 code is not prepared to handle the case where the block size
9exceeds the page size. However, for a filesystem of mostly huge files, 9exceeds the page size. However, for a filesystem of mostly huge files,
10it is desirable to be able to allocate disk blocks in units of multiple 10it is desirable to be able to allocate disk blocks in units of multiple
11blocks to reduce both fragmentation and metadata overhead. The 11blocks to reduce both fragmentation and metadata overhead. The
12`bigalloc <Bigalloc>`__ feature provides exactly this ability. The 12bigalloc feature provides exactly this ability.
13administrator can set a block cluster size at mkfs time (which is stored 13
14in the s\_log\_cluster\_size field in the superblock); from then on, the 14The bigalloc feature (EXT4_FEATURE_RO_COMPAT_BIGALLOC) changes ext4 to
15block bitmaps track clusters, not individual blocks. This means that 15use clustered allocation, so that each bit in the ext4 block allocation
16block groups can be several gigabytes in size (instead of just 128MiB); 16bitmap addresses a power of two number of blocks. For example, if the
17however, the minimum allocation unit becomes a cluster, not a block, 17file system is mainly going to be storing large files in the 4-32
18even for directories. TaoBao had a patchset to extend the “use units of 18megabyte range, it might make sense to set a cluster size of 1 megabyte.
19clusters instead of blocks” to the extent tree, though it is not clear 19This means that each bit in the block allocation bitmap now addresses
20where those patches went-- they eventually morphed into “extent tree v2” 20256 4k blocks. This shrinks the total size of the block allocation
21but that code has not landed as of May 2015. 21bitmaps for a 2T file system from 64 megabytes to 256 kilobytes. It also
22means that a block group addresses 32 gigabytes instead of 128 megabytes,
23also shrinking the amount of file system overhead for metadata.
24
25The administrator can set a block cluster size at mkfs time (which is
26stored in the s\_log\_cluster\_size field in the superblock); from then
27on, the block bitmaps track clusters, not individual blocks. This means
28that block groups can be several gigabytes in size (instead of just
29128MiB); however, the minimum allocation unit becomes a cluster, not a
30block, even for directories. TaoBao had a patchset to extend the “use
31units of clusters instead of blocks” to the extent tree, though it is
32not clear where those patches went-- they eventually morphed into
33“extent tree v2” but that code has not landed as of May 2015.
22 34
diff --git a/Documentation/filesystems/ext4/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst
index baf888e4c06a..3da156633339 100644
--- a/Documentation/filesystems/ext4/blockgroup.rst
+++ b/Documentation/filesystems/ext4/blockgroup.rst
@@ -71,11 +71,11 @@ if the flex\_bg size is 4, then group 0 will contain (in order) the
71superblock, group descriptors, data block bitmaps for groups 0-3, inode 71superblock, group descriptors, data block bitmaps for groups 0-3, inode
72bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining 72bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining
73space in group 0 is for file data. The effect of this is to group the 73space in group 0 is for file data. The effect of this is to group the
74block metadata close together for faster loading, and to enable large 74block group metadata close together for faster loading, and to enable
75files to be continuous on disk. Backup copies of the superblock and 75large files to be continuous on disk. Backup copies of the superblock
76group descriptors are always at the beginning of block groups, even if 76and group descriptors are always at the beginning of block groups, even
77flex\_bg is enabled. The number of block groups that make up a flex\_bg 77if flex\_bg is enabled. The number of block groups that make up a
78is given by 2 ^ ``sb.s_log_groups_per_flex``. 78flex\_bg is given by 2 ^ ``sb.s_log_groups_per_flex``.
79 79
80Meta Block Groups 80Meta Block Groups
81----------------- 81-----------------
diff --git a/Documentation/filesystems/ext4/blocks.rst b/Documentation/filesystems/ext4/blocks.rst
index 73d4dc0f7bda..bd722ecd92d6 100644
--- a/Documentation/filesystems/ext4/blocks.rst
+++ b/Documentation/filesystems/ext4/blocks.rst
@@ -10,7 +10,9 @@ block groups. Block size is specified at mkfs time and typically is
104KiB. You may experience mounting problems if block size is greater than 104KiB. You may experience mounting problems if block size is greater than
11page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory 11page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory
12pages). By default a filesystem can contain 2^32 blocks; if the '64bit' 12pages). By default a filesystem can contain 2^32 blocks; if the '64bit'
13feature is enabled, then a filesystem can have 2^64 blocks. 13feature is enabled, then a filesystem can have 2^64 blocks. The location
14of structures is stored in terms of the block number the structure lives
15in and not the absolute offset on disk.
14 16
15For 32-bit filesystems, limits are as follows: 17For 32-bit filesystems, limits are as follows:
16 18
diff --git a/Documentation/filesystems/ext4/directory.rst b/Documentation/filesystems/ext4/directory.rst
index 614034e24669..073940cc64ed 100644
--- a/Documentation/filesystems/ext4/directory.rst
+++ b/Documentation/filesystems/ext4/directory.rst
@@ -59,7 +59,7 @@ is at most 263 bytes long, though on disk you'll need to reference
59 - File name. 59 - File name.
60 60
61Since file names cannot be longer than 255 bytes, the new directory 61Since file names cannot be longer than 255 bytes, the new directory
62entry format shortens the rec\_len field and uses the space for a file 62entry format shortens the name\_len field and uses the space for a file
63type flag, probably to avoid having to load every inode during directory 63type flag, probably to avoid having to load every inode during directory
64tree traversal. This format is ``ext4_dir_entry_2``, which is at most 64tree traversal. This format is ``ext4_dir_entry_2``, which is at most
65263 bytes long, though on disk you'll need to reference 65263 bytes long, though on disk you'll need to reference
diff --git a/Documentation/filesystems/ext4/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst
index 0f783ed88592..7ba6114e7f5c 100644
--- a/Documentation/filesystems/ext4/group_descr.rst
+++ b/Documentation/filesystems/ext4/group_descr.rst
@@ -99,9 +99,12 @@ The block group descriptor is laid out in ``struct ext4_group_desc``.
99 * - 0x1E 99 * - 0x1E
100 - \_\_le16 100 - \_\_le16
101 - bg\_checksum 101 - bg\_checksum
102 - Group descriptor checksum; crc16(sb\_uuid+group+desc) if the 102 - Group descriptor checksum; crc16(sb\_uuid+group\_num+bg\_desc) if the
103 RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) & 103 RO\_COMPAT\_GDT\_CSUM feature is set, or
104 0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set. 104 crc32c(sb\_uuid+group\_num+bg\_desc) & 0xFFFF if the
105 RO\_COMPAT\_METADATA\_CSUM feature is set. The bg\_checksum
106 field in bg\_desc is skipped when calculating crc16 checksum,
107 and set to zero if crc32c checksum is used.
105 * - 108 * -
106 - 109 -
107 - 110 -
diff --git a/Documentation/filesystems/ext4/inodes.rst b/Documentation/filesystems/ext4/inodes.rst
index e851e6ca31fa..a65baffb4ebf 100644
--- a/Documentation/filesystems/ext4/inodes.rst
+++ b/Documentation/filesystems/ext4/inodes.rst
@@ -472,8 +472,8 @@ inode, which allows struct ext4\_inode to grow for a new kernel without
472having to upgrade all of the on-disk inodes. Access to fields beyond 472having to upgrade all of the on-disk inodes. Access to fields beyond
473EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within 473EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within
474``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as 474``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as
475of October 2013) the inode structure is 156 bytes 475of August 2019) the inode structure is 160 bytes
476(``i_extra_isize = 28``). The extra space between the end of the inode 476(``i_extra_isize = 32``). The extra space between the end of the inode
477structure and the end of the inode record can be used to store extended 477structure and the end of the inode record can be used to store extended
478attributes. Each inode record can be as large as the filesystem block 478attributes. Each inode record can be as large as the filesystem block
479size, though this is not terribly efficient. 479size, though this is not terribly efficient.
diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst
index 6eae92054827..93e55d7c1d40 100644
--- a/Documentation/filesystems/ext4/super.rst
+++ b/Documentation/filesystems/ext4/super.rst
@@ -58,7 +58,7 @@ The ext4 superblock is laid out as follows in
58 * - 0x1C 58 * - 0x1C
59 - \_\_le32 59 - \_\_le32
60 - s\_log\_cluster\_size 60 - s\_log\_cluster\_size
61 - Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is 61 - Cluster size is 2 ^ (10 + s\_log\_cluster\_size) blocks if bigalloc is
62 enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size. 62 enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size.
63 * - 0x20 63 * - 0x20
64 - \_\_le32 64 - \_\_le32
@@ -447,7 +447,7 @@ The ext4 superblock is laid out as follows in
447 - Upper 8 bits of the s_wtime field. 447 - Upper 8 bits of the s_wtime field.
448 * - 0x275 448 * - 0x275
449 - \_\_u8 449 - \_\_u8
450 - s\_wtime_hi 450 - s\_mtime_hi
451 - Upper 8 bits of the s_mtime field. 451 - Upper 8 bits of the s_mtime field.
452 * - 0x276 452 * - 0x276
453 - \_\_u8 453 - \_\_u8
@@ -466,12 +466,20 @@ The ext4 superblock is laid out as follows in
466 - s\_last_error_time_hi 466 - s\_last_error_time_hi
467 - Upper 8 bits of the s_last_error_time_hi field. 467 - Upper 8 bits of the s_last_error_time_hi field.
468 * - 0x27A 468 * - 0x27A
469 - \_\_u8[2] 469 - \_\_u8
470 - s\_pad 470 - s\_pad[2]
471 - Zero padding. 471 - Zero padding.
472 * - 0x27C 472 * - 0x27C
473 - \_\_le16
474 - s\_encoding
475 - Filename charset encoding.
476 * - 0x27E
477 - \_\_le16
478 - s\_encoding_flags
479 - Filename charset encoding flags.
480 * - 0x280
473 - \_\_le32 481 - \_\_le32
474 - s\_reserved[96] 482 - s\_reserved[95]
475 - Padding to the end of the block. 483 - Padding to the end of the block.
476 * - 0x3FC 484 * - 0x3FC
477 - \_\_le32 485 - \_\_le32
@@ -617,7 +625,7 @@ following:
617 * - 0x80 625 * - 0x80
618 - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT). 626 - Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT).
619 * - 0x100 627 * - 0x100
620 - Multiple mount protection. Not implemented (INCOMPAT\_MMP). 628 - Multiple mount protection (INCOMPAT\_MMP).
621 * - 0x200 629 * - 0x200
622 - Flexible block groups. See the earlier discussion of this feature 630 - Flexible block groups. See the earlier discussion of this feature
623 (INCOMPAT\_FLEX\_BG). 631 (INCOMPAT\_FLEX\_BG).
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8e83741b02e0..d4d4fdfac1a6 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void)
38 38
39void ext4_exit_system_zone(void) 39void ext4_exit_system_zone(void)
40{ 40{
41 rcu_barrier();
41 kmem_cache_destroy(ext4_system_zone_cachep); 42 kmem_cache_destroy(ext4_system_zone_cachep);
42} 43}
43 44
@@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1,
49 return 0; 50 return 0;
50} 51}
51 52
53static void release_system_zone(struct ext4_system_blocks *system_blks)
54{
55 struct ext4_system_zone *entry, *n;
56
57 rbtree_postorder_for_each_entry_safe(entry, n,
58 &system_blks->root, node)
59 kmem_cache_free(ext4_system_zone_cachep, entry);
60}
61
52/* 62/*
53 * Mark a range of blocks as belonging to the "system zone" --- that 63 * Mark a range of blocks as belonging to the "system zone" --- that
54 * is, filesystem metadata blocks which should never be used by 64 * is, filesystem metadata blocks which should never be used by
55 * inodes. 65 * inodes.
56 */ 66 */
57static int add_system_zone(struct ext4_sb_info *sbi, 67static int add_system_zone(struct ext4_system_blocks *system_blks,
58 ext4_fsblk_t start_blk, 68 ext4_fsblk_t start_blk,
59 unsigned int count) 69 unsigned int count)
60{ 70{
61 struct ext4_system_zone *new_entry = NULL, *entry; 71 struct ext4_system_zone *new_entry = NULL, *entry;
62 struct rb_node **n = &sbi->system_blks.rb_node, *node; 72 struct rb_node **n = &system_blks->root.rb_node, *node;
63 struct rb_node *parent = NULL, *new_node = NULL; 73 struct rb_node *parent = NULL, *new_node = NULL;
64 74
65 while (*n) { 75 while (*n) {
@@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
91 new_node = &new_entry->node; 101 new_node = &new_entry->node;
92 102
93 rb_link_node(new_node, parent, n); 103 rb_link_node(new_node, parent, n);
94 rb_insert_color(new_node, &sbi->system_blks); 104 rb_insert_color(new_node, &system_blks->root);
95 } 105 }
96 106
97 /* Can we merge to the left? */ 107 /* Can we merge to the left? */
@@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
101 if (can_merge(entry, new_entry)) { 111 if (can_merge(entry, new_entry)) {
102 new_entry->start_blk = entry->start_blk; 112 new_entry->start_blk = entry->start_blk;
103 new_entry->count += entry->count; 113 new_entry->count += entry->count;
104 rb_erase(node, &sbi->system_blks); 114 rb_erase(node, &system_blks->root);
105 kmem_cache_free(ext4_system_zone_cachep, entry); 115 kmem_cache_free(ext4_system_zone_cachep, entry);
106 } 116 }
107 } 117 }
@@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
112 entry = rb_entry(node, struct ext4_system_zone, node); 122 entry = rb_entry(node, struct ext4_system_zone, node);
113 if (can_merge(new_entry, entry)) { 123 if (can_merge(new_entry, entry)) {
114 new_entry->count += entry->count; 124 new_entry->count += entry->count;
115 rb_erase(node, &sbi->system_blks); 125 rb_erase(node, &system_blks->root);
116 kmem_cache_free(ext4_system_zone_cachep, entry); 126 kmem_cache_free(ext4_system_zone_cachep, entry);
117 } 127 }
118 } 128 }
@@ -126,7 +136,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
126 int first = 1; 136 int first = 1;
127 137
128 printk(KERN_INFO "System zones: "); 138 printk(KERN_INFO "System zones: ");
129 node = rb_first(&sbi->system_blks); 139 node = rb_first(&sbi->system_blks->root);
130 while (node) { 140 while (node) {
131 entry = rb_entry(node, struct ext4_system_zone, node); 141 entry = rb_entry(node, struct ext4_system_zone, node);
132 printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", 142 printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -137,7 +147,47 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
137 printk(KERN_CONT "\n"); 147 printk(KERN_CONT "\n");
138} 148}
139 149
140static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) 150/*
151 * Returns 1 if the passed-in block region (start_blk,
152 * start_blk+count) is valid; 0 if some part of the block region
153 * overlaps with filesystem metadata blocks.
154 */
155static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
156 struct ext4_system_blocks *system_blks,
157 ext4_fsblk_t start_blk,
158 unsigned int count)
159{
160 struct ext4_system_zone *entry;
161 struct rb_node *n;
162
163 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
164 (start_blk + count < start_blk) ||
165 (start_blk + count > ext4_blocks_count(sbi->s_es))) {
166 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
167 return 0;
168 }
169
170 if (system_blks == NULL)
171 return 1;
172
173 n = system_blks->root.rb_node;
174 while (n) {
175 entry = rb_entry(n, struct ext4_system_zone, node);
176 if (start_blk + count - 1 < entry->start_blk)
177 n = n->rb_left;
178 else if (start_blk >= (entry->start_blk + entry->count))
179 n = n->rb_right;
180 else {
181 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
182 return 0;
183 }
184 }
185 return 1;
186}
187
188static int ext4_protect_reserved_inode(struct super_block *sb,
189 struct ext4_system_blocks *system_blks,
190 u32 ino)
141{ 191{
142 struct inode *inode; 192 struct inode *inode;
143 struct ext4_sb_info *sbi = EXT4_SB(sb); 193 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -163,14 +213,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
163 if (n == 0) { 213 if (n == 0) {
164 i++; 214 i++;
165 } else { 215 } else {
166 if (!ext4_data_block_valid(sbi, map.m_pblk, n)) { 216 if (!ext4_data_block_valid_rcu(sbi, system_blks,
217 map.m_pblk, n)) {
167 ext4_error(sb, "blocks %llu-%llu from inode %u " 218 ext4_error(sb, "blocks %llu-%llu from inode %u "
168 "overlap system zone", map.m_pblk, 219 "overlap system zone", map.m_pblk,
169 map.m_pblk + map.m_len - 1, ino); 220 map.m_pblk + map.m_len - 1, ino);
170 err = -EFSCORRUPTED; 221 err = -EFSCORRUPTED;
171 break; 222 break;
172 } 223 }
173 err = add_system_zone(sbi, map.m_pblk, n); 224 err = add_system_zone(system_blks, map.m_pblk, n);
174 if (err < 0) 225 if (err < 0)
175 break; 226 break;
176 i += n; 227 i += n;
@@ -180,94 +231,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
180 return err; 231 return err;
181} 232}
182 233
234static void ext4_destroy_system_zone(struct rcu_head *rcu)
235{
236 struct ext4_system_blocks *system_blks;
237
238 system_blks = container_of(rcu, struct ext4_system_blocks, rcu);
239 release_system_zone(system_blks);
240 kfree(system_blks);
241}
242
243/*
244 * Build system zone rbtree which is used for block validity checking.
245 *
246 * The update of system_blks pointer in this function is protected by
247 * sb->s_umount semaphore. However we have to be careful as we can be
248 * racing with ext4_data_block_valid() calls reading system_blks rbtree
249 * protected only by RCU. That's why we first build the rbtree and then
250 * swap it in place.
251 */
183int ext4_setup_system_zone(struct super_block *sb) 252int ext4_setup_system_zone(struct super_block *sb)
184{ 253{
185 ext4_group_t ngroups = ext4_get_groups_count(sb); 254 ext4_group_t ngroups = ext4_get_groups_count(sb);
186 struct ext4_sb_info *sbi = EXT4_SB(sb); 255 struct ext4_sb_info *sbi = EXT4_SB(sb);
256 struct ext4_system_blocks *system_blks;
187 struct ext4_group_desc *gdp; 257 struct ext4_group_desc *gdp;
188 ext4_group_t i; 258 ext4_group_t i;
189 int flex_size = ext4_flex_bg_size(sbi); 259 int flex_size = ext4_flex_bg_size(sbi);
190 int ret; 260 int ret;
191 261
192 if (!test_opt(sb, BLOCK_VALIDITY)) { 262 if (!test_opt(sb, BLOCK_VALIDITY)) {
193 if (sbi->system_blks.rb_node) 263 if (sbi->system_blks)
194 ext4_release_system_zone(sb); 264 ext4_release_system_zone(sb);
195 return 0; 265 return 0;
196 } 266 }
197 if (sbi->system_blks.rb_node) 267 if (sbi->system_blks)
198 return 0; 268 return 0;
199 269
270 system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
271 if (!system_blks)
272 return -ENOMEM;
273
200 for (i=0; i < ngroups; i++) { 274 for (i=0; i < ngroups; i++) {
201 cond_resched(); 275 cond_resched();
202 if (ext4_bg_has_super(sb, i) && 276 if (ext4_bg_has_super(sb, i) &&
203 ((i < 5) || ((i % flex_size) == 0))) 277 ((i < 5) || ((i % flex_size) == 0)))
204 add_system_zone(sbi, ext4_group_first_block_no(sb, i), 278 add_system_zone(system_blks,
279 ext4_group_first_block_no(sb, i),
205 ext4_bg_num_gdb(sb, i) + 1); 280 ext4_bg_num_gdb(sb, i) + 1);
206 gdp = ext4_get_group_desc(sb, i, NULL); 281 gdp = ext4_get_group_desc(sb, i, NULL);
207 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); 282 ret = add_system_zone(system_blks,
283 ext4_block_bitmap(sb, gdp), 1);
208 if (ret) 284 if (ret)
209 return ret; 285 goto err;
210 ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); 286 ret = add_system_zone(system_blks,
287 ext4_inode_bitmap(sb, gdp), 1);
211 if (ret) 288 if (ret)
212 return ret; 289 goto err;
213 ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), 290 ret = add_system_zone(system_blks,
291 ext4_inode_table(sb, gdp),
214 sbi->s_itb_per_group); 292 sbi->s_itb_per_group);
215 if (ret) 293 if (ret)
216 return ret; 294 goto err;
217 } 295 }
218 if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { 296 if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
219 ret = ext4_protect_reserved_inode(sb, 297 ret = ext4_protect_reserved_inode(sb, system_blks,
220 le32_to_cpu(sbi->s_es->s_journal_inum)); 298 le32_to_cpu(sbi->s_es->s_journal_inum));
221 if (ret) 299 if (ret)
222 return ret; 300 goto err;
223 } 301 }
224 302
303 /*
304 * System blks rbtree complete, announce it once to prevent racing
305 * with ext4_data_block_valid() accessing the rbtree at the same
306 * time.
307 */
308 rcu_assign_pointer(sbi->system_blks, system_blks);
309
225 if (test_opt(sb, DEBUG)) 310 if (test_opt(sb, DEBUG))
226 debug_print_tree(sbi); 311 debug_print_tree(sbi);
227 return 0; 312 return 0;
313err:
314 release_system_zone(system_blks);
315 kfree(system_blks);
316 return ret;
228} 317}
229 318
230/* Called when the filesystem is unmounted */ 319/*
320 * Called when the filesystem is unmounted or when remounting it with
321 * noblock_validity specified.
322 *
323 * The update of system_blks pointer in this function is protected by
324 * sb->s_umount semaphore. However we have to be careful as we can be
325 * racing with ext4_data_block_valid() calls reading system_blks rbtree
326 * protected only by RCU. So we first clear the system_blks pointer and
327 * then free the rbtree only after RCU grace period expires.
328 */
231void ext4_release_system_zone(struct super_block *sb) 329void ext4_release_system_zone(struct super_block *sb)
232{ 330{
233 struct ext4_system_zone *entry, *n; 331 struct ext4_system_blocks *system_blks;
234 332
235 rbtree_postorder_for_each_entry_safe(entry, n, 333 system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
236 &EXT4_SB(sb)->system_blks, node) 334 lockdep_is_held(&sb->s_umount));
237 kmem_cache_free(ext4_system_zone_cachep, entry); 335 rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
238 336
239 EXT4_SB(sb)->system_blks = RB_ROOT; 337 if (system_blks)
338 call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
240} 339}
241 340
242/*
243 * Returns 1 if the passed-in block region (start_blk,
244 * start_blk+count) is valid; 0 if some part of the block region
245 * overlaps with filesystem metadata blocks.
246 */
247int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, 341int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
248 unsigned int count) 342 unsigned int count)
249{ 343{
250 struct ext4_system_zone *entry; 344 struct ext4_system_blocks *system_blks;
251 struct rb_node *n = sbi->system_blks.rb_node; 345 int ret;
252 346
253 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 347 /*
254 (start_blk + count < start_blk) || 348 * Lock the system zone to prevent it being released concurrently
255 (start_blk + count > ext4_blocks_count(sbi->s_es))) { 349 * when doing a remount which inverse current "[no]block_validity"
256 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); 350 * mount option.
257 return 0; 351 */
258 } 352 rcu_read_lock();
259 while (n) { 353 system_blks = rcu_dereference(sbi->system_blks);
260 entry = rb_entry(n, struct ext4_system_zone, node); 354 ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
261 if (start_blk + count - 1 < entry->start_blk) 355 count);
262 n = n->rb_left; 356 rcu_read_unlock();
263 else if (start_blk >= (entry->start_blk + entry->count)) 357 return ret;
264 n = n->rb_right;
265 else {
266 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
267 return 0;
268 }
269 }
270 return 1;
271} 358}
272 359
273int ext4_check_blockref(const char *function, unsigned int line, 360int ext4_check_blockref(const char *function, unsigned int line,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86054f31fe4d..9fdd2b269d61 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -668,14 +668,15 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
668 const char *str, const struct qstr *name) 668 const char *str, const struct qstr *name)
669{ 669{
670 struct qstr qstr = {.name = str, .len = len }; 670 struct qstr qstr = {.name = str, .len = len };
671 struct inode *inode = dentry->d_parent->d_inode;
671 672
672 if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { 673 if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) {
673 if (len != name->len) 674 if (len != name->len)
674 return -1; 675 return -1;
675 return memcmp(str, name->name, len); 676 return memcmp(str, name->name, len);
676 } 677 }
677 678
678 return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); 679 return ext4_ci_compare(inode, name, &qstr, false);
679} 680}
680 681
681static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) 682static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
@@ -685,7 +686,7 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
685 unsigned char *norm; 686 unsigned char *norm;
686 int len, ret = 0; 687 int len, ret = 0;
687 688
688 if (!IS_CASEFOLDED(dentry->d_inode)) 689 if (!IS_CASEFOLDED(dentry->d_inode) || !um)
689 return 0; 690 return 0;
690 691
691 norm = kmalloc(PATH_MAX, GFP_ATOMIC); 692 norm = kmalloc(PATH_MAX, GFP_ATOMIC);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 42c6e4a5e673..03db3e71676c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,6 +186,14 @@ struct ext4_map_blocks {
186}; 186};
187 187
188/* 188/*
189 * Block validity checking, system zone rbtree.
190 */
191struct ext4_system_blocks {
192 struct rb_root root;
193 struct rcu_head rcu;
194};
195
196/*
189 * Flags for ext4_io_end->flags 197 * Flags for ext4_io_end->flags
190 */ 198 */
191#define EXT4_IO_END_UNWRITTEN 0x0001 199#define EXT4_IO_END_UNWRITTEN 0x0001
@@ -285,6 +293,9 @@ struct ext4_io_submit {
285 ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) 293 ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
286#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ 294#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
287 ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) 295 ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
296/* Fill in the low bits to get the last block of the cluster */
297#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \
298 ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
288/* Get the cluster offset */ 299/* Get the cluster offset */
289#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ 300#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
290 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) 301 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
@@ -653,6 +664,10 @@ enum {
653#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY 664#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
654#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT 665#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
655#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY 666#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
667/* ioctl codes 19--39 are reserved for fscrypt */
668#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
669#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
670#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
656 671
657#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR 672#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
658#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR 673#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
@@ -666,6 +681,16 @@ enum {
666#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ 681#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
667#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ 682#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
668 683
684/*
685 * Flags returned by EXT4_IOC_GETSTATE
686 *
687 * We only expose to userspace a subset of the state flags in
688 * i_state_flags
689 */
690#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001
691#define EXT4_STATE_FLAG_NEW 0x00000002
692#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
693#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
669 694
670#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 695#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
671/* 696/*
@@ -683,6 +708,12 @@ enum {
683#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 708#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
684#endif 709#endif
685 710
711/*
712 * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
713 * It indicates that the entry in extent status cache is for a hole.
714 */
715#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000
716
686/* Max physical block we can address w/o extents */ 717/* Max physical block we can address w/o extents */
687#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF 718#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
688 719
@@ -812,21 +843,8 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
812static inline void ext4_decode_extra_time(struct timespec64 *time, 843static inline void ext4_decode_extra_time(struct timespec64 *time,
813 __le32 extra) 844 __le32 extra)
814{ 845{
815 if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { 846 if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
816
817#if 1
818 /* Handle legacy encoding of pre-1970 dates with epoch
819 * bits 1,1. (This backwards compatibility may be removed
820 * at the discretion of the ext4 developers.)
821 */
822 u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
823 if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
824 extra_bits = 0;
825 time->tv_sec += extra_bits << 32;
826#else
827 time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; 847 time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
828#endif
829 }
830 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; 848 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
831} 849}
832 850
@@ -1427,7 +1445,7 @@ struct ext4_sb_info {
1427 int s_jquota_fmt; /* Format of quota to use */ 1445 int s_jquota_fmt; /* Format of quota to use */
1428#endif 1446#endif
1429 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ 1447 unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
1430 struct rb_root system_blks; 1448 struct ext4_system_blocks __rcu *system_blks;
1431 1449
1432#ifdef EXTENTS_STATS 1450#ifdef EXTENTS_STATS
1433 /* ext4 extents stats */ 1451 /* ext4 extents stats */
@@ -3267,6 +3285,9 @@ extern int ext4_ext_check_inode(struct inode *inode);
3267extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); 3285extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
3268extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3286extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3269 __u64 start, __u64 len); 3287 __u64 start, __u64 len);
3288extern int ext4_get_es_cache(struct inode *inode,
3289 struct fiemap_extent_info *fieinfo,
3290 __u64 start, __u64 len);
3270extern int ext4_ext_precache(struct inode *inode); 3291extern int ext4_ext_precache(struct inode *inode);
3271extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); 3292extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
3272extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); 3293extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
@@ -3359,6 +3380,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
3359 3380
3360extern const struct iomap_ops ext4_iomap_ops; 3381extern const struct iomap_ops ext4_iomap_ops;
3361 3382
3383static inline int ext4_buffer_uptodate(struct buffer_head *bh)
3384{
3385 /*
3386 * If the buffer has the write error flag, we have failed
3387 * to write out data in the block. In this case, we don't
3388 * have to read the block because we may read the old data
3389 * successfully.
3390 */
3391 if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
3392 set_buffer_uptodate(bh);
3393 return buffer_uptodate(bh);
3394}
3395
3362#endif /* __KERNEL__ */ 3396#endif /* __KERNEL__ */
3363 3397
3364#define EFSBADCRC EBADMSG /* Bad CRC detected */ 3398#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92266a2da7d6..fb0f99dc8c22 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2315,6 +2315,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2315 return err; 2315 return err;
2316} 2316}
2317 2317
2318static int ext4_fill_es_cache_info(struct inode *inode,
2319 ext4_lblk_t block, ext4_lblk_t num,
2320 struct fiemap_extent_info *fieinfo)
2321{
2322 ext4_lblk_t next, end = block + num - 1;
2323 struct extent_status es;
2324 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2325 unsigned int flags;
2326 int err;
2327
2328 while (block <= end) {
2329 next = 0;
2330 flags = 0;
2331 if (!ext4_es_lookup_extent(inode, block, &next, &es))
2332 break;
2333 if (ext4_es_is_unwritten(&es))
2334 flags |= FIEMAP_EXTENT_UNWRITTEN;
2335 if (ext4_es_is_delayed(&es))
2336 flags |= (FIEMAP_EXTENT_DELALLOC |
2337 FIEMAP_EXTENT_UNKNOWN);
2338 if (ext4_es_is_hole(&es))
2339 flags |= EXT4_FIEMAP_EXTENT_HOLE;
2340 if (next == 0)
2341 flags |= FIEMAP_EXTENT_LAST;
2342 if (flags & (FIEMAP_EXTENT_DELALLOC|
2343 EXT4_FIEMAP_EXTENT_HOLE))
2344 es.es_pblk = 0;
2345 else
2346 es.es_pblk = ext4_es_pblock(&es);
2347 err = fiemap_fill_next_extent(fieinfo,
2348 (__u64)es.es_lblk << blksize_bits,
2349 (__u64)es.es_pblk << blksize_bits,
2350 (__u64)es.es_len << blksize_bits,
2351 flags);
2352 if (next == 0)
2353 break;
2354 block = next;
2355 if (err < 0)
2356 return err;
2357 if (err == 1)
2358 return 0;
2359 }
2360 return 0;
2361}
2362
2363
2318/* 2364/*
2319 * ext4_ext_determine_hole - determine hole around given block 2365 * ext4_ext_determine_hole - determine hole around given block
2320 * @inode: inode we lookup in 2366 * @inode: inode we lookup in
@@ -3813,8 +3859,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3813 * illegal. 3859 * illegal.
3814 */ 3860 */
3815 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3861 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3816#ifdef EXT4_DEBUG 3862#ifdef CONFIG_EXT4_DEBUG
3817 ext4_warning("Inode (%ld) finished: extent logical block %llu," 3863 ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3818 " len %u; IO logical block %llu, len %u", 3864 " len %u; IO logical block %llu, len %u",
3819 inode->i_ino, (unsigned long long)ee_block, ee_len, 3865 inode->i_ino, (unsigned long long)ee_block, ee_len,
3820 (unsigned long long)map->m_lblk, map->m_len); 3866 (unsigned long long)map->m_lblk, map->m_len);
@@ -5017,8 +5063,6 @@ static int ext4_find_delayed_extent(struct inode *inode,
5017 5063
5018 return next_del; 5064 return next_del;
5019} 5065}
5020/* fiemap flags we can handle specified here */
5021#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
5022 5066
5023static int ext4_xattr_fiemap(struct inode *inode, 5067static int ext4_xattr_fiemap(struct inode *inode,
5024 struct fiemap_extent_info *fieinfo) 5068 struct fiemap_extent_info *fieinfo)
@@ -5055,10 +5099,16 @@ static int ext4_xattr_fiemap(struct inode *inode,
5055 return (error < 0 ? error : 0); 5099 return (error < 0 ? error : 0);
5056} 5100}
5057 5101
5058int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5102static int _ext4_fiemap(struct inode *inode,
5059 __u64 start, __u64 len) 5103 struct fiemap_extent_info *fieinfo,
5104 __u64 start, __u64 len,
5105 int (*fill)(struct inode *, ext4_lblk_t,
5106 ext4_lblk_t,
5107 struct fiemap_extent_info *))
5060{ 5108{
5061 ext4_lblk_t start_blk; 5109 ext4_lblk_t start_blk;
5110 u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
5111
5062 int error = 0; 5112 int error = 0;
5063 5113
5064 if (ext4_has_inline_data(inode)) { 5114 if (ext4_has_inline_data(inode)) {
@@ -5075,14 +5125,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5075 error = ext4_ext_precache(inode); 5125 error = ext4_ext_precache(inode);
5076 if (error) 5126 if (error)
5077 return error; 5127 return error;
5128 fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
5078 } 5129 }
5079 5130
5080 /* fallback to generic here if not in extents fmt */ 5131 /* fallback to generic here if not in extents fmt */
5081 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5132 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
5133 fill == ext4_fill_fiemap_extents)
5082 return generic_block_fiemap(inode, fieinfo, start, len, 5134 return generic_block_fiemap(inode, fieinfo, start, len,
5083 ext4_get_block); 5135 ext4_get_block);
5084 5136
5085 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 5137 if (fill == ext4_fill_es_cache_info)
5138 ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
5139 if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
5086 return -EBADR; 5140 return -EBADR;
5087 5141
5088 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 5142 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
@@ -5101,12 +5155,36 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5101 * Walk the extent tree gathering extent information 5155 * Walk the extent tree gathering extent information
5102 * and pushing extents back to the user. 5156 * and pushing extents back to the user.
5103 */ 5157 */
5104 error = ext4_fill_fiemap_extents(inode, start_blk, 5158 error = fill(inode, start_blk, len_blks, fieinfo);
5105 len_blks, fieinfo);
5106 } 5159 }
5107 return error; 5160 return error;
5108} 5161}
5109 5162
5163int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5164 __u64 start, __u64 len)
5165{
5166 return _ext4_fiemap(inode, fieinfo, start, len,
5167 ext4_fill_fiemap_extents);
5168}
5169
5170int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
5171 __u64 start, __u64 len)
5172{
5173 if (ext4_has_inline_data(inode)) {
5174 int has_inline;
5175
5176 down_read(&EXT4_I(inode)->xattr_sem);
5177 has_inline = ext4_has_inline_data(inode);
5178 up_read(&EXT4_I(inode)->xattr_sem);
5179 if (has_inline)
5180 return 0;
5181 }
5182
5183 return _ext4_fiemap(inode, fieinfo, start, len,
5184 ext4_fill_es_cache_info);
5185}
5186
5187
5110/* 5188/*
5111 * ext4_access_path: 5189 * ext4_access_path:
5112 * Function to access the path buffer for marking it dirty. 5190 * Function to access the path buffer for marking it dirty.
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 7521de2dcf3a..d996b44d2265 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep;
146 146
147static int __es_insert_extent(struct inode *inode, struct extent_status *newes); 147static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 148static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
149 ext4_lblk_t end); 149 ext4_lblk_t end, int *reserved);
150static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); 150static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
151static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 151static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
152 struct ext4_inode_info *locked_ei); 152 struct ext4_inode_info *locked_ei);
@@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
836 ext4_es_insert_extent_check(inode, &newes); 836 ext4_es_insert_extent_check(inode, &newes);
837 837
838 write_lock(&EXT4_I(inode)->i_es_lock); 838 write_lock(&EXT4_I(inode)->i_es_lock);
839 err = __es_remove_extent(inode, lblk, end); 839 err = __es_remove_extent(inode, lblk, end, NULL);
840 if (err != 0) 840 if (err != 0)
841 goto error; 841 goto error;
842retry: 842retry:
@@ -899,6 +899,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
899 * Return: 1 on found, 0 on not 899 * Return: 1 on found, 0 on not
900 */ 900 */
901int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 901int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
902 ext4_lblk_t *next_lblk,
902 struct extent_status *es) 903 struct extent_status *es)
903{ 904{
904 struct ext4_es_tree *tree; 905 struct ext4_es_tree *tree;
@@ -947,9 +948,18 @@ out:
947 es->es_pblk = es1->es_pblk; 948 es->es_pblk = es1->es_pblk;
948 if (!ext4_es_is_referenced(es1)) 949 if (!ext4_es_is_referenced(es1))
949 ext4_es_set_referenced(es1); 950 ext4_es_set_referenced(es1);
950 stats->es_stats_cache_hits++; 951 percpu_counter_inc(&stats->es_stats_cache_hits);
952 if (next_lblk) {
953 node = rb_next(&es1->rb_node);
954 if (node) {
955 es1 = rb_entry(node, struct extent_status,
956 rb_node);
957 *next_lblk = es1->es_lblk;
958 } else
959 *next_lblk = 0;
960 }
951 } else { 961 } else {
952 stats->es_stats_cache_misses++; 962 percpu_counter_inc(&stats->es_stats_cache_misses);
953 } 963 }
954 964
955 read_unlock(&EXT4_I(inode)->i_es_lock); 965 read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -958,8 +968,322 @@ out:
958 return found; 968 return found;
959} 969}
960 970
971struct rsvd_count {
972 int ndelonly;
973 bool first_do_lblk_found;
974 ext4_lblk_t first_do_lblk;
975 ext4_lblk_t last_do_lblk;
976 struct extent_status *left_es;
977 bool partial;
978 ext4_lblk_t lclu;
979};
980
981/*
982 * init_rsvd - initialize reserved count data before removing block range
983 * in file from extent status tree
984 *
985 * @inode - file containing range
986 * @lblk - first block in range
987 * @es - pointer to first extent in range
988 * @rc - pointer to reserved count data
989 *
990 * Assumes es is not NULL
991 */
992static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
993 struct extent_status *es, struct rsvd_count *rc)
994{
995 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
996 struct rb_node *node;
997
998 rc->ndelonly = 0;
999
1000 /*
1001 * for bigalloc, note the first delonly block in the range has not
1002 * been found, record the extent containing the block to the left of
1003 * the region to be removed, if any, and note that there's no partial
1004 * cluster to track
1005 */
1006 if (sbi->s_cluster_ratio > 1) {
1007 rc->first_do_lblk_found = false;
1008 if (lblk > es->es_lblk) {
1009 rc->left_es = es;
1010 } else {
1011 node = rb_prev(&es->rb_node);
1012 rc->left_es = node ? rb_entry(node,
1013 struct extent_status,
1014 rb_node) : NULL;
1015 }
1016 rc->partial = false;
1017 }
1018}
1019
1020/*
1021 * count_rsvd - count the clusters containing delayed and not unwritten
1022 * (delonly) blocks in a range within an extent and add to
1023 * the running tally in rsvd_count
1024 *
1025 * @inode - file containing extent
1026 * @lblk - first block in range
1027 * @len - length of range in blocks
1028 * @es - pointer to extent containing clusters to be counted
1029 * @rc - pointer to reserved count data
1030 *
1031 * Tracks partial clusters found at the beginning and end of extents so
1032 * they aren't overcounted when they span adjacent extents
1033 */
1034static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
1035 struct extent_status *es, struct rsvd_count *rc)
1036{
1037 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1038 ext4_lblk_t i, end, nclu;
1039
1040 if (!ext4_es_is_delonly(es))
1041 return;
1042
1043 WARN_ON(len <= 0);
1044
1045 if (sbi->s_cluster_ratio == 1) {
1046 rc->ndelonly += (int) len;
1047 return;
1048 }
1049
1050 /* bigalloc */
1051
1052 i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
1053 end = lblk + (ext4_lblk_t) len - 1;
1054 end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
1055
1056 /* record the first block of the first delonly extent seen */
1057 if (rc->first_do_lblk_found == false) {
1058 rc->first_do_lblk = i;
1059 rc->first_do_lblk_found = true;
1060 }
1061
1062 /* update the last lblk in the region seen so far */
1063 rc->last_do_lblk = end;
1064
1065 /*
1066 * if we're tracking a partial cluster and the current extent
1067 * doesn't start with it, count it and stop tracking
1068 */
1069 if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
1070 rc->ndelonly++;
1071 rc->partial = false;
1072 }
1073
1074 /*
1075 * if the first cluster doesn't start on a cluster boundary but
1076 * ends on one, count it
1077 */
1078 if (EXT4_LBLK_COFF(sbi, i) != 0) {
1079 if (end >= EXT4_LBLK_CFILL(sbi, i)) {
1080 rc->ndelonly++;
1081 rc->partial = false;
1082 i = EXT4_LBLK_CFILL(sbi, i) + 1;
1083 }
1084 }
1085
1086 /*
1087 * if the current cluster starts on a cluster boundary, count the
1088 * number of whole delonly clusters in the extent
1089 */
1090 if ((i + sbi->s_cluster_ratio - 1) <= end) {
1091 nclu = (end - i + 1) >> sbi->s_cluster_bits;
1092 rc->ndelonly += nclu;
1093 i += nclu << sbi->s_cluster_bits;
1094 }
1095
1096 /*
1097 * start tracking a partial cluster if there's a partial at the end
1098 * of the current extent and we're not already tracking one
1099 */
1100 if (!rc->partial && i <= end) {
1101 rc->partial = true;
1102 rc->lclu = EXT4_B2C(sbi, i);
1103 }
1104}
1105
1106/*
1107 * __pr_tree_search - search for a pending cluster reservation
1108 *
1109 * @root - root of pending reservation tree
1110 * @lclu - logical cluster to search for
1111 *
1112 * Returns the pending reservation for the cluster identified by @lclu
1113 * if found. If not, returns a reservation for the next cluster if any,
1114 * and if not, returns NULL.
1115 */
1116static struct pending_reservation *__pr_tree_search(struct rb_root *root,
1117 ext4_lblk_t lclu)
1118{
1119 struct rb_node *node = root->rb_node;
1120 struct pending_reservation *pr = NULL;
1121
1122 while (node) {
1123 pr = rb_entry(node, struct pending_reservation, rb_node);
1124 if (lclu < pr->lclu)
1125 node = node->rb_left;
1126 else if (lclu > pr->lclu)
1127 node = node->rb_right;
1128 else
1129 return pr;
1130 }
1131 if (pr && lclu < pr->lclu)
1132 return pr;
1133 if (pr && lclu > pr->lclu) {
1134 node = rb_next(&pr->rb_node);
1135 return node ? rb_entry(node, struct pending_reservation,
1136 rb_node) : NULL;
1137 }
1138 return NULL;
1139}
1140
1141/*
1142 * get_rsvd - calculates and returns the number of cluster reservations to be
1143 * released when removing a block range from the extent status tree
1144 * and releases any pending reservations within the range
1145 *
1146 * @inode - file containing block range
1147 * @end - last block in range
1148 * @right_es - pointer to extent containing next block beyond end or NULL
1149 * @rc - pointer to reserved count data
1150 *
1151 * The number of reservations to be released is equal to the number of
1152 * clusters containing delayed and not unwritten (delonly) blocks within
1153 * the range, minus the number of clusters still containing delonly blocks
1154 * at the ends of the range, and minus the number of pending reservations
1155 * within the range.
1156 */
1157static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
1158 struct extent_status *right_es,
1159 struct rsvd_count *rc)
1160{
1161 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1162 struct pending_reservation *pr;
1163 struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1164 struct rb_node *node;
1165 ext4_lblk_t first_lclu, last_lclu;
1166 bool left_delonly, right_delonly, count_pending;
1167 struct extent_status *es;
1168
1169 if (sbi->s_cluster_ratio > 1) {
1170 /* count any remaining partial cluster */
1171 if (rc->partial)
1172 rc->ndelonly++;
1173
1174 if (rc->ndelonly == 0)
1175 return 0;
1176
1177 first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
1178 last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
1179
1180 /*
1181 * decrease the delonly count by the number of clusters at the
1182 * ends of the range that still contain delonly blocks -
1183 * these clusters still need to be reserved
1184 */
1185 left_delonly = right_delonly = false;
1186
1187 es = rc->left_es;
1188 while (es && ext4_es_end(es) >=
1189 EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
1190 if (ext4_es_is_delonly(es)) {
1191 rc->ndelonly--;
1192 left_delonly = true;
1193 break;
1194 }
1195 node = rb_prev(&es->rb_node);
1196 if (!node)
1197 break;
1198 es = rb_entry(node, struct extent_status, rb_node);
1199 }
1200 if (right_es && (!left_delonly || first_lclu != last_lclu)) {
1201 if (end < ext4_es_end(right_es)) {
1202 es = right_es;
1203 } else {
1204 node = rb_next(&right_es->rb_node);
1205 es = node ? rb_entry(node, struct extent_status,
1206 rb_node) : NULL;
1207 }
1208 while (es && es->es_lblk <=
1209 EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
1210 if (ext4_es_is_delonly(es)) {
1211 rc->ndelonly--;
1212 right_delonly = true;
1213 break;
1214 }
1215 node = rb_next(&es->rb_node);
1216 if (!node)
1217 break;
1218 es = rb_entry(node, struct extent_status,
1219 rb_node);
1220 }
1221 }
1222
1223 /*
1224 * Determine the block range that should be searched for
1225 * pending reservations, if any. Clusters on the ends of the
1226 * original removed range containing delonly blocks are
1227 * excluded. They've already been accounted for and it's not
1228 * possible to determine if an associated pending reservation
1229 * should be released with the information available in the
1230 * extents status tree.
1231 */
1232 if (first_lclu == last_lclu) {
1233 if (left_delonly | right_delonly)
1234 count_pending = false;
1235 else
1236 count_pending = true;
1237 } else {
1238 if (left_delonly)
1239 first_lclu++;
1240 if (right_delonly)
1241 last_lclu--;
1242 if (first_lclu <= last_lclu)
1243 count_pending = true;
1244 else
1245 count_pending = false;
1246 }
1247
1248 /*
1249 * a pending reservation found between first_lclu and last_lclu
1250 * represents an allocated cluster that contained at least one
1251 * delonly block, so the delonly total must be reduced by one
1252 * for each pending reservation found and released
1253 */
1254 if (count_pending) {
1255 pr = __pr_tree_search(&tree->root, first_lclu);
1256 while (pr && pr->lclu <= last_lclu) {
1257 rc->ndelonly--;
1258 node = rb_next(&pr->rb_node);
1259 rb_erase(&pr->rb_node, &tree->root);
1260 kmem_cache_free(ext4_pending_cachep, pr);
1261 if (!node)
1262 break;
1263 pr = rb_entry(node, struct pending_reservation,
1264 rb_node);
1265 }
1266 }
1267 }
1268 return rc->ndelonly;
1269}
1270
1271
1272/*
1273 * __es_remove_extent - removes block range from extent status tree
1274 *
1275 * @inode - file containing range
1276 * @lblk - first block in range
1277 * @end - last block in range
1278 * @reserved - number of cluster reservations released
1279 *
1280 * If @reserved is not NULL and delayed allocation is enabled, counts
1281 * block/cluster reservations freed by removing range and if bigalloc
1282 * enabled cancels pending reservations as needed. Returns 0 on success,
1283 * error code on failure.
1284 */
961static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 1285static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
962 ext4_lblk_t end) 1286 ext4_lblk_t end, int *reserved)
963{ 1287{
964 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; 1288 struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
965 struct rb_node *node; 1289 struct rb_node *node;
@@ -968,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
968 ext4_lblk_t len1, len2; 1292 ext4_lblk_t len1, len2;
969 ext4_fsblk_t block; 1293 ext4_fsblk_t block;
970 int err; 1294 int err;
1295 bool count_reserved = true;
1296 struct rsvd_count rc;
971 1297
1298 if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
1299 count_reserved = false;
972retry: 1300retry:
973 err = 0; 1301 err = 0;
1302
974 es = __es_tree_search(&tree->root, lblk); 1303 es = __es_tree_search(&tree->root, lblk);
975 if (!es) 1304 if (!es)
976 goto out; 1305 goto out;
@@ -979,6 +1308,8 @@ retry:
979 1308
980 /* Simply invalidate cache_es. */ 1309 /* Simply invalidate cache_es. */
981 tree->cache_es = NULL; 1310 tree->cache_es = NULL;
1311 if (count_reserved)
1312 init_rsvd(inode, lblk, es, &rc);
982 1313
983 orig_es.es_lblk = es->es_lblk; 1314 orig_es.es_lblk = es->es_lblk;
984 orig_es.es_len = es->es_len; 1315 orig_es.es_len = es->es_len;
@@ -1020,10 +1351,16 @@ retry:
1020 ext4_es_store_pblock(es, block); 1351 ext4_es_store_pblock(es, block);
1021 } 1352 }
1022 } 1353 }
1354 if (count_reserved)
1355 count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
1356 &orig_es, &rc);
1023 goto out; 1357 goto out;
1024 } 1358 }
1025 1359
1026 if (len1 > 0) { 1360 if (len1 > 0) {
1361 if (count_reserved)
1362 count_rsvd(inode, lblk, orig_es.es_len - len1,
1363 &orig_es, &rc);
1027 node = rb_next(&es->rb_node); 1364 node = rb_next(&es->rb_node);
1028 if (node) 1365 if (node)
1029 es = rb_entry(node, struct extent_status, rb_node); 1366 es = rb_entry(node, struct extent_status, rb_node);
@@ -1032,6 +1369,8 @@ retry:
1032 } 1369 }
1033 1370
1034 while (es && ext4_es_end(es) <= end) { 1371 while (es && ext4_es_end(es) <= end) {
1372 if (count_reserved)
1373 count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
1035 node = rb_next(&es->rb_node); 1374 node = rb_next(&es->rb_node);
1036 rb_erase(&es->rb_node, &tree->root); 1375 rb_erase(&es->rb_node, &tree->root);
1037 ext4_es_free_extent(inode, es); 1376 ext4_es_free_extent(inode, es);
@@ -1046,6 +1385,9 @@ retry:
1046 ext4_lblk_t orig_len = es->es_len; 1385 ext4_lblk_t orig_len = es->es_len;
1047 1386
1048 len1 = ext4_es_end(es) - end; 1387 len1 = ext4_es_end(es) - end;
1388 if (count_reserved)
1389 count_rsvd(inode, es->es_lblk, orig_len - len1,
1390 es, &rc);
1049 es->es_lblk = end + 1; 1391 es->es_lblk = end + 1;
1050 es->es_len = len1; 1392 es->es_len = len1;
1051 if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { 1393 if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
@@ -1054,20 +1396,28 @@ retry:
1054 } 1396 }
1055 } 1397 }
1056 1398
1399 if (count_reserved)
1400 *reserved = get_rsvd(inode, end, es, &rc);
1057out: 1401out:
1058 return err; 1402 return err;
1059} 1403}
1060 1404
1061/* 1405/*
1062 * ext4_es_remove_extent() removes a space from a extent status tree. 1406 * ext4_es_remove_extent - removes block range from extent status tree
1063 * 1407 *
1064 * Return 0 on success, error code on failure. 1408 * @inode - file containing range
1409 * @lblk - first block in range
1410 * @len - number of blocks to remove
1411 *
1412 * Reduces block/cluster reservation count and for bigalloc cancels pending
1413 * reservations as needed. Returns 0 on success, error code on failure.
1065 */ 1414 */
1066int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, 1415int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1067 ext4_lblk_t len) 1416 ext4_lblk_t len)
1068{ 1417{
1069 ext4_lblk_t end; 1418 ext4_lblk_t end;
1070 int err = 0; 1419 int err = 0;
1420 int reserved = 0;
1071 1421
1072 trace_ext4_es_remove_extent(inode, lblk, len); 1422 trace_ext4_es_remove_extent(inode, lblk, len);
1073 es_debug("remove [%u/%u) from extent status tree of inode %lu\n", 1423 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
@@ -1085,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1085 * is reclaimed. 1435 * is reclaimed.
1086 */ 1436 */
1087 write_lock(&EXT4_I(inode)->i_es_lock); 1437 write_lock(&EXT4_I(inode)->i_es_lock);
1088 err = __es_remove_extent(inode, lblk, end); 1438 err = __es_remove_extent(inode, lblk, end, &reserved);
1089 write_unlock(&EXT4_I(inode)->i_es_lock); 1439 write_unlock(&EXT4_I(inode)->i_es_lock);
1090 ext4_es_print_tree(inode); 1440 ext4_es_print_tree(inode);
1441 ext4_da_release_space(inode, reserved);
1091 return err; 1442 return err;
1092} 1443}
1093 1444
@@ -1235,9 +1586,9 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
1235 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", 1586 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1236 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), 1587 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1237 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); 1588 percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1238 seq_printf(seq, " %lu/%lu cache hits/misses\n", 1589 seq_printf(seq, " %lld/%lld cache hits/misses\n",
1239 es_stats->es_stats_cache_hits, 1590 percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
1240 es_stats->es_stats_cache_misses); 1591 percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
1241 if (inode_cnt) 1592 if (inode_cnt)
1242 seq_printf(seq, " %d inodes on list\n", inode_cnt); 1593 seq_printf(seq, " %d inodes on list\n", inode_cnt);
1243 1594
@@ -1264,35 +1615,46 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1264 sbi->s_es_nr_inode = 0; 1615 sbi->s_es_nr_inode = 0;
1265 spin_lock_init(&sbi->s_es_lock); 1616 spin_lock_init(&sbi->s_es_lock);
1266 sbi->s_es_stats.es_stats_shrunk = 0; 1617 sbi->s_es_stats.es_stats_shrunk = 0;
1267 sbi->s_es_stats.es_stats_cache_hits = 0; 1618 err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
1268 sbi->s_es_stats.es_stats_cache_misses = 0; 1619 GFP_KERNEL);
1620 if (err)
1621 return err;
1622 err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
1623 GFP_KERNEL);
1624 if (err)
1625 goto err1;
1269 sbi->s_es_stats.es_stats_scan_time = 0; 1626 sbi->s_es_stats.es_stats_scan_time = 0;
1270 sbi->s_es_stats.es_stats_max_scan_time = 0; 1627 sbi->s_es_stats.es_stats_max_scan_time = 0;
1271 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); 1628 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1272 if (err) 1629 if (err)
1273 return err; 1630 goto err2;
1274 err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); 1631 err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1275 if (err) 1632 if (err)
1276 goto err1; 1633 goto err3;
1277 1634
1278 sbi->s_es_shrinker.scan_objects = ext4_es_scan; 1635 sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1279 sbi->s_es_shrinker.count_objects = ext4_es_count; 1636 sbi->s_es_shrinker.count_objects = ext4_es_count;
1280 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 1637 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1281 err = register_shrinker(&sbi->s_es_shrinker); 1638 err = register_shrinker(&sbi->s_es_shrinker);
1282 if (err) 1639 if (err)
1283 goto err2; 1640 goto err4;
1284 1641
1285 return 0; 1642 return 0;
1286 1643err4:
1287err2:
1288 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); 1644 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1289err1: 1645err3:
1290 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1646 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1647err2:
1648 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1649err1:
1650 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1291 return err; 1651 return err;
1292} 1652}
1293 1653
1294void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) 1654void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1295{ 1655{
1656 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1657 percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1296 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); 1658 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1297 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); 1659 percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1298 unregister_shrinker(&sbi->s_es_shrinker); 1660 unregister_shrinker(&sbi->s_es_shrinker);
@@ -1317,6 +1679,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1317 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); 1679 es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1318 if (!es) 1680 if (!es)
1319 goto out_wrap; 1681 goto out_wrap;
1682
1320 while (*nr_to_scan > 0) { 1683 while (*nr_to_scan > 0) {
1321 if (es->es_lblk > end) { 1684 if (es->es_lblk > end) {
1322 ei->i_es_shrink_lblk = end + 1; 1685 ei->i_es_shrink_lblk = end + 1;
@@ -1374,6 +1737,34 @@ static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1374 return nr_shrunk; 1737 return nr_shrunk;
1375} 1738}
1376 1739
1740/*
1741 * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
1742 * discretionary entries from the extent status cache. (Some entries
1743 * must be present for proper operations.)
1744 */
1745void ext4_clear_inode_es(struct inode *inode)
1746{
1747 struct ext4_inode_info *ei = EXT4_I(inode);
1748 struct extent_status *es;
1749 struct ext4_es_tree *tree;
1750 struct rb_node *node;
1751
1752 write_lock(&ei->i_es_lock);
1753 tree = &EXT4_I(inode)->i_es_tree;
1754 tree->cache_es = NULL;
1755 node = rb_first(&tree->root);
1756 while (node) {
1757 es = rb_entry(node, struct extent_status, rb_node);
1758 node = rb_next(node);
1759 if (!ext4_es_is_delayed(es)) {
1760 rb_erase(&es->rb_node, &tree->root);
1761 ext4_es_free_extent(inode, es);
1762 }
1763 }
1764 ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
1765 write_unlock(&ei->i_es_lock);
1766}
1767
1377#ifdef ES_DEBUG__ 1768#ifdef ES_DEBUG__
1378static void ext4_print_pending_tree(struct inode *inode) 1769static void ext4_print_pending_tree(struct inode *inode)
1379{ 1770{
@@ -1590,7 +1981,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
1590 1981
1591 write_lock(&EXT4_I(inode)->i_es_lock); 1982 write_lock(&EXT4_I(inode)->i_es_lock);
1592 1983
1593 err = __es_remove_extent(inode, lblk, lblk); 1984 err = __es_remove_extent(inode, lblk, lblk, NULL);
1594 if (err != 0) 1985 if (err != 0)
1595 goto error; 1986 goto error;
1596retry: 1987retry:
@@ -1779,93 +2170,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
1779 __remove_pending(inode, last); 2170 __remove_pending(inode, last);
1780 } 2171 }
1781} 2172}
1782
1783/*
1784 * ext4_es_remove_blks - remove block range from extents status tree and
1785 * reduce reservation count or cancel pending
1786 * reservation as needed
1787 *
1788 * @inode - file containing range
1789 * @lblk - first block in range
1790 * @len - number of blocks to remove
1791 *
1792 */
1793void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
1794 ext4_lblk_t len)
1795{
1796 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1797 unsigned int clu_size, reserved = 0;
1798 ext4_lblk_t last_lclu, first, length, remainder, last;
1799 bool delonly;
1800 int err = 0;
1801 struct pending_reservation *pr;
1802 struct ext4_pending_tree *tree;
1803
1804 /*
1805 * Process cluster by cluster for bigalloc - there may be up to
1806 * two clusters in a 4k page with a 1k block size and two blocks
1807 * per cluster. Also necessary for systems with larger page sizes
1808 * and potentially larger block sizes.
1809 */
1810 clu_size = sbi->s_cluster_ratio;
1811 last_lclu = EXT4_B2C(sbi, lblk + len - 1);
1812
1813 write_lock(&EXT4_I(inode)->i_es_lock);
1814
1815 for (first = lblk, remainder = len;
1816 remainder > 0;
1817 first += length, remainder -= length) {
1818
1819 if (EXT4_B2C(sbi, first) == last_lclu)
1820 length = remainder;
1821 else
1822 length = clu_size - EXT4_LBLK_COFF(sbi, first);
1823
1824 /*
1825 * The BH_Delay flag, which triggers calls to this function,
1826 * and the contents of the extents status tree can be
1827 * inconsistent due to writepages activity. So, note whether
1828 * the blocks to be removed actually belong to an extent with
1829 * delayed only status.
1830 */
1831 delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first);
1832
1833 /*
1834 * because of the writepages effect, written and unwritten
1835 * blocks could be removed here
1836 */
1837 last = first + length - 1;
1838 err = __es_remove_extent(inode, first, last);
1839 if (err)
1840 ext4_warning(inode->i_sb,
1841 "%s: couldn't remove page (err = %d)",
1842 __func__, err);
1843
1844 /* non-bigalloc case: simply count the cluster for release */
1845 if (sbi->s_cluster_ratio == 1 && delonly) {
1846 reserved++;
1847 continue;
1848 }
1849
1850 /*
1851 * bigalloc case: if all delayed allocated only blocks have
1852 * just been removed from a cluster, either cancel a pending
1853 * reservation if it exists or count a cluster for release
1854 */
1855 if (delonly &&
1856 !__es_scan_clu(inode, &ext4_es_is_delonly, first)) {
1857 pr = __get_pending(inode, EXT4_B2C(sbi, first));
1858 if (pr != NULL) {
1859 tree = &EXT4_I(inode)->i_pending_tree;
1860 rb_erase(&pr->rb_node, &tree->root);
1861 kmem_cache_free(ext4_pending_cachep, pr);
1862 } else {
1863 reserved++;
1864 }
1865 }
1866 }
1867
1868 write_unlock(&EXT4_I(inode)->i_es_lock);
1869
1870 ext4_da_release_space(inode, reserved);
1871}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 131a8b7df265..825313c59752 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,8 +70,8 @@ struct ext4_es_tree {
70 70
71struct ext4_es_stats { 71struct ext4_es_stats {
72 unsigned long es_stats_shrunk; 72 unsigned long es_stats_shrunk;
73 unsigned long es_stats_cache_hits; 73 struct percpu_counter es_stats_cache_hits;
74 unsigned long es_stats_cache_misses; 74 struct percpu_counter es_stats_cache_misses;
75 u64 es_stats_scan_time; 75 u64 es_stats_scan_time;
76 u64 es_stats_max_scan_time; 76 u64 es_stats_max_scan_time;
77 struct percpu_counter es_stats_all_cnt; 77 struct percpu_counter es_stats_all_cnt;
@@ -140,6 +140,7 @@ extern void ext4_es_find_extent_range(struct inode *inode,
140 ext4_lblk_t lblk, ext4_lblk_t end, 140 ext4_lblk_t lblk, ext4_lblk_t end,
141 struct extent_status *es); 141 struct extent_status *es);
142extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 142extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
143 ext4_lblk_t *next_lblk,
143 struct extent_status *es); 144 struct extent_status *es);
144extern bool ext4_es_scan_range(struct inode *inode, 145extern bool ext4_es_scan_range(struct inode *inode,
145 int (*matching_fn)(struct extent_status *es), 146 int (*matching_fn)(struct extent_status *es),
@@ -246,7 +247,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
246 bool allocated); 247 bool allocated);
247extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, 248extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
248 ext4_lblk_t len); 249 ext4_lblk_t len);
249extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, 250extern void ext4_clear_inode_es(struct inode *inode);
250 ext4_lblk_t len);
251 251
252#endif /* _EXT4_EXTENTS_STATUS_H */ 252#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b8a20bb9a145..8d2bbcc2d813 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -230,8 +230,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
230 if (IS_DAX(inode)) 230 if (IS_DAX(inode))
231 return ext4_dax_write_iter(iocb, from); 231 return ext4_dax_write_iter(iocb, from);
232#endif 232#endif
233 if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
234 return -EOPNOTSUPP;
235 233
236 if (!inode_trylock(inode)) { 234 if (!inode_trylock(inode)) {
237 if (iocb->ki_flags & IOCB_NOWAIT) 235 if (iocb->ki_flags & IOCB_NOWAIT)
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index d358bfcb6b3f..3e133793a5a3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -280,7 +280,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
280 unsigned char *buff; 280 unsigned char *buff;
281 struct qstr qstr = {.name = name, .len = len }; 281 struct qstr qstr = {.name = name, .len = len };
282 282
283 if (len && IS_CASEFOLDED(dir)) { 283 if (len && IS_CASEFOLDED(dir) && um) {
284 buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); 284 buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
285 if (!buff) 285 if (!buff)
286 return -ENOMEM; 286 return -ENOMEM;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 88cdf3c90bd1..2fec62d764fa 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1416,7 +1416,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
1416 err = ext4_htree_store_dirent(dir_file, hinfo->hash, 1416 err = ext4_htree_store_dirent(dir_file, hinfo->hash,
1417 hinfo->minor_hash, de, &tmp_str); 1417 hinfo->minor_hash, de, &tmp_str);
1418 if (err) { 1418 if (err) {
1419 count = err; 1419 ret = err;
1420 goto out; 1420 goto out;
1421 } 1421 }
1422 count++; 1422 count++;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d0dc0e3463db..123e3dee7733 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -527,7 +527,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
527 return -EFSCORRUPTED; 527 return -EFSCORRUPTED;
528 528
529 /* Lookup extent status tree firstly */ 529 /* Lookup extent status tree firstly */
530 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 530 if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
531 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 531 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
532 map->m_pblk = ext4_es_pblock(&es) + 532 map->m_pblk = ext4_es_pblock(&es) +
533 map->m_lblk - es.es_lblk; 533 map->m_lblk - es.es_lblk;
@@ -695,7 +695,7 @@ found:
695 * extent status tree. 695 * extent status tree.
696 */ 696 */
697 if ((flags & EXT4_GET_BLOCKS_PRE_IO) && 697 if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
698 ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 698 ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
699 if (ext4_es_is_written(&es)) 699 if (ext4_es_is_written(&es))
700 goto out_sem; 700 goto out_sem;
701 } 701 }
@@ -1024,7 +1024,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
1024 bh = ext4_getblk(handle, inode, block, map_flags); 1024 bh = ext4_getblk(handle, inode, block, map_flags);
1025 if (IS_ERR(bh)) 1025 if (IS_ERR(bh))
1026 return bh; 1026 return bh;
1027 if (!bh || buffer_uptodate(bh)) 1027 if (!bh || ext4_buffer_uptodate(bh))
1028 return bh; 1028 return bh;
1029 ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); 1029 ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
1030 wait_on_buffer(bh); 1030 wait_on_buffer(bh);
@@ -1051,7 +1051,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
1051 1051
1052 for (i = 0; i < bh_count; i++) 1052 for (i = 0; i < bh_count; i++)
1053 /* Note that NULL bhs[i] is valid because of holes. */ 1053 /* Note that NULL bhs[i] is valid because of holes. */
1054 if (bhs[i] && !buffer_uptodate(bhs[i])) 1054 if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
1055 ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, 1055 ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
1056 &bhs[i]); 1056 &bhs[i]);
1057 1057
@@ -1656,49 +1656,6 @@ void ext4_da_release_space(struct inode *inode, int to_free)
1656 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); 1656 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
1657} 1657}
1658 1658
1659static void ext4_da_page_release_reservation(struct page *page,
1660 unsigned int offset,
1661 unsigned int length)
1662{
1663 int contiguous_blks = 0;
1664 struct buffer_head *head, *bh;
1665 unsigned int curr_off = 0;
1666 struct inode *inode = page->mapping->host;
1667 unsigned int stop = offset + length;
1668 ext4_fsblk_t lblk;
1669
1670 BUG_ON(stop > PAGE_SIZE || stop < length);
1671
1672 head = page_buffers(page);
1673 bh = head;
1674 do {
1675 unsigned int next_off = curr_off + bh->b_size;
1676
1677 if (next_off > stop)
1678 break;
1679
1680 if ((offset <= curr_off) && (buffer_delay(bh))) {
1681 contiguous_blks++;
1682 clear_buffer_delay(bh);
1683 } else if (contiguous_blks) {
1684 lblk = page->index <<
1685 (PAGE_SHIFT - inode->i_blkbits);
1686 lblk += (curr_off >> inode->i_blkbits) -
1687 contiguous_blks;
1688 ext4_es_remove_blks(inode, lblk, contiguous_blks);
1689 contiguous_blks = 0;
1690 }
1691 curr_off = next_off;
1692 } while ((bh = bh->b_this_page) != head);
1693
1694 if (contiguous_blks) {
1695 lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
1696 lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
1697 ext4_es_remove_blks(inode, lblk, contiguous_blks);
1698 }
1699
1700}
1701
1702/* 1659/*
1703 * Delayed allocation stuff 1660 * Delayed allocation stuff
1704 */ 1661 */
@@ -1878,7 +1835,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1878 (unsigned long) map->m_lblk); 1835 (unsigned long) map->m_lblk);
1879 1836
1880 /* Lookup extent status tree firstly */ 1837 /* Lookup extent status tree firstly */
1881 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1838 if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
1882 if (ext4_es_is_hole(&es)) { 1839 if (ext4_es_is_hole(&es)) {
1883 retval = 0; 1840 retval = 0;
1884 down_read(&EXT4_I(inode)->i_data_sem); 1841 down_read(&EXT4_I(inode)->i_data_sem);
@@ -2800,15 +2757,6 @@ static int ext4_writepages(struct address_space *mapping,
2800 goto out_writepages; 2757 goto out_writepages;
2801 } 2758 }
2802 2759
2803 if (ext4_should_dioread_nolock(inode)) {
2804 /*
2805 * We may need to convert up to one extent per block in
2806 * the page and we may dirty the inode.
2807 */
2808 rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
2809 PAGE_SIZE >> inode->i_blkbits);
2810 }
2811
2812 /* 2760 /*
2813 * If we have inline data and arrive here, it means that 2761 * If we have inline data and arrive here, it means that
2814 * we will soon create the block for the 1st page, so 2762 * we will soon create the block for the 1st page, so
@@ -2827,6 +2775,15 @@ static int ext4_writepages(struct address_space *mapping,
2827 ext4_journal_stop(handle); 2775 ext4_journal_stop(handle);
2828 } 2776 }
2829 2777
2778 if (ext4_should_dioread_nolock(inode)) {
2779 /*
2780 * We may need to convert up to one extent per block in
2781 * the page and we may dirty the inode.
2782 */
2783 rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
2784 PAGE_SIZE >> inode->i_blkbits);
2785 }
2786
2830 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2787 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2831 range_whole = 1; 2788 range_whole = 1;
2832 2789
@@ -3242,24 +3199,6 @@ static int ext4_da_write_end(struct file *file,
3242 return ret ? ret : copied; 3199 return ret ? ret : copied;
3243} 3200}
3244 3201
3245static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
3246 unsigned int length)
3247{
3248 /*
3249 * Drop reserved blocks
3250 */
3251 BUG_ON(!PageLocked(page));
3252 if (!page_has_buffers(page))
3253 goto out;
3254
3255 ext4_da_page_release_reservation(page, offset, length);
3256
3257out:
3258 ext4_invalidatepage(page, offset, length);
3259
3260 return;
3261}
3262
3263/* 3202/*
3264 * Force all delayed allocation blocks to be allocated for a given inode. 3203 * Force all delayed allocation blocks to be allocated for a given inode.
3265 */ 3204 */
@@ -4002,7 +3941,7 @@ static const struct address_space_operations ext4_da_aops = {
4002 .write_end = ext4_da_write_end, 3941 .write_end = ext4_da_write_end,
4003 .set_page_dirty = ext4_set_page_dirty, 3942 .set_page_dirty = ext4_set_page_dirty,
4004 .bmap = ext4_bmap, 3943 .bmap = ext4_bmap,
4005 .invalidatepage = ext4_da_invalidatepage, 3944 .invalidatepage = ext4_invalidatepage,
4006 .releasepage = ext4_releasepage, 3945 .releasepage = ext4_releasepage,
4007 .direct_IO = ext4_direct_IO, 3946 .direct_IO = ext4_direct_IO,
4008 .migratepage = buffer_migrate_page, 3947 .migratepage = buffer_migrate_page,
@@ -4314,6 +4253,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
4314 4253
4315 trace_ext4_punch_hole(inode, offset, length, 0); 4254 trace_ext4_punch_hole(inode, offset, length, 0);
4316 4255
4256 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
4257 if (ext4_has_inline_data(inode)) {
4258 down_write(&EXT4_I(inode)->i_mmap_sem);
4259 ret = ext4_convert_inline_data(inode);
4260 up_write(&EXT4_I(inode)->i_mmap_sem);
4261 if (ret)
4262 return ret;
4263 }
4264
4317 /* 4265 /*
4318 * Write out all dirty pages to avoid race conditions 4266 * Write out all dirty pages to avoid race conditions
4319 * Then release them. 4267 * Then release them.
@@ -5137,6 +5085,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
5137 "iget: bogus i_mode (%o)", inode->i_mode); 5085 "iget: bogus i_mode (%o)", inode->i_mode);
5138 goto bad_inode; 5086 goto bad_inode;
5139 } 5087 }
5088 if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
5089 ext4_error_inode(inode, function, line, 0,
5090 "casefold flag without casefold feature");
5140 brelse(iloc.bh); 5091 brelse(iloc.bh);
5141 5092
5142 unlock_new_inode(inode); 5093 unlock_new_inode(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5444d49cbf09..0b7f316fd30f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -745,6 +745,74 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
745 fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); 745 fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
746} 746}
747 747
748/* copied from fs/ioctl.c */
749static int fiemap_check_ranges(struct super_block *sb,
750 u64 start, u64 len, u64 *new_len)
751{
752 u64 maxbytes = (u64) sb->s_maxbytes;
753
754 *new_len = len;
755
756 if (len == 0)
757 return -EINVAL;
758
759 if (start > maxbytes)
760 return -EFBIG;
761
762 /*
763 * Shrink request scope to what the fs can actually handle.
764 */
765 if (len > maxbytes || (maxbytes - len) < start)
766 *new_len = maxbytes - start;
767
768 return 0;
769}
770
771/* So that the fiemap access checks can't overflow on 32 bit machines. */
772#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
773
774static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
775{
776 struct fiemap fiemap;
777 struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
778 struct fiemap_extent_info fieinfo = { 0, };
779 struct inode *inode = file_inode(filp);
780 struct super_block *sb = inode->i_sb;
781 u64 len;
782 int error;
783
784 if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
785 return -EFAULT;
786
787 if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
788 return -EINVAL;
789
790 error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
791 &len);
792 if (error)
793 return error;
794
795 fieinfo.fi_flags = fiemap.fm_flags;
796 fieinfo.fi_extents_max = fiemap.fm_extent_count;
797 fieinfo.fi_extents_start = ufiemap->fm_extents;
798
799 if (fiemap.fm_extent_count != 0 &&
800 !access_ok(fieinfo.fi_extents_start,
801 fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
802 return -EFAULT;
803
804 if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
805 filemap_write_and_wait(inode->i_mapping);
806
807 error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
808 fiemap.fm_flags = fieinfo.fi_flags;
809 fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
810 if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
811 error = -EFAULT;
812
813 return error;
814}
815
748long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 816long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
749{ 817{
750 struct inode *inode = file_inode(filp); 818 struct inode *inode = file_inode(filp);
@@ -1142,6 +1210,33 @@ resizefs_out:
1142 return -EOPNOTSUPP; 1210 return -EOPNOTSUPP;
1143 return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); 1211 return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
1144 1212
1213 case EXT4_IOC_CLEAR_ES_CACHE:
1214 {
1215 if (!inode_owner_or_capable(inode))
1216 return -EACCES;
1217 ext4_clear_inode_es(inode);
1218 return 0;
1219 }
1220
1221 case EXT4_IOC_GETSTATE:
1222 {
1223 __u32 state = 0;
1224
1225 if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED))
1226 state |= EXT4_STATE_FLAG_EXT_PRECACHED;
1227 if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
1228 state |= EXT4_STATE_FLAG_NEW;
1229 if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
1230 state |= EXT4_STATE_FLAG_NEWENTRY;
1231 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE))
1232 state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE;
1233
1234 return put_user(state, (__u32 __user *) arg);
1235 }
1236
1237 case EXT4_IOC_GET_ES_CACHE:
1238 return ext4_ioctl_get_es_cache(filp, arg);
1239
1145 case EXT4_IOC_FSGETXATTR: 1240 case EXT4_IOC_FSGETXATTR:
1146 { 1241 {
1147 struct fsxattr fa; 1242 struct fsxattr fa;
@@ -1278,6 +1373,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1278 case FS_IOC_GETFSMAP: 1373 case FS_IOC_GETFSMAP:
1279 case FS_IOC_ENABLE_VERITY: 1374 case FS_IOC_ENABLE_VERITY:
1280 case FS_IOC_MEASURE_VERITY: 1375 case FS_IOC_MEASURE_VERITY:
1376 case EXT4_IOC_CLEAR_ES_CACHE:
1377 case EXT4_IOC_GETSTATE:
1378 case EXT4_IOC_GET_ES_CACHE:
1281 break; 1379 break;
1282 default: 1380 default:
1283 return -ENOIOCTLCMD; 1381 return -ENOIOCTLCMD;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 129029534075..a427d2031a8d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1312,7 +1312,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
1312{ 1312{
1313 int len; 1313 int len;
1314 1314
1315 if (!IS_CASEFOLDED(dir)) { 1315 if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
1316 cf_name->name = NULL; 1316 cf_name->name = NULL;
1317 return; 1317 return;
1318 } 1318 }
@@ -2183,7 +2183,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
2183 2183
2184#ifdef CONFIG_UNICODE 2184#ifdef CONFIG_UNICODE
2185 if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && 2185 if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
2186 utf8_validate(sbi->s_encoding, &dentry->d_name)) 2186 sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
2187 return -EINVAL; 2187 return -EINVAL;
2188#endif 2188#endif
2189 2189
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3db5f17228b7..dd654e53ba3d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1878,6 +1878,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1878 } else if (token == Opt_commit) { 1878 } else if (token == Opt_commit) {
1879 if (arg == 0) 1879 if (arg == 0)
1880 arg = JBD2_DEFAULT_MAX_COMMIT_AGE; 1880 arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1881 else if (arg > INT_MAX / HZ) {
1882 ext4_msg(sb, KERN_ERR,
1883 "Invalid commit interval %d, "
1884 "must be smaller than %d",
1885 arg, INT_MAX / HZ);
1886 return -1;
1887 }
1881 sbi->s_commit_interval = HZ * arg; 1888 sbi->s_commit_interval = HZ * arg;
1882 } else if (token == Opt_debug_want_extra_isize) { 1889 } else if (token == Opt_debug_want_extra_isize) {
1883 sbi->s_want_extra_isize = arg; 1890 sbi->s_want_extra_isize = arg;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 69b9bc329964..f08073d7bbf5 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -638,10 +638,8 @@ static void flush_descriptor(journal_t *journal,
638{ 638{
639 jbd2_journal_revoke_header_t *header; 639 jbd2_journal_revoke_header_t *header;
640 640
641 if (is_journal_aborted(journal)) { 641 if (is_journal_aborted(journal))
642 put_bh(descriptor);
643 return; 642 return;
644 }
645 643
646 header = (jbd2_journal_revoke_header_t *)descriptor->b_data; 644 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
647 header->r_count = cpu_to_be32(offset); 645 header->r_count = cpu_to_be32(offset);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 990e7b5062e7..afc06daee5bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -569,6 +569,9 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
569 } 569 }
570 handle->h_type = type; 570 handle->h_type = type;
571 handle->h_line_no = line_no; 571 handle->h_line_no = line_no;
572 trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
573 handle->h_transaction->t_tid, type,
574 line_no, handle->h_buffer_credits);
572 return 0; 575 return 0;
573} 576}
574EXPORT_SYMBOL(jbd2_journal_start_reserved); 577EXPORT_SYMBOL(jbd2_journal_start_reserved);
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 71ca4d047d65..2a878b739115 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -154,7 +154,7 @@ static int utf8_parse_version(const char *version, unsigned int *maj,
154{ 154{
155 substring_t args[3]; 155 substring_t args[3];
156 char version_string[12]; 156 char version_string[12];
157 const struct match_token token[] = { 157 static const struct match_token token[] = {
158 {1, "%d.%d.%d"}, 158 {1, "%d.%d.%d"},
159 {0, NULL} 159 {0, NULL}
160 }; 160 };
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 6c1a36bbf6ad..6fe8af7edccb 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -35,7 +35,7 @@ unsigned int total_tests;
35#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) 35#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
36#define test(cond) _test(cond, __func__, __LINE__, "") 36#define test(cond) _test(cond, __func__, __LINE__, "")
37 37
38const static struct { 38static const struct {
39 /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 39 /* UTF-8 strings in this vector _must_ be NULL-terminated. */
40 unsigned char str[10]; 40 unsigned char str[10];
41 unsigned char dec[10]; 41 unsigned char dec[10];
@@ -89,7 +89,7 @@ const static struct {
89 89
90}; 90};
91 91
92const static struct { 92static const struct {
93 /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 93 /* UTF-8 strings in this vector _must_ be NULL-terminated. */
94 unsigned char str[30]; 94 unsigned char str[30];
95 unsigned char ncf[30]; 95 unsigned char ncf[30];