author     Linus Torvalds <torvalds@linux-foundation.org>  2012-10-07 17:36:39 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-10-07 17:36:39 -0400
commit     6432f2128414edbea5fd4f6c4fa4c28d0e1c6151 (patch)
tree       d3c63c5f2f043ce52d98d8dfd3c9c0a7bc76ed95 /fs/ext4
parent     1b033447bf847ba49c3816c564c9191c97456b36 (diff)
parent     c278531d39f3158bfee93dc67da0b77e09776de2 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
 "The big new feature added this time is supporting online resizing
  using the meta_bg feature.  This allows us to resize file systems
  which are greater than 16TB.  In addition, the speed of online
  resizing has been improved in general.

  We also fix a number of races, some of which could lead to deadlocks,
  in ext4's Asynchronous I/O and online defrag support, thanks to good
  work by Dmitry Monakhov.

  There are also a large number of more minor bug fixes and cleanups
  from a number of other ext4 contributors, quite a few of whom have
  submitted fixes for the first time."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (69 commits)
  ext4: fix ext4_flush_completed_IO wait semantics
  ext4: fix mtime update in nodelalloc mode
  ext4: fix ext_remove_space for punch_hole case
  ext4: punch_hole should wait for DIO writers
  ext4: serialize truncate with owerwrite DIO workers
  ext4: endless truncate due to nonlocked dio readers
  ext4: serialize unlocked dio reads with truncate
  ext4: serialize dio nonlocked reads with defrag workers
  ext4: completed_io locking cleanup
  ext4: fix unwritten counter leakage
  ext4: give i_aiodio_unwritten a more appropriate name
  ext4: ext4_inode_info diet
  ext4: convert to use leXX_add_cpu()
  ext4: ext4_bread usage audit
  fs: reserve fallocate flag codepoint
  ext4: remove redundant offset check in mext_check_arguments()
  ext4: don't clear orphan list on ro mount with errors
  jbd2: fix assertion failure in commit code due to lacking transaction credits
  ext4: release donor reference when EXT4_IOC_MOVE_EXT ioctl fails
  ext4: enable FITRIM ioctl on bigalloc file system
  ...
Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/ext4.h        |  49
-rw-r--r--  fs/ext4/extents.c     | 258
-rw-r--r--  fs/ext4/file.c        |   6
-rw-r--r--  fs/ext4/fsync.c       |  92
-rw-r--r--  fs/ext4/ialloc.c      |   9
-rw-r--r--  fs/ext4/indirect.c    |  18
-rw-r--r--  fs/ext4/inode.c       |  83
-rw-r--r--  fs/ext4/ioctl.c       |  22
-rw-r--r--  fs/ext4/mballoc.c     | 129
-rw-r--r--  fs/ext4/mballoc.h     |   5
-rw-r--r--  fs/ext4/move_extent.c | 520
-rw-r--r--  fs/ext4/namei.c       | 105
-rw-r--r--  fs/ext4/page-io.c     | 176
-rw-r--r--  fs/ext4/resize.c      | 432
-rw-r--r--  fs/ext4/super.c       |  92
15 files changed, 1260 insertions, 736 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c3411d4ce2da..3ab2539b7b2e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,7 +186,6 @@ struct mpage_da_data {
 #define EXT4_IO_END_ERROR       0x0002
 #define EXT4_IO_END_QUEUED      0x0004
 #define EXT4_IO_END_DIRECT      0x0008
-#define EXT4_IO_END_IN_FSYNC    0x0010
 
 struct ext4_io_page {
         struct page     *p_page;
@@ -912,9 +911,7 @@ struct ext4_inode_info {
         struct list_head i_completed_io_list;
         spinlock_t i_completed_io_lock;
         atomic_t i_ioend_count; /* Number of outstanding io_end structs */
-        /* current io_end structure for async DIO write*/
-        ext4_io_end_t *cur_aio_dio;
-        atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+        atomic_t i_unwritten; /* Nr. of inflight conversions pending */
 
         spinlock_t i_block_reservation_lock;
 
@@ -1233,6 +1230,7 @@ struct ext4_sb_info {
         spinlock_t s_md_lock;
         unsigned short *s_mb_offsets;
         unsigned int *s_mb_maxs;
+        unsigned int s_group_info_size;
 
         /* tunables */
         unsigned long s_stripe;
@@ -1243,6 +1241,7 @@ struct ext4_sb_info {
         unsigned int s_mb_order2_reqs;
         unsigned int s_mb_group_prealloc;
         unsigned int s_max_writeback_mb_bump;
+        unsigned int s_max_dir_size_kb;
         /* where last allocation was done - for stream allocation */
         unsigned long s_mb_last_group;
         unsigned long s_mb_last_start;
@@ -1270,8 +1269,12 @@ struct ext4_sb_info {
         unsigned long s_sectors_written_start;
         u64 s_kbytes_written;
 
+        /* the size of zero-out chunk */
+        unsigned int s_extent_max_zeroout_kb;
+
         unsigned int s_log_groups_per_flex;
         struct flex_groups *s_flex_groups;
+        ext4_group_t s_flex_groups_allocated;
 
         /* workqueue for dio unwritten */
         struct workqueue_struct *dio_unwritten_wq;
@@ -1328,10 +1331,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 {
         if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                 io_end->flag |= EXT4_IO_END_UNWRITTEN;
-                atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+                atomic_inc(&EXT4_I(inode)->i_unwritten);
         }
 }
 
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+        return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+        inode->i_private = io;
+}
+
 /*
  * Inode dynamic state flags
  */
@@ -1345,6 +1358,8 @@ enum {
         EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
         EXT4_STATE_NEWENTRY,            /* File just added to dir */
         EXT4_STATE_DELALLOC_RESERVED,   /* blks already reserved for delalloc */
+        EXT4_STATE_DIOREAD_LOCK,        /* Disable support for dio read
+                                           nolocking */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1932,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1966,6 +1981,8 @@ extern void ext4_exit_mballoc(void);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                              struct buffer_head *bh, ext4_fsblk_t block,
                              unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+                                   ext4_group_t ngroups);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
                                  ext4_group_t i, struct ext4_group_desc *desc);
 extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
@@ -2051,6 +2068,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb,
 extern void *ext4_kvmalloc(size_t size, gfp_t flags);
 extern void *ext4_kvzalloc(size_t size, gfp_t flags);
 extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+                                    ext4_group_t ngroup);
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
                   const char *, ...);
@@ -2352,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations;
 extern const struct inode_operations ext4_file_inode_operations;
 extern const struct file_operations ext4_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);
 
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
@@ -2400,11 +2420,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
 extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
                                struct page *page,
@@ -2452,6 +2472,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
         set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+        ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+        smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+        smp_mb();
+        ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
 
 /* For ioend & aio unwritten conversion wait queues */
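The ext4_inode_block_unlocked_dio()/ext4_inode_resume_unlocked_dio() helpers above pair with a reader-side check (added in indirect.c further down) that takes an i_dio_count reference before testing the flag, so a writer that sets the flag and then waits for the count to drain cannot race with a reader slipping past the test. A minimal userspace sketch of that handshake, using C11 atomics as a stand-in for the kernel's inode-state bits and smp_mb(); every name here is illustrative, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool dioread_locked;  /* stands in for EXT4_STATE_DIOREAD_LOCK */
static atomic_int  dio_count;       /* stands in for inode->i_dio_count */

/* Writer side: mirrors ext4_inode_block_unlocked_dio() + inode_dio_wait(). */
static void block_unlocked_dio(void)
{
        atomic_store(&dioread_locked, true);
        atomic_thread_fence(memory_order_seq_cst);      /* the smp_mb() */
        while (atomic_load(&dio_count) > 0)
                ;       /* a real implementation would sleep on a waitqueue */
}

static void resume_unlocked_dio(void)
{
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store(&dioread_locked, false);
}

/* Reader side: elevate the ref *before* checking the flag. */
static int try_unlocked_read(void)
{
        atomic_fetch_add(&dio_count, 1);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load(&dioread_locked)) {
                atomic_fetch_sub(&dio_count, 1);        /* inode_dio_done() */
                return 0;       /* caller falls back to the locked path */
        }
        /* ... an unlocked direct read would run here ... */
        atomic_fetch_sub(&dio_count, 1);
        return 1;
}

int main(void)
{
        block_unlocked_dio();
        printf("unlocked read allowed? %d\n", try_unlocked_read()); /* 0 */
        resume_unlocked_dio();
        printf("unlocked read allowed? %d\n", try_unlocked_read()); /* 1 */
        return 0;
}

Setting the flag before the fence on the writer side, and bumping the count before the fence on the reader side, guarantees that at least one of the two parties observes the other.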
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aabbb3f53683..1c94cca35ed1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                   ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
-        neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+        le16_add_cpu(&neh->eh_depth, 1);
         ext4_mark_inode_dirty(handle, inode);
 out:
         brelse(bh);
@@ -1656,16 +1656,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 }
 
 /*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+                                     struct inode *inode,
+                                     struct ext4_ext_path *path)
+{
+        size_t s;
+        unsigned max_root = ext4_ext_space_root(inode, 0);
+        ext4_fsblk_t blk;
+
+        if ((path[0].p_depth != 1) ||
+            (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+            (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+                return;
+
+        /*
+         * We need to modify the block allocation bitmap and the block
+         * group descriptor to release the extent tree block.  If we
+         * can't get the journal credits, give up.
+         */
+        if (ext4_journal_extend(handle, 2))
+                return;
+
+        /*
+         * Copy the extent data up to the inode
+         */
+        blk = ext4_idx_pblock(path[0].p_idx);
+        s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+                sizeof(struct ext4_extent_idx);
+        s += sizeof(struct ext4_extent_header);
+
+        memcpy(path[0].p_hdr, path[1].p_hdr, s);
+        path[0].p_depth = 0;
+        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+                (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+        path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+        brelse(path[1].p_bh);
+        ext4_free_blocks(handle, inode, NULL, blk, 1,
+                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
+/*
  * This function tries to merge the @ex extent to neighbours in the tree.
  * return 1 if merge left else 0.
  */
-static int ext4_ext_try_to_merge(struct inode *inode,
+static void ext4_ext_try_to_merge(handle_t *handle,
+                                  struct inode *inode,
                                   struct ext4_ext_path *path,
                                   struct ext4_extent *ex) {
         struct ext4_extent_header *eh;
         unsigned int depth;
         int merge_done = 0;
-        int ret = 0;
 
         depth = ext_depth(inode);
         BUG_ON(path[depth].p_hdr == NULL);
@@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode,
                 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
 
         if (!merge_done)
-                ret = ext4_ext_try_to_merge_right(inode, path, ex);
+                (void) ext4_ext_try_to_merge_right(inode, path, ex);
 
-        return ret;
+        ext4_ext_try_to_merge_up(handle, inode, path);
 }
 
 /*
@@ -1893,7 +1937,7 @@ has_space:
 merge:
         /* try to merge extents */
         if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
-                ext4_ext_try_to_merge(inode, path, nearex);
+                ext4_ext_try_to_merge(handle, inode, path, nearex);
 
 
         /* time to correct all indexes above */
@@ -1901,7 +1945,7 @@ merge:
         if (err)
                 goto cleanup;
 
-        err = ext4_ext_dirty(handle, inode, path + depth);
+        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 
 cleanup:
         if (npath) {
@@ -2092,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
- * ext4_ext_check_cache()
+ * ext4_ext_in_cache()
  * Checks to see if the given block is in the cache.
  * If it is, the cached extent is stored in the given
- * cache extent pointer.  If the cached extent is a hole,
- * this routine should be used instead of
- * ext4_ext_in_cache if the calling function needs to
- * know the size of the hole.
+ * cache extent pointer.
  *
  * @inode: The files inode
  * @block: The block to look for in the cache
@@ -2107,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
  *
  * Return 0 if cache is invalid; 1 if the cache is valid
  */
-static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
-                                struct ext4_ext_cache *ex){
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+                  struct ext4_extent *ex)
+{
         struct ext4_ext_cache *cex;
         struct ext4_sb_info *sbi;
         int ret = 0;
@@ -2125,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
                 goto errout;
 
         if (in_range(block, cex->ec_block, cex->ec_len)) {
-                memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+                ex->ee_block = cpu_to_le32(cex->ec_block);
+                ext4_ext_store_pblock(ex, cex->ec_start);
+                ex->ee_len = cpu_to_le16(cex->ec_len);
                 ext_debug("%u cached by %u:%u:%llu\n",
                           block,
                           cex->ec_block, cex->ec_len, cex->ec_start);
@@ -2138,37 +2183,6 @@ errout:
 }
 
 /*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- *      if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-                  struct ext4_extent *ex)
-{
-        struct ext4_ext_cache cex;
-        int ret = 0;
-
-        if (ext4_ext_check_cache(inode, block, &cex)) {
-                ex->ee_block = cpu_to_le32(cex.ec_block);
-                ext4_ext_store_pblock(ex, cex.ec_start);
-                ex->ee_len = cpu_to_le16(cex.ec_len);
-                ret = 1;
-        }
-
-        return ret;
-}
-
-
-/*
  * ext4_ext_rm_idx:
  * removes index from the index block.
  */
@@ -2274,10 +2288,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
         unsigned short ee_len = ext4_ext_get_actual_len(ex);
         ext4_fsblk_t pblk;
-        int flags = EXT4_FREE_BLOCKS_FORGET;
+        int flags = 0;
 
         if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-                flags |= EXT4_FREE_BLOCKS_METADATA;
+                flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+        else if (ext4_should_journal_data(inode))
+                flags |= EXT4_FREE_BLOCKS_FORGET;
+
         /*
          * For bigalloc file systems, we never free a partial cluster
          * at the beginning of the extent.  Instead, we make a note
@@ -2572,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
         struct ext4_ext_path *path = NULL;
         ext4_fsblk_t partial_cluster = 0;
         handle_t *handle;
-        int i = 0, err;
+        int i = 0, err = 0;
 
         ext_debug("truncate since %u to %u\n", start, end);
 
@@ -2604,12 +2621,16 @@ again:
                 return PTR_ERR(path);
         }
         depth = ext_depth(inode);
+        /* Leaf not may not exist only if inode has no blocks at all */
         ex = path[depth].p_ext;
         if (!ex) {
-                ext4_ext_drop_refs(path);
-                kfree(path);
-                path = NULL;
-                goto cont;
+                if (depth) {
+                        EXT4_ERROR_INODE(inode,
+                                         "path[%d].p_hdr == NULL",
+                                         depth);
+                        err = -EIO;
+                }
+                goto out;
         }
 
         ee_block = le32_to_cpu(ex->ee_block);
@@ -2641,8 +2662,6 @@ again:
                         goto out;
                 }
         }
-cont:
-
         /*
          * We start scanning from right side, freeing all the blocks
          * after i_size and walking into the tree depth-wise.
@@ -2924,9 +2943,9 @@ static int ext4_split_extent_at(handle_t *handle,
                 ext4_ext_mark_initialized(ex);
 
                 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
-                        ext4_ext_try_to_merge(inode, path, ex);
+                        ext4_ext_try_to_merge(handle, inode, path, ex);
 
-                err = ext4_ext_dirty(handle, inode, path + depth);
+                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                 goto out;
         }
 
@@ -2958,8 +2977,8 @@ static int ext4_split_extent_at(handle_t *handle,
                         goto fix_extent_len;
                 /* update the extent length and mark as initialized */
                 ex->ee_len = cpu_to_le16(ee_len);
-                ext4_ext_try_to_merge(inode, path, ex);
-                err = ext4_ext_dirty(handle, inode, path + depth);
+                ext4_ext_try_to_merge(handle, inode, path, ex);
+                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                 goto out;
         } else if (err)
                 goto fix_extent_len;
@@ -3041,7 +3060,6 @@ out:
         return err ? err : map->m_len;
 }
 
-#define EXT4_EXT_ZERO_LEN 7
 /*
  * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -3067,13 +3085,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                            struct ext4_map_blocks *map,
                                            struct ext4_ext_path *path)
 {
+        struct ext4_sb_info *sbi;
         struct ext4_extent_header *eh;
         struct ext4_map_blocks split_map;
         struct ext4_extent zero_ex;
         struct ext4_extent *ex;
         ext4_lblk_t ee_block, eof_block;
         unsigned int ee_len, depth;
-        int allocated;
+        int allocated, max_zeroout = 0;
         int err = 0;
         int split_flag = 0;
 
@@ -3081,6 +3100,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 "block %llu, max_blocks %u\n", inode->i_ino,
                 (unsigned long long)map->m_lblk, map->m_len);
 
+        sbi = EXT4_SB(inode->i_sb);
         eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                 inode->i_sb->s_blocksize_bits;
         if (eof_block < map->m_lblk + map->m_len)
@@ -3180,9 +3200,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
          */
         split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
-        /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-        if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
-            (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+                max_zeroout = sbi->s_extent_max_zeroout_kb >>
+                        inode->i_sb->s_blocksize_bits;
+
+        /* If extent is less than s_max_zeroout_kb, zeroout directly */
+        if (max_zeroout && (ee_len <= max_zeroout)) {
                 err = ext4_ext_zeroout(inode, ex);
                 if (err)
                         goto out;
@@ -3191,8 +3214,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 if (err)
                         goto out;
                 ext4_ext_mark_initialized(ex);
-                ext4_ext_try_to_merge(inode, path, ex);
-                err = ext4_ext_dirty(handle, inode, path + depth);
+                ext4_ext_try_to_merge(handle, inode, path, ex);
+                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                 goto out;
         }
 
@@ -3206,9 +3229,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
         split_map.m_lblk = map->m_lblk;
         split_map.m_len = map->m_len;
 
-        if (allocated > map->m_len) {
-                if (allocated <= EXT4_EXT_ZERO_LEN &&
-                    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+        if (max_zeroout && (allocated > map->m_len)) {
+                if (allocated <= max_zeroout) {
                         /* case 3 */
                         zero_ex.ee_block =
                                          cpu_to_le32(map->m_lblk);
@@ -3220,9 +3242,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                 goto out;
                         split_map.m_lblk = map->m_lblk;
                         split_map.m_len = allocated;
-                } else if ((map->m_lblk - ee_block + map->m_len <
-                           EXT4_EXT_ZERO_LEN) &&
-                           (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+                } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
                         /* case 2 */
                         if (map->m_lblk != ee_block) {
                                 zero_ex.ee_block = ex->ee_block;
@@ -3242,7 +3262,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
         }
 
         allocated = ext4_split_extent(handle, inode, path,
-                                       &split_map, split_flag, 0);
+                                      &split_map, split_flag, 0);
         if (allocated < 0)
                 err = allocated;
 
@@ -3256,7 +3276,7 @@ out:
  * to an uninitialized extent.
  *
  * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /initialized uninitialized extents (up to three)
+ * extent into multiple initialized/uninitialized extents (up to three)
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be uninitialized
 *   b> Splits in two extents: Write is happening at either end of the extent
@@ -3333,10 +3353,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
         /* note: ext4_ext_correct_indexes() isn't needed here because
          * borders are not changed
          */
-        ext4_ext_try_to_merge(inode, path, ex);
+        ext4_ext_try_to_merge(handle, inode, path, ex);
 
         /* Mark modified extent as dirty */
-        err = ext4_ext_dirty(handle, inode, path + depth);
+        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
 out:
         ext4_ext_show_leaf(inode, path);
         return err;
@@ -3600,7 +3620,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 {
         int ret = 0;
         int err = 0;
-        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+        ext4_io_end_t *io = ext4_inode_aio(inode);
 
         ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
                   "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3615,6 +3635,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
         if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                 ret = ext4_split_unwritten_extents(handle, inode, map,
                                                    path, flags);
+                if (ret <= 0)
+                        goto out;
                 /*
                  * Flag the inode(non aio case) or end_io struct (aio case)
                  * that this IO needs to conversion to written when IO is
@@ -3858,8 +3880,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
         unsigned int allocated = 0, offset = 0;
         unsigned int allocated_clusters = 0;
         struct ext4_allocation_request ar;
-        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+        ext4_io_end_t *io = ext4_inode_aio(inode);
         ext4_lblk_t cluster_offset;
+        int set_unwritten = 0;
 
         ext_debug("blocks %u/%u requested for inode %lu\n",
                   map->m_lblk, map->m_len, inode->i_ino);
@@ -4082,13 +4105,8 @@ got_allocated_blocks:
                  * For non asycn direct IO case, flag the inode state
                  * that we need to perform conversion when IO is done.
                  */
-                if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                        if (io)
-                                ext4_set_io_unwritten_flag(inode, io);
-                        else
-                                ext4_set_inode_state(inode,
-                                                     EXT4_STATE_DIO_UNWRITTEN);
-                }
+                if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+                        set_unwritten = 1;
                 if (ext4_should_dioread_nolock(inode))
                         map->m_flags |= EXT4_MAP_UNINIT;
         }
@@ -4100,6 +4118,15 @@ got_allocated_blocks:
         if (!err)
                 err = ext4_ext_insert_extent(handle, inode, path,
                                              &newex, flags);
+
+        if (!err && set_unwritten) {
+                if (io)
+                        ext4_set_io_unwritten_flag(inode, io);
+                else
+                        ext4_set_inode_state(inode,
+                                             EXT4_STATE_DIO_UNWRITTEN);
+        }
+
         if (err && free_on_err) {
                 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
                         EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4241,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode)
          * finish any pending end_io work so we won't run the risk of
          * converting any truncated blocks to initialized later
          */
-        ext4_flush_completed_IO(inode);
+        ext4_flush_unwritten_io(inode);
 
         /*
          * probably first extent we're gonna free will be last in block
@@ -4769,9 +4796,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
         loff_t first_page_offset, last_page_offset;
         int credits, err = 0;
 
+        /*
+         * Write out all dirty pages to avoid race conditions
+         * Then release them.
+         */
+        if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+                err = filemap_write_and_wait_range(mapping,
+                        offset, offset + length - 1);
+
+                if (err)
+                        return err;
+        }
+
+        mutex_lock(&inode->i_mutex);
+        /* It's not possible punch hole on append only file */
+        if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+                err = -EPERM;
+                goto out_mutex;
+        }
+        if (IS_SWAPFILE(inode)) {
+                err = -ETXTBSY;
+                goto out_mutex;
+        }
+
         /* No need to punch hole beyond i_size */
         if (offset >= inode->i_size)
-                return 0;
+                goto out_mutex;
 
         /*
          * If the hole extends beyond i_size, set the hole
@@ -4789,35 +4839,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
         first_page_offset = first_page << PAGE_CACHE_SHIFT;
         last_page_offset = last_page << PAGE_CACHE_SHIFT;
 
-        /*
-         * Write out all dirty pages to avoid race conditions
-         * Then release them.
-         */
-        if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-                err = filemap_write_and_wait_range(mapping,
-                        offset, offset + length - 1);
-
-                if (err)
-                        return err;
-        }
-
         /* Now release the pages */
         if (last_page_offset > first_page_offset) {
                 truncate_pagecache_range(inode, first_page_offset,
                                          last_page_offset - 1);
         }
 
-        /* finish any pending end_io work */
-        ext4_flush_completed_IO(inode);
+        /* Wait all existing dio workers, newcomers will block on i_mutex */
+        ext4_inode_block_unlocked_dio(inode);
+        err = ext4_flush_unwritten_io(inode);
+        if (err)
+                goto out_dio;
+        inode_dio_wait(inode);
 
         credits = ext4_writepage_trans_blocks(inode);
         handle = ext4_journal_start(inode, credits);
-        if (IS_ERR(handle))
-                return PTR_ERR(handle);
+        if (IS_ERR(handle)) {
+                err = PTR_ERR(handle);
+                goto out_dio;
+        }
 
-        err = ext4_orphan_add(handle, inode);
-        if (err)
-                goto out;
 
         /*
          * Now we need to zero out the non-page-aligned data in the
@@ -4903,10 +4944,13 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
         up_write(&EXT4_I(inode)->i_data_sem);
 
 out:
-        ext4_orphan_del(handle, inode);
         inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
         ext4_mark_inode_dirty(handle, inode);
         ext4_journal_stop(handle);
+out_dio:
+        ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+        mutex_unlock(&inode->i_mutex);
         return err;
 }
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
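The first hunk in this file swaps the open-coded cpu_to_le16(le16_to_cpu(x) + 1) sequence for le16_add_cpu(), per the "convert to use leXX_add_cpu()" commit in the pull. A rough userspace illustration of what such a helper does to an on-disk little-endian field (a simplified assumption of the semantics, not the kernel implementation):

#include <stdint.h>
#include <stdio.h>

/* A 16-bit value stored in little-endian byte order, as on disk. */
typedef uint16_t le16;

static uint16_t le16_to_cpu(le16 v)
{
        const uint8_t *p = (const uint8_t *)&v;
        return (uint16_t)(p[0] | (p[1] << 8));  /* byte-wise, endian-safe */
}

static le16 cpu_to_le16(uint16_t v)
{
        le16 r;
        uint8_t *p = (uint8_t *)&r;
        p[0] = v & 0xff;
        p[1] = v >> 8;
        return r;
}

/* The point of the cleanup: one helper instead of the
 * cpu_to_le16(le16_to_cpu(x) + n) dance at every call site. */
static void le16_add_cpu(le16 *var, uint16_t val)
{
        *var = cpu_to_le16(le16_to_cpu(*var) + val);
}

int main(void)
{
        le16 eh_depth = cpu_to_le16(1);
        le16_add_cpu(&eh_depth, 1);     /* the neh->eh_depth bump in the patch */
        printf("depth = %u\n", le16_to_cpu(eh_depth)); /* 2 */
        return 0;
}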
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3b0e3bdaabfc..ca6f07afe601 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
         return 0;
 }
 
-static void ext4_aiodio_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
 {
         wait_queue_head_t *wq = ext4_ioend_wq(inode);
 
-        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
 }
 
 /*
@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
                          "performance will be poor.",
                          inode->i_ino, current->comm);
                 mutex_lock(ext4_aio_mutex(inode));
-                ext4_aiodio_wait(inode);
+                ext4_unwritten_wait(inode);
         }
 
         BUG_ON(iocb->ki_pos != pos);
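ext4_unwritten_wait() above blocks until the inode's count of in-flight unwritten-extent conversions reaches zero. A minimal pthread sketch of the same wait-for-zero pattern, with a condition variable standing in for the kernel waitqueue (all names here are hypothetical):

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for EXT4_I(inode)->i_unwritten and ext4_ioend_wq(inode). */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static int unwritten;

static void unwritten_inc(void)
{
        pthread_mutex_lock(&lock);
        unwritten++;
        pthread_mutex_unlock(&lock);
}

/* Completion path: the last conversion wakes every waiter, like wake_up_all(). */
static void unwritten_dec(void)
{
        pthread_mutex_lock(&lock);
        if (--unwritten == 0)
                pthread_cond_broadcast(&wq);
        pthread_mutex_unlock(&lock);
}

/* The wait_event(*wq, i_unwritten == 0) analogue. */
static void unwritten_wait(void)
{
        pthread_mutex_lock(&lock);
        while (unwritten != 0)
                pthread_cond_wait(&wq, &lock);
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        (void)arg;
        unwritten_dec();        /* conversion finished */
        return NULL;
}

int main(void)
{
        pthread_t t;

        unwritten_inc();        /* one conversion in flight */
        pthread_create(&t, NULL, worker, NULL);
        unwritten_wait();       /* returns once the worker has decremented */
        pthread_join(t, NULL);
        puts("all conversions drained");
        return 0;
}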
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2a1dcea4f12e..be1d89f385b4 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
 
 #include <trace/events/ext4.h>
 
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef  EXT4FS_DEBUG
-        struct list_head *cur, *before, *after;
-        ext4_io_end_t *io, *io0, *io1;
-        unsigned long flags;
-
-        if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-                ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-                return;
-        }
-
-        ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-        list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-                cur = &io->list;
-                before = cur->prev;
-                io0 = container_of(before, ext4_io_end_t, list);
-                after = cur->next;
-                io1 = container_of(after, ext4_io_end_t, list);
-
-                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                            io, inode->i_ino, io0, io1);
-        }
-        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-#endif
-}
-
-/*
- * This function is called from ext4_sync_file().
- *
- * When IO is completed, the work to convert unwritten extents to
- * written is queued on workqueue but may not get immediately
- * scheduled. When fsync is called, we need to ensure the
- * conversion is complete before fsync returns.
- * The inode keeps track of a list of pending/completed IO that
- * might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents for completed IO
- * to written.
- * The function return the number of pending IOs on success.
- */
-int ext4_flush_completed_IO(struct inode *inode)
-{
-        ext4_io_end_t *io;
-        struct ext4_inode_info *ei = EXT4_I(inode);
-        unsigned long flags;
-        int ret = 0;
-        int ret2 = 0;
-
-        dump_completed_IO(inode);
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        while (!list_empty(&ei->i_completed_io_list)){
-                io = list_entry(ei->i_completed_io_list.next,
-                                ext4_io_end_t, list);
-                list_del_init(&io->list);
-                io->flag |= EXT4_IO_END_IN_FSYNC;
-                /*
-                 * Calling ext4_end_io_nolock() to convert completed
-                 * IO to written.
-                 *
-                 * When ext4_sync_file() is called, run_queue() may already
-                 * about to flush the work corresponding to this io structure.
-                 * It will be upset if it founds the io structure related
-                 * to the work-to-be schedule is freed.
-                 *
-                 * Thus we need to keep the io structure still valid here after
-                 * conversion finished. The io structure has a flag to
-                 * avoid double converting from both fsync and background work
-                 * queue work.
-                 */
-                spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-                ret = ext4_end_io_nolock(io);
-                if (ret < 0)
-                        ret2 = ret;
-                spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-                io->flag &= ~EXT4_IO_END_IN_FSYNC;
-        }
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-        return (ret2 < 0) ? ret2 : 0;
-}
-
 /*
  * If we're not journaling and this is a just-created file, we have to
  * sync our parent directory (if it was freshly created) since
@@ -203,7 +122,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         struct inode *inode = file->f_mapping->host;
         struct ext4_inode_info *ei = EXT4_I(inode);
         journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-        int ret;
+        int ret, err;
         tid_t commit_tid;
         bool needs_barrier = false;
 
@@ -219,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         if (inode->i_sb->s_flags & MS_RDONLY)
                 goto out;
 
-        ret = ext4_flush_completed_IO(inode);
+        ret = ext4_flush_unwritten_io(inode);
         if (ret < 0)
                 goto out;
 
@@ -255,8 +174,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 needs_barrier = true;
         jbd2_log_start_commit(journal, commit_tid);
         ret = jbd2_log_wait_commit(journal, commit_tid);
-        if (needs_barrier)
-                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+        if (needs_barrier) {
+                err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+                if (!ret)
+                        ret = err;
+        }
  out:
         mutex_unlock(&inode->i_mutex);
         trace_ext4_sync_file_exit(inode, ret);
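The final hunk makes ext4_sync_file() propagate a failed barrier flush instead of ignoring it, while taking care not to overwrite an earlier error from the journal commit wait. The pattern in isolation (a sketch with made-up return values, not the ext4 code):

#include <stdio.h>

/* Hypothetical stand-ins for the two operations in the hunk. */
static int wait_for_commit(void) { return -5; /* say, -EIO from the journal */ }
static int issue_flush(void)     { return 0;  /* the flush itself succeeded */ }

int main(void)
{
        int ret = wait_for_commit();
        int err = issue_flush();

        /* Keep the *first* failure: only adopt err if ret is still 0. */
        if (!ret)
                ret = err;
        printf("fsync result: %d\n", ret);      /* -5, not 0 */
        return 0;
}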
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 26154b81b836..fa36372f3fdf 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -697,6 +697,15 @@ got_group:
                 if (!gdp)
                         goto fail;
 
+                /*
+                 * Check free inodes count before loading bitmap.
+                 */
+                if (ext4_free_inodes_count(sb, gdp) == 0) {
+                        if (++group == ngroups)
+                                group = 0;
+                        continue;
+                }
+
                 brelse(inode_bitmap_bh);
                 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
                 if (!inode_bitmap_bh)
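The added check reads the group descriptor's free-inode summary before paying for an inode-bitmap read, and advances to the next group with wrap-around when the group is exhausted. A standalone sketch of that circular scan, with hypothetical counters:

#include <stdio.h>

#define NGROUPS 4

/* Hypothetical per-group free-inode counters (the gdp summary field). */
static const int free_inodes[NGROUPS] = { 0, 0, 7, 3 };

/* Find the first group with free inodes, starting at 'group' and
 * wrapping, without "loading the bitmap" for exhausted groups. */
static int find_group(int group)
{
        for (int tries = 0; tries < NGROUPS; tries++) {
                if (free_inodes[group] != 0)
                        return group;
                if (++group == NGROUPS) /* the wrap in the patch */
                        group = 0;
        }
        return -1;      /* every group exhausted */
}

int main(void)
{
        printf("start at 3 -> group %d\n", find_group(3)); /* 3 */
        printf("start at 0 -> group %d\n", find_group(0)); /* 2 */
        return 0;
}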
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2bf145..792e388e7b44 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
         if (rw == READ && ext4_should_dioread_nolock(inode)) {
-                if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+                if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
                         mutex_lock(&inode->i_mutex);
-                        ext4_flush_completed_IO(inode);
+                        ext4_flush_unwritten_io(inode);
                         mutex_unlock(&inode->i_mutex);
                 }
+                /*
+                 * Nolock dioread optimization may be dynamically disabled
+                 * via ext4_inode_block_unlocked_dio(). Check inode's state
+                 * while holding extra i_dio_count ref.
+                 */
+                atomic_inc(&inode->i_dio_count);
+                smp_mb();
+                if (unlikely(ext4_test_inode_state(inode,
+                                                   EXT4_STATE_DIOREAD_LOCK))) {
+                        inode_dio_done(inode);
+                        goto locked;
+                }
                 ret = __blockdev_direct_IO(rw, iocb, inode,
                                  inode->i_sb->s_bdev, iov,
                                  offset, nr_segs,
                                  ext4_get_block, NULL, NULL, 0);
+                inode_dio_done(inode);
         } else {
+locked:
                 ret = blockdev_direct_IO(rw, iocb, inode, iov,
                                  offset, nr_segs, ext4_get_block);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c862ee5fe79d..b3c243b9afa5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -732,11 +732,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
         err = ext4_map_blocks(handle, inode, &map,
                               create ? EXT4_GET_BLOCKS_CREATE : 0);
 
+        /* ensure we send some value back into *errp */
+        *errp = 0;
+
         if (err < 0)
                 *errp = err;
         if (err <= 0)
                 return NULL;
-        *errp = 0;
 
         bh = sb_getblk(inode->i_sb, map.m_pblk);
         if (!bh) {
@@ -1954,9 +1956,6 @@ out:
         return ret;
 }
 
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
-
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2463,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb)
         free_blocks = EXT4_C2B(sbi,
                 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
         dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+        /*
+         * Start pushing delalloc when 1/2 of free blocks are dirty.
+         */
+        if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
+            !writeback_in_progress(sb->s_bdi) &&
+            down_read_trylock(&sb->s_umount)) {
+                writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
+                up_read(&sb->s_umount);
+        }
+
         if (2 * free_blocks < 3 * dirty_blocks ||
             free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
                 /*
@@ -2471,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb)
                  */
                 return 1;
         }
-        /*
-         * Even if we don't switch but are nearing capacity,
-         * start pushing delalloc when 1/2 of free blocks are dirty.
-         */
-        if (free_blocks < 2 * dirty_blocks)
-                writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
-
         return 0;
 }
 
@@ -2879,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
         ext4_io_end_t *io_end = iocb->private;
-        struct workqueue_struct *wq;
-        unsigned long flags;
-        struct ext4_inode_info *ei;
 
         /* if not async direct IO or dio with 0 bytes write, just return */
         if (!io_end || !size)
@@ -2910,24 +2909,14 @@ out:
                 io_end->iocb = iocb;
                 io_end->result = ret;
         }
-        wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-        /* Add the io_end to per-inode completed aio dio list*/
-        ei = EXT4_I(io_end->inode);
-        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-        list_add_tail(&io_end->list, &ei->i_completed_io_list);
-        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
-        /* queue the work to convert unwritten extents to written */
-        queue_work(wq, &io_end->work);
+        ext4_add_complete_io(io_end);
 }
 
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 {
         ext4_io_end_t *io_end = bh->b_private;
-        struct workqueue_struct *wq;
         struct inode *inode;
-        unsigned long flags;
 
         if (!test_clear_buffer_uninit(bh) || !io_end)
                 goto out;
@@ -2946,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
          */
         inode = io_end->inode;
         ext4_set_io_unwritten_flag(inode, io_end);
-
-        /* Add the io_end to per-inode completed io list*/
-        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-        list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
-        spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
-
-        wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
-        /* queue the work to convert unwritten extents to written */
-        queue_work(wq, &io_end->work);
+        ext4_add_complete_io(io_end);
 out:
         bh->b_private = NULL;
         bh->b_end_io = NULL;
@@ -3029,6 +3010,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                 overwrite = *((int *)iocb->private);
 
         if (overwrite) {
+                atomic_inc(&inode->i_dio_count);
                 down_read(&EXT4_I(inode)->i_data_sem);
                 mutex_unlock(&inode->i_mutex);
         }
@@ -3054,7 +3036,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                  * hook to the iocb.
                  */
                 iocb->private = NULL;
-                EXT4_I(inode)->cur_aio_dio = NULL;
+                ext4_inode_aio_set(inode, NULL);
                 if (!is_sync_kiocb(iocb)) {
                         ext4_io_end_t *io_end =
                                 ext4_init_io_end(inode, GFP_NOFS);
@@ -3071,7 +3053,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                          * is a unwritten extents needs to be converted
                          * when IO is completed.
                          */
-                        EXT4_I(inode)->cur_aio_dio = iocb->private;
+                        ext4_inode_aio_set(inode, io_end);
                 }
 
                 if (overwrite)
@@ -3091,7 +3073,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
                                            NULL,
                                            DIO_LOCKING);
                 if (iocb->private)
-                        EXT4_I(inode)->cur_aio_dio = NULL;
+                        ext4_inode_aio_set(inode, NULL);
                 /*
                  * The io_end structure takes a reference to the inode,
                  * that structure needs to be destroyed and the
@@ -3126,6 +3108,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 retake_lock:
         /* take i_mutex locking again if we do a ovewrite dio */
         if (overwrite) {
+                inode_dio_done(inode);
                 up_read(&EXT4_I(inode)->i_data_sem);
                 mutex_lock(&inode->i_mutex);
         }
@@ -4052,6 +4035,7 @@ static int ext4_do_update_inode(handle_t *handle,
         struct ext4_inode_info *ei = EXT4_I(inode);
         struct buffer_head *bh = iloc->bh;
         int err = 0, rc, block;
+        int need_datasync = 0;
         uid_t i_uid;
         gid_t i_gid;
 
@@ -4102,7 +4086,10 @@ static int ext4_do_update_inode(handle_t *handle,
                 raw_inode->i_file_acl_high =
                         cpu_to_le16(ei->i_file_acl >> 32);
         raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
-        ext4_isize_set(raw_inode, ei->i_disksize);
+        if (ei->i_disksize != ext4_isize(raw_inode)) {
+                ext4_isize_set(raw_inode, ei->i_disksize);
+                need_datasync = 1;
+        }
         if (ei->i_disksize > 0x7fffffffULL) {
                 struct super_block *sb = inode->i_sb;
                 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -4155,7 +4142,7 @@ static int ext4_do_update_inode(handle_t *handle,
                 err = rc;
         ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 
-        ext4_update_inode_fsync_trans(handle, inode, 0);
+        ext4_update_inode_fsync_trans(handle, inode, need_datasync);
 out_brelse:
         brelse(bh);
         ext4_std_error(inode->i_sb, err);
@@ -4298,7 +4285,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         }
 
         if (attr->ia_valid & ATTR_SIZE) {
-                inode_dio_wait(inode);
 
                 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4347,8 +4333,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         }
 
         if (attr->ia_valid & ATTR_SIZE) {
-                if (attr->ia_size != i_size_read(inode))
+                if (attr->ia_size != i_size_read(inode)) {
                         truncate_setsize(inode, attr->ia_size);
+                        /* Inode size will be reduced, wait for dio in flight.
+                         * Temporarily disable dioread_nolock to prevent
+                         * livelock. */
+                        if (orphan) {
+                                ext4_inode_block_unlocked_dio(inode);
+                                inode_dio_wait(inode);
+                                ext4_inode_resume_unlocked_dio(inode);
+                        }
+                }
                 ext4_truncate(inode);
         }
 
@@ -4727,6 +4722,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                         return err;
         }
 
+        /* Wait for all existing dio workers */
+        ext4_inode_block_unlocked_dio(inode);
+        inode_dio_wait(inode);
+
         jbd2_journal_lock_updates(journal);
 
         /*
@@ -4746,6 +4745,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         ext4_set_aops(inode);
 
         jbd2_journal_unlock_updates(journal);
+        ext4_inode_resume_unlocked_dio(inode);
 
         /* Finally we can mark the inode as dirty. */
 
@@ -4780,6 +4780,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         int retries = 0;
 
         sb_start_pagefault(inode->i_sb);
+        file_update_time(vma->vm_file);
         /* Delalloc case is easy... */
         if (test_opt(inode->i_sb, DELALLOC) &&
             !ext4_should_journal_data(inode) &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5439d6a56e99..5747f52f7c72 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -366,26 +366,11 @@ group_add_out:
366 return -EOPNOTSUPP; 366 return -EOPNOTSUPP;
367 } 367 }
368 368
369 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
370 EXT4_FEATURE_INCOMPAT_META_BG)) {
371 ext4_msg(sb, KERN_ERR,
372 "Online resizing not (yet) supported with meta_bg");
373 return -EOPNOTSUPP;
374 }
375
376 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, 369 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
377 sizeof(__u64))) { 370 sizeof(__u64))) {
378 return -EFAULT; 371 return -EFAULT;
379 } 372 }
380 373
381 if (n_blocks_count > MAX_32_NUM &&
382 !EXT4_HAS_INCOMPAT_FEATURE(sb,
383 EXT4_FEATURE_INCOMPAT_64BIT)) {
384 ext4_msg(sb, KERN_ERR,
385 "File system only supports 32-bit block numbers");
386 return -EOPNOTSUPP;
387 }
388
389 err = ext4_resize_begin(sb); 374 err = ext4_resize_begin(sb);
390 if (err) 375 if (err)
391 return err; 376 return err;
@@ -420,13 +405,6 @@ resizefs_out:
420 if (!blk_queue_discard(q)) 405 if (!blk_queue_discard(q))
421 return -EOPNOTSUPP; 406 return -EOPNOTSUPP;
422 407
423 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
424 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
425 ext4_msg(sb, KERN_ERR,
426 "FITRIM not supported with bigalloc");
427 return -EOPNOTSUPP;
428 }
429
430 if (copy_from_user(&range, (struct fstrim_range __user *)arg, 408 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
431 sizeof(range))) 409 sizeof(range)))
432 return -EFAULT; 410 return -EFAULT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 08778f6cdfe9..f8b27bf80aca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -24,6 +24,7 @@
24#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
25#include "mballoc.h" 25#include "mballoc.h"
26#include <linux/debugfs.h> 26#include <linux/debugfs.h>
27#include <linux/log2.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
28#include <trace/events/ext4.h> 29#include <trace/events/ext4.h>
29 30
@@ -1338,17 +1339,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1338 mb_check_buddy(e4b); 1339 mb_check_buddy(e4b);
1339} 1340}
1340 1341
1341static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1342static int mb_find_extent(struct ext4_buddy *e4b, int block,
1342 int needed, struct ext4_free_extent *ex) 1343 int needed, struct ext4_free_extent *ex)
1343{ 1344{
1344 int next = block; 1345 int next = block;
1345 int max; 1346 int max, order;
1346 void *buddy; 1347 void *buddy;
1347 1348
1348 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1349 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1349 BUG_ON(ex == NULL); 1350 BUG_ON(ex == NULL);
1350 1351
1351 buddy = mb_find_buddy(e4b, order, &max); 1352 buddy = mb_find_buddy(e4b, 0, &max);
1352 BUG_ON(buddy == NULL); 1353 BUG_ON(buddy == NULL);
1353 BUG_ON(block >= max); 1354 BUG_ON(block >= max);
1354 if (mb_test_bit(block, buddy)) { 1355 if (mb_test_bit(block, buddy)) {
@@ -1358,12 +1359,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1358 return 0; 1359 return 0;
1359 } 1360 }
1360 1361
1361 /* FIXME drop order completely ? */ 1362 /* find actual order */
1362 if (likely(order == 0)) { 1363 order = mb_find_order_for_block(e4b, block);
1363 /* find actual order */ 1364 block = block >> order;
1364 order = mb_find_order_for_block(e4b, block);
1365 block = block >> order;
1366 }
1367 1365
1368 ex->fe_len = 1 << order; 1366 ex->fe_len = 1 << order;
1369 ex->fe_start = block << order; 1367 ex->fe_start = block << order;
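With the always-zero order parameter gone, mb_find_extent() derives the chunk's true order via mb_find_order_for_block() and rebuilds the free extent from it. The arithmetic behind the fe_start/fe_len assignments above, isolated as a sketch:

struct free_extent { int start, len; };

/* Given an order-0 block and the order of the free buddy chunk that
 * contains it, recover the chunk's start and length, exactly as the
 * two assignments above do once block has been shifted down. */
static struct free_extent extent_from_order(int block, int order)
{
    struct free_extent ex;

    ex.start = (block >> order) << order;  /* round down to chunk start */
    ex.len   = 1 << order;                 /* chunk covers 2^order blocks */
    return ex;
}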
@@ -1549,7 +1547,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1549 /* recheck chunk's availability - we don't know 1547 /* recheck chunk's availability - we don't know
1550 * when it was found (within this lock-unlock 1548 * when it was found (within this lock-unlock
1551 * period or not) */ 1549 * period or not) */
1552 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); 1550 max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1553 if (max >= gex->fe_len) { 1551 if (max >= gex->fe_len) {
1554 ext4_mb_use_best_found(ac, e4b); 1552 ext4_mb_use_best_found(ac, e4b);
1555 return; 1553 return;
@@ -1641,7 +1639,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1641 return err; 1639 return err;
1642 1640
1643 ext4_lock_group(ac->ac_sb, group); 1641 ext4_lock_group(ac->ac_sb, group);
1644 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); 1642 max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1645 1643
1646 if (max > 0) { 1644 if (max > 0) {
1647 ac->ac_b_ex = ex; 1645 ac->ac_b_ex = ex;
@@ -1662,17 +1660,20 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1662 int max; 1660 int max;
1663 int err; 1661 int err;
1664 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1662 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1663 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1665 struct ext4_free_extent ex; 1664 struct ext4_free_extent ex;
1666 1665
1667 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1666 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1668 return 0; 1667 return 0;
1668 if (grp->bb_free == 0)
1669 return 0;
1669 1670
1670 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1671 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1671 if (err) 1672 if (err)
1672 return err; 1673 return err;
1673 1674
1674 ext4_lock_group(ac->ac_sb, group); 1675 ext4_lock_group(ac->ac_sb, group);
1675 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, 1676 max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1676 ac->ac_g_ex.fe_len, &ex); 1677 ac->ac_g_ex.fe_len, &ex);
1677 1678
1678 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1679 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
@@ -1788,7 +1789,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1788 break; 1789 break;
1789 } 1790 }
1790 1791
1791 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1792 mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1792 BUG_ON(ex.fe_len <= 0); 1793 BUG_ON(ex.fe_len <= 0);
1793 if (free < ex.fe_len) { 1794 if (free < ex.fe_len) {
1794 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1795 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
@@ -1840,7 +1841,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1840 1841
1841 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { 1842 while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1842 if (!mb_test_bit(i, bitmap)) { 1843 if (!mb_test_bit(i, bitmap)) {
1843 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1844 max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
1844 if (max >= sbi->s_stripe) { 1845 if (max >= sbi->s_stripe) {
1845 ac->ac_found++; 1846 ac->ac_found++;
1846 ac->ac_b_ex = ex; 1847 ac->ac_b_ex = ex;
@@ -1862,6 +1863,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1862 1863
1863 BUG_ON(cr < 0 || cr >= 4); 1864 BUG_ON(cr < 0 || cr >= 4);
1864 1865
1866 free = grp->bb_free;
1867 if (free == 0)
1868 return 0;
1869 if (cr <= 2 && free < ac->ac_g_ex.fe_len)
1870 return 0;
1871
1865 /* We only do this if the grp has never been initialized */ 1872 /* We only do this if the grp has never been initialized */
1866 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1873 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1867 int ret = ext4_mb_init_group(ac->ac_sb, group); 1874 int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1869,10 +1876,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1869 return 0; 1876 return 0;
1870 } 1877 }
1871 1878
1872 free = grp->bb_free;
1873 fragments = grp->bb_fragments; 1879 fragments = grp->bb_fragments;
1874 if (free == 0)
1875 return 0;
1876 if (fragments == 0) 1880 if (fragments == 0)
1877 return 0; 1881 return 0;
1878 1882
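Hoisting the bb_free tests above the EXT4_MB_GRP_NEED_INIT branch lets ext4_mb_good_group() reject an empty or too-small group before paying for buddy initialization. The reordered cheap predicate as a sketch (cr is the allocator pass, 0 being the strictest):

static int group_worth_initializing(unsigned free, unsigned goal_len, int cr)
{
    if (free == 0)
        return 0;                 /* nothing free, skip this group */
    if (cr <= 2 && free < goal_len)
        return 0;                 /* cannot satisfy the goal this pass */
    return 1;                     /* only now pay for buddy init */
}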
@@ -2163,6 +2167,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2163 return cachep; 2167 return cachep;
2164} 2168}
2165 2169
2170/*
2171 * Allocate the top-level s_group_info array for the specified number
2172 * of groups
2173 */
2174int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2175{
2176 struct ext4_sb_info *sbi = EXT4_SB(sb);
2177 unsigned size;
2178 struct ext4_group_info ***new_groupinfo;
2179
2180 size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2181 EXT4_DESC_PER_BLOCK_BITS(sb);
2182 if (size <= sbi->s_group_info_size)
2183 return 0;
2184
2185 size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2186 new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2187 if (!new_groupinfo) {
2188 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2189 return -ENOMEM;
2190 }
2191 if (sbi->s_group_info) {
2192 memcpy(new_groupinfo, sbi->s_group_info,
2193 sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2194 ext4_kvfree(sbi->s_group_info);
2195 }
2196 sbi->s_group_info = new_groupinfo;
2197 sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2198 ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
2199 sbi->s_group_info_size);
2200 return 0;
2201}
2202
2166/* Create and initialize ext4_group_info data for the given group. */ 2203/* Create and initialize ext4_group_info data for the given group. */
2167int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2204int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2168 struct ext4_group_desc *desc) 2205 struct ext4_group_desc *desc)
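ext4_mb_alloc_groupinfo() above replaces the old statically pre-sized s_group_info with grow-on-demand: round the byte size up to a power of two, copy the old slots, free the old array, and record the new capacity so a meta_bg resize can grow it again later. A self-contained sketch of the same pattern, with the two-level group-info pointers reduced to void *:

#include <stdlib.h>
#include <string.h>

static size_t roundup_pow2(size_t n)
{
    size_t p = 1;

    while (p < n)
        p <<= 1;
    return p;
}

/* Grow *arr to hold at least want slots; *nslots tracks capacity,
 * like sbi->s_group_info_size above. Returns 0 or -1 (-ENOMEM). */
static int grow_groupinfo(void ***arr, size_t *nslots, size_t want)
{
    size_t bytes;
    void **bigger;

    if (want <= *nslots)
        return 0;                       /* already large enough */
    bytes = roundup_pow2(want * sizeof(void *));
    bigger = calloc(1, bytes);
    if (!bigger)
        return -1;
    if (*arr) {
        memcpy(bigger, *arr, *nslots * sizeof(void *));
        free(*arr);
    }
    *arr = bigger;
    *nslots = bytes / sizeof(void *);
    return 0;
}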
@@ -2195,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2195 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2232 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2196 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2233 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2197 2234
2198 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2235 meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
2199 if (meta_group_info[i] == NULL) { 2236 if (meta_group_info[i] == NULL) {
2200 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); 2237 ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2201 goto exit_group_info; 2238 goto exit_group_info;
2202 } 2239 }
2203 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2204 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2240 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2205 &(meta_group_info[i]->bb_state)); 2241 &(meta_group_info[i]->bb_state));
2206 2242
@@ -2252,49 +2288,14 @@ static int ext4_mb_init_backend(struct super_block *sb)
2252 ext4_group_t ngroups = ext4_get_groups_count(sb); 2288 ext4_group_t ngroups = ext4_get_groups_count(sb);
2253 ext4_group_t i; 2289 ext4_group_t i;
2254 struct ext4_sb_info *sbi = EXT4_SB(sb); 2290 struct ext4_sb_info *sbi = EXT4_SB(sb);
2255 struct ext4_super_block *es = sbi->s_es; 2291 int err;
2256 int num_meta_group_infos;
2257 int num_meta_group_infos_max;
2258 int array_size;
2259 struct ext4_group_desc *desc; 2292 struct ext4_group_desc *desc;
2260 struct kmem_cache *cachep; 2293 struct kmem_cache *cachep;
2261 2294
2262 /* This is the number of blocks used by GDT */ 2295 err = ext4_mb_alloc_groupinfo(sb, ngroups);
2263 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2296 if (err)
2264 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2297 return err;
2265
2266 /*
2267 * This is the total number of blocks used by GDT including
2268 * the number of reserved blocks for GDT.
2269 * The s_group_info array is allocated with this value
2270 * to allow a clean online resize without a complex
2271 * manipulation of pointer.
2272 * The drawback is the unused memory when no resize
2273 * occurs but it's very low in terms of pages
2274 * (see comments below)
2275 * Need to handle this properly when META_BG resizing is allowed
2276 */
2277 num_meta_group_infos_max = num_meta_group_infos +
2278 le16_to_cpu(es->s_reserved_gdt_blocks);
2279 2298
2280 /*
2281 * array_size is the size of s_group_info array. We round it
2282 * to the next power of two because this approximation is done
2283 * internally by kmalloc so we can have some more memory
2284 * for free here (e.g. may be used for META_BG resize).
2285 */
2286 array_size = 1;
2287 while (array_size < sizeof(*sbi->s_group_info) *
2288 num_meta_group_infos_max)
2289 array_size = array_size << 1;
2290 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2291 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2292 * So a two level scheme suffices for now. */
2293 sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
2294 if (sbi->s_group_info == NULL) {
2295 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2296 return -ENOMEM;
2297 }
2298 sbi->s_buddy_cache = new_inode(sb); 2299 sbi->s_buddy_cache = new_inode(sb);
2299 if (sbi->s_buddy_cache == NULL) { 2300 if (sbi->s_buddy_cache == NULL) {
2300 ext4_msg(sb, KERN_ERR, "can't get new inode"); 2301 ext4_msg(sb, KERN_ERR, "can't get new inode");
@@ -2322,7 +2323,7 @@ err_freebuddy:
2322 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2323 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2323 while (i-- > 0) 2324 while (i-- > 0)
2324 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 2325 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2325 i = num_meta_group_infos; 2326 i = sbi->s_group_info_size;
2326 while (i-- > 0) 2327 while (i-- > 0)
2327 kfree(sbi->s_group_info[i]); 2328 kfree(sbi->s_group_info[i]);
2328 iput(sbi->s_buddy_cache); 2329 iput(sbi->s_buddy_cache);
@@ -4008,7 +4009,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4008 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4009 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4009 4010
4010 /* set up allocation goals */ 4011 /* set up allocation goals */
4011 memset(ac, 0, sizeof(struct ext4_allocation_context));
4012 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); 4012 ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
4013 ac->ac_status = AC_STATUS_CONTINUE; 4013 ac->ac_status = AC_STATUS_CONTINUE;
4014 ac->ac_sb = sb; 4014 ac->ac_sb = sb;
@@ -4291,7 +4291,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4291 } 4291 }
4292 } 4292 }
4293 4293
4294 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4294 ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4295 if (!ac) { 4295 if (!ac) {
4296 ar->len = 0; 4296 ar->len = 0;
4297 *errp = -ENOMEM; 4297 *errp = -ENOMEM;
@@ -4657,6 +4657,8 @@ do_more:
4657 * with group lock held. generate_buddy look at 4657 * with group lock held. generate_buddy look at
4658 * them with group lock_held 4658 * them with group lock_held
4659 */ 4659 */
4660 if (test_opt(sb, DISCARD))
4661 ext4_issue_discard(sb, block_group, bit, count);
4660 ext4_lock_group(sb, block_group); 4662 ext4_lock_group(sb, block_group);
4661 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4663 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4662 mb_free_blocks(inode, &e4b, bit, count_clusters); 4664 mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4988,7 +4990,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4988 4990
4989 start = range->start >> sb->s_blocksize_bits; 4991 start = range->start >> sb->s_blocksize_bits;
4990 end = start + (range->len >> sb->s_blocksize_bits) - 1; 4992 end = start + (range->len >> sb->s_blocksize_bits) - 1;
4991 minlen = range->minlen >> sb->s_blocksize_bits; 4993 minlen = EXT4_NUM_B2C(EXT4_SB(sb),
4994 range->minlen >> sb->s_blocksize_bits);
4992 4995
4993 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || 4996 if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
4994 unlikely(start >= max_blks)) 4997 unlikely(start >= max_blks))
@@ -5048,6 +5051,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5048 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); 5051 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5049 5052
5050out: 5053out:
5051 range->len = trimmed * sb->s_blocksize; 5054 range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5052 return ret; 5055 return ret;
5053} 5056}
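On a bigalloc file system the trim machinery counts clusters, so minlen is now rounded up from blocks to clusters on the way in and the trimmed total is converted from clusters back to bytes on the way out. The two conversions spelled out, where cluster_ratio is blocks per cluster (a power of two):

/* EXT4_NUM_B2C(): blocks to clusters, rounding up. */
static unsigned long long blocks_to_clusters(unsigned long long blocks,
                                             unsigned cluster_ratio)
{
    return (blocks + cluster_ratio - 1) / cluster_ratio;
}

/* EXT4_C2B() followed by << s_blocksize_bits: clusters to bytes. */
static unsigned long long clusters_to_bytes(unsigned long long clusters,
                                            unsigned cluster_ratio,
                                            unsigned block_size)
{
    return clusters * cluster_ratio * block_size;
}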
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c070618c21ce..3ccd889ba953 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -65,11 +65,6 @@ extern u8 mb_enable_debug;
65#define MB_DEFAULT_MIN_TO_SCAN 10 65#define MB_DEFAULT_MIN_TO_SCAN 10
66 66
67/* 67/*
68 * How many groups mballoc will scan looking for the best chunk
69 */
70#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
71
72/*
73 * with 'ext4_mb_stats' allocator will collect stats that will be 68 * with 'ext4_mb_stats' allocator will collect stats that will be
74 * shown at umount. The collecting costs though! 69 * shown at umount. The collecting costs though!
75 */ 70 */
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c623e7a..292daeeed455 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
141} 141}
142 142
143/** 143/**
144 * mext_check_null_inode - NULL check for two inodes
145 *
146 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
147 */
148static int
149mext_check_null_inode(struct inode *inode1, struct inode *inode2,
150 const char *function, unsigned int line)
151{
152 int ret = 0;
153
154 if (inode1 == NULL) {
155 __ext4_error(inode2->i_sb, function, line,
156 "Both inodes should not be NULL: "
157 "inode1 NULL inode2 %lu", inode2->i_ino);
158 ret = -EIO;
159 } else if (inode2 == NULL) {
160 __ext4_error(inode1->i_sb, function, line,
161 "Both inodes should not be NULL: "
162 "inode1 %lu inode2 NULL", inode1->i_ino);
163 ret = -EIO;
164 }
165 return ret;
166}
167
168/**
169 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem 144 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
170 * 145 *
171 * @orig_inode: original inode structure 146 * Acquire write lock of i_data_sem of the two inodes
172 * @donor_inode: donor inode structure
173 * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
174 * i_ino order.
175 */ 147 */
176static void 148static void
177double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) 149double_down_write_data_sem(struct inode *first, struct inode *second)
178{ 150{
179 struct inode *first = orig_inode, *second = donor_inode; 151 if (first < second) {
152 down_write(&EXT4_I(first)->i_data_sem);
153 down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
154 } else {
155 down_write(&EXT4_I(second)->i_data_sem);
156 down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
180 157
181 /*
182 * Use the inode number to provide the stable locking order instead
183 * of its address, because the C language doesn't guarantee you can
184 * compare pointers that don't come from the same array.
185 */
186 if (donor_inode->i_ino < orig_inode->i_ino) {
187 first = donor_inode;
188 second = orig_inode;
189 } 158 }
190
191 down_write(&EXT4_I(first)->i_data_sem);
192 down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
193} 159}
194 160
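double_down_write_data_sem() now orders the pair by inode address instead of inode number, trading the removed pointer-comparison caveat for shorter code (the kernel's flat address space makes the comparison safe in practice). The deadlock-avoidance idea in portable form, with pthread mutexes standing in for i_data_sem and no lockdep nesting annotation:

#include <pthread.h>

/* Always take the lower-addressed lock first, so any two tasks that
 * lock the same pair agree on the order and cannot deadlock. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if (a < b) {
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
    } else {
        pthread_mutex_lock(b);
        pthread_mutex_lock(a);
    }
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
    pthread_mutex_unlock(a);   /* unlock order does not matter */
    pthread_mutex_unlock(b);
}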
195/** 161/**
@@ -604,9 +570,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
604 diff = donor_off - le32_to_cpu(tmp_dext->ee_block); 570 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
605 571
606 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); 572 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
607 tmp_dext->ee_block = 573 le32_add_cpu(&tmp_dext->ee_block, diff);
608 cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); 574 le16_add_cpu(&tmp_dext->ee_len, -diff);
609 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
610 575
611 if (max_count < ext4_ext_get_actual_len(tmp_dext)) 576 if (max_count < ext4_ext_get_actual_len(tmp_dext))
612 tmp_dext->ee_len = cpu_to_le16(max_count); 577 tmp_dext->ee_len = cpu_to_le16(max_count);
@@ -629,6 +594,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
629} 594}
630 595
631/** 596/**
597 * mext_check_coverage - Check that all extents in range have the same type
598 *
599 * @inode: inode in question
600 * @from: block offset of inode
601 * @count: block count to be checked
602 * @uninit: extents expected to be uninitialized
603 * @err: pointer to save error value
604 *
605 * Return 1 if all extents in range have the expected type, and zero otherwise.
606 */
607static int
608mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
609 int uninit, int *err)
610{
611 struct ext4_ext_path *path = NULL;
612 struct ext4_extent *ext;
613 ext4_lblk_t last = from + count;
614 while (from < last) {
615 *err = get_ext_path(inode, from, &path);
616 if (*err)
617 return 0;
618 ext = path[ext_depth(inode)].p_ext;
619 if (!ext) {
620 ext4_ext_drop_refs(path);
621 return 0;
622 }
623 if (uninit != ext4_ext_is_uninitialized(ext)) {
624 ext4_ext_drop_refs(path);
625 return 0;
626 }
627 from += ext4_ext_get_actual_len(ext);
628 ext4_ext_drop_refs(path);
629 }
630 return 1;
631}
632
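mext_check_coverage() walks the range extent by extent, hopping to the first block past each extent it verifies. The shape of that walk as a sketch, where lookup_extent() is a hypothetical stand-in for the get_ext_path() lookup and every extent is assumed to have nonzero length:

struct ext { unsigned len; int uninit; };

static int range_has_type(unsigned from, unsigned count, int want_uninit,
                          struct ext *(*lookup_extent)(unsigned))
{
    unsigned last = from + count;

    while (from < last) {
        struct ext *e = lookup_extent(from);

        if (!e || e->uninit != want_uninit)
            return 0;          /* hole or wrong type: coverage fails */
        from += e->len;        /* continue right past this extent */
    }
    return 1;
}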
633/**
632 * mext_replace_branches - Replace original extents with new extents 634 * mext_replace_branches - Replace original extents with new extents
633 * 635 *
634 * @handle: journal handle 636 * @handle: journal handle
@@ -663,9 +665,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
663 int replaced_count = 0; 665 int replaced_count = 0;
664 int dext_alen; 666 int dext_alen;
665 667
666 /* Protect extent trees against block allocations via delalloc */
667 double_down_write_data_sem(orig_inode, donor_inode);
668
669 /* Get the original extent for the block "orig_off" */ 668 /* Get the original extent for the block "orig_off" */
670 *err = get_ext_path(orig_inode, orig_off, &orig_path); 669 *err = get_ext_path(orig_inode, orig_off, &orig_path);
671 if (*err) 670 if (*err)
@@ -764,12 +763,122 @@ out:
764 ext4_ext_invalidate_cache(orig_inode); 763 ext4_ext_invalidate_cache(orig_inode);
765 ext4_ext_invalidate_cache(donor_inode); 764 ext4_ext_invalidate_cache(donor_inode);
766 765
767 double_up_write_data_sem(orig_inode, donor_inode);
768
769 return replaced_count; 766 return replaced_count;
770} 767}
771 768
772/** 769/**
770 * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
771 *
772 * @inode1: the inode structure
773 * @inode2: the inode structure
774 * @index: page index
775 * @page: result page vector
776 *
777 * Grab two locked pages for the two inodes, in inode order
778 */
779static int
780mext_page_double_lock(struct inode *inode1, struct inode *inode2,
781 pgoff_t index, struct page *page[2])
782{
783 struct address_space *mapping[2];
784 unsigned fl = AOP_FLAG_NOFS;
785
786 BUG_ON(!inode1 || !inode2);
787 if (inode1 < inode2) {
788 mapping[0] = inode1->i_mapping;
789 mapping[1] = inode2->i_mapping;
790 } else {
791 mapping[0] = inode2->i_mapping;
792 mapping[1] = inode1->i_mapping;
793 }
794
795 page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
796 if (!page[0])
797 return -ENOMEM;
798
799 page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
800 if (!page[1]) {
801 unlock_page(page[0]);
802 page_cache_release(page[0]);
803 return -ENOMEM;
804 }
805
806 if (inode1 > inode2) {
807 struct page *tmp;
808 tmp = page[0];
809 page[0] = page[1];
810 page[1] = tmp;
811 }
812 return 0;
813}
814
815/* Force page buffers uptodate w/o dropping page's lock */
816static int
817mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
818{
819 struct inode *inode = page->mapping->host;
820 sector_t block;
821 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
822 unsigned int blocksize, block_start, block_end;
823 int i, err, nr = 0, partial = 0;
824 BUG_ON(!PageLocked(page));
825 BUG_ON(PageWriteback(page));
826
827 if (PageUptodate(page))
828 return 0;
829
830 blocksize = 1 << inode->i_blkbits;
831 if (!page_has_buffers(page))
832 create_empty_buffers(page, blocksize, 0);
833
834 head = page_buffers(page);
835 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
836 for (bh = head, block_start = 0; bh != head || !block_start;
837 block++, block_start = block_end, bh = bh->b_this_page) {
838 block_end = block_start + blocksize;
839 if (block_end <= from || block_start >= to) {
840 if (!buffer_uptodate(bh))
841 partial = 1;
842 continue;
843 }
844 if (buffer_uptodate(bh))
845 continue;
846 if (!buffer_mapped(bh)) {
847 int err = 0;
848 err = ext4_get_block(inode, block, bh, 0);
849 if (err) {
850 SetPageError(page);
851 return err;
852 }
853 if (!buffer_mapped(bh)) {
854 zero_user(page, block_start, blocksize);
855 if (!err)
856 set_buffer_uptodate(bh);
857 continue;
858 }
859 }
860 BUG_ON(nr >= MAX_BUF_PER_PAGE);
861 arr[nr++] = bh;
862 }
863 /* No io required */
864 if (!nr)
865 goto out;
866
867 for (i = 0; i < nr; i++) {
868 bh = arr[i];
869 if (!bh_uptodate_or_lock(bh)) {
870 err = bh_submit_read(bh);
871 if (err)
872 return err;
873 }
874 }
875out:
876 if (!partial)
877 SetPageUptodate(page);
878 return 0;
879}
880
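mext_page_mkuptodate() works in two passes: map every buffer in range and collect the ones that still need reading, then issue the reads, so a mapping failure bails out before any IO has started and the page lock is never dropped. The two-pass shape as a sketch, with map()/read() as hypothetical stand-ins for ext4_get_block() and bh_submit_read():

struct buf { int uptodate, mapped; };

static int make_range_uptodate(struct buf *bufs, int n,
                               int (*map)(struct buf *),
                               int (*read)(struct buf *))
{
    struct buf *todo[16];   /* MAX_BUF_PER_PAGE plays this role above */
    int i, nr = 0;

    for (i = 0; i < n; i++) {
        if (bufs[i].uptodate)
            continue;
        if (!bufs[i].mapped && map(&bufs[i]) != 0)
            return -1;      /* fail cheaply, no IO issued yet */
        if (nr == 16)
            return -1;      /* a page cannot hold more buffers anyway */
        todo[nr++] = &bufs[i];
    }
    for (i = 0; i < nr; i++)
        if (read(todo[i]) != 0)
            return -1;
    return 0;
}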
881/**
773 * move_extent_per_page - Move extent data per page 882 * move_extent_per_page - Move extent data per page
774 * 883 *
775 * @o_filp: file structure of original file 884 * @o_filp: file structure of original file
@@ -791,26 +900,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
791 int block_len_in_page, int uninit, int *err) 900 int block_len_in_page, int uninit, int *err)
792{ 901{
793 struct inode *orig_inode = o_filp->f_dentry->d_inode; 902 struct inode *orig_inode = o_filp->f_dentry->d_inode;
794 struct address_space *mapping = orig_inode->i_mapping; 903 struct page *pagep[2] = {NULL, NULL};
795 struct buffer_head *bh;
796 struct page *page = NULL;
797 const struct address_space_operations *a_ops = mapping->a_ops;
798 handle_t *handle; 904 handle_t *handle;
799 ext4_lblk_t orig_blk_offset; 905 ext4_lblk_t orig_blk_offset;
800 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 906 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
801 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 907 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
802 unsigned int w_flags = 0; 908 unsigned int w_flags = 0;
803 unsigned int tmp_data_size, data_size, replaced_size; 909 unsigned int tmp_data_size, data_size, replaced_size;
804 void *fsdata; 910 int err2, jblocks, retries = 0;
805 int i, jblocks;
806 int err2 = 0;
807 int replaced_count = 0; 911 int replaced_count = 0;
912 int from = data_offset_in_page << orig_inode->i_blkbits;
808 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 913 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
809 914
810 /* 915 /*
811 * It needs twice the amount of ordinary journal buffers because 916 * It needs twice the amount of ordinary journal buffers because
812 * inode and donor_inode may change each different metadata blocks. 917 * inode and donor_inode may change each different metadata blocks.
813 */ 918 */
919again:
920 *err = 0;
814 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 921 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
815 handle = ext4_journal_start(orig_inode, jblocks); 922 handle = ext4_journal_start(orig_inode, jblocks);
816 if (IS_ERR(handle)) { 923 if (IS_ERR(handle)) {
@@ -824,19 +931,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
824 orig_blk_offset = orig_page_offset * blocks_per_page + 931 orig_blk_offset = orig_page_offset * blocks_per_page +
825 data_offset_in_page; 932 data_offset_in_page;
826 933
827 /*
828 * If orig extent is uninitialized one,
829 * it's not necessary force the page into memory
830 * and then force it to be written out again.
831 * Just swap data blocks between orig and donor.
832 */
833 if (uninit) {
834 replaced_count = mext_replace_branches(handle, orig_inode,
835 donor_inode, orig_blk_offset,
836 block_len_in_page, err);
837 goto out2;
838 }
839
840 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 934 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
841 935
842 /* Calculate data_size */ 936 /* Calculate data_size */
@@ -858,75 +952,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
858 952
859 replaced_size = data_size; 953 replaced_size = data_size;
860 954
861 *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, 955 *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
862 &page, &fsdata); 956 pagep);
863 if (unlikely(*err < 0)) 957 if (unlikely(*err < 0))
864 goto out; 958 goto stop_journal;
865
866 if (!PageUptodate(page)) {
867 mapping->a_ops->readpage(o_filp, page);
868 lock_page(page);
869 }
870
871 /* 959 /*
872 * try_to_release_page() doesn't call releasepage in writeback mode. 960 * If orig extent was uninitialized it can become initialized
873 * We should care about the order of writing to the same file 961 * at any time after i_data_sem was dropped, in order to
874 * by multiple move extent processes. 962 * serialize with delalloc we have to recheck the extent while we
875 * It needs to call wait_on_page_writeback() to wait for the 963 * hold the page's lock; if it is still uninitialized, a data copy is not
876 * writeback of the page. 964 * necessary, just swap data blocks between orig and donor.
877 */ 965 */
878 wait_on_page_writeback(page); 966 if (uninit) {
967 double_down_write_data_sem(orig_inode, donor_inode);
968 /* If any extent in the range became initialized we have to
969 * fall back to data copying */
970 uninit = mext_check_coverage(orig_inode, orig_blk_offset,
971 block_len_in_page, 1, err);
972 if (*err)
973 goto drop_data_sem;
879 974
880 /* Release old bh and drop refs */ 975 uninit &= mext_check_coverage(donor_inode, orig_blk_offset,
881 try_to_release_page(page, 0); 976 block_len_in_page, 1, err);
977 if (*err)
978 goto drop_data_sem;
979
980 if (!uninit) {
981 double_up_write_data_sem(orig_inode, donor_inode);
982 goto data_copy;
983 }
984 if ((page_has_private(pagep[0]) &&
985 !try_to_release_page(pagep[0], 0)) ||
986 (page_has_private(pagep[1]) &&
987 !try_to_release_page(pagep[1], 0))) {
988 *err = -EBUSY;
989 goto drop_data_sem;
990 }
991 replaced_count = mext_replace_branches(handle, orig_inode,
992 donor_inode, orig_blk_offset,
993 block_len_in_page, err);
994 drop_data_sem:
995 double_up_write_data_sem(orig_inode, donor_inode);
996 goto unlock_pages;
997 }
998data_copy:
999 *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
1000 if (*err)
1001 goto unlock_pages;
1002
1003 /* At this point all buffers in range are uptodate, old mapping layout
1004 * is no longer required, try to drop it now. */
1005 if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
1006 (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
1007 *err = -EBUSY;
1008 goto unlock_pages;
1009 }
882 1010
883 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 1011 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
884 orig_blk_offset, block_len_in_page, 1012 orig_blk_offset,
885 &err2); 1013 block_len_in_page, err);
886 if (err2) { 1014 if (*err) {
887 if (replaced_count) { 1015 if (replaced_count) {
888 block_len_in_page = replaced_count; 1016 block_len_in_page = replaced_count;
889 replaced_size = 1017 replaced_size =
890 block_len_in_page << orig_inode->i_blkbits; 1018 block_len_in_page << orig_inode->i_blkbits;
891 } else 1019 } else
892 goto out; 1020 goto unlock_pages;
893 } 1021 }
1022 /* Perform all necessary steps similar to write_begin()/write_end(),
1023 * but keeping in mind that i_size will not change */
1024 *err = __block_write_begin(pagep[0], from, from + replaced_size,
1025 ext4_get_block);
1026 if (!*err)
1027 *err = block_commit_write(pagep[0], from, from + replaced_size);
894 1028
895 if (!page_has_buffers(page)) 1029 if (unlikely(*err < 0))
896 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 1030 goto repair_branches;
897 1031
898 bh = page_buffers(page); 1032 /* Even in case of data=writeback it is reasonable to pin
899 for (i = 0; i < data_offset_in_page; i++) 1033 * the inode to the transaction, to prevent unexpected data loss */
900 bh = bh->b_this_page; 1034 *err = ext4_jbd2_file_inode(handle, orig_inode);
901 1035
902 for (i = 0; i < block_len_in_page; i++) { 1036unlock_pages:
903 *err = ext4_get_block(orig_inode, 1037 unlock_page(pagep[0]);
904 (sector_t)(orig_blk_offset + i), bh, 0); 1038 page_cache_release(pagep[0]);
905 if (*err < 0) 1039 unlock_page(pagep[1]);
906 goto out; 1040 page_cache_release(pagep[1]);
907 1041stop_journal:
908 if (bh->b_this_page != NULL)
909 bh = bh->b_this_page;
910 }
911
912 *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
913 page, fsdata);
914 page = NULL;
915
916out:
917 if (unlikely(page)) {
918 if (PageLocked(page))
919 unlock_page(page);
920 page_cache_release(page);
921 ext4_journal_stop(handle);
922 }
923out2:
924 ext4_journal_stop(handle); 1042 ext4_journal_stop(handle);
925 1043 /* Buffer was busy, probably because it is pinned to a journal
926 if (err2) 1044 * transaction; forcing a commit may help to free it. */
927 *err = err2; 1045 if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
928 1046 &retries))
1047 goto again;
929 return replaced_count; 1048 return replaced_count;
1049
1050repair_branches:
1051 /*
1052 * This should never ever happen!
1053 * Extents are swapped already, but we are not able to copy data.
1054 * Try to swap the extents back to their original places
1055 */
1056 double_down_write_data_sem(orig_inode, donor_inode);
1057 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
1058 orig_blk_offset,
1059 block_len_in_page, &err2);
1060 double_up_write_data_sem(orig_inode, donor_inode);
1061 if (replaced_count != block_len_in_page) {
1062 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
1063 "Unable to copy data block,"
1064 " data will be lost.");
1065 *err = -EIO;
1066 }
1067 replaced_count = 0;
1068 goto unlock_pages;
930} 1069}
931 1070
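The rewritten move_extent_per_page() no longer fails hard when buffers are pinned to the running transaction: the -EBUSY from try_to_release_page() sends it through ext4_should_retry_alloc(), which commits the journal and allows a bounded restart at the again: label. The retry skeleton reduced to its shape, with try_move() and should_retry() as hypothetical stand-ins:

#include <errno.h>

static int move_with_retry(int (*try_move)(void), int (*should_retry)(int *))
{
    int err, retries = 0;

    do {
        err = try_move();                /* one journalled attempt */
    } while (err == -EBUSY && should_retry(&retries));
    return err;
}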
932/** 1071/**
@@ -969,14 +1108,6 @@ mext_check_arguments(struct inode *orig_inode,
969 return -EINVAL; 1108 return -EINVAL;
970 } 1109 }
971 1110
972 /* Files should be in the same ext4 FS */
973 if (orig_inode->i_sb != donor_inode->i_sb) {
974 ext4_debug("ext4 move extent: The argument files "
975 "should be in same FS [ino:orig %lu, donor %lu]\n",
976 orig_inode->i_ino, donor_inode->i_ino);
977 return -EINVAL;
978 }
979
980 /* Ext4 move extent supports only extent based file */ 1111 /* Ext4 move extent supports only extent based file */
981 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { 1112 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
982 ext4_debug("ext4 move extent: orig file is not extents " 1113 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1002,7 +1133,6 @@ mext_check_arguments(struct inode *orig_inode,
1002 } 1133 }
1003 1134
1004 if ((orig_start >= EXT_MAX_BLOCKS) || 1135 if ((orig_start >= EXT_MAX_BLOCKS) ||
1005 (donor_start >= EXT_MAX_BLOCKS) ||
1006 (*len > EXT_MAX_BLOCKS) || 1136 (*len > EXT_MAX_BLOCKS) ||
1007 (orig_start + *len >= EXT_MAX_BLOCKS)) { 1137 (orig_start + *len >= EXT_MAX_BLOCKS)) {
1008 ext4_debug("ext4 move extent: Can't handle over [%u] blocks " 1138 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
@@ -1072,35 +1202,19 @@ mext_check_arguments(struct inode *orig_inode,
1072 * @inode1: the inode structure 1202 * @inode1: the inode structure
1073 * @inode2: the inode structure 1203 * @inode2: the inode structure
1074 * 1204 *
1075 * Lock two inodes' i_mutex by i_ino order. 1205 * Lock two inodes' i_mutex
1076 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1077 */ 1206 */
1078static int 1207static void
1079mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1208mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1080{ 1209{
1081 int ret = 0; 1210 BUG_ON(inode1 == inode2);
1082 1211 if (inode1 < inode2) {
1083 BUG_ON(inode1 == NULL && inode2 == NULL);
1084
1085 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1086 if (ret < 0)
1087 goto out;
1088
1089 if (inode1 == inode2) {
1090 mutex_lock(&inode1->i_mutex);
1091 goto out;
1092 }
1093
1094 if (inode1->i_ino < inode2->i_ino) {
1095 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); 1212 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
1096 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); 1213 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
1097 } else { 1214 } else {
1098 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1215 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1099 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1216 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1100 } 1217 }
1101
1102out:
1103 return ret;
1104} 1218}
1105 1219
1106/** 1220/**
@@ -1109,28 +1223,13 @@ out:
1109 * @inode1: the inode that is released first 1223 * @inode1: the inode that is released first
1110 * @inode2: the inode that is released second 1224 * @inode2: the inode that is released second
1111 * 1225 *
1112 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1113 */ 1226 */
1114 1227
1115static int 1228static void
1116mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1229mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1117{ 1230{
1118 int ret = 0; 1231 mutex_unlock(&inode1->i_mutex);
1119 1232 mutex_unlock(&inode2->i_mutex);
1120 BUG_ON(inode1 == NULL && inode2 == NULL);
1121
1122 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1123 if (ret < 0)
1124 goto out;
1125
1126 if (inode1)
1127 mutex_unlock(&inode1->i_mutex);
1128
1129 if (inode2 && inode2 != inode1)
1130 mutex_unlock(&inode2->i_mutex);
1131
1132out:
1133 return ret;
1134} 1233}
1135 1234
1136/** 1235/**
@@ -1187,16 +1286,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1187 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 1286 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1188 ext4_lblk_t rest_blocks; 1287 ext4_lblk_t rest_blocks;
1189 pgoff_t orig_page_offset = 0, seq_end_page; 1288 pgoff_t orig_page_offset = 0, seq_end_page;
1190 int ret1, ret2, depth, last_extent = 0; 1289 int ret, depth, last_extent = 0;
1191 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 1290 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1192 int data_offset_in_page; 1291 int data_offset_in_page;
1193 int block_len_in_page; 1292 int block_len_in_page;
1194 int uninit; 1293 int uninit;
1195 1294
1196 /* orig and donor should be different file */ 1295 if (orig_inode->i_sb != donor_inode->i_sb) {
1197 if (orig_inode->i_ino == donor_inode->i_ino) { 1296 ext4_debug("ext4 move extent: The argument files "
1297 "should be in same FS [ino:orig %lu, donor %lu]\n",
1298 orig_inode->i_ino, donor_inode->i_ino);
1299 return -EINVAL;
1300 }
1301
1302 /* orig and donor should be different inodes */
1303 if (orig_inode == donor_inode) {
1198 ext4_debug("ext4 move extent: The argument files should not " 1304 ext4_debug("ext4 move extent: The argument files should not "
1199 "be same file [ino:orig %lu, donor %lu]\n", 1305 "be same inode [ino:orig %lu, donor %lu]\n",
1200 orig_inode->i_ino, donor_inode->i_ino); 1306 orig_inode->i_ino, donor_inode->i_ino);
1201 return -EINVAL; 1307 return -EINVAL;
1202 } 1308 }
@@ -1208,18 +1314,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1208 orig_inode->i_ino, donor_inode->i_ino); 1314 orig_inode->i_ino, donor_inode->i_ino);
1209 return -EINVAL; 1315 return -EINVAL;
1210 } 1316 }
1211 1317 /* TODO: Swapping blocks for inodes with full data
1318 journaling enabled is a non-obvious task */
1319 if (ext4_should_journal_data(orig_inode) ||
1320 ext4_should_journal_data(donor_inode)) {
1321 return -EINVAL;
1322 }
1212 /* Protect orig and donor inodes against a truncate */ 1323 /* Protect orig and donor inodes against a truncate */
1213 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1324 mext_inode_double_lock(orig_inode, donor_inode);
1214 if (ret1 < 0) 1325
1215 return ret1; 1326 /* Wait for all existing dio workers */
1327 ext4_inode_block_unlocked_dio(orig_inode);
1328 ext4_inode_block_unlocked_dio(donor_inode);
1329 inode_dio_wait(orig_inode);
1330 inode_dio_wait(donor_inode);
1216 1331
1217 /* Protect extent tree against block allocations via delalloc */ 1332 /* Protect extent tree against block allocations via delalloc */
1218 double_down_write_data_sem(orig_inode, donor_inode); 1333 double_down_write_data_sem(orig_inode, donor_inode);
1219 /* Check the filesystem environment whether move_extent can be done */ 1334 /* Check the filesystem environment whether move_extent can be done */
1220 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1335 ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
1221 donor_start, &len); 1336 donor_start, &len);
1222 if (ret1) 1337 if (ret)
1223 goto out; 1338 goto out;
1224 1339
1225 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 1340 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
@@ -1227,13 +1342,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1227 if (file_end < block_end) 1342 if (file_end < block_end)
1228 len -= block_end - file_end; 1343 len -= block_end - file_end;
1229 1344
1230 ret1 = get_ext_path(orig_inode, block_start, &orig_path); 1345 ret = get_ext_path(orig_inode, block_start, &orig_path);
1231 if (ret1) 1346 if (ret)
1232 goto out; 1347 goto out;
1233 1348
1234 /* Get path structure to check the hole */ 1349 /* Get path structure to check the hole */
1235 ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); 1350 ret = get_ext_path(orig_inode, block_start, &holecheck_path);
1236 if (ret1) 1351 if (ret)
1237 goto out; 1352 goto out;
1238 1353
1239 depth = ext_depth(orig_inode); 1354 depth = ext_depth(orig_inode);
@@ -1252,13 +1367,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1252 last_extent = mext_next_extent(orig_inode, 1367 last_extent = mext_next_extent(orig_inode,
1253 holecheck_path, &ext_cur); 1368 holecheck_path, &ext_cur);
1254 if (last_extent < 0) { 1369 if (last_extent < 0) {
1255 ret1 = last_extent; 1370 ret = last_extent;
1256 goto out; 1371 goto out;
1257 } 1372 }
1258 last_extent = mext_next_extent(orig_inode, orig_path, 1373 last_extent = mext_next_extent(orig_inode, orig_path,
1259 &ext_dummy); 1374 &ext_dummy);
1260 if (last_extent < 0) { 1375 if (last_extent < 0) {
1261 ret1 = last_extent; 1376 ret = last_extent;
1262 goto out; 1377 goto out;
1263 } 1378 }
1264 seq_start = le32_to_cpu(ext_cur->ee_block); 1379 seq_start = le32_to_cpu(ext_cur->ee_block);
@@ -1272,7 +1387,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1272 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 1387 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1273 ext4_debug("ext4 move extent: The specified range of file " 1388 ext4_debug("ext4 move extent: The specified range of file "
1274 "may be the hole\n"); 1389 "may be the hole\n");
1275 ret1 = -EINVAL; 1390 ret = -EINVAL;
1276 goto out; 1391 goto out;
1277 } 1392 }
1278 1393
@@ -1292,7 +1407,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1292 last_extent = mext_next_extent(orig_inode, holecheck_path, 1407 last_extent = mext_next_extent(orig_inode, holecheck_path,
1293 &ext_cur); 1408 &ext_cur);
1294 if (last_extent < 0) { 1409 if (last_extent < 0) {
1295 ret1 = last_extent; 1410 ret = last_extent;
1296 break; 1411 break;
1297 } 1412 }
1298 add_blocks = ext4_ext_get_actual_len(ext_cur); 1413 add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1349,18 +1464,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1349 orig_page_offset, 1464 orig_page_offset,
1350 data_offset_in_page, 1465 data_offset_in_page,
1351 block_len_in_page, uninit, 1466 block_len_in_page, uninit,
1352 &ret1); 1467 &ret);
1353 1468
1354 /* Count how many blocks we have exchanged */ 1469 /* Count how many blocks we have exchanged */
1355 *moved_len += block_len_in_page; 1470 *moved_len += block_len_in_page;
1356 if (ret1 < 0) 1471 if (ret < 0)
1357 break; 1472 break;
1358 if (*moved_len > len) { 1473 if (*moved_len > len) {
1359 EXT4_ERROR_INODE(orig_inode, 1474 EXT4_ERROR_INODE(orig_inode,
1360 "We replaced blocks too much! " 1475 "We replaced blocks too much! "
1361 "sum of replaced: %llu requested: %llu", 1476 "sum of replaced: %llu requested: %llu",
1362 *moved_len, len); 1477 *moved_len, len);
1363 ret1 = -EIO; 1478 ret = -EIO;
1364 break; 1479 break;
1365 } 1480 }
1366 1481
@@ -1374,22 +1489,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1374 } 1489 }
1375 1490
1376 double_down_write_data_sem(orig_inode, donor_inode); 1491 double_down_write_data_sem(orig_inode, donor_inode);
1377 if (ret1 < 0) 1492 if (ret < 0)
1378 break; 1493 break;
1379 1494
1380 /* Decrease buffer counter */ 1495 /* Decrease buffer counter */
1381 if (holecheck_path) 1496 if (holecheck_path)
1382 ext4_ext_drop_refs(holecheck_path); 1497 ext4_ext_drop_refs(holecheck_path);
1383 ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); 1498 ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
1384 if (ret1) 1499 if (ret)
1385 break; 1500 break;
1386 depth = holecheck_path->p_depth; 1501 depth = holecheck_path->p_depth;
1387 1502
1388 /* Decrease buffer counter */ 1503 /* Decrease buffer counter */
1389 if (orig_path) 1504 if (orig_path)
1390 ext4_ext_drop_refs(orig_path); 1505 ext4_ext_drop_refs(orig_path);
1391 ret1 = get_ext_path(orig_inode, seq_start, &orig_path); 1506 ret = get_ext_path(orig_inode, seq_start, &orig_path);
1392 if (ret1) 1507 if (ret)
1393 break; 1508 break;
1394 1509
1395 ext_cur = holecheck_path[depth].p_ext; 1510 ext_cur = holecheck_path[depth].p_ext;
@@ -1412,12 +1527,9 @@ out:
1412 kfree(holecheck_path); 1527 kfree(holecheck_path);
1413 } 1528 }
1414 double_up_write_data_sem(orig_inode, donor_inode); 1529 double_up_write_data_sem(orig_inode, donor_inode);
1415 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1530 ext4_inode_resume_unlocked_dio(orig_inode);
1416 1531 ext4_inode_resume_unlocked_dio(donor_inode);
1417 if (ret1) 1532 mext_inode_double_unlock(orig_inode, donor_inode);
1418 return ret1;
1419 else if (ret2)
1420 return ret2;
1421 1533
1422 return 0; 1534 return ret;
1423} 1535}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2a42cc04466f..6d600a69fc9d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle,
55{ 55{
56 struct buffer_head *bh; 56 struct buffer_head *bh;
57 57
58 if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
59 ((inode->i_size >> 10) >=
60 EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
61 *err = -ENOSPC;
62 return NULL;
63 }
64
58 *block = inode->i_size >> inode->i_sb->s_blocksize_bits; 65 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
59 66
60 bh = ext4_bread(handle, inode, *block, 1, err); 67 bh = ext4_bread(handle, inode, *block, 1, err);
@@ -67,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle,
67 bh = NULL; 74 bh = NULL;
68 } 75 }
69 } 76 }
77 if (!bh && !(*err)) {
78 *err = -EIO;
79 ext4_error(inode->i_sb,
80 "Directory hole detected on inode %lu\n",
81 inode->i_ino);
82 }
70 return bh; 83 return bh;
71} 84}
72 85
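ext4_append() gains two guards above: the new s_max_dir_size_kb limit refuses to grow a directory past a configured size, and a NULL buffer with no error set (a hole in the directory) is reported as -EIO corruption instead of success. The size check on its own, as a sketch:

#include <errno.h>

/* Refuse to grow a directory past the configured limit in KiB
 * (0 means no limit), as the s_max_dir_size_kb check above does. */
static int dir_may_grow(unsigned long long dir_size_bytes,
                        unsigned long max_dir_size_kb)
{
    if (max_dir_size_kb && (dir_size_bytes >> 10) >= max_dir_size_kb)
        return -ENOSPC;
    return 0;
}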
@@ -594,8 +607,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
594 u32 hash; 607 u32 hash;
595 608
596 frame->bh = NULL; 609 frame->bh = NULL;
597 if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) 610 if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
611 if (*err == 0)
612 *err = ERR_BAD_DX_DIR;
598 goto fail; 613 goto fail;
614 }
599 root = (struct dx_root *) bh->b_data; 615 root = (struct dx_root *) bh->b_data;
600 if (root->info.hash_version != DX_HASH_TEA && 616 if (root->info.hash_version != DX_HASH_TEA &&
601 root->info.hash_version != DX_HASH_HALF_MD4 && 617 root->info.hash_version != DX_HASH_HALF_MD4 &&
@@ -696,8 +712,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
696 frame->entries = entries; 712 frame->entries = entries;
697 frame->at = at; 713 frame->at = at;
698 if (!indirect--) return frame; 714 if (!indirect--) return frame;
699 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) 715 if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
716 if (!(*err))
717 *err = ERR_BAD_DX_DIR;
700 goto fail2; 718 goto fail2;
719 }
701 at = entries = ((struct dx_node *) bh->b_data)->entries; 720 at = entries = ((struct dx_node *) bh->b_data)->entries;
702 721
703 if (!buffer_verified(bh) && 722 if (!buffer_verified(bh) &&
@@ -807,8 +826,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
807 */ 826 */
808 while (num_frames--) { 827 while (num_frames--) {
809 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), 828 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
810 0, &err))) 829 0, &err))) {
830 if (!err) {
831 ext4_error(dir->i_sb,
832 "Directory hole detected on inode %lu\n",
833 dir->i_ino);
834 return -EIO;
835 }
811 return err; /* Failure */ 836 return err; /* Failure */
837 }
812 838
813 if (!buffer_verified(bh) && 839 if (!buffer_verified(bh) &&
814 !ext4_dx_csum_verify(dir, 840 !ext4_dx_csum_verify(dir,
@@ -839,12 +865,19 @@ static int htree_dirblock_to_tree(struct file *dir_file,
839{ 865{
840 struct buffer_head *bh; 866 struct buffer_head *bh;
841 struct ext4_dir_entry_2 *de, *top; 867 struct ext4_dir_entry_2 *de, *top;
842 int err, count = 0; 868 int err = 0, count = 0;
843 869
844 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", 870 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
845 (unsigned long)block)); 871 (unsigned long)block));
846 if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) 872 if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
873 if (!err) {
874 err = -EIO;
875 ext4_error(dir->i_sb,
876 "Directory hole detected on inode %lu\n",
877 dir->i_ino);
878 }
847 return err; 879 return err;
880 }
848 881
849 if (!buffer_verified(bh) && 882 if (!buffer_verified(bh) &&
850 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) 883 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -1267,8 +1300,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1267 return NULL; 1300 return NULL;
1268 do { 1301 do {
1269 block = dx_get_block(frame->at); 1302 block = dx_get_block(frame->at);
1270 if (!(bh = ext4_bread(NULL, dir, block, 0, err))) 1303 if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
1304 if (!(*err)) {
1305 *err = -EIO;
1306 ext4_error(dir->i_sb,
1307 "Directory hole detected on inode %lu\n",
1308 dir->i_ino);
1309 }
1271 goto errout; 1310 goto errout;
1311 }
1272 1312
1273 if (!buffer_verified(bh) && 1313 if (!buffer_verified(bh) &&
1274 !ext4_dirent_csum_verify(dir, 1314 !ext4_dirent_csum_verify(dir,
@@ -1801,9 +1841,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1801 } 1841 }
1802 blocks = dir->i_size >> sb->s_blocksize_bits; 1842 blocks = dir->i_size >> sb->s_blocksize_bits;
1803 for (block = 0; block < blocks; block++) { 1843 for (block = 0; block < blocks; block++) {
1804 bh = ext4_bread(handle, dir, block, 0, &retval); 1844 if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
1805 if(!bh) 1845 if (!retval) {
1846 retval = -EIO;
1847 ext4_error(inode->i_sb,
1848 "Directory hole detected on inode %lu\n",
1849 inode->i_ino);
1850 }
1806 return retval; 1851 return retval;
1852 }
1807 if (!buffer_verified(bh) && 1853 if (!buffer_verified(bh) &&
1808 !ext4_dirent_csum_verify(dir, 1854 !ext4_dirent_csum_verify(dir,
1809 (struct ext4_dir_entry *)bh->b_data)) 1855 (struct ext4_dir_entry *)bh->b_data))
@@ -1860,8 +1906,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1860 entries = frame->entries; 1906 entries = frame->entries;
1861 at = frame->at; 1907 at = frame->at;
1862 1908
1863 if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) 1909 if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
1910 if (!err) {
1911 err = -EIO;
1912 ext4_error(dir->i_sb,
1913 "Directory hole detected on inode %lu\n",
1914 dir->i_ino);
1915 }
1864 goto cleanup; 1916 goto cleanup;
1917 }
1865 1918
1866 if (!buffer_verified(bh) && 1919 if (!buffer_verified(bh) &&
1867 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) 1920 !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
@@ -2149,9 +2202,7 @@ retry:
2149 err = PTR_ERR(inode); 2202 err = PTR_ERR(inode);
2150 if (!IS_ERR(inode)) { 2203 if (!IS_ERR(inode)) {
2151 init_special_inode(inode, inode->i_mode, rdev); 2204 init_special_inode(inode, inode->i_mode, rdev);
2152#ifdef CONFIG_EXT4_FS_XATTR
2153 inode->i_op = &ext4_special_inode_operations; 2205 inode->i_op = &ext4_special_inode_operations;
2154#endif
2155 err = ext4_add_nondir(handle, dentry, inode); 2206 err = ext4_add_nondir(handle, dentry, inode);
2156 } 2207 }
2157 ext4_journal_stop(handle); 2208 ext4_journal_stop(handle);
@@ -2199,9 +2250,15 @@ retry:
2199 inode->i_op = &ext4_dir_inode_operations; 2250 inode->i_op = &ext4_dir_inode_operations;
2200 inode->i_fop = &ext4_dir_operations; 2251 inode->i_fop = &ext4_dir_operations;
2201 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 2252 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
2202 dir_block = ext4_bread(handle, inode, 0, 1, &err); 2253 if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
2203 if (!dir_block) 2254 if (!err) {
2255 err = -EIO;
2256 ext4_error(inode->i_sb,
2257 "Directory hole detected on inode %lu\n",
2258 inode->i_ino);
2259 }
2204 goto out_clear_inode; 2260 goto out_clear_inode;
2261 }
2205 BUFFER_TRACE(dir_block, "get_write_access"); 2262 BUFFER_TRACE(dir_block, "get_write_access");
2206 err = ext4_journal_get_write_access(handle, dir_block); 2263 err = ext4_journal_get_write_access(handle, dir_block);
2207 if (err) 2264 if (err)
@@ -2318,6 +2375,11 @@ static int empty_dir(struct inode *inode)
2318 EXT4_ERROR_INODE(inode, 2375 EXT4_ERROR_INODE(inode,
2319 "error %d reading directory " 2376 "error %d reading directory "
2320 "lblock %u", err, lblock); 2377 "lblock %u", err, lblock);
2378 else
2379 ext4_warning(inode->i_sb,
2380 "bad directory (dir #%lu) - no data block",
2381 inode->i_ino);
2382
2321 offset += sb->s_blocksize; 2383 offset += sb->s_blocksize;
2322 continue; 2384 continue;
2323 } 2385 }
@@ -2362,7 +2424,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2362 struct ext4_iloc iloc; 2424 struct ext4_iloc iloc;
2363 int err = 0, rc; 2425 int err = 0, rc;
2364 2426
2365 if (!ext4_handle_valid(handle)) 2427 if (!EXT4_SB(sb)->s_journal)
2366 return 0; 2428 return 0;
2367 2429
2368 mutex_lock(&EXT4_SB(sb)->s_orphan_lock); 2430 mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
@@ -2436,8 +2498,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2436 struct ext4_iloc iloc; 2498 struct ext4_iloc iloc;
2437 int err = 0; 2499 int err = 0;
2438 2500
2439 /* ext4_handle_valid() assumes a valid handle_t pointer */ 2501 if (!EXT4_SB(inode->i_sb)->s_journal)
2440 if (handle && !ext4_handle_valid(handle))
2441 return 0; 2502 return 0;
2442 2503
2443 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2504 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2456,7 +2517,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2456 * transaction handle with which to update the orphan list on 2517 * transaction handle with which to update the orphan list on
2457 * disk, but we still need to remove the inode from the linked 2518 * disk, but we still need to remove the inode from the linked
2458 * list in memory. */ 2519 * list in memory. */
2459 if (sbi->s_journal && !handle) 2520 if (!handle)
2460 goto out; 2521 goto out;
2461 2522
2462 err = ext4_reserve_inode_write(handle, inode, &iloc); 2523 err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2826,9 +2887,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2826 goto end_rename; 2887 goto end_rename;
2827 } 2888 }
2828 retval = -EIO; 2889 retval = -EIO;
2829 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); 2890 if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) {
2830 if (!dir_bh) 2891 if (!retval) {
2892 retval = -EIO;
2893 ext4_error(old_inode->i_sb,
2894 "Directory hole detected on inode %lu\n",
2895 old_inode->i_ino);
2896 }
2831 goto end_rename; 2897 goto end_rename;
2898 }
2832 if (!buffer_verified(dir_bh) && 2899 if (!buffer_verified(dir_bh) &&
2833 !ext4_dirent_csum_verify(old_inode, 2900 !ext4_dirent_csum_verify(old_inode,
2834 (struct ext4_dir_entry *)dir_bh->b_data)) 2901 (struct ext4_dir_entry *)dir_bh->b_data))
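
Both namei.c hunks above hinge on the same ext4_bread() calling convention: a NULL return with the error pointer still zero means the logical block is a hole in the directory, not a read failure, and the old bare NULL checks silently conflated the two. A minimal userspace sketch of that convention (fake_bread and read_dir_block are illustrative names, not kernel APIs):

#include <errno.h>
#include <stdio.h>

struct buffer_head;                     /* opaque stand-in */

/* Models ext4_bread(): NULL with *err == 0 means "hole", NULL with
 * *err != 0 means a real read error. Block 0 is assumed a hole here. */
static struct buffer_head *fake_bread(unsigned lblock, int *err)
{
        *err = 0;
        return NULL;
}

static int read_dir_block(unsigned lblock)
{
        int err;
        struct buffer_head *bh = fake_bread(lblock, &err);

        if (!bh) {
                if (!err) {             /* hole: promote to -EIO and report */
                        err = -EIO;
                        fprintf(stderr,
                                "Directory hole detected, lblock %u\n",
                                lblock);
                }
                return err;
        }
        return 0;
}

int main(void)
{
        return read_dir_block(0) == -EIO ? 0 : 1;
}
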
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dcdeef169a69..68e896e12a67 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -71,6 +71,9 @@ void ext4_free_io_end(ext4_io_end_t *io)
71 int i; 71 int i;
72 72
73 BUG_ON(!io); 73 BUG_ON(!io);
74 BUG_ON(!list_empty(&io->list));
75 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
76
74 if (io->page) 77 if (io->page)
75 put_page(io->page); 78 put_page(io->page);
76 for (i = 0; i < io->num_io_pages; i++) 79 for (i = 0; i < io->num_io_pages; i++)
@@ -81,13 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
81 kmem_cache_free(io_end_cachep, io); 84 kmem_cache_free(io_end_cachep, io);
82} 85}
83 86
84/* 87/* check a range of space and convert unwritten extents to written. */
85 * check a range of space and convert unwritten extents to written. 88static int ext4_end_io(ext4_io_end_t *io)
86 *
87 * Called with inode->i_mutex; we depend on this when we manipulate
88 * io->flag, since we could otherwise race with ext4_flush_completed_IO()
89 */
90int ext4_end_io_nolock(ext4_io_end_t *io)
91{ 89{
92 struct inode *inode = io->inode; 90 struct inode *inode = io->inode;
93 loff_t offset = io->offset; 91 loff_t offset = io->offset;
@@ -106,63 +104,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
106 "(inode %lu, offset %llu, size %zd, error %d)", 104 "(inode %lu, offset %llu, size %zd, error %d)",
107 inode->i_ino, offset, size, ret); 105 inode->i_ino, offset, size, ret);
108 } 106 }
109
110 if (io->iocb) 107 if (io->iocb)
111 aio_complete(io->iocb, io->result, 0); 108 aio_complete(io->iocb, io->result, 0);
112 109
113 if (io->flag & EXT4_IO_END_DIRECT) 110 if (io->flag & EXT4_IO_END_DIRECT)
114 inode_dio_done(inode); 111 inode_dio_done(inode);
115 /* Wake up anyone waiting on unwritten extent conversion */ 112 /* Wake up anyone waiting on unwritten extent conversion */
116 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) 113 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
117 wake_up_all(ext4_ioend_wq(io->inode)); 114 wake_up_all(ext4_ioend_wq(io->inode));
118 return ret; 115 return ret;
119} 116}
120 117
121/* 118static void dump_completed_IO(struct inode *inode)
122 * work on completed aio dio IO, to convert unwritten extents to extents 119{
123 */ 120#ifdef EXT4FS_DEBUG
124static void ext4_end_io_work(struct work_struct *work) 121 struct list_head *cur, *before, *after;
122 ext4_io_end_t *io, *io0, *io1;
123 unsigned long flags;
124
125 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
126 ext4_debug("inode %lu completed_io list is empty\n",
127 inode->i_ino);
128 return;
129 }
130
131 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
132 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
133 cur = &io->list;
134 before = cur->prev;
135 io0 = container_of(before, ext4_io_end_t, list);
136 after = cur->next;
137 io1 = container_of(after, ext4_io_end_t, list);
138
139 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
140 io, inode->i_ino, io0, io1);
141 }
142#endif
143}
144
145/* Add the io_end to per-inode completed end_io list. */
146void ext4_add_complete_io(ext4_io_end_t *io_end)
125{ 147{
126 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); 148 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
127 struct inode *inode = io->inode; 149 struct workqueue_struct *wq;
128 struct ext4_inode_info *ei = EXT4_I(inode); 150 unsigned long flags;
129 unsigned long flags; 151
152 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
153 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
130 154
131 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 155 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
132 if (io->flag & EXT4_IO_END_IN_FSYNC) 156 if (list_empty(&ei->i_completed_io_list)) {
133 goto requeue; 157 io_end->flag |= EXT4_IO_END_QUEUED;
134 if (list_empty(&io->list)) { 158 queue_work(wq, &io_end->work);
135 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
136 goto free;
137 } 159 }
160 list_add_tail(&io_end->list, &ei->i_completed_io_list);
161 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
162}
138 163
139 if (!mutex_trylock(&inode->i_mutex)) { 164static int ext4_do_flush_completed_IO(struct inode *inode,
140 bool was_queued; 165 ext4_io_end_t *work_io)
141requeue: 166{
142 was_queued = !!(io->flag & EXT4_IO_END_QUEUED); 167 ext4_io_end_t *io;
143 io->flag |= EXT4_IO_END_QUEUED; 168 struct list_head unwritten, complete, to_free;
144 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 169 unsigned long flags;
145 /* 170 struct ext4_inode_info *ei = EXT4_I(inode);
146 * Requeue the work instead of waiting so that the work 171 int err, ret = 0;
147 * items queued after this can be processed. 172
148 */ 173 INIT_LIST_HEAD(&complete);
149 queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); 174 INIT_LIST_HEAD(&to_free);
150 /* 175
151 * To prevent the ext4-dio-unwritten thread from keeping 176 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
152 * requeueing end_io requests and occupying cpu for too long, 177 dump_completed_IO(inode);
153 * yield the cpu if it sees an end_io request that has already 178 list_replace_init(&ei->i_completed_io_list, &unwritten);
154 * been requeued. 179 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
155 */ 180
156 if (was_queued) 181 while (!list_empty(&unwritten)) {
157 yield(); 182 io = list_entry(unwritten.next, ext4_io_end_t, list);
158 return; 183 BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
184 list_del_init(&io->list);
185
186 err = ext4_end_io(io);
187 if (unlikely(!ret && err))
188 ret = err;
189
190 list_add_tail(&io->list, &complete);
191 }
192 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
193 while (!list_empty(&complete)) {
194 io = list_entry(complete.next, ext4_io_end_t, list);
195 io->flag &= ~EXT4_IO_END_UNWRITTEN;
196		/* end_io context cannot be destroyed now because it is still
197		 * used by a queued worker; the worker thread will destroy it later */
198 if (io->flag & EXT4_IO_END_QUEUED)
199 list_del_init(&io->list);
200 else
201 list_move(&io->list, &to_free);
202 }
203	/* If we are called from worker context, it is time to clear the queued
204	 * flag and destroy its end_io if it was already converted */
205 if (work_io) {
206 work_io->flag &= ~EXT4_IO_END_QUEUED;
207 if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
208 list_add_tail(&work_io->list, &to_free);
159 } 209 }
160 list_del_init(&io->list);
161 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 210 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
162 (void) ext4_end_io_nolock(io); 211
163 mutex_unlock(&inode->i_mutex); 212 while (!list_empty(&to_free)) {
164free: 213 io = list_entry(to_free.next, ext4_io_end_t, list);
165 ext4_free_io_end(io); 214 list_del_init(&io->list);
215 ext4_free_io_end(io);
216 }
217 return ret;
218}
219
220/*
221 * work on completed aio dio IO, to convert unwritten extents to written extents
222 */
223static void ext4_end_io_work(struct work_struct *work)
224{
225 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
226 ext4_do_flush_completed_IO(io->inode, io);
227}
228
229int ext4_flush_unwritten_io(struct inode *inode)
230{
231 int ret;
232 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
233 !(inode->i_state & I_FREEING));
234 ret = ext4_do_flush_completed_IO(inode, NULL);
235 ext4_unwritten_wait(inode);
236 return ret;
166} 237}
167 238
168ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 239ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -195,9 +266,7 @@ static void buffer_io_error(struct buffer_head *bh)
195static void ext4_end_bio(struct bio *bio, int error) 266static void ext4_end_bio(struct bio *bio, int error)
196{ 267{
197 ext4_io_end_t *io_end = bio->bi_private; 268 ext4_io_end_t *io_end = bio->bi_private;
198 struct workqueue_struct *wq;
199 struct inode *inode; 269 struct inode *inode;
200 unsigned long flags;
201 int i; 270 int i;
202 sector_t bi_sector = bio->bi_sector; 271 sector_t bi_sector = bio->bi_sector;
203 272
@@ -255,14 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
255 return; 324 return;
256 } 325 }
257 326
258 /* Add the io_end to per-inode completed io list*/ 327 ext4_add_complete_io(io_end);
259 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
260 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
261 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
262
263 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
264 /* queue the work to convert unwritten extents to written */
265 queue_work(wq, &io_end->work);
266} 328}
267 329
268void ext4_io_submit(struct ext4_io_submit *io) 330void ext4_io_submit(struct ext4_io_submit *io)
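
The page-io.c rework above replaces the old trylock-or-requeue dance with a detach-and-drain idiom: ext4_do_flush_completed_IO() steals the entire per-inode list in one shot under the spinlock (list_replace_init) and then converts the entries with the lock dropped. A standalone model of that idiom, with a pthread mutex and a hand-rolled singly linked list standing in for the kernel primitives:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct io_end {                         /* models ext4_io_end_t */
        struct io_end *next;
        int id;
};

static pthread_mutex_t completed_lock = PTHREAD_MUTEX_INITIALIZER;
static struct io_end *completed_list;   /* models i_completed_io_list */

static void queue_io_end(int id)
{
        struct io_end *io = malloc(sizeof(*io));

        io->id = id;
        pthread_mutex_lock(&completed_lock);
        io->next = completed_list;      /* bio-completion side list_add */
        completed_list = io;
        pthread_mutex_unlock(&completed_lock);
}

static void flush_completed(void)
{
        struct io_end *unwritten, *io;

        pthread_mutex_lock(&completed_lock);
        unwritten = completed_list;     /* list_replace_init() analogue */
        completed_list = NULL;
        pthread_mutex_unlock(&completed_lock);

        while ((io = unwritten) != NULL) {  /* convert with lock dropped */
                unwritten = io->next;
                printf("converting io_end %d\n", io->id);
                free(io);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++)
                queue_io_end(i);
        flush_completed();
        return 0;
}
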
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 41f6ef68e2e1..7a75e1086961 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -45,6 +45,28 @@ void ext4_resize_end(struct super_block *sb)
45 smp_mb__after_clear_bit(); 45 smp_mb__after_clear_bit();
46} 46}
47 47
48static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
49 ext4_group_t group) {
50 return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) <<
51 EXT4_DESC_PER_BLOCK_BITS(sb);
52}
53
54static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb,
55 ext4_group_t group) {
56 group = ext4_meta_bg_first_group(sb, group);
57 return ext4_group_first_block_no(sb, group);
58}
59
60static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb,
61 ext4_group_t group) {
62 ext4_grpblk_t overhead;
63 overhead = ext4_bg_num_gdb(sb, group);
64 if (ext4_bg_has_super(sb, group))
65 overhead += 1 +
66 le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
67 return overhead;
68}
69
48#define outside(b, first, last) ((b) < (first) || (b) >= (last)) 70#define outside(b, first, last) ((b) < (first) || (b) >= (last))
49#define inside(b, first, last) ((b) >= (first) && (b) < (last)) 71#define inside(b, first, last) ((b) >= (first) && (b) < (last))
50 72
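
ext4_meta_bg_first_group() above rounds a group number down to the start of its meta block group by clearing the low EXT4_DESC_PER_BLOCK_BITS bits. A sketch of that arithmetic, assuming 128 descriptors per block (desc_bits == 7):

#include <stdio.h>

static unsigned meta_bg_first_group(unsigned group, unsigned desc_bits)
{
        return (group >> desc_bits) << desc_bits;   /* clear low bits */
}

int main(void)
{
        /* group 300 sits in the meta_bg that starts at group 256 */
        printf("%u\n", meta_bg_first_group(300, 7));
        return 0;
}
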
@@ -57,9 +79,7 @@ static int verify_group_input(struct super_block *sb,
57 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
58 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
59 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
60 unsigned overhead = ext4_bg_has_super(sb, group) ? 82 unsigned overhead = ext4_group_overhead_blocks(sb, group);
61 (1 + ext4_bg_num_gdb(sb, group) +
62 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
63 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend = start + overhead;
64 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
65 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
@@ -200,13 +220,15 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
200 * be a partial of a flex group. 220 * be a partial of a flex group.
201 * 221 *
202 * @sb: super block of fs to which the groups belongs 222 * @sb: super block of fs to which the groups belongs
223 *
224 * Returns 0 on a successful allocation of the metadata blocks in the
225 * block group.
203 */ 226 */
204static void ext4_alloc_group_tables(struct super_block *sb, 227static int ext4_alloc_group_tables(struct super_block *sb,
205 struct ext4_new_flex_group_data *flex_gd, 228 struct ext4_new_flex_group_data *flex_gd,
206 int flexbg_size) 229 int flexbg_size)
207{ 230{
208 struct ext4_new_group_data *group_data = flex_gd->groups; 231 struct ext4_new_group_data *group_data = flex_gd->groups;
209 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
210 ext4_fsblk_t start_blk; 232 ext4_fsblk_t start_blk;
211 ext4_fsblk_t last_blk; 233 ext4_fsblk_t last_blk;
212 ext4_group_t src_group; 234 ext4_group_t src_group;
@@ -226,23 +248,24 @@ static void ext4_alloc_group_tables(struct super_block *sb,
226 (last_group & ~(flexbg_size - 1)))); 248 (last_group & ~(flexbg_size - 1))));
227next_group: 249next_group:
228 group = group_data[0].group; 250 group = group_data[0].group;
251 if (src_group >= group_data[0].group + flex_gd->count)
252 return -ENOSPC;
229 start_blk = ext4_group_first_block_no(sb, src_group); 253 start_blk = ext4_group_first_block_no(sb, src_group);
230 last_blk = start_blk + group_data[src_group - group].blocks_count; 254 last_blk = start_blk + group_data[src_group - group].blocks_count;
231 255
232 overhead = ext4_bg_has_super(sb, src_group) ? 256 overhead = ext4_group_overhead_blocks(sb, src_group);
233 (1 + ext4_bg_num_gdb(sb, src_group) +
234 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
235 257
236 start_blk += overhead; 258 start_blk += overhead;
237 259
238 BUG_ON(src_group >= group_data[0].group + flex_gd->count);
239 /* We collect contiguous blocks as much as possible. */ 260 /* We collect contiguous blocks as much as possible. */
240 src_group++; 261 src_group++;
241 for (; src_group <= last_group; src_group++) 262 for (; src_group <= last_group; src_group++) {
242 if (!ext4_bg_has_super(sb, src_group)) 263 overhead = ext4_group_overhead_blocks(sb, src_group);
264 if (overhead != 0)
243 last_blk += group_data[src_group - group].blocks_count; 265 last_blk += group_data[src_group - group].blocks_count;
244 else 266 else
245 break; 267 break;
268 }
246 269
247 /* Allocate block bitmaps */ 270 /* Allocate block bitmaps */
248 for (; bb_index < flex_gd->count; bb_index++) { 271 for (; bb_index < flex_gd->count; bb_index++) {
@@ -300,6 +323,7 @@ next_group:
300 group_data[i].free_blocks_count); 323 group_data[i].free_blocks_count);
301 } 324 }
302 } 325 }
326 return 0;
303} 327}
304 328
305static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, 329static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
@@ -433,11 +457,13 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
433 ext4_group_t group, count; 457 ext4_group_t group, count;
434 struct buffer_head *bh = NULL; 458 struct buffer_head *bh = NULL;
435 int reserved_gdb, i, j, err = 0, err2; 459 int reserved_gdb, i, j, err = 0, err2;
460 int meta_bg;
436 461
437 BUG_ON(!flex_gd->count || !group_data || 462 BUG_ON(!flex_gd->count || !group_data ||
438 group_data[0].group != sbi->s_groups_count); 463 group_data[0].group != sbi->s_groups_count);
439 464
440 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); 465 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
466 meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
441 467
442 /* This transaction may be extended/restarted along the way */ 468 /* This transaction may be extended/restarted along the way */
443 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); 469 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
@@ -447,12 +473,25 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
447 group = group_data[0].group; 473 group = group_data[0].group;
448 for (i = 0; i < flex_gd->count; i++, group++) { 474 for (i = 0; i < flex_gd->count; i++, group++) {
449 unsigned long gdblocks; 475 unsigned long gdblocks;
476 ext4_grpblk_t overhead;
450 477
451 gdblocks = ext4_bg_num_gdb(sb, group); 478 gdblocks = ext4_bg_num_gdb(sb, group);
452 start = ext4_group_first_block_no(sb, group); 479 start = ext4_group_first_block_no(sb, group);
453 480
481 if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
482 goto handle_itb;
483
484 if (meta_bg == 1) {
485 ext4_group_t first_group;
486 first_group = ext4_meta_bg_first_group(sb, group);
487 if (first_group != group + 1 &&
488 first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
489 goto handle_itb;
490 }
491
492 block = start + ext4_bg_has_super(sb, group);
454 /* Copy all of the GDT blocks into the backup in this group */ 493 /* Copy all of the GDT blocks into the backup in this group */
455 for (j = 0, block = start + 1; j < gdblocks; j++, block++) { 494 for (j = 0; j < gdblocks; j++, block++) {
456 struct buffer_head *gdb; 495 struct buffer_head *gdb;
457 496
458 ext4_debug("update backup group %#04llx\n", block); 497 ext4_debug("update backup group %#04llx\n", block);
@@ -493,6 +532,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
493 goto out; 532 goto out;
494 } 533 }
495 534
535handle_itb:
496		/* Initialize group tables of the group @group */		536
497 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) 537 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
498 goto handle_bb; 538 goto handle_bb;
@@ -521,11 +561,11 @@ handle_bb:
521 err = PTR_ERR(bh); 561 err = PTR_ERR(bh);
522 goto out; 562 goto out;
523 } 563 }
524 if (ext4_bg_has_super(sb, group)) { 564 overhead = ext4_group_overhead_blocks(sb, group);
565 if (overhead != 0) {
525 ext4_debug("mark backup superblock %#04llx (+0)\n", 566 ext4_debug("mark backup superblock %#04llx (+0)\n",
526 start); 567 start);
527 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 568 ext4_set_bits(bh->b_data, 0, overhead);
528 1);
529 } 569 }
530 ext4_mark_bitmap_end(group_data[i].blocks_count, 570 ext4_mark_bitmap_end(group_data[i].blocks_count,
531 sb->s_blocksize * 8, bh->b_data); 571 sb->s_blocksize * 8, bh->b_data);
@@ -822,6 +862,45 @@ exit_bh:
822} 862}
823 863
824/* 864/*
865 * add_new_gdb_meta_bg is the sister of add_new_gdb.
866 */
867static int add_new_gdb_meta_bg(struct super_block *sb,
868 handle_t *handle, ext4_group_t group) {
869 ext4_fsblk_t gdblock;
870 struct buffer_head *gdb_bh;
871 struct buffer_head **o_group_desc, **n_group_desc;
872 unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
873 int err;
874
875 gdblock = ext4_meta_bg_first_block_no(sb, group) +
876 ext4_bg_has_super(sb, group);
877 gdb_bh = sb_bread(sb, gdblock);
878 if (!gdb_bh)
879 return -EIO;
880 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
881 sizeof(struct buffer_head *),
882 GFP_NOFS);
883 if (!n_group_desc) {
884 err = -ENOMEM;
885 ext4_warning(sb, "not enough memory for %lu groups",
886 gdb_num + 1);
887 return err;
888 }
889
890 o_group_desc = EXT4_SB(sb)->s_group_desc;
891 memcpy(n_group_desc, o_group_desc,
892 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
893 n_group_desc[gdb_num] = gdb_bh;
894 EXT4_SB(sb)->s_group_desc = n_group_desc;
895 EXT4_SB(sb)->s_gdb_count++;
896 ext4_kvfree(o_group_desc);
897 err = ext4_journal_get_write_access(handle, gdb_bh);
898 if (unlikely(err))
899 brelse(gdb_bh);
900 return err;
901}
902
903/*
825 * Called when we are adding a new group which has a backup copy of each of 904 * Called when we are adding a new group which has a backup copy of each of
826 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. 905 * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
827 * We need to add these reserved backup GDT blocks to the resize inode, so 906 * We need to add these reserved backup GDT blocks to the resize inode, so
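
add_new_gdb_meta_bg() grows the in-memory descriptor-buffer array by copy-and-swap: build a one-slot-larger copy, publish it, then free the old array. A standalone sketch of that growth step (grow_desc_array is an illustrative name):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void **grow_desc_array(void **old, size_t old_count, void *new_bh)
{
        void **n = malloc((old_count + 1) * sizeof(*n));

        if (!n)
                return NULL;            /* caller keeps the old array */
        memcpy(n, old, old_count * sizeof(*n));
        n[old_count] = new_bh;          /* n_group_desc[gdb_num] = gdb_bh */
        free(old);                      /* ext4_kvfree(o_group_desc) */
        return n;
}

int main(void)
{
        void **descs = calloc(1, sizeof(*descs));
        int dummy_bh;
        void **grown = grow_desc_array(descs, 1, &dummy_bh);

        if (!grown) {
                free(descs);
                return 1;
        }
        printf("slot 1 holds %p\n", grown[1]);
        free(grown);
        return 0;
}
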
@@ -949,16 +1028,16 @@ exit_free:
949 * do not copy the full number of backups at this time. The resize 1028 * do not copy the full number of backups at this time. The resize
950 * which changed s_groups_count will backup again. 1029 * which changed s_groups_count will backup again.
951 */ 1030 */
952static void update_backups(struct super_block *sb, 1031static void update_backups(struct super_block *sb, int blk_off, char *data,
953 int blk_off, char *data, int size) 1032 int size, int meta_bg)
954{ 1033{
955 struct ext4_sb_info *sbi = EXT4_SB(sb); 1034 struct ext4_sb_info *sbi = EXT4_SB(sb);
956 const ext4_group_t last = sbi->s_groups_count; 1035 ext4_group_t last;
957 const int bpg = EXT4_BLOCKS_PER_GROUP(sb); 1036 const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
958 unsigned three = 1; 1037 unsigned three = 1;
959 unsigned five = 5; 1038 unsigned five = 5;
960 unsigned seven = 7; 1039 unsigned seven = 7;
961 ext4_group_t group; 1040 ext4_group_t group = 0;
962 int rest = sb->s_blocksize - size; 1041 int rest = sb->s_blocksize - size;
963 handle_t *handle; 1042 handle_t *handle;
964 int err = 0, err2; 1043 int err = 0, err2;
@@ -970,10 +1049,17 @@ static void update_backups(struct super_block *sb,
970 goto exit_err; 1049 goto exit_err;
971 } 1050 }
972 1051
973 ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); 1052 if (meta_bg == 0) {
1053 group = ext4_list_backups(sb, &three, &five, &seven);
1054 last = sbi->s_groups_count;
1055 } else {
1056 group = ext4_meta_bg_first_group(sb, group) + 1;
1057 last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
1058 }
974 1059
975 while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { 1060 while (group < sbi->s_groups_count) {
976 struct buffer_head *bh; 1061 struct buffer_head *bh;
1062 ext4_fsblk_t backup_block;
977 1063
978 /* Out of journal space, and can't get more - abort - so sad */ 1064 /* Out of journal space, and can't get more - abort - so sad */
979 if (ext4_handle_valid(handle) && 1065 if (ext4_handle_valid(handle) &&
@@ -982,13 +1068,20 @@ static void update_backups(struct super_block *sb,
982 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 1068 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
983 break; 1069 break;
984 1070
985 bh = sb_getblk(sb, group * bpg + blk_off); 1071 if (meta_bg == 0)
1072 backup_block = group * bpg + blk_off;
1073 else
1074 backup_block = (ext4_group_first_block_no(sb, group) +
1075 ext4_bg_has_super(sb, group));
1076
1077 bh = sb_getblk(sb, backup_block);
986 if (!bh) { 1078 if (!bh) {
987 err = -EIO; 1079 err = -EIO;
988 break; 1080 break;
989 } 1081 }
990 ext4_debug("update metadata backup %#04lx\n", 1082 ext4_debug("update metadata backup %llu(+%llu)\n",
991 (unsigned long)bh->b_blocknr); 1083 backup_block, backup_block -
1084 ext4_group_first_block_no(sb, group));
992 if ((err = ext4_journal_get_write_access(handle, bh))) 1085 if ((err = ext4_journal_get_write_access(handle, bh)))
993 break; 1086 break;
994 lock_buffer(bh); 1087 lock_buffer(bh);
@@ -1001,6 +1094,13 @@ static void update_backups(struct super_block *sb,
1001 if (unlikely(err)) 1094 if (unlikely(err))
1002 ext4_std_error(sb, err); 1095 ext4_std_error(sb, err);
1003 brelse(bh); 1096 brelse(bh);
1097
1098 if (meta_bg == 0)
1099 group = ext4_list_backups(sb, &three, &five, &seven);
1100 else if (group == last)
1101 break;
1102 else
1103 group = last;
1004 } 1104 }
1005 if ((err2 = ext4_journal_stop(handle)) && !err) 1105 if ((err2 = ext4_journal_stop(handle)) && !err)
1006 err = err2; 1106 err = err2;
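
update_backups() now takes a meta_bg flag because the two layouts place the backups differently: with sparse_super they live in group 1 and in groups that are powers of 3, 5 and 7 (walked via the three/five/seven counters), while with meta_bg they live in the second and last group of each meta block group. A standalone model of the sparse_super walk matching those counters:

#include <stdio.h>

static unsigned long next_backup(unsigned long *three, unsigned long *five,
                                 unsigned long *seven)
{
        unsigned long ret = *three;     /* smallest of the three streams */
        unsigned long *min = three;
        int mult = 3;

        if (*five < ret) {
                ret = *five;
                min = five;
                mult = 5;
        }
        if (*seven < ret) {
                ret = *seven;
                min = seven;
                mult = 7;
        }
        *min *= mult;
        return ret;
}

int main(void)
{
        unsigned long three = 1, five = 5, seven = 7, group;

        /* prints 1, 3, 5, 7, 9, 25, 27, 49, 81, 125, 243, 343, 625, 729 */
        while ((group = next_backup(&three, &five, &seven)) < 1000)
                printf("backup in group %lu\n", group);
        return 0;
}
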
@@ -1043,7 +1143,9 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
1043 struct ext4_super_block *es = sbi->s_es; 1143 struct ext4_super_block *es = sbi->s_es;
1044 struct buffer_head *gdb_bh; 1144 struct buffer_head *gdb_bh;
1045 int i, gdb_off, gdb_num, err = 0; 1145 int i, gdb_off, gdb_num, err = 0;
1146 int meta_bg;
1046 1147
1148 meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
1047 for (i = 0; i < count; i++, group++) { 1149 for (i = 0; i < count; i++, group++) {
1048 int reserved_gdb = ext4_bg_has_super(sb, group) ? 1150 int reserved_gdb = ext4_bg_has_super(sb, group) ?
1049 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1151 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1063,8 +1165,11 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
1063 1165
1064 if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) 1166 if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
1065 err = reserve_backup_gdb(handle, resize_inode, group); 1167 err = reserve_backup_gdb(handle, resize_inode, group);
1066 } else 1168 } else if (meta_bg != 0) {
1169 err = add_new_gdb_meta_bg(sb, handle, group);
1170 } else {
1067 err = add_new_gdb(handle, resize_inode, group); 1171 err = add_new_gdb(handle, resize_inode, group);
1172 }
1068 if (err) 1173 if (err)
1069 break; 1174 break;
1070 } 1175 }
@@ -1076,17 +1181,12 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
1076 struct buffer_head *bh = sb_getblk(sb, block); 1181 struct buffer_head *bh = sb_getblk(sb, block);
1077 if (!bh) 1182 if (!bh)
1078 return NULL; 1183 return NULL;
1079 1184 if (!bh_uptodate_or_lock(bh)) {
1080 if (bitmap_uptodate(bh)) 1185 if (bh_submit_read(bh) < 0) {
1081 return bh; 1186 brelse(bh);
1082 1187 return NULL;
1083 lock_buffer(bh); 1188 }
1084 if (bh_submit_read(bh) < 0) {
1085 unlock_buffer(bh);
1086 brelse(bh);
1087 return NULL;
1088 } 1189 }
1089 unlock_buffer(bh);
1090 1190
1091 return bh; 1191 return bh;
1092} 1192}
@@ -1161,6 +1261,9 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
1161 ext4_free_group_clusters_set(sb, gdp, 1261 ext4_free_group_clusters_set(sb, gdp,
1162 EXT4_B2C(sbi, group_data->free_blocks_count)); 1262 EXT4_B2C(sbi, group_data->free_blocks_count));
1163 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); 1263 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
1264 if (ext4_has_group_desc_csum(sb))
1265 ext4_itable_unused_set(sb, gdp,
1266 EXT4_INODES_PER_GROUP(sb));
1164 gdp->bg_flags = cpu_to_le16(*bg_flags); 1267 gdp->bg_flags = cpu_to_le16(*bg_flags);
1165 ext4_group_desc_csum_set(sb, group, gdp); 1268 ext4_group_desc_csum_set(sb, group, gdp);
1166 1269
@@ -1216,7 +1319,7 @@ static void ext4_update_super(struct super_block *sb,
1216 } 1319 }
1217 1320
1218 reserved_blocks = ext4_r_blocks_count(es) * 100; 1321 reserved_blocks = ext4_r_blocks_count(es) * 100;
1219 do_div(reserved_blocks, ext4_blocks_count(es)); 1322 reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
1220 reserved_blocks *= blocks_count; 1323 reserved_blocks *= blocks_count;
1221 do_div(reserved_blocks, 100); 1324 do_div(reserved_blocks, 100);
1222 1325
@@ -1227,6 +1330,7 @@ static void ext4_update_super(struct super_block *sb,
1227 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * 1330 le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1228 flex_gd->count); 1331 flex_gd->count);
1229 1332
1333 ext4_debug("free blocks count %llu", ext4_free_blocks_count(es));
1230 /* 1334 /*
1231 * We need to protect s_groups_count against other CPUs seeing 1335 * We need to protect s_groups_count against other CPUs seeing
1232 * inconsistent state in the superblock. 1336 * inconsistent state in the superblock.
@@ -1261,6 +1365,8 @@ static void ext4_update_super(struct super_block *sb,
1261 percpu_counter_add(&sbi->s_freeinodes_counter, 1365 percpu_counter_add(&sbi->s_freeinodes_counter,
1262 EXT4_INODES_PER_GROUP(sb) * flex_gd->count); 1366 EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
1263 1367
1368 ext4_debug("free blocks count %llu",
1369 percpu_counter_read(&sbi->s_freeclusters_counter));
1264 if (EXT4_HAS_INCOMPAT_FEATURE(sb, 1370 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1265 EXT4_FEATURE_INCOMPAT_FLEX_BG) && 1371 EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
1266 sbi->s_log_groups_per_flex) { 1372 sbi->s_log_groups_per_flex) {
@@ -1349,16 +1455,24 @@ exit_journal:
1349 err = err2; 1455 err = err2;
1350 1456
1351 if (!err) { 1457 if (!err) {
1352 int i; 1458 int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
1459 int gdb_num_end = ((group + flex_gd->count - 1) /
1460 EXT4_DESC_PER_BLOCK(sb));
1461 int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
1462 EXT4_FEATURE_INCOMPAT_META_BG);
1463 sector_t old_gdb = 0;
1464
1353 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 1465 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
1354 sizeof(struct ext4_super_block)); 1466 sizeof(struct ext4_super_block), 0);
1355 for (i = 0; i < flex_gd->count; i++, group++) { 1467 for (; gdb_num <= gdb_num_end; gdb_num++) {
1356 struct buffer_head *gdb_bh; 1468 struct buffer_head *gdb_bh;
1357 int gdb_num; 1469
1358 gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
1359 gdb_bh = sbi->s_group_desc[gdb_num]; 1470 gdb_bh = sbi->s_group_desc[gdb_num];
1471 if (old_gdb == gdb_bh->b_blocknr)
1472 continue;
1360 update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, 1473 update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
1361 gdb_bh->b_size); 1474 gdb_bh->b_size, meta_bg);
1475 old_gdb = gdb_bh->b_blocknr;
1362 } 1476 }
1363 } 1477 }
1364exit: 1478exit:
@@ -1402,9 +1516,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
1402 1516
1403 group_data[i].group = group + i; 1517 group_data[i].group = group + i;
1404 group_data[i].blocks_count = blocks_per_group; 1518 group_data[i].blocks_count = blocks_per_group;
1405 overhead = ext4_bg_has_super(sb, group + i) ? 1519 overhead = ext4_group_overhead_blocks(sb, group + i);
1406 (1 + ext4_bg_num_gdb(sb, group + i) +
1407 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
1408 group_data[i].free_blocks_count = blocks_per_group - overhead; 1520 group_data[i].free_blocks_count = blocks_per_group - overhead;
1409 if (ext4_has_group_desc_csum(sb)) 1521 if (ext4_has_group_desc_csum(sb))
1410 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | 1522 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
@@ -1492,6 +1604,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1492 if (err) 1604 if (err)
1493 goto out; 1605 goto out;
1494 1606
1607 err = ext4_alloc_flex_bg_array(sb, input->group + 1);
1608 if (err)
1609 return err;
1610
1611 err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
1612 if (err)
1613 goto out;
1614
1495 flex_gd.count = 1; 1615 flex_gd.count = 1;
1496 flex_gd.groups = input; 1616 flex_gd.groups = input;
1497 flex_gd.bg_flags = &bg_flags; 1617 flex_gd.bg_flags = &bg_flags;
@@ -1544,11 +1664,13 @@ errout:
1544 err = err2; 1664 err = err2;
1545 1665
1546 if (!err) { 1666 if (!err) {
1667 ext4_fsblk_t first_block;
1668 first_block = ext4_group_first_block_no(sb, 0);
1547 if (test_opt(sb, DEBUG)) 1669 if (test_opt(sb, DEBUG))
1548 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1670 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1549 "blocks\n", ext4_blocks_count(es)); 1671 "blocks\n", ext4_blocks_count(es));
1550 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, 1672 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
1551 sizeof(struct ext4_super_block)); 1673 (char *)es, sizeof(struct ext4_super_block), 0);
1552 } 1674 }
1553 return err; 1675 return err;
1554} 1676}
@@ -1631,6 +1753,94 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1631 return err; 1753 return err;
1632} /* ext4_group_extend */ 1754} /* ext4_group_extend */
1633 1755
1756
1757static int num_desc_blocks(struct super_block *sb, ext4_group_t groups)
1758{
1759 return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb);
1760}
1761
1762/*
1763 * Release the resize inode and drop the resize_inode feature if there
1764 * are no more reserved gdt blocks, and then convert the file system
1765 * to enable meta_bg
1766 */
1767static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
1768{
1769 handle_t *handle;
1770 struct ext4_sb_info *sbi = EXT4_SB(sb);
1771 struct ext4_super_block *es = sbi->s_es;
1772 struct ext4_inode_info *ei = EXT4_I(inode);
1773 ext4_fsblk_t nr;
1774 int i, ret, err = 0;
1775 int credits = 1;
1776
1777 ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg");
1778 if (inode) {
1779 if (es->s_reserved_gdt_blocks) {
1780 ext4_error(sb, "Unexpected non-zero "
1781 "s_reserved_gdt_blocks");
1782 return -EPERM;
1783 }
1784
1785 /* Do a quick sanity check of the resize inode */
1786 if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
1787 goto invalid_resize_inode;
1788 for (i = 0; i < EXT4_N_BLOCKS; i++) {
1789 if (i == EXT4_DIND_BLOCK) {
1790 if (ei->i_data[i])
1791 continue;
1792 else
1793 goto invalid_resize_inode;
1794 }
1795 if (ei->i_data[i])
1796 goto invalid_resize_inode;
1797 }
1798 credits += 3; /* block bitmap, bg descriptor, resize inode */
1799 }
1800
1801 handle = ext4_journal_start_sb(sb, credits);
1802 if (IS_ERR(handle))
1803 return PTR_ERR(handle);
1804
1805 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1806 if (err)
1807 goto errout;
1808
1809 EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
1810 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
1811 sbi->s_es->s_first_meta_bg =
1812 cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
1813
1814 err = ext4_handle_dirty_super(handle, sb);
1815 if (err) {
1816 ext4_std_error(sb, err);
1817 goto errout;
1818 }
1819
1820 if (inode) {
1821 nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]);
1822 ext4_free_blocks(handle, inode, NULL, nr, 1,
1823 EXT4_FREE_BLOCKS_METADATA |
1824 EXT4_FREE_BLOCKS_FORGET);
1825 ei->i_data[EXT4_DIND_BLOCK] = 0;
1826 inode->i_blocks = 0;
1827
1828 err = ext4_mark_inode_dirty(handle, inode);
1829 if (err)
1830 ext4_std_error(sb, err);
1831 }
1832
1833errout:
1834 ret = ext4_journal_stop(handle);
1835 if (!err)
1836 err = ret;
1837	return err;
1838
1839invalid_resize_inode:
1840 ext4_error(sb, "corrupted/inconsistent resize inode");
1841 return -EINVAL;
1842}
1843
1634/* 1844/*
1635 * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count 1845 * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
1636 * 1846 *
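
num_desc_blocks() above is plain ceiling division; ext4_convert_meta_bg() uses it to record in s_first_meta_bg how many descriptor blocks the existing groups occupy. A sketch of the rounding, assuming 128 descriptors per block:

#include <stdio.h>

static unsigned num_desc_blocks(unsigned groups, unsigned desc_per_block)
{
        return (groups + desc_per_block - 1) / desc_per_block;
}

int main(void)
{
        /* 129 groups need two descriptor blocks when 128 fit per block */
        printf("%u\n", num_desc_blocks(129, 128));
        return 0;
}
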
@@ -1643,21 +1853,31 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1643 struct ext4_sb_info *sbi = EXT4_SB(sb); 1853 struct ext4_sb_info *sbi = EXT4_SB(sb);
1644 struct ext4_super_block *es = sbi->s_es; 1854 struct ext4_super_block *es = sbi->s_es;
1645 struct buffer_head *bh; 1855 struct buffer_head *bh;
1646 struct inode *resize_inode; 1856 struct inode *resize_inode = NULL;
1647 ext4_fsblk_t o_blocks_count; 1857 ext4_grpblk_t add, offset;
1648 ext4_group_t o_group;
1649 ext4_group_t n_group;
1650 ext4_grpblk_t offset, add;
1651 unsigned long n_desc_blocks; 1858 unsigned long n_desc_blocks;
1652 unsigned long o_desc_blocks; 1859 unsigned long o_desc_blocks;
1653 unsigned long desc_blocks; 1860 ext4_group_t o_group;
1654 int err = 0, flexbg_size = 1; 1861 ext4_group_t n_group;
1862 ext4_fsblk_t o_blocks_count;
1863 ext4_fsblk_t n_blocks_count_retry = 0;
1864 unsigned long last_update_time = 0;
1865 int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
1866 int meta_bg;
1655 1867
1868 /* See if the device is actually as big as what was requested */
1869 bh = sb_bread(sb, n_blocks_count - 1);
1870 if (!bh) {
1871 ext4_warning(sb, "can't read last block, resize aborted");
1872 return -ENOSPC;
1873 }
1874 brelse(bh);
1875
1876retry:
1656 o_blocks_count = ext4_blocks_count(es); 1877 o_blocks_count = ext4_blocks_count(es);
1657 1878
1658 if (test_opt(sb, DEBUG)) 1879 ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu "
1659 ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " 1880 "to %llu blocks", o_blocks_count, n_blocks_count);
1660 "to %llu blocks", o_blocks_count, n_blocks_count);
1661 1881
1662 if (n_blocks_count < o_blocks_count) { 1882 if (n_blocks_count < o_blocks_count) {
1663 /* On-line shrinking not supported */ 1883 /* On-line shrinking not supported */
@@ -1672,32 +1892,49 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1672 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); 1892 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1673 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); 1893 ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
1674 1894
1675 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / 1895 n_desc_blocks = num_desc_blocks(sb, n_group + 1);
1676 EXT4_DESC_PER_BLOCK(sb); 1896 o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
1677 o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1678 EXT4_DESC_PER_BLOCK(sb);
1679 desc_blocks = n_desc_blocks - o_desc_blocks;
1680 1897
1681 if (desc_blocks && 1898 meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
1682 (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
1683 le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
1684 ext4_warning(sb, "No reserved GDT blocks, can't resize");
1685 return -EPERM;
1686 }
1687 1899
1688 resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); 1900 if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
1689 if (IS_ERR(resize_inode)) { 1901 if (meta_bg) {
1690 ext4_warning(sb, "Error opening resize inode"); 1902 ext4_error(sb, "resize_inode and meta_bg enabled "
1691 return PTR_ERR(resize_inode); 1903 "simultaneously");
1904 return -EINVAL;
1905 }
1906 if (n_desc_blocks > o_desc_blocks +
1907 le16_to_cpu(es->s_reserved_gdt_blocks)) {
1908 n_blocks_count_retry = n_blocks_count;
1909 n_desc_blocks = o_desc_blocks +
1910 le16_to_cpu(es->s_reserved_gdt_blocks);
1911 n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
1912 n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb);
1913 n_group--; /* set to last group number */
1914 }
1915
1916 if (!resize_inode)
1917 resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
1918 if (IS_ERR(resize_inode)) {
1919 ext4_warning(sb, "Error opening resize inode");
1920 return PTR_ERR(resize_inode);
1921 }
1692 } 1922 }
1693 1923
1694 /* See if the device is actually as big as what was requested */ 1924 if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
1695 bh = sb_bread(sb, n_blocks_count - 1); 1925 err = ext4_convert_meta_bg(sb, resize_inode);
1696 if (!bh) { 1926 if (err)
1697 ext4_warning(sb, "can't read last block, resize aborted"); 1927 goto out;
1698 return -ENOSPC; 1928 if (resize_inode) {
1929 iput(resize_inode);
1930 resize_inode = NULL;
1931 }
1932 if (n_blocks_count_retry) {
1933 n_blocks_count = n_blocks_count_retry;
1934 n_blocks_count_retry = 0;
1935 goto retry;
1936 }
1699 } 1937 }
1700 brelse(bh);
1701 1938
1702 /* extend the last group */ 1939 /* extend the last group */
1703 if (n_group == o_group) 1940 if (n_group == o_group)
@@ -1710,12 +1947,15 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1710 goto out; 1947 goto out;
1711 } 1948 }
1712 1949
1713 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && 1950 if (ext4_blocks_count(es) == n_blocks_count)
1714 es->s_log_groups_per_flex) 1951 goto out;
1715 flexbg_size = 1 << es->s_log_groups_per_flex;
1716 1952
1717 o_blocks_count = ext4_blocks_count(es); 1953 err = ext4_alloc_flex_bg_array(sb, n_group + 1);
1718 if (o_blocks_count == n_blocks_count) 1954 if (err)
1955 return err;
1956
1957 err = ext4_mb_alloc_groupinfo(sb, n_group + 1);
1958 if (err)
1719 goto out; 1959 goto out;
1720 1960
1721 flex_gd = alloc_flex_gd(flexbg_size); 1961 flex_gd = alloc_flex_gd(flexbg_size);
@@ -1729,19 +1969,33 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1729 */ 1969 */
1730 while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, 1970 while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
1731 flexbg_size)) { 1971 flexbg_size)) {
1732 ext4_alloc_group_tables(sb, flex_gd, flexbg_size); 1972 if (jiffies - last_update_time > HZ * 10) {
1973 if (last_update_time)
1974 ext4_msg(sb, KERN_INFO,
1975 "resized to %llu blocks",
1976 ext4_blocks_count(es));
1977 last_update_time = jiffies;
1978 }
1979 if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0)
1980 break;
1733 err = ext4_flex_group_add(sb, resize_inode, flex_gd); 1981 err = ext4_flex_group_add(sb, resize_inode, flex_gd);
1734 if (unlikely(err)) 1982 if (unlikely(err))
1735 break; 1983 break;
1736 } 1984 }
1737 1985
1986 if (!err && n_blocks_count_retry) {
1987 n_blocks_count = n_blocks_count_retry;
1988 n_blocks_count_retry = 0;
1989 free_flex_gd(flex_gd);
1990 flex_gd = NULL;
1991 goto retry;
1992 }
1993
1738out: 1994out:
1739 if (flex_gd) 1995 if (flex_gd)
1740 free_flex_gd(flex_gd); 1996 free_flex_gd(flex_gd);
1741 1997 if (resize_inode != NULL)
1742 iput(resize_inode); 1998 iput(resize_inode);
1743 if (test_opt(sb, DEBUG)) 1999 ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
1744 ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
1745 "upto %llu blocks", o_blocks_count, n_blocks_count);
1746 return err; 2000 return err;
1747} 2001}
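
The retry plumbing threaded through ext4_resize_fs() implements a two-phase grow: when the target needs more descriptor blocks than the resize inode reserved, the request is clamped to what the old scheme allows, the file system is converted to meta_bg at that point, and the saved target in n_blocks_count_retry is retried. A minimal control-flow model of the sequence (the numbers are arbitrary):

#include <stdio.h>

int main(void)
{
        unsigned long long target = 100, limit = 60, size;
        unsigned long long retry_target = 0;
        int meta_bg = 0;

retry:
        if (!meta_bg && target > limit) {      /* clamp, remember target */
                retry_target = target;
                target = limit;
        }
        size = target;                         /* "resize" to the clamp */
        printf("resized to %llu (meta_bg=%d)\n", size, meta_bg);
        if (retry_target) {
                meta_bg = 1;                   /* ext4_convert_meta_bg() step */
                target = retry_target;
                retry_target = 0;
                goto retry;
        }
        return 0;
}
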
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 69c55d4e4626..7265a0367476 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -420,7 +420,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
420 */ 420 */
421 if (!es->s_error_count) 421 if (!es->s_error_count)
422 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); 422 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
423 es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); 423 le32_add_cpu(&es->s_error_count, 1);
424} 424}
425 425
426static void save_error_info(struct super_block *sb, const char *func, 426static void save_error_info(struct super_block *sb, const char *func,
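
le32_add_cpu() replaces the open-coded cpu_to_le32(le32_to_cpu(x) + 1) round trip on the on-disk error counter. A userspace sketch of the semantics, assuming the GCC/Clang __BYTE_ORDER__ predefined macros:

#include <stdint.h>
#include <stdio.h>

static uint32_t le32_to_cpu(uint32_t v)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        return __builtin_bswap32(v);
#else
        return v;                       /* little-endian: identity */
#endif
}

static void le32_add_cpu(uint32_t *le_val, uint32_t addend)
{
        uint32_t cpu = le32_to_cpu(*le_val) + addend;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        *le_val = __builtin_bswap32(cpu);
#else
        *le_val = cpu;
#endif
}

int main(void)
{
        uint32_t s_error_count = 0;     /* stored little-endian on disk */

        le32_add_cpu(&s_error_count, 1);
        printf("%u\n", le32_to_cpu(s_error_count));   /* prints 1 */
        return 0;
}
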
@@ -850,7 +850,6 @@ static void ext4_put_super(struct super_block *sb)
850 flush_workqueue(sbi->dio_unwritten_wq); 850 flush_workqueue(sbi->dio_unwritten_wq);
851 destroy_workqueue(sbi->dio_unwritten_wq); 851 destroy_workqueue(sbi->dio_unwritten_wq);
852 852
853 lock_super(sb);
854 if (sbi->s_journal) { 853 if (sbi->s_journal) {
855 err = jbd2_journal_destroy(sbi->s_journal); 854 err = jbd2_journal_destroy(sbi->s_journal);
856 sbi->s_journal = NULL; 855 sbi->s_journal = NULL;
@@ -917,7 +916,6 @@ static void ext4_put_super(struct super_block *sb)
917 * Now that we are completely done shutting down the 916 * Now that we are completely done shutting down the
918 * superblock, we need to actually destroy the kobject. 917 * superblock, we need to actually destroy the kobject.
919 */ 918 */
920 unlock_super(sb);
921 kobject_put(&sbi->s_kobj); 919 kobject_put(&sbi->s_kobj);
922 wait_for_completion(&sbi->s_kobj_unregister); 920 wait_for_completion(&sbi->s_kobj_unregister);
923 if (sbi->s_chksum_driver) 921 if (sbi->s_chksum_driver)
@@ -956,11 +954,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
956 ei->jinode = NULL; 954 ei->jinode = NULL;
957 INIT_LIST_HEAD(&ei->i_completed_io_list); 955 INIT_LIST_HEAD(&ei->i_completed_io_list);
958 spin_lock_init(&ei->i_completed_io_lock); 956 spin_lock_init(&ei->i_completed_io_lock);
959 ei->cur_aio_dio = NULL;
960 ei->i_sync_tid = 0; 957 ei->i_sync_tid = 0;
961 ei->i_datasync_tid = 0; 958 ei->i_datasync_tid = 0;
962 atomic_set(&ei->i_ioend_count, 0); 959 atomic_set(&ei->i_ioend_count, 0);
963 atomic_set(&ei->i_aiodio_unwritten, 0); 960 atomic_set(&ei->i_unwritten, 0);
964 961
965 return &ei->vfs_inode; 962 return &ei->vfs_inode;
966} 963}
@@ -1224,6 +1221,7 @@ enum {
1224 Opt_inode_readahead_blks, Opt_journal_ioprio, 1221 Opt_inode_readahead_blks, Opt_journal_ioprio,
1225 Opt_dioread_nolock, Opt_dioread_lock, 1222 Opt_dioread_nolock, Opt_dioread_lock,
1226 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, 1223 Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
1224 Opt_max_dir_size_kb,
1227}; 1225};
1228 1226
1229static const match_table_t tokens = { 1227static const match_table_t tokens = {
@@ -1297,6 +1295,7 @@ static const match_table_t tokens = {
1297 {Opt_init_itable, "init_itable=%u"}, 1295 {Opt_init_itable, "init_itable=%u"},
1298 {Opt_init_itable, "init_itable"}, 1296 {Opt_init_itable, "init_itable"},
1299 {Opt_noinit_itable, "noinit_itable"}, 1297 {Opt_noinit_itable, "noinit_itable"},
1298 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
1300 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1299 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1301 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1300 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1302 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 1301 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1477,6 +1476,7 @@ static const struct mount_opts {
1477 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, 1476 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1478 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, 1477 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1479 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, 1478 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1479 {Opt_max_dir_size_kb, 0, MOPT_GTE0},
1480 {Opt_err, 0, 0} 1480 {Opt_err, 0, 0}
1481}; 1481};
1482 1482
@@ -1592,6 +1592,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1592 if (!args->from) 1592 if (!args->from)
1593 arg = EXT4_DEF_LI_WAIT_MULT; 1593 arg = EXT4_DEF_LI_WAIT_MULT;
1594 sbi->s_li_wait_mult = arg; 1594 sbi->s_li_wait_mult = arg;
1595 } else if (token == Opt_max_dir_size_kb) {
1596 sbi->s_max_dir_size_kb = arg;
1595 } else if (token == Opt_stripe) { 1597 } else if (token == Opt_stripe) {
1596 sbi->s_stripe = arg; 1598 sbi->s_stripe = arg;
1597 } else if (m->flags & MOPT_DATAJ) { 1599 } else if (m->flags & MOPT_DATAJ) {
@@ -1664,7 +1666,7 @@ static int parse_options(char *options, struct super_block *sb,
1664 * Initialize args struct so we know whether arg was 1666 * Initialize args struct so we know whether arg was
1665 * found; some options take optional arguments. 1667 * found; some options take optional arguments.
1666 */ 1668 */
1667 args[0].to = args[0].from = 0; 1669 args[0].to = args[0].from = NULL;
1668 token = match_token(p, tokens, args); 1670 token = match_token(p, tokens, args);
1669 if (handle_mount_opt(sb, p, token, args, journal_devnum, 1671 if (handle_mount_opt(sb, p, token, args, journal_devnum,
1670 journal_ioprio, is_remount) < 0) 1672 journal_ioprio, is_remount) < 0)
@@ -1740,7 +1742,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1740 1742
1741static const char *token2str(int token) 1743static const char *token2str(int token)
1742{ 1744{
1743 static const struct match_token *t; 1745 const struct match_token *t;
1744 1746
1745 for (t = tokens; t->token != Opt_err; t++) 1747 for (t = tokens; t->token != Opt_err; t++)
1746 if (t->token == token && !strchr(t->pattern, '=')) 1748 if (t->token == token && !strchr(t->pattern, '='))
@@ -1823,6 +1825,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1823 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && 1825 if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
1824 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) 1826 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
1825 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); 1827 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1828 if (nodefs || sbi->s_max_dir_size_kb)
1829 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
1826 1830
1827 ext4_show_quota_options(seq, sb); 1831 ext4_show_quota_options(seq, sb);
1828 return 0; 1832 return 0;
@@ -1914,15 +1918,45 @@ done:
1914 return res; 1918 return res;
1915} 1919}
1916 1920
1921int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
1922{
1923 struct ext4_sb_info *sbi = EXT4_SB(sb);
1924 struct flex_groups *new_groups;
1925 int size;
1926
1927 if (!sbi->s_log_groups_per_flex)
1928 return 0;
1929
1930 size = ext4_flex_group(sbi, ngroup - 1) + 1;
1931 if (size <= sbi->s_flex_groups_allocated)
1932 return 0;
1933
1934 size = roundup_pow_of_two(size * sizeof(struct flex_groups));
1935 new_groups = ext4_kvzalloc(size, GFP_KERNEL);
1936 if (!new_groups) {
1937 ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
1938 size / (int) sizeof(struct flex_groups));
1939 return -ENOMEM;
1940 }
1941
1942 if (sbi->s_flex_groups) {
1943 memcpy(new_groups, sbi->s_flex_groups,
1944 (sbi->s_flex_groups_allocated *
1945 sizeof(struct flex_groups)));
1946 ext4_kvfree(sbi->s_flex_groups);
1947 }
1948 sbi->s_flex_groups = new_groups;
1949 sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
1950 return 0;
1951}
1952
1917static int ext4_fill_flex_info(struct super_block *sb) 1953static int ext4_fill_flex_info(struct super_block *sb)
1918{ 1954{
1919 struct ext4_sb_info *sbi = EXT4_SB(sb); 1955 struct ext4_sb_info *sbi = EXT4_SB(sb);
1920 struct ext4_group_desc *gdp = NULL; 1956 struct ext4_group_desc *gdp = NULL;
1921 ext4_group_t flex_group_count;
1922 ext4_group_t flex_group; 1957 ext4_group_t flex_group;
1923 unsigned int groups_per_flex = 0; 1958 unsigned int groups_per_flex = 0;
1924 size_t size; 1959 int i, err;
1925 int i;
1926 1960
1927 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1961 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1928 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { 1962 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
@@ -1931,17 +1965,9 @@ static int ext4_fill_flex_info(struct super_block *sb)
1931 } 1965 }
1932 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 1966 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1933 1967
1934 /* We allocate both existing and potentially added groups */ 1968 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1935 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 1969 if (err)
1936 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
1937 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
1938 size = flex_group_count * sizeof(struct flex_groups);
1939 sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
1940 if (sbi->s_flex_groups == NULL) {
1941 ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
1942 flex_group_count);
1943 goto failed; 1970 goto failed;
1944 }
1945 1971
1946 for (i = 0; i < sbi->s_groups_count; i++) { 1972 for (i = 0; i < sbi->s_groups_count; i++) {
1947 gdp = ext4_get_group_desc(sb, i, NULL); 1973 gdp = ext4_get_group_desc(sb, i, NULL);
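
ext4_alloc_flex_bg_array() lets repeated resizes grow s_flex_groups on demand; rounding the allocation up to a power of two keeps the number of reallocations (and memcpy traffic) logarithmic in the final size. A sketch of that growth policy (the 24-byte entry size is an assumption standing in for sizeof(struct flex_groups)):

#include <stdio.h>

static size_t roundup_pow_of_two(size_t n)
{
        size_t p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

int main(void)
{
        const size_t entry = 24;        /* assumed per-flex-group size */
        size_t need, alloc = 0, reallocs = 0;

        for (need = 1; need <= 1000; need++) {
                size_t want = roundup_pow_of_two(need * entry);

                if (want > alloc) {     /* s_flex_groups_allocated check */
                        alloc = want;
                        reallocs++;
                }
        }
        printf("1000 growth steps, %zu reallocations\n", reallocs);
        return 0;
}
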
@@ -2144,10 +2170,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2144 } 2170 }
2145 2171
2146 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2172 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2147 if (es->s_last_orphan) 2173 /* don't clear list on RO mount w/ errors */
2174 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
2148 jbd_debug(1, "Errors on filesystem, " 2175 jbd_debug(1, "Errors on filesystem, "
2149 "clearing orphan list.\n"); 2176 "clearing orphan list.\n");
2150 es->s_last_orphan = 0; 2177 es->s_last_orphan = 0;
2178 }
2151 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); 2179 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2152 return; 2180 return;
2153 } 2181 }
@@ -2528,6 +2556,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2528EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2556EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2529EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2557EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2530EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2558EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2559EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2531EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2560EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2532 2561
2533static struct attribute *ext4_attrs[] = { 2562static struct attribute *ext4_attrs[] = {
@@ -2543,6 +2572,7 @@ static struct attribute *ext4_attrs[] = {
2543 ATTR_LIST(mb_stream_req), 2572 ATTR_LIST(mb_stream_req),
2544 ATTR_LIST(mb_group_prealloc), 2573 ATTR_LIST(mb_group_prealloc),
2545 ATTR_LIST(max_writeback_mb_bump), 2574 ATTR_LIST(max_writeback_mb_bump),
2575 ATTR_LIST(extent_max_zeroout_kb),
2546 ATTR_LIST(trigger_fs_error), 2576 ATTR_LIST(trigger_fs_error),
2547 NULL, 2577 NULL,
2548}; 2578};
@@ -2550,10 +2580,12 @@ static struct attribute *ext4_attrs[] = {
2550/* Features this copy of ext4 supports */ 2580/* Features this copy of ext4 supports */
2551EXT4_INFO_ATTR(lazy_itable_init); 2581EXT4_INFO_ATTR(lazy_itable_init);
2552EXT4_INFO_ATTR(batched_discard); 2582EXT4_INFO_ATTR(batched_discard);
2583EXT4_INFO_ATTR(meta_bg_resize);
2553 2584
2554static struct attribute *ext4_feat_attrs[] = { 2585static struct attribute *ext4_feat_attrs[] = {
2555 ATTR_LIST(lazy_itable_init), 2586 ATTR_LIST(lazy_itable_init),
2556 ATTR_LIST(batched_discard), 2587 ATTR_LIST(batched_discard),
2588 ATTR_LIST(meta_bg_resize),
2557 NULL, 2589 NULL,
2558}; 2590};
2559 2591
@@ -3374,7 +3406,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3374 * enable delayed allocation by default 3406 * enable delayed allocation by default
3375 * Use -o nodelalloc to turn it off 3407 * Use -o nodelalloc to turn it off
3376 */ 3408 */
3377 if (!IS_EXT3_SB(sb) && 3409 if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
3378 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3410 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3379 set_opt(sb, DELALLOC); 3411 set_opt(sb, DELALLOC);
3380 3412
@@ -3743,6 +3775,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3743 3775
3744 sbi->s_stripe = ext4_get_stripe_size(sbi); 3776 sbi->s_stripe = ext4_get_stripe_size(sbi);
3745 sbi->s_max_writeback_mb_bump = 128; 3777 sbi->s_max_writeback_mb_bump = 128;
3778 sbi->s_extent_max_zeroout_kb = 32;
3746 3779
3747 /* 3780 /*
3748 * set up enough so that it can read an inode 3781 * set up enough so that it can read an inode
@@ -4519,11 +4552,9 @@ static int ext4_unfreeze(struct super_block *sb)
4519 if (sb->s_flags & MS_RDONLY) 4552 if (sb->s_flags & MS_RDONLY)
4520 return 0; 4553 return 0;
4521 4554
4522 lock_super(sb);
4523 /* Reset the needs_recovery flag before the fs is unlocked. */ 4555 /* Reset the needs_recovery flag before the fs is unlocked. */
4524 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4556 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4525 ext4_commit_super(sb, 1); 4557 ext4_commit_super(sb, 1);
4526 unlock_super(sb);
4527 return 0; 4558 return 0;
4528} 4559}
4529 4560
@@ -4559,7 +4590,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4559 char *orig_data = kstrdup(data, GFP_KERNEL); 4590 char *orig_data = kstrdup(data, GFP_KERNEL);
4560 4591
4561 /* Store the original options */ 4592 /* Store the original options */
4562 lock_super(sb);
4563 old_sb_flags = sb->s_flags; 4593 old_sb_flags = sb->s_flags;
4564 old_opts.s_mount_opt = sbi->s_mount_opt; 4594 old_opts.s_mount_opt = sbi->s_mount_opt;
4565 old_opts.s_mount_opt2 = sbi->s_mount_opt2; 4595 old_opts.s_mount_opt2 = sbi->s_mount_opt2;
@@ -4701,7 +4731,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4701 if (sbi->s_journal == NULL) 4731 if (sbi->s_journal == NULL)
4702 ext4_commit_super(sb, 1); 4732 ext4_commit_super(sb, 1);
4703 4733
4704 unlock_super(sb);
4705#ifdef CONFIG_QUOTA 4734#ifdef CONFIG_QUOTA
4706 /* Release old quota file names */ 4735 /* Release old quota file names */
4707 for (i = 0; i < MAXQUOTAS; i++) 4736 for (i = 0; i < MAXQUOTAS; i++)
@@ -4714,10 +4743,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4714 else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4743 else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
4715 EXT4_FEATURE_RO_COMPAT_QUOTA)) { 4744 EXT4_FEATURE_RO_COMPAT_QUOTA)) {
4716 err = ext4_enable_quotas(sb); 4745 err = ext4_enable_quotas(sb);
4717 if (err) { 4746 if (err)
4718 lock_super(sb);
4719 goto restore_opts; 4747 goto restore_opts;
4720 }
4721 } 4748 }
4722 } 4749 }
4723#endif 4750#endif
@@ -4744,7 +4771,6 @@ restore_opts:
4744 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 4771 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
4745 } 4772 }
4746#endif 4773#endif
4747 unlock_super(sb);
4748 kfree(orig_data); 4774 kfree(orig_data);
4749 return err; 4775 return err;
4750} 4776}
@@ -5269,8 +5295,10 @@ static int __init ext4_init_fs(void)
5269 if (err) 5295 if (err)
5270 goto out6; 5296 goto out6;
5271 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 5297 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
5272 if (!ext4_kset) 5298 if (!ext4_kset) {
5299 err = -ENOMEM;
5273 goto out5; 5300 goto out5;
5301 }
5274 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 5302 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
5275 5303
5276 err = ext4_init_feat_adverts(); 5304 err = ext4_init_feat_adverts();