Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/balloc.c              14
-rw-r--r--  fs/ext4/ext4.h               187
-rw-r--r--  fs/ext4/ext4_jbd2.c           58
-rw-r--r--  fs/ext4/ext4_jbd2.h           29
-rw-r--r--  fs/ext4/extents.c            193
-rw-r--r--  fs/ext4/extents_status.c      75
-rw-r--r--  fs/ext4/extents_status.h       5
-rw-r--r--  fs/ext4/file.c                14
-rw-r--r--  fs/ext4/fsync.c               52
-rw-r--r--  fs/ext4/ialloc.c               3
-rw-r--r--  fs/ext4/indirect.c            40
-rw-r--r--  fs/ext4/inline.c               4
-rw-r--r--  fs/ext4/inode.c             1751
-rw-r--r--  fs/ext4/mballoc.c             21
-rw-r--r--  fs/ext4/move_extent.c          3
-rw-r--r--  fs/ext4/namei.c                7
-rw-r--r--  fs/ext4/page-io.c            325
-rw-r--r--  fs/ext4/resize.c              24
-rw-r--r--  fs/ext4/super.c              155
19 files changed, 1525 insertions, 1435 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..58339393fa6e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)
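The rewritten test_root() decides whether group number a is an exact power of b by repeated division, so it terminates without the old loop's unbounded multiplication (num *= b could wrap for large a). A minimal userspace sketch of the same loop; the main() driver is illustrative, not part of ext4:

#include <assert.h>

/* Same logic as the new test_root(): is 'a' an exact power of 'b'? */
static int test_root(unsigned int a, int b)
{
	while (1) {
		if (a < b)
			return 0;
		if (a == b)
			return 1;
		if ((a % b) != 0)
			return 0;
		a = a / b;
	}
}

int main(void)
{
	assert(test_root(49, 7));	/* 7^2 */
	assert(test_root(7, 7));	/* 7^1 */
	assert(!test_root(48, 7));	/* not divisible by 7 */
	assert(!test_root(14, 7));	/* 2*7 is not a power of 7 */
	return 0;
}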
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4af03ea84aa3..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_DIRECT	0x0004
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -581,11 +571,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@ struct ext4_inode_info {
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last access */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -903,12 +889,22 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten;	/* Nr. of inflight conversions pending */
-	struct work_struct i_unwritten_work;	/* deferred extent conversion */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -1245,7 +1241,6 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent conversions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
@@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					      struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have conversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				const struct iovec *iov, loff_t offset,
 				unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
 				 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
 				       ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__,	\
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
 		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
 		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-					       __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-					       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)					\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
@@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
+	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2598,8 +2656,7 @@ struct ext4_extent;
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-					  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
@@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
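The CONFIG_PRINTK=n variants above lean on the kernel's no_printk() idiom: the format arguments are still type-checked by the compiler, but the call compiles to nothing, so the __ext4_* helpers can be invoked with stub strings. A hedged userspace illustration of the same trick; my_warning and the printf stand-in are illustrative, not ext4 API:

#include <stdio.h>

/* Like the kernel's no_printk(): the if (0) keeps the call compiled
 * for format-string checking while the branch is eliminated entirely. */
#define no_printk(fmt, ...)			\
do {						\
	if (0)					\
		printf(fmt, ##__VA_ARGS__);	\
} while (0)

#define my_warning(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)

int main(void)
{
	/* Compiles (and would warn on a mismatched format) but prints nothing. */
	my_warning("bad block %lu in group %u\n", 42UL, 7U);
	return 0;
}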
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 	return err;
 }
 
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
 			       handle_t *handle, int err)
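The two entry points form a pair: __ext4_journal_start_sb() can set aside rsv_blocks of journal credits when a handle is started, and __ext4_journal_start_reserved() later turns that reservation into a running handle from a context (the end-IO worker) where blocking on journal space would risk deadlock. A hedged sketch of the intended call pattern; needed_blocks and rsv_blocks are placeholders, error paths are trimmed, and h_rsv_handle is the reserved sub-handle assumed from the matching jbd2 patchset:

/* Submission side: start a handle and reserve conversion credits. */
handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE,
					 needed_blocks, rsv_blocks);
if (IS_ERR(handle))
	return PTR_ERR(handle);
io_end->handle = handle->h_rsv_handle;	/* stash the reserved handle */
/* ... write out pages under 'handle' ... */
err = ext4_journal_stop(handle);

/* Completion side (workqueue): the reserved handle is attached inside
 * ext4_convert_unwritten_extents() via ext4_journal_start_reserved(). */
err = ext4_convert_unwritten_extents(io_end->handle, inode,
				     io_end->offset, io_end->size);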
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 #define EXT4_HT_MIGRATE          8
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
-#define EXT4_HT_MAX             11
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
 
 /**
  * struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)			\
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)			\
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..7097b0f680e6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
-			flags |= FIEMAP_EXTENT_DELALLOC;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worst case, each tree level
+ * index/leaf need to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-		       "%u-%u from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
2454 2463
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2461 * @handle: The journal handle 2470 * @handle: The journal handle
2462 * @inode: The files inode 2471 * @inode: The files inode
2463 * @path: The path to the leaf 2472 * @path: The path to the leaf
2473 * @partial_cluster: The cluster which we'll have to free if all extents
2474 * has been released from it. It gets negative in case
2475 * that the cluster is still used.
2464 * @start: The first block to remove 2476 * @start: The first block to remove
2465 * @end: The last block to remove 2477 * @end: The last block to remove
2466 */ 2478 */
2467static int 2479static int
2468ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2480ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2469 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, 2481 struct ext4_ext_path *path,
2482 long long *partial_cluster,
2470 ext4_lblk_t start, ext4_lblk_t end) 2483 ext4_lblk_t start, ext4_lblk_t end)
2471{ 2484{
2472 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2485 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		return PTR_ERR(handle);
 
 again:
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2844,17 +2866,14 @@ again:
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -4363,7 +4382,7 @@ out2:
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+	trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
 
 	return err ? err : allocated;
 }
@@ -4446,7 +4465,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
@@ -4548,10 +4567,9 @@ retry:
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
@@ -4566,16 +4584,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4620,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				   inode->i_ino, map.m_lblk,
 				   map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4659,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		error = ext4_get_inode_loc(inode, &iloc);
 		if (error)
 			return error;
-		physical = iloc.bh->b_blocknr << blockbits;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
 		offset = EXT4_GOOD_OLD_INODE_SIZE +
 				EXT4_I(inode)->i_extra_isize;
 		physical += offset;
@@ -4667,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 
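Throughout these hunks partial_cluster changed from ext4_fsblk_t to long long so its sign can carry state: a positive value names a cluster whose remaining blocks may still need freeing, a negative value records that the same cluster is known to be referenced by another extent, and zero means nothing is pending. A toy, compilable restatement of that convention; free_cluster() is a stand-in, not an ext4 function:

#include <stdio.h>

static void free_cluster(unsigned long long cluster)
{
	printf("freeing cluster %llu\n", cluster);
}

int main(void)
{
	long long partial_cluster = 0;

	partial_cluster = 42;			/* candidate for freeing */
	partial_cluster = -partial_cluster;	/* another extent still uses it */

	if (partial_cluster > 0)		/* skipped: cluster marked in use */
		free_cluster((unsigned long long)partial_cluster);
	return 0;
}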
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..ee018d5f397e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
  * Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
@@ -291,7 +292,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
@@ -672,7 +672,6 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 
 	return err;
@@ -734,7 +733,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
@@ -878,12 +876,28 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
 				      EXTENT_STATUS_WRITTEN);
 }
 
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
 static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 					struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp, scanned;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skipped);
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk = 0;
 
@@ -893,23 +907,41 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	if (!nr_to_scan)
 		return ret;
 
-	INIT_LIST_HEAD(&scanned);
-
 	spin_lock(&sbi->s_es_lru_lock);
+
+	/*
+	 * If the inode that is at the head of LRU list is newer than
+	 * last_sorted time, that means that we need to sort this list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
+
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		list_move_tail(cur, &scanned);
+		/*
+		 * If we have already reclaimed all extents from extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
-		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+		/* Skip the inode that is newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skipped);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+
+		if (ei->i_es_lru_nr == 0)
+			continue;
 
 		write_lock(&ei->i_es_lock);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
 		nr_shrunk += ret;
@@ -917,7 +949,9 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		if (nr_to_scan == 0)
 			break;
 	}
-	list_splice_tail(&scanned, &sbi->s_es_lru);
+
+	/* Move the newer inodes into the tail of the LRU list. */
+	list_splice_tail(&skipped, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
@@ -925,21 +959,19 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	return ret;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
-	struct ext4_sb_info *sbi;
-
-	sbi = EXT4_SB(sb);
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
945void ext4_es_lru_add(struct inode *inode) 977void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +979,14 @@ void ext4_es_lru_add(struct inode *inode)
947 struct ext4_inode_info *ei = EXT4_I(inode); 979 struct ext4_inode_info *ei = EXT4_I(inode);
948 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 980 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
949 981
982 ei->i_touch_when = jiffies;
983
984 if (!list_empty(&ei->i_es_lru))
985 return;
986
950 spin_lock(&sbi->s_es_lru_lock); 987 spin_lock(&sbi->s_es_lru_lock);
951 if (list_empty(&ei->i_es_lru)) 988 if (list_empty(&ei->i_es_lru))
952 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); 989 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
953 else
954 list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
955 spin_unlock(&sbi->s_es_lru_lock); 990 spin_unlock(&sbi->s_es_lru_lock);
956} 991}
957 992
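ext4_inode_touch_time_cmp() follows the usual list_sort() comparator contract (negative/zero/positive, like memcmp), and it compares i_touch_when with time_after() rather than plain '<' so the ordering stays correct across a jiffies wrap-around. A small userspace analogue of the comparator, driven by qsort() purely so the example runs:

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long jiffies_t;

/* Simplified kernel time_after(): true if a is later than b, wrap-safe. */
#define time_after(a, b)	((long)((b) - (a)) < 0)

static int touch_time_cmp(const void *pa, const void *pb)
{
	jiffies_t a = *(const jiffies_t *)pa;
	jiffies_t b = *(const jiffies_t *)pb;

	if (a == b)
		return 0;
	return time_after(a, b) ? 1 : -1;
}

int main(void)
{
	jiffies_t when[3] = { 300, 100, 200 };

	qsort(when, 3, sizeof(when[0]), touch_time_cmp);
	/* Oldest (least recently touched) first: 100 200 300 */
	printf("%lu %lu %lu\n", when[0], when[1], when[2]);
	return 0;
}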
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
 				 EXTENT_STATUS_DELAYED | \
 				 EXTENT_STATUS_HOLE)
 
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
 	es->es_pblk = block;
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1b4d51b5d86..b19f0a457f32 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
312 blkbits = inode->i_sb->s_blocksize_bits; 312 blkbits = inode->i_sb->s_blocksize_bits;
313 startoff = *offset; 313 startoff = *offset;
314 lastoff = startoff; 314 lastoff = startoff;
315 endoff = (map->m_lblk + map->m_len) << blkbits; 315 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
316 316
317 index = startoff >> PAGE_CACHE_SHIFT; 317 index = startoff >> PAGE_CACHE_SHIFT;
318 end = endoff >> PAGE_CACHE_SHIFT; 318 end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
457 ret = ext4_map_blocks(NULL, inode, &map, 0); 457 ret = ext4_map_blocks(NULL, inode, &map, 0);
458 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 458 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
459 if (last != start) 459 if (last != start)
460 dataoff = last << blkbits; 460 dataoff = (loff_t)last << blkbits;
461 break; 461 break;
462 } 462 }
463 463
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
468 ext4_es_find_delayed_extent_range(inode, last, last, &es); 468 ext4_es_find_delayed_extent_range(inode, last, last, &es);
469 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { 469 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
470 if (last != start) 470 if (last != start)
471 dataoff = last << blkbits; 471 dataoff = (loff_t)last << blkbits;
472 break; 472 break;
473 } 473 }
474 474
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
486 } 486 }
487 487
488 last++; 488 last++;
489 dataoff = last << blkbits; 489 dataoff = (loff_t)last << blkbits;
490 } while (last <= end); 490 } while (last <= end);
491 491
492 mutex_unlock(&inode->i_mutex); 492 mutex_unlock(&inode->i_mutex);
@@ -540,7 +540,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
540 ret = ext4_map_blocks(NULL, inode, &map, 0); 540 ret = ext4_map_blocks(NULL, inode, &map, 0);
541 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 541 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
542 last += ret; 542 last += ret;
543 holeoff = last << blkbits; 543 holeoff = (loff_t)last << blkbits;
544 continue; 544 continue;
545 } 545 }
546 546
@@ -551,7 +551,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
551 ext4_es_find_delayed_extent_range(inode, last, last, &es); 551 ext4_es_find_delayed_extent_range(inode, last, last, &es);
552 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { 552 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
553 last = es.es_lblk + es.es_len; 553 last = es.es_lblk + es.es_len;
554 holeoff = last << blkbits; 554 holeoff = (loff_t)last << blkbits;
555 continue; 555 continue;
556 } 556 }
557 557
@@ -566,7 +566,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
566 &map, &holeoff); 566 &map, &holeoff);
567 if (!unwritten) { 567 if (!unwritten) {
568 last += ret; 568 last += ret;
569 holeoff = last << blkbits; 569 holeoff = (loff_t)last << blkbits;
570 continue; 570 continue;
571 } 571 }
572 } 572 }
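
Every hunk in this file adds the same (loff_t) cast before shifting a 32-bit block number left by blkbits; without it the shift is performed in 32 bits and wraps for offsets at or above 4 GiB. A small demonstration, modeling loff_t as long long (which is what the kernel typedef resolves to):

#include <stdio.h>

int main(void)
{
	unsigned int last = 0x00200000;	/* a logical block ~2M into the file */
	unsigned int blkbits = 12;	/* 4 KiB blocks */

	/* the shift happens in 32 bits and wraps to 0 before widening */
	long long bad = last << blkbits;
	/* widening first, as the patched lines do with (loff_t) */
	long long good = (long long)last << blkbits;

	printf("bad=%lld good=%lld\n", bad, good);	/* bad=0 good=8589934592 */
	return 0;
}
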
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
73 return ret; 73 return ret;
74} 74}
75 75
76/**
77 * __sync_file - generic_file_fsync without the locking and filemap_write
78 * @inode: inode to sync
79 * @datasync: only sync essential metadata if true
80 *
81 * This is just generic_file_fsync without the locking. This is needed for
82 * nojournal mode to make sure this inodes data/metadata makes it to disk
83 * properly. The i_mutex should be held already.
84 */
85static int __sync_inode(struct inode *inode, int datasync)
86{
87 int err;
88 int ret;
89
90 ret = sync_mapping_buffers(inode->i_mapping);
91 if (!(inode->i_state & I_DIRTY))
92 return ret;
93 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
94 return ret;
95
96 err = sync_inode_metadata(inode, 1);
97 if (ret == 0)
98 ret = err;
99 return ret;
100}
101
102/* 76/*
103 * akpm: A new design for ext4_sync_file(). 77 * akpm: A new design for ext4_sync_file().
104 * 78 *
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
116 struct inode *inode = file->f_mapping->host; 90 struct inode *inode = file->f_mapping->host;
117 struct ext4_inode_info *ei = EXT4_I(inode); 91 struct ext4_inode_info *ei = EXT4_I(inode);
118 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 92 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
119 int ret, err; 93 int ret = 0, err;
120 tid_t commit_tid; 94 tid_t commit_tid;
121 bool needs_barrier = false; 95 bool needs_barrier = false;
122 96
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
124 98
125 trace_ext4_sync_file_enter(file, datasync); 99 trace_ext4_sync_file_enter(file, datasync);
126 100
127 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 101 if (inode->i_sb->s_flags & MS_RDONLY) {
128 if (ret) 102 /* Make sure that we read updated s_mount_flags value */
129 return ret; 103 smp_rmb();
130 mutex_lock(&inode->i_mutex); 104 if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
131 105 ret = -EROFS;
132 if (inode->i_sb->s_flags & MS_RDONLY)
133 goto out;
134
135 ret = ext4_flush_unwritten_io(inode);
136 if (ret < 0)
137 goto out; 106 goto out;
107 }
138 108
139 if (!journal) { 109 if (!journal) {
140 ret = __sync_inode(inode, datasync); 110 ret = generic_file_fsync(file, start, end, datasync);
141 if (!ret && !hlist_empty(&inode->i_dentry)) 111 if (!ret && !hlist_empty(&inode->i_dentry))
142 ret = ext4_sync_parent(inode); 112 ret = ext4_sync_parent(inode);
143 goto out; 113 goto out;
144 } 114 }
145 115
116 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
117 if (ret)
118 return ret;
146 /* 119 /*
147 * data=writeback,ordered: 120 * data=writeback,ordered:
148 * The caller's filemap_fdatawrite()/wait will sync the data. 121 * The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
172 if (!ret) 145 if (!ret)
173 ret = err; 146 ret = err;
174 } 147 }
175 out: 148out:
176 mutex_unlock(&inode->i_mutex);
177 trace_ext4_sync_file_exit(inode, ret); 149 trace_ext4_sync_file_exit(inode, ret);
178 return ret; 150 return ret;
179} 151}
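
The smp_rmb() added above orders the read of s_mount_flags against whatever write barrier the abort path uses when it sets EXT4_MF_FS_ABORTED (assumed to sit elsewhere in this series). In portable C11 atomics, the same publish/observe shape looks roughly like:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool fs_aborted;	/* stands in for EXT4_MF_FS_ABORTED */

/* abort side: publish with release semantics (kernel side: a write barrier) */
static void mark_aborted(void)
{
	atomic_store_explicit(&fs_aborted, true, memory_order_release);
}

/* fsync side: read with acquire semantics (the smp_rmb() in the hunk) so
 * state written before the abort is visible once the flag is observed */
static bool sees_abort(void)
{
	return atomic_load_explicit(&fs_aborted, memory_order_acquire);
}

int main(void)
{
	mark_aborted();
	printf("aborted: %d\n", sees_abort());
	return 0;
}
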
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..f03598c6ffd3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -747,7 +747,8 @@ repeat_in_this_group:
747 if (!handle) { 747 if (!handle) {
748 BUG_ON(nblocks <= 0); 748 BUG_ON(nblocks <= 0);
749 handle = __ext4_journal_start_sb(dir->i_sb, line_no, 749 handle = __ext4_journal_start_sb(dir->i_sb, line_no,
750 handle_type, nblocks); 750 handle_type, nblocks,
751 0);
751 if (IS_ERR(handle)) { 752 if (IS_ERR(handle)) {
752 err = PTR_ERR(handle); 753 err = PTR_ERR(handle);
753 ext4_std_error(sb, err); 754 ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
624 partial--; 624 partial--;
625 } 625 }
626out: 626out:
627 trace_ext4_ind_map_blocks_exit(inode, map, err); 627 trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
628 return err; 628 return err;
629} 629}
630 630
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
675 675
676retry: 676retry:
677 if (rw == READ && ext4_should_dioread_nolock(inode)) { 677 if (rw == READ && ext4_should_dioread_nolock(inode)) {
678 if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
679 mutex_lock(&inode->i_mutex);
680 ext4_flush_unwritten_io(inode);
681 mutex_unlock(&inode->i_mutex);
682 }
683 /* 678 /*
684 * Nolock dioread optimization may be dynamically disabled 679 * Nolock dioread optimization may be dynamically disabled
685 * via ext4_inode_block_unlocked_dio(). Check inode's state 680 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
779 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 774 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
780} 775}
781 776
782int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) 777/*
778 * Calculate number of indirect blocks touched by mapping @nrblocks logically
779 * contiguous blocks
780 */
781int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
783{ 782{
784 int indirects;
785
786 /* if nrblocks are contiguous */
787 if (chunk) {
788 /*
789 * With N contiguous data blocks, we need at most
790 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
791 * 2 dindirect blocks, and 1 tindirect block
792 */
793 return DIV_ROUND_UP(nrblocks,
794 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
795 }
796 /* 783 /*
797 * if nrblocks are not contiguous, worse case, each block touch 784 * With N contiguous data blocks, we need at most
798 * a indirect block, and each indirect block touch a double indirect 785 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
799 * block, plus a triple indirect block 786 * 2 dindirect blocks, and 1 tindirect block
800 */ 787 */
801 indirects = nrblocks * 2 + 1; 788 return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
802 return indirects;
803} 789}
804 790
805/* 791/*
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
940 __le32 *last) 926 __le32 *last)
941{ 927{
942 __le32 *p; 928 __le32 *p;
943 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 929 int flags = EXT4_FREE_BLOCKS_VALIDATED;
944 int err; 930 int err;
945 931
946 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 932 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
947 flags |= EXT4_FREE_BLOCKS_METADATA; 933 flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
934 else if (ext4_should_journal_data(inode))
935 flags |= EXT4_FREE_BLOCKS_FORGET;
948 936
949 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 937 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
950 count)) { 938 count)) {
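
The new ext4_ind_trans_blocks() above bounds metadata for a contiguous run at DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK) + 4 (at most one extra indirect block per address-block span, plus 2 dindirect and 1 tindirect). A worked check under the usual assumption of 4 KiB blocks and 4-byte block pointers, i.e. 1024 addresses per block:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* EXT4_ADDR_PER_BLOCK = 4096 / 4 = 1024 for 4 KiB blocks */
	unsigned addr_per_block = 1024;
	unsigned nrblocks = 2048;	/* hypothetical contiguous run */

	/* N/addr_per_block + 1 indirect, 2 dindirect, 1 tindirect => +4 */
	unsigned metadata = DIV_ROUND_UP(nrblocks, addr_per_block) + 4;

	printf("%u contiguous blocks touch at most %u metadata blocks\n",
	       nrblocks, metadata);	/* prints 6 */
	return 0;
}
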
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1a346a6bdc8f..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
72 entry = (struct ext4_xattr_entry *) 72 entry = (struct ext4_xattr_entry *)
73 ((void *)raw_inode + EXT4_I(inode)->i_inline_off); 73 ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
74 74
75 free += le32_to_cpu(entry->e_value_size); 75 free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
76 goto out; 76 goto out;
77 } 77 }
78 78
@@ -1810,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
1810 if (error) 1810 if (error)
1811 goto out; 1811 goto out;
1812 1812
1813 physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; 1813 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1814 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; 1814 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1815 physical += offsetof(struct ext4_inode, i_block); 1815 physical += offsetof(struct ext4_inode, i_block);
1816 length = i_size_read(inode); 1816 length = i_size_read(inode);
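
The first inline.c hunk now counts the padded on-disk footprint of the xattr value rather than its raw length. Assuming the usual ext4 xattr padding of 4 bytes (the EXT4_XATTR_SIZE()/EXT4_XATTR_PAD definitions in fs/ext4/xattr.h), the rounding behaves like:

#include <stdio.h>

/* assumed to match fs/ext4/xattr.h: values are padded to 4 bytes on disk */
#define EXT4_XATTR_PAD   4
#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD - 1)
#define EXT4_XATTR_SIZE(s) (((s) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)

int main(void)
{
	/* the value occupies its padded size on disk, so that is what the
	 * free-space accounting has to count */
	for (unsigned s = 1; s <= 8; s++)
		printf("value size %u -> on-disk %u\n",
		       s, (unsigned)EXT4_XATTR_SIZE(s));
	/* 1..4 -> 4, 5..8 -> 8 */
	return 0;
}
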
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..0188e65e1f58 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
132 new_size); 132 new_size);
133} 133}
134 134
135static void ext4_invalidatepage(struct page *page, unsigned long offset); 135static void ext4_invalidatepage(struct page *page, unsigned int offset,
136 unsigned int length);
136static int __ext4_journalled_writepage(struct page *page, unsigned int len); 137static int __ext4_journalled_writepage(struct page *page, unsigned int len);
137static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 138static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
138static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 139static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
139 struct inode *inode, struct page *page, loff_t from, 140 int pextents);
140 loff_t length, int flags);
141 141
142/* 142/*
143 * Test whether an inode is a fast symlink. 143 * Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
215 filemap_write_and_wait(&inode->i_data); 215 filemap_write_and_wait(&inode->i_data);
216 } 216 }
217 truncate_inode_pages(&inode->i_data, 0); 217 truncate_inode_pages(&inode->i_data, 0);
218 ext4_ioend_shutdown(inode); 218
219 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
219 goto no_delete; 220 goto no_delete;
220 } 221 }
221 222
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
225 if (ext4_should_order_data(inode)) 226 if (ext4_should_order_data(inode))
226 ext4_begin_ordered_truncate(inode, 0); 227 ext4_begin_ordered_truncate(inode, 0);
227 truncate_inode_pages(&inode->i_data, 0); 228 truncate_inode_pages(&inode->i_data, 0);
228 ext4_ioend_shutdown(inode);
229 229
230 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
230 if (is_bad_inode(inode)) 231 if (is_bad_inode(inode))
231 goto no_delete; 232 goto no_delete;
232 233
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
423#define check_block_validity(inode, map) \ 424#define check_block_validity(inode, map) \
424 __check_block_validity((inode), __func__, __LINE__, (map)) 425 __check_block_validity((inode), __func__, __LINE__, (map))
425 426
426/*
427 * Return the number of contiguous dirty pages in a given inode
428 * starting at page frame idx.
429 */
430static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
431 unsigned int max_pages)
432{
433 struct address_space *mapping = inode->i_mapping;
434 pgoff_t index;
435 struct pagevec pvec;
436 pgoff_t num = 0;
437 int i, nr_pages, done = 0;
438
439 if (max_pages == 0)
440 return 0;
441 pagevec_init(&pvec, 0);
442 while (!done) {
443 index = idx;
444 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
445 PAGECACHE_TAG_DIRTY,
446 (pgoff_t)PAGEVEC_SIZE);
447 if (nr_pages == 0)
448 break;
449 for (i = 0; i < nr_pages; i++) {
450 struct page *page = pvec.pages[i];
451 struct buffer_head *bh, *head;
452
453 lock_page(page);
454 if (unlikely(page->mapping != mapping) ||
455 !PageDirty(page) ||
456 PageWriteback(page) ||
457 page->index != idx) {
458 done = 1;
459 unlock_page(page);
460 break;
461 }
462 if (page_has_buffers(page)) {
463 bh = head = page_buffers(page);
464 do {
465 if (!buffer_delay(bh) &&
466 !buffer_unwritten(bh))
467 done = 1;
468 bh = bh->b_this_page;
469 } while (!done && (bh != head));
470 }
471 unlock_page(page);
472 if (done)
473 break;
474 idx++;
475 num++;
476 if (num >= max_pages) {
477 done = 1;
478 break;
479 }
480 }
481 pagevec_release(&pvec);
482 }
483 return num;
484}
485
486#ifdef ES_AGGRESSIVE_TEST 427#ifdef ES_AGGRESSIVE_TEST
487static void ext4_map_blocks_es_recheck(handle_t *handle, 428static void ext4_map_blocks_es_recheck(handle_t *handle,
488 struct inode *inode, 429 struct inode *inode,
@@ -573,6 +514,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
573 "logical block %lu\n", inode->i_ino, flags, map->m_len, 514 "logical block %lu\n", inode->i_ino, flags, map->m_len,
574 (unsigned long) map->m_lblk); 515 (unsigned long) map->m_lblk);
575 516
517 ext4_es_lru_add(inode);
518
576 /* Lookup extent status tree firstly */ 519 /* Lookup extent status tree firstly */
577 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 520 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
578 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 521 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
@@ -1118,10 +1061,13 @@ static int ext4_write_end(struct file *file,
1118 } 1061 }
1119 } 1062 }
1120 1063
1121 if (ext4_has_inline_data(inode)) 1064 if (ext4_has_inline_data(inode)) {
1122 copied = ext4_write_inline_data_end(inode, pos, len, 1065 ret = ext4_write_inline_data_end(inode, pos, len,
1123 copied, page); 1066 copied, page);
1124 else 1067 if (ret < 0)
1068 goto errout;
1069 copied = ret;
1070 } else
1125 copied = block_write_end(file, mapping, pos, 1071 copied = block_write_end(file, mapping, pos,
1126 len, copied, page, fsdata); 1072 len, copied, page, fsdata);
1127 1073
@@ -1157,8 +1103,6 @@ static int ext4_write_end(struct file *file,
1157 if (i_size_changed) 1103 if (i_size_changed)
1158 ext4_mark_inode_dirty(handle, inode); 1104 ext4_mark_inode_dirty(handle, inode);
1159 1105
1160 if (copied < 0)
1161 ret = copied;
1162 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1106 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1163 /* if we have allocated more blocks and copied 1107 /* if we have allocated more blocks and copied
1164 * less. We will have blocks allocated outside 1108 * less. We will have blocks allocated outside
@@ -1415,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1415} 1359}
1416 1360
1417static void ext4_da_page_release_reservation(struct page *page, 1361static void ext4_da_page_release_reservation(struct page *page,
1418 unsigned long offset) 1362 unsigned int offset,
1363 unsigned int length)
1419{ 1364{
1420 int to_release = 0; 1365 int to_release = 0;
1421 struct buffer_head *head, *bh; 1366 struct buffer_head *head, *bh;
1422 unsigned int curr_off = 0; 1367 unsigned int curr_off = 0;
1423 struct inode *inode = page->mapping->host; 1368 struct inode *inode = page->mapping->host;
1424 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1369 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1370 unsigned int stop = offset + length;
1425 int num_clusters; 1371 int num_clusters;
1426 ext4_fsblk_t lblk; 1372 ext4_fsblk_t lblk;
1427 1373
1374 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1375
1428 head = page_buffers(page); 1376 head = page_buffers(page);
1429 bh = head; 1377 bh = head;
1430 do { 1378 do {
1431 unsigned int next_off = curr_off + bh->b_size; 1379 unsigned int next_off = curr_off + bh->b_size;
1432 1380
1381 if (next_off > stop)
1382 break;
1383
1433 if ((offset <= curr_off) && (buffer_delay(bh))) { 1384 if ((offset <= curr_off) && (buffer_delay(bh))) {
1434 to_release++; 1385 to_release++;
1435 clear_buffer_delay(bh); 1386 clear_buffer_delay(bh);
@@ -1460,140 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page,
1460 * Delayed allocation stuff 1411 * Delayed allocation stuff
1461 */ 1412 */
1462 1413
1463/* 1414struct mpage_da_data {
1464 * mpage_da_submit_io - walks through extent of pages and try to write 1415 struct inode *inode;
1465 * them with writepage() call back 1416 struct writeback_control *wbc;
1466 *
1467 * @mpd->inode: inode
1468 * @mpd->first_page: first page of the extent
1469 * @mpd->next_page: page after the last page of the extent
1470 *
1471 * By the time mpage_da_submit_io() is called we expect all blocks
1472 * to be allocated. this may be wrong if allocation failed.
1473 *
1474 * As pages are already locked by write_cache_pages(), we can't use it
1475 */
1476static int mpage_da_submit_io(struct mpage_da_data *mpd,
1477 struct ext4_map_blocks *map)
1478{
1479 struct pagevec pvec;
1480 unsigned long index, end;
1481 int ret = 0, err, nr_pages, i;
1482 struct inode *inode = mpd->inode;
1483 struct address_space *mapping = inode->i_mapping;
1484 loff_t size = i_size_read(inode);
1485 unsigned int len, block_start;
1486 struct buffer_head *bh, *page_bufs = NULL;
1487 sector_t pblock = 0, cur_logical = 0;
1488 struct ext4_io_submit io_submit;
1489 1417
1490 BUG_ON(mpd->next_page <= mpd->first_page); 1418 pgoff_t first_page; /* The first page to write */
1491 memset(&io_submit, 0, sizeof(io_submit)); 1419 pgoff_t next_page; /* Current page to examine */
1420 pgoff_t last_page; /* Last page to examine */
1492 /* 1421 /*
1493 * We need to start from the first_page to the next_page - 1 1422 * Extent to map - this can be after first_page because that can be
1494 * to make sure we also write the mapped dirty buffer_heads. 1423 * fully mapped. We somewhat abuse m_flags to store whether the extent
1495 * If we look at mpd->b_blocknr we would only be looking 1424 * is delalloc or unwritten.
1496 * at the currently mapped buffer_heads.
1497 */ 1425 */
1498 index = mpd->first_page; 1426 struct ext4_map_blocks map;
1499 end = mpd->next_page - 1; 1427 struct ext4_io_submit io_submit; /* IO submission data */
1500 1428};
1501 pagevec_init(&pvec, 0);
1502 while (index <= end) {
1503 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1504 if (nr_pages == 0)
1505 break;
1506 for (i = 0; i < nr_pages; i++) {
1507 int skip_page = 0;
1508 struct page *page = pvec.pages[i];
1509
1510 index = page->index;
1511 if (index > end)
1512 break;
1513
1514 if (index == size >> PAGE_CACHE_SHIFT)
1515 len = size & ~PAGE_CACHE_MASK;
1516 else
1517 len = PAGE_CACHE_SIZE;
1518 if (map) {
1519 cur_logical = index << (PAGE_CACHE_SHIFT -
1520 inode->i_blkbits);
1521 pblock = map->m_pblk + (cur_logical -
1522 map->m_lblk);
1523 }
1524 index++;
1525
1526 BUG_ON(!PageLocked(page));
1527 BUG_ON(PageWriteback(page));
1528
1529 bh = page_bufs = page_buffers(page);
1530 block_start = 0;
1531 do {
1532 if (map && (cur_logical >= map->m_lblk) &&
1533 (cur_logical <= (map->m_lblk +
1534 (map->m_len - 1)))) {
1535 if (buffer_delay(bh)) {
1536 clear_buffer_delay(bh);
1537 bh->b_blocknr = pblock;
1538 }
1539 if (buffer_unwritten(bh) ||
1540 buffer_mapped(bh))
1541 BUG_ON(bh->b_blocknr != pblock);
1542 if (map->m_flags & EXT4_MAP_UNINIT)
1543 set_buffer_uninit(bh);
1544 clear_buffer_unwritten(bh);
1545 }
1546
1547 /*
1548 * skip page if block allocation undone and
1549 * block is dirty
1550 */
1551 if (ext4_bh_delay_or_unwritten(NULL, bh))
1552 skip_page = 1;
1553 bh = bh->b_this_page;
1554 block_start += bh->b_size;
1555 cur_logical++;
1556 pblock++;
1557 } while (bh != page_bufs);
1558
1559 if (skip_page) {
1560 unlock_page(page);
1561 continue;
1562 }
1563
1564 clear_page_dirty_for_io(page);
1565 err = ext4_bio_write_page(&io_submit, page, len,
1566 mpd->wbc);
1567 if (!err)
1568 mpd->pages_written++;
1569 /*
1570 * In error case, we have to continue because
1571 * remaining pages are still locked
1572 */
1573 if (ret == 0)
1574 ret = err;
1575 }
1576 pagevec_release(&pvec);
1577 }
1578 ext4_io_submit(&io_submit);
1579 return ret;
1580}
1581 1429
1582static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 1430static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1431 bool invalidate)
1583{ 1432{
1584 int nr_pages, i; 1433 int nr_pages, i;
1585 pgoff_t index, end; 1434 pgoff_t index, end;
1586 struct pagevec pvec; 1435 struct pagevec pvec;
1587 struct inode *inode = mpd->inode; 1436 struct inode *inode = mpd->inode;
1588 struct address_space *mapping = inode->i_mapping; 1437 struct address_space *mapping = inode->i_mapping;
1589 ext4_lblk_t start, last; 1438
1439 /* This is necessary when next_page == 0. */
1440 if (mpd->first_page >= mpd->next_page)
1441 return;
1590 1442
1591 index = mpd->first_page; 1443 index = mpd->first_page;
1592 end = mpd->next_page - 1; 1444 end = mpd->next_page - 1;
1593 1445 if (invalidate) {
1594 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1446 ext4_lblk_t start, last;
1595 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1447 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1596 ext4_es_remove_extent(inode, start, last - start + 1); 1448 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1449 ext4_es_remove_extent(inode, start, last - start + 1);
1450 }
1597 1451
1598 pagevec_init(&pvec, 0); 1452 pagevec_init(&pvec, 0);
1599 while (index <= end) { 1453 while (index <= end) {
@@ -1606,14 +1460,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1606 break; 1460 break;
1607 BUG_ON(!PageLocked(page)); 1461 BUG_ON(!PageLocked(page));
1608 BUG_ON(PageWriteback(page)); 1462 BUG_ON(PageWriteback(page));
1609 block_invalidatepage(page, 0); 1463 if (invalidate) {
1610 ClearPageUptodate(page); 1464 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1465 ClearPageUptodate(page);
1466 }
1611 unlock_page(page); 1467 unlock_page(page);
1612 } 1468 }
1613 index = pvec.pages[nr_pages - 1]->index + 1; 1469 index = pvec.pages[nr_pages - 1]->index + 1;
1614 pagevec_release(&pvec); 1470 pagevec_release(&pvec);
1615 } 1471 }
1616 return;
1617} 1472}
1618 1473
1619static void ext4_print_free_blocks(struct inode *inode) 1474static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1497,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1642 return; 1497 return;
1643} 1498}
1644 1499
1645/*
1646 * mpage_da_map_and_submit - go through given space, map them
1647 * if necessary, and then submit them for I/O
1648 *
1649 * @mpd - bh describing space
1650 *
1651 * The function skips space we know is already mapped to disk blocks.
1652 *
1653 */
1654static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1655{
1656 int err, blks, get_blocks_flags;
1657 struct ext4_map_blocks map, *mapp = NULL;
1658 sector_t next = mpd->b_blocknr;
1659 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1660 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1661 handle_t *handle = NULL;
1662
1663 /*
1664 * If the blocks are mapped already, or we couldn't accumulate
1665 * any blocks, then proceed immediately to the submission stage.
1666 */
1667 if ((mpd->b_size == 0) ||
1668 ((mpd->b_state & (1 << BH_Mapped)) &&
1669 !(mpd->b_state & (1 << BH_Delay)) &&
1670 !(mpd->b_state & (1 << BH_Unwritten))))
1671 goto submit_io;
1672
1673 handle = ext4_journal_current_handle();
1674 BUG_ON(!handle);
1675
1676 /*
1677 * Call ext4_map_blocks() to allocate any delayed allocation
1678 * blocks, or to convert an uninitialized extent to be
1679 * initialized (in the case where we have written into
1680 * one or more preallocated blocks).
1681 *
1682 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1683 * indicate that we are on the delayed allocation path. This
1684 * affects functions in many different parts of the allocation
1685 * call path. This flag exists primarily because we don't
1686 * want to change *many* call functions, so ext4_map_blocks()
1687 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1688 * inode's allocation semaphore is taken.
1689 *
1690 * If the blocks in questions were delalloc blocks, set
1691 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1692 * variables are updated after the blocks have been allocated.
1693 */
1694 map.m_lblk = next;
1695 map.m_len = max_blocks;
1696 /*
1697 * We're in delalloc path and it is possible that we're going to
1698 * need more metadata blocks than previously reserved. However
1699 * we must not fail because we're in writeback and there is
1700 * nothing we can do about it so it might result in data loss.
1701 * So use reserved blocks to allocate metadata if possible.
1702 */
1703 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1704 EXT4_GET_BLOCKS_METADATA_NOFAIL;
1705 if (ext4_should_dioread_nolock(mpd->inode))
1706 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1707 if (mpd->b_state & (1 << BH_Delay))
1708 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1709
1710
1711 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1712 if (blks < 0) {
1713 struct super_block *sb = mpd->inode->i_sb;
1714
1715 err = blks;
1716 /*
1717 * If get block returns EAGAIN or ENOSPC and there
1718 * appears to be free blocks we will just let
1719 * mpage_da_submit_io() unlock all of the pages.
1720 */
1721 if (err == -EAGAIN)
1722 goto submit_io;
1723
1724 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1725 mpd->retval = err;
1726 goto submit_io;
1727 }
1728
1729 /*
1730 * get block failure will cause us to loop in
1731 * writepages, because a_ops->writepage won't be able
1732 * to make progress. The page will be redirtied by
1733 * writepage and writepages will again try to write
1734 * the same.
1735 */
1736 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1737 ext4_msg(sb, KERN_CRIT,
1738 "delayed block allocation failed for inode %lu "
1739 "at logical offset %llu with max blocks %zd "
1740 "with error %d", mpd->inode->i_ino,
1741 (unsigned long long) next,
1742 mpd->b_size >> mpd->inode->i_blkbits, err);
1743 ext4_msg(sb, KERN_CRIT,
1744 "This should not happen!! Data will be lost");
1745 if (err == -ENOSPC)
1746 ext4_print_free_blocks(mpd->inode);
1747 }
1748 /* invalidate all the pages */
1749 ext4_da_block_invalidatepages(mpd);
1750
1751 /* Mark this page range as having been completed */
1752 mpd->io_done = 1;
1753 return;
1754 }
1755 BUG_ON(blks == 0);
1756
1757 mapp = &map;
1758 if (map.m_flags & EXT4_MAP_NEW) {
1759 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1760 int i;
1761
1762 for (i = 0; i < map.m_len; i++)
1763 unmap_underlying_metadata(bdev, map.m_pblk + i);
1764 }
1765
1766 /*
1767 * Update on-disk size along with block allocation.
1768 */
1769 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1770 if (disksize > i_size_read(mpd->inode))
1771 disksize = i_size_read(mpd->inode);
1772 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1773 ext4_update_i_disksize(mpd->inode, disksize);
1774 err = ext4_mark_inode_dirty(handle, mpd->inode);
1775 if (err)
1776 ext4_error(mpd->inode->i_sb,
1777 "Failed to mark inode %lu dirty",
1778 mpd->inode->i_ino);
1779 }
1780
1781submit_io:
1782 mpage_da_submit_io(mpd, mapp);
1783 mpd->io_done = 1;
1784}
1785
1786#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1787 (1 << BH_Delay) | (1 << BH_Unwritten))
1788
1789/*
1790 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1791 *
1792 * @mpd->lbh - extent of blocks
1793 * @logical - logical number of the block in the file
1794 * @b_state - b_state of the buffer head added
1795 *
1796 * the function is used to collect contig. blocks in same state
1797 */
1798static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
1799 unsigned long b_state)
1800{
1801 sector_t next;
1802 int blkbits = mpd->inode->i_blkbits;
1803 int nrblocks = mpd->b_size >> blkbits;
1804
1805 /*
1806 * XXX Don't go larger than mballoc is willing to allocate
1807 * This is a stopgap solution. We eventually need to fold
1808 * mpage_da_submit_io() into this function and then call
1809 * ext4_map_blocks() multiple times in a loop
1810 */
1811 if (nrblocks >= (8*1024*1024 >> blkbits))
1812 goto flush_it;
1813
1814 /* check if the reserved journal credits might overflow */
1815 if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
1816 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1817 /*
1818 * With non-extent format we are limited by the journal
1819 * credit available. Total credit needed to insert
1820 * nrblocks contiguous blocks is dependent on the
1821 * nrblocks. So limit nrblocks.
1822 */
1823 goto flush_it;
1824 }
1825 }
1826 /*
1827 * First block in the extent
1828 */
1829 if (mpd->b_size == 0) {
1830 mpd->b_blocknr = logical;
1831 mpd->b_size = 1 << blkbits;
1832 mpd->b_state = b_state & BH_FLAGS;
1833 return;
1834 }
1835
1836 next = mpd->b_blocknr + nrblocks;
1837 /*
1838 * Can we merge the block to our big extent?
1839 */
1840 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1841 mpd->b_size += 1 << blkbits;
1842 return;
1843 }
1844
1845flush_it:
1846 /*
1847 * We couldn't merge the block to our extent, so we
1848 * need to flush current extent and start new one
1849 */
1850 mpage_da_map_and_submit(mpd);
1851 return;
1852}
1853
1854static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 1500static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1855{ 1501{
1856 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 1502 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1883,6 +1529,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1883 "logical block %lu\n", inode->i_ino, map->m_len, 1529 "logical block %lu\n", inode->i_ino, map->m_len,
1884 (unsigned long) map->m_lblk); 1530 (unsigned long) map->m_lblk);
1885 1531
1532 ext4_es_lru_add(inode);
1533
1886 /* Lookup extent status tree firstly */ 1534 /* Lookup extent status tree firstly */
1887 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1535 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1888 1536
@@ -2156,7 +1804,7 @@ out:
2156 * lock so we have to do some magic. 1804 * lock so we have to do some magic.
2157 * 1805 *
2158 * This function can get called via... 1806 * This function can get called via...
2159 * - ext4_da_writepages after taking page lock (have journal handle) 1807 * - ext4_writepages after taking page lock (have journal handle)
2160 * - journal_submit_inode_data_buffers (no journal handle) 1808 * - journal_submit_inode_data_buffers (no journal handle)
2161 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1809 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
2162 * - grab_page_cache when doing write_begin (have journal handle) 1810 * - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1882,405 @@ static int ext4_writepage(struct page *page,
2234 */ 1882 */
2235 return __ext4_journalled_writepage(page, len); 1883 return __ext4_journalled_writepage(page, len);
2236 1884
2237 memset(&io_submit, 0, sizeof(io_submit)); 1885 ext4_io_submit_init(&io_submit, wbc);
1886 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1887 if (!io_submit.io_end) {
1888 redirty_page_for_writepage(wbc, page);
1889 unlock_page(page);
1890 return -ENOMEM;
1891 }
2238 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 1892 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
2239 ext4_io_submit(&io_submit); 1893 ext4_io_submit(&io_submit);
1894 /* Drop io_end reference we got from init */
1895 ext4_put_io_end_defer(io_submit.io_end);
2240 return ret; 1896 return ret;
2241} 1897}
2242 1898
1899#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1900
2243/* 1901/*
2244 * This is called via ext4_da_writepages() to 1902 * mballoc gives us at most this number of blocks...
2245 * calculate the total number of credits to reserve to fit 1903 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
2246 * a single extent allocation into a single transaction, 1904 * The rest of mballoc seems to handle chunks up to full group size.
2247 * ext4_da_writepages() will loop calling this before
2248 * the block allocation.
2249 */ 1905 */
1906#define MAX_WRITEPAGES_EXTENT_LEN 2048
2250 1907
2251static int ext4_da_writepages_trans_blocks(struct inode *inode) 1908/*
1909 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1910 *
1911 * @mpd - extent of blocks
1912 * @lblk - logical number of the block in the file
1913 * @b_state - b_state of the buffer head added
1914 *
1915 * The function is used to collect contiguous blocks in the same state.
1916 */
1917static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1918 unsigned long b_state)
1919{
1920 struct ext4_map_blocks *map = &mpd->map;
1921
1922 /* Don't go larger than mballoc is willing to allocate */
1923 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1924 return 0;
1925
1926 /* First block in the extent? */
1927 if (map->m_len == 0) {
1928 map->m_lblk = lblk;
1929 map->m_len = 1;
1930 map->m_flags = b_state & BH_FLAGS;
1931 return 1;
1932 }
1933
1934 /* Can we merge the block to our big extent? */
1935 if (lblk == map->m_lblk + map->m_len &&
1936 (b_state & BH_FLAGS) == map->m_flags) {
1937 map->m_len++;
1938 return 1;
1939 }
1940 return 0;
1941}
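
Under the new BH_FLAGS definition only the Delay and Unwritten bits matter, so mpage_add_bh_to_extent() merges a block only when it is contiguous with the accumulated extent and in the same delalloc/unwritten state. A toy trace of that rule with simplified stand-in types (struct map and add_to_extent are illustrative, not the kernel structures):

#include <stdio.h>

struct map { unsigned lblk, len, flags; };

#define BH_DELAY     (1u << 0)
#define BH_UNWRITTEN (1u << 1)
#define BH_FLAGS     (BH_DELAY | BH_UNWRITTEN)

static int add_to_extent(struct map *m, unsigned lblk, unsigned state)
{
	if (m->len == 0) {			/* first block starts the extent */
		m->lblk = lblk;
		m->len = 1;
		m->flags = state & BH_FLAGS;
		return 1;
	}
	if (lblk == m->lblk + m->len &&		/* contiguous ... */
	    (state & BH_FLAGS) == m->flags) {	/* ... and same state */
		m->len++;
		return 1;
	}
	return 0;				/* caller must map and restart */
}

int main(void)
{
	struct map m = {0};
	add_to_extent(&m, 100, BH_DELAY);
	add_to_extent(&m, 101, BH_DELAY);
	/* block 102 is unwritten, not delalloc: the merge is refused */
	int merged = add_to_extent(&m, 102, BH_UNWRITTEN);
	printf("extent [%u,+%u), merged=%d\n", m.lblk, m.len, merged);
	return 0;	/* prints: extent [100,+2), merged=0 */
}
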
1942
1943static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
1944 struct buffer_head *head,
1945 struct buffer_head *bh,
1946 ext4_lblk_t lblk)
1947{
1948 struct inode *inode = mpd->inode;
1949 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1950 >> inode->i_blkbits;
1951
1952 do {
1953 BUG_ON(buffer_locked(bh));
1954
1955 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1956 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1957 lblk >= blocks) {
1958 /* Found extent to map? */
1959 if (mpd->map.m_len)
1960 return false;
1961 if (lblk >= blocks)
1962 return true;
1963 continue;
1964 }
1965 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1966 return false;
1967 } while (lblk++, (bh = bh->b_this_page) != head);
1968 return true;
1969}
1970
1971static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
2252{ 1972{
2253 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1973 int len;
1974 loff_t size = i_size_read(mpd->inode);
1975 int err;
1976
1977 BUG_ON(page->index != mpd->first_page);
1978 if (page->index == size >> PAGE_CACHE_SHIFT)
1979 len = size & ~PAGE_CACHE_MASK;
1980 else
1981 len = PAGE_CACHE_SIZE;
1982 clear_page_dirty_for_io(page);
1983 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1984 if (!err)
1985 mpd->wbc->nr_to_write--;
1986 mpd->first_page++;
2254 1987
1988 return err;
1989}
1990
1991/*
1992 * mpage_map_buffers - update buffers corresponding to changed extent and
1993 * submit fully mapped pages for IO
1994 *
1995 * @mpd - description of extent to map, on return next extent to map
1996 *
1997 * Scan buffers corresponding to changed extent (we expect corresponding pages
1998 * to be already locked) and update buffer state according to new extent state.
1999 * We map delalloc buffers to their physical location, clear unwritten bits,
2000 * and mark buffers as uninit when we perform writes to uninitialized extents
2001 * and do extent conversion after IO is finished. If the last page is not fully
2002 * mapped, we update @map to the next extent in the last page that needs
2003 * mapping. Otherwise we submit the page for IO.
2004 */
2005static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2006{
2007 struct pagevec pvec;
2008 int nr_pages, i;
2009 struct inode *inode = mpd->inode;
2010 struct buffer_head *head, *bh;
2011 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2012 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2013 >> inode->i_blkbits;
2014 pgoff_t start, end;
2015 ext4_lblk_t lblk;
2016 sector_t pblock;
2017 int err;
2018
2019 start = mpd->map.m_lblk >> bpp_bits;
2020 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2021 lblk = start << bpp_bits;
2022 pblock = mpd->map.m_pblk;
2023
2024 pagevec_init(&pvec, 0);
2025 while (start <= end) {
2026 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2027 PAGEVEC_SIZE);
2028 if (nr_pages == 0)
2029 break;
2030 for (i = 0; i < nr_pages; i++) {
2031 struct page *page = pvec.pages[i];
2032
2033 if (page->index > end)
2034 break;
2035 /* Pages up to 'end' must be contiguous */
2036 BUG_ON(page->index != start);
2037 bh = head = page_buffers(page);
2038 do {
2039 if (lblk < mpd->map.m_lblk)
2040 continue;
2041 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2042 /*
2043 * Buffer after end of mapped extent.
2044 * Find next buffer in the page to map.
2045 */
2046 mpd->map.m_len = 0;
2047 mpd->map.m_flags = 0;
2048 add_page_bufs_to_extent(mpd, head, bh,
2049 lblk);
2050 pagevec_release(&pvec);
2051 return 0;
2052 }
2053 if (buffer_delay(bh)) {
2054 clear_buffer_delay(bh);
2055 bh->b_blocknr = pblock++;
2056 }
2057 clear_buffer_unwritten(bh);
2058 } while (++lblk < blocks &&
2059 (bh = bh->b_this_page) != head);
2060
2061 /*
2062 * FIXME: This is going to break if dioread_nolock
2063 * supports blocksize < pagesize as we will try to
2064 * convert potentially unmapped parts of inode.
2065 */
2066 mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
2067 /* Page fully mapped - let IO run! */
2068 err = mpage_submit_page(mpd, page);
2069 if (err < 0) {
2070 pagevec_release(&pvec);
2071 return err;
2072 }
2073 start++;
2074 }
2075 pagevec_release(&pvec);
2076 }
2077 /* Extent fully mapped and matches with page boundary. We are done. */
2078 mpd->map.m_len = 0;
2079 mpd->map.m_flags = 0;
2080 return 0;
2081}
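
mpage_map_and_submit_buffers() converts between block and page coordinates with bpp_bits = PAGE_CACHE_SHIFT - i_blkbits. Note that lblk = start << bpp_bits can land below map.m_lblk, which is why the buffer loop skips entries until lblk reaches the extent. A quick check of the arithmetic with hypothetical numbers (4 KiB pages, 1 KiB blocks):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;	/* PAGE_CACHE_SHIFT: 4 KiB pages */
	unsigned blkbits = 10;		/* 1 KiB blocks */
	unsigned bpp_bits = page_shift - blkbits;	/* 2: 4 blocks/page */

	unsigned m_lblk = 13, m_len = 9;	/* hypothetical extent */
	unsigned start = m_lblk >> bpp_bits;		 /* page 3 */
	unsigned end = (m_lblk + m_len - 1) >> bpp_bits; /* page 5 */
	unsigned lblk = start << bpp_bits;		 /* block 12 < m_lblk */

	printf("pages %u..%u, first buffer at block %u\n", start, end, lblk);
	return 0;
}
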
2082
2083static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2084{
2085 struct inode *inode = mpd->inode;
2086 struct ext4_map_blocks *map = &mpd->map;
2087 int get_blocks_flags;
2088 int err;
2089
2090 trace_ext4_da_write_pages_extent(inode, map);
2255 /* 2091 /*
2256 * With non-extent format the journal credit needed to 2092 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2257 * insert nrblocks contiguous block is dependent on 2093 * to convert an uninitialized extent to be initialized (in the case
2258 * number of contiguous block. So we will limit 2094 * where we have written into one or more preallocated blocks). It is
2259 * number of contiguous block to a sane value 2095 * possible that we're going to need more metadata blocks than
2096 * previously reserved. However we must not fail because we're in
2097 * writeback and there is nothing we can do about it so it might result
2098 * in data loss. So use reserved blocks to allocate metadata if
2099 * possible.
2100 *
2101 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
2102 * in question are delalloc blocks. This affects functions in many
2103 * different parts of the allocation call path. This flag exists
2104 * primarily because we don't want to change *many* call functions, so
2105 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2106 * once the inode's allocation semaphore is taken.
2260 */ 2107 */
2261 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2108 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2262 (max_blocks > EXT4_MAX_TRANS_DATA)) 2109 EXT4_GET_BLOCKS_METADATA_NOFAIL;
2263 max_blocks = EXT4_MAX_TRANS_DATA; 2110 if (ext4_should_dioread_nolock(inode))
2111 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2112 if (map->m_flags & (1 << BH_Delay))
2113 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2264 2114
2265 return ext4_chunk_trans_blocks(inode, max_blocks); 2115 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2116 if (err < 0)
2117 return err;
2118 if (map->m_flags & EXT4_MAP_UNINIT) {
2119 if (!mpd->io_submit.io_end->handle &&
2120 ext4_handle_valid(handle)) {
2121 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2122 handle->h_rsv_handle = NULL;
2123 }
2124 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2125 }
2126
2127 BUG_ON(map->m_len == 0);
2128 if (map->m_flags & EXT4_MAP_NEW) {
2129 struct block_device *bdev = inode->i_sb->s_bdev;
2130 int i;
2131
2132 for (i = 0; i < map->m_len; i++)
2133 unmap_underlying_metadata(bdev, map->m_pblk + i);
2134 }
2135 return 0;
2266} 2136}
2267 2137
2268/* 2138/*
2269 * write_cache_pages_da - walk the list of dirty pages of the given 2139 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2270 * address space and accumulate pages that need writing, and call 2140 * mpd->len and submit pages underlying it for IO
2271 * mpage_da_map_and_submit to map a single contiguous memory region 2141 *
2272 * and then write them. 2142 * @handle - handle for journal operations
2143 * @mpd - extent to map
2144 *
2145 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2146 * delayed, blocks are allocated, if it is unwritten, we may need to convert
2147 * them to initialized or split the described range from larger unwritten
2148 * extent. Note that we need not map all the described range since allocation
2149 * can return fewer blocks or the range is covered by more unwritten extents. We
2150 * cannot map more because we are limited by reserved transaction credits. On
2151 * the other hand we always make sure that the last touched page is fully
2152 * mapped so that it can be written out (and thus forward progress is
2153 * guaranteed). After mapping we submit all mapped pages for IO.
2273 */ 2154 */
2274static int write_cache_pages_da(handle_t *handle, 2155static int mpage_map_and_submit_extent(handle_t *handle,
2275 struct address_space *mapping, 2156 struct mpage_da_data *mpd,
2276 struct writeback_control *wbc, 2157 bool *give_up_on_write)
2277 struct mpage_da_data *mpd,
2278 pgoff_t *done_index)
2279{ 2158{
2280 struct buffer_head *bh, *head; 2159 struct inode *inode = mpd->inode;
2281 struct inode *inode = mapping->host; 2160 struct ext4_map_blocks *map = &mpd->map;
2282 struct pagevec pvec; 2161 int err;
2283 unsigned int nr_pages; 2162 loff_t disksize;
2284 sector_t logical;
2285 pgoff_t index, end;
2286 long nr_to_write = wbc->nr_to_write;
2287 int i, tag, ret = 0;
2288
2289 memset(mpd, 0, sizeof(struct mpage_da_data));
2290 mpd->wbc = wbc;
2291 mpd->inode = inode;
2292 pagevec_init(&pvec, 0);
2293 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2294 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2295 2163
2296 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2164 mpd->io_submit.io_end->offset =
2165 ((loff_t)map->m_lblk) << inode->i_blkbits;
2166 while (map->m_len) {
2167 err = mpage_map_one_extent(handle, mpd);
2168 if (err < 0) {
2169 struct super_block *sb = inode->i_sb;
2170
2171 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
2172 goto invalidate_dirty_pages;
2173 /*
2174 * Let the upper layers retry transient errors.
2175 * In the case of ENOSPC, if ext4_count_free_clusters()
2176 * is non-zero, a commit should free up blocks.
2177 */
2178 if ((err == -ENOMEM) ||
2179 (err == -ENOSPC && ext4_count_free_clusters(sb)))
2180 return err;
2181 ext4_msg(sb, KERN_CRIT,
2182 "Delayed block allocation failed for "
2183 "inode %lu at logical offset %llu with"
2184 " max blocks %u with error %d",
2185 inode->i_ino,
2186 (unsigned long long)map->m_lblk,
2187 (unsigned)map->m_len, -err);
2188 ext4_msg(sb, KERN_CRIT,
2189 "This should not happen!! Data will "
2190 "be lost\n");
2191 if (err == -ENOSPC)
2192 ext4_print_free_blocks(inode);
2193 invalidate_dirty_pages:
2194 *give_up_on_write = true;
2195 return err;
2196 }
2197 /*
2198 * Update buffer state, submit mapped pages, and get us new
2199 * extent to map
2200 */
2201 err = mpage_map_and_submit_buffers(mpd);
2202 if (err < 0)
2203 return err;
2204 }
2205
2206 /* Update on-disk size after IO is submitted */
2207 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2208 if (disksize > i_size_read(inode))
2209 disksize = i_size_read(inode);
2210 if (disksize > EXT4_I(inode)->i_disksize) {
2211 int err2;
2212
2213 ext4_update_i_disksize(inode, disksize);
2214 err2 = ext4_mark_inode_dirty(handle, inode);
2215 if (err2)
2216 ext4_error(inode->i_sb,
2217 "Failed to mark inode %lu dirty",
2218 inode->i_ino);
2219 if (!err)
2220 err = err2;
2221 }
2222 return err;
2223}
2224
2225/*
2226 * Calculate the total number of credits to reserve for one writepages
2227 * iteration. This is called from ext4_writepages(). We map an extent of
2228 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2229 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2230 * bpp - 1 blocks in bpp different extents.
2231 */
2232static int ext4_da_writepages_trans_blocks(struct inode *inode)
2233{
2234 int bpp = ext4_journal_blocks_per_page(inode);
2235
2236 return ext4_meta_trans_blocks(inode,
2237 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2238}
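
The credit calculation above reserves enough to map MAX_WRITEPAGES_EXTENT_LEN blocks plus the bpp - 1 blocks that may be needed to finish the last partial page, spread over at most bpp extents. With hypothetical 4 KiB pages and 1 KiB blocks (bpp = 4), the arguments handed to ext4_meta_trans_blocks() come out as:

#include <stdio.h>

#define MAX_WRITEPAGES_EXTENT_LEN 2048	/* mirrors the definition above */

int main(void)
{
	int bpp = 4096 / 1024;	/* blocks per page: 4 KiB pages, 1 KiB blocks */

	/* the full extent plus the partial last page, in at most bpp extents */
	int lblocks = MAX_WRITEPAGES_EXTENT_LEN + bpp - 1;
	int pextents = bpp;

	printf("lblocks=%d pextents=%d\n", lblocks, pextents);	/* 2051, 4 */
	return 0;
}
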
2239
2240/*
2241 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2242 * and underlying extent to map
2243 *
2244 * @mpd - where to look for pages
2245 *
2246 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2247 * IO immediately. When we find a page which isn't mapped we start accumulating
2248 * extent of buffers underlying these pages that needs mapping (formed by
2249 * either delayed or unwritten buffers). We also lock the pages containing
2250 * these buffers. The extent found is returned in @mpd structure (starting at
2251 * mpd->lblk with length mpd->len blocks).
2252 *
2253 * Note that this function can attach bios to one io_end structure which are
2254 * neither logically nor physically contiguous. Although it may seem as an
2255 * unnecessary complication, it is actually inevitable in blocksize < pagesize
2256 * case as we need to track IO to all buffers underlying a page in one io_end.
2257 */
2258static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2259{
2260 struct address_space *mapping = mpd->inode->i_mapping;
2261 struct pagevec pvec;
2262 unsigned int nr_pages;
2263 pgoff_t index = mpd->first_page;
2264 pgoff_t end = mpd->last_page;
2265 int tag;
2266 int i, err = 0;
2267 int blkbits = mpd->inode->i_blkbits;
2268 ext4_lblk_t lblk;
2269 struct buffer_head *head;
2270
2271 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2297 tag = PAGECACHE_TAG_TOWRITE; 2272 tag = PAGECACHE_TAG_TOWRITE;
2298 else 2273 else
2299 tag = PAGECACHE_TAG_DIRTY; 2274 tag = PAGECACHE_TAG_DIRTY;
2300 2275
2301 *done_index = index; 2276 pagevec_init(&pvec, 0);
2277 mpd->map.m_len = 0;
2278 mpd->next_page = index;
2302 while (index <= end) { 2279 while (index <= end) {
2303 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2280 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2304 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2281 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2305 if (nr_pages == 0) 2282 if (nr_pages == 0)
2306 return 0; 2283 goto out;
2307 2284
2308 for (i = 0; i < nr_pages; i++) { 2285 for (i = 0; i < nr_pages; i++) {
2309 struct page *page = pvec.pages[i]; 2286 struct page *page = pvec.pages[i];
@@ -2318,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle,
2318 if (page->index > end) 2295 if (page->index > end)
2319 goto out; 2296 goto out;
2320 2297
2321 *done_index = page->index + 1; 2298 /* If we can't merge this page, we are done. */
2322 2299 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2323 /* 2300 goto out;
2324 * If we can't merge this page, and we have
2325 * accumulated an contiguous region, write it
2326 */
2327 if ((mpd->next_page != page->index) &&
2328 (mpd->next_page != mpd->first_page)) {
2329 mpage_da_map_and_submit(mpd);
2330 goto ret_extent_tail;
2331 }
2332 2301
2333 lock_page(page); 2302 lock_page(page);
2334
2335 /* 2303 /*
2336 * If the page is no longer dirty, or its 2304 * If the page is no longer dirty, or its mapping no
2337 * mapping no longer corresponds to inode we 2305 * longer corresponds to inode we are writing (which
2338 * are writing (which means it has been 2306 * means it has been truncated or invalidated), or the
2339 * truncated or invalidated), or the page is 2307 * page is already under writeback and we are not doing
2340 * already under writeback and we are not 2308 * a data integrity writeback, skip the page
2341 * doing a data integrity writeback, skip the page
2342 */ 2309 */
2343 if (!PageDirty(page) || 2310 if (!PageDirty(page) ||
2344 (PageWriteback(page) && 2311 (PageWriteback(page) &&
2345 (wbc->sync_mode == WB_SYNC_NONE)) || 2312 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2346 unlikely(page->mapping != mapping)) { 2313 unlikely(page->mapping != mapping)) {
2347 unlock_page(page); 2314 unlock_page(page);
2348 continue; 2315 continue;
@@ -2351,106 +2318,70 @@ static int write_cache_pages_da(handle_t *handle,
2351 wait_on_page_writeback(page); 2318 wait_on_page_writeback(page);
2352 BUG_ON(PageWriteback(page)); 2319 BUG_ON(PageWriteback(page));
2353 2320
2354 /* 2321 if (mpd->map.m_len == 0)
2355 * If we have inline data and arrive here, it means that
2356 * we will soon create the block for the 1st page, so
2357 * we'd better clear the inline data here.
2358 */
2359 if (ext4_has_inline_data(inode)) {
2360 BUG_ON(ext4_test_inode_state(inode,
2361 EXT4_STATE_MAY_INLINE_DATA));
2362 ext4_destroy_inline_data(handle, inode);
2363 }
2364
2365 if (mpd->next_page != page->index)
2366 mpd->first_page = page->index; 2322 mpd->first_page = page->index;
2367 mpd->next_page = page->index + 1; 2323 mpd->next_page = page->index + 1;
2368 logical = (sector_t) page->index <<
2369 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2370
2371 /* Add all dirty buffers to mpd */ 2324 /* Add all dirty buffers to mpd */
2325 lblk = ((ext4_lblk_t)page->index) <<
2326 (PAGE_CACHE_SHIFT - blkbits);
2372 head = page_buffers(page); 2327 head = page_buffers(page);
2373 bh = head; 2328 if (!add_page_bufs_to_extent(mpd, head, head, lblk))
2374 do { 2329 goto out;
2375 BUG_ON(buffer_locked(bh)); 2330 /* So far everything mapped? Submit the page for IO. */
2376 /* 2331 if (mpd->map.m_len == 0) {
2377 * We need to try to allocate unmapped blocks 2332 err = mpage_submit_page(mpd, page);
2378 * in the same page. Otherwise we won't make 2333 if (err < 0)
2379 * progress with the page in ext4_writepage
2380 */
2381 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2382 mpage_add_bh_to_extent(mpd, logical,
2383 bh->b_state);
2384 if (mpd->io_done)
2385 goto ret_extent_tail;
2386 } else if (buffer_dirty(bh) &&
2387 buffer_mapped(bh)) {
2388 /*
2389 * mapped dirty buffer. We need to
2390 * update the b_state because we look
2391 * at b_state in mpage_da_map_blocks.
2392 * We don't update b_size because if we
2393 * find an unmapped buffer_head later
2394 * we need to use the b_state flag of
2395 * that buffer_head.
2396 */
2397 if (mpd->b_size == 0)
2398 mpd->b_state =
2399 bh->b_state & BH_FLAGS;
2400 }
2401 logical++;
2402 } while ((bh = bh->b_this_page) != head);
2403
2404 if (nr_to_write > 0) {
2405 nr_to_write--;
2406 if (nr_to_write == 0 &&
2407 wbc->sync_mode == WB_SYNC_NONE)
2408 /*
2409 * We stop writing back only if we are
2410 * not doing integrity sync. In case of
2411 * integrity sync we have to keep going
2412 * because someone may be concurrently
2413 * dirtying pages, and we might have
2414 * synced a lot of newly appeared dirty
2415 * pages, but have not synced all of the
2416 * old dirty pages.
2417 */
2418 goto out; 2334 goto out;
2419 } 2335 }
2336
2337 /*
2338 * Accumulated enough dirty pages? This doesn't apply
2339 * to WB_SYNC_ALL mode. For integrity sync we have to
2340 * keep going because someone may be concurrently
2341 * dirtying pages, and we might have synced a lot of
2342 * newly appeared dirty pages, but have not synced all
2343 * of the old dirty pages.
2344 */
2345 if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2346 mpd->next_page - mpd->first_page >=
2347 mpd->wbc->nr_to_write)
2348 goto out;
2420 } 2349 }
2421 pagevec_release(&pvec); 2350 pagevec_release(&pvec);
2422 cond_resched(); 2351 cond_resched();
2423 } 2352 }
2424 return 0; 2353 return 0;
2425ret_extent_tail:
2426 ret = MPAGE_DA_EXTENT_TAIL;
2427out: 2354out:
2428 pagevec_release(&pvec); 2355 pagevec_release(&pvec);
2429 cond_resched(); 2356 return err;
2430 return ret;
2431} 2357}
2432 2358
2359static int __writepage(struct page *page, struct writeback_control *wbc,
2360 void *data)
2361{
2362 struct address_space *mapping = data;
2363 int ret = ext4_writepage(page, wbc);
2364 mapping_set_error(mapping, ret);
2365 return ret;
2366}
2433 2367
2434static int ext4_da_writepages(struct address_space *mapping, 2368static int ext4_writepages(struct address_space *mapping,
2435 struct writeback_control *wbc) 2369 struct writeback_control *wbc)
2436{ 2370{
2437 pgoff_t index; 2371 pgoff_t writeback_index = 0;
2372 long nr_to_write = wbc->nr_to_write;
2438 int range_whole = 0; 2373 int range_whole = 0;
2374 int cycled = 1;
2439 handle_t *handle = NULL; 2375 handle_t *handle = NULL;
2440 struct mpage_da_data mpd; 2376 struct mpage_da_data mpd;
2441 struct inode *inode = mapping->host; 2377 struct inode *inode = mapping->host;
2442 int pages_written = 0; 2378 int needed_blocks, rsv_blocks = 0, ret = 0;
2443 unsigned int max_pages;
2444 int range_cyclic, cycled = 1, io_done = 0;
2445 int needed_blocks, ret = 0;
2446 long desired_nr_to_write, nr_to_writebump = 0;
2447 loff_t range_start = wbc->range_start;
2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2379 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2449 pgoff_t done_index = 0; 2380 bool done;
2450 pgoff_t end;
2451 struct blk_plug plug; 2381 struct blk_plug plug;
2382 bool give_up_on_write = false;
2452 2383
2453 trace_ext4_da_writepages(inode, wbc); 2384 trace_ext4_writepages(inode, wbc);
2454 2385
2455 /* 2386 /*
2456 * No pages to write? This is mainly a kludge to avoid starting 2387 * No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2391,165 @@ static int ext4_da_writepages(struct address_space *mapping,
2460 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2391 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2461 return 0; 2392 return 0;
2462 2393
2394 if (ext4_should_journal_data(inode)) {
2395 struct blk_plug plug;
2396 int ret;
2397
2398 blk_start_plug(&plug);
2399 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2400 blk_finish_plug(&plug);
2401 return ret;
2402 }
2403
2463 /* 2404 /*
2464 * If the filesystem has aborted, it is read-only, so return 2405 * If the filesystem has aborted, it is read-only, so return
2465 * right away instead of dumping stack traces later on that 2406 * right away instead of dumping stack traces later on that
2466 * will obscure the real source of the problem. We test 2407 * will obscure the real source of the problem. We test
2467 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2408 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2468 * the latter could be true if the filesystem is mounted 2409 * the latter could be true if the filesystem is mounted
2469 * read-only, and in that case, ext4_da_writepages should 2410 * read-only, and in that case, ext4_writepages should
2470 * *never* be called, so if that ever happens, we would want 2411 * *never* be called, so if that ever happens, we would want
2471 * the stack trace. 2412 * the stack trace.
2472 */ 2413 */
2473 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2414 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2474 return -EROFS; 2415 return -EROFS;
2475 2416
2417 if (ext4_should_dioread_nolock(inode)) {
2418 /*
2419 * We may need to convert up to one extent per block in
2420 * the page and we may dirty the inode.
2421 */
2422 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
2423 }
2424
2425 /*
2426 * If we have inline data and arrive here, it means that
2427 * we will soon create the block for the 1st page, so
2428 * we'd better clear the inline data here.
2429 */
2430 if (ext4_has_inline_data(inode)) {
2431 /* Just inode will be modified... */
2432 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2433 if (IS_ERR(handle)) {
2434 ret = PTR_ERR(handle);
2435 goto out_writepages;
2436 }
2437 BUG_ON(ext4_test_inode_state(inode,
2438 EXT4_STATE_MAY_INLINE_DATA));
2439 ext4_destroy_inline_data(handle, inode);
2440 ext4_journal_stop(handle);
2441 }
2442
2476 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2443 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2477 range_whole = 1; 2444 range_whole = 1;
2478 2445
2479 range_cyclic = wbc->range_cyclic;
2480 if (wbc->range_cyclic) { 2446 if (wbc->range_cyclic) {
2481 index = mapping->writeback_index; 2447 writeback_index = mapping->writeback_index;
2482 if (index) 2448 if (writeback_index)
2483 cycled = 0; 2449 cycled = 0;
2484 wbc->range_start = index << PAGE_CACHE_SHIFT; 2450 mpd.first_page = writeback_index;
2485 wbc->range_end = LLONG_MAX; 2451 mpd.last_page = -1;
2486 wbc->range_cyclic = 0;
2487 end = -1;
2488 } else { 2452 } else {
2489 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2453 mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
2490 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2454 mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2491 }
2492
2493 /*
2494 * This works around two forms of stupidity. The first is in
2495 * the writeback code, which caps the maximum number of pages
2496 * written to be 1024 pages. This is wrong on multiple
2497 * levels; different architectures have a different page size,
2498 * which changes the maximum amount of data which gets
2499 * written. Secondly, 4 megabytes is way too small. XFS
2500 * forces this value to be 16 megabytes by multiplying
2501 * nr_to_write parameter by four, and then relies on its
2502 * allocator to allocate larger extents to make them
2503 * contiguous. Unfortunately this brings us to the second
2504 * stupidity, which is that ext4's mballoc code only allocates
2505 * at most 2048 blocks. So we force contiguous writes up to
2506 * the number of dirty blocks in the inode, or
2507 * sbi->max_writeback_mb_bump whichever is smaller.
2508 */
2509 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2510 if (!range_cyclic && range_whole) {
2511 if (wbc->nr_to_write == LONG_MAX)
2512 desired_nr_to_write = wbc->nr_to_write;
2513 else
2514 desired_nr_to_write = wbc->nr_to_write * 8;
2515 } else
2516 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2517 max_pages);
2518 if (desired_nr_to_write > max_pages)
2519 desired_nr_to_write = max_pages;
2520
2521 if (wbc->nr_to_write < desired_nr_to_write) {
2522 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2523 wbc->nr_to_write = desired_nr_to_write;
2524 } 2455 }
2525 2456
2457 mpd.inode = inode;
2458 mpd.wbc = wbc;
2459 ext4_io_submit_init(&mpd.io_submit, wbc);
2526retry: 2460retry:
2527 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2461 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2528 tag_pages_for_writeback(mapping, index, end); 2462 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2529 2463 done = false;
2530 blk_start_plug(&plug); 2464 blk_start_plug(&plug);
2531 while (!ret && wbc->nr_to_write > 0) { 2465 while (!done && mpd.first_page <= mpd.last_page) {
2466 /* For each extent of pages we use new io_end */
2467 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2468 if (!mpd.io_submit.io_end) {
2469 ret = -ENOMEM;
2470 break;
2471 }
2532 2472
2533 /* 2473 /*
2534 * we insert one extent at a time. So we need 2474 * We have two constraints: We find one extent to map and we
2535 * credit needed for single extent allocation. 2475 * must always write out whole page (makes a difference when
2536 * journalled mode is currently not supported 2476 * blocksize < pagesize) so that we don't block on IO when we
2537 * by delalloc 2477 * try to write out the rest of the page. Journalled mode is
2478 * not supported by delalloc.
2538 */ 2479 */
2539 BUG_ON(ext4_should_journal_data(inode)); 2480 BUG_ON(ext4_should_journal_data(inode));
2540 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2481 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2541 2482
2542 /* start a new transaction*/ 2483 /* start a new transaction */
2543 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2484 handle = ext4_journal_start_with_reserve(inode,
2544 needed_blocks); 2485 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2545 if (IS_ERR(handle)) { 2486 if (IS_ERR(handle)) {
2546 ret = PTR_ERR(handle); 2487 ret = PTR_ERR(handle);
2547 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2488 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2548 "%ld pages, ino %lu; err %d", __func__, 2489 "%ld pages, ino %lu; err %d", __func__,
2549 wbc->nr_to_write, inode->i_ino, ret); 2490 wbc->nr_to_write, inode->i_ino, ret);
2550 blk_finish_plug(&plug); 2491 /* Release allocated io_end */
2551 goto out_writepages; 2492 ext4_put_io_end(mpd.io_submit.io_end);
2493 break;
2552 } 2494 }
2553 2495
2554 /* 2496 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2555 * Now call write_cache_pages_da() to find the next 2497 ret = mpage_prepare_extent_to_map(&mpd);
2556 * contiguous region of logical blocks that need 2498 if (!ret) {
2557 * blocks to be allocated by ext4 and submit them. 2499 if (mpd.map.m_len)
2558 */ 2500 ret = mpage_map_and_submit_extent(handle, &mpd,
2559 ret = write_cache_pages_da(handle, mapping, 2501 &give_up_on_write);
2560 wbc, &mpd, &done_index); 2502 else {
2561 /* 2503 /*
2562 * If we have a contiguous extent of pages and we 2504 * We scanned the whole range (or exhausted
2563 * haven't done the I/O yet, map the blocks and submit 2505 * nr_to_write), submitted what was mapped and
2564 * them for I/O. 2506 * didn't find anything needing mapping. We are
2565 */ 2507 * done.
2566 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2508 */
2567 mpage_da_map_and_submit(&mpd); 2509 done = true;
2568 ret = MPAGE_DA_EXTENT_TAIL; 2510 }
2569 } 2511 }
2570 trace_ext4_da_write_pages(inode, &mpd);
2571 wbc->nr_to_write -= mpd.pages_written;
2572
2573 ext4_journal_stop(handle); 2512 ext4_journal_stop(handle);
2574 2513 /* Submit prepared bio */
2575 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2514 ext4_io_submit(&mpd.io_submit);
2576 /* commit the transaction which would 2515 /* Unlock pages we didn't use */
2516 mpage_release_unused_pages(&mpd, give_up_on_write);
2517 /* Drop our io_end reference we got from init */
2518 ext4_put_io_end(mpd.io_submit.io_end);
2519
2520 if (ret == -ENOSPC && sbi->s_journal) {
2521 /*
2522 * Commit the transaction which would
2577 * free blocks released in the transaction 2523 * free blocks released in the transaction
2578 * and try again 2524 * and try again
2579 */ 2525 */
2580 jbd2_journal_force_commit_nested(sbi->s_journal); 2526 jbd2_journal_force_commit_nested(sbi->s_journal);
2581 ret = 0; 2527 ret = 0;
2582 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2528 continue;
2583 /* 2529 }
2584 * Got one extent now try with rest of the pages. 2530 /* Fatal error - ENOMEM, EIO... */
2585 * If mpd.retval is set -EIO, journal is aborted. 2531 if (ret)
2586 * So we don't need to write any more.
2587 */
2588 pages_written += mpd.pages_written;
2589 ret = mpd.retval;
2590 io_done = 1;
2591 } else if (wbc->nr_to_write)
2592 /*
2593 * There is no more writeout needed
2594 * or we requested for a noblocking writeout
2595 * and we found the device congested
2596 */
2597 break; 2532 break;
2598 } 2533 }
2599 blk_finish_plug(&plug); 2534 blk_finish_plug(&plug);
2600 if (!io_done && !cycled) { 2535 if (!ret && !cycled) {
2601 cycled = 1; 2536 cycled = 1;
2602 index = 0; 2537 mpd.last_page = writeback_index - 1;
2603 wbc->range_start = index << PAGE_CACHE_SHIFT; 2538 mpd.first_page = 0;
2604 wbc->range_end = mapping->writeback_index - 1;
2605 goto retry; 2539 goto retry;
2606 } 2540 }
2607 2541
2608 /* Update index */ 2542 /* Update index */
2609 wbc->range_cyclic = range_cyclic;
2610 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2543 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2611 /* 2544 /*
2612 * set the writeback_index so that range_cyclic 2545 * Set the writeback_index so that range_cyclic
2613 * mode will write it back later 2546 * mode will write it back later
2614 */ 2547 */
2615 mapping->writeback_index = done_index; 2548 mapping->writeback_index = mpd.first_page;
2616 2549
2617out_writepages: 2550out_writepages:
2618 wbc->nr_to_write -= nr_to_writebump; 2551 trace_ext4_writepages_result(inode, wbc, ret,
2619 wbc->range_start = range_start; 2552 nr_to_write - wbc->nr_to_write);
2620 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2621 return ret; 2553 return ret;
2622} 2554}
2623 2555
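The main loop of the rewritten ext4_writepages() above follows a fixed shape: start a handle, map and submit one extent, stop the handle, and retry on ENOSPC after forcing a journal commit. A minimal self-contained sketch of that control flow, with stand-in functions instead of the kernel APIs:

#include <errno.h>

/* Stand-ins so the sketch compiles; not the kernel functions. */
static int start_handle(void) { return 0; }
static void stop_handle(void) { }
static int map_and_submit(void) { return 0; }
static void force_journal_commit(void) { }

/*
 * ENOSPC forces a journal commit (which may free blocks released in
 * committed transactions) and retries the extent; any other error
 * aborts this writeback pass.
 */
static int writepages_loop(int extents_left)
{
        int ret = 0;

        while (extents_left-- > 0) {
                ret = start_handle();
                if (ret)
                        break;
                ret = map_and_submit();
                stop_handle();
                if (ret == -ENOSPC) {
                        force_journal_commit();
                        ret = 0;
                        continue;
                }
                if (ret)
                        break;
        }
        return ret;
}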
@@ -2829,7 +2761,8 @@ static int ext4_da_write_end(struct file *file,
2829 return ret ? ret : copied; 2761 return ret ? ret : copied;
2830} 2762}
2831 2763
2832static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2764static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
2765 unsigned int length)
2833{ 2766{
2834 /* 2767 /*
2835 * Drop reserved blocks 2768 * Drop reserved blocks
@@ -2838,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2838 if (!page_has_buffers(page)) 2771 if (!page_has_buffers(page))
2839 goto out; 2772 goto out;
2840 2773
2841 ext4_da_page_release_reservation(page, offset); 2774 ext4_da_page_release_reservation(page, offset, length);
2842 2775
2843out: 2776out:
2844 ext4_invalidatepage(page, offset); 2777 ext4_invalidatepage(page, offset, length);
2845 2778
2846 return; 2779 return;
2847} 2780}
@@ -2864,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2864 * laptop_mode, not even desirable). However, to do otherwise 2797 * laptop_mode, not even desirable). However, to do otherwise
2865 * would require replicating code paths in: 2798 * would require replicating code paths in:
2866 * 2799 *
2867 * ext4_da_writepages() -> 2800 * ext4_writepages() ->
2868 * write_cache_pages() ---> (via passed in callback function) 2801 * write_cache_pages() ---> (via passed in callback function)
2869 * __mpage_da_writepage() --> 2802 * __mpage_da_writepage() -->
2870 * mpage_add_bh_to_extent() 2803 * mpage_add_bh_to_extent()
@@ -2989,37 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
2989 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2922 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2990} 2923}
2991 2924
2992static void ext4_invalidatepage(struct page *page, unsigned long offset) 2925static void ext4_invalidatepage(struct page *page, unsigned int offset,
2926 unsigned int length)
2993{ 2927{
2994 trace_ext4_invalidatepage(page, offset); 2928 trace_ext4_invalidatepage(page, offset, length);
2995 2929
2996 /* No journalling happens on data buffers when this function is used */ 2930 /* No journalling happens on data buffers when this function is used */
2997 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2931 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
2998 2932
2999 block_invalidatepage(page, offset); 2933 block_invalidatepage(page, offset, length);
3000} 2934}
3001 2935
3002static int __ext4_journalled_invalidatepage(struct page *page, 2936static int __ext4_journalled_invalidatepage(struct page *page,
3003 unsigned long offset) 2937 unsigned int offset,
2938 unsigned int length)
3004{ 2939{
3005 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2940 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3006 2941
3007 trace_ext4_journalled_invalidatepage(page, offset); 2942 trace_ext4_journalled_invalidatepage(page, offset, length);
3008 2943
3009 /* 2944 /*
3010 * If it's a full truncate we just forget about the pending dirtying 2945 * If it's a full truncate we just forget about the pending dirtying
3011 */ 2946 */
3012 if (offset == 0) 2947 if (offset == 0 && length == PAGE_CACHE_SIZE)
3013 ClearPageChecked(page); 2948 ClearPageChecked(page);
3014 2949
3015 return jbd2_journal_invalidatepage(journal, page, offset); 2950 return jbd2_journal_invalidatepage(journal, page, offset, length);
3016} 2951}
3017 2952
3018/* Wrapper for aops... */ 2953/* Wrapper for aops... */
3019static void ext4_journalled_invalidatepage(struct page *page, 2954static void ext4_journalled_invalidatepage(struct page *page,
3020 unsigned long offset) 2955 unsigned int offset,
2956 unsigned int length)
3021{ 2957{
3022 WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 2958 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
3023} 2959}
3024 2960
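The journalled invalidatepage hunks above all hinge on one new predicate: with the (offset, length) interface, pending dirtying may only be forgotten for a full-page invalidation. A minimal sketch, assuming 4k pages (PAGE_SIZE_BYTES is a stand-in for PAGE_CACHE_SIZE):

#include <stdbool.h>

#define PAGE_SIZE_BYTES 4096u   /* assumption: 4k pages */

/*
 * Analogue of the check in __ext4_journalled_invalidatepage():
 * ClearPageChecked() is only safe when the whole page goes away.
 */
static bool whole_page_invalidated(unsigned int offset, unsigned int length)
{
        return offset == 0 && length == PAGE_SIZE_BYTES;
}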
3025static int ext4_releasepage(struct page *page, gfp_t wait) 2961static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +3003,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3067 struct inode *inode = file_inode(iocb->ki_filp); 3003 struct inode *inode = file_inode(iocb->ki_filp);
3068 ext4_io_end_t *io_end = iocb->private; 3004 ext4_io_end_t *io_end = iocb->private;
3069 3005
3070 /* if not async direct IO or dio with 0 bytes write, just return */ 3006 /* if not async direct IO just return */
3071 if (!io_end || !size) 3007 if (!io_end) {
3072 goto out; 3008 inode_dio_done(inode);
3009 if (is_async)
3010 aio_complete(iocb, ret, 0);
3011 return;
3012 }
3073 3013
3074 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3014 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3075 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3015 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3077 size); 3017 size);
3078 3018
3079 iocb->private = NULL; 3019 iocb->private = NULL;
3080
3081 /* if not aio dio with unwritten extents, just free io and return */
3082 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3083 ext4_free_io_end(io_end);
3084out:
3085 inode_dio_done(inode);
3086 if (is_async)
3087 aio_complete(iocb, ret, 0);
3088 return;
3089 }
3090
3091 io_end->offset = offset; 3020 io_end->offset = offset;
3092 io_end->size = size; 3021 io_end->size = size;
3093 if (is_async) { 3022 if (is_async) {
3094 io_end->iocb = iocb; 3023 io_end->iocb = iocb;
3095 io_end->result = ret; 3024 io_end->result = ret;
3096 } 3025 }
3097 3026 ext4_put_io_end_defer(io_end);
3098 ext4_add_complete_io(io_end);
3099} 3027}
3100 3028
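The reworked ext4_end_io_dio() above has two disjoint completion paths, which the following hypothetical sketch condenses; all helpers are stand-ins defined locally, not kernel APIs:

struct io_end_stub;                     /* opaque in this sketch */

static void dio_done_stub(void) { }
static void aio_complete_stub(long res) { (void)res; }
static void put_io_end_defer_stub(struct io_end_stub *io) { (void)io; }

/*
 * Without an io_end (sync, non-AIO path) the request completes
 * inline, now even for a 0-byte write; with one, completion is
 * deferred until the last reference to the io_end is dropped.
 */
static void end_io_dio_sketch(struct io_end_stub *io, long ret, int is_async)
{
        if (!io) {
                dio_done_stub();
                if (is_async)
                        aio_complete_stub(ret);
                return;
        }
        put_io_end_defer_stub(io);
}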
3101/* 3029/*
@@ -3129,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3129 get_block_t *get_block_func = NULL; 3057 get_block_t *get_block_func = NULL;
3130 int dio_flags = 0; 3058 int dio_flags = 0;
3131 loff_t final_size = offset + count; 3059 loff_t final_size = offset + count;
3060 ext4_io_end_t *io_end = NULL;
3132 3061
3133 /* Use the old path for reads and writes beyond i_size. */ 3062 /* Use the old path for reads and writes beyond i_size. */
3134 if (rw != WRITE || final_size > inode->i_size) 3063 if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3136 3065
3137 BUG_ON(iocb->private == NULL); 3066 BUG_ON(iocb->private == NULL);
3138 3067
3068 /*
3069 * Make all waiters for direct IO properly wait also for extent
3070 * conversion. This also disallows race between truncate() and
3071 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3072 */
3073 if (rw == WRITE)
3074 atomic_inc(&inode->i_dio_count);
3075
3139 /* If we do an overwrite dio, i_mutex locking can be released */ 3076 /* If we do an overwrite dio, i_mutex locking can be released */
3140 overwrite = *((int *)iocb->private); 3077 overwrite = *((int *)iocb->private);
3141 3078
3142 if (overwrite) { 3079 if (overwrite) {
3143 atomic_inc(&inode->i_dio_count);
3144 down_read(&EXT4_I(inode)->i_data_sem); 3080 down_read(&EXT4_I(inode)->i_data_sem);
3145 mutex_unlock(&inode->i_mutex); 3081 mutex_unlock(&inode->i_mutex);
3146 } 3082 }
@@ -3167,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3167 iocb->private = NULL; 3103 iocb->private = NULL;
3168 ext4_inode_aio_set(inode, NULL); 3104 ext4_inode_aio_set(inode, NULL);
3169 if (!is_sync_kiocb(iocb)) { 3105 if (!is_sync_kiocb(iocb)) {
3170 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3106 io_end = ext4_init_io_end(inode, GFP_NOFS);
3171 if (!io_end) { 3107 if (!io_end) {
3172 ret = -ENOMEM; 3108 ret = -ENOMEM;
3173 goto retake_lock; 3109 goto retake_lock;
3174 } 3110 }
3175 io_end->flag |= EXT4_IO_END_DIRECT; 3111 io_end->flag |= EXT4_IO_END_DIRECT;
3176 iocb->private = io_end; 3112 /*
3113 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3114 */
3115 iocb->private = ext4_get_io_end(io_end);
3177 /* 3116 /*
3178 * we save the io structure for current async direct 3117 * we save the io structure for current async direct
3179 * IO, so that later ext4_map_blocks() could flag the 3118 * IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3197 NULL, 3136 NULL,
3198 dio_flags); 3137 dio_flags);
3199 3138
3200 if (iocb->private)
3201 ext4_inode_aio_set(inode, NULL);
3202 /* 3139 /*
3203 * The io_end structure takes a reference to the inode, that 3140 * Put our reference to io_end. This can free the io_end structure e.g.
3204 * structure needs to be destroyed and the reference to the 3141 * in sync IO case or in case of error. It can even perform extent
3205 * inode need to be dropped, when IO is complete, even with 0 3142 * conversion if all bios we submitted finished before we got here.
3206 * byte write, or failed. 3143 * Note that in that case iocb->private can be already set to NULL
3207 * 3144 * here.
3208 * In the successful AIO DIO case, the io_end structure will
3209 * be destroyed and the reference to the inode will be dropped
3210 * after the end_io call back function is called.
3211 *
3212 * In the case there is 0 byte write, or error case, since VFS
3213 * direct IO won't invoke the end_io call back function, we
3214 * need to free the end_io structure here.
3215 */ 3145 */
3216 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3146 if (io_end) {
3217 ext4_free_io_end(iocb->private); 3147 ext4_inode_aio_set(inode, NULL);
3218 iocb->private = NULL; 3148 ext4_put_io_end(io_end);
3219 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3149 /*
3150 * When no IO was submitted ext4_end_io_dio() was not
3151 * called so we have to put iocb's reference.
3152 */
3153 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3154 WARN_ON(iocb->private != io_end);
3155 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3156 WARN_ON(io_end->iocb);
3157 /*
3158 * Generic code already did inode_dio_done() so we
3159 * have to clear EXT4_IO_END_DIRECT to not do it for
3160 * the second time.
3161 */
3162 io_end->flag = 0;
3163 ext4_put_io_end(io_end);
3164 iocb->private = NULL;
3165 }
3166 }
3167 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3220 EXT4_STATE_DIO_UNWRITTEN)) { 3168 EXT4_STATE_DIO_UNWRITTEN)) {
3221 int err; 3169 int err;
3222 /* 3170 /*
3223 * for non AIO case, since the IO is already 3171 * for non AIO case, since the IO is already
3224 * completed, we could do the conversion right here 3172 * completed, we could do the conversion right here
3225 */ 3173 */
3226 err = ext4_convert_unwritten_extents(inode, 3174 err = ext4_convert_unwritten_extents(NULL, inode,
3227 offset, ret); 3175 offset, ret);
3228 if (err < 0) 3176 if (err < 0)
3229 ret = err; 3177 ret = err;
@@ -3231,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3231 } 3179 }
3232 3180
3233retake_lock: 3181retake_lock:
3182 if (rw == WRITE)
3183 inode_dio_done(inode);
3234 /* take i_mutex locking again if we do an overwrite dio */ 3184 /* take i_mutex locking again if we do an overwrite dio */
3235 if (overwrite) { 3185 if (overwrite) {
3236 inode_dio_done(inode);
3237 up_read(&EXT4_I(inode)->i_data_sem); 3186 up_read(&EXT4_I(inode)->i_data_sem);
3238 mutex_lock(&inode->i_mutex); 3187 mutex_lock(&inode->i_mutex);
3239 } 3188 }
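The i_dio_count movement in this hunk is easy to miss: the counter now brackets every DIO write, not just the overwrite case, so inode_dio_wait() in truncate also covers pending extent conversion. A C11 sketch of the bracketing (illustrative, not the kernel's atomics):

#include <stdatomic.h>

static atomic_long i_dio_count;  /* zero-initialized at file scope */

/* Taken before i_mutex may be dropped ... */
static void dio_write_begin(void) { atomic_fetch_add(&i_dio_count, 1); }
/* ... and released at retake_lock, for every write. */
static void dio_write_end(void)   { atomic_fetch_sub(&i_dio_count, 1); }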
@@ -3292,6 +3241,7 @@ static const struct address_space_operations ext4_aops = {
3292 .readpage = ext4_readpage, 3241 .readpage = ext4_readpage,
3293 .readpages = ext4_readpages, 3242 .readpages = ext4_readpages,
3294 .writepage = ext4_writepage, 3243 .writepage = ext4_writepage,
3244 .writepages = ext4_writepages,
3295 .write_begin = ext4_write_begin, 3245 .write_begin = ext4_write_begin,
3296 .write_end = ext4_write_end, 3246 .write_end = ext4_write_end,
3297 .bmap = ext4_bmap, 3247 .bmap = ext4_bmap,
@@ -3307,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3307 .readpage = ext4_readpage, 3257 .readpage = ext4_readpage,
3308 .readpages = ext4_readpages, 3258 .readpages = ext4_readpages,
3309 .writepage = ext4_writepage, 3259 .writepage = ext4_writepage,
3260 .writepages = ext4_writepages,
3310 .write_begin = ext4_write_begin, 3261 .write_begin = ext4_write_begin,
3311 .write_end = ext4_journalled_write_end, 3262 .write_end = ext4_journalled_write_end,
3312 .set_page_dirty = ext4_journalled_set_page_dirty, 3263 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3322,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = {
3322 .readpage = ext4_readpage, 3273 .readpage = ext4_readpage,
3323 .readpages = ext4_readpages, 3274 .readpages = ext4_readpages,
3324 .writepage = ext4_writepage, 3275 .writepage = ext4_writepage,
3325 .writepages = ext4_da_writepages, 3276 .writepages = ext4_writepages,
3326 .write_begin = ext4_da_write_begin, 3277 .write_begin = ext4_da_write_begin,
3327 .write_end = ext4_da_write_end, 3278 .write_end = ext4_da_write_end,
3328 .bmap = ext4_bmap, 3279 .bmap = ext4_bmap,
@@ -3355,89 +3306,56 @@ void ext4_set_aops(struct inode *inode)
3355 inode->i_mapping->a_ops = &ext4_aops; 3306 inode->i_mapping->a_ops = &ext4_aops;
3356} 3307}
3357 3308
3358
3359/* 3309/*
3360 * ext4_discard_partial_page_buffers() 3310 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3361 * Wrapper function for ext4_discard_partial_page_buffers_no_lock. 3311 * up to the end of the block which corresponds to `from'.
3362 * This function finds and locks the page containing the offset 3312 * This is required during truncate. We need to physically zero the tail end
3363 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. 3313 * of that block so it doesn't yield old data if the file is later grown.
3364 * Calling functions that already have the page locked should call
3365 * ext4_discard_partial_page_buffers_no_lock directly.
3366 */ 3314 */
3367int ext4_discard_partial_page_buffers(handle_t *handle, 3315int ext4_block_truncate_page(handle_t *handle,
3368 struct address_space *mapping, loff_t from, 3316 struct address_space *mapping, loff_t from)
3369 loff_t length, int flags)
3370{ 3317{
3318 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3319 unsigned length;
3320 unsigned blocksize;
3371 struct inode *inode = mapping->host; 3321 struct inode *inode = mapping->host;
3372 struct page *page;
3373 int err = 0;
3374 3322
3375 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3323 blocksize = inode->i_sb->s_blocksize;
3376 mapping_gfp_mask(mapping) & ~__GFP_FS); 3324 length = blocksize - (offset & (blocksize - 1));
3377 if (!page)
3378 return -ENOMEM;
3379
3380 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3381 from, length, flags);
3382 3325
3383 unlock_page(page); 3326 return ext4_block_zero_page_range(handle, mapping, from, length);
3384 page_cache_release(page);
3385 return err;
3386} 3327}
3387 3328
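A worked example of the length computation in the new ext4_block_truncate_page(): with a 4096-byte block and from = 10000, the in-block offset is 10000 mod 4096 = 1808, so 4096 - 1808 = 2288 bytes are zeroed. A minimal sketch, assuming a power-of-two block size:

#include <assert.h>

static unsigned tail_length(unsigned long long from, unsigned blocksize)
{
        unsigned offset = from & (blocksize - 1);

        return blocksize - offset;      /* bytes from `from' to block end */
}

/* usage: assert(tail_length(10000, 4096) == 2288); */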
3388/* 3329/*
3389 * ext4_discard_partial_page_buffers_no_lock() 3330 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3390 * Zeros a page range of length 'length' starting from offset 'from'. 3331 * starting from file offset 'from'. The range to be zero'd must
3391 * Buffer heads that correspond to the block aligned regions of the 3332 * be contained within one block. If the specified range exceeds
3392 * zeroed range will be unmapped. Non-block-aligned regions 3333 * the end of the block it will be shortened to the end of the block
3393 * will have the corresponding buffer head mapped if needed so that 3334 * that corresponds to 'from'
3394 * that region of the page can be updated with the partial zero out.
3395 *
3396 * This function assumes that the page has already been locked. The
3397 * The range to be discarded must be contained with in the given page.
3398 * If the specified range exceeds the end of the page it will be shortened
3399 * to the end of the page that corresponds to 'from'. This function is
3400 * appropriate for updating a page and it buffer heads to be unmapped and
3401 * zeroed for blocks that have been either released, or are going to be
3402 * released.
3403 *
3404 * handle: The journal handle
3405 * inode: The file's inode
3406 * page: A locked page that contains the offset "from"
3407 * from: The starting byte offset (from the beginning of the file)
3408 * to begin discarding
3409 * len: The length of bytes to discard
3410 * flags: Optional flags that may be used:
3411 *
3412 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3413 * Only zero the regions of the page whose buffer heads
3414 * have already been unmapped. This flag is appropriate
3415 * for updating the contents of a page whose blocks may
3416 * have already been released, and we only want to zero
3417 * out the regions that correspond to those released blocks.
3418 *
3419 * Returns zero on success or negative on failure.
3420 */ 3335 */
3421static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3336int ext4_block_zero_page_range(handle_t *handle,
3422 struct inode *inode, struct page *page, loff_t from, 3337 struct address_space *mapping, loff_t from, loff_t length)
3423 loff_t length, int flags)
3424{ 3338{
3425 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3339 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3426 unsigned int offset = from & (PAGE_CACHE_SIZE-1); 3340 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3427 unsigned int blocksize, max, pos; 3341 unsigned blocksize, max, pos;
3428 ext4_lblk_t iblock; 3342 ext4_lblk_t iblock;
3343 struct inode *inode = mapping->host;
3429 struct buffer_head *bh; 3344 struct buffer_head *bh;
3345 struct page *page;
3430 int err = 0; 3346 int err = 0;
3431 3347
3432 blocksize = inode->i_sb->s_blocksize; 3348 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3433 max = PAGE_CACHE_SIZE - offset; 3349 mapping_gfp_mask(mapping) & ~__GFP_FS);
3350 if (!page)
3351 return -ENOMEM;
3434 3352
3435 if (index != page->index) 3353 blocksize = inode->i_sb->s_blocksize;
3436 return -EINVAL; 3354 max = blocksize - (offset & (blocksize - 1));
3437 3355
3438 /* 3356 /*
3439 * correct length if it does not fall between 3357 * correct length if it does not fall between
3440 * 'from' and the end of the page 3358 * 'from' and the end of the block
3441 */ 3359 */
3442 if (length > max || length < 0) 3360 if (length > max || length < 0)
3443 length = max; 3361 length = max;
@@ -3455,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3455 iblock++; 3373 iblock++;
3456 pos += blocksize; 3374 pos += blocksize;
3457 } 3375 }
3458 3376 if (buffer_freed(bh)) {
3459 pos = offset; 3377 BUFFER_TRACE(bh, "freed: skip");
3460 while (pos < offset + length) { 3378 goto unlock;
3461 unsigned int end_of_block, range_to_discard; 3379 }
3462 3380 if (!buffer_mapped(bh)) {
3463 err = 0; 3381 BUFFER_TRACE(bh, "unmapped");
3464 3382 ext4_get_block(inode, iblock, bh, 0);
3465 /* The length of space left to zero and unmap */ 3383 /* unmapped? It's a hole - nothing to do */
3466 range_to_discard = offset + length - pos;
3467
3468 /* The length of space until the end of the block */
3469 end_of_block = blocksize - (pos & (blocksize-1));
3470
3471 /*
3472 * Do not unmap or zero past end of block
3473 * for this buffer head
3474 */
3475 if (range_to_discard > end_of_block)
3476 range_to_discard = end_of_block;
3477
3478
3479 /*
3480 * Skip this buffer head if we are only zeroing unmapped
3481 * regions of the page
3482 */
3483 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3484 buffer_mapped(bh))
3485 goto next;
3486
3487 /* If the range is block aligned, unmap */
3488 if (range_to_discard == blocksize) {
3489 clear_buffer_dirty(bh);
3490 bh->b_bdev = NULL;
3491 clear_buffer_mapped(bh);
3492 clear_buffer_req(bh);
3493 clear_buffer_new(bh);
3494 clear_buffer_delay(bh);
3495 clear_buffer_unwritten(bh);
3496 clear_buffer_uptodate(bh);
3497 zero_user(page, pos, range_to_discard);
3498 BUFFER_TRACE(bh, "Buffer discarded");
3499 goto next;
3500 }
3501
3502 /*
3503 * If this block is not completely contained in the range
3504 * to be discarded, then it is not going to be released. Because
3505 * we need to keep this block, we need to make sure this part
3506 * of the page is uptodate before we modify it by writing
3507 * partial zeros on it.
3508 */
3509 if (!buffer_mapped(bh)) { 3384 if (!buffer_mapped(bh)) {
3510 /* 3385 BUFFER_TRACE(bh, "still unmapped");
3511 * Buffer head must be mapped before we can read 3386 goto unlock;
3512 * from the block
3513 */
3514 BUFFER_TRACE(bh, "unmapped");
3515 ext4_get_block(inode, iblock, bh, 0);
3516 /* unmapped? It's a hole - nothing to do */
3517 if (!buffer_mapped(bh)) {
3518 BUFFER_TRACE(bh, "still unmapped");
3519 goto next;
3520 }
3521 } 3387 }
3388 }
3522 3389
3523 /* Ok, it's mapped. Make sure it's up-to-date */ 3390 /* Ok, it's mapped. Make sure it's up-to-date */
3524 if (PageUptodate(page)) 3391 if (PageUptodate(page))
3525 set_buffer_uptodate(bh); 3392 set_buffer_uptodate(bh);
3526 3393
3527 if (!buffer_uptodate(bh)) { 3394 if (!buffer_uptodate(bh)) {
3528 err = -EIO; 3395 err = -EIO;
3529 ll_rw_block(READ, 1, &bh); 3396 ll_rw_block(READ, 1, &bh);
3530 wait_on_buffer(bh); 3397 wait_on_buffer(bh);
3531 /* Uhhuh. Read error. Complain and punt.*/ 3398 /* Uhhuh. Read error. Complain and punt. */
3532 if (!buffer_uptodate(bh)) 3399 if (!buffer_uptodate(bh))
3533 goto next; 3400 goto unlock;
3534 } 3401 }
3402 if (ext4_should_journal_data(inode)) {
3403 BUFFER_TRACE(bh, "get write access");
3404 err = ext4_journal_get_write_access(handle, bh);
3405 if (err)
3406 goto unlock;
3407 }
3408 zero_user(page, offset, length);
3409 BUFFER_TRACE(bh, "zeroed end of block");
3535 3410
3536 if (ext4_should_journal_data(inode)) { 3411 if (ext4_should_journal_data(inode)) {
3537 BUFFER_TRACE(bh, "get write access"); 3412 err = ext4_handle_dirty_metadata(handle, inode, bh);
3538 err = ext4_journal_get_write_access(handle, bh); 3413 } else {
3539 if (err) 3414 err = 0;
3540 goto next; 3415 mark_buffer_dirty(bh);
3541 } 3416 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
3417 err = ext4_jbd2_file_inode(handle, inode);
3418 }
3419
3420unlock:
3421 unlock_page(page);
3422 page_cache_release(page);
3423 return err;
3424}
3542 3425
3543 zero_user(page, pos, range_to_discard); 3426int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3427 loff_t lstart, loff_t length)
3428{
3429 struct super_block *sb = inode->i_sb;
3430 struct address_space *mapping = inode->i_mapping;
3431 unsigned partial_start, partial_end;
3432 ext4_fsblk_t start, end;
3433 loff_t byte_end = (lstart + length - 1);
3434 int err = 0;
3544 3435
3545 err = 0; 3436 partial_start = lstart & (sb->s_blocksize - 1);
3546 if (ext4_should_journal_data(inode)) { 3437 partial_end = byte_end & (sb->s_blocksize - 1);
3547 err = ext4_handle_dirty_metadata(handle, inode, bh);
3548 } else
3549 mark_buffer_dirty(bh);
3550 3438
3551 BUFFER_TRACE(bh, "Partial buffer zeroed"); 3439 start = lstart >> sb->s_blocksize_bits;
3552next: 3440 end = byte_end >> sb->s_blocksize_bits;
3553 bh = bh->b_this_page;
3554 iblock++;
3555 pos += range_to_discard;
3556 }
3557 3441
3442 /* Handle partial zero within the single block */
3443 if (start == end &&
3444 (partial_start || (partial_end != sb->s_blocksize - 1))) {
3445 err = ext4_block_zero_page_range(handle, mapping,
3446 lstart, length);
3447 return err;
3448 }
3449 /* Handle partial zero out on the start of the range */
3450 if (partial_start) {
3451 err = ext4_block_zero_page_range(handle, mapping,
3452 lstart, sb->s_blocksize);
3453 if (err)
3454 return err;
3455 }
3456 /* Handle partial zero out on the end of the range */
3457 if (partial_end != sb->s_blocksize - 1)
3458 err = ext4_block_zero_page_range(handle, mapping,
3459 byte_end - partial_end,
3460 partial_end + 1);
3558 return err; 3461 return err;
3559} 3462}
3560 3463
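The case split in the new ext4_zero_partial_blocks() can be stated compactly: for a range [lstart, lstart + length), either both edges fall in one block, or a partial head block and/or a partial tail block each need zeroing. A minimal userspace sketch, assuming a power-of-two block size:

#include <stdio.h>

static void classify_partial(unsigned long long lstart,
                             unsigned long long length, unsigned bs)
{
        unsigned long long byte_end = lstart + length - 1;
        unsigned partial_start = lstart & (bs - 1);
        unsigned partial_end = byte_end & (bs - 1);

        if (lstart / bs == byte_end / bs &&
            (partial_start || partial_end != bs - 1)) {
                printf("zero a single in-block range\n");
                return;
        }
        if (partial_start)
                printf("zero head: %u bytes to block end\n",
                       bs - partial_start);
        if (partial_end != bs - 1)
                printf("zero tail: %u bytes from block start\n",
                       partial_end + 1);
}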
@@ -3580,14 +3483,12 @@ int ext4_can_truncate(struct inode *inode)
3580 * Returns: 0 on success or negative on failure 3483 * Returns: 0 on success or negative on failure
3581 */ 3484 */
3582 3485
3583int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3486int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3584{ 3487{
3585 struct inode *inode = file_inode(file);
3586 struct super_block *sb = inode->i_sb; 3488 struct super_block *sb = inode->i_sb;
3587 ext4_lblk_t first_block, stop_block; 3489 ext4_lblk_t first_block, stop_block;
3588 struct address_space *mapping = inode->i_mapping; 3490 struct address_space *mapping = inode->i_mapping;
3589 loff_t first_page, last_page, page_len; 3491 loff_t first_block_offset, last_block_offset;
3590 loff_t first_page_offset, last_page_offset;
3591 handle_t *handle; 3492 handle_t *handle;
3592 unsigned int credits; 3493 unsigned int credits;
3593 int ret = 0; 3494 int ret = 0;
@@ -3638,23 +3539,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3638 offset; 3539 offset;
3639 } 3540 }
3640 3541
3641 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 3542 first_block_offset = round_up(offset, sb->s_blocksize);
3642 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 3543 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3643 3544
3644 first_page_offset = first_page << PAGE_CACHE_SHIFT; 3545 /* Now release the pages and zero block aligned part of pages*/
3645 last_page_offset = last_page << PAGE_CACHE_SHIFT; 3546 if (last_block_offset > first_block_offset)
3646 3547 truncate_pagecache_range(inode, first_block_offset,
3647 /* Now release the pages */ 3548 last_block_offset);
3648 if (last_page_offset > first_page_offset) {
3649 truncate_pagecache_range(inode, first_page_offset,
3650 last_page_offset - 1);
3651 }
3652 3549
3653 /* Wait all existing dio workers, newcomers will block on i_mutex */ 3550 /* Wait all existing dio workers, newcomers will block on i_mutex */
3654 ext4_inode_block_unlocked_dio(inode); 3551 ext4_inode_block_unlocked_dio(inode);
3655 ret = ext4_flush_unwritten_io(inode);
3656 if (ret)
3657 goto out_dio;
3658 inode_dio_wait(inode); 3552 inode_dio_wait(inode);
3659 3553
3660 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3554 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3562,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3668 goto out_dio; 3562 goto out_dio;
3669 } 3563 }
3670 3564
3671 /* 3565 ret = ext4_zero_partial_blocks(handle, inode, offset,
3672 * Now we need to zero out the non-page-aligned data in the 3566 length);
3673 * pages at the start and tail of the hole, and unmap the 3567 if (ret)
3674 * buffer heads for the block aligned regions of the page that 3568 goto out_stop;
3675 * were completely zeroed.
3676 */
3677 if (first_page > last_page) {
3678 /*
3679 * If the file space being truncated is contained
3680 * within a page just zero out and unmap the middle of
3681 * that page
3682 */
3683 ret = ext4_discard_partial_page_buffers(handle,
3684 mapping, offset, length, 0);
3685
3686 if (ret)
3687 goto out_stop;
3688 } else {
3689 /*
3690 * zero out and unmap the partial page that contains
3691 * the start of the hole
3692 */
3693 page_len = first_page_offset - offset;
3694 if (page_len > 0) {
3695 ret = ext4_discard_partial_page_buffers(handle, mapping,
3696 offset, page_len, 0);
3697 if (ret)
3698 goto out_stop;
3699 }
3700
3701 /*
3702 * zero out and unmap the partial page that contains
3703 * the end of the hole
3704 */
3705 page_len = offset + length - last_page_offset;
3706 if (page_len > 0) {
3707 ret = ext4_discard_partial_page_buffers(handle, mapping,
3708 last_page_offset, page_len, 0);
3709 if (ret)
3710 goto out_stop;
3711 }
3712 }
3713
3714 /*
3715 * If i_size is contained in the last page, we need to
3716 * unmap and zero the partial page after i_size
3717 */
3718 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3719 inode->i_size % PAGE_CACHE_SIZE != 0) {
3720 page_len = PAGE_CACHE_SIZE -
3721 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3722
3723 if (page_len > 0) {
3724 ret = ext4_discard_partial_page_buffers(handle,
3725 mapping, inode->i_size, page_len, 0);
3726
3727 if (ret)
3728 goto out_stop;
3729 }
3730 }
3731 3569
3732 first_block = (offset + sb->s_blocksize - 1) >> 3570 first_block = (offset + sb->s_blocksize - 1) >>
3733 EXT4_BLOCK_SIZE_BITS(sb); 3571 EXT4_BLOCK_SIZE_BITS(sb);
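A worked example of the hole-boundary rounding introduced above: only whole blocks inside [offset, offset + length) are punched out, and the ragged edges are zeroed in place. With blocksize 4096, offset 5000, length 10000: the first whole block starts at 8192, the whole-block region ends at 12288, so exactly block 2 is removed. Sketch (macros are illustrative stand-ins for round_up/round_down):

#define RND_UP(x, a)    ((((x) + (a) - 1) / (a)) * (a))
#define RND_DOWN(x, a)  (((x) / (a)) * (a))

static unsigned long long first_full_byte(unsigned long long off, unsigned bs)
{
        return RND_UP(off, (unsigned long long)bs);
}

static unsigned long long end_of_full_bytes(unsigned long long off,
                                            unsigned long long len,
                                            unsigned bs)
{
        return RND_DOWN(off + len, (unsigned long long)bs);
}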
@@ -3803,7 +3641,6 @@ void ext4_truncate(struct inode *inode)
3803 unsigned int credits; 3641 unsigned int credits;
3804 handle_t *handle; 3642 handle_t *handle;
3805 struct address_space *mapping = inode->i_mapping; 3643 struct address_space *mapping = inode->i_mapping;
3806 loff_t page_len;
3807 3644
3808 /* 3645 /*
3809 * There is a possibility that we're either freeing the inode 3646 * There is a possibility that we're either freeing the inode
@@ -3830,12 +3667,6 @@ void ext4_truncate(struct inode *inode)
3830 return; 3667 return;
3831 } 3668 }
3832 3669
3833 /*
3834 * finish any pending end_io work so we won't run the risk of
3835 * converting any truncated blocks to initialized later
3836 */
3837 ext4_flush_unwritten_io(inode);
3838
3839 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3670 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3840 credits = ext4_writepage_trans_blocks(inode); 3671 credits = ext4_writepage_trans_blocks(inode);
3841 else 3672 else
@@ -3847,14 +3678,8 @@ void ext4_truncate(struct inode *inode)
3847 return; 3678 return;
3848 } 3679 }
3849 3680
3850 if (inode->i_size % PAGE_CACHE_SIZE != 0) { 3681 if (inode->i_size & (inode->i_sb->s_blocksize - 1))
3851 page_len = PAGE_CACHE_SIZE - 3682 ext4_block_truncate_page(handle, mapping, inode->i_size);
3852 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3853
3854 if (ext4_discard_partial_page_buffers(handle,
3855 mapping, inode->i_size, page_len, 0))
3856 goto out_stop;
3857 }
3858 3683
3859 /* 3684 /*
3860 * We add the inode to the orphan list, so that if this 3685 * We add the inode to the orphan list, so that if this
@@ -4623,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
4623 inode->i_size >> PAGE_CACHE_SHIFT); 4448 inode->i_size >> PAGE_CACHE_SHIFT);
4624 if (!page) 4449 if (!page)
4625 return; 4450 return;
4626 ret = __ext4_journalled_invalidatepage(page, offset); 4451 ret = __ext4_journalled_invalidatepage(page, offset,
4452 PAGE_CACHE_SIZE - offset);
4627 unlock_page(page); 4453 unlock_page(page);
4628 page_cache_release(page); 4454 page_cache_release(page);
4629 if (ret != -EBUSY) 4455 if (ret != -EBUSY)
@@ -4805,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4805 struct kstat *stat) 4631 struct kstat *stat)
4806{ 4632{
4807 struct inode *inode; 4633 struct inode *inode;
4808 unsigned long delalloc_blocks; 4634 unsigned long long delalloc_blocks;
4809 4635
4810 inode = dentry->d_inode; 4636 inode = dentry->d_inode;
4811 generic_fillattr(inode, stat); 4637 generic_fillattr(inode, stat);
@@ -4823,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4823 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 4649 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4824 EXT4_I(inode)->i_reserved_data_blocks); 4650 EXT4_I(inode)->i_reserved_data_blocks);
4825 4651
4826 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4652 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
4827 return 0; 4653 return 0;
4828} 4654}
4829 4655
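The getattr hunk above is an overflow fix worth spelling out: on 32-bit, (delalloc_blocks << blocksize_bits) >> 9 can overflow an unsigned long before the down-shift, while blocks << (blocksize_bits - 9) converts blocks straight to 512-byte sectors. With 4k blocks, 2,000,000 delayed blocks is 8,192,000,000 bytes (does not fit in 32 bits) but only 16,000,000 sectors. Sketch:

static unsigned long long blocks_to_sectors(unsigned long long blocks,
                                            unsigned blkbits)
{
        /* one shift: no intermediate byte count to overflow */
        return blocks << (blkbits - 9);
}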
4830static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4656static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
4657 int pextents)
4831{ 4658{
4832 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4659 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4833 return ext4_ind_trans_blocks(inode, nrblocks, chunk); 4660 return ext4_ind_trans_blocks(inode, lblocks);
4834 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4661 return ext4_ext_index_trans_blocks(inode, pextents);
4835} 4662}
4836 4663
4837/* 4664/*
@@ -4845,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4845 * 4672 *
4846 * Also account for superblock, inode, quota and xattr blocks 4673 * Also account for superblock, inode, quota and xattr blocks
4847 */ 4674 */
4848static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4675static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
4676 int pextents)
4849{ 4677{
4850 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 4678 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4851 int gdpblocks; 4679 int gdpblocks;
@@ -4853,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4853 int ret = 0; 4681 int ret = 0;
4854 4682
4855 /* 4683 /*
4856 * How many index blocks need to touch to modify nrblocks? 4684 * How many index blocks need to touch to map @lblocks logical blocks
4857 * The "Chunk" flag indicating whether the nrblocks is 4685 * to @pextents physical extents?
4858 * physically contiguous on disk
4859 *
4860 * For Direct IO and fallocate, they calls get_block to allocate
4861 * one single extent at a time, so they could set the "Chunk" flag
4862 */ 4686 */
4863 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4687 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
4864 4688
4865 ret = idxblocks; 4689 ret = idxblocks;
4866 4690
@@ -4868,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4868 * Now let's see how many group bitmaps and group descriptors need 4692 * Now let's see how many group bitmaps and group descriptors need
4869 * to account 4693 * to account
4870 */ 4694 */
4871 groups = idxblocks; 4695 groups = idxblocks + pextents;
4872 if (chunk)
4873 groups += 1;
4874 else
4875 groups += nrblocks;
4876
4877 gdpblocks = groups; 4696 gdpblocks = groups;
4878 if (groups > ngroups) 4697 if (groups > ngroups)
4879 groups = ngroups; 4698 groups = ngroups;
@@ -4904,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4904 int bpp = ext4_journal_blocks_per_page(inode); 4723 int bpp = ext4_journal_blocks_per_page(inode);
4905 int ret; 4724 int ret;
4906 4725
4907 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4726 ret = ext4_meta_trans_blocks(inode, bpp, bpp);
4908 4727
4909 /* Account for data blocks for journalled mode */ 4728 /* Account for data blocks for journalled mode */
4910 if (ext4_should_journal_data(inode)) 4729 if (ext4_should_journal_data(inode))
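An illustration (not the kernel's exact accounting, since the full function is not shown in this hunk) of the reworked credit estimate: mapping @lblocks logical blocks into @pextents physical extents touches idxblocks index blocks, and each extent can dirty one block bitmap plus one group descriptor, each clamped by what exists on disk:

static int meta_trans_blocks_sketch(int idxblocks, int pextents,
                                    int ngroups, int ngdblocks)
{
        int groups = idxblocks + pextents;
        int gdpblocks = groups;

        if (groups > ngroups)
                groups = ngroups;
        if (gdpblocks > ngdblocks)
                gdpblocks = ngdblocks;

        /* index blocks + bitmaps + group descriptor blocks */
        return idxblocks + groups + gdpblocks;
}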
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def84082a9a9..a9ff5e5137ca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,6 +2105,7 @@ repeat:
2105 group = ac->ac_g_ex.fe_group; 2105 group = ac->ac_g_ex.fe_group;
2106 2106
2107 for (i = 0; i < ngroups; group++, i++) { 2107 for (i = 0; i < ngroups; group++, i++) {
2108 cond_resched();
2108 /* 2109 /*
2109 * Artificially restricted ngroups for non-extent 2110 * Artificially restricted ngroups for non-extent
2110 * files makes group > ngroups possible on first loop. 2111 * files makes group > ngroups possible on first loop.
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4405repeat: 4406repeat:
4406 /* allocate space in core */ 4407 /* allocate space in core */
4407 *errp = ext4_mb_regular_allocator(ac); 4408 *errp = ext4_mb_regular_allocator(ac);
4408 if (*errp) { 4409 if (*errp)
4409 ext4_discard_allocated_blocks(ac); 4410 goto discard_and_exit;
4410 goto errout;
4411 }
4412 4411
4413 /* as we've just preallocated more space than 4412 /* as we've just preallocated more space than
4414 * user requested orinally, we store allocated 4413 * user requested originally, we store allocated
4415 * space in a special descriptor */ 4414 * space in a special descriptor */
4416 if (ac->ac_status == AC_STATUS_FOUND && 4415 if (ac->ac_status == AC_STATUS_FOUND &&
4417 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4416 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4418 ext4_mb_new_preallocation(ac); 4417 *errp = ext4_mb_new_preallocation(ac);
4418 if (*errp) {
4419 discard_and_exit:
4420 ext4_discard_allocated_blocks(ac);
4421 goto errout;
4422 }
4419 } 4423 }
4420 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4424 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4421 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 4425 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
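The mballoc hunk above restructures error handling: ext4_mb_new_preallocation() now returns an error, and both failure points funnel into one discard-and-exit path instead of leaking the allocated blocks. A compact sketch of that shape, with stand-in functions:

static int regular_allocator(void) { return 0; }
static int new_preallocation(void) { return 0; }
static void discard_allocated(void) { }

static int alloc_blocks_sketch(void)
{
        int err;

        err = regular_allocator();
        if (err)
                goto discard_and_exit;
        err = new_preallocation();
        if (err) {
discard_and_exit:
                discard_allocated();    /* shared cleanup for both paths */
        }
        return err;
}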
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4612 BUG_ON(bh && (count > 1)); 4616 BUG_ON(bh && (count > 1));
4613 4617
4614 for (i = 0; i < count; i++) { 4618 for (i = 0; i < count; i++) {
4619 cond_resched();
4615 if (!bh) 4620 if (!bh)
4616 tbh = sb_find_get_block(inode->i_sb, 4621 tbh = sb_find_get_block(inode->i_sb,
4617 block + i); 4622 block + i);
4618 if (unlikely(!tbh)) 4623 if (!tbh)
4619 continue; 4624 continue;
4620 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4625 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4621 inode, tbh, block + i); 4626 inode, tbh, block + i);

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
912 struct page *pagep[2] = {NULL, NULL}; 912 struct page *pagep[2] = {NULL, NULL};
913 handle_t *handle; 913 handle_t *handle;
914 ext4_lblk_t orig_blk_offset; 914 ext4_lblk_t orig_blk_offset;
915 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
916 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 915 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
917 unsigned int w_flags = 0; 916 unsigned int w_flags = 0;
918 unsigned int tmp_data_size, data_size, replaced_size; 917 unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
940 orig_blk_offset = orig_page_offset * blocks_per_page + 939 orig_blk_offset = orig_page_offset * blocks_per_page +
941 data_offset_in_page; 940 data_offset_in_page;
942 941
943 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
944
945 /* Calculate data_size */ 942 /* Calculate data_size */
946 if ((orig_blk_offset + block_len_in_page - 1) == 943 if ((orig_blk_offset + block_len_in_page - 1) ==
947 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 944 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc35ecb7..ab2f6dc44b3a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
918 bh->b_data, bh->b_size, 918 bh->b_data, bh->b_size,
919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
920 + ((char *)de - bh->b_data))) { 920 + ((char *)de - bh->b_data))) {
921 /* On error, skip the f_pos to the next block. */ 921 /* silently ignore the rest of the block */
922 dir_file->f_pos = (dir_file->f_pos | 922 break;
923 (dir->i_sb->s_blocksize - 1)) + 1;
924 brelse(bh);
925 return count;
926 } 923 }
927 ext4fs_dirhash(de->name, de->name_len, hinfo); 924 ext4fs_dirhash(de->name, de->name_len, hinfo);
928 if ((hinfo->hash < start_hash) || 925 if ((hinfo->hash < start_hash) ||
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..48786cdb5e6c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -46,46 +46,121 @@ void ext4_exit_pageio(void)
46} 46}
47 47
48/* 48/*
49 * This function is called by ext4_evict_inode() to make sure there is 49 * Print a buffer I/O error compatible with fs/buffer.c. This
50 * no more pending I/O completion work left to do. 50 * provides compatibility with dmesg scrapers that look for a specific
51 * buffer I/O error message. We really need a unified error reporting
52 * structure to userspace ala Digital Unix's uerf system, but it's
53 * probably not going to happen in my lifetime, due to LKML politics...
51 */ 54 */
52void ext4_ioend_shutdown(struct inode *inode) 55static void buffer_io_error(struct buffer_head *bh)
53{ 56{
54 wait_queue_head_t *wq = ext4_ioend_wq(inode); 57 char b[BDEVNAME_SIZE];
58 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
59 bdevname(bh->b_bdev, b),
60 (unsigned long long)bh->b_blocknr);
61}
55 62
56 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 63static void ext4_finish_bio(struct bio *bio)
57 /* 64{
58 * We need to make sure the work structure is finished being 65 int i;
59 * used before we let the inode get destroyed. 66 int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
60 */ 67
61 if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 68 for (i = 0; i < bio->bi_vcnt; i++) {
62 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 69 struct bio_vec *bvec = &bio->bi_io_vec[i];
70 struct page *page = bvec->bv_page;
71 struct buffer_head *bh, *head;
72 unsigned bio_start = bvec->bv_offset;
73 unsigned bio_end = bio_start + bvec->bv_len;
74 unsigned under_io = 0;
75 unsigned long flags;
76
77 if (!page)
78 continue;
79
80 if (error) {
81 SetPageError(page);
82 set_bit(AS_EIO, &page->mapping->flags);
83 }
84 bh = head = page_buffers(page);
85 /*
86 * We check all buffers in the page under BH_Uptodate_Lock
87 * to avoid races with other end io clearing async_write flags
88 */
89 local_irq_save(flags);
90 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
91 do {
92 if (bh_offset(bh) < bio_start ||
93 bh_offset(bh) + bh->b_size > bio_end) {
94 if (buffer_async_write(bh))
95 under_io++;
96 continue;
97 }
98 clear_buffer_async_write(bh);
99 if (error)
100 buffer_io_error(bh);
101 } while ((bh = bh->b_this_page) != head);
102 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
103 local_irq_restore(flags);
104 if (!under_io)
105 end_page_writeback(page);
106 }
63} 107}
64 108
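The accounting in ext4_finish_bio() above reduces to one rule: a page's writeback may end only when no buffer in it is still under IO. A userspace analogue where a bitmask stands in for the per-bh BH_Async_Write flags that the kernel checks under BH_Uptodate_Lock:

#include <stdbool.h>

struct page_wb {
        unsigned async_write_mask;      /* one bit per buffer_head */
};

/* Returns true when the caller should end page writeback. */
static bool finish_bio_range(struct page_wb *p, unsigned done_mask)
{
        p->async_write_mask &= ~done_mask;
        return p->async_write_mask == 0;
}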
65void ext4_free_io_end(ext4_io_end_t *io) 109static void ext4_release_io_end(ext4_io_end_t *io_end)
66{ 110{
67 BUG_ON(!io); 111 struct bio *bio, *next_bio;
68 BUG_ON(!list_empty(&io->list)); 112
69 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); 113 BUG_ON(!list_empty(&io_end->list));
114 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
115 WARN_ON(io_end->handle);
70 116
71 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 117 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
72 wake_up_all(ext4_ioend_wq(io->inode)); 118 wake_up_all(ext4_ioend_wq(io_end->inode));
73 kmem_cache_free(io_end_cachep, io); 119
120 for (bio = io_end->bio; bio; bio = next_bio) {
121 next_bio = bio->bi_private;
122 ext4_finish_bio(bio);
123 bio_put(bio);
124 }
125 if (io_end->flag & EXT4_IO_END_DIRECT)
126 inode_dio_done(io_end->inode);
127 if (io_end->iocb)
128 aio_complete(io_end->iocb, io_end->result, 0);
129 kmem_cache_free(io_end_cachep, io_end);
74} 130}
75 131
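The io_end life cycle introduced by this series is plain reference counting: the structure starts with one reference, each submitted bio and the DIO path take extra ones, and whichever put drops the count to zero runs the release work. A C11 analogue (free() stands in for ext4_release_io_end()):

#include <stdatomic.h>
#include <stdlib.h>

struct io_end_rc {
        atomic_int count;               /* starts at 1 on init */
};

static struct io_end_rc *get_io_end_rc(struct io_end_rc *io)
{
        atomic_fetch_add(&io->count, 1);
        return io;
}

static void put_io_end_rc(struct io_end_rc *io)
{
        if (atomic_fetch_sub(&io->count, 1) == 1)
                free(io);               /* last put runs the release work */
}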
76/* check a range of space and convert unwritten extents to written. */ 132static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
133{
134 struct inode *inode = io_end->inode;
135
136 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
137 /* Wake up anyone waiting on unwritten extent conversion */
138 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
139 wake_up_all(ext4_ioend_wq(inode));
140}
141
142/*
143 * Check a range of space and convert unwritten extents to written. Note that
144 * we are protected from truncate touching same part of extent tree by the
145 * fact that truncate code waits for all DIO to finish (thus exclusion from
146 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
147 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
148 * completed (happens from ext4_free_ioend()).
149 */
77static int ext4_end_io(ext4_io_end_t *io) 150static int ext4_end_io(ext4_io_end_t *io)
78{ 151{
79 struct inode *inode = io->inode; 152 struct inode *inode = io->inode;
80 loff_t offset = io->offset; 153 loff_t offset = io->offset;
81 ssize_t size = io->size; 154 ssize_t size = io->size;
155 handle_t *handle = io->handle;
82 int ret = 0; 156 int ret = 0;
83 157
84 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 158 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
85 "list->prev 0x%p\n", 159 "list->prev 0x%p\n",
86 io, inode->i_ino, io->list.next, io->list.prev); 160 io, inode->i_ino, io->list.next, io->list.prev);
87 161
88 ret = ext4_convert_unwritten_extents(inode, offset, size); 162 io->handle = NULL; /* Following call will use up the handle */
163 ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
89 if (ret < 0) { 164 if (ret < 0) {
90 ext4_msg(inode->i_sb, KERN_EMERG, 165 ext4_msg(inode->i_sb, KERN_EMERG,
91 "failed to convert unwritten extents to written " 166 "failed to convert unwritten extents to written "
@@ -93,30 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io)
93 "(inode %lu, offset %llu, size %zd, error %d)", 168 "(inode %lu, offset %llu, size %zd, error %d)",
94 inode->i_ino, offset, size, ret); 169 inode->i_ino, offset, size, ret);
95 } 170 }
96 /* Wake up anyone waiting on unwritten extent conversion */ 171 ext4_clear_io_unwritten_flag(io);
97 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 172 ext4_release_io_end(io);
98 wake_up_all(ext4_ioend_wq(inode));
99 if (io->flag & EXT4_IO_END_DIRECT)
100 inode_dio_done(inode);
101 if (io->iocb)
102 aio_complete(io->iocb, io->result, 0);
103 return ret; 173 return ret;
104} 174}
105 175
106static void dump_completed_IO(struct inode *inode) 176static void dump_completed_IO(struct inode *inode, struct list_head *head)
107{ 177{
108#ifdef EXT4FS_DEBUG 178#ifdef EXT4FS_DEBUG
109 struct list_head *cur, *before, *after; 179 struct list_head *cur, *before, *after;
110 ext4_io_end_t *io, *io0, *io1; 180 ext4_io_end_t *io, *io0, *io1;
111 181
112 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 182 if (list_empty(head))
113 ext4_debug("inode %lu completed_io list is empty\n",
114 inode->i_ino);
115 return; 183 return;
116 }
117 184
118 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 185 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
119 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 186 list_for_each_entry(io, head, list) {
120 cur = &io->list; 187 cur = &io->list;
121 before = cur->prev; 188 before = cur->prev;
122 io0 = container_of(before, ext4_io_end_t, list); 189 io0 = container_of(before, ext4_io_end_t, list);
@@ -130,23 +197,30 @@ static void dump_completed_IO(struct inode *inode)
130} 197}
131 198
132/* Add the io_end to per-inode completed end_io list. */ 199/* Add the io_end to per-inode completed end_io list. */
133void ext4_add_complete_io(ext4_io_end_t *io_end) 200static void ext4_add_complete_io(ext4_io_end_t *io_end)
134{ 201{
135 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 202 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
136 struct workqueue_struct *wq; 203 struct workqueue_struct *wq;
137 unsigned long flags; 204 unsigned long flags;
138 205
139 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 206 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
140 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
141
142 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 207 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
143 if (list_empty(&ei->i_completed_io_list)) 208 if (io_end->handle) {
144 queue_work(wq, &ei->i_unwritten_work); 209 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
145 list_add_tail(&io_end->list, &ei->i_completed_io_list); 210 if (list_empty(&ei->i_rsv_conversion_list))
211 queue_work(wq, &ei->i_rsv_conversion_work);
212 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
213 } else {
214 wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
215 if (list_empty(&ei->i_unrsv_conversion_list))
216 queue_work(wq, &ei->i_unrsv_conversion_work);
217 list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
218 }
146 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 219 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
147} 220}
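
Note how ext4_add_complete_io() only calls queue_work() when the target list goes from empty to non-empty, so a single queued work item ends up draining an arbitrary number of io_ends. A rough userspace sketch of that transition test, assuming a mutex-protected singly-linked list (schedule_worker() is a hypothetical stand-in for queue_work()):

	#include <pthread.h>
	#include <stddef.h>

	struct item { struct item *next; };

	static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct item *completed;     /* i_rsv_conversion_list analogue */

	static void schedule_worker(void)
	{
		/* hypothetical queue_work() stand-in: kick a thread that
		 * will eventually drain 'completed' */
	}

	static void add_complete_item(struct item *it)
	{
		pthread_mutex_lock(&list_lock);
		int was_empty = (completed == NULL);
		it->next = completed;
		completed = it;
		if (was_empty)
			schedule_worker(); /* only the first completion queues work */
		pthread_mutex_unlock(&list_lock);
	}
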
148 221
149static int ext4_do_flush_completed_IO(struct inode *inode) 222static int ext4_do_flush_completed_IO(struct inode *inode,
223 struct list_head *head)
150{ 224{
151 ext4_io_end_t *io; 225 ext4_io_end_t *io;
152 struct list_head unwritten; 226 struct list_head unwritten;
@@ -155,8 +229,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
155 int err, ret = 0; 229 int err, ret = 0;
156 230
157 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 231 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
158 dump_completed_IO(inode); 232 dump_completed_IO(inode, head);
159 list_replace_init(&ei->i_completed_io_list, &unwritten); 233 list_replace_init(head, &unwritten);
160 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 234 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
161 235
162 while (!list_empty(&unwritten)) { 236 while (!list_empty(&unwritten)) {
@@ -167,30 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
167 err = ext4_end_io(io); 241 err = ext4_end_io(io);
168 if (unlikely(!ret && err)) 242 if (unlikely(!ret && err))
169 ret = err; 243 ret = err;
170 io->flag &= ~EXT4_IO_END_UNWRITTEN;
171 ext4_free_io_end(io);
172 } 244 }
173 return ret; 245 return ret;
174} 246}
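
The flush above relies on list_replace_init() to detach the whole list in O(1) while holding the spinlock, then runs the slow extent conversion with the lock dropped. Under the assumption of a plain singly-linked list, the same idiom looks like this in portable C (names illustrative):

	#include <pthread.h>
	#include <stddef.h>

	struct node { struct node *next; };

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct node *pending;       /* filled by completion handlers */

	static void process(struct node *n)
	{
		(void)n;                   /* conversion work would go here */
	}

	/* Take the whole list in O(1) under the lock, then do the slow
	 * work unlocked -- the list_replace_init() idiom used above. */
	static void flush_pending(void)
	{
		pthread_mutex_lock(&lock);
		struct node *local = pending;
		pending = NULL;
		pthread_mutex_unlock(&lock);

		while (local) {
			struct node *next = local->next;
			process(local);
			local = next;
		}
	}
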
175 247
176/* 248/*
 177 * work on completed aio dio IO, to convert unwritten extents to extents 249 * work on completed IO, to convert unwritten extents to written extents
178 */ 250 */
179void ext4_end_io_work(struct work_struct *work) 251void ext4_end_io_rsv_work(struct work_struct *work)
180{ 252{
181 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 253 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
182 i_unwritten_work); 254 i_rsv_conversion_work);
183 ext4_do_flush_completed_IO(&ei->vfs_inode); 255 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
184} 256}
185 257
186int ext4_flush_unwritten_io(struct inode *inode) 258void ext4_end_io_unrsv_work(struct work_struct *work)
187{ 259{
188 int ret; 260 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
189 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 261 i_unrsv_conversion_work);
190 !(inode->i_state & I_FREEING)); 262 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
191 ret = ext4_do_flush_completed_IO(inode);
192 ext4_unwritten_wait(inode);
193 return ret;
194} 263}
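
Both work handlers recover their ext4_inode_info from the embedded work member via container_of(), i.e. pointer arithmetic on the member's offset. A self-contained sketch of that mechanism (struct names are illustrative, not the kernel's):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct work_struct { int pending; };

	struct inode_info {
		long ino;
		struct work_struct rsv_work; /* i_rsv_conversion_work analogue */
	};

	/* The worker only receives a pointer to the embedded member and
	 * recovers its containing structure, as the handlers above do. */
	static void rsv_work_fn(struct work_struct *work)
	{
		struct inode_info *ei =
			container_of(work, struct inode_info, rsv_work);
		printf("work for inode %ld\n", ei->ino);
	}

	int main(void)
	{
		struct inode_info ei = { .ino = 42 };
		rsv_work_fn(&ei.rsv_work);
		return 0;
	}
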
195 264
196ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 265ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -200,83 +269,70 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
200 atomic_inc(&EXT4_I(inode)->i_ioend_count); 269 atomic_inc(&EXT4_I(inode)->i_ioend_count);
201 io->inode = inode; 270 io->inode = inode;
202 INIT_LIST_HEAD(&io->list); 271 INIT_LIST_HEAD(&io->list);
272 atomic_set(&io->count, 1);
203 } 273 }
204 return io; 274 return io;
205} 275}
206 276
207/* 277void ext4_put_io_end_defer(ext4_io_end_t *io_end)
 208 * Print a buffer I/O error compatible with the fs/buffer.c. This
209 * provides compatibility with dmesg scrapers that look for a specific
210 * buffer I/O error message. We really need a unified error reporting
211 * structure to userspace ala Digital Unix's uerf system, but it's
212 * probably not going to happen in my lifetime, due to LKML politics...
213 */
214static void buffer_io_error(struct buffer_head *bh)
215{ 278{
216 char b[BDEVNAME_SIZE]; 279 if (atomic_dec_and_test(&io_end->count)) {
217 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 280 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
218 bdevname(bh->b_bdev, b), 281 ext4_release_io_end(io_end);
219 (unsigned long long)bh->b_blocknr); 282 return;
283 }
284 ext4_add_complete_io(io_end);
285 }
286}
287
288int ext4_put_io_end(ext4_io_end_t *io_end)
289{
290 int err = 0;
291
292 if (atomic_dec_and_test(&io_end->count)) {
293 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
294 err = ext4_convert_unwritten_extents(io_end->handle,
295 io_end->inode, io_end->offset,
296 io_end->size);
297 io_end->handle = NULL;
298 ext4_clear_io_unwritten_flag(io_end);
299 }
300 ext4_release_io_end(io_end);
301 }
302 return err;
303}
304
305ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
306{
307 atomic_inc(&io_end->count);
308 return io_end;
220} 309}
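
The get/put pair above reference-counts the io_end so that the submitter and any number of bio completions can race safely; whoever drops the last reference performs the release. A compact C11 sketch of the same scheme (free() standing in for kmem_cache_free(); names illustrative):

	#include <stdatomic.h>
	#include <stdlib.h>

	struct io_end {
		atomic_int count;  /* starts at 1, as in ext4_init_io_end() */
		/* ... offset, size, flags ... */
	};

	static struct io_end *io_get(struct io_end *io)
	{
		atomic_fetch_add(&io->count, 1);
		return io;
	}

	/* atomic_fetch_sub() returns the old value, so 1 means the
	 * counter just hit zero: last reference dropped -> release. */
	static void io_put(struct io_end *io)
	{
		if (atomic_fetch_sub(&io->count, 1) == 1)
			free(io);
	}
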
221 310
222static void ext4_end_bio(struct bio *bio, int error) 311static void ext4_end_bio(struct bio *bio, int error)
223{ 312{
224 ext4_io_end_t *io_end = bio->bi_private; 313 ext4_io_end_t *io_end = bio->bi_private;
225 struct inode *inode;
226 int i;
227 int blocksize;
228 sector_t bi_sector = bio->bi_sector; 314 sector_t bi_sector = bio->bi_sector;
229 315
230 BUG_ON(!io_end); 316 BUG_ON(!io_end);
231 inode = io_end->inode;
232 blocksize = 1 << inode->i_blkbits;
233 bio->bi_private = NULL;
234 bio->bi_end_io = NULL; 317 bio->bi_end_io = NULL;
235 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 318 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
236 error = 0; 319 error = 0;
237 for (i = 0; i < bio->bi_vcnt; i++) {
238 struct bio_vec *bvec = &bio->bi_io_vec[i];
239 struct page *page = bvec->bv_page;
240 struct buffer_head *bh, *head;
241 unsigned bio_start = bvec->bv_offset;
242 unsigned bio_end = bio_start + bvec->bv_len;
243 unsigned under_io = 0;
244 unsigned long flags;
245 320
246 if (!page) 321 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
247 continue;
248
249 if (error) {
250 SetPageError(page);
251 set_bit(AS_EIO, &page->mapping->flags);
252 }
253 bh = head = page_buffers(page);
254 /* 322 /*
255 * We check all buffers in the page under BH_Uptodate_Lock 323 * Link bio into list hanging from io_end. We have to do it
256 * to avoid races with other end io clearing async_write flags 324 * atomically as bio completions can be racing against each
325 * other.
257 */ 326 */
258 local_irq_save(flags); 327 bio->bi_private = xchg(&io_end->bio, bio);
259 bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 328 } else {
260 do { 329 ext4_finish_bio(bio);
261 if (bh_offset(bh) < bio_start || 330 bio_put(bio);
262 bh_offset(bh) + blocksize > bio_end) {
263 if (buffer_async_write(bh))
264 under_io++;
265 continue;
266 }
267 clear_buffer_async_write(bh);
268 if (error)
269 buffer_io_error(bh);
270 } while ((bh = bh->b_this_page) != head);
271 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
272 local_irq_restore(flags);
273 if (!under_io)
274 end_page_writeback(page);
275 } 331 }
276 bio_put(bio);
277 332
278 if (error) { 333 if (error) {
279 io_end->flag |= EXT4_IO_END_ERROR; 334 struct inode *inode = io_end->inode;
335
280 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 336 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
281 "(offset %llu size %ld starting block %llu)", 337 "(offset %llu size %ld starting block %llu)",
282 inode->i_ino, 338 inode->i_ino,
@@ -285,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error)
285 (unsigned long long) 341 (unsigned long long)
286 bi_sector >> (inode->i_blkbits - 9)); 342 bi_sector >> (inode->i_blkbits - 9));
287 } 343 }
288 344 ext4_put_io_end_defer(io_end);
289 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
290 ext4_free_io_end(io_end);
291 return;
292 }
293
294 ext4_add_complete_io(io_end);
295} 345}
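
The interesting line in ext4_end_bio() is the lockless xchg(): each completing bio atomically swaps itself in as the new list head, reusing bi_private as the link. A C11 analogue using atomic_exchange() (illustrative types; this is only safe because, as in ext4, the list is walked after all completions have quiesced):

	#include <stdatomic.h>
	#include <stddef.h>

	struct bio_like {
		struct bio_like *next;  /* bi_private reused as link above */
	};

	static _Atomic(struct bio_like *) head; /* io_end->bio analogue */

	static void push_completed(struct bio_like *b)
	{
		/* one atomic exchange: no lock, safe from any number of
		 * concurrent completions */
		b->next = atomic_exchange(&head, b);
	}

	/* Only safe once no further completions can race, which is what
	 * the io_end reference count above guarantees before release. */
	static void drain_completed(void (*fini)(struct bio_like *))
	{
		struct bio_like *b = atomic_exchange(&head, NULL);
		while (b) {
			struct bio_like *next = b->next;
			fini(b);
			b = next;
		}
	}
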
296 346
297void ext4_io_submit(struct ext4_io_submit *io) 347void ext4_io_submit(struct ext4_io_submit *io)
@@ -305,43 +355,38 @@ void ext4_io_submit(struct ext4_io_submit *io)
305 bio_put(io->io_bio); 355 bio_put(io->io_bio);
306 } 356 }
307 io->io_bio = NULL; 357 io->io_bio = NULL;
308 io->io_op = 0; 358}
359
360void ext4_io_submit_init(struct ext4_io_submit *io,
361 struct writeback_control *wbc)
362{
363 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
364 io->io_bio = NULL;
309 io->io_end = NULL; 365 io->io_end = NULL;
310} 366}
311 367
312static int io_submit_init(struct ext4_io_submit *io, 368static int io_submit_init_bio(struct ext4_io_submit *io,
313 struct inode *inode, 369 struct buffer_head *bh)
314 struct writeback_control *wbc,
315 struct buffer_head *bh)
316{ 370{
317 ext4_io_end_t *io_end;
318 struct page *page = bh->b_page;
319 int nvecs = bio_get_nr_vecs(bh->b_bdev); 371 int nvecs = bio_get_nr_vecs(bh->b_bdev);
320 struct bio *bio; 372 struct bio *bio;
321 373
322 io_end = ext4_init_io_end(inode, GFP_NOFS);
323 if (!io_end)
324 return -ENOMEM;
325 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 374 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
375 if (!bio)
376 return -ENOMEM;
326 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 377 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
327 bio->bi_bdev = bh->b_bdev; 378 bio->bi_bdev = bh->b_bdev;
328 bio->bi_private = io->io_end = io_end;
329 bio->bi_end_io = ext4_end_bio; 379 bio->bi_end_io = ext4_end_bio;
330 380 bio->bi_private = ext4_get_io_end(io->io_end);
331 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
332
333 io->io_bio = bio; 381 io->io_bio = bio;
334 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
335 io->io_next_block = bh->b_blocknr; 382 io->io_next_block = bh->b_blocknr;
336 return 0; 383 return 0;
337} 384}
338 385
339static int io_submit_add_bh(struct ext4_io_submit *io, 386static int io_submit_add_bh(struct ext4_io_submit *io,
340 struct inode *inode, 387 struct inode *inode,
341 struct writeback_control *wbc,
342 struct buffer_head *bh) 388 struct buffer_head *bh)
343{ 389{
344 ext4_io_end_t *io_end;
345 int ret; 390 int ret;
346 391
347 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 392 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +394,14 @@ submit_and_retry:
349 ext4_io_submit(io); 394 ext4_io_submit(io);
350 } 395 }
351 if (io->io_bio == NULL) { 396 if (io->io_bio == NULL) {
352 ret = io_submit_init(io, inode, wbc, bh); 397 ret = io_submit_init_bio(io, bh);
353 if (ret) 398 if (ret)
354 return ret; 399 return ret;
355 } 400 }
356 io_end = io->io_end;
357 if (test_clear_buffer_uninit(bh))
358 ext4_set_io_unwritten_flag(inode, io_end);
359 io->io_end->size += bh->b_size;
360 io->io_next_block++;
361 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 401 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (ret != bh->b_size) 402 if (ret != bh->b_size)
363 goto submit_and_retry; 403 goto submit_and_retry;
404 io->io_next_block++;
364 return 0; 405 return 0;
365} 406}
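
io_submit_add_bh() implements a classic submit-and-retry loop: a buffer is appended to the current bio while blocks remain contiguous and the bio has room; otherwise the bio is submitted and a fresh one is started. A stripped-down sketch of just that control flow (no real I/O; names illustrative, cap > 0 assumed):

	/* A 'batch' plays the role of the in-flight bio. */
	struct batch { int used, cap; long next_block; };

	static void flush(struct batch *b)
	{
		/* submit the batch; here we merely reset it */
		b->used = 0;
	}

	static void add_block(struct batch *b, long blocknr)
	{
		for (;;) {
			if (b->used && blocknr != b->next_block)
				flush(b);          /* discontiguous: submit */
			if (b->used < b->cap) {
				b->used++;
				b->next_block = blocknr + 1;
				return;
			}
			flush(b);                  /* full: submit and retry */
		}
	}
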
366 407
@@ -432,7 +473,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
432 do { 473 do {
433 if (!buffer_async_write(bh)) 474 if (!buffer_async_write(bh))
434 continue; 475 continue;
435 ret = io_submit_add_bh(io, inode, wbc, bh); 476 ret = io_submit_add_bh(io, inode, bh);
436 if (ret) { 477 if (ret) {
437 /* 478 /*
438 * We only get here on ENOMEM. Not much else 479 * We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
79 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
80 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
82 unsigned overhead = ext4_group_overhead_blocks(sb, group); 82 unsigned overhead;
83 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend;
84 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
85 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
86 int err = -EINVAL; 86 int err = -EINVAL;
87 87
88 if (group != sbi->s_groups_count) {
89 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
90 input->group, sbi->s_groups_count);
91 return -EINVAL;
92 }
93
94 overhead = ext4_group_overhead_blocks(sb, group);
95 metaend = start + overhead;
88 input->free_blocks_count = free_blocks_count = 96 input->free_blocks_count = free_blocks_count =
89 input->blocks_count - 2 - overhead - sbi->s_itb_per_group; 97 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
90 98
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
96 free_blocks_count, input->reserved_blocks); 104 free_blocks_count, input->reserved_blocks);
97 105
98 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 106 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
99 if (group != sbi->s_groups_count) 107 if (offset != 0)
100 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
101 input->group, sbi->s_groups_count);
102 else if (offset != 0)
103 ext4_warning(sb, "Last group not full"); 108 ext4_warning(sb, "Last group not full");
104 else if (input->reserved_blocks > input->blocks_count / 5) 109 else if (input->reserved_blocks > input->blocks_count / 5)
105 ext4_warning(sb, "Reserved blocks too high (%u)", 110 ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1551 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1556 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
1552 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1557 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1553 struct inode *inode = NULL; 1558 struct inode *inode = NULL;
1554 int gdb_off, gdb_num; 1559 int gdb_off;
1555 int err; 1560 int err;
1556 __u16 bg_flags = 0; 1561 __u16 bg_flags = 0;
1557 1562
1558 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
1559 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1563 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
1560 1564
1561 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 1565 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
1656 err = err2; 1660 err = err2;
1657 1661
1658 if (!err) { 1662 if (!err) {
1659 ext4_fsblk_t first_block;
1660 first_block = ext4_group_first_block_no(sb, 0);
1661 if (test_opt(sb, DEBUG)) 1663 if (test_opt(sb, DEBUG))
1662 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1664 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1663 "blocks\n", ext4_blocks_count(es)); 1665 "blocks\n", ext4_blocks_count(es));
1664 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, 1666 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
1665 (char *)es, sizeof(struct ext4_super_block), 0); 1667 (char *)es, sizeof(struct ext4_super_block), 0);
1666 } 1668 }
1667 return err; 1669 return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..85b3dd60169b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
69static void ext4_clear_journal_err(struct super_block *sb, 69static void ext4_clear_journal_err(struct super_block *sb,
70 struct ext4_super_block *es); 70 struct ext4_super_block *es);
71static int ext4_sync_fs(struct super_block *sb, int wait); 71static int ext4_sync_fs(struct super_block *sb, int wait);
72static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
72static int ext4_remount(struct super_block *sb, int *flags, char *data); 73static int ext4_remount(struct super_block *sb, int *flags, char *data);
73static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
74static int ext4_unfreeze(struct super_block *sb); 75static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
398 } 399 }
399 if (test_opt(sb, ERRORS_RO)) { 400 if (test_opt(sb, ERRORS_RO)) {
400 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 401 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
402 /*
403 * Make sure updated value of ->s_mount_flags will be visible
404 * before ->s_flags update
405 */
406 smp_wmb();
401 sb->s_flags |= MS_RDONLY; 407 sb->s_flags |= MS_RDONLY;
402 } 408 }
403 if (test_opt(sb, ERRORS_PANIC)) 409 if (test_opt(sb, ERRORS_PANIC))
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
422 ext4_handle_error(sb); 428 ext4_handle_error(sb);
423} 429}
424 430
425void ext4_error_inode(struct inode *inode, const char *function, 431void __ext4_error_inode(struct inode *inode, const char *function,
426 unsigned int line, ext4_fsblk_t block, 432 unsigned int line, ext4_fsblk_t block,
427 const char *fmt, ...) 433 const char *fmt, ...)
428{ 434{
429 va_list args; 435 va_list args;
430 struct va_format vaf; 436 struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
451 ext4_handle_error(inode->i_sb); 457 ext4_handle_error(inode->i_sb);
452} 458}
453 459
454void ext4_error_file(struct file *file, const char *function, 460void __ext4_error_file(struct file *file, const char *function,
455 unsigned int line, ext4_fsblk_t block, 461 unsigned int line, ext4_fsblk_t block,
456 const char *fmt, ...) 462 const char *fmt, ...)
457{ 463{
458 va_list args; 464 va_list args;
459 struct va_format vaf; 465 struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
570 576
571 if ((sb->s_flags & MS_RDONLY) == 0) { 577 if ((sb->s_flags & MS_RDONLY) == 0) {
572 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 578 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
573 sb->s_flags |= MS_RDONLY;
574 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; 579 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
580 /*
581 * Make sure updated value of ->s_mount_flags will be visible
582 * before ->s_flags update
583 */
584 smp_wmb();
585 sb->s_flags |= MS_RDONLY;
575 if (EXT4_SB(sb)->s_journal) 586 if (EXT4_SB(sb)->s_journal)
576 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 587 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
577 save_error_info(sb, function, line); 588 save_error_info(sb, function, line);
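
The two smp_wmb() hunks above order the ->s_mount_flags store before the ->s_flags store, so that a reader who observes MS_RDONLY is guaranteed to also observe EXT4_MF_FS_ABORTED. A userspace C11 analogue of that pairing, with release/acquire playing the role of the write/read barriers (names illustrative):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_bool fs_aborted; /* s_mount_flags bit analogue */
	static atomic_bool read_only;  /* s_flags MS_RDONLY analogue */

	static void abort_fs(void)
	{
		atomic_store_explicit(&fs_aborted, true, memory_order_relaxed);
		/* release ordering plays the role of smp_wmb() here */
		atomic_store_explicit(&read_only, true, memory_order_release);
	}

	static bool abort_seen_if_read_only(void)
	{
		if (!atomic_load_explicit(&read_only, memory_order_acquire))
			return true;   /* not read-only yet: nothing to check */
		/* the acquire above guarantees the fs_aborted store is seen */
		return atomic_load_explicit(&fs_aborted, memory_order_relaxed);
	}
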
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
580 panic("EXT4-fs panic from previous error\n"); 591 panic("EXT4-fs panic from previous error\n");
581} 592}
582 593
583void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 594void __ext4_msg(struct super_block *sb,
595 const char *prefix, const char *fmt, ...)
584{ 596{
585 struct va_format vaf; 597 struct va_format vaf;
586 va_list args; 598 va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
750 ext4_unregister_li_request(sb); 762 ext4_unregister_li_request(sb);
751 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
752 764
753 flush_workqueue(sbi->dio_unwritten_wq); 765 flush_workqueue(sbi->unrsv_conversion_wq);
754 destroy_workqueue(sbi->dio_unwritten_wq); 766 flush_workqueue(sbi->rsv_conversion_wq);
767 destroy_workqueue(sbi->unrsv_conversion_wq);
768 destroy_workqueue(sbi->rsv_conversion_wq);
755 769
756 if (sbi->s_journal) { 770 if (sbi->s_journal) {
757 err = jbd2_journal_destroy(sbi->s_journal); 771 err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
760 ext4_abort(sb, "Couldn't clean up the journal"); 774 ext4_abort(sb, "Couldn't clean up the journal");
761 } 775 }
762 776
763 ext4_es_unregister_shrinker(sb); 777 ext4_es_unregister_shrinker(sbi);
764 del_timer(&sbi->s_err_report); 778 del_timer(&sbi->s_err_report);
765 ext4_release_system_zone(sb); 779 ext4_release_system_zone(sb);
766 ext4_mb_release(sb); 780 ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
849 rwlock_init(&ei->i_es_lock); 863 rwlock_init(&ei->i_es_lock);
850 INIT_LIST_HEAD(&ei->i_es_lru); 864 INIT_LIST_HEAD(&ei->i_es_lru);
851 ei->i_es_lru_nr = 0; 865 ei->i_es_lru_nr = 0;
866 ei->i_touch_when = 0;
852 ei->i_reserved_data_blocks = 0; 867 ei->i_reserved_data_blocks = 0;
853 ei->i_reserved_meta_blocks = 0; 868 ei->i_reserved_meta_blocks = 0;
854 ei->i_allocated_meta_blocks = 0; 869 ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
859 ei->i_reserved_quota = 0; 874 ei->i_reserved_quota = 0;
860#endif 875#endif
861 ei->jinode = NULL; 876 ei->jinode = NULL;
862 INIT_LIST_HEAD(&ei->i_completed_io_list); 877 INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
878 INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
863 spin_lock_init(&ei->i_completed_io_lock); 879 spin_lock_init(&ei->i_completed_io_lock);
864 ei->i_sync_tid = 0; 880 ei->i_sync_tid = 0;
865 ei->i_datasync_tid = 0; 881 ei->i_datasync_tid = 0;
866 atomic_set(&ei->i_ioend_count, 0); 882 atomic_set(&ei->i_ioend_count, 0);
867 atomic_set(&ei->i_unwritten, 0); 883 atomic_set(&ei->i_unwritten, 0);
868 INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); 884 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
885 INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
869 886
870 return &ei->vfs_inode; 887 return &ei->vfs_inode;
871} 888}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
1093 .dirty_inode = ext4_dirty_inode, 1110 .dirty_inode = ext4_dirty_inode,
1094 .drop_inode = ext4_drop_inode, 1111 .drop_inode = ext4_drop_inode,
1095 .evict_inode = ext4_evict_inode, 1112 .evict_inode = ext4_evict_inode,
1113 .sync_fs = ext4_sync_fs_nojournal,
1096 .put_super = ext4_put_super, 1114 .put_super = ext4_put_super,
1097 .statfs = ext4_statfs, 1115 .statfs = ext4_statfs,
1098 .remount_fs = ext4_remount, 1116 .remount_fs = ext4_remount,
@@ -1908,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1908 struct ext4_sb_info *sbi = EXT4_SB(sb); 1926 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 struct ext4_group_desc *gdp = NULL; 1927 struct ext4_group_desc *gdp = NULL;
1910 ext4_group_t flex_group; 1928 ext4_group_t flex_group;
1911 unsigned int groups_per_flex = 0;
1912 int i, err; 1929 int i, err;
1913 1930
1914 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1931 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1916 sbi->s_log_groups_per_flex = 0; 1933 sbi->s_log_groups_per_flex = 0;
1917 return 1; 1934 return 1;
1918 } 1935 }
1919 groups_per_flex = 1U << sbi->s_log_groups_per_flex;
1920 1936
1921 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 1937 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1922 if (err) 1938 if (err)
@@ -2164,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2164 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2180 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2165 dquot_initialize(inode); 2181 dquot_initialize(inode);
2166 if (inode->i_nlink) { 2182 if (inode->i_nlink) {
2167 ext4_msg(sb, KERN_DEBUG, 2183 if (test_opt(sb, DEBUG))
2168 "%s: truncating inode %lu to %lld bytes", 2184 ext4_msg(sb, KERN_DEBUG,
2169 __func__, inode->i_ino, inode->i_size); 2185 "%s: truncating inode %lu to %lld bytes",
2186 __func__, inode->i_ino, inode->i_size);
2170 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 2187 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2171 inode->i_ino, inode->i_size); 2188 inode->i_ino, inode->i_size);
2172 mutex_lock(&inode->i_mutex); 2189 mutex_lock(&inode->i_mutex);
2190 truncate_inode_pages(inode->i_mapping, inode->i_size);
2173 ext4_truncate(inode); 2191 ext4_truncate(inode);
2174 mutex_unlock(&inode->i_mutex); 2192 mutex_unlock(&inode->i_mutex);
2175 nr_truncates++; 2193 nr_truncates++;
2176 } else { 2194 } else {
2177 ext4_msg(sb, KERN_DEBUG, 2195 if (test_opt(sb, DEBUG))
2178 "%s: deleting unreferenced inode %lu", 2196 ext4_msg(sb, KERN_DEBUG,
2179 __func__, inode->i_ino); 2197 "%s: deleting unreferenced inode %lu",
2198 __func__, inode->i_ino);
2180 jbd_debug(2, "deleting unreferenced inode %lu\n", 2199 jbd_debug(2, "deleting unreferenced inode %lu\n",
2181 inode->i_ino); 2200 inode->i_ino);
2182 nr_orphans++; 2201 nr_orphans++;
@@ -2377,7 +2396,10 @@ struct ext4_attr {
2377 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2396 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2378 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2397 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2379 const char *, size_t); 2398 const char *, size_t);
2380 int offset; 2399 union {
2400 int offset;
2401 int deprecated_val;
2402 } u;
2381}; 2403};
2382 2404
2383static int parse_strtoull(const char *buf, 2405static int parse_strtoull(const char *buf,
@@ -2446,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2446static ssize_t sbi_ui_show(struct ext4_attr *a, 2468static ssize_t sbi_ui_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf) 2469 struct ext4_sb_info *sbi, char *buf)
2448{ 2470{
2449 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2471 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2450 2472
2451 return snprintf(buf, PAGE_SIZE, "%u\n", *ui); 2473 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2452} 2474}
@@ -2455,7 +2477,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2455 struct ext4_sb_info *sbi, 2477 struct ext4_sb_info *sbi,
2456 const char *buf, size_t count) 2478 const char *buf, size_t count)
2457{ 2479{
2458 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2480 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2459 unsigned long t; 2481 unsigned long t;
2460 int ret; 2482 int ret;
2461 2483
@@ -2504,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
2504 return count; 2526 return count;
2505} 2527}
2506 2528
2529static ssize_t sbi_deprecated_show(struct ext4_attr *a,
2530 struct ext4_sb_info *sbi, char *buf)
2531{
2532 return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
2533}
2534
2507#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ 2535#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2508static struct ext4_attr ext4_attr_##_name = { \ 2536static struct ext4_attr ext4_attr_##_name = { \
2509 .attr = {.name = __stringify(_name), .mode = _mode }, \ 2537 .attr = {.name = __stringify(_name), .mode = _mode }, \
2510 .show = _show, \ 2538 .show = _show, \
2511 .store = _store, \ 2539 .store = _store, \
2512 .offset = offsetof(struct ext4_sb_info, _elname), \ 2540 .u = { \
2541 .offset = offsetof(struct ext4_sb_info, _elname),\
2542 }, \
2513} 2543}
2514#define EXT4_ATTR(name, mode, show, store) \ 2544#define EXT4_ATTR(name, mode, show, store) \
2515static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2545static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2550,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2520#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2550#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2521 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 2551 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2522#define ATTR_LIST(name) &ext4_attr_##name.attr 2552#define ATTR_LIST(name) &ext4_attr_##name.attr
2553#define EXT4_DEPRECATED_ATTR(_name, _val) \
2554static struct ext4_attr ext4_attr_##_name = { \
2555 .attr = {.name = __stringify(_name), .mode = 0444 }, \
2556 .show = sbi_deprecated_show, \
2557 .u = { \
2558 .deprecated_val = _val, \
2559 }, \
2560}
2523 2561
2524EXT4_RO_ATTR(delayed_allocation_blocks); 2562EXT4_RO_ATTR(delayed_allocation_blocks);
2525EXT4_RO_ATTR(session_write_kbytes); 2563EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2534EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2572EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2535EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2573EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2536EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2574EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2537EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2575EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
2538EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 2576EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2539EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2577EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2540 2578
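
The union above lets a single ext4_attr carry either a field offset (live tunables) or a canned value (retired ones such as max_writeback_mb_bump, which now always reads 128). A small self-contained sketch of that shape, with illustrative names:

	#include <stddef.h>
	#include <stdio.h>

	struct sb_info { unsigned int stripe; };

	struct attr {
		const char *name;
		union {
			int offset;         /* for live tunables */
			int deprecated_val; /* for retired ones */
		} u;
		int is_deprecated;
	};

	static void show(const struct attr *a, struct sb_info *sbi)
	{
		if (a->is_deprecated)
			printf("%s: %d (deprecated)\n",
			       a->name, a->u.deprecated_val);
		else
			printf("%s: %u\n", a->name,
			       *(unsigned int *)((char *)sbi + a->u.offset));
	}

	int main(void)
	{
		struct sb_info sbi = { .stripe = 16 };
		struct attr live = { "stripe",
			{ .offset = offsetof(struct sb_info, stripe) }, 0 };
		struct attr dead = { "max_writeback_mb_bump",
			{ .deprecated_val = 128 }, 1 };
		show(&live, &sbi);
		show(&dead, &sbi);
		return 0;
	}
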
@@ -3763,7 +3801,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3763 sbi->s_err_report.data = (unsigned long) sb; 3801 sbi->s_err_report.data = (unsigned long) sb;
3764 3802
3765 /* Register extent status tree shrinker */ 3803 /* Register extent status tree shrinker */
3766 ext4_es_register_shrinker(sb); 3804 ext4_es_register_shrinker(sbi);
3767 3805
3768 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3806 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3769 ext4_count_free_clusters(sb)); 3807 ext4_count_free_clusters(sb));
@@ -3787,7 +3825,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3787 } 3825 }
3788 3826
3789 sbi->s_stripe = ext4_get_stripe_size(sbi); 3827 sbi->s_stripe = ext4_get_stripe_size(sbi);
3790 sbi->s_max_writeback_mb_bump = 128;
3791 sbi->s_extent_max_zeroout_kb = 32; 3828 sbi->s_extent_max_zeroout_kb = 32;
3792 3829
3793 /* 3830 /*
@@ -3915,12 +3952,20 @@ no_journal:
3915 * The maximum number of concurrent work items can be high and 3952
3916 * concurrency isn't really necessary. Limit it to 1. 3953 * concurrency isn't really necessary. Limit it to 1.
3917 */ 3954 */
3918 EXT4_SB(sb)->dio_unwritten_wq = 3955 EXT4_SB(sb)->rsv_conversion_wq =
3919 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3956 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3920 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3957 if (!EXT4_SB(sb)->rsv_conversion_wq) {
3921 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3958 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3922 ret = -ENOMEM; 3959 ret = -ENOMEM;
3923 goto failed_mount_wq; 3960 goto failed_mount4;
3961 }
3962
3963 EXT4_SB(sb)->unrsv_conversion_wq =
3964 alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3965 if (!EXT4_SB(sb)->unrsv_conversion_wq) {
3966 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3967 ret = -ENOMEM;
3968 goto failed_mount4;
3924 } 3969 }
3925 3970
3926 /* 3971 /*
@@ -4074,14 +4119,17 @@ failed_mount4a:
4074 sb->s_root = NULL; 4119 sb->s_root = NULL;
4075failed_mount4: 4120failed_mount4:
4076 ext4_msg(sb, KERN_ERR, "mount failed"); 4121 ext4_msg(sb, KERN_ERR, "mount failed");
4077 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 4122 if (EXT4_SB(sb)->rsv_conversion_wq)
4123 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4124 if (EXT4_SB(sb)->unrsv_conversion_wq)
4125 destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4078failed_mount_wq: 4126failed_mount_wq:
4079 if (sbi->s_journal) { 4127 if (sbi->s_journal) {
4080 jbd2_journal_destroy(sbi->s_journal); 4128 jbd2_journal_destroy(sbi->s_journal);
4081 sbi->s_journal = NULL; 4129 sbi->s_journal = NULL;
4082 } 4130 }
4083failed_mount3: 4131failed_mount3:
4084 ext4_es_unregister_shrinker(sb); 4132 ext4_es_unregister_shrinker(sbi);
4085 del_timer(&sbi->s_err_report); 4133 del_timer(&sbi->s_err_report);
4086 if (sbi->s_flex_groups) 4134 if (sbi->s_flex_groups)
4087 ext4_kvfree(sbi->s_flex_groups); 4135 ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4565,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4517{ 4565{
4518 int ret = 0; 4566 int ret = 0;
4519 tid_t target; 4567 tid_t target;
4568 bool needs_barrier = false;
4520 struct ext4_sb_info *sbi = EXT4_SB(sb); 4569 struct ext4_sb_info *sbi = EXT4_SB(sb);
4521 4570
4522 trace_ext4_sync_fs(sb, wait); 4571 trace_ext4_sync_fs(sb, wait);
4523 flush_workqueue(sbi->dio_unwritten_wq); 4572 flush_workqueue(sbi->rsv_conversion_wq);
4573 flush_workqueue(sbi->unrsv_conversion_wq);
4524 /* 4574 /*
4525 * Writeback quota in non-journalled quota case - journalled quota has 4575 * Writeback quota in non-journalled quota case - journalled quota has
4526 * no dirty dquots 4576 * no dirty dquots
4527 */ 4577 */
4528 dquot_writeback_dquots(sb, -1); 4578 dquot_writeback_dquots(sb, -1);
4579 /*
4580 * Data writeback is possible w/o journal transaction, so a barrier must
4581 * be sent at the end of the function. But we can skip it if
4582 * transaction_commit will do it for us.
4583 */
4584 target = jbd2_get_latest_transaction(sbi->s_journal);
4585 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4586 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4587 needs_barrier = true;
4588
4529 if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 4589 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4530 if (wait) 4590 if (wait)
4531 jbd2_log_wait_commit(sbi->s_journal, target); 4591 ret = jbd2_log_wait_commit(sbi->s_journal, target);
4592 }
4593 if (needs_barrier) {
4594 int err;
4595 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4596 if (!ret)
4597 ret = err;
4532 } 4598 }
4599
4600 return ret;
4601}
4602
4603static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
4604{
4605 int ret = 0;
4606
4607 trace_ext4_sync_fs(sb, wait);
4608 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4609 flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4610 dquot_writeback_dquots(sb, -1);
4611 if (wait && test_opt(sb, BARRIER))
4612 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4613
4533 return ret; 4614 return ret;
4534} 4615}
4535 4616