aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
authorJames Morris <jmorris@macbook.(none)>2009-12-03 01:33:40 -0500
committerJames Morris <jmorris@macbook.(none)>2009-12-03 01:33:40 -0500
commitc84d6efd363a3948eb32ec40d46bab6338580454 (patch)
tree3ba7ac46e6626fe8ac843834588609eb6ccee5c6 /fs/ext4
parent7539cf4b92be4aecc573ea962135f246a7a33401 (diff)
parent22763c5cf3690a681551162c15d34d935308c8d7 (diff)
Merge branch 'master' into next
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig14
-rw-r--r--fs/ext4/ext4.h56
-rw-r--r--fs/ext4/ext4_extents.h7
-rw-r--r--fs/ext4/ext4_jbd2.h6
-rw-r--r--fs/ext4/extents.c458
-rw-r--r--fs/ext4/fsync.c5
-rw-r--r--fs/ext4/inode.c586
-rw-r--r--fs/ext4/mballoc.c305
-rw-r--r--fs/ext4/mballoc.h35
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c20
-rw-r--r--fs/ext4/namei.c19
-rw-r--r--fs/ext4/super.c150
13 files changed, 1089 insertions, 574 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4DEV_COMPAT
30 bool "Enable ext4dev compatibility"
31 depends on EXT4_FS
32 help
33 Starting with 2.6.28, the name of the ext4 filesystem was
34 renamed from ext4dev to ext4. Unfortunately there are some
35 legacy userspace programs (such as klibc's fstype) have
36 "ext4dev" hardcoded.
37
38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev,
40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed!
42
43config EXT4_FS_XATTR 29config EXT4_FS_XATTR
44 bool "Ext4 extended attributes" 30 bool "Ext4 extended attributes"
45 depends on EXT4_FS 31 depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..8825515eeddd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
65/* data type for block group number */ 65/* data type for block group number */
66typedef unsigned int ext4_group_t; 66typedef unsigned int ext4_group_t;
67 67
68/*
69 * Flags used in mballoc's allocation_context flags field.
70 *
71 * Also used to show what's going on for debugging purposes when the
72 * flag field is exported via the traceport interface
73 */
68 74
69/* prefer goal again. length */ 75/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 0x0001 76#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
127 int pages_written; 133 int pages_written;
128 int retval; 134 int retval;
129}; 135};
136#define DIO_AIO_UNWRITTEN 0x1
137typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */
142 ext4_lblk_t offset; /* offset in the file */
143 size_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */
145} ext4_io_end_t;
130 146
131/* 147/*
132 * Special inodes numbers 148 * Special inodes numbers
@@ -306,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
306#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 322#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
307#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ 323#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
308#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ 324#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
325#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
309 326
310/* Used to pass group descriptor data when online resize is done */ 327/* Used to pass group descriptor data when online resize is done */
311struct ext4_new_group_input { 328struct ext4_new_group_input {
@@ -347,7 +364,16 @@ struct ext4_new_group_data {
347 /* Call ext4_da_update_reserve_space() after successfully 364 /* Call ext4_da_update_reserve_space() after successfully
348 allocating the blocks */ 365 allocating the blocks */
349#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 366#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
350 367 /* caller is from the direct IO path, request to creation of an
368 unitialized extents if not allocated, split the uninitialized
369 extent if blocks has been preallocated already*/
370#define EXT4_GET_BLOCKS_DIO 0x0010
371#define EXT4_GET_BLOCKS_CONVERT 0x0020
372#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
373 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
374 /* Convert extent to initialized after direct IO complete */
375#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
376 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
351 377
352/* 378/*
353 * ioctl commands 379 * ioctl commands
@@ -500,8 +526,8 @@ struct move_extent {
500static inline __le32 ext4_encode_extra_time(struct timespec *time) 526static inline __le32 ext4_encode_extra_time(struct timespec *time)
501{ 527{
502 return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 528 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
503 time->tv_sec >> 32 : 0) | 529 (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
504 ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); 530 ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
505} 531}
506 532
507static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 533static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +535,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
509 if (sizeof(time->tv_sec) > 4) 535 if (sizeof(time->tv_sec) > 4)
510 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) 536 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
511 << 32; 537 << 32;
512 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; 538 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
513} 539}
514 540
515#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ 541#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -672,6 +698,11 @@ struct ext4_inode_info {
672 __u16 i_extra_isize; 698 __u16 i_extra_isize;
673 699
674 spinlock_t i_block_reservation_lock; 700 spinlock_t i_block_reservation_lock;
701
702 /* completed async DIOs that might need unwritten extents handling */
703 struct list_head i_aio_dio_complete_list;
704 /* current io_end structure for async DIO write*/
705 ext4_io_end_t *cur_aio_dio;
675}; 706};
676 707
677/* 708/*
@@ -713,6 +744,7 @@ struct ext4_inode_info {
713#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 744#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
714#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 745#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
715#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 746#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
747#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
716#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 748#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
717#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 749#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
718#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 750#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -942,18 +974,11 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 974 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 975 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 976 unsigned int s_mb_group_prealloc;
977 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 978 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 979 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 980 unsigned long s_mb_last_start;
948 981
949 /* history to debug policy */
950 struct ext4_mb_history *s_mb_history;
951 int s_mb_history_cur;
952 int s_mb_history_max;
953 int s_mb_history_num;
954 spinlock_t s_mb_history_lock;
955 int s_mb_history_filter;
956
957 /* stats for buddy allocator */ 982 /* stats for buddy allocator */
958 spinlock_t s_mb_pa_lock; 983 spinlock_t s_mb_pa_lock;
959 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 984 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1005,9 @@ struct ext4_sb_info {
980 1005
981 unsigned int s_log_groups_per_flex; 1006 unsigned int s_log_groups_per_flex;
982 struct flex_groups *s_flex_groups; 1007 struct flex_groups *s_flex_groups;
1008
1009 /* workqueue for dio unwritten */
1010 struct workqueue_struct *dio_unwritten_wq;
983}; 1011};
984 1012
985static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1013static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1425,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1397 struct address_space *mapping, loff_t from); 1425 struct address_space *mapping, loff_t from);
1398extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1426extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1399extern qsize_t ext4_get_reserved_space(struct inode *inode); 1427extern qsize_t ext4_get_reserved_space(struct inode *inode);
1400 1428extern int flush_aio_dio_completed_IO(struct inode *inode);
1401/* ioctl.c */ 1429/* ioctl.c */
1402extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1430extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1403extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1431extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1727,8 @@ extern void ext4_ext_init(struct super_block *);
1699extern void ext4_ext_release(struct super_block *); 1727extern void ext4_ext_release(struct super_block *);
1700extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1728extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1701 loff_t len); 1729 loff_t len);
1730extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1731 loff_t len);
1702extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1732extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1703 sector_t block, unsigned int max_blocks, 1733 sector_t block, unsigned int max_blocks,
1704 struct buffer_head *bh, int flags); 1734 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
221} 221}
222 222
223static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
224{
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226}
227
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
235 struct ext4_ext_path *path, 240 struct ext4_ext_path *path,
236 struct ext4_extent *); 241 struct ext4_extent *);
237extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 242extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
238extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 243extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
239extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, 244extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
240 ext_prepare_callback, void *); 245 ext_prepare_callback, void *);
241extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 246extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
162int __ext4_journal_stop(const char *where, handle_t *handle); 162int __ext4_journal_stop(const char *where, handle_t *handle);
163 163
164#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) 164#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
165 165
166/* Note: Do not use this for NULL handles. This is only to determine if
167 * a properly allocated handle is using a journal or not. */
166static inline int ext4_handle_valid(handle_t *handle) 168static inline int ext4_handle_valid(handle_t *handle)
167{ 169{
168 if (handle == EXT4_NOJOURNAL_HANDLE) 170 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
169 return 0; 171 return 0;
170 return 1; 172 return 1;
171} 173}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..715264b4bae4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
723 * insert new index [@logical;@ptr] into the block at @curp; 723 * insert new index [@logical;@ptr] into the block at @curp;
724 * check where to insert: before @curp or after @curp 724 * check where to insert: before @curp or after @curp
725 */ 725 */
726static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
727 struct ext4_ext_path *curp, 727 struct ext4_ext_path *curp,
728 int logical, ext4_fsblk_t ptr) 728 int logical, ext4_fsblk_t ptr)
729{ 729{
@@ -1586,7 +1586,7 @@ out:
1586 */ 1586 */
1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1588 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1589 struct ext4_extent *newext) 1589 struct ext4_extent *newext, int flag)
1590{ 1590{
1591 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1592 struct ext4_extent *ex, *fex; 1592 struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1602 BUG_ON(path[depth].p_hdr == NULL); 1602 BUG_ON(path[depth].p_hdr == NULL);
1603 1603
1604 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) {
1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext), 1608 ext4_ext_is_uninitialized(newext),
1608 ext4_ext_get_actual_len(newext), 1609 ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
1722 1723
1723merge: 1724merge:
1724 /* try to merge extents to the right */ 1725 /* try to merge extents to the right */
1725 ext4_ext_try_to_merge(inode, path, nearex); 1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727 ext4_ext_try_to_merge(inode, path, nearex);
1726 1728
1727 /* try to merge extents to the left */ 1729 /* try to merge extents to the left */
1728 1730
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
2378 */ 2380 */
2379 2381
2380 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2382 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2383#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2381 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2384 printk(KERN_INFO "EXT4-fs: file extents enabled");
2382#ifdef AGGRESSIVE_TEST 2385#ifdef AGGRESSIVE_TEST
2383 printk(", aggressive tests"); 2386 printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
2389 printk(", stats"); 2392 printk(", stats");
2390#endif 2393#endif
2391 printk("\n"); 2394 printk("\n");
2395#endif
2392#ifdef EXTENTS_STATS 2396#ifdef EXTENTS_STATS
2393 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2397 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2394 EXT4_SB(sb)->s_ext_min = 1 << 30; 2398 EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2490} 2494}
2491 2495
2492#define EXT4_EXT_ZERO_LEN 7 2496#define EXT4_EXT_ZERO_LEN 7
2493
2494/* 2497/*
2495 * This function is called by ext4_ext_get_blocks() if someone tries to write 2498 * This function is called by ext4_ext_get_blocks() if someone tries to write
2496 * to an uninitialized extent. It may result in splitting the uninitialized 2499 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2583 ex3->ee_block = cpu_to_le32(iblock); 2586 ex3->ee_block = cpu_to_le32(iblock);
2584 ext4_ext_store_pblock(ex3, newblock); 2587 ext4_ext_store_pblock(ex3, newblock);
2585 ex3->ee_len = cpu_to_le16(allocated); 2588 ex3->ee_len = cpu_to_le16(allocated);
2586 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2589 err = ext4_ext_insert_extent(handle, inode, path,
2590 ex3, 0);
2587 if (err == -ENOSPC) { 2591 if (err == -ENOSPC) {
2588 err = ext4_ext_zeroout(inode, &orig_ex); 2592 err = ext4_ext_zeroout(inode, &orig_ex);
2589 if (err) 2593 if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2639 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2643 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2640 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2644 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2641 ext4_ext_mark_uninitialized(ex3); 2645 ext4_ext_mark_uninitialized(ex3);
2642 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2646 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2643 if (err == -ENOSPC) { 2647 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex); 2648 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err) 2649 if (err)
@@ -2757,7 +2761,192 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2757 err = ext4_ext_dirty(handle, inode, path + depth); 2761 err = ext4_ext_dirty(handle, inode, path + depth);
2758 goto out; 2762 goto out;
2759insert: 2763insert:
2760 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2764 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2765 if (err == -ENOSPC) {
2766 err = ext4_ext_zeroout(inode, &orig_ex);
2767 if (err)
2768 goto fix_extent_len;
2769 /* update the extent length and mark as initialized */
2770 ex->ee_block = orig_ex.ee_block;
2771 ex->ee_len = orig_ex.ee_len;
2772 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2773 ext4_ext_dirty(handle, inode, path + depth);
2774 /* zero out the first half */
2775 return allocated;
2776 } else if (err)
2777 goto fix_extent_len;
2778out:
2779 ext4_ext_show_leaf(inode, path);
2780 return err ? err : allocated;
2781
2782fix_extent_len:
2783 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2786 ext4_ext_mark_uninitialized(ex);
2787 ext4_ext_dirty(handle, inode, path + depth);
2788 return err;
2789}
2790
2791/*
2792 * This function is called by ext4_ext_get_blocks() from
2793 * ext4_get_blocks_dio_write() when DIO to write
2794 * to an uninitialized extent.
2795 *
2796 * Writing to an uninitized extent may result in splitting the uninitialized
2797 * extent into multiple /intialized unintialized extents (up to three)
2798 * There are three possibilities:
2799 * a> There is no split required: Entire extent should be uninitialized
2800 * b> Splits in two extents: Write is happening at either end of the extent
2801 * c> Splits in three extents: Somone is writing in middle of the extent
2802 *
2803 * One of more index blocks maybe needed if the extent tree grow after
2804 * the unintialized extent split. To prevent ENOSPC occur at the IO
2805 * complete, we need to split the uninitialized extent before DIO submit
2806 * the IO. The uninitilized extent called at this time will be split
2807 * into three uninitialized extent(at most). After IO complete, the part
2808 * being filled will be convert to initialized by the end_io callback function
2809 * via ext4_convert_unwritten_extents().
2810 *
2811 * Returns the size of uninitialized extent to be written on success.
2812 */
2813static int ext4_split_unwritten_extents(handle_t *handle,
2814 struct inode *inode,
2815 struct ext4_ext_path *path,
2816 ext4_lblk_t iblock,
2817 unsigned int max_blocks,
2818 int flags)
2819{
2820 struct ext4_extent *ex, newex, orig_ex;
2821 struct ext4_extent *ex1 = NULL;
2822 struct ext4_extent *ex2 = NULL;
2823 struct ext4_extent *ex3 = NULL;
2824 struct ext4_extent_header *eh;
2825 ext4_lblk_t ee_block;
2826 unsigned int allocated, ee_len, depth;
2827 ext4_fsblk_t newblock;
2828 int err = 0;
2829
2830 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2831 "iblock %llu, max_blocks %u\n", inode->i_ino,
2832 (unsigned long long)iblock, max_blocks);
2833 depth = ext_depth(inode);
2834 eh = path[depth].p_hdr;
2835 ex = path[depth].p_ext;
2836 ee_block = le32_to_cpu(ex->ee_block);
2837 ee_len = ext4_ext_get_actual_len(ex);
2838 allocated = ee_len - (iblock - ee_block);
2839 newblock = iblock - ee_block + ext_pblock(ex);
2840 ex2 = ex;
2841 orig_ex.ee_block = ex->ee_block;
2842 orig_ex.ee_len = cpu_to_le16(ee_len);
2843 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2844
2845 /*
2846 * If the uninitialized extent begins at the same logical
2847 * block where the write begins, and the write completely
2848 * covers the extent, then we don't need to split it.
2849 */
2850 if ((iblock == ee_block) && (allocated <= max_blocks))
2851 return allocated;
2852
2853 err = ext4_ext_get_access(handle, inode, path + depth);
2854 if (err)
2855 goto out;
2856 /* ex1: ee_block to iblock - 1 : uninitialized */
2857 if (iblock > ee_block) {
2858 ex1 = ex;
2859 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2860 ext4_ext_mark_uninitialized(ex1);
2861 ex2 = &newex;
2862 }
2863 /*
2864 * for sanity, update the length of the ex2 extent before
2865 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2866 * overlap of blocks.
2867 */
2868 if (!ex1 && allocated > max_blocks)
2869 ex2->ee_len = cpu_to_le16(max_blocks);
2870 /* ex3: to ee_block + ee_len : uninitialised */
2871 if (allocated > max_blocks) {
2872 unsigned int newdepth;
2873 ex3 = &newex;
2874 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2875 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2876 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2877 ext4_ext_mark_uninitialized(ex3);
2878 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2879 if (err == -ENOSPC) {
2880 err = ext4_ext_zeroout(inode, &orig_ex);
2881 if (err)
2882 goto fix_extent_len;
2883 /* update the extent length and mark as initialized */
2884 ex->ee_block = orig_ex.ee_block;
2885 ex->ee_len = orig_ex.ee_len;
2886 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2887 ext4_ext_dirty(handle, inode, path + depth);
2888 /* zeroed the full extent */
2889 /* blocks available from iblock */
2890 return allocated;
2891
2892 } else if (err)
2893 goto fix_extent_len;
2894 /*
2895 * The depth, and hence eh & ex might change
2896 * as part of the insert above.
2897 */
2898 newdepth = ext_depth(inode);
2899 /*
2900 * update the extent length after successful insert of the
2901 * split extent
2902 */
2903 orig_ex.ee_len = cpu_to_le16(ee_len -
2904 ext4_ext_get_actual_len(ex3));
2905 depth = newdepth;
2906 ext4_ext_drop_refs(path);
2907 path = ext4_ext_find_extent(inode, iblock, path);
2908 if (IS_ERR(path)) {
2909 err = PTR_ERR(path);
2910 goto out;
2911 }
2912 eh = path[depth].p_hdr;
2913 ex = path[depth].p_ext;
2914 if (ex2 != &newex)
2915 ex2 = ex;
2916
2917 err = ext4_ext_get_access(handle, inode, path + depth);
2918 if (err)
2919 goto out;
2920
2921 allocated = max_blocks;
2922 }
2923 /*
2924 * If there was a change of depth as part of the
2925 * insertion of ex3 above, we need to update the length
2926 * of the ex1 extent again here
2927 */
2928 if (ex1 && ex1 != ex) {
2929 ex1 = ex;
2930 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2931 ext4_ext_mark_uninitialized(ex1);
2932 ex2 = &newex;
2933 }
2934 /*
2935 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
2936 * uninitialised still.
2937 */
2938 ex2->ee_block = cpu_to_le32(iblock);
2939 ext4_ext_store_pblock(ex2, newblock);
2940 ex2->ee_len = cpu_to_le16(allocated);
2941 ext4_ext_mark_uninitialized(ex2);
2942 if (ex2 != ex)
2943 goto insert;
2944 /* Mark modified extent as dirty */
2945 err = ext4_ext_dirty(handle, inode, path + depth);
2946 ext_debug("out here\n");
2947 goto out;
2948insert:
2949 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2761 if (err == -ENOSPC) { 2950 if (err == -ENOSPC) {
2762 err = ext4_ext_zeroout(inode, &orig_ex); 2951 err = ext4_ext_zeroout(inode, &orig_ex);
2763 if (err) 2952 if (err)
@@ -2783,7 +2972,147 @@ fix_extent_len:
2783 ext4_ext_dirty(handle, inode, path + depth); 2972 ext4_ext_dirty(handle, inode, path + depth);
2784 return err; 2973 return err;
2785} 2974}
2975static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2976 struct inode *inode,
2977 struct ext4_ext_path *path)
2978{
2979 struct ext4_extent *ex;
2980 struct ext4_extent_header *eh;
2981 int depth;
2982 int err = 0;
2983 int ret = 0;
2984
2985 depth = ext_depth(inode);
2986 eh = path[depth].p_hdr;
2987 ex = path[depth].p_ext;
2988
2989 err = ext4_ext_get_access(handle, inode, path + depth);
2990 if (err)
2991 goto out;
2992 /* first mark the extent as initialized */
2993 ext4_ext_mark_initialized(ex);
2994
2995 /*
2996 * We have to see if it can be merged with the extent
2997 * on the left.
2998 */
2999 if (ex > EXT_FIRST_EXTENT(eh)) {
3000 /*
3001 * To merge left, pass "ex - 1" to try_to_merge(),
3002 * since it merges towards right _only_.
3003 */
3004 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3005 if (ret) {
3006 err = ext4_ext_correct_indexes(handle, inode, path);
3007 if (err)
3008 goto out;
3009 depth = ext_depth(inode);
3010 ex--;
3011 }
3012 }
3013 /*
3014 * Try to Merge towards right.
3015 */
3016 ret = ext4_ext_try_to_merge(inode, path, ex);
3017 if (ret) {
3018 err = ext4_ext_correct_indexes(handle, inode, path);
3019 if (err)
3020 goto out;
3021 depth = ext_depth(inode);
3022 }
3023 /* Mark modified extent as dirty */
3024 err = ext4_ext_dirty(handle, inode, path + depth);
3025out:
3026 ext4_ext_show_leaf(inode, path);
3027 return err;
3028}
3029
3030static int
3031ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3032 ext4_lblk_t iblock, unsigned int max_blocks,
3033 struct ext4_ext_path *path, int flags,
3034 unsigned int allocated, struct buffer_head *bh_result,
3035 ext4_fsblk_t newblock)
3036{
3037 int ret = 0;
3038 int err = 0;
3039 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3040
3041 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3042 "block %llu, max_blocks %u, flags %d, allocated %u",
3043 inode->i_ino, (unsigned long long)iblock, max_blocks,
3044 flags, allocated);
3045 ext4_ext_show_leaf(inode, path);
2786 3046
3047 /* DIO get_block() before submit the IO, split the extent */
3048 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3049 ret = ext4_split_unwritten_extents(handle,
3050 inode, path, iblock,
3051 max_blocks, flags);
3052 /*
3053 * Flag the inode(non aio case) or end_io struct (aio case)
3054 * that this IO needs to convertion to written when IO is
3055 * completed
3056 */
3057 if (io)
3058 io->flag = DIO_AIO_UNWRITTEN;
3059 else
3060 EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
3061 goto out;
3062 }
3063 /* async DIO end_io complete, convert the filled extent to written */
3064 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3065 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3066 path);
3067 goto out2;
3068 }
3069 /* buffered IO case */
3070 /*
3071 * repeat fallocate creation request
3072 * we already have an unwritten extent
3073 */
3074 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3075 goto map_out;
3076
3077 /* buffered READ or buffered write_begin() lookup */
3078 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3079 /*
3080 * We have blocks reserved already. We
3081 * return allocated blocks so that delalloc
3082 * won't do block reservation for us. But
3083 * the buffer head will be unmapped so that
3084 * a read from the block returns 0s.
3085 */
3086 set_buffer_unwritten(bh_result);
3087 goto out1;
3088 }
3089
3090 /* buffered write, writepage time, convert*/
3091 ret = ext4_ext_convert_to_initialized(handle, inode,
3092 path, iblock,
3093 max_blocks);
3094out:
3095 if (ret <= 0) {
3096 err = ret;
3097 goto out2;
3098 } else
3099 allocated = ret;
3100 set_buffer_new(bh_result);
3101map_out:
3102 set_buffer_mapped(bh_result);
3103out1:
3104 if (allocated > max_blocks)
3105 allocated = max_blocks;
3106 ext4_ext_show_leaf(inode, path);
3107 bh_result->b_bdev = inode->i_sb->s_bdev;
3108 bh_result->b_blocknr = newblock;
3109out2:
3110 if (path) {
3111 ext4_ext_drop_refs(path);
3112 kfree(path);
3113 }
3114 return err ? err : allocated;
3115}
2787/* 3116/*
2788 * Block allocation/map/preallocation routine for extents based files 3117 * Block allocation/map/preallocation routine for extents based files
2789 * 3118 *
@@ -2814,6 +3143,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2814 int err = 0, depth, ret, cache_type; 3143 int err = 0, depth, ret, cache_type;
2815 unsigned int allocated = 0; 3144 unsigned int allocated = 0;
2816 struct ext4_allocation_request ar; 3145 struct ext4_allocation_request ar;
3146 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
2817 3147
2818 __clear_bit(BH_New, &bh_result->b_state); 3148 __clear_bit(BH_New, &bh_result->b_state);
2819 ext_debug("blocks %u/%u requested for inode %lu\n", 3149 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3219,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2889 EXT4_EXT_CACHE_EXTENT); 3219 EXT4_EXT_CACHE_EXTENT);
2890 goto out; 3220 goto out;
2891 } 3221 }
2892 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3222 ret = ext4_ext_handle_uninitialized_extents(handle,
2893 goto out; 3223 inode, iblock, max_blocks, path,
2894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3224 flags, allocated, bh_result, newblock);
2895 if (allocated > max_blocks) 3225 return ret;
2896 allocated = max_blocks;
2897 /*
2898 * We have blocks reserved already. We
2899 * return allocated blocks so that delalloc
2900 * won't do block reservation for us. But
2901 * the buffer head will be unmapped so that
2902 * a read from the block returns 0s.
2903 */
2904 set_buffer_unwritten(bh_result);
2905 bh_result->b_bdev = inode->i_sb->s_bdev;
2906 bh_result->b_blocknr = newblock;
2907 goto out2;
2908 }
2909
2910 ret = ext4_ext_convert_to_initialized(handle, inode,
2911 path, iblock,
2912 max_blocks);
2913 if (ret <= 0) {
2914 err = ret;
2915 goto out2;
2916 } else
2917 allocated = ret;
2918 goto outnew;
2919 } 3226 }
2920 } 3227 }
2921 3228
@@ -2986,9 +3293,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2986 /* try to insert new extent into found leaf and return */ 3293 /* try to insert new extent into found leaf and return */
2987 ext4_ext_store_pblock(&newex, newblock); 3294 ext4_ext_store_pblock(&newex, newblock);
2988 newex.ee_len = cpu_to_le16(ar.len); 3295 newex.ee_len = cpu_to_le16(ar.len);
2989 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ 3296 /* Mark uninitialized */
3297 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
2990 ext4_ext_mark_uninitialized(&newex); 3298 ext4_ext_mark_uninitialized(&newex);
2991 err = ext4_ext_insert_extent(handle, inode, path, &newex); 3299 /*
3300 * io_end structure was created for every async
3301 * direct IO write to the middle of the file.
3302 * To avoid unecessary convertion for every aio dio rewrite
3303 * to the mid of file, here we flag the IO that is really
3304 * need the convertion.
3305 * For non asycn direct IO case, flag the inode state
3306 * that we need to perform convertion when IO is done.
3307 */
3308 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3309 if (io)
3310 io->flag = DIO_AIO_UNWRITTEN;
3311 else
3312 EXT4_I(inode)->i_state |=
3313 EXT4_STATE_DIO_UNWRITTEN;;
3314 }
3315 }
3316 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2992 if (err) { 3317 if (err) {
2993 /* free data blocks we just allocated */ 3318 /* free data blocks we just allocated */
2994 /* not a good idea to call discard here directly, 3319 /* not a good idea to call discard here directly,
@@ -3002,7 +3327,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3002 /* previous routine could use block we allocated */ 3327 /* previous routine could use block we allocated */
3003 newblock = ext_pblock(&newex); 3328 newblock = ext_pblock(&newex);
3004 allocated = ext4_ext_get_actual_len(&newex); 3329 allocated = ext4_ext_get_actual_len(&newex);
3005outnew:
3006 set_buffer_new(bh_result); 3330 set_buffer_new(bh_result);
3007 3331
3008 /* Cache only when it is _not_ an uninitialized extent */ 3332 /* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3525,64 @@ retry:
3201} 3525}
3202 3526
3203/* 3527/*
3528 * This function convert a range of blocks to written extents
3529 * The caller of this function will pass the start offset and the size.
3530 * all unwritten extents within this range will be converted to
3531 * written extents.
3532 *
3533 * This function is called from the direct IO end io call back
3534 * function, to convert the fallocated extents after IO is completed.
3535 * Returns 0 on success.
3536 */
3537int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3538 loff_t len)
3539{
3540 handle_t *handle;
3541 ext4_lblk_t block;
3542 unsigned int max_blocks;
3543 int ret = 0;
3544 int ret2 = 0;
3545 struct buffer_head map_bh;
3546 unsigned int credits, blkbits = inode->i_blkbits;
3547
3548 block = offset >> blkbits;
3549 /*
3550 * We can't just convert len to max_blocks because
3551 * If blocksize = 4096 offset = 3072 and len = 2048
3552 */
3553 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3554 - block;
3555 /*
3556 * credits to insert 1 extent into extent tree
3557 */
3558 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3559 while (ret >= 0 && ret < max_blocks) {
3560 block = block + ret;
3561 max_blocks = max_blocks - ret;
3562 handle = ext4_journal_start(inode, credits);
3563 if (IS_ERR(handle)) {
3564 ret = PTR_ERR(handle);
3565 break;
3566 }
3567 map_bh.b_state = 0;
3568 ret = ext4_get_blocks(handle, inode, block,
3569 max_blocks, &map_bh,
3570 EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3571 if (ret <= 0) {
3572 WARN_ON(ret <= 0);
3573 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3574 "returned error inode#%lu, block=%u, "
3575 "max_blocks=%u", __func__,
3576 inode->i_ino, block, max_blocks);
3577 }
3578 ext4_mark_inode_dirty(handle, inode);
3579 ret2 = ext4_journal_stop(handle);
3580 if (ret <= 0 || ret2 )
3581 break;
3582 }
3583 return ret > 0 ? ret2 : ret;
3584}
3585/*
3204 * Callback function called for each extent to gather FIEMAP information. 3586 * Callback function called for each extent to gather FIEMAP information.
3205 */ 3587 */
3206static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3588static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad581..2c8caa51addb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -192,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 193 * so before we call here everything must be consistently dirtied against
193 * this transaction. 194 * this transaction.
194 */ 195 */
195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, 196int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks) 197 int nblocks)
197{ 198{
198 int ret; 199 int ret;
@@ -208,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
208 up_write(&EXT4_I(inode)->i_data_sem); 209 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 210 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem); 211 down_write(&EXT4_I(inode)->i_data_sem);
212 ext4_discard_preallocations(inode);
211 213
212 return ret; 214 return ret;
213} 215}
@@ -1145,6 +1147,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1147}
1146 1148
1147/* 1149/*
1150 * Return the number of contiguous dirty pages in a given inode
1151 * starting at page frame idx.
1152 */
1153static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1154 unsigned int max_pages)
1155{
1156 struct address_space *mapping = inode->i_mapping;
1157 pgoff_t index;
1158 struct pagevec pvec;
1159 pgoff_t num = 0;
1160 int i, nr_pages, done = 0;
1161
1162 if (max_pages == 0)
1163 return 0;
1164 pagevec_init(&pvec, 0);
1165 while (!done) {
1166 index = idx;
1167 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1168 PAGECACHE_TAG_DIRTY,
1169 (pgoff_t)PAGEVEC_SIZE);
1170 if (nr_pages == 0)
1171 break;
1172 for (i = 0; i < nr_pages; i++) {
1173 struct page *page = pvec.pages[i];
1174 struct buffer_head *bh, *head;
1175
1176 lock_page(page);
1177 if (unlikely(page->mapping != mapping) ||
1178 !PageDirty(page) ||
1179 PageWriteback(page) ||
1180 page->index != idx) {
1181 done = 1;
1182 unlock_page(page);
1183 break;
1184 }
1185 if (page_has_buffers(page)) {
1186 bh = head = page_buffers(page);
1187 do {
1188 if (!buffer_delay(bh) &&
1189 !buffer_unwritten(bh))
1190 done = 1;
1191 bh = bh->b_this_page;
1192 } while (!done && (bh != head));
1193 }
1194 unlock_page(page);
1195 if (done)
1196 break;
1197 idx++;
1198 num++;
1199 if (num >= max_pages)
1200 break;
1201 }
1202 pagevec_release(&pvec);
1203 }
1204 return num;
1205}
1206
1207/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1208 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1209 * and returns if the blocks are already mapped.
1150 * 1210 *
@@ -1175,6 +1235,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1175 clear_buffer_mapped(bh); 1235 clear_buffer_mapped(bh);
1176 clear_buffer_unwritten(bh); 1236 clear_buffer_unwritten(bh);
1177 1237
1238 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1239 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1240 (unsigned long)block);
1178 /* 1241 /*
1179 * Try to see if we can get the block without requesting a new 1242 * Try to see if we can get the block without requesting a new
1180 * file system block. 1243 * file system block.
@@ -1796,11 +1859,11 @@ repeat:
1796 1859
1797 if (ext4_claim_free_blocks(sbi, total)) { 1860 if (ext4_claim_free_blocks(sbi, total)) {
1798 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1861 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1862 vfs_dq_release_reservation_block(inode, total);
1799 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1863 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1800 yield(); 1864 yield();
1801 goto repeat; 1865 goto repeat;
1802 } 1866 }
1803 vfs_dq_release_reservation_block(inode, total);
1804 return -ENOSPC; 1867 return -ENOSPC;
1805 } 1868 }
1806 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1869 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2155,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2092static void ext4_print_free_blocks(struct inode *inode) 2155static void ext4_print_free_blocks(struct inode *inode)
2093{ 2156{
2094 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2157 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2095 printk(KERN_EMERG "Total free blocks count %lld\n", 2158 printk(KERN_CRIT "Total free blocks count %lld\n",
2096 ext4_count_free_blocks(inode->i_sb)); 2159 ext4_count_free_blocks(inode->i_sb));
2097 printk(KERN_EMERG "Free/Dirty block details\n"); 2160 printk(KERN_CRIT "Free/Dirty block details\n");
2098 printk(KERN_EMERG "free_blocks=%lld\n", 2161 printk(KERN_CRIT "free_blocks=%lld\n",
2099 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2162 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2100 printk(KERN_EMERG "dirty_blocks=%lld\n", 2163 printk(KERN_CRIT "dirty_blocks=%lld\n",
2101 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2164 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2102 printk(KERN_EMERG "Block reservation details\n"); 2165 printk(KERN_CRIT "Block reservation details\n");
2103 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2166 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2104 EXT4_I(inode)->i_reserved_data_blocks); 2167 EXT4_I(inode)->i_reserved_data_blocks);
2105 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2168 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2106 EXT4_I(inode)->i_reserved_meta_blocks); 2169 EXT4_I(inode)->i_reserved_meta_blocks);
2107 return; 2170 return;
2108} 2171}
2109 2172
@@ -2189,14 +2252,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2189 * writepage and writepages will again try to write 2252 * writepage and writepages will again try to write
2190 * the same. 2253 * the same.
2191 */ 2254 */
2192 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2255 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2193 "at logical offset %llu with max blocks " 2256 "delayed block allocation failed for inode %lu at "
2194 "%zd with error %d\n", 2257 "logical offset %llu with max blocks %zd with "
2195 __func__, mpd->inode->i_ino, 2258 "error %d\n", mpd->inode->i_ino,
2196 (unsigned long long)next, 2259 (unsigned long long) next,
2197 mpd->b_size >> mpd->inode->i_blkbits, err); 2260 mpd->b_size >> mpd->inode->i_blkbits, err);
2198 printk(KERN_EMERG "This should not happen.!! " 2261 printk(KERN_CRIT "This should not happen!! "
2199 "Data will be lost\n"); 2262 "Data will be lost\n");
2200 if (err == -ENOSPC) { 2263 if (err == -ENOSPC) {
2201 ext4_print_free_blocks(mpd->inode); 2264 ext4_print_free_blocks(mpd->inode);
2202 } 2265 }
@@ -2743,8 +2806,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2806 int no_nrwrite_index_update;
2744 int pages_written = 0; 2807 int pages_written = 0;
2745 long pages_skipped; 2808 long pages_skipped;
2809 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2810 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2811 int needed_blocks, ret = 0;
2812 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2813 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2814 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2815
@@ -2771,16 +2836,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2836 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2837 return -EROFS;
2773 2838
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2839 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2840 range_whole = 1;
2786 2841
@@ -2795,6 +2850,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2850 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2851 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2852
2853 /*
2854 * This works around two forms of stupidity. The first is in
2855 * the writeback code, which caps the maximum number of pages
2856 * written to be 1024 pages. This is wrong on multiple
2857 * levels; different architectues have a different page size,
2858 * which changes the maximum amount of data which gets
2859 * written. Secondly, 4 megabytes is way too small. XFS
2860 * forces this value to be 16 megabytes by multiplying
2861 * nr_to_write parameter by four, and then relies on its
2862 * allocator to allocate larger extents to make them
2863 * contiguous. Unfortunately this brings us to the second
2864 * stupidity, which is that ext4's mballoc code only allocates
2865 * at most 2048 blocks. So we force contiguous writes up to
2866 * the number of dirty blocks in the inode, or
2867 * sbi->max_writeback_mb_bump whichever is smaller.
2868 */
2869 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2870 if (!range_cyclic && range_whole)
2871 desired_nr_to_write = wbc->nr_to_write * 8;
2872 else
2873 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2874 max_pages);
2875 if (desired_nr_to_write > max_pages)
2876 desired_nr_to_write = max_pages;
2877
2878 if (wbc->nr_to_write < desired_nr_to_write) {
2879 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2880 wbc->nr_to_write = desired_nr_to_write;
2881 }
2882
2798 mpd.wbc = wbc; 2883 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2884 mpd.inode = mapping->host;
2800 2885
@@ -2822,10 +2907,9 @@ retry:
2822 handle = ext4_journal_start(inode, needed_blocks); 2907 handle = ext4_journal_start(inode, needed_blocks);
2823 if (IS_ERR(handle)) { 2908 if (IS_ERR(handle)) {
2824 ret = PTR_ERR(handle); 2909 ret = PTR_ERR(handle);
2825 printk(KERN_CRIT "%s: jbd2_start: " 2910 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2826 "%ld pages, ino %lu; err %d\n", __func__, 2911 "%ld pages, ino %lu; err %d\n", __func__,
2827 wbc->nr_to_write, inode->i_ino, ret); 2912 wbc->nr_to_write, inode->i_ino, ret);
2828 dump_stack();
2829 goto out_writepages; 2913 goto out_writepages;
2830 } 2914 }
2831 2915
@@ -2897,9 +2981,10 @@ retry:
2897 goto retry; 2981 goto retry;
2898 } 2982 }
2899 if (pages_skipped != wbc->pages_skipped) 2983 if (pages_skipped != wbc->pages_skipped)
2900 printk(KERN_EMERG "This should not happen leaving %s " 2984 ext4_msg(inode->i_sb, KERN_CRIT,
2901 "with nr_to_write = %ld ret = %d\n", 2985 "This should not happen leaving %s "
2902 __func__, wbc->nr_to_write, ret); 2986 "with nr_to_write = %ld ret = %d\n",
2987 __func__, wbc->nr_to_write, ret);
2903 2988
2904 /* Update index */ 2989 /* Update index */
2905 index += pages_written; 2990 index += pages_written;
@@ -2914,7 +2999,8 @@ retry:
2914out_writepages: 2999out_writepages:
2915 if (!no_nrwrite_index_update) 3000 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 3001 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 3002 if (wbc->nr_to_write > nr_to_writebump)
3003 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 3004 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3005 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3006 return ret;
@@ -3272,6 +3358,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3272} 3358}
3273 3359
3274/* 3360/*
3361 * O_DIRECT for ext3 (or indirect map) based files
3362 *
3275 * If the O_DIRECT write will extend the file then add this inode to the 3363 * If the O_DIRECT write will extend the file then add this inode to the
3276 * orphan list. So recovery will truncate it back to the original size 3364 * orphan list. So recovery will truncate it back to the original size
3277 * if the machine crashes during the write. 3365 * if the machine crashes during the write.
@@ -3280,7 +3368,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3280 * crashes then stale disk data _may_ be exposed inside the file. But current 3368 * crashes then stale disk data _may_ be exposed inside the file. But current
3281 * VFS code falls back into buffered path in that case so we are safe. 3369 * VFS code falls back into buffered path in that case so we are safe.
3282 */ 3370 */
3283static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3371static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3284 const struct iovec *iov, loff_t offset, 3372 const struct iovec *iov, loff_t offset,
3285 unsigned long nr_segs) 3373 unsigned long nr_segs)
3286{ 3374{
@@ -3291,6 +3379,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 ssize_t ret; 3379 ssize_t ret;
3292 int orphan = 0; 3380 int orphan = 0;
3293 size_t count = iov_length(iov, nr_segs); 3381 size_t count = iov_length(iov, nr_segs);
3382 int retries = 0;
3294 3383
3295 if (rw == WRITE) { 3384 if (rw == WRITE) {
3296 loff_t final_size = offset + count; 3385 loff_t final_size = offset + count;
@@ -3313,9 +3402,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3313 } 3402 }
3314 } 3403 }
3315 3404
3405retry:
3316 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3406 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3317 offset, nr_segs, 3407 offset, nr_segs,
3318 ext4_get_block, NULL); 3408 ext4_get_block, NULL);
3409 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3410 goto retry;
3319 3411
3320 if (orphan) { 3412 if (orphan) {
3321 int err; 3413 int err;
@@ -3354,6 +3446,364 @@ out:
3354 return ret; 3446 return ret;
3355} 3447}
3356 3448
3449static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3450 struct buffer_head *bh_result, int create)
3451{
3452 handle_t *handle = NULL;
3453 int ret = 0;
3454 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3455 int dio_credits;
3456
3457 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3458 inode->i_ino, create);
3459 /*
3460 * DIO VFS code passes create = 0 flag for write to
3461 * the middle of file. It does this to avoid block
3462 * allocation for holes, to prevent expose stale data
3463 * out when there is parallel buffered read (which does
3464 * not hold the i_mutex lock) while direct IO write has
3465 * not completed. DIO request on holes finally falls back
3466 * to buffered IO for this reason.
3467 *
3468 * For ext4 extent based file, since we support fallocate,
3469 * new allocated extent as uninitialized, for holes, we
3470 * could fallocate blocks for holes, thus parallel
3471 * buffered IO read will zero out the page when read on
3472 * a hole while parallel DIO write to the hole has not completed.
3473 *
3474 * when we come here, we know it's a direct IO write to
3475 * to the middle of file (<i_size)
3476 * so it's safe to override the create flag from VFS.
3477 */
3478 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3479
3480 if (max_blocks > DIO_MAX_BLOCKS)
3481 max_blocks = DIO_MAX_BLOCKS;
3482 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3483 handle = ext4_journal_start(inode, dio_credits);
3484 if (IS_ERR(handle)) {
3485 ret = PTR_ERR(handle);
3486 goto out;
3487 }
3488 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3489 create);
3490 if (ret > 0) {
3491 bh_result->b_size = (ret << inode->i_blkbits);
3492 ret = 0;
3493 }
3494 ext4_journal_stop(handle);
3495out:
3496 return ret;
3497}
3498
3499static void ext4_free_io_end(ext4_io_end_t *io)
3500{
3501 BUG_ON(!io);
3502 iput(io->inode);
3503 kfree(io);
3504}
3505static void dump_aio_dio_list(struct inode * inode)
3506{
3507#ifdef EXT4_DEBUG
3508 struct list_head *cur, *before, *after;
3509 ext4_io_end_t *io, *io0, *io1;
3510
3511 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3512 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3513 return;
3514 }
3515
3516 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3517 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3518 cur = &io->list;
3519 before = cur->prev;
3520 io0 = container_of(before, ext4_io_end_t, list);
3521 after = cur->next;
3522 io1 = container_of(after, ext4_io_end_t, list);
3523
3524 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3525 io, inode->i_ino, io0, io1);
3526 }
3527#endif
3528}
3529
3530/*
3531 * check a range of space and convert unwritten extents to written.
3532 */
3533static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3534{
3535 struct inode *inode = io->inode;
3536 loff_t offset = io->offset;
3537 size_t size = io->size;
3538 int ret = 0;
3539
3540 ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
3541 "list->prev 0x%p\n",
3542 io, inode->i_ino, io->list.next, io->list.prev);
3543
3544 if (list_empty(&io->list))
3545 return ret;
3546
3547 if (io->flag != DIO_AIO_UNWRITTEN)
3548 return ret;
3549
3550 if (offset + size <= i_size_read(inode))
3551 ret = ext4_convert_unwritten_extents(inode, offset, size);
3552
3553 if (ret < 0) {
3554 printk(KERN_EMERG "%s: failed to convert unwritten"
3555 "extents to written extents, error is %d"
3556 " io is still on inode %lu aio dio list\n",
3557 __func__, ret, inode->i_ino);
3558 return ret;
3559 }
3560
3561 /* clear the DIO AIO unwritten flag */
3562 io->flag = 0;
3563 return ret;
3564}
3565/*
3566 * work on completed aio dio IO, to convert unwritten extents to extents
3567 */
3568static void ext4_end_aio_dio_work(struct work_struct *work)
3569{
3570 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3571 struct inode *inode = io->inode;
3572 int ret = 0;
3573
3574 mutex_lock(&inode->i_mutex);
3575 ret = ext4_end_aio_dio_nolock(io);
3576 if (ret >= 0) {
3577 if (!list_empty(&io->list))
3578 list_del_init(&io->list);
3579 ext4_free_io_end(io);
3580 }
3581 mutex_unlock(&inode->i_mutex);
3582}
3583/*
3584 * This function is called from ext4_sync_file().
3585 *
3586 * When AIO DIO IO is completed, the work to convert unwritten
3587 * extents to written is queued on workqueue but may not get immediately
3588 * scheduled. When fsync is called, we need to ensure the
3589 * conversion is complete before fsync returns.
3590 * The inode keeps track of a list of completed AIO from DIO path
3591 * that might needs to do the conversion. This function walks through
3592 * the list and convert the related unwritten extents to written.
3593 */
3594int flush_aio_dio_completed_IO(struct inode *inode)
3595{
3596 ext4_io_end_t *io;
3597 int ret = 0;
3598 int ret2 = 0;
3599
3600 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3601 return ret;
3602
3603 dump_aio_dio_list(inode);
3604 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3605 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3606 ext4_io_end_t, list);
3607 /*
3608 * Calling ext4_end_aio_dio_nolock() to convert completed
3609 * IO to written.
3610 *
3611 * When ext4_sync_file() is called, run_queue() may already
3612 * about to flush the work corresponding to this io structure.
3613 * It will be upset if it founds the io structure related
3614 * to the work-to-be schedule is freed.
3615 *
3616 * Thus we need to keep the io structure still valid here after
3617 * convertion finished. The io structure has a flag to
3618 * avoid double converting from both fsync and background work
3619 * queue work.
3620 */
3621 ret = ext4_end_aio_dio_nolock(io);
3622 if (ret < 0)
3623 ret2 = ret;
3624 else
3625 list_del_init(&io->list);
3626 }
3627 return (ret2 < 0) ? ret2 : 0;
3628}
3629
3630static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3631{
3632 ext4_io_end_t *io = NULL;
3633
3634 io = kmalloc(sizeof(*io), GFP_NOFS);
3635
3636 if (io) {
3637 igrab(inode);
3638 io->inode = inode;
3639 io->flag = 0;
3640 io->offset = 0;
3641 io->size = 0;
3642 io->error = 0;
3643 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3644 INIT_LIST_HEAD(&io->list);
3645 }
3646
3647 return io;
3648}
3649
3650static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3651 ssize_t size, void *private)
3652{
3653 ext4_io_end_t *io_end = iocb->private;
3654 struct workqueue_struct *wq;
3655
3656 /* if not async direct IO or dio with 0 bytes write, just return */
3657 if (!io_end || !size)
3658 return;
3659
3660 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3661 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3662 iocb->private, io_end->inode->i_ino, iocb, offset,
3663 size);
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
3679 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
3684/*
3685 * For ext4 extent files, ext4 will do direct-io write to holes,
3686 * preallocated extents, and those write extend the file, no need to
3687 * fall back to buffered IO.
3688 *
3689 * For holes, we fallocate those blocks, mark them as unintialized
3690 * If those blocks were preallocated, we mark sure they are splited, but
3691 * still keep the range to write as unintialized.
3692 *
3693 * The unwrritten extents will be converted to written when DIO is completed.
3694 * For async direct IO, since the IO may still pending when return, we
3695 * set up an end_io call back function, which will do the convertion
3696 * when async direct IO completed.
3697 *
3698 * If the O_DIRECT write will extend the file then add this inode to the
3699 * orphan list. So recovery will truncate it back to the original size
3700 * if the machine crashes during the write.
3701 *
3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
3714 /*
3715 * We could direct write to holes and fallocate.
3716 *
3717 * Allocated blocks to fill the hole are marked as uninitialized
3718 * to prevent paralel buffered read to expose the stale data
3719 * before DIO complete the data IO.
3720 *
3721 * As to previously fallocated extents, ext4 get_block
3722 * will just simply mark the buffer mapped but still
3723 * keep the extents uninitialized.
3724 *
3725 * for non AIO case, we will convert those unwritten extents
3726 * to written after return back from blockdev_direct_IO.
3727 *
3728 * for async DIO, the conversion needs to be defered when
3729 * the IO is completed. The ext4 end_io callback function
3730 * will be called to take care of the conversion work.
3731 * Here for async case, we allocate an io_end structure to
3732 * hook to the iocb.
3733 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
3740 /*
3741 * we save the io structure for current async
3742 * direct IO, so that later ext4_get_blocks()
3743 * could flag the io structure whether there
3744 * is a unwritten extents needs to be converted
3745 * when IO is completed.
3746 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
3757 /*
3758 * The io_end structure takes a reference to the inode,
3759 * that structure needs to be destroyed and the
3760 * reference to the inode need to be dropped, when IO is
3761 * complete, even with 0 byte write, or failed.
3762 *
3763 * In the successful AIO DIO case, the io_end structure will be
3764 * desctroyed and the reference to the inode will be dropped
3765 * after the end_io call back function is called.
3766 *
3767 * In the case there is 0 byte write, or error case, since
3768 * VFS direct IO won't invoke the end_io call back function,
3769 * we need to free the end_io structure here.
3770 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0 && (EXT4_I(inode)->i_state &
3775 EXT4_STATE_DIO_UNWRITTEN)) {
3776 int err;
3777 /*
3778 * for non AIO case, since the IO is already
3779 * completed, we could do the convertion right here
3780 */
3781 err = ext4_convert_unwritten_extents(inode,
3782 offset, ret);
3783 if (err < 0)
3784 ret = err;
3785 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
3786 }
3787 return ret;
3788 }
3789
3790 /* for write the the end of file case, we fall back to old way */
3791 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3792}
3793
3794static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3795 const struct iovec *iov, loff_t offset,
3796 unsigned long nr_segs)
3797{
3798 struct file *file = iocb->ki_filp;
3799 struct inode *inode = file->f_mapping->host;
3800
3801 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3802 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3803
3804 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3805}
3806
3357/* 3807/*
3358 * Pages can be marked dirty completely asynchronously from ext4's journalling 3808 * Pages can be marked dirty completely asynchronously from ext4's journalling
3359 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3809 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -4551,8 +5001,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4551 */ 5001 */
4552static int ext4_do_update_inode(handle_t *handle, 5002static int ext4_do_update_inode(handle_t *handle,
4553 struct inode *inode, 5003 struct inode *inode,
4554 struct ext4_iloc *iloc, 5004 struct ext4_iloc *iloc)
4555 int do_sync)
4556{ 5005{
4557 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5006 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4558 struct ext4_inode_info *ei = EXT4_I(inode); 5007 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4653,22 +5102,10 @@ static int ext4_do_update_inode(handle_t *handle,
4653 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5102 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4654 } 5103 }
4655 5104
4656 /* 5105 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4657 * If we're not using a journal and we were called from 5106 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4658 * ext4_write_inode() to sync the inode (making do_sync true), 5107 if (!err)
4659 * we can just use sync_dirty_buffer() directly to do our dirty 5108 err = rc;
4660 * work. Testing s_journal here is a bit redundant but it's
4661 * worth it to avoid potential future trouble.
4662 */
4663 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4664 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4665 sync_dirty_buffer(bh);
4666 } else {
4667 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4668 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4669 if (!err)
4670 err = rc;
4671 }
4672 ei->i_state &= ~EXT4_STATE_NEW; 5109 ei->i_state &= ~EXT4_STATE_NEW;
4673 5110
4674out_brelse: 5111out_brelse:
@@ -4736,8 +5173,16 @@ int ext4_write_inode(struct inode *inode, int wait)
4736 err = ext4_get_inode_loc(inode, &iloc); 5173 err = ext4_get_inode_loc(inode, &iloc);
4737 if (err) 5174 if (err)
4738 return err; 5175 return err;
4739 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, 5176 if (wait)
4740 inode, &iloc, wait); 5177 sync_dirty_buffer(iloc.bh);
5178 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5179 ext4_error(inode->i_sb, __func__,
5180 "IO error syncing inode, "
5181 "inode=%lu, block=%llu",
5182 inode->i_ino,
5183 (unsigned long long)iloc.bh->b_blocknr);
5184 err = -EIO;
5185 }
4741 } 5186 }
4742 return err; 5187 return err;
4743} 5188}
@@ -5033,7 +5478,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
5033 get_bh(iloc->bh); 5478 get_bh(iloc->bh);
5034 5479
5035 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5480 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5036 err = ext4_do_update_inode(handle, inode, iloc, 0); 5481 err = ext4_do_update_inode(handle, inode, iloc);
5037 put_bh(iloc->bh); 5482 put_bh(iloc->bh);
5038 return err; 5483 return err;
5039} 5484}
@@ -5177,27 +5622,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5177 */ 5622 */
5178void ext4_dirty_inode(struct inode *inode) 5623void ext4_dirty_inode(struct inode *inode)
5179{ 5624{
5180 handle_t *current_handle = ext4_journal_current_handle();
5181 handle_t *handle; 5625 handle_t *handle;
5182 5626
5183 if (!ext4_handle_valid(current_handle)) {
5184 ext4_mark_inode_dirty(current_handle, inode);
5185 return;
5186 }
5187
5188 handle = ext4_journal_start(inode, 2); 5627 handle = ext4_journal_start(inode, 2);
5189 if (IS_ERR(handle)) 5628 if (IS_ERR(handle))
5190 goto out; 5629 goto out;
5191 if (current_handle && 5630
5192 current_handle->h_transaction != handle->h_transaction) { 5631 ext4_mark_inode_dirty(handle, inode);
5193 /* This task has a transaction open against a different fs */ 5632
5194 printk(KERN_EMERG "%s: transactions do not match!\n",
5195 __func__);
5196 } else {
5197 jbd_debug(5, "marking dirty. outer handle=%p\n",
5198 current_handle);
5199 ext4_mark_inode_dirty(handle, inode);
5200 }
5201 ext4_journal_stop(handle); 5633 ext4_journal_stop(handle);
5202out: 5634out:
5203 return; 5635 return;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
2096 return err; 2096 return err;
2097} 2097}
2098 2098
2099#ifdef EXT4_MB_HISTORY
2100struct ext4_mb_proc_session {
2101 struct ext4_mb_history *history;
2102 struct super_block *sb;
2103 int start;
2104 int max;
2105};
2106
2107static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2108 struct ext4_mb_history *hs,
2109 int first)
2110{
2111 if (hs == s->history + s->max)
2112 hs = s->history;
2113 if (!first && hs == s->history + s->start)
2114 return NULL;
2115 while (hs->orig.fe_len == 0) {
2116 hs++;
2117 if (hs == s->history + s->max)
2118 hs = s->history;
2119 if (hs == s->history + s->start)
2120 return NULL;
2121 }
2122 return hs;
2123}
2124
2125static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2126{
2127 struct ext4_mb_proc_session *s = seq->private;
2128 struct ext4_mb_history *hs;
2129 int l = *pos;
2130
2131 if (l == 0)
2132 return SEQ_START_TOKEN;
2133 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2134 if (!hs)
2135 return NULL;
2136 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2137 return hs;
2138}
2139
2140static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2141 loff_t *pos)
2142{
2143 struct ext4_mb_proc_session *s = seq->private;
2144 struct ext4_mb_history *hs = v;
2145
2146 ++*pos;
2147 if (v == SEQ_START_TOKEN)
2148 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2149 else
2150 return ext4_mb_history_skip_empty(s, ++hs, 0);
2151}
2152
2153static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2154{
2155 char buf[25], buf2[25], buf3[25], *fmt;
2156 struct ext4_mb_history *hs = v;
2157
2158 if (v == SEQ_START_TOKEN) {
2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2161 "pid", "inode", "original", "goal", "result", "found",
2162 "grps", "cr", "flags", "merge", "tail", "broken");
2163 return 0;
2164 }
2165
2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2168 "0x%04x %-5s %-5u %-6u\n";
2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2170 hs->result.fe_start, hs->result.fe_len,
2171 hs->result.fe_logical);
2172 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2173 hs->orig.fe_start, hs->orig.fe_len,
2174 hs->orig.fe_logical);
2175 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2176 hs->goal.fe_start, hs->goal.fe_len,
2177 hs->goal.fe_logical);
2178 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2179 hs->found, hs->groups, hs->cr, hs->flags,
2180 hs->merged ? "M" : "", hs->tail,
2181 hs->buddy ? 1 << hs->buddy : 0);
2182 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2183 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2184 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2185 hs->result.fe_start, hs->result.fe_len,
2186 hs->result.fe_logical);
2187 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2188 hs->orig.fe_start, hs->orig.fe_len,
2189 hs->orig.fe_logical);
2190 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2191 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2192 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2193 hs->result.fe_start, hs->result.fe_len);
2194 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2195 hs->pid, hs->ino, buf2);
2196 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2197 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2198 hs->result.fe_start, hs->result.fe_len);
2199 seq_printf(seq, "%-5u %-8u %-23s free\n",
2200 hs->pid, hs->ino, buf2);
2201 }
2202 return 0;
2203}
2204
2205static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2206{
2207}
2208
2209static const struct seq_operations ext4_mb_seq_history_ops = {
2210 .start = ext4_mb_seq_history_start,
2211 .next = ext4_mb_seq_history_next,
2212 .stop = ext4_mb_seq_history_stop,
2213 .show = ext4_mb_seq_history_show,
2214};
2215
2216static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2217{
2218 struct super_block *sb = PDE(inode)->data;
2219 struct ext4_sb_info *sbi = EXT4_SB(sb);
2220 struct ext4_mb_proc_session *s;
2221 int rc;
2222 int size;
2223
2224 if (unlikely(sbi->s_mb_history == NULL))
2225 return -ENOMEM;
2226 s = kmalloc(sizeof(*s), GFP_KERNEL);
2227 if (s == NULL)
2228 return -ENOMEM;
2229 s->sb = sb;
2230 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2231 s->history = kmalloc(size, GFP_KERNEL);
2232 if (s->history == NULL) {
2233 kfree(s);
2234 return -ENOMEM;
2235 }
2236
2237 spin_lock(&sbi->s_mb_history_lock);
2238 memcpy(s->history, sbi->s_mb_history, size);
2239 s->max = sbi->s_mb_history_max;
2240 s->start = sbi->s_mb_history_cur % s->max;
2241 spin_unlock(&sbi->s_mb_history_lock);
2242
2243 rc = seq_open(file, &ext4_mb_seq_history_ops);
2244 if (rc == 0) {
2245 struct seq_file *m = (struct seq_file *)file->private_data;
2246 m->private = s;
2247 } else {
2248 kfree(s->history);
2249 kfree(s);
2250 }
2251 return rc;
2252
2253}
2254
2255static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2256{
2257 struct seq_file *seq = (struct seq_file *)file->private_data;
2258 struct ext4_mb_proc_session *s = seq->private;
2259 kfree(s->history);
2260 kfree(s);
2261 return seq_release(inode, file);
2262}
2263
2264static ssize_t ext4_mb_seq_history_write(struct file *file,
2265 const char __user *buffer,
2266 size_t count, loff_t *ppos)
2267{
2268 struct seq_file *seq = (struct seq_file *)file->private_data;
2269 struct ext4_mb_proc_session *s = seq->private;
2270 struct super_block *sb = s->sb;
2271 char str[32];
2272 int value;
2273
2274 if (count >= sizeof(str)) {
2275 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2276 "mb_history", (int)sizeof(str));
2277 return -EOVERFLOW;
2278 }
2279
2280 if (copy_from_user(str, buffer, count))
2281 return -EFAULT;
2282
2283 value = simple_strtol(str, NULL, 0);
2284 if (value < 0)
2285 return -ERANGE;
2286 EXT4_SB(sb)->s_mb_history_filter = value;
2287
2288 return count;
2289}
2290
2291static const struct file_operations ext4_mb_seq_history_fops = {
2292 .owner = THIS_MODULE,
2293 .open = ext4_mb_seq_history_open,
2294 .read = seq_read,
2295 .write = ext4_mb_seq_history_write,
2296 .llseek = seq_lseek,
2297 .release = ext4_mb_seq_history_release,
2298};
2299
2300static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2301{ 2100{
2302 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2396 .release = seq_release, 2195 .release = seq_release,
2397}; 2196};
2398 2197
2399static void ext4_mb_history_release(struct super_block *sb)
2400{
2401 struct ext4_sb_info *sbi = EXT4_SB(sb);
2402
2403 if (sbi->s_proc != NULL) {
2404 remove_proc_entry("mb_groups", sbi->s_proc);
2405 if (sbi->s_mb_history_max)
2406 remove_proc_entry("mb_history", sbi->s_proc);
2407 }
2408 kfree(sbi->s_mb_history);
2409}
2410
2411static void ext4_mb_history_init(struct super_block *sb)
2412{
2413 struct ext4_sb_info *sbi = EXT4_SB(sb);
2414 int i;
2415
2416 if (sbi->s_proc != NULL) {
2417 if (sbi->s_mb_history_max)
2418 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2419 &ext4_mb_seq_history_fops, sb);
2420 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2421 &ext4_mb_seq_groups_fops, sb);
2422 }
2423
2424 sbi->s_mb_history_cur = 0;
2425 spin_lock_init(&sbi->s_mb_history_lock);
2426 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2427 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2428 /* if we can't allocate history, then we simple won't use it */
2429}
2430
2431static noinline_for_stack void
2432ext4_mb_store_history(struct ext4_allocation_context *ac)
2433{
2434 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2435 struct ext4_mb_history h;
2436
2437 if (sbi->s_mb_history == NULL)
2438 return;
2439
2440 if (!(ac->ac_op & sbi->s_mb_history_filter))
2441 return;
2442
2443 h.op = ac->ac_op;
2444 h.pid = current->pid;
2445 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2446 h.orig = ac->ac_o_ex;
2447 h.result = ac->ac_b_ex;
2448 h.flags = ac->ac_flags;
2449 h.found = ac->ac_found;
2450 h.groups = ac->ac_groups_scanned;
2451 h.cr = ac->ac_criteria;
2452 h.tail = ac->ac_tail;
2453 h.buddy = ac->ac_buddy;
2454 h.merged = 0;
2455 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2456 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2457 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2458 h.merged = 1;
2459 h.goal = ac->ac_g_ex;
2460 h.result = ac->ac_f_ex;
2461 }
2462
2463 spin_lock(&sbi->s_mb_history_lock);
2464 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2465 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2466 sbi->s_mb_history_cur = 0;
2467 spin_unlock(&sbi->s_mb_history_lock);
2468}
2469
2470#else
2471#define ext4_mb_history_release(sb)
2472#define ext4_mb_history_init(sb)
2473#endif
2474
2475 2198
2476/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2477int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2690 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2691 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2692 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2693 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2694 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2695 2417
2696 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2708 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2709 } 2431 }
2710 2432
2711 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2712 2436
2713 if (sbi->s_journal) 2437 if (sbi->s_journal)
2714 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2715
2716 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2717 return 0; 2439 return 0;
2718} 2440}
2719 2441
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2790 } 2512 }
2791 2513
2792 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2793 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2794 2517
2795 return 0; 2518 return 0;
2796} 2519}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3276 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3277 } 3000 }
3278 3001
3279 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3280} 3006}
3281 3007
3282/* 3008/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3776 if (ac) { 3502 if (ac) {
3777 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3778 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3779 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3780 } 3505 }
3781 3506
3782 while (bit < end) { 3507 while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3796 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3797 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3798 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3799 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3800 } 3525 }
3801 3526
3802 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3831 ext4_group_t group; 3556 ext4_group_t group;
3832 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3833 3558
3834 if (ac)
3835 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3836
3837 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3838 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3839 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3848 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3849 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3850 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3851 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3852 } 3574 }
3853 3575
3854 return 0; 3576 return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits; 3913 >> bsbits;
4192 size = max(size, isize);
4193 3914
4194 if ((size == isize) && 3915 if ((size == isize) &&
4195 !ext4_fs_is_busy(sbi) && 3916 !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4199 } 3920 }
4200 3921
4201 /* don't use group allocation for large files */ 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
4202 if (size >= sbi->s_mb_stream_request) { 3924 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4204 return; 3926 return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4739 4461
4740 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4741 if (ac) { 4463 if (ac) {
4742 ac->ac_op = EXT4_MB_HISTORY_FREE;
4743 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4744 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4745 } 4466 }
@@ -4806,7 +4527,7 @@ do_more:
4806 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4807 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4808 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4809 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4810 } 4531 }
4811 4532
4812 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
52#define mb_debug(n, fmt, a...) 52#define mb_debug(n, fmt, a...)
53#endif 53#endif
54 54
55/*
56 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
57 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
58 */
59#define EXT4_MB_HISTORY
60#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
61#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
62#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
63#define EXT4_MB_HISTORY_FREE 8 /* free */
64
65#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
66 EXT4_MB_HISTORY_PREALLOC)
67 57
68/* 58/*
69 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
84 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
85 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
86 */ 76 */
87#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
88 78
89/* 79/*
90 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
217#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
218#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
219 209
220struct ext4_mb_history {
221 struct ext4_free_extent orig; /* orig allocation */
222 struct ext4_free_extent goal; /* goal allocation */
223 struct ext4_free_extent result; /* result allocation */
224 unsigned pid;
225 unsigned ino;
226 __u16 found; /* how many extents have been found */
227 __u16 groups; /* how many groups have been scanned */
228 __u16 tail; /* what tail broke some buddy */
229 __u16 buddy; /* buddy the tail ^^^ broke */
230 __u16 flags;
231 __u8 cr:3; /* which phase the result extent was found at */
232 __u8 op:4;
233 __u8 merged:1;
234};
235
236struct ext4_buddy { 210struct ext4_buddy {
237 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
238 void *bd_buddy; 212 void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
247#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
248#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
249 223
250#ifndef EXT4_MB_HISTORY
251static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
252{
253 return;
254}
255#endif
256
257#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
258 225
259static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
322 goto out; 322 goto out;
323 323
324 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
325 orig_path, new_ext)) 325 orig_path, new_ext, 0))
326 goto out; 326 goto out;
327 } 327 }
328 328
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
333 goto out; 333 goto out;
334 334
335 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
336 orig_path, end_ext)) 336 orig_path, end_ext, 0))
337 goto out; 337 goto out;
338 } 338 }
339out: 339out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
1001 return -EINVAL; 1001 return -EINVAL;
1002 } 1002 }
1003 1003
1004 /* orig and donor should be different file */
1005 if (orig_inode->i_ino == donor_inode->i_ino) {
1006 ext4_debug("ext4 move extent: The argument files should not "
1007 "be same file [ino:orig %lu, donor %lu]\n",
1008 orig_inode->i_ino, donor_inode->i_ino);
1009 return -EINVAL;
1010 }
1011
1012 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
1013 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
1014 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 int block_len_in_page; 1224 int block_len_in_page;
1233 int uninit; 1225 int uninit;
1234 1226
1227 /* orig and donor should be different file */
1228 if (orig_inode->i_ino == donor_inode->i_ino) {
1229 ext4_debug("ext4 move extent: The argument files should not "
1230 "be same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1235 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1237 if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..6d2c1b897fc7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1518 return retval; 1518 return retval;
1519 1519
1520 if (blocks == 1 && !dx_fallback && 1520 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { 1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1522 retval = make_indexed_dir(handle, dentry, inode, bh); 1522 return make_indexed_dir(handle, dentry, inode, bh);
1523 if (retval == -ENOSPC)
1524 brelse(bh);
1525 return retval;
1526 }
1527 brelse(bh); 1523 brelse(bh);
1528 } 1524 }
1529 bh = ext4_append(handle, dir, &block, &retval); 1525 bh = ext4_append(handle, dir, &block, &retval);
@@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1532 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1533 de->inode = 0; 1529 de->inode = 0;
1534 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1535 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1536 if (retval == -ENOSPC)
1537 brelse(bh);
1538 return retval;
1539} 1532}
1540 1533
1541/* 1534/*
@@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1664 if (!de) 1657 if (!de)
1665 goto cleanup; 1658 goto cleanup;
1666 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1667 if (err != -ENOSPC) 1660 bh = NULL;
1668 bh = NULL;
1669 goto cleanup; 1661 goto cleanup;
1670 1662
1671journal_error: 1663journal_error:
@@ -2076,7 +2068,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2076 struct ext4_iloc iloc; 2068 struct ext4_iloc iloc;
2077 int err = 0; 2069 int err = 0;
2078 2070
2079 if (!ext4_handle_valid(handle)) 2071 /* ext4_handle_valid() assumes a valid handle_t pointer */
2072 if (handle && !ext4_handle_valid(handle))
2080 return 0; 2073 return 0;
2081 2074
2082 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2075 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..d4ca92aab514 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
50#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
52 52
53static int default_mb_history_length = 1000;
54
55module_param_named(default_mb_history_length, default_mb_history_length,
56 int, 0644);
57MODULE_PARM_DESC(default_mb_history_length,
58 "Default number of entries saved for mb_history");
59
60struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
61static struct kset *ext4_kset; 54static struct kset *ext4_kset;
62 55
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
189 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
190} 183}
191 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
214
192/* 215/*
193 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
194 * 217 *
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
215 } 238 }
216 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
217 } 240 }
218 /* 241 return ext4_get_nojournal();
219 * We're not journaling, return the appropriate indication.
220 */
221 current->journal_info = EXT4_NOJOURNAL_HANDLE;
222 return current->journal_info;
223} 242}
224 243
225/* 244/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
235 int rc; 254 int rc;
236 255
237 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
238 /* 257 ext4_put_nojournal(handle);
239 * Do this here since we don't call jbd2_journal_stop() in
240 * no-journal mode.
241 */
242 current->journal_info = NULL;
243 return 0; 258 return 0;
244 } 259 }
245 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 596 int i, err;
582 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
583 lock_super(sb); 601 lock_super(sb);
584 lock_kernel(); 602 lock_kernel();
585 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
684 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
685 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
686 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
687 707
688 return &ei->vfs_inode; 708 return &ei->vfs_inode;
689} 709}
@@ -1052,7 +1072,7 @@ enum {
1052 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1053 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1054 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1055 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1056 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1057 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1058 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
1099 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1100 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1101 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1102 {Opt_mb_history_length, "mb_history_length=%u"},
1103 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1104 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1105 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1281,9 +1300,11 @@ static int parse_options(char *options, struct super_block *sb,
1281 *journal_devnum = option; 1300 *journal_devnum = option;
1282 break; 1301 break;
1283 case Opt_journal_checksum: 1302 case Opt_journal_checksum:
1284 break; /* Kept for backwards compatibility */ 1303 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1304 break;
1285 case Opt_journal_async_commit: 1305 case Opt_journal_async_commit:
1286 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1306 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1307 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1287 break; 1308 break;
1288 case Opt_noload: 1309 case Opt_noload:
1289 set_opt(sbi->s_mount_opt, NOLOAD); 1310 set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1340,13 +1361,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1361 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1362 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1363 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1364#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1365 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1366 qtype = USRQUOTA;
@@ -1646,13 +1660,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1660 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1661 sbi->s_mount_opt);
1648 1662
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1663 return res;
1657} 1664}
1658 1665
@@ -2197,6 +2204,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2204EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2205EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2206EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2207EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2208
2201static struct attribute *ext4_attrs[] = { 2209static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2210 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2218,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2218 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2219 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2220 ATTR_LIST(mb_group_prealloc),
2221 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2222 NULL,
2214}; 2223};
2215 2224
@@ -2413,7 +2422,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2413 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2422 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2414 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2423 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2415 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2424 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2416 sbi->s_mb_history_max = default_mb_history_length;
2417 2425
2418 set_opt(sbi->s_mount_opt, BARRIER); 2426 set_opt(sbi->s_mount_opt, BARRIER);
2419 2427
@@ -2679,6 +2687,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2687 }
2680 2688
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2689 sbi->s_stripe = ext4_get_stripe_size(sbi);
2690 sbi->s_max_writeback_mb_bump = 128;
2682 2691
2683 /* 2692 /*
2684 * set up enough so that it can read an inode 2693 * set up enough so that it can read an inode
@@ -2752,14 +2761,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2752 goto failed_mount4; 2761 goto failed_mount4;
2753 } 2762 }
2754 2763
2755 jbd2_journal_set_features(sbi->s_journal, 2764 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
2756 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); 2765 jbd2_journal_set_features(sbi->s_journal,
2757 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 2766 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2758 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2759 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2767 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2760 else 2768 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
2769 jbd2_journal_set_features(sbi->s_journal,
2770 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2761 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2771 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2762 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2772 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2773 } else {
2774 jbd2_journal_clear_features(sbi->s_journal,
2775 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2776 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2777 }
2763 2778
2764 /* We have now updated the journal if required, so we can 2779 /* We have now updated the journal if required, so we can
2765 * validate the data journaling mode. */ 2780 * validate the data journaling mode. */
@@ -2798,6 +2813,12 @@ no_journal:
2798 clear_opt(sbi->s_mount_opt, NOBH); 2813 clear_opt(sbi->s_mount_opt, NOBH);
2799 } 2814 }
2800 } 2815 }
2816 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2817 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2818 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2819 goto failed_mount_wq;
2820 }
2821
2801 /* 2822 /*
2802 * The jbd2_journal_load will have done any necessary log recovery, 2823 * The jbd2_journal_load will have done any necessary log recovery,
2803 * so we can safely mount the rest of the filesystem now. 2824 * so we can safely mount the rest of the filesystem now.
@@ -2849,12 +2870,12 @@ no_journal:
2849 "available"); 2870 "available");
2850 } 2871 }
2851 2872
2852 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2873 if (test_opt(sb, DELALLOC) &&
2874 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2853 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2875 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2854 "requested data journaling mode"); 2876 "requested data journaling mode");
2855 clear_opt(sbi->s_mount_opt, DELALLOC); 2877 clear_opt(sbi->s_mount_opt, DELALLOC);
2856 } else if (test_opt(sb, DELALLOC)) 2878 }
2857 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2858 2879
2859 err = ext4_setup_system_zone(sb); 2880 err = ext4_setup_system_zone(sb);
2860 if (err) { 2881 if (err) {
@@ -2910,6 +2931,8 @@ cantfind_ext4:
2910 2931
2911failed_mount4: 2932failed_mount4:
2912 ext4_msg(sb, KERN_ERR, "mount failed"); 2933 ext4_msg(sb, KERN_ERR, "mount failed");
2934 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2935failed_mount_wq:
2913 ext4_release_system_zone(sb); 2936 ext4_release_system_zone(sb);
2914 if (sbi->s_journal) { 2937 if (sbi->s_journal) {
2915 jbd2_journal_destroy(sbi->s_journal); 2938 jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3187,7 @@ static int ext4_load_journal(struct super_block *sb,
3164 return -EINVAL; 3187 return -EINVAL;
3165 } 3188 }
3166 3189
3167 if (journal->j_flags & JBD2_BARRIER) 3190 if (!(journal->j_flags & JBD2_BARRIER))
3168 ext4_msg(sb, KERN_INFO, "barriers enabled");
3169 else
3170 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3191 ext4_msg(sb, KERN_INFO, "barriers disabled");
3171 3192
3172 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3193 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3382,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3361{ 3382{
3362 int ret = 0; 3383 int ret = 0;
3363 tid_t target; 3384 tid_t target;
3385 struct ext4_sb_info *sbi = EXT4_SB(sb);
3364 3386
3365 trace_ext4_sync_fs(sb, wait); 3387 trace_ext4_sync_fs(sb, wait);
3366 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3388 flush_workqueue(sbi->dio_unwritten_wq);
3389 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3367 if (wait) 3390 if (wait)
3368 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3391 jbd2_log_wait_commit(sbi->s_journal, target);
3369 } 3392 }
3370 return ret; 3393 return ret;
3371} 3394}
@@ -3951,27 +3974,6 @@ static struct file_system_type ext4_fs_type = {
3951 .fs_flags = FS_REQUIRES_DEV, 3974 .fs_flags = FS_REQUIRES_DEV,
3952}; 3975};
3953 3976
3954#ifdef CONFIG_EXT4DEV_COMPAT
3955static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3956 const char *dev_name, void *data,struct vfsmount *mnt)
3957{
3958 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3959 "to mount using ext4\n", dev_name);
3960 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3961 "will go away by 2.6.31\n", dev_name);
3962 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3963}
3964
3965static struct file_system_type ext4dev_fs_type = {
3966 .owner = THIS_MODULE,
3967 .name = "ext4dev",
3968 .get_sb = ext4dev_get_sb,
3969 .kill_sb = kill_block_super,
3970 .fs_flags = FS_REQUIRES_DEV,
3971};
3972MODULE_ALIAS("ext4dev");
3973#endif
3974
3975static int __init init_ext4_fs(void) 3977static int __init init_ext4_fs(void)
3976{ 3978{
3977 int err; 3979 int err;
@@ -3996,13 +3998,6 @@ static int __init init_ext4_fs(void)
3996 err = register_filesystem(&ext4_fs_type); 3998 err = register_filesystem(&ext4_fs_type);
3997 if (err) 3999 if (err)
3998 goto out; 4000 goto out;
3999#ifdef CONFIG_EXT4DEV_COMPAT
4000 err = register_filesystem(&ext4dev_fs_type);
4001 if (err) {
4002 unregister_filesystem(&ext4_fs_type);
4003 goto out;
4004 }
4005#endif
4006 return 0; 4001 return 0;
4007out: 4002out:
4008 destroy_inodecache(); 4003 destroy_inodecache();
@@ -4021,9 +4016,6 @@ out4:
4021static void __exit exit_ext4_fs(void) 4016static void __exit exit_ext4_fs(void)
4022{ 4017{
4023 unregister_filesystem(&ext4_fs_type); 4018 unregister_filesystem(&ext4_fs_type);
4024#ifdef CONFIG_EXT4DEV_COMPAT
4025 unregister_filesystem(&ext4dev_fs_type);
4026#endif
4027 destroy_inodecache(); 4019 destroy_inodecache();
4028 exit_ext4_xattr(); 4020 exit_ext4_xattr();
4029 exit_ext4_mballoc(); 4021 exit_ext4_mballoc();