Diffstat (limited to 'fs/ext4')
 fs/ext4/Kconfig        |  14
 fs/ext4/ext4.h         |  54
 fs/ext4/ext4_extents.h |   7
 fs/ext4/ext4_jbd2.h    |   6
 fs/ext4/extents.c      | 444
 fs/ext4/fsync.c        |   5
 fs/ext4/inode.c        | 578
 fs/ext4/mballoc.c      | 305
 fs/ext4/mballoc.h      |  35
 fs/ext4/migrate.c      |   2
 fs/ext4/move_extent.c  |  20
 fs/ext4/namei.c        |   3
 fs/ext4/super.c        | 130
 13 files changed, 1048 insertions(+), 555 deletions(-)
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4DEV_COMPAT
30 bool "Enable ext4dev compatibility"
31 depends on EXT4_FS
32 help
33 Starting with 2.6.28, the name of the ext4 filesystem was
34 renamed from ext4dev to ext4. Unfortunately there are some
35 legacy userspace programs (such as klibc's fstype) have
36 "ext4dev" hardcoded.
37
38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev,
40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed!
42
43config EXT4_FS_XATTR 29config EXT4_FS_XATTR
44 bool "Ext4 extended attributes" 30 bool "Ext4 extended attributes"
45 depends on EXT4_FS 31 depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..984ca0cb38c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
65/* data type for block group number */ 65/* data type for block group number */
66typedef unsigned int ext4_group_t; 66typedef unsigned int ext4_group_t;
67 67
68/*
69 * Flags used in mballoc's allocation_context flags field.
70 *
71 * Also used to show what's going on for debugging purposes when the
72 * flag field is exported via the tracepoint interface
73 */
68 74
69/* prefer goal again. length */ 75/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 0x0001 76#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
127 int pages_written; 133 int pages_written;
128 int retval; 134 int retval;
129}; 135};
136#define DIO_AIO_UNWRITTEN 0x1
137typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */
142 ext4_lblk_t offset; /* offset in the file */
143 size_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */
145} ext4_io_end_t;
130 146
131/* 147/*
132 * Special inodes numbers 148 * Special inodes numbers
@@ -347,7 +363,16 @@ struct ext4_new_group_data {
347 /* Call ext4_da_update_reserve_space() after successfully 363 /* Call ext4_da_update_reserve_space() after successfully
348 allocating the blocks */ 364 allocating the blocks */
349#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 365#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
350 366 /* caller is from the direct IO path; request creation of an
367 uninitialized extent if not allocated, and split the uninitialized
368 extent if blocks have been preallocated already */
369#define EXT4_GET_BLOCKS_DIO 0x0010
370#define EXT4_GET_BLOCKS_CONVERT 0x0020
371#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
372 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
373 /* Convert extent to initialized after direct IO completes */
374#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
375 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
351 376
352/* 377/*
353 * ioctl commands 378 * ioctl commands
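To see how the new composite direct IO modes decompose into base flags, here is a minimal user-space sketch. Only EXT4_GET_BLOCKS_DIO and EXT4_GET_BLOCKS_CONVERT come from the hunk above; the base flag values (CREATE, UNINIT_EXT and their combination) are assumed from the surrounding ext4.h of this era.

#include <stdio.h>

#define EXT4_GET_BLOCKS_CREATE			0x0001	/* assumed base value */
#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002	/* assumed base value */
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT | \
						 EXT4_GET_BLOCKS_CREATE)
#define EXT4_GET_BLOCKS_DIO			0x0010	/* from the hunk above */
#define EXT4_GET_BLOCKS_CONVERT			0x0020	/* from the hunk above */
#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO | \
						 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT | \
						 EXT4_GET_BLOCKS_DIO_CREATE_EXT)

int main(void)
{
	/* composed modes, not single bits */
	printf("DIO_CREATE_EXT  = 0x%04x\n", EXT4_GET_BLOCKS_DIO_CREATE_EXT);
	printf("DIO_CONVERT_EXT = 0x%04x\n", EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
	return 0;
}

Because the composite values are full modes rather than single bits, the extents code later in this patch compares flags with == against EXT4_GET_BLOCKS_DIO_CREATE_EXT instead of testing an individual bit.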
@@ -500,8 +525,8 @@ struct move_extent {
500static inline __le32 ext4_encode_extra_time(struct timespec *time) 525static inline __le32 ext4_encode_extra_time(struct timespec *time)
501{ 526{
502 return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 527 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
503 time->tv_sec >> 32 : 0) | 528 (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
504 ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); 529 ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
505} 530}
506 531
507static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 532static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
509 if (sizeof(time->tv_sec) > 4) 534 if (sizeof(time->tv_sec) > 4)
510 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) 535 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
511 << 32; 536 << 32;
512 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; 537 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
513} 538}
514 539
515#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ 540#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
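The change above matters for post-2038 timestamps: without the EXT4_EPOCH_MASK masking, high bits of tv_sec >> 32 could spill into the nanosecond field, and the literal shift of 2 is now spelled EXT4_EPOCH_BITS. A self-contained round-trip sketch, assuming this era's values (EXT4_EPOCH_BITS = 2, so the low two bits of the extra field extend the epoch and the remaining bits hold nanoseconds):

#include <stdio.h>
#include <stdint.h>

#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK  (~0U << EXT4_EPOCH_BITS)

static uint32_t encode_extra(int64_t sec, uint32_t nsec)
{
	/* epoch bits 0-1 extend tv_sec beyond 2038; nsec lives in bits 2-31 */
	return (uint32_t)((sec >> 32) & EXT4_EPOCH_MASK) |
	       ((nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK);
}

static void decode_extra(uint32_t extra, int64_t *sec, uint32_t *nsec)
{
	*sec |= (int64_t)(extra & EXT4_EPOCH_MASK) << 32;
	*nsec = (extra & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}

int main(void)
{
	int64_t sec = ((int64_t)1 << 32) + 12345;  /* a post-2038 second count */
	uint32_t extra = encode_extra(sec, 999999999);
	int64_t out_sec = (uint32_t)sec;           /* low 32 bits, as stored on disk */
	uint32_t out_nsec;

	decode_extra(extra, &out_sec, &out_nsec);
	printf("sec: %lld -> %lld, nsec: %u\n",
	       (long long)sec, (long long)out_sec, out_nsec);
	return 0;
}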
@@ -672,6 +697,11 @@ struct ext4_inode_info {
672 __u16 i_extra_isize; 697 __u16 i_extra_isize;
673 698
674 spinlock_t i_block_reservation_lock; 699 spinlock_t i_block_reservation_lock;
700
701 /* completed async DIOs that might need unwritten extents handling */
702 struct list_head i_aio_dio_complete_list;
703 /* current io_end structure for async DIO write*/
704 ext4_io_end_t *cur_aio_dio;
675}; 705};
676 706
677/* 707/*
@@ -942,18 +972,11 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 972 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 973 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 974 unsigned int s_mb_group_prealloc;
975 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 976 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 977 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 978 unsigned long s_mb_last_start;
948 979
949 /* history to debug policy */
950 struct ext4_mb_history *s_mb_history;
951 int s_mb_history_cur;
952 int s_mb_history_max;
953 int s_mb_history_num;
954 spinlock_t s_mb_history_lock;
955 int s_mb_history_filter;
956
957 /* stats for buddy allocator */ 980 /* stats for buddy allocator */
958 spinlock_t s_mb_pa_lock; 981 spinlock_t s_mb_pa_lock;
959 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 982 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1003,9 @@ struct ext4_sb_info {
980 1003
981 unsigned int s_log_groups_per_flex; 1004 unsigned int s_log_groups_per_flex;
982 struct flex_groups *s_flex_groups; 1005 struct flex_groups *s_flex_groups;
1006
1007 /* workqueue for dio unwritten */
1008 struct workqueue_struct *dio_unwritten_wq;
983}; 1009};
984 1010
985static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1011static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1423,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1397 struct address_space *mapping, loff_t from); 1423 struct address_space *mapping, loff_t from);
1398extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1424extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1399extern qsize_t ext4_get_reserved_space(struct inode *inode); 1425extern qsize_t ext4_get_reserved_space(struct inode *inode);
1400 1426extern int flush_aio_dio_completed_IO(struct inode *inode);
1401/* ioctl.c */ 1427/* ioctl.c */
1402extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1428extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1403extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1429extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1725,8 @@ extern void ext4_ext_init(struct super_block *);
1699extern void ext4_ext_release(struct super_block *); 1725extern void ext4_ext_release(struct super_block *);
1700extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1726extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1701 loff_t len); 1727 loff_t len);
1728extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1729 loff_t len);
1702extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1730extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1703 sector_t block, unsigned int max_blocks, 1731 sector_t block, unsigned int max_blocks,
1704 struct buffer_head *bh, int flags); 1732 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
221} 221}
222 222
223static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
224{
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226}
227
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
235 struct ext4_ext_path *path, 240 struct ext4_ext_path *path,
236 struct ext4_extent *); 241 struct ext4_extent *);
237extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 242extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
238extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 243extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
239extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, 244extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
240 ext_prepare_callback, void *); 245 ext_prepare_callback, void *);
241extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 246extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
162int __ext4_journal_stop(const char *where, handle_t *handle); 162int __ext4_journal_stop(const char *where, handle_t *handle);
163 163
164#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) 164#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
165 165
166/* Note: Do not use this for NULL handles. This is only to determine if
167 * a properly allocated handle is using a journal or not. */
166static inline int ext4_handle_valid(handle_t *handle) 168static inline int ext4_handle_valid(handle_t *handle)
167{ 169{
168 if (handle == EXT4_NOJOURNAL_HANDLE) 170 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
169 return 0; 171 return 0;
170 return 1; 172 return 1;
171} 173}
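The single sentinel handle is replaced by a range check so that the no-journal path can encode a nesting reference count directly in the fake handle value; the refcounting itself lives in a companion change not shown in this hunk. A hypothetical user-space sketch of the idea:

#include <stdio.h>
#include <stdint.h>

typedef struct handle_s handle_t;	/* opaque, as in the kernel */

#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

static int ext4_handle_valid(handle_t *handle)
{
	/* any value below the ceiling is a fake no-journal handle */
	if ((uintptr_t)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
		return 0;
	return 1;
}

/* hypothetical helper: nested "starts" just bump the fake handle value */
static handle_t *nojournal_start(handle_t *h)
{
	return (handle_t *)((uintptr_t)h + 1);	/* NULL -> 1 -> 2 -> ... */
}

int main(void)
{
	handle_t *h = nojournal_start(NULL);	/* outermost start */
	h = nojournal_start(h);			/* nested start */
	printf("journaled? %d\n", ext4_handle_valid(h));	/* prints 0 */
	return 0;
}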
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..10539e364283 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
723 * insert new index [@logical;@ptr] into the block at @curp; 723 * insert new index [@logical;@ptr] into the block at @curp;
724 * check where to insert: before @curp or after @curp 724 * check where to insert: before @curp or after @curp
725 */ 725 */
726static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
727 struct ext4_ext_path *curp, 727 struct ext4_ext_path *curp,
728 int logical, ext4_fsblk_t ptr) 728 int logical, ext4_fsblk_t ptr)
729{ 729{
@@ -1586,7 +1586,7 @@ out:
1586 */ 1586 */
1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1588 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1589 struct ext4_extent *newext) 1589 struct ext4_extent *newext, int flag)
1590{ 1590{
1591 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1592 struct ext4_extent *ex, *fex; 1592 struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1602 BUG_ON(path[depth].p_hdr == NULL); 1602 BUG_ON(path[depth].p_hdr == NULL);
1603 1603
1604 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) {
1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext), 1608 ext4_ext_is_uninitialized(newext),
1608 ext4_ext_get_actual_len(newext), 1609 ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
1722 1723
1723merge: 1724merge:
1724 /* try to merge extents to the right */ 1725 /* try to merge extents to the right */
1725 ext4_ext_try_to_merge(inode, path, nearex); 1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727 ext4_ext_try_to_merge(inode, path, nearex);
1726 1728
1727 /* try to merge extents to the left */ 1729 /* try to merge extents to the left */
1728 1730
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
2378 */ 2380 */
2379 2381
2380 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2382 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2383#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2381 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2384 printk(KERN_INFO "EXT4-fs: file extents enabled");
2382#ifdef AGGRESSIVE_TEST 2385#ifdef AGGRESSIVE_TEST
2383 printk(", aggressive tests"); 2386 printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
2389 printk(", stats"); 2392 printk(", stats");
2390#endif 2393#endif
2391 printk("\n"); 2394 printk("\n");
2395#endif
2392#ifdef EXTENTS_STATS 2396#ifdef EXTENTS_STATS
2393 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2397 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2394 EXT4_SB(sb)->s_ext_min = 1 << 30; 2398 EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2490} 2494}
2491 2495
2492#define EXT4_EXT_ZERO_LEN 7 2496#define EXT4_EXT_ZERO_LEN 7
2493
2494/* 2497/*
2495 * This function is called by ext4_ext_get_blocks() if someone tries to write 2498 * This function is called by ext4_ext_get_blocks() if someone tries to write
2496 * to an uninitialized extent. It may result in splitting the uninitialized 2499 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2583 ex3->ee_block = cpu_to_le32(iblock); 2586 ex3->ee_block = cpu_to_le32(iblock);
2584 ext4_ext_store_pblock(ex3, newblock); 2587 ext4_ext_store_pblock(ex3, newblock);
2585 ex3->ee_len = cpu_to_le16(allocated); 2588 ex3->ee_len = cpu_to_le16(allocated);
2586 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2589 err = ext4_ext_insert_extent(handle, inode, path,
2590 ex3, 0);
2587 if (err == -ENOSPC) { 2591 if (err == -ENOSPC) {
2588 err = ext4_ext_zeroout(inode, &orig_ex); 2592 err = ext4_ext_zeroout(inode, &orig_ex);
2589 if (err) 2593 if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2639 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2643 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2640 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2644 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2641 ext4_ext_mark_uninitialized(ex3); 2645 ext4_ext_mark_uninitialized(ex3);
2642 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2646 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2643 if (err == -ENOSPC) { 2647 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex); 2648 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err) 2649 if (err)
@@ -2757,7 +2761,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2757 err = ext4_ext_dirty(handle, inode, path + depth); 2761 err = ext4_ext_dirty(handle, inode, path + depth);
2758 goto out; 2762 goto out;
2759insert: 2763insert:
2760 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2764 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2761 if (err == -ENOSPC) { 2765 if (err == -ENOSPC) {
2762 err = ext4_ext_zeroout(inode, &orig_ex); 2766 err = ext4_ext_zeroout(inode, &orig_ex);
2763 if (err) 2767 if (err)
@@ -2785,6 +2789,324 @@ fix_extent_len:
2785} 2789}
2786 2790
2787/* 2791/*
2792 * This function is called by ext4_ext_get_blocks() from
2793 * ext4_get_block_dio_write() when a direct IO write lands in
2794 * an uninitialized extent.
2795 *
2796 * Writing to an uninitialized extent may result in splitting it
2797 * into multiple uninitialized extents (up to three).
2798 * There are three possibilities:
2799 * a> No split is required: the entire extent stays uninitialized
2800 * b> Split into two extents: the write hits either end of the extent
2801 * c> Split into three extents: someone is writing to the middle of the extent
2802 *
2803 * One or more index blocks may be needed if the extent tree grows after
2804 * the uninitialized extent is split. To prevent an ENOSPC at IO
2805 * completion time, we split the uninitialized extent before the IO is
2806 * submitted. The extent will be split into (at most) three
2807 * uninitialized extents. After IO completes, the part that was
2808 * actually written is converted to initialized by the end_io callback,
2809 * via ext4_convert_unwritten_extents().
2810 */
2811static int ext4_split_unwritten_extents(handle_t *handle,
2812 struct inode *inode,
2813 struct ext4_ext_path *path,
2814 ext4_lblk_t iblock,
2815 unsigned int max_blocks,
2816 int flags)
2817{
2818 struct ext4_extent *ex, newex, orig_ex;
2819 struct ext4_extent *ex1 = NULL;
2820 struct ext4_extent *ex2 = NULL;
2821 struct ext4_extent *ex3 = NULL;
2822 struct ext4_extent_header *eh;
2823 ext4_lblk_t ee_block;
2824 unsigned int allocated, ee_len, depth;
2825 ext4_fsblk_t newblock;
2826 int err = 0;
2827 int ret = 0;
2828
2829 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2830 "iblock %llu, max_blocks %u\n", inode->i_ino,
2831 (unsigned long long)iblock, max_blocks);
2832 depth = ext_depth(inode);
2833 eh = path[depth].p_hdr;
2834 ex = path[depth].p_ext;
2835 ee_block = le32_to_cpu(ex->ee_block);
2836 ee_len = ext4_ext_get_actual_len(ex);
2837 allocated = ee_len - (iblock - ee_block);
2838 newblock = iblock - ee_block + ext_pblock(ex);
2839 ex2 = ex;
2840 orig_ex.ee_block = ex->ee_block;
2841 orig_ex.ee_len = cpu_to_le16(ee_len);
2842 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2843
2844 /*
2845 * If the entire uninitialized extent from the write offset to its
2846 * end fits within the range being written, there is no need to
2847 * split the extent.
2848 */
2849 if (allocated <= max_blocks)
2850 return ret;
2851
2852 err = ext4_ext_get_access(handle, inode, path + depth);
2853 if (err)
2854 goto out;
2855 /* ex1: ee_block to iblock - 1 : uninitialized */
2856 if (iblock > ee_block) {
2857 ex1 = ex;
2858 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2859 ext4_ext_mark_uninitialized(ex1);
2860 ex2 = &newex;
2861 }
2862 /*
2863 * for sanity, update the length of the ex2 extent before
2864 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2865 * overlap of blocks.
2866 */
2867 if (!ex1 && allocated > max_blocks)
2868 ex2->ee_len = cpu_to_le16(max_blocks);
2869 /* ex3: to ee_block + ee_len : uninitialised */
2870 if (allocated > max_blocks) {
2871 unsigned int newdepth;
2872 ex3 = &newex;
2873 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2874 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2875 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2876 ext4_ext_mark_uninitialized(ex3);
2877 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2878 if (err == -ENOSPC) {
2879 err = ext4_ext_zeroout(inode, &orig_ex);
2880 if (err)
2881 goto fix_extent_len;
2882 /* update the extent length and mark as initialized */
2883 ex->ee_block = orig_ex.ee_block;
2884 ex->ee_len = orig_ex.ee_len;
2885 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2886 ext4_ext_dirty(handle, inode, path + depth);
2887 /* zeroed the full extent */
2888 /* blocks available from iblock */
2889 return allocated;
2890
2891 } else if (err)
2892 goto fix_extent_len;
2893 /*
2894 * The depth, and hence eh & ex might change
2895 * as part of the insert above.
2896 */
2897 newdepth = ext_depth(inode);
2898 /*
2899 * update the extent length after successful insert of the
2900 * split extent
2901 */
2902 orig_ex.ee_len = cpu_to_le16(ee_len -
2903 ext4_ext_get_actual_len(ex3));
2904 depth = newdepth;
2905 ext4_ext_drop_refs(path);
2906 path = ext4_ext_find_extent(inode, iblock, path);
2907 if (IS_ERR(path)) {
2908 err = PTR_ERR(path);
2909 goto out;
2910 }
2911 eh = path[depth].p_hdr;
2912 ex = path[depth].p_ext;
2913 if (ex2 != &newex)
2914 ex2 = ex;
2915
2916 err = ext4_ext_get_access(handle, inode, path + depth);
2917 if (err)
2918 goto out;
2919
2920 allocated = max_blocks;
2921 }
2922 /*
2923 * If there was a change of depth as part of the
2924 * insertion of ex3 above, we need to update the length
2925 * of the ex1 extent again here
2926 */
2927 if (ex1 && ex1 != ex) {
2928 ex1 = ex;
2929 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2930 ext4_ext_mark_uninitialized(ex1);
2931 ex2 = &newex;
2932 }
2933 /*
2934 * ex2: iblock to iblock + max_blocks - 1 : to be written by direct IO,
2935 * still uninitialized.
2936 */
2937 ex2->ee_block = cpu_to_le32(iblock);
2938 ext4_ext_store_pblock(ex2, newblock);
2939 ex2->ee_len = cpu_to_le16(allocated);
2940 ext4_ext_mark_uninitialized(ex2);
2941 if (ex2 != ex)
2942 goto insert;
2943 /* Mark modified extent as dirty */
2944 err = ext4_ext_dirty(handle, inode, path + depth);
2945 ext_debug("out here\n");
2946 goto out;
2947insert:
2948 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2949 if (err == -ENOSPC) {
2950 err = ext4_ext_zeroout(inode, &orig_ex);
2951 if (err)
2952 goto fix_extent_len;
2953 /* update the extent length and mark as initialized */
2954 ex->ee_block = orig_ex.ee_block;
2955 ex->ee_len = orig_ex.ee_len;
2956 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2957 ext4_ext_dirty(handle, inode, path + depth);
2958 /* zeroed the full extent */
2959 return allocated;
2960 } else if (err)
2961 goto fix_extent_len;
2962out:
2963 ext4_ext_show_leaf(inode, path);
2964 return err ? err : allocated;
2965
2966fix_extent_len:
2967 ex->ee_block = orig_ex.ee_block;
2968 ex->ee_len = orig_ex.ee_len;
2969 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2970 ext4_ext_mark_uninitialized(ex);
2971 ext4_ext_dirty(handle, inode, path + depth);
2972 return err;
2973}
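A pure-arithmetic sketch of the three-way split described in the comment above ext4_split_unwritten_extents(): an uninitialized extent [ee_block, ee_block + ee_len) is cut around a direct IO write of [iblock, iblock + max_blocks). The numbers are illustrative only.

#include <stdio.h>

int main(void)
{
	unsigned ee_block = 100, ee_len = 50;	/* extent covers blocks 100..149 */
	unsigned iblock = 120, max_blocks = 10;	/* write covers blocks 120..129 */
	unsigned allocated = ee_len - (iblock - ee_block);

	if (iblock > ee_block)			/* ex1: head, stays unwritten */
		printf("ex1: [%u, %u)\n", ee_block, iblock);
	/* ex2: the range the DIO will fill, still unwritten until end_io */
	printf("ex2: [%u, %u)\n", iblock, iblock + max_blocks);
	if (allocated > max_blocks)		/* ex3: tail, stays unwritten */
		printf("ex3: [%u, %u)\n", iblock + max_blocks, ee_block + ee_len);
	return 0;
}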
2974static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2975 struct inode *inode,
2976 struct ext4_ext_path *path)
2977{
2978 struct ext4_extent *ex;
2979 struct ext4_extent_header *eh;
2980 int depth;
2981 int err = 0;
2982 int ret = 0;
2983
2984 depth = ext_depth(inode);
2985 eh = path[depth].p_hdr;
2986 ex = path[depth].p_ext;
2987
2988 err = ext4_ext_get_access(handle, inode, path + depth);
2989 if (err)
2990 goto out;
2991 /* first mark the extent as initialized */
2992 ext4_ext_mark_initialized(ex);
2993
2994 /*
2995 * We have to see if it can be merged with the extent
2996 * on the left.
2997 */
2998 if (ex > EXT_FIRST_EXTENT(eh)) {
2999 /*
3000 * To merge left, pass "ex - 1" to try_to_merge(),
3001 * since it merges towards right _only_.
3002 */
3003 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3004 if (ret) {
3005 err = ext4_ext_correct_indexes(handle, inode, path);
3006 if (err)
3007 goto out;
3008 depth = ext_depth(inode);
3009 ex--;
3010 }
3011 }
3012 /*
3013 * Try to merge towards the right.
3014 */
3015 ret = ext4_ext_try_to_merge(inode, path, ex);
3016 if (ret) {
3017 err = ext4_ext_correct_indexes(handle, inode, path);
3018 if (err)
3019 goto out;
3020 depth = ext_depth(inode);
3021 }
3022 /* Mark modified extent as dirty */
3023 err = ext4_ext_dirty(handle, inode, path + depth);
3024out:
3025 ext4_ext_show_leaf(inode, path);
3026 return err;
3027}
3028
3029static int
3030ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3031 ext4_lblk_t iblock, unsigned int max_blocks,
3032 struct ext4_ext_path *path, int flags,
3033 unsigned int allocated, struct buffer_head *bh_result,
3034 ext4_fsblk_t newblock)
3035{
3036 int ret = 0;
3037 int err = 0;
3038 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3039
3040 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3041 "block %llu, max_blocks %u, flags %d, allocated %u",
3042 inode->i_ino, (unsigned long long)iblock, max_blocks,
3043 flags, allocated);
3044 ext4_ext_show_leaf(inode, path);
3045
3046 /* DIO get_block() before submit the IO, split the extent */
3047 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3048 ret = ext4_split_unwritten_extents(handle,
3049 inode, path, iblock,
3050 max_blocks, flags);
3051 /* flag the io_end struct: conversion is needed when IO completes */
3052 if (io)
3053 io->flag = DIO_AIO_UNWRITTEN;
3054 goto out;
3055 }
3056 /* DIO end_io complete, convert the filled extent to written */
3057 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3058 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3059 path);
3060 goto out2;
3061 }
3062 /* buffered IO case */
3063 /*
3064 * repeat fallocate creation request
3065 * we already have an unwritten extent
3066 */
3067 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3068 goto map_out;
3069
3070 /* buffered READ or buffered write_begin() lookup */
3071 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3072 /*
3073 * We have blocks reserved already. We
3074 * return allocated blocks so that delalloc
3075 * won't do block reservation for us. But
3076 * the buffer head will be unmapped so that
3077 * a read from the block returns 0s.
3078 */
3079 set_buffer_unwritten(bh_result);
3080 goto out1;
3081 }
3082
3083 /* buffered write, writepage time, convert*/
3084 ret = ext4_ext_convert_to_initialized(handle, inode,
3085 path, iblock,
3086 max_blocks);
3087out:
3088 if (ret <= 0) {
3089 err = ret;
3090 goto out2;
3091 } else
3092 allocated = ret;
3093 set_buffer_new(bh_result);
3094map_out:
3095 set_buffer_mapped(bh_result);
3096out1:
3097 if (allocated > max_blocks)
3098 allocated = max_blocks;
3099 ext4_ext_show_leaf(inode, path);
3100 bh_result->b_bdev = inode->i_sb->s_bdev;
3101 bh_result->b_blocknr = newblock;
3102out2:
3103 if (path) {
3104 ext4_ext_drop_refs(path);
3105 kfree(path);
3106 }
3107 return err ? err : allocated;
3108}
3109/*
2788 * Block allocation/map/preallocation routine for extents based files 3110 * Block allocation/map/preallocation routine for extents based files
2789 * 3111 *
2790 * 3112 *
@@ -2814,6 +3136,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2814 int err = 0, depth, ret, cache_type; 3136 int err = 0, depth, ret, cache_type;
2815 unsigned int allocated = 0; 3137 unsigned int allocated = 0;
2816 struct ext4_allocation_request ar; 3138 struct ext4_allocation_request ar;
3139 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
2817 3140
2818 __clear_bit(BH_New, &bh_result->b_state); 3141 __clear_bit(BH_New, &bh_result->b_state);
2819 ext_debug("blocks %u/%u requested for inode %lu\n", 3142 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3212,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2889 EXT4_EXT_CACHE_EXTENT); 3212 EXT4_EXT_CACHE_EXTENT);
2890 goto out; 3213 goto out;
2891 } 3214 }
2892 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3215 ret = ext4_ext_handle_uninitialized_extents(handle,
2893 goto out; 3216 inode, iblock, max_blocks, path,
2894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3217 flags, allocated, bh_result, newblock);
2895 if (allocated > max_blocks) 3218 return ret;
2896 allocated = max_blocks;
2897 /*
2898 * We have blocks reserved already. We
2899 * return allocated blocks so that delalloc
2900 * won't do block reservation for us. But
2901 * the buffer head will be unmapped so that
2902 * a read from the block returns 0s.
2903 */
2904 set_buffer_unwritten(bh_result);
2905 bh_result->b_bdev = inode->i_sb->s_bdev;
2906 bh_result->b_blocknr = newblock;
2907 goto out2;
2908 }
2909
2910 ret = ext4_ext_convert_to_initialized(handle, inode,
2911 path, iblock,
2912 max_blocks);
2913 if (ret <= 0) {
2914 err = ret;
2915 goto out2;
2916 } else
2917 allocated = ret;
2918 goto outnew;
2919 } 3219 }
2920 } 3220 }
2921 3221
@@ -2986,9 +3286,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2986 /* try to insert new extent into found leaf and return */ 3286 /* try to insert new extent into found leaf and return */
2987 ext4_ext_store_pblock(&newex, newblock); 3287 ext4_ext_store_pblock(&newex, newblock);
2988 newex.ee_len = cpu_to_le16(ar.len); 3288 newex.ee_len = cpu_to_le16(ar.len);
2989 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ 3289 /* Mark uninitialized */
3290 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
2990 ext4_ext_mark_uninitialized(&newex); 3291 ext4_ext_mark_uninitialized(&newex);
2991 err = ext4_ext_insert_extent(handle, inode, path, &newex); 3292 /*
3293 * An io_end structure is created for every async
3294 * direct IO write to the middle of the file.
3295 * To avoid an unnecessary conversion for every AIO DIO
3296 * rewrite to the middle of the file, here we flag only the IO
3297 * that really needs the conversion.
3298 *
3299 */
3300 if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
3301 io->flag = DIO_AIO_UNWRITTEN;
3302 }
3303 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2992 if (err) { 3304 if (err) {
2993 /* free data blocks we just allocated */ 3305 /* free data blocks we just allocated */
2994 /* not a good idea to call discard here directly, 3306 /* not a good idea to call discard here directly,
@@ -3002,7 +3314,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3002 /* previous routine could use block we allocated */ 3314 /* previous routine could use block we allocated */
3003 newblock = ext_pblock(&newex); 3315 newblock = ext_pblock(&newex);
3004 allocated = ext4_ext_get_actual_len(&newex); 3316 allocated = ext4_ext_get_actual_len(&newex);
3005outnew:
3006 set_buffer_new(bh_result); 3317 set_buffer_new(bh_result);
3007 3318
3008 /* Cache only when it is _not_ an uninitialized extent */ 3319 /* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3512,63 @@ retry:
3201} 3512}
3202 3513
3203/* 3514/*
3515 * This function converts a range of blocks to written extents.
3516 * The caller passes the start offset and the size; all unwritten
3517 * extents within this range will be converted to
3518 * written extents.
3519 *
3520 * This function is called from the direct IO end_io callback
3521 * function, to convert the fallocated extents after IO is completed.
3522 */
3523int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3524 loff_t len)
3525{
3526 handle_t *handle;
3527 ext4_lblk_t block;
3528 unsigned int max_blocks;
3529 int ret = 0;
3530 int ret2 = 0;
3531 struct buffer_head map_bh;
3532 unsigned int credits, blkbits = inode->i_blkbits;
3533
3534 block = offset >> blkbits;
3535 /*
3536 * We can't just convert len to max_blocks, because the range may
3537 * not be block aligned: e.g. blocksize = 4096, offset = 3072, len = 2048
3538 */
3539 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3540 - block;
3541 /*
3542 * credits to insert 1 extent into extent tree
3543 */
3544 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3545 while (ret >= 0 && ret < max_blocks) {
3546 block = block + ret;
3547 max_blocks = max_blocks - ret;
3548 handle = ext4_journal_start(inode, credits);
3549 if (IS_ERR(handle)) {
3550 ret = PTR_ERR(handle);
3551 break;
3552 }
3553 map_bh.b_state = 0;
3554 ret = ext4_get_blocks(handle, inode, block,
3555 max_blocks, &map_bh,
3556 EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3557 if (ret <= 0) {
3558 WARN_ON(ret <= 0);
3559 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3560 "returned error inode#%lu, block=%u, "
3561 "max_blocks=%u", __func__,
3562 inode->i_ino, block, max_blocks);
3563 }
3564 ext4_mark_inode_dirty(handle, inode);
3565 ret2 = ext4_journal_stop(handle);
3566 if (ret <= 0 || ret2 )
3567 break;
3568 }
3569 return ret > 0 ? ret2 : ret;
3570}
3571/*
3204 * Callback function called for each extent to gather FIEMAP information. 3572 * Callback function called for each extent to gather FIEMAP information.
3205 */ 3573 */
3206static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3574static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
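A worked example of the block-range computation in ext4_convert_unwritten_extents() above: with a 4096-byte block size, offset = 3072 and len = 2048 span bytes [3072, 5120), which touches two blocks even though len is only half a block. The manual round-up below stands in for EXT4_BLOCK_ALIGN().

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned blkbits = 12;			/* 4096-byte blocks */
	uint64_t offset = 3072, len = 2048;
	uint64_t blocksize = 1ULL << blkbits;
	uint64_t block = offset >> blkbits;	/* first block: 0 */
	/* round the end of the byte range up to a block boundary, then count */
	uint64_t max_blocks = ((offset + len + blocksize - 1) >> blkbits) - block;

	/* a naive len >> blkbits would give 0 and convert nothing */
	printf("block=%llu max_blocks=%llu\n",
	       (unsigned long long)block, (unsigned long long)max_blocks);
	return 0;
}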
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad581..5c5bc5dafff8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -1145,6 +1146,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1146}
1146 1147
1147/* 1148/*
1149 * Return the number of contiguous dirty pages in a given inode
1150 * starting at page frame idx.
1151 */
1152static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1153 unsigned int max_pages)
1154{
1155 struct address_space *mapping = inode->i_mapping;
1156 pgoff_t index;
1157 struct pagevec pvec;
1158 pgoff_t num = 0;
1159 int i, nr_pages, done = 0;
1160
1161 if (max_pages == 0)
1162 return 0;
1163 pagevec_init(&pvec, 0);
1164 while (!done) {
1165 index = idx;
1166 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1167 PAGECACHE_TAG_DIRTY,
1168 (pgoff_t)PAGEVEC_SIZE);
1169 if (nr_pages == 0)
1170 break;
1171 for (i = 0; i < nr_pages; i++) {
1172 struct page *page = pvec.pages[i];
1173 struct buffer_head *bh, *head;
1174
1175 lock_page(page);
1176 if (unlikely(page->mapping != mapping) ||
1177 !PageDirty(page) ||
1178 PageWriteback(page) ||
1179 page->index != idx) {
1180 done = 1;
1181 unlock_page(page);
1182 break;
1183 }
1184 if (page_has_buffers(page)) {
1185 bh = head = page_buffers(page);
1186 do {
1187 if (!buffer_delay(bh) &&
1188 !buffer_unwritten(bh))
1189 done = 1;
1190 bh = bh->b_this_page;
1191 } while (!done && (bh != head));
1192 }
1193 unlock_page(page);
1194 if (done)
1195 break;
1196 idx++;
1197 num++;
1198 if (num >= max_pages)
1199 break;
1200 }
1201 pagevec_release(&pvec);
1202 }
1203 return num;
1204}
1205
1206/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1207 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1208 * and returns if the blocks are already mapped.
1150 * 1209 *
@@ -1175,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1175 clear_buffer_mapped(bh); 1234 clear_buffer_mapped(bh);
1176 clear_buffer_unwritten(bh); 1235 clear_buffer_unwritten(bh);
1177 1236
1237 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1238 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1239 (unsigned long)block);
1178 /* 1240 /*
1179 * Try to see if we can get the block without requesting a new 1241 * Try to see if we can get the block without requesting a new
1180 * file system block. 1242 * file system block.
@@ -1796,11 +1858,11 @@ repeat:
1796 1858
1797 if (ext4_claim_free_blocks(sbi, total)) { 1859 if (ext4_claim_free_blocks(sbi, total)) {
1798 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1860 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1861 vfs_dq_release_reservation_block(inode, total);
1799 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1862 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1800 yield(); 1863 yield();
1801 goto repeat; 1864 goto repeat;
1802 } 1865 }
1803 vfs_dq_release_reservation_block(inode, total);
1804 return -ENOSPC; 1866 return -ENOSPC;
1805 } 1867 }
1806 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1868 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2154,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2092static void ext4_print_free_blocks(struct inode *inode) 2154static void ext4_print_free_blocks(struct inode *inode)
2093{ 2155{
2094 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2156 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2095 printk(KERN_EMERG "Total free blocks count %lld\n", 2157 printk(KERN_CRIT "Total free blocks count %lld\n",
2096 ext4_count_free_blocks(inode->i_sb)); 2158 ext4_count_free_blocks(inode->i_sb));
2097 printk(KERN_EMERG "Free/Dirty block details\n"); 2159 printk(KERN_CRIT "Free/Dirty block details\n");
2098 printk(KERN_EMERG "free_blocks=%lld\n", 2160 printk(KERN_CRIT "free_blocks=%lld\n",
2099 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2161 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2100 printk(KERN_EMERG "dirty_blocks=%lld\n", 2162 printk(KERN_CRIT "dirty_blocks=%lld\n",
2101 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2163 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2102 printk(KERN_EMERG "Block reservation details\n"); 2164 printk(KERN_CRIT "Block reservation details\n");
2103 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2165 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2104 EXT4_I(inode)->i_reserved_data_blocks); 2166 EXT4_I(inode)->i_reserved_data_blocks);
2105 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2167 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2106 EXT4_I(inode)->i_reserved_meta_blocks); 2168 EXT4_I(inode)->i_reserved_meta_blocks);
2107 return; 2169 return;
2108} 2170}
2109 2171
@@ -2189,14 +2251,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2189 * writepage and writepages will again try to write 2251 * writepage and writepages will again try to write
2190 * the same. 2252 * the same.
2191 */ 2253 */
2192 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2254 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2193 "at logical offset %llu with max blocks " 2255 "delayed block allocation failed for inode %lu at "
2194 "%zd with error %d\n", 2256 "logical offset %llu with max blocks %zd with "
2195 __func__, mpd->inode->i_ino, 2257 "error %d\n", mpd->inode->i_ino,
2196 (unsigned long long)next, 2258 (unsigned long long) next,
2197 mpd->b_size >> mpd->inode->i_blkbits, err); 2259 mpd->b_size >> mpd->inode->i_blkbits, err);
2198 printk(KERN_EMERG "This should not happen.!! " 2260 printk(KERN_CRIT "This should not happen!! "
2199 "Data will be lost\n"); 2261 "Data will be lost\n");
2200 if (err == -ENOSPC) { 2262 if (err == -ENOSPC) {
2201 ext4_print_free_blocks(mpd->inode); 2263 ext4_print_free_blocks(mpd->inode);
2202 } 2264 }
@@ -2743,8 +2805,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2805 int no_nrwrite_index_update;
2744 int pages_written = 0; 2806 int pages_written = 0;
2745 long pages_skipped; 2807 long pages_skipped;
2808 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2809 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2810 int needed_blocks, ret = 0;
2811 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2812 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2813 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2814
@@ -2771,16 +2835,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2835 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2836 return -EROFS;
2773 2837
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2838 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2839 range_whole = 1;
2786 2840
@@ -2795,6 +2849,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2849 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2850 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2851
2852 /*
2853 * This works around two forms of stupidity. The first is in
2854 * the writeback code, which caps the maximum number of pages
2855 * written to be 1024 pages. This is wrong on multiple
2856 * levels; different architectures have a different page size,
2857 * which changes the maximum amount of data which gets
2858 * written. Secondly, 4 megabytes is way too small. XFS
2859 * forces this value to be 16 megabytes by multiplying
2860 * nr_to_write parameter by four, and then relies on its
2861 * allocator to allocate larger extents to make them
2862 * contiguous. Unfortunately this brings us to the second
2863 * stupidity, which is that ext4's mballoc code only allocates
2864 * at most 2048 blocks. So we force contiguous writes up to
2865 * the number of dirty blocks in the inode, or
2866 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2867 */
2868 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2869 if (!range_cyclic && range_whole)
2870 desired_nr_to_write = wbc->nr_to_write * 8;
2871 else
2872 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2873 max_pages);
2874 if (desired_nr_to_write > max_pages)
2875 desired_nr_to_write = max_pages;
2876
2877 if (wbc->nr_to_write < desired_nr_to_write) {
2878 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2879 wbc->nr_to_write = desired_nr_to_write;
2880 }
2881
2798 mpd.wbc = wbc; 2882 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2883 mpd.inode = mapping->host;
2800 2884
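Back-of-envelope numbers for the bump above, assuming 4 KiB pages and an s_max_writeback_mb_bump of 128 (the default this patch appears to set in super.c per the diffstat; treated as an assumption here):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;		/* 4 KiB pages */
	unsigned max_writeback_mb_bump = 128;	/* assumed default */
	long nr_to_write = 1024;		/* the old writeback cap */
	long max_pages = (long)max_writeback_mb_bump << (20 - page_shift);
	long desired = nr_to_write * 8;		/* whole-file, non-cyclic case */

	if (desired > max_pages)
		desired = max_pages;
	if (nr_to_write < desired)
		printf("bump nr_to_write: %ld -> %ld pages (cap %ld)\n",
		       nr_to_write, desired, max_pages);
	return 0;
}

With these numbers the cap works out to 32768 pages (128 MiB), and a default 1024-page request is bumped to 8192 pages, so mballoc can allocate larger contiguous extents per writeback pass.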
@@ -2822,10 +2906,9 @@ retry:
2822 handle = ext4_journal_start(inode, needed_blocks); 2906 handle = ext4_journal_start(inode, needed_blocks);
2823 if (IS_ERR(handle)) { 2907 if (IS_ERR(handle)) {
2824 ret = PTR_ERR(handle); 2908 ret = PTR_ERR(handle);
2825 printk(KERN_CRIT "%s: jbd2_start: " 2909 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2826 "%ld pages, ino %lu; err %d\n", __func__, 2910 "%ld pages, ino %lu; err %d\n", __func__,
2827 wbc->nr_to_write, inode->i_ino, ret); 2911 wbc->nr_to_write, inode->i_ino, ret);
2828 dump_stack();
2829 goto out_writepages; 2912 goto out_writepages;
2830 } 2913 }
2831 2914
@@ -2897,9 +2980,10 @@ retry:
2897 goto retry; 2980 goto retry;
2898 } 2981 }
2899 if (pages_skipped != wbc->pages_skipped) 2982 if (pages_skipped != wbc->pages_skipped)
2900 printk(KERN_EMERG "This should not happen leaving %s " 2983 ext4_msg(inode->i_sb, KERN_CRIT,
2901 "with nr_to_write = %ld ret = %d\n", 2984 "This should not happen leaving %s "
2902 __func__, wbc->nr_to_write, ret); 2985 "with nr_to_write = %ld ret = %d\n",
2986 __func__, wbc->nr_to_write, ret);
2903 2987
2904 /* Update index */ 2988 /* Update index */
2905 index += pages_written; 2989 index += pages_written;
@@ -2914,7 +2998,8 @@ retry:
2914out_writepages: 2998out_writepages:
2915 if (!no_nrwrite_index_update) 2999 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 3000 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 3001 if (wbc->nr_to_write > nr_to_writebump)
3002 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 3003 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3004 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3005 return ret;
@@ -3272,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3272} 3357}
3273 3358
3274/* 3359/*
3360 * O_DIRECT for ext3 (or indirect map) based files
3361 *
3275 * If the O_DIRECT write will extend the file then add this inode to the 3362 * If the O_DIRECT write will extend the file then add this inode to the
3276 * orphan list. So recovery will truncate it back to the original size 3363 * orphan list. So recovery will truncate it back to the original size
3277 * if the machine crashes during the write. 3364 * if the machine crashes during the write.
@@ -3280,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3280 * crashes then stale disk data _may_ be exposed inside the file. But current 3367 * crashes then stale disk data _may_ be exposed inside the file. But current
3281 * VFS code falls back into buffered path in that case so we are safe. 3368 * VFS code falls back into buffered path in that case so we are safe.
3282 */ 3369 */
3283static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3370static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3284 const struct iovec *iov, loff_t offset, 3371 const struct iovec *iov, loff_t offset,
3285 unsigned long nr_segs) 3372 unsigned long nr_segs)
3286{ 3373{
@@ -3291,6 +3378,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 ssize_t ret; 3378 ssize_t ret;
3292 int orphan = 0; 3379 int orphan = 0;
3293 size_t count = iov_length(iov, nr_segs); 3380 size_t count = iov_length(iov, nr_segs);
3381 int retries = 0;
3294 3382
3295 if (rw == WRITE) { 3383 if (rw == WRITE) {
3296 loff_t final_size = offset + count; 3384 loff_t final_size = offset + count;
@@ -3313,9 +3401,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3313 } 3401 }
3314 } 3402 }
3315 3403
3404retry:
3316 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3405 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3317 offset, nr_segs, 3406 offset, nr_segs,
3318 ext4_get_block, NULL); 3407 ext4_get_block, NULL);
3408 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3409 goto retry;
3319 3410
3320 if (orphan) { 3411 if (orphan) {
3321 int err; 3412 int err;
@@ -3354,6 +3445,359 @@ out:
3354 return ret; 3445 return ret;
3355} 3446}
3356 3447
3448/* Maximum number of blocks we map for direct IO at once. */
3449
3450static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3451 struct buffer_head *bh_result, int create)
3452{
3453 handle_t *handle = NULL;
3454 int ret = 0;
3455 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3456 int dio_credits;
3457
3458 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3459 inode->i_ino, create);
3460 /*
3461 * The DIO VFS code passes create = 0 for writes to
3462 * the middle of the file. It does this to avoid block
3463 * allocation for holes, so that stale data is not exposed
3464 * to a parallel buffered read (which does not hold the
3465 * i_mutex lock) while the direct IO write has not yet
3466 * completed. A DIO request on a hole therefore finally falls
3467 * back to buffered IO.
3468 *
3469 * For an ext4 extent-based file, since we support fallocate,
3470 * newly allocated extents are marked uninitialized; for holes,
3471 * we can likewise allocate blocks, so a parallel
3472 * buffered read of the hole sees zeroed pages while the
3473 * parallel DIO write to the hole has not yet completed.
3474 *
3475 * When we get here, we know it's a direct IO write to
3476 * the middle of the file (< i_size),
3477 * so it's safe to override the create flag from the VFS.
3478 */
3479 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3480
3481 if (max_blocks > DIO_MAX_BLOCKS)
3482 max_blocks = DIO_MAX_BLOCKS;
3483 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3484 handle = ext4_journal_start(inode, dio_credits);
3485 if (IS_ERR(handle)) {
3486 ret = PTR_ERR(handle);
3487 goto out;
3488 }
3489 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3490 create);
3491 if (ret > 0) {
3492 bh_result->b_size = (ret << inode->i_blkbits);
3493 ret = 0;
3494 }
3495 ext4_journal_stop(handle);
3496out:
3497 return ret;
3498}
3499
3500static void ext4_free_io_end(ext4_io_end_t *io)
3501{
3502 BUG_ON(!io);
3503 iput(io->inode);
3504 kfree(io);
3505}
3506static void dump_aio_dio_list(struct inode * inode)
3507{
3508#ifdef EXT4_DEBUG
3509 struct list_head *cur, *before, *after;
3510 ext4_io_end_t *io, *io0, *io1;
3511
3512 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3513 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3514 return;
3515 }
3516
3517 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3518 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3519 cur = &io->list;
3520 before = cur->prev;
3521 io0 = container_of(before, ext4_io_end_t, list);
3522 after = cur->next;
3523 io1 = container_of(after, ext4_io_end_t, list);
3524
3525 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3526 io, inode->i_ino, io0, io1);
3527 }
3528#endif
3529}
3530
3531/*
3532 * check a range of space and convert unwritten extents to written.
3533 */
3534static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3535{
3536 struct inode *inode = io->inode;
3537 loff_t offset = io->offset;
3538 size_t size = io->size;
3539 int ret = 0;
3540
3541 ext4_debug("end_aio_dio_nolock: io 0x%p from inode %lu, list->next 0x%p,"
3542 " list->prev 0x%p\n",
3543 io, inode->i_ino, io->list.next, io->list.prev);
3544
3545 if (list_empty(&io->list))
3546 return ret;
3547
3548 if (io->flag != DIO_AIO_UNWRITTEN)
3549 return ret;
3550
3551 if (offset + size <= i_size_read(inode))
3552 ret = ext4_convert_unwritten_extents(inode, offset, size);
3553
3554 if (ret < 0) {
3555 printk(KERN_EMERG "%s: failed to convert unwritten"
3556 "extents to written extents, error is %d"
3557 " io is still on inode %lu aio dio list\n",
3558 __func__, ret, inode->i_ino);
3559 return ret;
3560 }
3561
3562 /* clear the DIO AIO unwritten flag */
3563 io->flag = 0;
3564 return ret;
3565}
3566/*
3567 * Work on completed AIO DIO, to convert unwritten extents to written extents.
3568 */
3569static void ext4_end_aio_dio_work(struct work_struct *work)
3570{
3571 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3572 struct inode *inode = io->inode;
3573 int ret = 0;
3574
3575 mutex_lock(&inode->i_mutex);
3576 ret = ext4_end_aio_dio_nolock(io);
3577 if (ret >= 0) {
3578 if (!list_empty(&io->list))
3579 list_del_init(&io->list);
3580 ext4_free_io_end(io);
3581 }
3582 mutex_unlock(&inode->i_mutex);
3583}
3584/*
3585 * This function is called from ext4_sync_file().
3586 *
3587 * When AIO DIO IO is completed, the work to convert unwritten
3588 * extents to written is queued on workqueue but may not get immediately
3589 * scheduled. When fsync is called, we need to ensure the
3590 * conversion is complete before fsync returns.
3591 * The inode keeps track of a list of completed AIOs from the DIO path
3592 * that might need the conversion. This function walks through
3593 * the list and converts the related unwritten extents to written.
3594 */
3595int flush_aio_dio_completed_IO(struct inode *inode)
3596{
3597 ext4_io_end_t *io;
3598 int ret = 0;
3599 int ret2 = 0;
3600
3601 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3602 return ret;
3603
3604 dump_aio_dio_list(inode);
3605 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3606 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3607 ext4_io_end_t, list);
3608 /*
3609 * Calling ext4_end_aio_dio_nolock() to convert completed
3610 * IO to written.
3611 *
3612 * When ext4_sync_file() is called, run_queue() may already
3613 * be about to flush the work corresponding to this io structure.
3614 * It will be upset if it finds that the io structure related
3615 * to the work to be scheduled has been freed.
3616 *
3617 * Thus we need to keep the io structure valid here after the
3618 * conversion has finished. The io structure has a flag to
3619 * avoid a double conversion from both fsync and the background
3620 * workqueue.
3621 */
3622 ret = ext4_end_aio_dio_nolock(io);
3623 if (ret < 0)
3624 ret2 = ret;
3625 else
3626 list_del_init(&io->list);
3627 }
3628 return (ret2 < 0) ? ret2 : 0;
3629}
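A minimal single-threaded analog of the drain loop above: fsync walks the completed-IO list, and io->flag doubles as a conversion marker, so an entry already converted by the workqueue (or a previous fsync) is skipped rather than converted twice. Purely illustrative; locking and list removal are elided.

#include <stdio.h>

#define DIO_AIO_UNWRITTEN 0x1

struct io_end {
	struct io_end *next;
	int flag;			/* DIO_AIO_UNWRITTEN, or 0 once converted */
};

static int convert_nolock(struct io_end *io)
{
	if (io->flag != DIO_AIO_UNWRITTEN)
		return 0;		/* already converted: nothing to redo */
	/* ... ext4_convert_unwritten_extents() would run here ... */
	io->flag = 0;
	return 0;
}

int main(void)
{
	struct io_end a = { NULL, DIO_AIO_UNWRITTEN };
	struct io_end b = { &a, 0 };	/* already handled by the workqueue */

	for (struct io_end *io = &b; io; io = io->next) {
		convert_nolock(io);
		printf("io %p drained, flag=%d\n", (void *)io, io->flag);
	}
	return 0;
}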
3630
3631static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3632{
3633 ext4_io_end_t *io = NULL;
3634
3635 io = kmalloc(sizeof(*io), GFP_NOFS);
3636
3637 if (io) {
3638 igrab(inode);
3639 io->inode = inode;
3640 io->flag = 0;
3641 io->offset = 0;
3642 io->size = 0;
3643 io->error = 0;
3644 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3645 INIT_LIST_HEAD(&io->list);
3646 }
3647
3648 return io;
3649}
3650
3651static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3652 ssize_t size, void *private)
3653{
3654 ext4_io_end_t *io_end = iocb->private;
3655 struct workqueue_struct *wq;
3656
3657 ext_debug("ext4_end_io_dio(): io_end 0x%p"
3658 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3659 iocb->private, io_end->inode->i_ino, iocb, offset,
3660 size);
3661 /* if not async direct IO or dio with 0 bytes write, just return */
3662 if (!io_end || !size)
3663 return;
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
3679 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
3684/*
3685 * For ext4 extent files, ext4 will do direct-io writes to holes and
3686 * preallocated extents; there is no need to
3687 * fall back to buffered IO.
3688 *
3689 * For holes, we fallocate those blocks and mark them as uninitialized.
3690 * If those blocks were preallocated, we make sure they are split, but
3691 * still keep the range to write as uninitialized.
3692 *
3693 * The unwritten extents will be converted to written when DIO is completed.
3694 * For async direct IO, since the IO may still be pending when we return, we
3695 * set up an end_io callback function, which will do the conversion
3696 * when the async direct IO completes.
3697 *
3698 * If the O_DIRECT write will extend the file then add this inode to the
3699 * orphan list. So recovery will truncate it back to the original size
3700 * if the machine crashes during the write.
3701 *
3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
3714		/*
3715		 * We can write directly to holes and to fallocated extents.
3716		 *
3717		 * Blocks allocated to fill a hole are marked as uninitialized
3718		 * to prevent a parallel buffered read from exposing stale
3719		 * data before the DIO has finished writing it.
3720		 *
3721		 * For previously fallocated extents, ext4 get_block will
3722		 * simply mark the buffer mapped but still keep the extents
3723		 * uninitialized.
3724		 *
3725		 * For the non-AIO case, we convert those unwritten extents
3726		 * to written after returning from blockdev_direct_IO.
3727		 *
3728		 * For async DIO, the conversion must be deferred until the
3729		 * IO completes. The ext4 end_io callback function will be
3730		 * called to take care of the conversion work. So for the
3731		 * async case, we allocate an io_end structure here and hook
3732		 * it to the iocb.
3733		 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
3740			/*
3741			 * We save the io structure for the current async
3742			 * direct IO, so that later ext4_get_blocks() can
3743			 * flag in the io structure whether there are
3744			 * unwritten extents that need to be converted
3745			 * when the IO completes.
3746			 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
3757		/*
3758		 * The io_end structure takes a reference to the inode;
3759		 * that structure needs to be destroyed and the reference
3760		 * to the inode dropped when the IO is complete, even for
3761		 * a 0 byte write or a failed write.
3762		 *
3763		 * In the successful AIO DIO case, the io_end structure
3764		 * is destroyed and the reference to the inode is dropped
3765		 * after the end_io callback function has been called.
3766		 *
3767		 * For a 0 byte write or an error, the VFS direct IO code
3768		 * won't invoke the end_io callback function, so we need
3769		 * to free the io_end structure here.
3770		 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0)
3775 /*
3776			 * for the non-AIO case, the IO has already
3777			 * completed, so we can do the conversion right here
3778 */
3779 ret = ext4_convert_unwritten_extents(inode,
3780 offset, ret);
3781 return ret;
3782 }
3783
3784	/* for the write-beyond-end-of-file case, we fall back to the old way */
3785 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3786}
3787
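For reference, the async path above is only exercised by a genuinely asynchronous O_DIRECT writer. A minimal userspace illustration (hypothetical file name, error handling abbreviated; compile with -laio) that writes into a preallocated extent and therefore drives the unwritten-extent conversion machinery:

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        io_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        void *buf;
        int fd = open("testfile", O_CREAT | O_WRONLY | O_DIRECT, 0644);

        if (fd < 0)
                return 1;
        posix_fallocate(fd, 0, 4096);           /* preallocated (unwritten) extent */
        if (posix_memalign(&buf, 4096, 4096))   /* O_DIRECT alignment */
                return 1;
        io_setup(1, &ctx);
        io_prep_pwrite(&cb, fd, buf, 4096, 0);  /* DIO write into that extent */
        io_submit(ctx, 1, cbs);                 /* async: may return before IO is done */
        io_getevents(ctx, 1, 1, &ev, NULL);     /* completion drives the end_io path */
        io_destroy(ctx);
        close(fd);
        free(buf);
        return 0;
}

io_submit() typically returns before the write has completed, so the conversion happens via ext4_end_io_dio() and the workqueue rather than inline.
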
3788static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3789 const struct iovec *iov, loff_t offset,
3790 unsigned long nr_segs)
3791{
3792 struct file *file = iocb->ki_filp;
3793 struct inode *inode = file->f_mapping->host;
3794
3795 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3796 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3797
3798 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3799}
3800
3357/* 3801/*
3358 * Pages can be marked dirty completely asynchronously from ext4's journalling 3802 * Pages can be marked dirty completely asynchronously from ext4's journalling
3359 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3803 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -4551,8 +4995,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4551 */ 4995 */
4552static int ext4_do_update_inode(handle_t *handle, 4996static int ext4_do_update_inode(handle_t *handle,
4553 struct inode *inode, 4997 struct inode *inode,
4554 struct ext4_iloc *iloc, 4998 struct ext4_iloc *iloc)
4555 int do_sync)
4556{ 4999{
4557 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5000 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4558 struct ext4_inode_info *ei = EXT4_I(inode); 5001 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4653,22 +5096,10 @@ static int ext4_do_update_inode(handle_t *handle,
4653 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5096 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4654 } 5097 }
4655 5098
4656 /* 5099 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4657 * If we're not using a journal and we were called from 5100 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4658 * ext4_write_inode() to sync the inode (making do_sync true), 5101 if (!err)
4659 * we can just use sync_dirty_buffer() directly to do our dirty 5102 err = rc;
4660 * work. Testing s_journal here is a bit redundant but it's
4661 * worth it to avoid potential future trouble.
4662 */
4663 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4664 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4665 sync_dirty_buffer(bh);
4666 } else {
4667 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4668 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4669 if (!err)
4670 err = rc;
4671 }
4672 ei->i_state &= ~EXT4_STATE_NEW; 5103 ei->i_state &= ~EXT4_STATE_NEW;
4673 5104
4674out_brelse: 5105out_brelse:
@@ -4736,8 +5167,16 @@ int ext4_write_inode(struct inode *inode, int wait)
4736 err = ext4_get_inode_loc(inode, &iloc); 5167 err = ext4_get_inode_loc(inode, &iloc);
4737 if (err) 5168 if (err)
4738 return err; 5169 return err;
4739 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, 5170 if (wait)
4740 inode, &iloc, wait); 5171 sync_dirty_buffer(iloc.bh);
5172 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5173 ext4_error(inode->i_sb, __func__,
5174 "IO error syncing inode, "
5175 "inode=%lu, block=%llu",
5176 inode->i_ino,
5177 (unsigned long long)iloc.bh->b_blocknr);
5178 err = -EIO;
5179 }
4741 } 5180 }
4742 return err; 5181 return err;
4743} 5182}
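In no-journal mode the inode table buffer is now written back directly, and the error check relies on how buffer state bits behave after a failed write: the buffer was submitted (BH_Req set) but did not come back uptodate. A hedged standalone sketch of that pattern (sync_metadata_bh() is a hypothetical helper, not ext4 code):

#include <linux/buffer_head.h>

/* write back one dirty metadata buffer and report an IO failure */
static int sync_metadata_bh(struct buffer_head *bh)
{
        sync_dirty_buffer(bh);          /* submit and wait */
        if (buffer_req(bh) && !buffer_uptodate(bh))
                return -EIO;            /* submitted but never came back good */
        return 0;
}
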
@@ -5033,7 +5472,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
5033 get_bh(iloc->bh); 5472 get_bh(iloc->bh);
5034 5473
5035 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5474 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5036 err = ext4_do_update_inode(handle, inode, iloc, 0); 5475 err = ext4_do_update_inode(handle, inode, iloc);
5037 put_bh(iloc->bh); 5476 put_bh(iloc->bh);
5038 return err; 5477 return err;
5039} 5478}
@@ -5177,27 +5616,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5177 */ 5616 */
5178void ext4_dirty_inode(struct inode *inode) 5617void ext4_dirty_inode(struct inode *inode)
5179{ 5618{
5180 handle_t *current_handle = ext4_journal_current_handle();
5181 handle_t *handle; 5619 handle_t *handle;
5182 5620
5183 if (!ext4_handle_valid(current_handle)) {
5184 ext4_mark_inode_dirty(current_handle, inode);
5185 return;
5186 }
5187
5188 handle = ext4_journal_start(inode, 2); 5621 handle = ext4_journal_start(inode, 2);
5189 if (IS_ERR(handle)) 5622 if (IS_ERR(handle))
5190 goto out; 5623 goto out;
5191 if (current_handle && 5624
5192 current_handle->h_transaction != handle->h_transaction) { 5625 ext4_mark_inode_dirty(handle, inode);
5193 /* This task has a transaction open against a different fs */ 5626
5194 printk(KERN_EMERG "%s: transactions do not match!\n",
5195 __func__);
5196 } else {
5197 jbd_debug(5, "marking dirty. outer handle=%p\n",
5198 current_handle);
5199 ext4_mark_inode_dirty(handle, inode);
5200 }
5201 ext4_journal_stop(handle); 5627 ext4_journal_stop(handle);
5202out: 5628out:
5203 return; 5629 return;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
2096 return err; 2096 return err;
2097} 2097}
2098 2098
2099#ifdef EXT4_MB_HISTORY
2100struct ext4_mb_proc_session {
2101 struct ext4_mb_history *history;
2102 struct super_block *sb;
2103 int start;
2104 int max;
2105};
2106
2107static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2108 struct ext4_mb_history *hs,
2109 int first)
2110{
2111 if (hs == s->history + s->max)
2112 hs = s->history;
2113 if (!first && hs == s->history + s->start)
2114 return NULL;
2115 while (hs->orig.fe_len == 0) {
2116 hs++;
2117 if (hs == s->history + s->max)
2118 hs = s->history;
2119 if (hs == s->history + s->start)
2120 return NULL;
2121 }
2122 return hs;
2123}
2124
2125static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2126{
2127 struct ext4_mb_proc_session *s = seq->private;
2128 struct ext4_mb_history *hs;
2129 int l = *pos;
2130
2131 if (l == 0)
2132 return SEQ_START_TOKEN;
2133 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2134 if (!hs)
2135 return NULL;
2136 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2137 return hs;
2138}
2139
2140static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2141 loff_t *pos)
2142{
2143 struct ext4_mb_proc_session *s = seq->private;
2144 struct ext4_mb_history *hs = v;
2145
2146 ++*pos;
2147 if (v == SEQ_START_TOKEN)
2148 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2149 else
2150 return ext4_mb_history_skip_empty(s, ++hs, 0);
2151}
2152
2153static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2154{
2155 char buf[25], buf2[25], buf3[25], *fmt;
2156 struct ext4_mb_history *hs = v;
2157
2158 if (v == SEQ_START_TOKEN) {
2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2161 "pid", "inode", "original", "goal", "result", "found",
2162 "grps", "cr", "flags", "merge", "tail", "broken");
2163 return 0;
2164 }
2165
2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2168 "0x%04x %-5s %-5u %-6u\n";
2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2170 hs->result.fe_start, hs->result.fe_len,
2171 hs->result.fe_logical);
2172 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2173 hs->orig.fe_start, hs->orig.fe_len,
2174 hs->orig.fe_logical);
2175 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2176 hs->goal.fe_start, hs->goal.fe_len,
2177 hs->goal.fe_logical);
2178 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2179 hs->found, hs->groups, hs->cr, hs->flags,
2180 hs->merged ? "M" : "", hs->tail,
2181 hs->buddy ? 1 << hs->buddy : 0);
2182 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2183 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2184 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2185 hs->result.fe_start, hs->result.fe_len,
2186 hs->result.fe_logical);
2187 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2188 hs->orig.fe_start, hs->orig.fe_len,
2189 hs->orig.fe_logical);
2190 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2191 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2192 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2193 hs->result.fe_start, hs->result.fe_len);
2194 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2195 hs->pid, hs->ino, buf2);
2196 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2197 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2198 hs->result.fe_start, hs->result.fe_len);
2199 seq_printf(seq, "%-5u %-8u %-23s free\n",
2200 hs->pid, hs->ino, buf2);
2201 }
2202 return 0;
2203}
2204
2205static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2206{
2207}
2208
2209static const struct seq_operations ext4_mb_seq_history_ops = {
2210 .start = ext4_mb_seq_history_start,
2211 .next = ext4_mb_seq_history_next,
2212 .stop = ext4_mb_seq_history_stop,
2213 .show = ext4_mb_seq_history_show,
2214};
2215
2216static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2217{
2218 struct super_block *sb = PDE(inode)->data;
2219 struct ext4_sb_info *sbi = EXT4_SB(sb);
2220 struct ext4_mb_proc_session *s;
2221 int rc;
2222 int size;
2223
2224 if (unlikely(sbi->s_mb_history == NULL))
2225 return -ENOMEM;
2226 s = kmalloc(sizeof(*s), GFP_KERNEL);
2227 if (s == NULL)
2228 return -ENOMEM;
2229 s->sb = sb;
2230 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2231 s->history = kmalloc(size, GFP_KERNEL);
2232 if (s->history == NULL) {
2233 kfree(s);
2234 return -ENOMEM;
2235 }
2236
2237 spin_lock(&sbi->s_mb_history_lock);
2238 memcpy(s->history, sbi->s_mb_history, size);
2239 s->max = sbi->s_mb_history_max;
2240 s->start = sbi->s_mb_history_cur % s->max;
2241 spin_unlock(&sbi->s_mb_history_lock);
2242
2243 rc = seq_open(file, &ext4_mb_seq_history_ops);
2244 if (rc == 0) {
2245 struct seq_file *m = (struct seq_file *)file->private_data;
2246 m->private = s;
2247 } else {
2248 kfree(s->history);
2249 kfree(s);
2250 }
2251 return rc;
2252
2253}
2254
2255static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2256{
2257 struct seq_file *seq = (struct seq_file *)file->private_data;
2258 struct ext4_mb_proc_session *s = seq->private;
2259 kfree(s->history);
2260 kfree(s);
2261 return seq_release(inode, file);
2262}
2263
2264static ssize_t ext4_mb_seq_history_write(struct file *file,
2265 const char __user *buffer,
2266 size_t count, loff_t *ppos)
2267{
2268 struct seq_file *seq = (struct seq_file *)file->private_data;
2269 struct ext4_mb_proc_session *s = seq->private;
2270 struct super_block *sb = s->sb;
2271 char str[32];
2272 int value;
2273
2274 if (count >= sizeof(str)) {
2275 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2276 "mb_history", (int)sizeof(str));
2277 return -EOVERFLOW;
2278 }
2279
2280 if (copy_from_user(str, buffer, count))
2281 return -EFAULT;
2282
2283 value = simple_strtol(str, NULL, 0);
2284 if (value < 0)
2285 return -ERANGE;
2286 EXT4_SB(sb)->s_mb_history_filter = value;
2287
2288 return count;
2289}
2290
2291static const struct file_operations ext4_mb_seq_history_fops = {
2292 .owner = THIS_MODULE,
2293 .open = ext4_mb_seq_history_open,
2294 .read = seq_read,
2295 .write = ext4_mb_seq_history_write,
2296 .llseek = seq_lseek,
2297 .release = ext4_mb_seq_history_release,
2298};
2299
2300static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2301{ 2100{
2302 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2396 .release = seq_release, 2195 .release = seq_release,
2397}; 2196};
2398 2197
2399static void ext4_mb_history_release(struct super_block *sb)
2400{
2401 struct ext4_sb_info *sbi = EXT4_SB(sb);
2402
2403 if (sbi->s_proc != NULL) {
2404 remove_proc_entry("mb_groups", sbi->s_proc);
2405 if (sbi->s_mb_history_max)
2406 remove_proc_entry("mb_history", sbi->s_proc);
2407 }
2408 kfree(sbi->s_mb_history);
2409}
2410
2411static void ext4_mb_history_init(struct super_block *sb)
2412{
2413 struct ext4_sb_info *sbi = EXT4_SB(sb);
2414 int i;
2415
2416 if (sbi->s_proc != NULL) {
2417 if (sbi->s_mb_history_max)
2418 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2419 &ext4_mb_seq_history_fops, sb);
2420 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2421 &ext4_mb_seq_groups_fops, sb);
2422 }
2423
2424 sbi->s_mb_history_cur = 0;
2425 spin_lock_init(&sbi->s_mb_history_lock);
2426 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2427 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2428 /* if we can't allocate history, then we simple won't use it */
2429}
2430
2431static noinline_for_stack void
2432ext4_mb_store_history(struct ext4_allocation_context *ac)
2433{
2434 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2435 struct ext4_mb_history h;
2436
2437 if (sbi->s_mb_history == NULL)
2438 return;
2439
2440 if (!(ac->ac_op & sbi->s_mb_history_filter))
2441 return;
2442
2443 h.op = ac->ac_op;
2444 h.pid = current->pid;
2445 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2446 h.orig = ac->ac_o_ex;
2447 h.result = ac->ac_b_ex;
2448 h.flags = ac->ac_flags;
2449 h.found = ac->ac_found;
2450 h.groups = ac->ac_groups_scanned;
2451 h.cr = ac->ac_criteria;
2452 h.tail = ac->ac_tail;
2453 h.buddy = ac->ac_buddy;
2454 h.merged = 0;
2455 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2456 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2457 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2458 h.merged = 1;
2459 h.goal = ac->ac_g_ex;
2460 h.result = ac->ac_f_ex;
2461 }
2462
2463 spin_lock(&sbi->s_mb_history_lock);
2464 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2465 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2466 sbi->s_mb_history_cur = 0;
2467 spin_unlock(&sbi->s_mb_history_lock);
2468}
2469
2470#else
2471#define ext4_mb_history_release(sb)
2472#define ext4_mb_history_init(sb)
2473#endif
2474
2475 2198
2476/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2477int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2690 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2691 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2692 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2693 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2694 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2695 2417
2696 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2708 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2709 } 2431 }
2710 2432
2711 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2712 2436
2713 if (sbi->s_journal) 2437 if (sbi->s_journal)
2714 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2715
2716 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2717 return 0; 2439 return 0;
2718} 2440}
2719 2441
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2790 } 2512 }
2791 2513
2792 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2793 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2794 2517
2795 return 0; 2518 return 0;
2796} 2519}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3276 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3277 } 3000 }
3278 3001
3279 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3280} 3006}
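The /proc-based mb_history ring buffer is replaced here by tracepoints (trace_ext4_mballoc_alloc() and friends), which are consumed through the kernel's generic tracing interface instead of a private file. As a hedged illustration of the definition side only (the real events live in include/trace/events/ext4.h and need the usual TRACE_SYSTEM header boilerplate; the fields below are illustrative, not the actual ones):

#include <linux/tracepoint.h>

TRACE_EVENT(ext4_mballoc_alloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t,  dev)
                __field(ino_t,  ino)
                __field(__u32,  group)
                __field(int,    start)
                __field(__u32,  len)
        ),

        TP_fast_assign(
                __entry->dev    = ac->ac_inode->i_sb->s_dev;
                __entry->ino    = ac->ac_inode->i_ino;
                __entry->group  = ac->ac_b_ex.fe_group;
                __entry->start  = ac->ac_b_ex.fe_start;
                __entry->len    = ac->ac_b_ex.fe_len;
        ),

        TP_printk("dev %d,%d ino %lu extent %u/%d/%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino,
                  __entry->group, __entry->start, __entry->len)
);
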
3281 3007
3282/* 3008/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3776 if (ac) { 3502 if (ac) {
3777 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3778 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3779 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3780 } 3505 }
3781 3506
3782 while (bit < end) { 3507 while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3796 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3797 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3798 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3799 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3800 } 3525 }
3801 3526
3802 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3831 ext4_group_t group; 3556 ext4_group_t group;
3832 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3833 3558
3834 if (ac)
3835 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3836
3837 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3838 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3839 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3848 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3849 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3850 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3851 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3852 } 3574 }
3853 3575
3854 return 0; 3576 return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits; 3913 >> bsbits;
4192 size = max(size, isize);
4193 3914
4194 if ((size == isize) && 3915 if ((size == isize) &&
4195 !ext4_fs_is_busy(sbi) && 3916 !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4199 } 3920 }
4200 3921
4201 /* don't use group allocation for large files */ 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
4202 if (size >= sbi->s_mb_stream_request) { 3924 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4204 return; 3926 return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4739 4461
4740 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4741 if (ac) { 4463 if (ac) {
4742 ac->ac_op = EXT4_MB_HISTORY_FREE;
4743 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4744 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4745 } 4466 }
@@ -4806,7 +4527,7 @@ do_more:
4806 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4807 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4808 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4809 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4810 } 4531 }
4811 4532
4812 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
52#define mb_debug(n, fmt, a...) 52#define mb_debug(n, fmt, a...)
53#endif 53#endif
54 54
55/*
56 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
57 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
58 */
59#define EXT4_MB_HISTORY
60#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
61#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
62#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
63#define EXT4_MB_HISTORY_FREE 8 /* free */
64
65#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
66 EXT4_MB_HISTORY_PREALLOC)
67 57
68/* 58/*
69 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
84 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
85 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
86 */ 76 */
87#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
88 78
89/* 79/*
90 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
217#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
218#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
219 209
220struct ext4_mb_history {
221 struct ext4_free_extent orig; /* orig allocation */
222 struct ext4_free_extent goal; /* goal allocation */
223 struct ext4_free_extent result; /* result allocation */
224 unsigned pid;
225 unsigned ino;
226 __u16 found; /* how many extents have been found */
227 __u16 groups; /* how many groups have been scanned */
228 __u16 tail; /* what tail broke some buddy */
229 __u16 buddy; /* buddy the tail ^^^ broke */
230 __u16 flags;
231 __u8 cr:3; /* which phase the result extent was found at */
232 __u8 op:4;
233 __u8 merged:1;
234};
235
236struct ext4_buddy { 210struct ext4_buddy {
237 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
238 void *bd_buddy; 212 void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
247#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
248#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
249 223
250#ifndef EXT4_MB_HISTORY
251static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
252{
253 return;
254}
255#endif
256
257#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
258 225
259static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
322 goto out; 322 goto out;
323 323
324 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
325 orig_path, new_ext)) 325 orig_path, new_ext, 0))
326 goto out; 326 goto out;
327 } 327 }
328 328
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
333 goto out; 333 goto out;
334 334
335 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
336 orig_path, end_ext)) 336 orig_path, end_ext, 0))
337 goto out; 337 goto out;
338 } 338 }
339out: 339out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
1001 return -EINVAL; 1001 return -EINVAL;
1002 } 1002 }
1003 1003
1004 /* orig and donor should be different file */
1005 if (orig_inode->i_ino == donor_inode->i_ino) {
1006 ext4_debug("ext4 move extent: The argument files should not "
1007 "be same file [ino:orig %lu, donor %lu]\n",
1008 orig_inode->i_ino, donor_inode->i_ino);
1009 return -EINVAL;
1010 }
1011
1012 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
1013 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
1014 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 int block_len_in_page; 1224 int block_len_in_page;
1233 int uninit; 1225 int uninit;
1234 1226
1227 /* orig and donor should be different file */
1228 if (orig_inode->i_ino == donor_inode->i_ino) {
1229 ext4_debug("ext4 move extent: The argument files should not "
1230 "be same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1235 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1237 if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..7c8fe80bacdd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2076 struct ext4_iloc iloc; 2076 struct ext4_iloc iloc;
2077 int err = 0; 2077 int err = 0;
2078 2078
2079 if (!ext4_handle_valid(handle)) 2079 /* ext4_handle_valid() assumes a valid handle_t pointer */
2080 if (handle && !ext4_handle_valid(handle))
2080 return 0; 2081 return 0;
2081 2082
2082 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2083 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
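The guard above exists because ext4_handle_valid() now distinguishes real jbd2 handles from the small nojournal ref-count values introduced in super.c below. Its actual definition lives in ext4_jbd2.h, outside this excerpt; a hedged sketch of what the test plausibly looks like under that scheme:

static inline int ext4_handle_valid(handle_t *handle)
{
        if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
                return 0;       /* a nojournal pseudo-handle */
        return 1;               /* a real jbd2 handle */
}

Callers that can legitimately be passed a NULL handle, such as ext4_orphan_del() here, must therefore check for NULL explicitly rather than rely on the validity test.
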
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..312211ee05af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
50#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
52 52
53static int default_mb_history_length = 1000;
54
55module_param_named(default_mb_history_length, default_mb_history_length,
56 int, 0644);
57MODULE_PARM_DESC(default_mb_history_length,
58 "Default number of entries saved for mb_history");
59
60struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
61static struct kset *ext4_kset; 54static struct kset *ext4_kset;
62 55
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
189 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
190} 183}
191 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
214
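The two helpers above store a nesting count, not a pointer: the value kept in current->journal_info is the depth itself, cast to handle_t *. A runnable userspace illustration of the same trick (all names below are stand-ins, not kernel symbols):

#include <assert.h>
#include <stdio.h>

#define NOJOURNAL_MAX_REF 0x100UL   /* stands in for EXT4_NOJOURNAL_MAX_REF_COUNT */

static void *journal_info;          /* stands in for current->journal_info */

static void *get_nojournal(void)
{
        unsigned long ref_cnt = (unsigned long)journal_info;

        assert(ref_cnt < NOJOURNAL_MAX_REF);
        journal_info = (void *)(ref_cnt + 1);
        return journal_info;
}

static void put_nojournal(void *handle)
{
        unsigned long ref_cnt = (unsigned long)handle;

        assert(ref_cnt != 0);
        journal_info = (void *)(ref_cnt - 1);
}

int main(void)
{
        void *h1 = get_nojournal();     /* depth 1 */
        void *h2 = get_nojournal();     /* depth 2: "handles" nest */

        put_nojournal(h2);
        put_nojournal(h1);
        printf("final depth: %lu\n", (unsigned long)journal_info);  /* 0 */
        return 0;
}
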
192/* 215/*
193 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
194 * 217 *
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
215 } 238 }
216 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
217 } 240 }
218 /* 241 return ext4_get_nojournal();
219 * We're not journaling, return the appropriate indication.
220 */
221 current->journal_info = EXT4_NOJOURNAL_HANDLE;
222 return current->journal_info;
223} 242}
224 243
225/* 244/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
235 int rc; 254 int rc;
236 255
237 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
238 /* 257 ext4_put_nojournal(handle);
239 * Do this here since we don't call jbd2_journal_stop() in
240 * no-journal mode.
241 */
242 current->journal_info = NULL;
243 return 0; 258 return 0;
244 } 259 }
245 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 596 int i, err;
582 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
583 lock_super(sb); 601 lock_super(sb);
584 lock_kernel(); 602 lock_kernel();
585 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
684 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
685 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
686 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
687 707
688 return &ei->vfs_inode; 708 return &ei->vfs_inode;
689} 709}
@@ -1052,7 +1072,7 @@ enum {
1052 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1053 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1054 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1055 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1056 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1057 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1058 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
1099 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1100 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1101 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1102 {Opt_mb_history_length, "mb_history_length=%u"},
1103 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1104 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1105 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1340,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1359 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1360 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1361 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1362#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1363 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1364 qtype = USRQUOTA;
@@ -1646,13 +1658,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1658 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1659 sbi->s_mount_opt);
1648 1660
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1661 return res;
1657} 1662}
1658 1663
@@ -2197,6 +2202,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2202EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2203EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2204EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2205EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2206
2201static struct attribute *ext4_attrs[] = { 2207static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2208 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2216,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2216 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2217 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2218 ATTR_LIST(mb_group_prealloc),
2219 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2220 NULL,
2214}; 2221};
2215 2222
@@ -2413,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2413 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2420 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2414 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2421 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2415 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2422 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2416 sbi->s_mb_history_max = default_mb_history_length;
2417 2423
2418 set_opt(sbi->s_mount_opt, BARRIER); 2424 set_opt(sbi->s_mount_opt, BARRIER);
2419 2425
@@ -2679,6 +2685,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2685 }
2680 2686
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2687 sbi->s_stripe = ext4_get_stripe_size(sbi);
2688 sbi->s_max_writeback_mb_bump = 128;
2682 2689
2683 /* 2690 /*
2684 * set up enough so that it can read an inode 2691 * set up enough so that it can read an inode
@@ -2798,6 +2805,12 @@ no_journal:
2798 clear_opt(sbi->s_mount_opt, NOBH); 2805 clear_opt(sbi->s_mount_opt, NOBH);
2799 } 2806 }
2800 } 2807 }
2808 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2809 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2810 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2811 goto failed_mount_wq;
2812 }
2813
2801 /* 2814 /*
2802 * The jbd2_journal_load will have done any necessary log recovery, 2815 * The jbd2_journal_load will have done any necessary log recovery,
2803 * so we can safely mount the rest of the filesystem now. 2816 * so we can safely mount the rest of the filesystem now.
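The workqueue created in this hunk is the one the DIO completion path queues extent conversions onto; it is flushed in ext4_sync_fs() (below) and in ext4_put_super() (above) before being destroyed. A hedged standalone sketch of that lifecycle using the 2.6.31-era workqueue calls (the example_* names are illustrative, not the ext4 ones):

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
        /* the extent-conversion work would run here */
}

static int example_mount(void)
{
        example_wq = create_workqueue("example-wq");
        if (!example_wq)
                return -ENOMEM;
        INIT_WORK(&example_work, example_work_fn);
        queue_work(example_wq, &example_work);  /* done at IO completion */
        return 0;
}

static void example_unmount(void)
{
        flush_workqueue(example_wq);    /* wait for queued conversions */
        destroy_workqueue(example_wq);
}
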
@@ -2849,12 +2862,12 @@ no_journal:
2849 "available"); 2862 "available");
2850 } 2863 }
2851 2864
2852 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2865 if (test_opt(sb, DELALLOC) &&
2866 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2853 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2867 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2854 "requested data journaling mode"); 2868 "requested data journaling mode");
2855 clear_opt(sbi->s_mount_opt, DELALLOC); 2869 clear_opt(sbi->s_mount_opt, DELALLOC);
2856 } else if (test_opt(sb, DELALLOC)) 2870 }
2857 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2858 2871
2859 err = ext4_setup_system_zone(sb); 2872 err = ext4_setup_system_zone(sb);
2860 if (err) { 2873 if (err) {
@@ -2910,6 +2923,8 @@ cantfind_ext4:
2910 2923
2911failed_mount4: 2924failed_mount4:
2912 ext4_msg(sb, KERN_ERR, "mount failed"); 2925 ext4_msg(sb, KERN_ERR, "mount failed");
2926 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2927failed_mount_wq:
2913 ext4_release_system_zone(sb); 2928 ext4_release_system_zone(sb);
2914 if (sbi->s_journal) { 2929 if (sbi->s_journal) {
2915 jbd2_journal_destroy(sbi->s_journal); 2930 jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
3164 return -EINVAL; 3179 return -EINVAL;
3165 } 3180 }
3166 3181
3167 if (journal->j_flags & JBD2_BARRIER) 3182 if (!(journal->j_flags & JBD2_BARRIER))
3168 ext4_msg(sb, KERN_INFO, "barriers enabled");
3169 else
3170 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3183 ext4_msg(sb, KERN_INFO, "barriers disabled");
3171 3184
3172 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3185 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3374,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3361{ 3374{
3362 int ret = 0; 3375 int ret = 0;
3363 tid_t target; 3376 tid_t target;
3377 struct ext4_sb_info *sbi = EXT4_SB(sb);
3364 3378
3365 trace_ext4_sync_fs(sb, wait); 3379 trace_ext4_sync_fs(sb, wait);
3366 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3380 flush_workqueue(sbi->dio_unwritten_wq);
3381 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3367 if (wait) 3382 if (wait)
3368 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3383 jbd2_log_wait_commit(sbi->s_journal, target);
3369 } 3384 }
3370 return ret; 3385 return ret;
3371} 3386}
@@ -3951,27 +3966,6 @@ static struct file_system_type ext4_fs_type = {
3951 .fs_flags = FS_REQUIRES_DEV, 3966 .fs_flags = FS_REQUIRES_DEV,
3952}; 3967};
3953 3968
3954#ifdef CONFIG_EXT4DEV_COMPAT
3955static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3956 const char *dev_name, void *data,struct vfsmount *mnt)
3957{
3958 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3959 "to mount using ext4\n", dev_name);
3960 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3961 "will go away by 2.6.31\n", dev_name);
3962 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3963}
3964
3965static struct file_system_type ext4dev_fs_type = {
3966 .owner = THIS_MODULE,
3967 .name = "ext4dev",
3968 .get_sb = ext4dev_get_sb,
3969 .kill_sb = kill_block_super,
3970 .fs_flags = FS_REQUIRES_DEV,
3971};
3972MODULE_ALIAS("ext4dev");
3973#endif
3974
3975static int __init init_ext4_fs(void) 3969static int __init init_ext4_fs(void)
3976{ 3970{
3977 int err; 3971 int err;
@@ -3996,13 +3990,6 @@ static int __init init_ext4_fs(void)
3996 err = register_filesystem(&ext4_fs_type); 3990 err = register_filesystem(&ext4_fs_type);
3997 if (err) 3991 if (err)
3998 goto out; 3992 goto out;
3999#ifdef CONFIG_EXT4DEV_COMPAT
4000 err = register_filesystem(&ext4dev_fs_type);
4001 if (err) {
4002 unregister_filesystem(&ext4_fs_type);
4003 goto out;
4004 }
4005#endif
4006 return 0; 3993 return 0;
4007out: 3994out:
4008 destroy_inodecache(); 3995 destroy_inodecache();
@@ -4021,9 +4008,6 @@ out4:
4021static void __exit exit_ext4_fs(void) 4008static void __exit exit_ext4_fs(void)
4022{ 4009{
4023 unregister_filesystem(&ext4_fs_type); 4010 unregister_filesystem(&ext4_fs_type);
4024#ifdef CONFIG_EXT4DEV_COMPAT
4025 unregister_filesystem(&ext4dev_fs_type);
4026#endif
4027 destroy_inodecache(); 4011 destroy_inodecache();
4028 exit_ext4_xattr(); 4012 exit_ext4_xattr();
4029 exit_ext4_mballoc(); 4013 exit_ext4_mballoc();