aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-09-18 13:56:26 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-18 13:56:26 -0400
commit3530c1886291df061e3972c55590777ef1cb67f8 (patch)
treebd6755e533eb5a0f37ff600da6bc0d9d1ba33c17
parent6952b61de9984073289859073e8195ad0bee8fd5 (diff)
parent1358870deaf11a752a84fbd89201749aa62498e8 (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (64 commits) ext4: Update documentation about quota mount options ext4: replace MAX_DEFRAG_SIZE with EXT_MAX_BLOCK ext4: Fix the alloc on close after a truncate hueristic ext4: Add a tracepoint for ext4_alloc_da_blocks() ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags ext4: limit block allocations for indirect-block files to < 2^32 ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT ext4: Add null extent check to ext_get_path ext4: Replace BUG_ON() with ext4_error() in move_extents.c ext4: Replace get_ext_path macro with an inline funciton ext4: Fix include/trace/events/ext4.h to work with Systemtap ext4: Fix initalization of s_flex_groups ext4: Always set dx_node's fake_dirent explicitly. ext4: Fix async commit mode to be safe by using a barrier ext4: Don't update superblock write time when filesystem is read-only ext4: Clarify the locking details in mballoc ext4: check for need init flag in ext4_mb_load_buddy ext4: move ext4_mb_init_group() function earlier in the mballoc.c ext4: Make non-journal fsync work properly ext4: Assure that metadata blocks are written during fsync in no journal mode ...
-rw-r--r--Documentation/filesystems/ext4.txt24
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h4
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/extents.c112
-rw-r--r--fs/ext4/fsync.c13
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c150
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c429
-rw-r--r--fs/ext4/mballoc.h22
-rw-r--r--fs/ext4/migrate.c22
-rw-r--r--fs/ext4/move_extent.c334
-rw-r--r--fs/ext4/namei.c22
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c155
-rw-r--r--fs/ext4/xattr.c15
-rw-r--r--fs/jbd2/commit.c11
-rw-r--r--fs/jbd2/journal.c6
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--include/linux/jbd2.h2
-rw-r--r--include/trace/events/ext4.h142
-rw-r--r--include/trace/events/jbd2.h2
25 files changed, 1000 insertions, 601 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 7be02ac5fa36..18b5ec8cea45 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -134,15 +134,9 @@ ro Mount filesystem read only. Note that ext4 will
134 mount options "ro,noload" can be used to prevent 134 mount options "ro,noload" can be used to prevent
135 writes to the filesystem. 135 writes to the filesystem.
136 136
137journal_checksum Enable checksumming of the journal transactions.
138 This will allow the recovery code in e2fsck and the
139 kernel to detect corruption in the kernel. It is a
140 compatible change and will be ignored by older kernels.
141
142journal_async_commit Commit block can be written to disk without waiting 137journal_async_commit Commit block can be written to disk without waiting
143 for descriptor blocks. If enabled older kernels cannot 138 for descriptor blocks. If enabled older kernels cannot
144 mount the device. This will enable 'journal_checksum' 139 mount the device.
145 internally.
146 140
147journal=update Update the ext4 file system's journal to the current 141journal=update Update the ext4 file system's journal to the current
148 format. 142 format.
@@ -263,10 +257,18 @@ resuid=n The user ID which may use the reserved blocks.
263 257
264sb=n Use alternate superblock at this location. 258sb=n Use alternate superblock at this location.
265 259
266quota 260quota These options are ignored by the filesystem. They
267noquota 261noquota are used only by quota tools to recognize volumes
268grpquota 262grpquota where quota should be turned on. See documentation
269usrquota 263usrquota in the quota-tools package for more details
264 (http://sourceforge.net/projects/linuxquota).
265
266jqfmt=<quota type> These options tell filesystem details about quota
267usrjquota=<file> so that quota information can be properly updated
268grpjquota=<file> during journal replay. They replace the above
269 quota options. See documentation in the quota-tools
270 package for more details
271 (http://sourceforge.net/projects/linuxquota).
270 272
271bh (*) ext4 associates buffer heads to data pages to 273bh (*) ext4 associates buffer heads to data pages to
272nobh (a) cache disk block mapping information 274nobh (a) cache disk block mapping information
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
37 37
38 To enable backwards compatibility so that systems that are 38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev, 39 still expecting to mount ext4 filesystems using ext4dev,
40 chose Y here. This feature will go away by 2.6.31, so 40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed! 41 please arrange to get your userspace programs fixed!
42 42
43config EXT4_FS_XATTR 43config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
77 77
78 If you are not using a security module that requires using 78 If you are not using a security module that requires using
79 extended attributes for file security labels, say N. 79 extended attributes for file security labels, say N.
80
81config EXT4_DEBUG
82 bool "EXT4 debugging support"
83 depends on EXT4_FS
84 help
85 Enables run-time debugging support for the ext4 filesystem.
86
87 If you select Y here, then you will be able to turn on debugging
88 with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
478 * new bitmap information 478 * new bitmap information
479 */ 479 */
480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
481 ext4_mb_update_group_info(grp, blocks_freed); 481 grp->bb_free += blocks_freed;
482 up_write(&grp->alloc_sem); 482 up_write(&grp->alloc_sem);
483 483
484 /* We dirtied the bitmap block */ 484 /* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;
67 67
68 68
69/* prefer goal again. length */ 69/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 1 70#define EXT4_MB_HINT_MERGE 0x0001
71/* blocks already reserved */ 71/* blocks already reserved */
72#define EXT4_MB_HINT_RESERVED 2 72#define EXT4_MB_HINT_RESERVED 0x0002
73/* metadata is being allocated */ 73/* metadata is being allocated */
74#define EXT4_MB_HINT_METADATA 4 74#define EXT4_MB_HINT_METADATA 0x0004
75/* first blocks in the file */ 75/* first blocks in the file */
76#define EXT4_MB_HINT_FIRST 8 76#define EXT4_MB_HINT_FIRST 0x0008
77/* search for the best chunk */ 77/* search for the best chunk */
78#define EXT4_MB_HINT_BEST 16 78#define EXT4_MB_HINT_BEST 0x0010
79/* data is being allocated */ 79/* data is being allocated */
80#define EXT4_MB_HINT_DATA 32 80#define EXT4_MB_HINT_DATA 0x0020
81/* don't preallocate (for tails) */ 81/* don't preallocate (for tails) */
82#define EXT4_MB_HINT_NOPREALLOC 64 82#define EXT4_MB_HINT_NOPREALLOC 0x0040
83/* allocate for locality group */ 83/* allocate for locality group */
84#define EXT4_MB_HINT_GROUP_ALLOC 128 84#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
85/* allocate goal blocks or none */ 85/* allocate goal blocks or none */
86#define EXT4_MB_HINT_GOAL_ONLY 256 86#define EXT4_MB_HINT_GOAL_ONLY 0x0100
87/* goal is meaningful */ 87/* goal is meaningful */
88#define EXT4_MB_HINT_TRY_GOAL 512 88#define EXT4_MB_HINT_TRY_GOAL 0x0200
89/* blocks already pre-reserved by delayed allocation */ 89/* blocks already pre-reserved by delayed allocation */
90#define EXT4_MB_DELALLOC_RESERVED 1024 90#define EXT4_MB_DELALLOC_RESERVED 0x0400
91/* We are doing stream allocation */
92#define EXT4_MB_STREAM_ALLOC 0x0800
91 93
92 94
93struct ext4_allocation_request { 95struct ext4_allocation_request {
@@ -112,6 +114,21 @@ struct ext4_allocation_request {
112}; 114};
113 115
114/* 116/*
117 * For delayed allocation tracking
118 */
119struct mpage_da_data {
120 struct inode *inode;
121 sector_t b_blocknr; /* start block number of extent */
122 size_t b_size; /* size of extent */
123 unsigned long b_state; /* state of the extent */
124 unsigned long first_page, next_page; /* extent of pages */
125 struct writeback_control *wbc;
126 int io_done;
127 int pages_written;
128 int retval;
129};
130
131/*
115 * Special inodes numbers 132 * Special inodes numbers
116 */ 133 */
117#define EXT4_BAD_INO 1 /* Bad blocks inode */ 134#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +268,6 @@ struct flex_groups {
251#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 268#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
252#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ 269#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
253#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 270#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
254#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
255#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 271#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
256 272
257#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 273#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
289#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ 305#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
290#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 306#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
291#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ 307#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
308#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
292 309
293/* Used to pass group descriptor data when online resize is done */ 310/* Used to pass group descriptor data when online resize is done */
294struct ext4_new_group_input { 311struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
386#endif 403#endif
387}; 404};
388 405
406/* Max physical block we can addres w/o extents */
407#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
408
389/* 409/*
390 * Structure of an inode on the disk 410 * Structure of an inode on the disk
391 */ 411 */
@@ -456,7 +476,6 @@ struct move_extent {
456 __u64 len; /* block length to be moved */ 476 __u64 len; /* block length to be moved */
457 __u64 moved_len; /* moved block length */ 477 __u64 moved_len; /* moved block length */
458}; 478};
459#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
460 479
461#define EXT4_EPOCH_BITS 2 480#define EXT4_EPOCH_BITS 2
462#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) 481#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
694#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 713#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
695#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 714#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
696#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 715#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
697#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
698#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 716#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
699#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 717#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
700#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 718#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
841 unsigned long s_gdb_count; /* Number of group descriptor blocks */ 859 unsigned long s_gdb_count; /* Number of group descriptor blocks */
842 unsigned long s_desc_per_block; /* Number of group descriptors per block */ 860 unsigned long s_desc_per_block; /* Number of group descriptors per block */
843 ext4_group_t s_groups_count; /* Number of groups in the fs */ 861 ext4_group_t s_groups_count; /* Number of groups in the fs */
862 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
844 unsigned long s_overhead_last; /* Last calculated overhead */ 863 unsigned long s_overhead_last; /* Last calculated overhead */
845 unsigned long s_blocks_last; /* Last seen block count */ 864 unsigned long s_blocks_last; /* Last seen block count */
846 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 865 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
950 atomic_t s_mb_lost_chunks; 969 atomic_t s_mb_lost_chunks;
951 atomic_t s_mb_preallocated; 970 atomic_t s_mb_preallocated;
952 atomic_t s_mb_discarded; 971 atomic_t s_mb_discarded;
972 atomic_t s_lock_busy;
953 973
954 /* locality groups */ 974 /* locality groups */
955 struct ext4_locality_group *s_locality_groups; 975 struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1340 ext4_fsblk_t, unsigned long, int, unsigned long *); 1360 ext4_fsblk_t, unsigned long, int, unsigned long *);
1341extern int ext4_mb_add_groupinfo(struct super_block *sb, 1361extern int ext4_mb_add_groupinfo(struct super_block *sb,
1342 ext4_group_t i, struct ext4_group_desc *desc); 1362 ext4_group_t i, struct ext4_group_desc *desc);
1343extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1344 ext4_grpblk_t add);
1345extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1363extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1346extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1364extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1347 ext4_group_t, int); 1365 ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
1367extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1385extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1368extern int ext4_can_truncate(struct inode *inode); 1386extern int ext4_can_truncate(struct inode *inode);
1369extern void ext4_truncate(struct inode *); 1387extern void ext4_truncate(struct inode *);
1388extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
1370extern void ext4_set_inode_flags(struct inode *); 1389extern void ext4_set_inode_flags(struct inode *);
1371extern void ext4_get_inode_flags(struct ext4_inode_info *); 1390extern void ext4_get_inode_flags(struct ext4_inode_info *);
1372extern int ext4_alloc_da_blocks(struct inode *inode); 1391extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1575struct ext4_group_info { 1594struct ext4_group_info {
1576 unsigned long bb_state; 1595 unsigned long bb_state;
1577 struct rb_root bb_free_root; 1596 struct rb_root bb_free_root;
1578 unsigned short bb_first_free; 1597 ext4_grpblk_t bb_first_free; /* first free block */
1579 unsigned short bb_free; 1598 ext4_grpblk_t bb_free; /* total free blocks */
1580 unsigned short bb_fragments; 1599 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1581 struct list_head bb_prealloc_list; 1600 struct list_head bb_prealloc_list;
1582#ifdef DOUBLE_CHECK 1601#ifdef DOUBLE_CHECK
1583 void *bb_bitmap; 1602 void *bb_bitmap;
1584#endif 1603#endif
1585 struct rw_semaphore alloc_sem; 1604 struct rw_semaphore alloc_sem;
1586 unsigned short bb_counters[]; 1605 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
1606 * regions, index is order.
1607 * bb_counters[3] = 5 means
1608 * 5 free 8-block regions. */
1587}; 1609};
1588 1610
1589#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 1611#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
1591#define EXT4_MB_GRP_NEED_INIT(grp) \ 1613#define EXT4_MB_GRP_NEED_INIT(grp) \
1592 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 1614 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1593 1615
1616#define EXT4_MAX_CONTENTION 8
1617#define EXT4_CONTENTION_THRESHOLD 2
1618
1594static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, 1619static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
1595 ext4_group_t group) 1620 ext4_group_t group)
1596{ 1621{
1597 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); 1622 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
1598} 1623}
1599 1624
1625/*
1626 * Returns true if the filesystem is busy enough that attempts to
1627 * access the block group locks has run into contention.
1628 */
1629static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
1630{
1631 return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
1632}
1633
1600static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 1634static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1601{ 1635{
1602 spin_lock(ext4_group_lock_ptr(sb, group)); 1636 spinlock_t *lock = ext4_group_lock_ptr(sb, group);
1637 if (spin_trylock(lock))
1638 /*
1639 * We're able to grab the lock right away, so drop the
1640 * lock contention counter.
1641 */
1642 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
1643 else {
1644 /*
1645 * The lock is busy, so bump the contention counter,
1646 * and then wait on the spin lock.
1647 */
1648 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
1649 EXT4_MAX_CONTENTION);
1650 spin_lock(lock);
1651 }
1603} 1652}
1604 1653
1605static inline void ext4_unlock_group(struct super_block *sb, 1654static inline void ext4_unlock_group(struct super_block *sb,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
43#define CHECK_BINSEARCH__ 43#define CHECK_BINSEARCH__
44 44
45/* 45/*
46 * If EXT_DEBUG is defined you can use the 'extdebug' mount option 46 * Turn on EXT_DEBUG to get lots of info about extents operations.
47 * to get lots of info about what's going on.
48 */ 47 */
49#define EXT_DEBUG__ 48#define EXT_DEBUG__
50#ifdef EXT_DEBUG 49#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
138#define EXT_BREAK 1 137#define EXT_BREAK 1
139#define EXT_REPEAT 2 138#define EXT_REPEAT 2
140 139
140/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
141#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
142 142
143/* 143/*
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
44 handle, err); 44 handle, err);
45 } 45 }
46 else 46 else
47 brelse(bh); 47 bforget(bh);
48 return err; 48 return err;
49} 49}
50 50
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
60 handle, err); 60 handle, err);
61 } 61 }
62 else 62 else
63 brelse(bh); 63 bforget(bh);
64 return err; 64 return err;
65} 65}
66 66
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
89 ext4_journal_abort_handle(where, __func__, bh, 89 ext4_journal_abort_handle(where, __func__, bh,
90 handle, err); 90 handle, err);
91 } else { 91 } else {
92 mark_buffer_dirty(bh); 92 if (inode && bh)
93 mark_buffer_dirty_inode(bh, inode);
94 else
95 mark_buffer_dirty(bh);
93 if (inode && inode_needs_sync(inode)) { 96 if (inode && inode_needs_sync(inode)) {
94 sync_dirty_buffer(bh); 97 sync_dirty_buffer(bh);
95 if (buffer_req(bh) && !buffer_uptodate(bh)) { 98 if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94} 94}
95 95
96static int ext4_ext_journal_restart(handle_t *handle, int needed) 96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode,
98 int needed)
97{ 99{
98 int err; 100 int err;
99 101
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
104 err = ext4_journal_extend(handle, needed); 106 err = ext4_journal_extend(handle, needed);
105 if (err <= 0) 107 if (err <= 0)
106 return err; 108 return err;
107 return ext4_journal_restart(handle, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /*
111 * We have dropped i_data_sem so someone might have cached again
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115
116 return err;
108} 117}
109 118
110/* 119/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
220 return newblock; 229 return newblock;
221} 230}
222 231
223static int ext4_ext_space_block(struct inode *inode) 232static inline int ext4_ext_space_block(struct inode *inode, int check)
224{ 233{
225 int size; 234 int size;
226 235
227 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 236 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
228 / sizeof(struct ext4_extent); 237 / sizeof(struct ext4_extent);
238 if (!check) {
229#ifdef AGGRESSIVE_TEST 239#ifdef AGGRESSIVE_TEST
230 if (size > 6) 240 if (size > 6)
231 size = 6; 241 size = 6;
232#endif 242#endif
243 }
233 return size; 244 return size;
234} 245}
235 246
236static int ext4_ext_space_block_idx(struct inode *inode) 247static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
237{ 248{
238 int size; 249 int size;
239 250
240 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 251 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
241 / sizeof(struct ext4_extent_idx); 252 / sizeof(struct ext4_extent_idx);
253 if (!check) {
242#ifdef AGGRESSIVE_TEST 254#ifdef AGGRESSIVE_TEST
243 if (size > 5) 255 if (size > 5)
244 size = 5; 256 size = 5;
245#endif 257#endif
258 }
246 return size; 259 return size;
247} 260}
248 261
249static int ext4_ext_space_root(struct inode *inode) 262static inline int ext4_ext_space_root(struct inode *inode, int check)
250{ 263{
251 int size; 264 int size;
252 265
253 size = sizeof(EXT4_I(inode)->i_data); 266 size = sizeof(EXT4_I(inode)->i_data);
254 size -= sizeof(struct ext4_extent_header); 267 size -= sizeof(struct ext4_extent_header);
255 size /= sizeof(struct ext4_extent); 268 size /= sizeof(struct ext4_extent);
269 if (!check) {
256#ifdef AGGRESSIVE_TEST 270#ifdef AGGRESSIVE_TEST
257 if (size > 3) 271 if (size > 3)
258 size = 3; 272 size = 3;
259#endif 273#endif
274 }
260 return size; 275 return size;
261} 276}
262 277
263static int ext4_ext_space_root_idx(struct inode *inode) 278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
264{ 279{
265 int size; 280 int size;
266 281
267 size = sizeof(EXT4_I(inode)->i_data); 282 size = sizeof(EXT4_I(inode)->i_data);
268 size -= sizeof(struct ext4_extent_header); 283 size -= sizeof(struct ext4_extent_header);
269 size /= sizeof(struct ext4_extent_idx); 284 size /= sizeof(struct ext4_extent_idx);
285 if (!check) {
270#ifdef AGGRESSIVE_TEST 286#ifdef AGGRESSIVE_TEST
271 if (size > 4) 287 if (size > 4)
272 size = 4; 288 size = 4;
273#endif 289#endif
290 }
274 return size; 291 return size;
275} 292}
276 293
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
284 int lcap, icap, rcap, leafs, idxs, num; 301 int lcap, icap, rcap, leafs, idxs, num;
285 int newextents = blocks; 302 int newextents = blocks;
286 303
287 rcap = ext4_ext_space_root_idx(inode); 304 rcap = ext4_ext_space_root_idx(inode, 0);
288 lcap = ext4_ext_space_block(inode); 305 lcap = ext4_ext_space_block(inode, 0);
289 icap = ext4_ext_space_block_idx(inode); 306 icap = ext4_ext_space_block_idx(inode, 0);
290 307
291 /* number of new leaf blocks needed */ 308 /* number of new leaf blocks needed */
292 num = leafs = (newextents + lcap - 1) / lcap; 309 num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
311 328
312 if (depth == ext_depth(inode)) { 329 if (depth == ext_depth(inode)) {
313 if (depth == 0) 330 if (depth == 0)
314 max = ext4_ext_space_root(inode); 331 max = ext4_ext_space_root(inode, 1);
315 else 332 else
316 max = ext4_ext_space_root_idx(inode); 333 max = ext4_ext_space_root_idx(inode, 1);
317 } else { 334 } else {
318 if (depth == 0) 335 if (depth == 0)
319 max = ext4_ext_space_block(inode); 336 max = ext4_ext_space_block(inode, 1);
320 else 337 else
321 max = ext4_ext_space_block_idx(inode); 338 max = ext4_ext_space_block_idx(inode, 1);
322 } 339 }
323 340
324 return max; 341 return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
437 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 454 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
438 idx_pblock(path->p_idx)); 455 idx_pblock(path->p_idx));
439 } else if (path->p_ext) { 456 } else if (path->p_ext) {
440 ext_debug(" %d:%d:%llu ", 457 ext_debug(" %d:[%d]%d:%llu ",
441 le32_to_cpu(path->p_ext->ee_block), 458 le32_to_cpu(path->p_ext->ee_block),
459 ext4_ext_is_uninitialized(path->p_ext),
442 ext4_ext_get_actual_len(path->p_ext), 460 ext4_ext_get_actual_len(path->p_ext),
443 ext_pblock(path->p_ext)); 461 ext_pblock(path->p_ext));
444 } else 462 } else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
460 eh = path[depth].p_hdr; 478 eh = path[depth].p_hdr;
461 ex = EXT_FIRST_EXTENT(eh); 479 ex = EXT_FIRST_EXTENT(eh);
462 480
481 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
482
463 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 483 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
464 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), 484 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
485 ext4_ext_is_uninitialized(ex),
465 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 486 ext4_ext_get_actual_len(ex), ext_pblock(ex));
466 } 487 }
467 ext_debug("\n"); 488 ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
580 } 601 }
581 602
582 path->p_ext = l - 1; 603 path->p_ext = l - 1;
583 ext_debug(" -> %d:%llu:%d ", 604 ext_debug(" -> %d:%llu:[%d]%d ",
584 le32_to_cpu(path->p_ext->ee_block), 605 le32_to_cpu(path->p_ext->ee_block),
585 ext_pblock(path->p_ext), 606 ext_pblock(path->p_ext),
607 ext4_ext_is_uninitialized(path->p_ext),
586 ext4_ext_get_actual_len(path->p_ext)); 608 ext4_ext_get_actual_len(path->p_ext));
587 609
588#ifdef CHECK_BINSEARCH 610#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
612 eh->eh_depth = 0; 634 eh->eh_depth = 0;
613 eh->eh_entries = 0; 635 eh->eh_entries = 0;
614 eh->eh_magic = EXT4_EXT_MAGIC; 636 eh->eh_magic = EXT4_EXT_MAGIC;
615 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); 637 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
616 ext4_mark_inode_dirty(handle, inode); 638 ext4_mark_inode_dirty(handle, inode);
617 ext4_ext_invalidate_cache(inode); 639 ext4_ext_invalidate_cache(inode);
618 return 0; 640 return 0;
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
837 859
838 neh = ext_block_hdr(bh); 860 neh = ext_block_hdr(bh);
839 neh->eh_entries = 0; 861 neh->eh_entries = 0;
840 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 862 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
841 neh->eh_magic = EXT4_EXT_MAGIC; 863 neh->eh_magic = EXT4_EXT_MAGIC;
842 neh->eh_depth = 0; 864 neh->eh_depth = 0;
843 ex = EXT_FIRST_EXTENT(neh); 865 ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
850 path[depth].p_ext++; 872 path[depth].p_ext++;
851 while (path[depth].p_ext <= 873 while (path[depth].p_ext <=
852 EXT_MAX_EXTENT(path[depth].p_hdr)) { 874 EXT_MAX_EXTENT(path[depth].p_hdr)) {
853 ext_debug("move %d:%llu:%d in new leaf %llu\n", 875 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
854 le32_to_cpu(path[depth].p_ext->ee_block), 876 le32_to_cpu(path[depth].p_ext->ee_block),
855 ext_pblock(path[depth].p_ext), 877 ext_pblock(path[depth].p_ext),
878 ext4_ext_is_uninitialized(path[depth].p_ext),
856 ext4_ext_get_actual_len(path[depth].p_ext), 879 ext4_ext_get_actual_len(path[depth].p_ext),
857 newblock); 880 newblock);
858 /*memmove(ex++, path[depth].p_ext++, 881 /*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
912 neh = ext_block_hdr(bh); 935 neh = ext_block_hdr(bh);
913 neh->eh_entries = cpu_to_le16(1); 936 neh->eh_entries = cpu_to_le16(1);
914 neh->eh_magic = EXT4_EXT_MAGIC; 937 neh->eh_magic = EXT4_EXT_MAGIC;
915 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 938 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
916 neh->eh_depth = cpu_to_le16(depth - i); 939 neh->eh_depth = cpu_to_le16(depth - i);
917 fidx = EXT_FIRST_INDEX(neh); 940 fidx = EXT_FIRST_INDEX(neh);
918 fidx->ei_block = border; 941 fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1037 /* old root could have indexes or leaves 1060 /* old root could have indexes or leaves
1038 * so calculate e_max right way */ 1061 * so calculate e_max right way */
1039 if (ext_depth(inode)) 1062 if (ext_depth(inode))
1040 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 1063 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1041 else 1064 else
1042 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 1065 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1043 neh->eh_magic = EXT4_EXT_MAGIC; 1066 neh->eh_magic = EXT4_EXT_MAGIC;
1044 set_buffer_uptodate(bh); 1067 set_buffer_uptodate(bh);
1045 unlock_buffer(bh); 1068 unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1054 goto out; 1077 goto out;
1055 1078
1056 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; 1079 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1057 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); 1080 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1058 curp->p_hdr->eh_entries = cpu_to_le16(1); 1081 curp->p_hdr->eh_entries = cpu_to_le16(1);
1059 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); 1082 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1060 1083
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1580 1603
1581 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1582 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1583 ext_debug("append %d block to %d:%d (from %llu)\n", 1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext),
1584 ext4_ext_get_actual_len(newext), 1608 ext4_ext_get_actual_len(newext),
1585 le32_to_cpu(ex->ee_block), 1609 le32_to_cpu(ex->ee_block),
1610 ext4_ext_is_uninitialized(ex),
1586 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1611 ext4_ext_get_actual_len(ex), ext_pblock(ex));
1587 err = ext4_ext_get_access(handle, inode, path + depth); 1612 err = ext4_ext_get_access(handle, inode, path + depth);
1588 if (err) 1613 if (err)
@@ -1651,9 +1676,10 @@ has_space:
1651 1676
1652 if (!nearex) { 1677 if (!nearex) {
1653 /* there is no extent in this leaf, create first one */ 1678 /* there is no extent in this leaf, create first one */
1654 ext_debug("first extent in the leaf: %d:%llu:%d\n", 1679 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1655 le32_to_cpu(newext->ee_block), 1680 le32_to_cpu(newext->ee_block),
1656 ext_pblock(newext), 1681 ext_pblock(newext),
1682 ext4_ext_is_uninitialized(newext),
1657 ext4_ext_get_actual_len(newext)); 1683 ext4_ext_get_actual_len(newext));
1658 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1684 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1659 } else if (le32_to_cpu(newext->ee_block) 1685 } else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1689,11 @@ has_space:
1663 len = EXT_MAX_EXTENT(eh) - nearex; 1689 len = EXT_MAX_EXTENT(eh) - nearex;
1664 len = (len - 1) * sizeof(struct ext4_extent); 1690 len = (len - 1) * sizeof(struct ext4_extent);
1665 len = len < 0 ? 0 : len; 1691 len = len < 0 ? 0 : len;
1666 ext_debug("insert %d:%llu:%d after: nearest 0x%p, " 1692 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1667 "move %d from 0x%p to 0x%p\n", 1693 "move %d from 0x%p to 0x%p\n",
1668 le32_to_cpu(newext->ee_block), 1694 le32_to_cpu(newext->ee_block),
1669 ext_pblock(newext), 1695 ext_pblock(newext),
1696 ext4_ext_is_uninitialized(newext),
1670 ext4_ext_get_actual_len(newext), 1697 ext4_ext_get_actual_len(newext),
1671 nearex, len, nearex + 1, nearex + 2); 1698 nearex, len, nearex + 1, nearex + 2);
1672 memmove(nearex + 2, nearex + 1, len); 1699 memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1703,11 @@ has_space:
1676 BUG_ON(newext->ee_block == nearex->ee_block); 1703 BUG_ON(newext->ee_block == nearex->ee_block);
1677 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); 1704 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1678 len = len < 0 ? 0 : len; 1705 len = len < 0 ? 0 : len;
1679 ext_debug("insert %d:%llu:%d before: nearest 0x%p, " 1706 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1680 "move %d from 0x%p to 0x%p\n", 1707 "move %d from 0x%p to 0x%p\n",
1681 le32_to_cpu(newext->ee_block), 1708 le32_to_cpu(newext->ee_block),
1682 ext_pblock(newext), 1709 ext_pblock(newext),
1710 ext4_ext_is_uninitialized(newext),
1683 ext4_ext_get_actual_len(newext), 1711 ext4_ext_get_actual_len(newext),
1684 nearex, len, nearex + 1, nearex + 2); 1712 nearex, len, nearex + 1, nearex + 2);
1685 memmove(nearex + 1, nearex, len); 1713 memmove(nearex + 1, nearex, len);
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2094 else 2122 else
2095 uninitialized = 0; 2123 uninitialized = 0;
2096 2124
2097 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 2125 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2126 uninitialized, ex_ee_len);
2098 path[depth].p_ext = ex; 2127 path[depth].p_ext = ex;
2099 2128
2100 a = ex_ee_block > start ? ex_ee_block : start; 2129 a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2138 } 2167 }
2139 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2168 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2140 2169
2141 err = ext4_ext_journal_restart(handle, credits); 2170 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2142 if (err) 2171 if (err)
2143 goto out; 2172 goto out;
2144 2173
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2327 if (err == 0) { 2356 if (err == 0) {
2328 ext_inode_hdr(inode)->eh_depth = 0; 2357 ext_inode_hdr(inode)->eh_depth = 0;
2329 ext_inode_hdr(inode)->eh_max = 2358 ext_inode_hdr(inode)->eh_max =
2330 cpu_to_le16(ext4_ext_space_root(inode)); 2359 cpu_to_le16(ext4_ext_space_root(inode, 0));
2331 err = ext4_ext_dirty(handle, inode, path); 2360 err = ext4_ext_dirty(handle, inode, path);
2332 } 2361 }
2333 } 2362 }
@@ -2743,6 +2772,7 @@ insert:
2743 } else if (err) 2772 } else if (err)
2744 goto fix_extent_len; 2773 goto fix_extent_len;
2745out: 2774out:
2775 ext4_ext_show_leaf(inode, path);
2746 return err ? err : allocated; 2776 return err ? err : allocated;
2747 2777
2748fix_extent_len: 2778fix_extent_len:
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2786 struct ext4_allocation_request ar; 2816 struct ext4_allocation_request ar;
2787 2817
2788 __clear_bit(BH_New, &bh_result->b_state); 2818 __clear_bit(BH_New, &bh_result->b_state);
2789 ext_debug("blocks %u/%u requested for inode %u\n", 2819 ext_debug("blocks %u/%u requested for inode %lu\n",
2790 iblock, max_blocks, inode->i_ino); 2820 iblock, max_blocks, inode->i_ino);
2791 2821
2792 /* check in cache */ 2822 /* check in cache */
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2849 newblock = iblock - ee_block + ee_start; 2879 newblock = iblock - ee_block + ee_start;
2850 /* number of remaining blocks in the extent */ 2880 /* number of remaining blocks in the extent */
2851 allocated = ee_len - (iblock - ee_block); 2881 allocated = ee_len - (iblock - ee_block);
2852 ext_debug("%u fit into %lu:%d -> %llu\n", iblock, 2882 ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2853 ee_block, ee_len, newblock); 2883 ee_block, ee_len, newblock);
2854 2884
2855 /* Do not put uninitialized extent in the cache */ 2885 /* Do not put uninitialized extent in the cache */
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2950 newblock = ext4_mb_new_blocks(handle, &ar, &err); 2980 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2951 if (!newblock) 2981 if (!newblock)
2952 goto out2; 2982 goto out2;
2953 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2983 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
2954 ar.goal, newblock, allocated); 2984 ar.goal, newblock, allocated);
2955 2985
2956 /* try to insert new extent into found leaf and return */ 2986 /* try to insert new extent into found leaf and return */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
50{ 50{
51 struct inode *inode = dentry->d_inode; 51 struct inode *inode = dentry->d_inode;
52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
53 int ret = 0; 53 int err, ret = 0;
54 54
55 J_ASSERT(ext4_journal_current_handle() == NULL); 55 J_ASSERT(ext4_journal_current_handle() == NULL);
56 56
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 goto out; 79 goto out;
80 } 80 }
81 81
82 if (!journal)
83 ret = sync_mapping_buffers(inode->i_mapping);
84
82 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 85 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
83 goto out; 86 goto out;
84 87
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
91 .sync_mode = WB_SYNC_ALL, 94 .sync_mode = WB_SYNC_ALL,
92 .nr_to_write = 0, /* sys_fsync did this */ 95 .nr_to_write = 0, /* sys_fsync did this */
93 }; 96 };
94 ret = sync_inode(inode, &wbc); 97 err = sync_inode(inode, &wbc);
95 if (journal && (journal->j_flags & JBD2_BARRIER)) 98 if (ret == 0)
96 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 99 ret = err;
97 } 100 }
98out: 101out:
102 if (journal && (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
99 return ret; 104 return ret;
100} 105}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1189 1189
1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1192 i, ext4_free_inodes_count(sb, gdp), x); 1192 (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
1193 bitmap_count += x; 1193 bitmap_count += x;
1194 } 1194 }
1195 brelse(bitmap_bh); 1195 brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..4abd683b963d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 192 * so before we call here everything must be consistently dirtied against
193 * this transaction. 193 * this transaction.
194 */ 194 */
195static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks)
196{ 197{
198 int ret;
199
200 /*
201 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
202 * moment, get_block can be called only for blocks inside i_size since
203 * page cache has been already dropped and writes are blocked by
204 * i_mutex. So we can safely drop the i_data_sem here.
205 */
197 BUG_ON(EXT4_JOURNAL(inode) == NULL); 206 BUG_ON(EXT4_JOURNAL(inode) == NULL);
198 jbd_debug(2, "restarting handle %p\n", handle); 207 jbd_debug(2, "restarting handle %p\n", handle);
199 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 208 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem);
211
212 return ret;
200} 213}
201 214
202/* 215/*
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode,
341 int n = 0; 354 int n = 0;
342 int final = 0; 355 int final = 0;
343 356
344 if (i_block < 0) { 357 if (i_block < direct_blocks) {
345 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
346 } else if (i_block < direct_blocks) {
347 offsets[n++] = i_block; 358 offsets[n++] = i_block;
348 final = direct_blocks; 359 final = direct_blocks;
349 } else if ((i_block -= direct_blocks) < indirect_blocks) { 360 } else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
551 * 562 *
552 * Normally this function find the preferred place for block allocation, 563 * Normally this function find the preferred place for block allocation,
553 * returns it. 564 * returns it.
565 * Because this is only used for non-extent files, we limit the block nr
566 * to 32 bits.
554 */ 567 */
555static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 568static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
556 Indirect *partial) 569 Indirect *partial)
557{ 570{
571 ext4_fsblk_t goal;
572
558 /* 573 /*
559 * XXX need to get goal block from mballoc's data structures 574 * XXX need to get goal block from mballoc's data structures
560 */ 575 */
561 576
562 return ext4_find_near(inode, partial); 577 goal = ext4_find_near(inode, partial);
578 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
579 return goal;
563} 580}
564 581
565/** 582/**
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
640 if (*err) 657 if (*err)
641 goto failed_out; 658 goto failed_out;
642 659
660 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
661
643 target -= count; 662 target -= count;
644 /* allocate blocks for indirect blocks */ 663 /* allocate blocks for indirect blocks */
645 while (index < indirect_blks && count) { 664 while (index < indirect_blks && count) {
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
674 ar.flags = EXT4_MB_HINT_DATA; 693 ar.flags = EXT4_MB_HINT_DATA;
675 694
676 current_block = ext4_mb_new_blocks(handle, &ar, err); 695 current_block = ext4_mb_new_blocks(handle, &ar, err);
696 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
677 697
678 if (*err && (target == blks)) { 698 if (*err && (target == blks)) {
679 /* 699 /*
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
762 BUFFER_TRACE(bh, "call get_create_access"); 782 BUFFER_TRACE(bh, "call get_create_access");
763 err = ext4_journal_get_create_access(handle, bh); 783 err = ext4_journal_get_create_access(handle, bh);
764 if (err) { 784 if (err) {
785 /* Don't brelse(bh) here; it's done in
786 * ext4_journal_forget() below */
765 unlock_buffer(bh); 787 unlock_buffer(bh);
766 brelse(bh);
767 goto failed; 788 goto failed;
768 } 789 }
769 790
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1109 ext4_discard_preallocations(inode); 1130 ext4_discard_preallocations(inode);
1110} 1131}
1111 1132
1112static int check_block_validity(struct inode *inode, sector_t logical, 1133static int check_block_validity(struct inode *inode, const char *msg,
1113 sector_t phys, int len) 1134 sector_t logical, sector_t phys, int len)
1114{ 1135{
1115 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1136 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1116 ext4_error(inode->i_sb, "check_block_validity", 1137 ext4_error(inode->i_sb, msg,
1117 "inode #%lu logical block %llu mapped to %llu " 1138 "inode #%lu logical block %llu mapped to %llu "
1118 "(size %d)", inode->i_ino, 1139 "(size %d)", inode->i_ino,
1119 (unsigned long long) logical, 1140 (unsigned long long) logical,
1120 (unsigned long long) phys, len); 1141 (unsigned long long) phys, len);
1121 WARN_ON(1);
1122 return -EIO; 1142 return -EIO;
1123 } 1143 }
1124 return 0; 1144 return 0;
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1170 up_read((&EXT4_I(inode)->i_data_sem)); 1190 up_read((&EXT4_I(inode)->i_data_sem));
1171 1191
1172 if (retval > 0 && buffer_mapped(bh)) { 1192 if (retval > 0 && buffer_mapped(bh)) {
1173 int ret = check_block_validity(inode, block, 1193 int ret = check_block_validity(inode, "file system corruption",
1174 bh->b_blocknr, retval); 1194 block, bh->b_blocknr, retval);
1175 if (ret != 0) 1195 if (ret != 0)
1176 return ret; 1196 return ret;
1177 } 1197 }
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1235 * i_data's format changing. Force the migrate 1255 * i_data's format changing. Force the migrate
1236 * to fail by clearing migrate flags 1256 * to fail by clearing migrate flags
1237 */ 1257 */
1238 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1258 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
1239 ~EXT4_EXT_MIGRATE;
1240 } 1259 }
1241 } 1260 }
1242 1261
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1252 1271
1253 up_write((&EXT4_I(inode)->i_data_sem)); 1272 up_write((&EXT4_I(inode)->i_data_sem));
1254 if (retval > 0 && buffer_mapped(bh)) { 1273 if (retval > 0 && buffer_mapped(bh)) {
1255 int ret = check_block_validity(inode, block, 1274 int ret = check_block_validity(inode, "file system "
1256 bh->b_blocknr, retval); 1275 "corruption after allocation",
1276 block, bh->b_blocknr, retval);
1257 if (ret != 0) 1277 if (ret != 0)
1258 return ret; 1278 return ret;
1259 } 1279 }
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1863 * Delayed allocation stuff 1883 * Delayed allocation stuff
1864 */ 1884 */
1865 1885
1866struct mpage_da_data {
1867 struct inode *inode;
1868 sector_t b_blocknr; /* start block number of extent */
1869 size_t b_size; /* size of extent */
1870 unsigned long b_state; /* state of the extent */
1871 unsigned long first_page, next_page; /* extent of pages */
1872 struct writeback_control *wbc;
1873 int io_done;
1874 int pages_written;
1875 int retval;
1876};
1877
1878/* 1886/*
1879 * mpage_da_submit_io - walks through extent of pages and try to write 1887 * mpage_da_submit_io - walks through extent of pages and try to write
1880 * them with writepage() call back 1888 * them with writepage() call back
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2737 long pages_skipped; 2745 long pages_skipped;
2738 int range_cyclic, cycled = 1, io_done = 0; 2746 int range_cyclic, cycled = 1, io_done = 0;
2739 int needed_blocks, ret = 0, nr_to_writebump = 0; 2747 int needed_blocks, ret = 0, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start;
2740 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2741 2750
2742 trace_ext4_da_writepages(inode, wbc); 2751 trace_ext4_da_writepages(inode, wbc);
@@ -2850,6 +2859,7 @@ retry:
2850 mpd.io_done = 1; 2859 mpd.io_done = 1;
2851 ret = MPAGE_DA_EXTENT_TAIL; 2860 ret = MPAGE_DA_EXTENT_TAIL;
2852 } 2861 }
2862 trace_ext4_da_write_pages(inode, &mpd);
2853 wbc->nr_to_write -= mpd.pages_written; 2863 wbc->nr_to_write -= mpd.pages_written;
2854 2864
2855 ext4_journal_stop(handle); 2865 ext4_journal_stop(handle);
@@ -2905,6 +2915,7 @@ out_writepages:
2905 if (!no_nrwrite_index_update) 2915 if (!no_nrwrite_index_update)
2906 wbc->no_nrwrite_index_update = 0; 2916 wbc->no_nrwrite_index_update = 0;
2907 wbc->nr_to_write -= nr_to_writebump; 2917 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start;
2908 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2909 return ret; 2920 return ret;
2910} 2921}
@@ -3117,6 +3128,8 @@ out:
3117 */ 3128 */
3118int ext4_alloc_da_blocks(struct inode *inode) 3129int ext4_alloc_da_blocks(struct inode *inode)
3119{ 3130{
3131 trace_ext4_alloc_da_blocks(inode);
3132
3120 if (!EXT4_I(inode)->i_reserved_data_blocks && 3133 if (!EXT4_I(inode)->i_reserved_data_blocks &&
3121 !EXT4_I(inode)->i_reserved_meta_blocks) 3134 !EXT4_I(inode)->i_reserved_meta_blocks)
3122 return 0; 3135 return 0;
@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3659 ext4_handle_dirty_metadata(handle, inode, bh); 3672 ext4_handle_dirty_metadata(handle, inode, bh);
3660 } 3673 }
3661 ext4_mark_inode_dirty(handle, inode); 3674 ext4_mark_inode_dirty(handle, inode);
3662 ext4_journal_test_restart(handle, inode); 3675 ext4_truncate_restart_trans(handle, inode,
3676 blocks_for_truncate(inode));
3663 if (bh) { 3677 if (bh) {
3664 BUFFER_TRACE(bh, "retaking write access"); 3678 BUFFER_TRACE(bh, "retaking write access");
3665 ext4_journal_get_write_access(handle, bh); 3679 ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3870 return; 3884 return;
3871 if (try_to_extend_transaction(handle, inode)) { 3885 if (try_to_extend_transaction(handle, inode)) {
3872 ext4_mark_inode_dirty(handle, inode); 3886 ext4_mark_inode_dirty(handle, inode);
3873 ext4_journal_test_restart(handle, inode); 3887 ext4_truncate_restart_trans(handle, inode,
3888 blocks_for_truncate(inode));
3874 } 3889 }
3875 3890
3876 ext4_free_blocks(handle, inode, nr, 1, 1); 3891 ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
3958 if (!ext4_can_truncate(inode)) 3973 if (!ext4_can_truncate(inode))
3959 return; 3974 return;
3960 3975
3961 if (ei->i_disksize && inode->i_size == 0 && 3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3962 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3963 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3964 3978
3965 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
4533 */ 4547 */
4534static int ext4_do_update_inode(handle_t *handle, 4548static int ext4_do_update_inode(handle_t *handle,
4535 struct inode *inode, 4549 struct inode *inode,
4536 struct ext4_iloc *iloc) 4550 struct ext4_iloc *iloc,
4551 int do_sync)
4537{ 4552{
4538 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4553 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4539 struct ext4_inode_info *ei = EXT4_I(inode); 4554 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle,
4581 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4596 if (ext4_inode_blocks_set(handle, raw_inode, ei))
4582 goto out_brelse; 4597 goto out_brelse;
4583 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4598 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4584 /* clear the migrate flag in the raw_inode */ 4599 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
4585 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
4586 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4600 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4587 cpu_to_le32(EXT4_OS_HURD)) 4601 cpu_to_le32(EXT4_OS_HURD))
4588 raw_inode->i_file_acl_high = 4602 raw_inode->i_file_acl_high =
@@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle,
4635 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4649 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4636 } 4650 }
4637 4651
4638 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4652 /*
4639 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4653 * If we're not using a journal and we were called from
4640 if (!err) 4654 * ext4_write_inode() to sync the inode (making do_sync true),
4641 err = rc; 4655 * we can just use sync_dirty_buffer() directly to do our dirty
4656 * work. Testing s_journal here is a bit redundant but it's
4657 * worth it to avoid potential future trouble.
4658 */
4659 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4660 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4661 sync_dirty_buffer(bh);
4662 } else {
4663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4664 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4665 if (!err)
4666 err = rc;
4667 }
4642 ei->i_state &= ~EXT4_STATE_NEW; 4668 ei->i_state &= ~EXT4_STATE_NEW;
4643 4669
4644out_brelse: 4670out_brelse:
@@ -4684,19 +4710,32 @@ out_brelse:
4684 */ 4710 */
4685int ext4_write_inode(struct inode *inode, int wait) 4711int ext4_write_inode(struct inode *inode, int wait)
4686{ 4712{
4713 int err;
4714
4687 if (current->flags & PF_MEMALLOC) 4715 if (current->flags & PF_MEMALLOC)
4688 return 0; 4716 return 0;
4689 4717
4690 if (ext4_journal_current_handle()) { 4718 if (EXT4_SB(inode->i_sb)->s_journal) {
4691 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4719 if (ext4_journal_current_handle()) {
4692 dump_stack(); 4720 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4693 return -EIO; 4721 dump_stack();
4694 } 4722 return -EIO;
4723 }
4695 4724
4696 if (!wait) 4725 if (!wait)
4697 return 0; 4726 return 0;
4727
4728 err = ext4_force_commit(inode->i_sb);
4729 } else {
4730 struct ext4_iloc iloc;
4698 4731
4699 return ext4_force_commit(inode->i_sb); 4732 err = ext4_get_inode_loc(inode, &iloc);
4733 if (err)
4734 return err;
4735 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
4736 inode, &iloc, wait);
4737 }
4738 return err;
4700} 4739}
4701 4740
4702/* 4741/*
@@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4990 get_bh(iloc->bh); 5029 get_bh(iloc->bh);
4991 5030
4992 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5031 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4993 err = ext4_do_update_inode(handle, inode, iloc); 5032 err = ext4_do_update_inode(handle, inode, iloc, 0);
4994 put_bh(iloc->bh); 5033 put_bh(iloc->bh);
4995 return err; 5034 return err;
4996} 5035}
@@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5281 else 5320 else
5282 len = PAGE_CACHE_SIZE; 5321 len = PAGE_CACHE_SIZE;
5283 5322
5323 lock_page(page);
5324 /*
5325 * return if we have all the buffers mapped. This avoid
5326 * the need to call write_begin/write_end which does a
5327 * journal_start/journal_stop which can block and take
5328 * long time
5329 */
5284 if (page_has_buffers(page)) { 5330 if (page_has_buffers(page)) {
5285 /* return if we have all the buffers mapped */
5286 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5331 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5287 ext4_bh_unmapped)) 5332 ext4_bh_unmapped)) {
5333 unlock_page(page);
5288 goto out_unlock; 5334 goto out_unlock;
5335 }
5289 } 5336 }
5337 unlock_page(page);
5290 /* 5338 /*
5291 * OK, we need to fill the hole... Do write_begin write_end 5339 * OK, we need to fill the hole... Do write_begin write_end
5292 * to do block allocation/reservation.We are not holding 5340 * to do block allocation/reservation.We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
243 me.donor_start, me.len, &me.moved_len); 243 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 244 fput(donor_filp);
245 245
246 if (!err) 246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
247 if (copy_to_user((struct move_extent *)arg, 247 return -EFAULT;
248 &me, sizeof(me))) 248
249 return -EFAULT;
250 return err; 249 return err;
251 } 250 }
252 251
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h>
25#include <trace/events/ext4.h> 26#include <trace/events/ext4.h>
26 27
27/* 28/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622 623
623/* FIXME!! need more doc */ 624/* FIXME!! need more doc */
624static void ext4_mb_mark_free_simple(struct super_block *sb, 625static void ext4_mb_mark_free_simple(struct super_block *sb,
625 void *buddy, unsigned first, int len, 626 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
626 struct ext4_group_info *grp) 627 struct ext4_group_info *grp)
627{ 628{
628 struct ext4_sb_info *sbi = EXT4_SB(sb); 629 struct ext4_sb_info *sbi = EXT4_SB(sb);
629 unsigned short min; 630 ext4_grpblk_t min;
630 unsigned short max; 631 ext4_grpblk_t max;
631 unsigned short chunk; 632 ext4_grpblk_t chunk;
632 unsigned short border; 633 unsigned short border;
633 634
634 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 635 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 663 void *buddy, void *bitmap, ext4_group_t group)
663{ 664{
664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 665 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
665 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 666 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
666 unsigned short i = 0; 667 ext4_grpblk_t i = 0;
667 unsigned short first; 668 ext4_grpblk_t first;
668 unsigned short len; 669 ext4_grpblk_t len;
669 unsigned free = 0; 670 unsigned free = 0;
670 unsigned fragments = 0; 671 unsigned fragments = 0;
671 unsigned long long period = get_cycles(); 672 unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
743 char *data; 744 char *data;
744 char *bitmap; 745 char *bitmap;
745 746
746 mb_debug("init page %lu\n", page->index); 747 mb_debug(1, "init page %lu\n", page->index);
747 748
748 inode = page->mapping->host; 749 inode = page->mapping->host;
749 sb = inode->i_sb; 750 sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
822 set_bitmap_uptodate(bh[i]); 823 set_bitmap_uptodate(bh[i]);
823 bh[i]->b_end_io = end_buffer_read_sync; 824 bh[i]->b_end_io = end_buffer_read_sync;
824 submit_bh(READ, bh[i]); 825 submit_bh(READ, bh[i]);
825 mb_debug("read bitmap for group %u\n", first_group + i); 826 mb_debug(1, "read bitmap for group %u\n", first_group + i);
826 } 827 }
827 828
828 /* wait for I/O completion */ 829 /* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 if ((first_block + i) & 1) { 863 if ((first_block + i) & 1) {
863 /* this is block of buddy */ 864 /* this is block of buddy */
864 BUG_ON(incore == NULL); 865 BUG_ON(incore == NULL);
865 mb_debug("put buddy for group %u in page %lu/%x\n", 866 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 867 group, page->index, i * blocksize);
867 grinfo = ext4_get_group_info(sb, group); 868 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 869 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 870 memset(grinfo->bb_counters, 0,
870 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 871 sizeof(*grinfo->bb_counters) *
872 (sb->s_blocksize_bits+2));
871 /* 873 /*
872 * incore got set to the group block bitmap below 874 * incore got set to the group block bitmap below
873 */ 875 */
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
878 } else { 880 } else {
879 /* this is block of bitmap */ 881 /* this is block of bitmap */
880 BUG_ON(incore != NULL); 882 BUG_ON(incore != NULL);
881 mb_debug("put bitmap for group %u in page %lu/%x\n", 883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
882 group, page->index, i * blocksize); 884 group, page->index, i * blocksize);
883 885
884 /* see comments in ext4_mb_put_pa() */ 886 /* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
908 return err; 910 return err;
909} 911}
910 912
913static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{
916
917 int ret = 0;
918 void *bitmap;
919 int blocks_per_page;
920 int block, pnum, poff;
921 int num_grp_locked = 0;
922 struct ext4_group_info *this_grp;
923 struct ext4_sb_info *sbi = EXT4_SB(sb);
924 struct inode *inode = sbi->s_buddy_cache;
925 struct page *page = NULL, *bitmap_page = NULL;
926
927 mb_debug(1, "init group %u\n", group);
928 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
929 this_grp = ext4_get_group_info(sb, group);
930 /*
931 * This ensures that we don't reinit the buddy cache
932 * page which map to the group from which we are already
933 * allocating. If we are looking at the buddy cache we would
934 * have taken a reference using ext4_mb_load_buddy and that
935 * would have taken the alloc_sem lock.
936 */
937 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
938 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
939 /*
940 * somebody initialized the group
941 * return without doing anything
942 */
943 ret = 0;
944 goto err;
945 }
946 /*
947 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks.
949 * So for each group we need two blocks.
950 */
951 block = group * 2;
952 pnum = block / blocks_per_page;
953 poff = block % blocks_per_page;
954 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
955 if (page) {
956 BUG_ON(page->mapping != inode->i_mapping);
957 ret = ext4_mb_init_cache(page, NULL);
958 if (ret) {
959 unlock_page(page);
960 goto err;
961 }
962 unlock_page(page);
963 }
964 if (page == NULL || !PageUptodate(page)) {
965 ret = -EIO;
966 goto err;
967 }
968 mark_page_accessed(page);
969 bitmap_page = page;
970 bitmap = page_address(page) + (poff * sb->s_blocksize);
971
972 /* init buddy cache */
973 block++;
974 pnum = block / blocks_per_page;
975 poff = block % blocks_per_page;
976 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
977 if (page == bitmap_page) {
978 /*
979 * If both the bitmap and buddy are in
980 * the same page we don't need to force
981 * init the buddy
982 */
983 unlock_page(page);
984 } else if (page) {
985 BUG_ON(page->mapping != inode->i_mapping);
986 ret = ext4_mb_init_cache(page, bitmap);
987 if (ret) {
988 unlock_page(page);
989 goto err;
990 }
991 unlock_page(page);
992 }
993 if (page == NULL || !PageUptodate(page)) {
994 ret = -EIO;
995 goto err;
996 }
997 mark_page_accessed(page);
998err:
999 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1000 if (bitmap_page)
1001 page_cache_release(bitmap_page);
1002 if (page)
1003 page_cache_release(page);
1004 return ret;
1005}
1006
911static noinline_for_stack int 1007static noinline_for_stack int
912ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
913 struct ext4_buddy *e4b) 1009 struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
922 struct ext4_sb_info *sbi = EXT4_SB(sb); 1018 struct ext4_sb_info *sbi = EXT4_SB(sb);
923 struct inode *inode = sbi->s_buddy_cache; 1019 struct inode *inode = sbi->s_buddy_cache;
924 1020
925 mb_debug("load group %u\n", group); 1021 mb_debug(1, "load group %u\n", group);
926 1022
927 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1023 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
928 grp = ext4_get_group_info(sb, group); 1024 grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
941 * groups mapped by the page is blocked 1037 * groups mapped by the page is blocked
942 * till we are done with allocation 1038 * till we are done with allocation
943 */ 1039 */
1040repeat_load_buddy:
944 down_read(e4b->alloc_semp); 1041 down_read(e4b->alloc_semp);
945 1042
1043 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1044 /* we need to check for group need init flag
1045 * with alloc_semp held so that we can be sure
1046 * that new blocks didn't get added to the group
1047 * when we are loading the buddy cache
1048 */
1049 up_read(e4b->alloc_semp);
1050 /*
1051 * we need full data about the group
1052 * to make a good selection
1053 */
1054 ret = ext4_mb_init_group(sb, group);
1055 if (ret)
1056 return ret;
1057 goto repeat_load_buddy;
1058 }
1059
946 /* 1060 /*
947 * the buddy cache inode stores the block bitmap 1061 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks. 1062 * and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1360 ac->alloc_semp = e4b->alloc_semp; 1474 ac->alloc_semp = e4b->alloc_semp;
1361 e4b->alloc_semp = NULL; 1475 e4b->alloc_semp = NULL;
1362 /* store last allocated for subsequent stream allocation */ 1476 /* store last allocated for subsequent stream allocation */
1363 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1477 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1364 spin_lock(&sbi->s_md_lock); 1478 spin_lock(&sbi->s_md_lock);
1365 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1479 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1366 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1480 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1837 1951
1838} 1952}
1839 1953
1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1842{
1843
1844 int ret;
1845 void *bitmap;
1846 int blocks_per_page;
1847 int block, pnum, poff;
1848 int num_grp_locked = 0;
1849 struct ext4_group_info *this_grp;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 struct inode *inode = sbi->s_buddy_cache;
1852 struct page *page = NULL, *bitmap_page = NULL;
1853
1854 mb_debug("init group %lu\n", group);
1855 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1856 this_grp = ext4_get_group_info(sb, group);
1857 /*
1858 * This ensures we don't add group
1859 * to this buddy cache via resize
1860 */
1861 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1862 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1863 /*
1864 * somebody initialized the group
1865 * return without doing anything
1866 */
1867 ret = 0;
1868 goto err;
1869 }
1870 /*
1871 * the buddy cache inode stores the block bitmap
1872 * and buddy information in consecutive blocks.
1873 * So for each group we need two blocks.
1874 */
1875 block = group * 2;
1876 pnum = block / blocks_per_page;
1877 poff = block % blocks_per_page;
1878 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1879 if (page) {
1880 BUG_ON(page->mapping != inode->i_mapping);
1881 ret = ext4_mb_init_cache(page, NULL);
1882 if (ret) {
1883 unlock_page(page);
1884 goto err;
1885 }
1886 unlock_page(page);
1887 }
1888 if (page == NULL || !PageUptodate(page)) {
1889 ret = -EIO;
1890 goto err;
1891 }
1892 mark_page_accessed(page);
1893 bitmap_page = page;
1894 bitmap = page_address(page) + (poff * sb->s_blocksize);
1895
1896 /* init buddy cache */
1897 block++;
1898 pnum = block / blocks_per_page;
1899 poff = block % blocks_per_page;
1900 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1901 if (page == bitmap_page) {
1902 /*
1903 * If both the bitmap and buddy are in
1904 * the same page we don't need to force
1905 * init the buddy
1906 */
1907 unlock_page(page);
1908 } else if (page) {
1909 BUG_ON(page->mapping != inode->i_mapping);
1910 ret = ext4_mb_init_cache(page, bitmap);
1911 if (ret) {
1912 unlock_page(page);
1913 goto err;
1914 }
1915 unlock_page(page);
1916 }
1917 if (page == NULL || !PageUptodate(page)) {
1918 ret = -EIO;
1919 goto err;
1920 }
1921 mark_page_accessed(page);
1922err:
1923 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1924 if (bitmap_page)
1925 page_cache_release(bitmap_page);
1926 if (page)
1927 page_cache_release(page);
1928 return ret;
1929}
1930
1931static noinline_for_stack int 1954static noinline_for_stack int
1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1955ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1933{ 1956{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1938 struct ext4_sb_info *sbi; 1961 struct ext4_sb_info *sbi;
1939 struct super_block *sb; 1962 struct super_block *sb;
1940 struct ext4_buddy e4b; 1963 struct ext4_buddy e4b;
1941 loff_t size, isize;
1942 1964
1943 sb = ac->ac_sb; 1965 sb = ac->ac_sb;
1944 sbi = EXT4_SB(sb); 1966 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb); 1967 ngroups = ext4_get_groups_count(sb);
1968 /* non-extent files are limited to low blocks/groups */
1969 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
1970 ngroups = sbi->s_blockfile_groups;
1971
1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1972 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1947 1973
1948 /* first, try the goal */ 1974 /* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1974 } 2000 }
1975 2001
1976 bsbits = ac->ac_sb->s_blocksize_bits; 2002 bsbits = ac->ac_sb->s_blocksize_bits;
1977 /* if stream allocation is enabled, use global goal */
1978 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1979 isize = i_size_read(ac->ac_inode) >> bsbits;
1980 if (size < isize)
1981 size = isize;
1982 2003
1983 if (size < sbi->s_mb_stream_request && 2004 /* if stream allocation is enabled, use global goal */
1984 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 2005 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1985 /* TBD: may be hot point */ 2006 /* TBD: may be hot point */
1986 spin_lock(&sbi->s_md_lock); 2007 spin_lock(&sbi->s_md_lock);
1987 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2008 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1988 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2009 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1989 spin_unlock(&sbi->s_md_lock); 2010 spin_unlock(&sbi->s_md_lock);
1990 } 2011 }
2012
1991 /* Let's just scan groups to find more-less suitable blocks */ 2013 /* Let's just scan groups to find more-less suitable blocks */
1992 cr = ac->ac_2order ? 0 : 1; 2014 cr = ac->ac_2order ? 0 : 1;
1993 /* 2015 /*
@@ -2015,27 +2037,6 @@ repeat:
2015 if (grp->bb_free == 0) 2037 if (grp->bb_free == 0)
2016 continue; 2038 continue;
2017 2039
2018 /*
2019 * if the group is already init we check whether it is
2020 * a good group and if not we don't load the buddy
2021 */
2022 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2023 /*
2024 * we need full data about the group
2025 * to make a good selection
2026 */
2027 err = ext4_mb_init_group(sb, group);
2028 if (err)
2029 goto out;
2030 }
2031
2032 /*
2033 * If the particular group doesn't satisfy our
2034 * criteria we continue with the next group
2035 */
2036 if (!ext4_mb_good_group(ac, group, cr))
2037 continue;
2038
2039 err = ext4_mb_load_buddy(sb, group, &e4b); 2040 err = ext4_mb_load_buddy(sb, group, &e4b);
2040 if (err) 2041 if (err)
2041 goto out; 2042 goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2156 2157
2157 if (v == SEQ_START_TOKEN) { 2158 if (v == SEQ_START_TOKEN) {
2158 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2159 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2160 "pid", "inode", "original", "goal", "result", "found", 2161 "pid", "inode", "original", "goal", "result", "found",
2161 "grps", "cr", "flags", "merge", "tail", "broken"); 2162 "grps", "cr", "flags", "merge", "tail", "broken");
2162 return 0; 2163 return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2164 2165
2165 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2166 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2167 "%-5u %-5s %-5u %-6u\n"; 2168 "0x%04x %-5s %-5u %-6u\n";
2168 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2169 hs->result.fe_start, hs->result.fe_len, 2170 hs->result.fe_start, hs->result.fe_len,
2170 hs->result.fe_logical); 2171 hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2205{ 2206{
2206} 2207}
2207 2208
2208static struct seq_operations ext4_mb_seq_history_ops = { 2209static const struct seq_operations ext4_mb_seq_history_ops = {
2209 .start = ext4_mb_seq_history_start, 2210 .start = ext4_mb_seq_history_start,
2210 .next = ext4_mb_seq_history_next, 2211 .next = ext4_mb_seq_history_next,
2211 .stop = ext4_mb_seq_history_stop, 2212 .stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
2287 return count; 2288 return count;
2288} 2289}
2289 2290
2290static struct file_operations ext4_mb_seq_history_fops = { 2291static const struct file_operations ext4_mb_seq_history_fops = {
2291 .owner = THIS_MODULE, 2292 .owner = THIS_MODULE,
2292 .open = ext4_mb_seq_history_open, 2293 .open = ext4_mb_seq_history_open,
2293 .read = seq_read, 2294 .read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2328 struct ext4_buddy e4b; 2329 struct ext4_buddy e4b;
2329 struct sg { 2330 struct sg {
2330 struct ext4_group_info info; 2331 struct ext4_group_info info;
2331 unsigned short counters[16]; 2332 ext4_grpblk_t counters[16];
2332 } sg; 2333 } sg;
2333 2334
2334 group--; 2335 group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2366{ 2367{
2367} 2368}
2368 2369
2369static struct seq_operations ext4_mb_seq_groups_ops = { 2370static const struct seq_operations ext4_mb_seq_groups_ops = {
2370 .start = ext4_mb_seq_groups_start, 2371 .start = ext4_mb_seq_groups_start,
2371 .next = ext4_mb_seq_groups_next, 2372 .next = ext4_mb_seq_groups_next,
2372 .stop = ext4_mb_seq_groups_stop, 2373 .stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2387 2388
2388} 2389}
2389 2390
2390static struct file_operations ext4_mb_seq_groups_fops = { 2391static const struct file_operations ext4_mb_seq_groups_fops = {
2391 .owner = THIS_MODULE, 2392 .owner = THIS_MODULE,
2392 .open = ext4_mb_seq_groups_open, 2393 .open = ext4_mb_seq_groups_open,
2393 .read = seq_read, 2394 .read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2532 2533
2533 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2534 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2534 init_rwsem(&meta_group_info[i]->alloc_sem); 2535 init_rwsem(&meta_group_info[i]->alloc_sem);
2535 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2536 meta_group_info[i]->bb_free_root.rb_node = NULL;
2536 2537
2537#ifdef DOUBLE_CHECK 2538#ifdef DOUBLE_CHECK
2538 { 2539 {
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
2558 return -ENOMEM; 2559 return -ENOMEM;
2559} /* ext4_mb_add_groupinfo */ 2560} /* ext4_mb_add_groupinfo */
2560 2561
2561/*
2562 * Update an existing group.
2563 * This function is used for online resize
2564 */
2565void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2566{
2567 grp->bb_free += add;
2568}
2569
2570static int ext4_mb_init_backend(struct super_block *sb) 2562static int ext4_mb_init_backend(struct super_block *sb)
2571{ 2563{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb); 2564 ext4_group_t ngroups = ext4_get_groups_count(sb);
2573 ext4_group_t i; 2565 ext4_group_t i;
2574 int metalen;
2575 struct ext4_sb_info *sbi = EXT4_SB(sb); 2566 struct ext4_sb_info *sbi = EXT4_SB(sb);
2576 struct ext4_super_block *es = sbi->s_es; 2567 struct ext4_super_block *es = sbi->s_es;
2577 int num_meta_group_infos; 2568 int num_meta_group_infos;
2578 int num_meta_group_infos_max; 2569 int num_meta_group_infos_max;
2579 int array_size; 2570 int array_size;
2580 struct ext4_group_info **meta_group_info;
2581 struct ext4_group_desc *desc; 2571 struct ext4_group_desc *desc;
2582 2572
2583 /* This is the number of blocks used by GDT */ 2573 /* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2622 goto err_freesgi; 2612 goto err_freesgi;
2623 } 2613 }
2624 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2614 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2625
2626 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2627 for (i = 0; i < num_meta_group_infos; i++) {
2628 if ((i + 1) == num_meta_group_infos)
2629 metalen = sizeof(*meta_group_info) *
2630 (ngroups -
2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2633 if (meta_group_info == NULL) {
2634 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2635 "buddy group\n");
2636 goto err_freemeta;
2637 }
2638 sbi->s_group_info[i] = meta_group_info;
2639 }
2640
2641 for (i = 0; i < ngroups; i++) { 2615 for (i = 0; i < ngroups; i++) {
2642 desc = ext4_get_group_desc(sb, i, NULL); 2616 desc = ext4_get_group_desc(sb, i, NULL);
2643 if (desc == NULL) { 2617 if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
2655 while (i-- > 0) 2629 while (i-- > 0)
2656 kfree(ext4_get_group_info(sb, i)); 2630 kfree(ext4_get_group_info(sb, i));
2657 i = num_meta_group_infos; 2631 i = num_meta_group_infos;
2658err_freemeta:
2659 while (i-- > 0) 2632 while (i-- > 0)
2660 kfree(sbi->s_group_info[i]); 2633 kfree(sbi->s_group_info[i]);
2661 iput(sbi->s_buddy_cache); 2634 iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2672 unsigned max; 2645 unsigned max;
2673 int ret; 2646 int ret;
2674 2647
2675 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2648 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2676 2649
2677 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2650 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2678 if (sbi->s_mb_offsets == NULL) { 2651 if (sbi->s_mb_offsets == NULL) {
2679 return -ENOMEM; 2652 return -ENOMEM;
2680 } 2653 }
2681 2654
2682 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2655 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2683 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2656 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2684 if (sbi->s_mb_maxs == NULL) { 2657 if (sbi->s_mb_maxs == NULL) {
2685 kfree(sbi->s_mb_offsets); 2658 kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2758 kmem_cache_free(ext4_pspace_cachep, pa); 2731 kmem_cache_free(ext4_pspace_cachep, pa);
2759 } 2732 }
2760 if (count) 2733 if (count)
2761 mb_debug("mballoc: %u PAs left\n", count); 2734 mb_debug(1, "mballoc: %u PAs left\n", count);
2762 2735
2763} 2736}
2764 2737
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2839 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2812 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2840 entry = list_entry(l, struct ext4_free_data, list); 2813 entry = list_entry(l, struct ext4_free_data, list);
2841 2814
2842 mb_debug("gonna free %u blocks in group %u (0x%p):", 2815 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2843 entry->count, entry->group, entry); 2816 entry->count, entry->group, entry);
2844 2817
2845 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2818 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2874 ext4_mb_release_desc(&e4b); 2847 ext4_mb_release_desc(&e4b);
2875 } 2848 }
2876 2849
2877 mb_debug("freed %u blocks in %u structures\n", count, count2); 2850 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2851}
2852
2853#ifdef CONFIG_EXT4_DEBUG
2854u8 mb_enable_debug __read_mostly;
2855
2856static struct dentry *debugfs_dir;
2857static struct dentry *debugfs_debug;
2858
2859static void __init ext4_create_debugfs_entry(void)
2860{
2861 debugfs_dir = debugfs_create_dir("ext4", NULL);
2862 if (debugfs_dir)
2863 debugfs_debug = debugfs_create_u8("mballoc-debug",
2864 S_IRUGO | S_IWUSR,
2865 debugfs_dir,
2866 &mb_enable_debug);
2867}
2868
2869static void ext4_remove_debugfs_entry(void)
2870{
2871 debugfs_remove(debugfs_debug);
2872 debugfs_remove(debugfs_dir);
2878} 2873}
2879 2874
2875#else
2876
2877static void __init ext4_create_debugfs_entry(void)
2878{
2879}
2880
2881static void ext4_remove_debugfs_entry(void)
2882{
2883}
2884
2885#endif
2886
2880int __init init_ext4_mballoc(void) 2887int __init init_ext4_mballoc(void)
2881{ 2888{
2882 ext4_pspace_cachep = 2889 ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
2904 kmem_cache_destroy(ext4_ac_cachep); 2911 kmem_cache_destroy(ext4_ac_cachep);
2905 return -ENOMEM; 2912 return -ENOMEM;
2906 } 2913 }
2914 ext4_create_debugfs_entry();
2907 return 0; 2915 return 0;
2908} 2916}
2909 2917
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
2917 kmem_cache_destroy(ext4_pspace_cachep); 2925 kmem_cache_destroy(ext4_pspace_cachep);
2918 kmem_cache_destroy(ext4_ac_cachep); 2926 kmem_cache_destroy(ext4_ac_cachep);
2919 kmem_cache_destroy(ext4_free_ext_cachep); 2927 kmem_cache_destroy(ext4_free_ext_cachep);
2928 ext4_remove_debugfs_entry();
2920} 2929}
2921 2930
2922 2931
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3061 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 3070 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3062 else 3071 else
3063 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3072 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3064 mb_debug("#%u: goal %u blocks for locality group\n", 3073 mb_debug(1, "#%u: goal %u blocks for locality group\n",
3065 current->pid, ac->ac_g_ex.fe_len); 3074 current->pid, ac->ac_g_ex.fe_len);
3066} 3075}
3067 3076
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3180 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3189 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3181 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3190 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3182 3191
3183 /* skip PA normalized request doesn't overlap with */ 3192 /* skip PAs this normalized request doesn't overlap with */
3184 if (pa->pa_lstart >= end) { 3193 if (pa->pa_lstart >= end || pa_end <= start) {
3185 spin_unlock(&pa->pa_lock);
3186 continue;
3187 }
3188 if (pa_end <= start) {
3189 spin_unlock(&pa->pa_lock); 3194 spin_unlock(&pa->pa_lock);
3190 continue; 3195 continue;
3191 } 3196 }
3192 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3197 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3193 3198
3199 /* adjust start or end to be adjacent to this pa */
3194 if (pa_end <= ac->ac_o_ex.fe_logical) { 3200 if (pa_end <= ac->ac_o_ex.fe_logical) {
3195 BUG_ON(pa_end < start); 3201 BUG_ON(pa_end < start);
3196 start = pa_end; 3202 start = pa_end;
3197 } 3203 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3198
3199 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3200 BUG_ON(pa->pa_lstart > end); 3204 BUG_ON(pa->pa_lstart > end);
3201 end = pa->pa_lstart; 3205 end = pa->pa_lstart;
3202 } 3206 }
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3251 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3255 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3252 } 3256 }
3253 3257
3254 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3258 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3255 (unsigned) orig_size, (unsigned) start); 3259 (unsigned) orig_size, (unsigned) start);
3256} 3260}
3257 3261
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3300 BUG_ON(pa->pa_free < len); 3304 BUG_ON(pa->pa_free < len);
3301 pa->pa_free -= len; 3305 pa->pa_free -= len;
3302 3306
3303 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3307 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3304} 3308}
3305 3309
3306/* 3310/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3324 * in on-disk bitmap -- see ext4_mb_release_context() 3328 * in on-disk bitmap -- see ext4_mb_release_context()
3325 * Other CPUs are prevented from allocating from this pa by lg_mutex 3329 * Other CPUs are prevented from allocating from this pa by lg_mutex
3326 */ 3330 */
3327 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3331 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3328} 3332}
3329 3333
3330/* 3334/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3382 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3386 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3383 continue; 3387 continue;
3384 3388
3389 /* non-extent files can't have physical blocks past 2^32 */
3390 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
3391 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3392 continue;
3393
3385 /* found preallocated blocks, use them */ 3394 /* found preallocated blocks, use them */
3386 spin_lock(&pa->pa_lock); 3395 spin_lock(&pa->pa_lock);
3387 if (pa->pa_deleted == 0 && pa->pa_free) { 3396 if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3503 preallocated += len; 3512 preallocated += len;
3504 count++; 3513 count++;
3505 } 3514 }
3506 mb_debug("prellocated %u for group %u\n", preallocated, group); 3515 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3507} 3516}
3508 3517
3509static void ext4_mb_pa_callback(struct rcu_head *head) 3518static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3638 pa->pa_deleted = 0; 3647 pa->pa_deleted = 0;
3639 pa->pa_type = MB_INODE_PA; 3648 pa->pa_type = MB_INODE_PA;
3640 3649
3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3650 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3651 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3643 trace_ext4_mb_new_inode_pa(ac, pa); 3652 trace_ext4_mb_new_inode_pa(ac, pa);
3644 3653
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3698 pa->pa_deleted = 0; 3707 pa->pa_deleted = 0;
3699 pa->pa_type = MB_GROUP_PA; 3708 pa->pa_type = MB_GROUP_PA;
3700 3709
3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3710 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3711 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3703 trace_ext4_mb_new_group_pa(ac, pa); 3712 trace_ext4_mb_new_group_pa(ac, pa);
3704 3713
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3777 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3786 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3778 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3787 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3779 le32_to_cpu(sbi->s_es->s_first_data_block); 3788 le32_to_cpu(sbi->s_es->s_first_data_block);
3780 mb_debug(" free preallocated %u/%u in group %u\n", 3789 mb_debug(1, " free preallocated %u/%u in group %u\n",
3781 (unsigned) start, (unsigned) next - bit, 3790 (unsigned) start, (unsigned) next - bit,
3782 (unsigned) group); 3791 (unsigned) group);
3783 free += next - bit; 3792 free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3868 int busy = 0; 3877 int busy = 0;
3869 int free = 0; 3878 int free = 0;
3870 3879
3871 mb_debug("discard preallocation for group %u\n", group); 3880 mb_debug(1, "discard preallocation for group %u\n", group);
3872 3881
3873 if (list_empty(&grp->bb_prealloc_list)) 3882 if (list_empty(&grp->bb_prealloc_list))
3874 return 0; 3883 return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
3992 return; 4001 return;
3993 } 4002 }
3994 4003
3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4004 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3996 trace_ext4_discard_preallocations(inode); 4005 trace_ext4_discard_preallocations(inode);
3997 4006
3998 INIT_LIST_HEAD(&list); 4007 INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4097{ 4106{
4098 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 4107 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4099} 4108}
4100#ifdef MB_DEBUG 4109#ifdef CONFIG_EXT4_DEBUG
4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4110static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4102{ 4111{
4103 struct super_block *sb = ac->ac_sb; 4112 struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4139 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4148 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4140 NULL, &start); 4149 NULL, &start);
4141 spin_unlock(&pa->pa_lock); 4150 spin_unlock(&pa->pa_lock);
4142 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 4151 printk(KERN_ERR "PA:%u:%d:%u \n", i,
4143 start, pa->pa_len); 4152 start, pa->pa_len);
4144 } 4153 }
4145 ext4_unlock_group(sb, i); 4154 ext4_unlock_group(sb, i);
4146 4155
4147 if (grp->bb_free == 0) 4156 if (grp->bb_free == 0)
4148 continue; 4157 continue;
4149 printk(KERN_ERR "%lu: %d/%d \n", 4158 printk(KERN_ERR "%u: %d/%d \n",
4150 i, grp->bb_free, grp->bb_fragments); 4159 i, grp->bb_free, grp->bb_fragments);
4151 } 4160 }
4152 printk(KERN_ERR "\n"); 4161 printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4174 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4183 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4175 return; 4184 return;
4176 4185
4186 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4187 return;
4188
4177 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4178 isize = i_size_read(ac->ac_inode) >> bsbits; 4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits;
4179 size = max(size, isize); 4192 size = max(size, isize);
4180 4193
4181 /* don't use group allocation for large files */ 4194 if ((size == isize) &&
4182 if (size >= sbi->s_mb_stream_request) 4195 !ext4_fs_is_busy(sbi) &&
4196 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4197 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183 return; 4198 return;
4199 }
4184 4200
4185 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4201 /* don't use group allocation for large files */
4202 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4186 return; 4204 return;
4205 }
4187 4206
4188 BUG_ON(ac->ac_lg != NULL); 4207 BUG_ON(ac->ac_lg != NULL);
4189 /* 4208 /*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 * locality group. this is a policy, actually */ 4265 * locality group. this is a policy, actually */
4247 ext4_mb_group_or_file(ac); 4266 ext4_mb_group_or_file(ac);
4248 4267
4249 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4268 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4250 "left: %u/%u, right %u/%u to %swritable\n", 4269 "left: %u/%u, right %u/%u to %swritable\n",
4251 (unsigned) ar->len, (unsigned) ar->logical, 4270 (unsigned) ar->len, (unsigned) ar->logical,
4252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4271 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268 struct ext4_prealloc_space *pa, *tmp; 4287 struct ext4_prealloc_space *pa, *tmp;
4269 struct ext4_allocation_context *ac; 4288 struct ext4_allocation_context *ac;
4270 4289
4271 mb_debug("discard locality group preallocation\n"); 4290 mb_debug(1, "discard locality group preallocation\n");
4272 4291
4273 INIT_LIST_HEAD(&discard_list); 4292 INIT_LIST_HEAD(&discard_list);
4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4293 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
37 37
38/* 38/*
39 */ 39 */
40#define MB_DEBUG__ 40#ifdef CONFIG_EXT4_DEBUG
41#ifdef MB_DEBUG 41extern u8 mb_enable_debug;
42#define mb_debug(fmt, a...) printk(fmt, ##a) 42
43#define mb_debug(n, fmt, a...) \
44 do { \
45 if ((n) <= mb_enable_debug) { \
46 printk(KERN_DEBUG "(%s, %d): %s: ", \
47 __FILE__, __LINE__, __func__); \
48 printk(fmt, ## a); \
49 } \
50 } while (0)
43#else 51#else
44#define mb_debug(fmt, a...) 52#define mb_debug(n, fmt, a...)
45#endif 53#endif
46 54
47/* 55/*
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
128 unsigned pa_deleted; 136 unsigned pa_deleted;
129 ext4_fsblk_t pa_pstart; /* phys. block */ 137 ext4_fsblk_t pa_pstart; /* phys. block */
130 ext4_lblk_t pa_lstart; /* log. block */ 138 ext4_lblk_t pa_lstart; /* log. block */
131 unsigned short pa_len; /* len of preallocated chunk */ 139 ext4_grpblk_t pa_len; /* len of preallocated chunk */
132 unsigned short pa_free; /* how many blocks are free */ 140 ext4_grpblk_t pa_free; /* how many blocks are free */
133 unsigned short pa_type; /* pa type. inode or group */ 141 unsigned short pa_type; /* pa type. inode or group */
134 spinlock_t *pa_obj_lock; 142 spinlock_t *pa_obj_lock;
135 struct inode *pa_inode; /* hack, for history only */ 143 struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
144 ext4_lblk_t fe_logical; 152 ext4_lblk_t fe_logical;
145 ext4_grpblk_t fe_start; 153 ext4_grpblk_t fe_start;
146 ext4_group_t fe_group; 154 ext4_group_t fe_group;
147 int fe_len; 155 ext4_grpblk_t fe_len;
148}; 156};
149 157
150/* 158/*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
353 353
354 down_write(&EXT4_I(inode)->i_data_sem); 354 down_write(&EXT4_I(inode)->i_data_sem);
355 /* 355 /*
356 * if EXT4_EXT_MIGRATE is cleared a block allocation 356 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
357 * happened after we started the migrate. We need to 357 * happened after we started the migrate. We need to
358 * fail the migrate 358 * fail the migrate
359 */ 359 */
360 if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { 360 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
361 retval = -EAGAIN; 361 retval = -EAGAIN;
362 up_write(&EXT4_I(inode)->i_data_sem); 362 up_write(&EXT4_I(inode)->i_data_sem);
363 goto err_out; 363 goto err_out;
364 } else 364 } else
365 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 365 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
366 ~EXT4_EXT_MIGRATE;
367 /* 366 /*
368 * We have the extent map build with the tmp inode. 367 * We have the extent map build with the tmp inode.
369 * Now copy the i_data across 368 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
517 * when we add extents we extent the journal 516 * when we add extents we extent the journal
518 */ 517 */
519 /* 518 /*
520 * Even though we take i_mutex we can still cause block allocation 519 * Even though we take i_mutex we can still cause block
521 * via mmap write to holes. If we have allocated new blocks we fail 520 * allocation via mmap write to holes. If we have allocated
522 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 521 * new blocks we fail migrate. New block allocation will
523 * The flag is updated with i_data_sem held to prevent racing with 522 * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
524 * block allocation. 523 * with i_data_sem held to prevent racing with block
524 * allocation.
525 */ 525 */
526 down_read((&EXT4_I(inode)->i_data_sem)); 526 down_read((&EXT4_I(inode)->i_data_sem));
527 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; 527 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
528 up_read((&EXT4_I(inode)->i_data_sem)); 528 up_read((&EXT4_I(inode)->i_data_sem));
529 529
530 handle = ext4_journal_start(inode, 1); 530 handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
618 tmp_inode->i_nlink = 0; 618 tmp_inode->i_nlink = 0;
619 619
620 ext4_journal_stop(handle); 620 ext4_journal_stop(handle);
621 621 unlock_new_inode(tmp_inode);
622 iput(tmp_inode); 622 iput(tmp_inode);
623 623
624 return retval; 624 return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
19#include "ext4_extents.h" 19#include "ext4_extents.h"
20#include "ext4.h" 20#include "ext4.h"
21 21
22#define get_ext_path(path, inode, block, ret) \ 22/**
23 do { \ 23 * get_ext_path - Find an extent path for designated logical block number.
24 path = ext4_ext_find_extent(inode, block, path); \ 24 *
25 if (IS_ERR(path)) { \ 25 * @inode: an inode which is searched
26 ret = PTR_ERR(path); \ 26 * @lblock: logical block number to find an extent path
27 path = NULL; \ 27 * @path: pointer to an extent path pointer (for output)
28 } \ 28 *
29 } while (0) 29 * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
30 * on failure.
31 */
32static inline int
33get_ext_path(struct inode *inode, ext4_lblk_t lblock,
34 struct ext4_ext_path **path)
35{
36 int ret = 0;
37
38 *path = ext4_ext_find_extent(inode, lblock, *path);
39 if (IS_ERR(*path)) {
40 ret = PTR_ERR(*path);
41 *path = NULL;
42 } else if ((*path)[ext_depth(inode)].p_ext == NULL)
43 ret = -ENODATA;
44
45 return ret;
46}
30 47
31/** 48/**
32 * copy_extent_status - Copy the extent's initialization status 49 * copy_extent_status - Copy the extent's initialization status
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
113} 130}
114 131
115/** 132/**
133 * mext_check_null_inode - NULL check for two inodes
134 *
135 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
136 */
137static int
138mext_check_null_inode(struct inode *inode1, struct inode *inode2,
139 const char *function)
140{
141 int ret = 0;
142
143 if (inode1 == NULL) {
144 ext4_error(inode2->i_sb, function,
145 "Both inodes should not be NULL: "
146 "inode1 NULL inode2 %lu", inode2->i_ino);
147 ret = -EIO;
148 } else if (inode2 == NULL) {
149 ext4_error(inode1->i_sb, function,
150 "Both inodes should not be NULL: "
151 "inode1 %lu inode2 NULL", inode1->i_ino);
152 ret = -EIO;
153 }
154 return ret;
155}
156
157/**
116 * mext_double_down_read - Acquire two inodes' read semaphore 158 * mext_double_down_read - Acquire two inodes' read semaphore
117 * 159 *
118 * @orig_inode: original inode structure 160 * @orig_inode: original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
124{ 166{
125 struct inode *first = orig_inode, *second = donor_inode; 167 struct inode *first = orig_inode, *second = donor_inode;
126 168
127 BUG_ON(orig_inode == NULL || donor_inode == NULL);
128
129 /* 169 /*
130 * Use the inode number to provide the stable locking order instead 170 * Use the inode number to provide the stable locking order instead
131 * of its address, because the C language doesn't guarantee you can 171 * of its address, because the C language doesn't guarantee you can
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
152{ 192{
153 struct inode *first = orig_inode, *second = donor_inode; 193 struct inode *first = orig_inode, *second = donor_inode;
154 194
155 BUG_ON(orig_inode == NULL || donor_inode == NULL);
156
157 /* 195 /*
158 * Use the inode number to provide the stable locking order instead 196 * Use the inode number to provide the stable locking order instead
159 * of its address, because the C language doesn't guarantee you can 197 * of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
178static void 216static void
179mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
180{ 218{
181 BUG_ON(orig_inode == NULL || donor_inode == NULL);
182
183 up_read(&EXT4_I(orig_inode)->i_data_sem); 219 up_read(&EXT4_I(orig_inode)->i_data_sem);
184 up_read(&EXT4_I(donor_inode)->i_data_sem); 220 up_read(&EXT4_I(donor_inode)->i_data_sem);
185} 221}
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
194static void 230static void
195mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
196{ 232{
197 BUG_ON(orig_inode == NULL || donor_inode == NULL);
198
199 up_write(&EXT4_I(orig_inode)->i_data_sem); 233 up_write(&EXT4_I(orig_inode)->i_data_sem);
200 up_write(&EXT4_I(donor_inode)->i_data_sem); 234 up_write(&EXT4_I(donor_inode)->i_data_sem);
201} 235}
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
283 } 317 }
284 318
285 if (new_flag) { 319 if (new_flag) {
286 get_ext_path(orig_path, orig_inode, eblock, err); 320 err = get_ext_path(orig_inode, eblock, &orig_path);
287 if (orig_path == NULL) 321 if (err)
288 goto out; 322 goto out;
289 323
290 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
293 } 327 }
294 328
295 if (end_flag) { 329 if (end_flag) {
296 get_ext_path(orig_path, orig_inode, 330 err = get_ext_path(orig_inode,
297 le32_to_cpu(end_ext->ee_block) - 1, err); 331 le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
298 if (orig_path == NULL) 332 if (err)
299 goto out; 333 goto out;
300 334
301 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
519 * oext |-----------| 553 * oext |-----------|
520 * new_ext |-------| 554 * new_ext |-------|
521 */ 555 */
522 BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); 556 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
557 ext4_error(orig_inode->i_sb, __func__,
558 "new_ext_end(%u) should be less than or equal to "
559 "oext->ee_block(%u) + oext_alen(%d) - 1",
560 new_ext_end, le32_to_cpu(oext->ee_block),
561 oext_alen);
562 ret = -EIO;
563 goto out;
564 }
523 565
524 /* 566 /*
525 * Case: new_ext is smaller than original extent 567 * Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
543 585
544 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, 586 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
545 o_end, &start_ext, &new_ext, &end_ext); 587 o_end, &start_ext, &new_ext, &end_ext);
588out:
546 return ret; 589 return ret;
547} 590}
548 591
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
554 * @orig_off: block offset of original inode 597 * @orig_off: block offset of original inode
555 * @donor_off: block offset of donor inode 598 * @donor_off: block offset of donor inode
556 * @max_count: the maximun length of extents 599 * @max_count: the maximun length of extents
600 *
601 * Return 0 on success, or a negative error value on failure.
557 */ 602 */
558static void 603static int
559mext_calc_swap_extents(struct ext4_extent *tmp_dext, 604mext_calc_swap_extents(struct ext4_extent *tmp_dext,
560 struct ext4_extent *tmp_oext, 605 struct ext4_extent *tmp_oext,
561 ext4_lblk_t orig_off, ext4_lblk_t donor_off, 606 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
564 ext4_lblk_t diff, orig_diff; 609 ext4_lblk_t diff, orig_diff;
565 struct ext4_extent dext_old, oext_old; 610 struct ext4_extent dext_old, oext_old;
566 611
612 BUG_ON(orig_off != donor_off);
613
614 /* original and donor extents have to cover the same block offset */
615 if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
616 le32_to_cpu(tmp_oext->ee_block) +
617 ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
618 return -ENODATA;
619
620 if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
621 le32_to_cpu(tmp_dext->ee_block) +
622 ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
623 return -ENODATA;
624
567 dext_old = *tmp_dext; 625 dext_old = *tmp_dext;
568 oext_old = *tmp_oext; 626 oext_old = *tmp_oext;
569 627
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
591 649
592 copy_extent_status(&oext_old, tmp_dext); 650 copy_extent_status(&oext_old, tmp_dext);
593 copy_extent_status(&dext_old, tmp_oext); 651 copy_extent_status(&dext_old, tmp_oext);
652
653 return 0;
594} 654}
595 655
596/** 656/**
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
631 mext_double_down_write(orig_inode, donor_inode); 691 mext_double_down_write(orig_inode, donor_inode);
632 692
633 /* Get the original extent for the block "orig_off" */ 693 /* Get the original extent for the block "orig_off" */
634 get_ext_path(orig_path, orig_inode, orig_off, err); 694 err = get_ext_path(orig_inode, orig_off, &orig_path);
635 if (orig_path == NULL) 695 if (err)
636 goto out; 696 goto out;
637 697
638 /* Get the donor extent for the head */ 698 /* Get the donor extent for the head */
639 get_ext_path(donor_path, donor_inode, donor_off, err); 699 err = get_ext_path(donor_inode, donor_off, &donor_path);
640 if (donor_path == NULL) 700 if (err)
641 goto out; 701 goto out;
642 depth = ext_depth(orig_inode); 702 depth = ext_depth(orig_inode);
643 oext = orig_path[depth].p_ext; 703 oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
647 dext = donor_path[depth].p_ext; 707 dext = donor_path[depth].p_ext;
648 tmp_dext = *dext; 708 tmp_dext = *dext;
649 709
650 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 710 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
651 donor_off, count); 711 donor_off, count);
712 if (err)
713 goto out;
652 714
653 /* Loop for the donor extents */ 715 /* Loop for the donor extents */
654 while (1) { 716 while (1) {
655 /* The extent for donor must be found. */ 717 /* The extent for donor must be found. */
656 BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); 718 if (!dext) {
719 ext4_error(donor_inode->i_sb, __func__,
720 "The extent for donor must be found");
721 err = -EIO;
722 goto out;
723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
724 ext4_error(donor_inode->i_sb, __func__,
725 "Donor offset(%u) and the first block of donor "
726 "extent(%u) should be equal",
727 donor_off,
728 le32_to_cpu(tmp_dext.ee_block));
729 err = -EIO;
730 goto out;
731 }
657 732
658 /* Set donor extent to orig extent */ 733 /* Set donor extent to orig extent */
659 err = mext_leaf_block(handle, orig_inode, 734 err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
678 753
679 if (orig_path) 754 if (orig_path)
680 ext4_ext_drop_refs(orig_path); 755 ext4_ext_drop_refs(orig_path);
681 get_ext_path(orig_path, orig_inode, orig_off, err); 756 err = get_ext_path(orig_inode, orig_off, &orig_path);
682 if (orig_path == NULL) 757 if (err)
683 goto out; 758 goto out;
684 depth = ext_depth(orig_inode); 759 depth = ext_depth(orig_inode);
685 oext = orig_path[depth].p_ext; 760 oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 767
693 if (donor_path) 768 if (donor_path)
694 ext4_ext_drop_refs(donor_path); 769 ext4_ext_drop_refs(donor_path);
695 get_ext_path(donor_path, donor_inode, 770 err = get_ext_path(donor_inode, donor_off, &donor_path);
696 donor_off, err); 771 if (err)
697 if (donor_path == NULL)
698 goto out; 772 goto out;
699 depth = ext_depth(donor_inode); 773 depth = ext_depth(donor_inode);
700 dext = donor_path[depth].p_ext; 774 dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
705 } 779 }
706 tmp_dext = *dext; 780 tmp_dext = *dext;
707 781
708 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 782 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
709 donor_off, 783 donor_off, count - replaced_count);
710 count - replaced_count); 784 if (err)
785 goto out;
711 } 786 }
712 787
713out: 788out:
@@ -740,7 +815,7 @@ out:
740 * on success, or a negative error value on failure. 815 * on success, or a negative error value on failure.
741 */ 816 */
742static int 817static int
743move_extent_par_page(struct file *o_filp, struct inode *donor_inode, 818move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
744 pgoff_t orig_page_offset, int data_offset_in_page, 819 pgoff_t orig_page_offset, int data_offset_in_page,
745 int block_len_in_page, int uninit) 820 int block_len_in_page, int uninit)
746{ 821{
@@ -871,6 +946,7 @@ out:
871 if (PageLocked(page)) 946 if (PageLocked(page))
872 unlock_page(page); 947 unlock_page(page);
873 page_cache_release(page); 948 page_cache_release(page);
949 ext4_journal_stop(handle);
874 } 950 }
875out2: 951out2:
876 ext4_journal_stop(handle); 952 ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
897 struct inode *donor_inode, __u64 orig_start, 973 struct inode *donor_inode, __u64 orig_start,
898 __u64 donor_start, __u64 *len, __u64 moved_len) 974 __u64 donor_start, __u64 *len, __u64 moved_len)
899{ 975{
976 ext4_lblk_t orig_blocks, donor_blocks;
977 unsigned int blkbits = orig_inode->i_blkbits;
978 unsigned int blocksize = 1 << blkbits;
979
900 /* Regular file check */ 980 /* Regular file check */
901 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 981 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
902 ext4_debug("ext4 move extent: The argument files should be " 982 ext4_debug("ext4 move extent: The argument files should be "
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode,
960 return -EINVAL; 1040 return -EINVAL;
961 } 1041 }
962 1042
963 if ((orig_start > MAX_DEFRAG_SIZE) || 1043 if ((orig_start > EXT_MAX_BLOCK) ||
964 (donor_start > MAX_DEFRAG_SIZE) || 1044 (donor_start > EXT_MAX_BLOCK) ||
965 (*len > MAX_DEFRAG_SIZE) || 1045 (*len > EXT_MAX_BLOCK) ||
966 (orig_start + *len > MAX_DEFRAG_SIZE)) { 1046 (orig_start + *len > EXT_MAX_BLOCK)) {
967 ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " 1047 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
968 "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, 1048 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
969 orig_inode->i_ino, donor_inode->i_ino); 1049 orig_inode->i_ino, donor_inode->i_ino);
970 return -EINVAL; 1050 return -EINVAL;
971 } 1051 }
972 1052
973 if (orig_inode->i_size > donor_inode->i_size) { 1053 if (orig_inode->i_size > donor_inode->i_size) {
974 if (orig_start >= donor_inode->i_size) { 1054 donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
1055 /* TODO: eliminate this artificial restriction */
1056 if (orig_start >= donor_blocks) {
975 ext4_debug("ext4 move extent: orig start offset " 1057 ext4_debug("ext4 move extent: orig start offset "
976 "[%llu] should be less than donor file size " 1058 "[%llu] should be less than donor file blocks "
977 "[%lld] [ino:orig %lu, donor_inode %lu]\n", 1059 "[%u] [ino:orig %lu, donor %lu]\n",
978 orig_start, donor_inode->i_size, 1060 orig_start, donor_blocks,
979 orig_inode->i_ino, donor_inode->i_ino); 1061 orig_inode->i_ino, donor_inode->i_ino);
980 return -EINVAL; 1062 return -EINVAL;
981 } 1063 }
982 1064
983 if (orig_start + *len > donor_inode->i_size) { 1065 /* TODO: eliminate this artificial restriction */
1066 if (orig_start + *len > donor_blocks) {
984 ext4_debug("ext4 move extent: End offset [%llu] should " 1067 ext4_debug("ext4 move extent: End offset [%llu] should "
985 "be less than donor file size [%lld]." 1068 "be less than donor file blocks [%u]."
986 "So adjust length from %llu to %lld " 1069 "So adjust length from %llu to %llu "
987 "[ino:orig %lu, donor %lu]\n", 1070 "[ino:orig %lu, donor %lu]\n",
988 orig_start + *len, donor_inode->i_size, 1071 orig_start + *len, donor_blocks,
989 *len, donor_inode->i_size - orig_start, 1072 *len, donor_blocks - orig_start,
990 orig_inode->i_ino, donor_inode->i_ino); 1073 orig_inode->i_ino, donor_inode->i_ino);
991 *len = donor_inode->i_size - orig_start; 1074 *len = donor_blocks - orig_start;
992 } 1075 }
993 } else { 1076 } else {
994 if (orig_start >= orig_inode->i_size) { 1077 orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
1078 if (orig_start >= orig_blocks) {
995 ext4_debug("ext4 move extent: start offset [%llu] " 1079 ext4_debug("ext4 move extent: start offset [%llu] "
996 "should be less than original file size " 1080 "should be less than original file blocks "
997 "[%lld] [inode:orig %lu, donor %lu]\n", 1081 "[%u] [ino:orig %lu, donor %lu]\n",
998 orig_start, orig_inode->i_size, 1082 orig_start, orig_blocks,
999 orig_inode->i_ino, donor_inode->i_ino); 1083 orig_inode->i_ino, donor_inode->i_ino);
1000 return -EINVAL; 1084 return -EINVAL;
1001 } 1085 }
1002 1086
1003 if (orig_start + *len > orig_inode->i_size) { 1087 if (orig_start + *len > orig_blocks) {
1004 ext4_debug("ext4 move extent: Adjust length " 1088 ext4_debug("ext4 move extent: Adjust length "
1005 "from %llu to %lld. Because it should be " 1089 "from %llu to %llu. Because it should be "
1006 "less than original file size " 1090 "less than original file blocks "
1007 "[ino:orig %lu, donor %lu]\n", 1091 "[ino:orig %lu, donor %lu]\n",
1008 *len, orig_inode->i_size - orig_start, 1092 *len, orig_blocks - orig_start,
1009 orig_inode->i_ino, donor_inode->i_ino); 1093 orig_inode->i_ino, donor_inode->i_ino);
1010 *len = orig_inode->i_size - orig_start; 1094 *len = orig_blocks - orig_start;
1011 } 1095 }
1012 } 1096 }
1013 1097
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode,
1027 * @inode1: the inode structure 1111 * @inode1: the inode structure
1028 * @inode2: the inode structure 1112 * @inode2: the inode structure
1029 * 1113 *
1030 * Lock two inodes' i_mutex by i_ino order. This function is moved from 1114 * Lock two inodes' i_mutex by i_ino order.
1031 * fs/inode.c. 1115 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1032 */ 1116 */
1033static void 1117static int
1034mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1118mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1035{ 1119{
1036 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { 1120 int ret = 0;
1037 if (inode1) 1121
1038 mutex_lock(&inode1->i_mutex); 1122 BUG_ON(inode1 == NULL && inode2 == NULL);
1039 else if (inode2) 1123
1040 mutex_lock(&inode2->i_mutex); 1124 ret = mext_check_null_inode(inode1, inode2, __func__);
1041 return; 1125 if (ret < 0)
1126 goto out;
1127
1128 if (inode1 == inode2) {
1129 mutex_lock(&inode1->i_mutex);
1130 goto out;
1042 } 1131 }
1043 1132
1044 if (inode1->i_ino < inode2->i_ino) { 1133 if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1048 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1137 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1049 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1138 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1050 } 1139 }
1140
1141out:
1142 return ret;
1051} 1143}
1052 1144
1053/** 1145/**
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1056 * @inode1: the inode that is released first 1148 * @inode1: the inode that is released first
1057 * @inode2: the inode that is released second 1149 * @inode2: the inode that is released second
1058 * 1150 *
1059 * This function is moved from fs/inode.c. 1151 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1060 */ 1152 */
1061 1153
1062static void 1154static int
1063mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1155mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1064{ 1156{
1157 int ret = 0;
1158
1159 BUG_ON(inode1 == NULL && inode2 == NULL);
1160
1161 ret = mext_check_null_inode(inode1, inode2, __func__);
1162 if (ret < 0)
1163 goto out;
1164
1065 if (inode1) 1165 if (inode1)
1066 mutex_unlock(&inode1->i_mutex); 1166 mutex_unlock(&inode1->i_mutex);
1067 1167
1068 if (inode2 && inode2 != inode1) 1168 if (inode2 && inode2 != inode1)
1069 mutex_unlock(&inode2->i_mutex); 1169 mutex_unlock(&inode2->i_mutex);
1170
1171out:
1172 return ret;
1070} 1173}
1071 1174
1072/** 1175/**
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1123 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 1226 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1124 ext4_lblk_t rest_blocks; 1227 ext4_lblk_t rest_blocks;
1125 pgoff_t orig_page_offset = 0, seq_end_page; 1228 pgoff_t orig_page_offset = 0, seq_end_page;
1126 int ret, depth, last_extent = 0; 1229 int ret1, ret2, depth, last_extent = 0;
1127 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 1230 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1128 int data_offset_in_page; 1231 int data_offset_in_page;
1129 int block_len_in_page; 1232 int block_len_in_page;
1130 int uninit; 1233 int uninit;
1131 1234
1132 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1133 mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0)
1238 return ret1;
1134 1239
1135 mext_double_down_read(orig_inode, donor_inode); 1240 mext_double_down_read(orig_inode, donor_inode);
1136 /* Check the filesystem environment whether move_extent can be done */ 1241 /* Check the filesystem environment whether move_extent can be done */
1137 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1242 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1138 donor_start, &len, *moved_len); 1243 donor_start, &len, *moved_len);
1139 mext_double_up_read(orig_inode, donor_inode); 1244 mext_double_up_read(orig_inode, donor_inode);
1140 if (ret) 1245 if (ret1)
1141 goto out2; 1246 goto out;
1142 1247
1143 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 1248 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
1144 block_end = block_start + len - 1; 1249 block_end = block_start + len - 1;
1145 if (file_end < block_end) 1250 if (file_end < block_end)
1146 len -= block_end - file_end; 1251 len -= block_end - file_end;
1147 1252
1148 get_ext_path(orig_path, orig_inode, block_start, ret); 1253 ret1 = get_ext_path(orig_inode, block_start, &orig_path);
1149 if (orig_path == NULL) 1254 if (ret1)
1150 goto out2; 1255 goto out;
1151 1256
1152 /* Get path structure to check the hole */ 1257 /* Get path structure to check the hole */
1153 get_ext_path(holecheck_path, orig_inode, block_start, ret); 1258 ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
1154 if (holecheck_path == NULL) 1259 if (ret1)
1155 goto out; 1260 goto out;
1156 1261
1157 depth = ext_depth(orig_inode); 1262 depth = ext_depth(orig_inode);
1158 ext_cur = holecheck_path[depth].p_ext; 1263 ext_cur = holecheck_path[depth].p_ext;
1159 if (ext_cur == NULL) {
1160 ret = -EINVAL;
1161 goto out;
1162 }
1163 1264
1164 /* 1265 /*
1165 * Get proper extent whose ee_block is beyond block_start 1266 * Get proper starting location of block replacement if block_start was
1166 * if block_start was within the hole. 1267 * within the hole.
1167 */ 1268 */
1168 if (le32_to_cpu(ext_cur->ee_block) + 1269 if (le32_to_cpu(ext_cur->ee_block) +
1169 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { 1270 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
1271 /*
1272 * The hole exists between extents or the tail of
1273 * original file.
1274 */
1170 last_extent = mext_next_extent(orig_inode, 1275 last_extent = mext_next_extent(orig_inode,
1171 holecheck_path, &ext_cur); 1276 holecheck_path, &ext_cur);
1172 if (last_extent < 0) { 1277 if (last_extent < 0) {
1173 ret = last_extent; 1278 ret1 = last_extent;
1174 goto out; 1279 goto out;
1175 } 1280 }
1176 last_extent = mext_next_extent(orig_inode, orig_path, 1281 last_extent = mext_next_extent(orig_inode, orig_path,
1177 &ext_dummy); 1282 &ext_dummy);
1178 if (last_extent < 0) { 1283 if (last_extent < 0) {
1179 ret = last_extent; 1284 ret1 = last_extent;
1180 goto out; 1285 goto out;
1181 } 1286 }
1182 } 1287 seq_start = le32_to_cpu(ext_cur->ee_block);
1183 seq_start = block_start; 1288 } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
1289 /* The hole exists at the beginning of original file. */
1290 seq_start = le32_to_cpu(ext_cur->ee_block);
1291 else
1292 seq_start = block_start;
1184 1293
1185 /* No blocks within the specified range. */ 1294 /* No blocks within the specified range. */
1186 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 1295 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1187 ext4_debug("ext4 move extent: The specified range of file " 1296 ext4_debug("ext4 move extent: The specified range of file "
1188 "may be the hole\n"); 1297 "may be the hole\n");
1189 ret = -EINVAL; 1298 ret1 = -EINVAL;
1190 goto out; 1299 goto out;
1191 } 1300 }
1192 1301
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1206 last_extent = mext_next_extent(orig_inode, holecheck_path, 1315 last_extent = mext_next_extent(orig_inode, holecheck_path,
1207 &ext_cur); 1316 &ext_cur);
1208 if (last_extent < 0) { 1317 if (last_extent < 0) {
1209 ret = last_extent; 1318 ret1 = last_extent;
1210 break; 1319 break;
1211 } 1320 }
1212 add_blocks = ext4_ext_get_actual_len(ext_cur); 1321 add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1258 while (orig_page_offset <= seq_end_page) { 1367 while (orig_page_offset <= seq_end_page) {
1259 1368
1260 /* Swap original branches with new branches */ 1369 /* Swap original branches with new branches */
1261 ret = move_extent_par_page(o_filp, donor_inode, 1370 ret1 = move_extent_per_page(o_filp, donor_inode,
1262 orig_page_offset, 1371 orig_page_offset,
1263 data_offset_in_page, 1372 data_offset_in_page,
1264 block_len_in_page, uninit); 1373 block_len_in_page, uninit);
1265 if (ret < 0) 1374 if (ret1 < 0)
1266 goto out; 1375 goto out;
1267 orig_page_offset++; 1376 orig_page_offset++;
1268 /* Count how many blocks we have exchanged */ 1377 /* Count how many blocks we have exchanged */
1269 *moved_len += block_len_in_page; 1378 *moved_len += block_len_in_page;
1270 BUG_ON(*moved_len > len); 1379 if (*moved_len > len) {
1380 ext4_error(orig_inode->i_sb, __func__,
1381 "We replaced blocks too much! "
1382 "sum of replaced: %llu requested: %llu",
1383 *moved_len, len);
1384 ret1 = -EIO;
1385 goto out;
1386 }
1271 1387
1272 data_offset_in_page = 0; 1388 data_offset_in_page = 0;
1273 rest_blocks -= block_len_in_page; 1389 rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1280 /* Decrease buffer counter */ 1396 /* Decrease buffer counter */
1281 if (holecheck_path) 1397 if (holecheck_path)
1282 ext4_ext_drop_refs(holecheck_path); 1398 ext4_ext_drop_refs(holecheck_path);
1283 get_ext_path(holecheck_path, orig_inode, 1399 ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
1284 seq_start, ret); 1400 if (ret1)
1285 if (holecheck_path == NULL)
1286 break; 1401 break;
1287 depth = holecheck_path->p_depth; 1402 depth = holecheck_path->p_depth;
1288 1403
1289 /* Decrease buffer counter */ 1404 /* Decrease buffer counter */
1290 if (orig_path) 1405 if (orig_path)
1291 ext4_ext_drop_refs(orig_path); 1406 ext4_ext_drop_refs(orig_path);
1292 get_ext_path(orig_path, orig_inode, seq_start, ret); 1407 ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
1293 if (orig_path == NULL) 1408 if (ret1)
1294 break; 1409 break;
1295 1410
1296 ext_cur = holecheck_path[depth].p_ext; 1411 ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
1307 ext4_ext_drop_refs(holecheck_path); 1422 ext4_ext_drop_refs(holecheck_path);
1308 kfree(holecheck_path); 1423 kfree(holecheck_path);
1309 } 1424 }
1310out2:
1311 mext_inode_double_unlock(orig_inode, donor_inode);
1312 1425
1313 if (ret) 1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1314 return ret;
1315 1427
1316 /* All of the specified blocks must be exchanged in succeed */ 1428 if (ret1)
1317 BUG_ON(*moved_len != len); 1429 return ret1;
1430 else if (ret2)
1431 return ret2;
1318 1432
1319 return 0; 1433 return 0;
1320} 1434}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 114abe5d2c1d..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1518 return retval; 1518 return retval;
1519 1519
1520 if (blocks == 1 && !dx_fallback && 1520 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
1522 return make_indexed_dir(handle, dentry, inode, bh); 1522 retval = make_indexed_dir(handle, dentry, inode, bh);
1523 if (retval == -ENOSPC)
1524 brelse(bh);
1525 return retval;
1526 }
1523 brelse(bh); 1527 brelse(bh);
1524 } 1528 }
1525 bh = ext4_append(handle, dir, &block, &retval); 1529 bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1532 de = (struct ext4_dir_entry_2 *) bh->b_data;
1529 de->inode = 0; 1533 de->inode = 0;
1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1534 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1531 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1535 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1536 if (retval == -ENOSPC)
1537 brelse(bh);
1538 return retval;
1532} 1539}
1533 1540
1534/* 1541/*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1590 goto cleanup; 1597 goto cleanup;
1591 node2 = (struct dx_node *)(bh2->b_data); 1598 node2 = (struct dx_node *)(bh2->b_data);
1592 entries2 = node2->entries; 1599 entries2 = node2->entries;
1600 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, 1601 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize); 1602 sb->s_blocksize);
1595 node2->fake.inode = 0;
1596 BUFFER_TRACE(frame->bh, "get_write_access"); 1603 BUFFER_TRACE(frame->bh, "get_write_access");
1597 err = ext4_journal_get_write_access(handle, frame->bh); 1604 err = ext4_journal_get_write_access(handle, frame->bh);
1598 if (err) 1605 if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1657 if (!de) 1664 if (!de)
1658 goto cleanup; 1665 goto cleanup;
1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1666 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1660 bh = NULL; 1667 if (err != -ENOSPC)
1668 bh = NULL;
1661 goto cleanup; 1669 goto cleanup;
1662 1670
1663journal_error: 1671journal_error:
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
2310 struct inode *inode = old_dentry->d_inode; 2318 struct inode *inode = old_dentry->d_inode;
2311 int err, retries = 0; 2319 int err, retries = 0;
2312 2320
2313 if (EXT4_DIR_LINK_MAX(inode)) 2321 if (inode->i_nlink >= EXT4_LINK_MAX)
2314 return -EMLINK; 2322 return -EMLINK;
2315 2323
2316 /* 2324 /*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2413 goto end_rename; 2421 goto end_rename;
2414 retval = -EMLINK; 2422 retval = -EMLINK;
2415 if (!new_inode && new_dir != old_dir && 2423 if (!new_inode && new_dir != old_dir &&
2416 new_dir->i_nlink >= EXT4_LINK_MAX) 2424 EXT4_DIR_LINK_MAX(new_dir))
2417 goto end_rename; 2425 goto end_rename;
2418 } 2426 }
2419 if (!new_bh) { 2427 if (!new_bh) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
746 struct inode *inode = NULL; 746 struct inode *inode = NULL;
747 handle_t *handle; 747 handle_t *handle;
748 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 749 int err, err2;
751 750
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 751 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
856 * using the new disk blocks. 855 * using the new disk blocks.
857 */ 856 */
858 857
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
860 /* Update group descriptor block for new group */ 858 /* Update group descriptor block for new group */
861 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 859 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
862 gdb_off * EXT4_DESC_SIZE(sb)); 860 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
875 * descriptor 873 * descriptor
876 */ 874 */
877 err = ext4_mb_add_groupinfo(sb, input->group, gdp); 875 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
878 if (err) { 876 if (err)
879 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
880 goto exit_journal; 877 goto exit_journal;
881 }
882 878
883 /* 879 /*
884 * Make the new blocks and inodes valid next. We do this before 880 * Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
920 916
921 /* Update the global fs size fields */ 917 /* Update the global fs size fields */
922 sbi->s_groups_count++; 918 sbi->s_groups_count++;
923 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
924 919
925 ext4_handle_dirty_metadata(handle, NULL, primary); 920 ext4_handle_dirty_metadata(handle, NULL, primary);
926 921
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..a6b1ab734728 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
45#include "ext4_jbd2.h" 45#include "ext4_jbd2.h"
46#include "xattr.h" 46#include "xattr.h"
47#include "acl.h" 47#include "acl.h"
48#include "mballoc.h"
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
344 errstr = "Out of memory"; 345 errstr = "Out of memory";
345 break; 346 break;
346 case -EROFS: 347 case -EROFS:
347 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) 348 if (!sb || (EXT4_SB(sb)->s_journal &&
349 EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
348 errstr = "Journal has aborted"; 350 errstr = "Journal has aborted";
349 else 351 else
350 errstr = "Readonly filesystem"; 352 errstr = "Readonly filesystem";
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
1279 *journal_devnum = option; 1281 *journal_devnum = option;
1280 break; 1282 break;
1281 case Opt_journal_checksum: 1283 case Opt_journal_checksum:
1282 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1284 break; /* Kept for backwards compatibility */
1283 break;
1284 case Opt_journal_async_commit: 1285 case Opt_journal_async_commit:
1285 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1286 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1286 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1287 break; 1287 break;
1288 case Opt_noload: 1288 case Opt_noload:
1289 set_opt(sbi->s_mount_opt, NOLOAD); 1289 set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1695 gdp = ext4_get_group_desc(sb, i, NULL); 1695 gdp = ext4_get_group_desc(sb, i, NULL);
1696 1696
1697 flex_group = ext4_flex_group(sbi, i); 1697 flex_group = ext4_flex_group(sbi, i);
1698 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, 1698 atomic_add(ext4_free_inodes_count(sb, gdp),
1699 ext4_free_inodes_count(sb, gdp)); 1699 &sbi->s_flex_groups[flex_group].free_inodes);
1700 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, 1700 atomic_add(ext4_free_blks_count(sb, gdp),
1701 ext4_free_blks_count(sb, gdp)); 1701 &sbi->s_flex_groups[flex_group].free_blocks);
1702 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, 1702 atomic_add(ext4_used_dirs_count(sb, gdp),
1703 ext4_used_dirs_count(sb, gdp)); 1703 &sbi->s_flex_groups[flex_group].used_dirs);
1704 } 1704 }
1705 1705
1706 return 1; 1706 return 1;
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
2253 .release = ext4_sb_release, 2253 .release = ext4_sb_release,
2254}; 2254};
2255 2255
2256/*
2257 * Check whether this filesystem can be mounted based on
2258 * the features present and the RDONLY/RDWR mount requested.
2259 * Returns 1 if this filesystem can be mounted as requested,
2260 * 0 if it cannot be.
2261 */
2262static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2263{
2264 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
2265 ext4_msg(sb, KERN_ERR,
2266 "Couldn't mount because of "
2267 "unsupported optional features (%x)",
2268 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2269 ~EXT4_FEATURE_INCOMPAT_SUPP));
2270 return 0;
2271 }
2272
2273 if (readonly)
2274 return 1;
2275
2276 /* Check that feature set is OK for a read-write mount */
2277 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
2278 ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2279 "unsupported optional features (%x)",
2280 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2281 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2282 return 0;
2283 }
2284 /*
2285 * Large file size enabled file system can only be mounted
2286 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2287 */
2288 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2289 if (sizeof(blkcnt_t) < sizeof(u64)) {
2290 ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2291 "cannot be mounted RDWR without "
2292 "CONFIG_LBDAF");
2293 return 0;
2294 }
2295 }
2296 return 1;
2297}
2298
2256static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2299static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2257 __releases(kernel_lock) 2300 __releases(kernel_lock)
2258 __acquires(kernel_lock) 2301 __acquires(kernel_lock)
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2274 unsigned int db_count; 2317 unsigned int db_count;
2275 unsigned int i; 2318 unsigned int i;
2276 int needs_recovery, has_huge_files; 2319 int needs_recovery, has_huge_files;
2277 int features;
2278 __u64 blocks_count; 2320 __u64 blocks_count;
2279 int err; 2321 int err;
2280 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 2322 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2401 * previously didn't change the revision level when setting the flags, 2443 * previously didn't change the revision level when setting the flags,
2402 * so there is a chance incompat flags are set on a rev 0 filesystem. 2444 * so there is a chance incompat flags are set on a rev 0 filesystem.
2403 */ 2445 */
2404 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2446 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
2405 if (features) {
2406 ext4_msg(sb, KERN_ERR,
2407 "Couldn't mount because of "
2408 "unsupported optional features (%x)",
2409 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2410 ~EXT4_FEATURE_INCOMPAT_SUPP));
2411 goto failed_mount;
2412 }
2413 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2414 if (!(sb->s_flags & MS_RDONLY) && features) {
2415 ext4_msg(sb, KERN_ERR,
2416 "Couldn't mount RDWR because of "
2417 "unsupported optional features (%x)",
2418 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2419 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2420 goto failed_mount; 2447 goto failed_mount;
2421 } 2448
2422 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2423 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2424 if (has_huge_files) {
2425 /*
2426 * Large file size enabled file system can only be
2427 * mount if kernel is build with CONFIG_LBDAF
2428 */
2429 if (sizeof(root->i_blocks) < sizeof(u64) &&
2430 !(sb->s_flags & MS_RDONLY)) {
2431 ext4_msg(sb, KERN_ERR, "Filesystem with huge "
2432 "files cannot be mounted read-write "
2433 "without CONFIG_LBDAF");
2434 goto failed_mount;
2435 }
2436 }
2437 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 2449 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
2438 2450
2439 if (blocksize < EXT4_MIN_BLOCK_SIZE || 2451 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2469 } 2481 }
2470 } 2482 }
2471 2483
2484 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2485 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2472 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 2486 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
2473 has_huge_files); 2487 has_huge_files);
2474 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 2488 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2549 goto failed_mount; 2563 goto failed_mount;
2550 } 2564 }
2551 2565
2552 if (ext4_blocks_count(es) > 2566 /*
2553 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 2567 * Test whether we have more sectors than will fit in sector_t,
2568 * and whether the max offset is addressable by the page cache.
2569 */
2570 if ((ext4_blocks_count(es) >
2571 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
2572 (ext4_blocks_count(es) >
2573 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2554 ext4_msg(sb, KERN_ERR, "filesystem" 2574 ext4_msg(sb, KERN_ERR, "filesystem"
2555 " too large to mount safely"); 2575 " too large to mount safely on this system");
2556 if (sizeof(sector_t) < 8) 2576 if (sizeof(sector_t) < 8)
2557 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 2577 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2578 ret = -EFBIG;
2558 goto failed_mount; 2579 goto failed_mount;
2559 } 2580 }
2560 2581
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2595 goto failed_mount; 2616 goto failed_mount;
2596 } 2617 }
2597 sbi->s_groups_count = blocks_count; 2618 sbi->s_groups_count = blocks_count;
2619 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
2620 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
2598 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2621 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2599 EXT4_DESC_PER_BLOCK(sb); 2622 EXT4_DESC_PER_BLOCK(sb);
2600 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2623 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2729 goto failed_mount4; 2752 goto failed_mount4;
2730 } 2753 }
2731 2754
2732 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2755 jbd2_journal_set_features(sbi->s_journal,
2733 jbd2_journal_set_features(sbi->s_journal, 2756 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2734 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 2757 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
2758 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2759 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2736 } else if (test_opt(sb, JOURNAL_CHECKSUM)) { 2760 else
2737 jbd2_journal_set_features(sbi->s_journal,
2738 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2739 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2761 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2740 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2762 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2741 } else {
2742 jbd2_journal_clear_features(sbi->s_journal,
2743 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2744 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2745 }
2746 2763
2747 /* We have now updated the journal if required, so we can 2764 /* We have now updated the journal if required, so we can
2748 * validate the data journaling mode. */ 2765 * validate the data journaling mode. */
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3208 clear_buffer_write_io_error(sbh); 3225 clear_buffer_write_io_error(sbh);
3209 set_buffer_uptodate(sbh); 3226 set_buffer_uptodate(sbh);
3210 } 3227 }
3211 es->s_wtime = cpu_to_le32(get_seconds()); 3228 /*
3229 * If the file system is mounted read-only, don't update the
3230 * superblock write time. This avoids updating the superblock
3231 * write time when we are mounting the root file system
3232 * read/only but we need to replay the journal; at that point,
3233 * for people who are east of GMT and who make their clock
3234 * tick in localtime for Windows bug-for-bug compatibility,
3235 * the clock is set in the future, and this will cause e2fsck
3236 * to complain and force a full file system check.
3237 */
3238 if (!(sb->s_flags & MS_RDONLY))
3239 es->s_wtime = cpu_to_le32(get_seconds());
3212 es->s_kbytes_written = 3240 es->s_kbytes_written =
3213 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3241 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3214 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3242 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3477 if (sbi->s_journal) 3505 if (sbi->s_journal)
3478 ext4_mark_recovery_complete(sb, es); 3506 ext4_mark_recovery_complete(sb, es);
3479 } else { 3507 } else {
3480 int ret; 3508 /* Make sure we can mount this feature set readwrite */
3481 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3509 if (!ext4_feature_set_ok(sb, 0)) {
3482 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3483 ext4_msg(sb, KERN_WARNING, "couldn't "
3484 "remount RDWR because of unsupported "
3485 "optional features (%x)",
3486 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3487 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3488 err = -EROFS; 3510 err = -EROFS;
3489 goto restore_opts; 3511 goto restore_opts;
3490 } 3512 }
3491
3492 /* 3513 /*
3493 * Make sure the group descriptor checksums 3514 * Make sure the group descriptor checksums
3494 * are sane. If they aren't, refuse to remount r/w. 3515 * are sane. If they aren't, refuse to remount r/w.
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
810 get_bh(new_bh); 810 get_bh(new_bh);
811 } else { 811 } else {
812 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
813 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal, block;
814
815 goal = ext4_group_first_block_no(sb,
814 EXT4_I(inode)->i_block_group); 816 EXT4_I(inode)->i_block_group);
815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, 817
818 /* non-extent files can't have physical blocks past 2^32 */
819 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
820 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
821
822 block = ext4_new_meta_blocks(handle, inode,
816 goal, NULL, &error); 823 goal, NULL, &error);
817 if (error) 824 if (error)
818 goto cleanup; 825 goto cleanup;
826
827 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
828 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
829
819 ea_idebug(inode, "creating block %d", block); 830 ea_idebug(inode, "creating block %d", block);
820 831
821 new_bh = sb_getblk(sb, block); 832 new_bh = sb_getblk(sb, block);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0df600e9162d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h>
28#include <trace/events/jbd2.h> 29#include <trace/events/jbd2.h>
29 30
30/* 31/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
133 bh->b_end_io = journal_end_buffer_io_sync; 134 bh->b_end_io = journal_end_buffer_io_sync;
134 135
135 if (journal->j_flags & JBD2_BARRIER && 136 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 137 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
138 set_buffer_ordered(bh); 139 set_buffer_ordered(bh);
139 barrier_done = 1; 140 barrier_done = 1;
140 } 141 }
@@ -706,11 +707,13 @@ start_journal_io:
706 /* Done it all: now write the commit record asynchronously. */ 707 /* Done it all: now write the commit record asynchronously. */
707 708
708 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 709 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
709 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
710 err = journal_submit_commit_record(journal, commit_transaction, 711 err = journal_submit_commit_record(journal, commit_transaction,
711 &cbh, crc32_sum); 712 &cbh, crc32_sum);
712 if (err) 713 if (err)
713 __jbd2_journal_abort_hard(journal); 714 __jbd2_journal_abort_hard(journal);
715 if (journal->j_flags & JBD2_BARRIER)
716 blkdev_issue_flush(journal->j_dev, NULL);
714 } 717 }
715 718
716 /* 719 /*
@@ -833,7 +836,7 @@ wait_for_iobuf:
833 jbd_debug(3, "JBD: commit phase 5\n"); 836 jbd_debug(3, "JBD: commit phase 5\n");
834 837
835 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 838 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
836 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 839 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
837 err = journal_submit_commit_record(journal, commit_transaction, 840 err = journal_submit_commit_record(journal, commit_transaction,
838 &cbh, crc32_sum); 841 &cbh, crc32_sum);
839 if (err) 842 if (err)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..a8a358bc0f21 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
1187 1187
1188 first = be32_to_cpu(sb->s_first); 1188 first = be32_to_cpu(sb->s_first);
1189 last = be32_to_cpu(sb->s_maxlen); 1189 last = be32_to_cpu(sb->s_maxlen);
1190 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1191 printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
1192 first, last);
1193 journal_fail_superblock(journal);
1194 return -EINVAL;
1195 }
1190 1196
1191 journal->j_first = first; 1197 journal->j_first = first;
1192 journal->j_last = last; 1198 journal->j_last = last;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
57 INIT_LIST_HEAD(&transaction->t_private_list); 57 INIT_LIST_HEAD(&transaction->t_private_list);
58 58
59 /* Set up the commit timer for the new transaction. */ 59 /* Set up the commit timer for the new transaction. */
60 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 60 journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
61 add_timer(&journal->j_commit_timer); 61 add_timer(&journal->j_commit_timer);
62 62
63 J_ASSERT(journal->j_running_transaction == NULL); 63 J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
238 __jbd2_log_space_left(journal)); 238 __jbd2_log_space_left(journal));
239 spin_unlock(&transaction->t_handle_lock); 239 spin_unlock(&transaction->t_handle_lock);
240 spin_unlock(&journal->j_state_lock); 240 spin_unlock(&journal->j_state_lock);
241
242 lock_map_acquire(&handle->h_lockdep_map);
241out: 243out:
242 if (unlikely(new_transaction)) /* It's usually NULL */ 244 if (unlikely(new_transaction)) /* It's usually NULL */
243 kfree(new_transaction); 245 kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
303 handle = ERR_PTR(err); 305 handle = ERR_PTR(err);
304 goto out; 306 goto out;
305 } 307 }
306
307 lock_map_acquire(&handle->h_lockdep_map);
308out: 308out:
309 return handle; 309 return handle;
310} 310}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
426 __jbd2_log_start_commit(journal, transaction->t_tid); 426 __jbd2_log_start_commit(journal, transaction->t_tid);
427 spin_unlock(&journal->j_state_lock); 427 spin_unlock(&journal->j_state_lock);
428 428
429 lock_map_release(&handle->h_lockdep_map);
429 handle->h_buffer_credits = nblocks; 430 handle->h_buffer_credits = nblocks;
430 ret = start_this_handle(journal, handle); 431 ret = start_this_handle(journal, handle);
431 return ret; 432 return ret;
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index d97eb652d6ca..52695d3dfd0b 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -652,7 +652,7 @@ struct transaction_s
652 * This transaction is being forced and some process is 652 * This transaction is being forced and some process is
653 * waiting for it to finish. 653 * waiting for it to finish.
654 */ 654 */
655 int t_synchronous_commit:1; 655 unsigned int t_synchronous_commit:1;
656 656
657 /* 657 /*
658 * For use by the filesystem to store fs-specific data 658 * For use by the filesystem to store fs-specific data
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 8d433c4e3709..c1bd8f1e8b94 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -5,10 +5,15 @@
5#define _TRACE_EXT4_H 5#define _TRACE_EXT4_H
6 6
7#include <linux/writeback.h> 7#include <linux/writeback.h>
8#include "../../../fs/ext4/ext4.h"
9#include "../../../fs/ext4/mballoc.h"
10#include <linux/tracepoint.h> 8#include <linux/tracepoint.h>
11 9
10struct ext4_allocation_context;
11struct ext4_allocation_request;
12struct ext4_prealloc_space;
13struct ext4_inode_info;
14
15#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
16
12TRACE_EVENT(ext4_free_inode, 17TRACE_EVENT(ext4_free_inode,
13 TP_PROTO(struct inode *inode), 18 TP_PROTO(struct inode *inode),
14 19
@@ -33,8 +38,8 @@ TRACE_EVENT(ext4_free_inode,
33 ), 38 ),
34 39
35 TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", 40 TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu",
36 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, 41 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
37 __entry->uid, __entry->gid, 42 __entry->mode, __entry->uid, __entry->gid,
38 (unsigned long long) __entry->blocks) 43 (unsigned long long) __entry->blocks)
39); 44);
40 45
@@ -56,7 +61,8 @@ TRACE_EVENT(ext4_request_inode,
56 ), 61 ),
57 62
58 TP_printk("dev %s dir %lu mode %d", 63 TP_printk("dev %s dir %lu mode %d",
59 jbd2_dev_to_name(__entry->dev), __entry->dir, __entry->mode) 64 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir,
65 __entry->mode)
60); 66);
61 67
62TRACE_EVENT(ext4_allocate_inode, 68TRACE_EVENT(ext4_allocate_inode,
@@ -79,7 +85,8 @@ TRACE_EVENT(ext4_allocate_inode,
79 ), 85 ),
80 86
81 TP_printk("dev %s ino %lu dir %lu mode %d", 87 TP_printk("dev %s ino %lu dir %lu mode %d",
82 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->dir, __entry->mode) 88 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
89 (unsigned long) __entry->dir, __entry->mode)
83); 90);
84 91
85TRACE_EVENT(ext4_write_begin, 92TRACE_EVENT(ext4_write_begin,
@@ -106,8 +113,8 @@ TRACE_EVENT(ext4_write_begin,
106 ), 113 ),
107 114
108 TP_printk("dev %s ino %lu pos %llu len %u flags %u", 115 TP_printk("dev %s ino %lu pos %llu len %u flags %u",
109 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, 116 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
110 __entry->flags) 117 __entry->pos, __entry->len, __entry->flags)
111); 118);
112 119
113TRACE_EVENT(ext4_ordered_write_end, 120TRACE_EVENT(ext4_ordered_write_end,
@@ -133,8 +140,8 @@ TRACE_EVENT(ext4_ordered_write_end,
133 ), 140 ),
134 141
135 TP_printk("dev %s ino %lu pos %llu len %u copied %u", 142 TP_printk("dev %s ino %lu pos %llu len %u copied %u",
136 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, 143 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
137 __entry->copied) 144 __entry->pos, __entry->len, __entry->copied)
138); 145);
139 146
140TRACE_EVENT(ext4_writeback_write_end, 147TRACE_EVENT(ext4_writeback_write_end,
@@ -160,8 +167,8 @@ TRACE_EVENT(ext4_writeback_write_end,
160 ), 167 ),
161 168
162 TP_printk("dev %s ino %lu pos %llu len %u copied %u", 169 TP_printk("dev %s ino %lu pos %llu len %u copied %u",
163 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, 170 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
164 __entry->copied) 171 __entry->pos, __entry->len, __entry->copied)
165); 172);
166 173
167TRACE_EVENT(ext4_journalled_write_end, 174TRACE_EVENT(ext4_journalled_write_end,
@@ -186,8 +193,8 @@ TRACE_EVENT(ext4_journalled_write_end,
186 ), 193 ),
187 194
188 TP_printk("dev %s ino %lu pos %llu len %u copied %u", 195 TP_printk("dev %s ino %lu pos %llu len %u copied %u",
189 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, 196 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
190 __entry->copied) 197 __entry->pos, __entry->len, __entry->copied)
191); 198);
192 199
193TRACE_EVENT(ext4_writepage, 200TRACE_EVENT(ext4_writepage,
@@ -209,7 +216,8 @@ TRACE_EVENT(ext4_writepage,
209 ), 216 ),
210 217
211 TP_printk("dev %s ino %lu page_index %lu", 218 TP_printk("dev %s ino %lu page_index %lu",
212 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) 219 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
220 __entry->index)
213); 221);
214 222
215TRACE_EVENT(ext4_da_writepages, 223TRACE_EVENT(ext4_da_writepages,
@@ -243,14 +251,49 @@ TRACE_EVENT(ext4_da_writepages,
243 __entry->range_cyclic = wbc->range_cyclic; 251 __entry->range_cyclic = wbc->range_cyclic;
244 ), 252 ),
245 253
246 TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", 254 TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d",
247 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, 255 jbd2_dev_to_name(__entry->dev),
256 (unsigned long) __entry->ino, __entry->nr_to_write,
248 __entry->pages_skipped, __entry->range_start, 257 __entry->pages_skipped, __entry->range_start,
249 __entry->range_end, __entry->nonblocking, 258 __entry->range_end, __entry->nonblocking,
250 __entry->for_kupdate, __entry->for_reclaim, 259 __entry->for_kupdate, __entry->for_reclaim,
251 __entry->range_cyclic) 260 __entry->range_cyclic)
252); 261);
253 262
263TRACE_EVENT(ext4_da_write_pages,
264 TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
265
266 TP_ARGS(inode, mpd),
267
268 TP_STRUCT__entry(
269 __field( dev_t, dev )
270 __field( ino_t, ino )
271 __field( __u64, b_blocknr )
272 __field( __u32, b_size )
273 __field( __u32, b_state )
274 __field( unsigned long, first_page )
275 __field( int, io_done )
276 __field( int, pages_written )
277 ),
278
279 TP_fast_assign(
280 __entry->dev = inode->i_sb->s_dev;
281 __entry->ino = inode->i_ino;
282 __entry->b_blocknr = mpd->b_blocknr;
283 __entry->b_size = mpd->b_size;
284 __entry->b_state = mpd->b_state;
285 __entry->first_page = mpd->first_page;
286 __entry->io_done = mpd->io_done;
287 __entry->pages_written = mpd->pages_written;
288 ),
289
290 TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
291 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
292 __entry->b_blocknr, __entry->b_size,
293 __entry->b_state, __entry->first_page,
294 __entry->io_done, __entry->pages_written)
295);
296
254TRACE_EVENT(ext4_da_writepages_result, 297TRACE_EVENT(ext4_da_writepages_result,
255 TP_PROTO(struct inode *inode, struct writeback_control *wbc, 298 TP_PROTO(struct inode *inode, struct writeback_control *wbc,
256 int ret, int pages_written), 299 int ret, int pages_written),
@@ -280,7 +323,8 @@ TRACE_EVENT(ext4_da_writepages_result,
280 ), 323 ),
281 324
282 TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", 325 TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d",
283 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, 326 jbd2_dev_to_name(__entry->dev),
327 (unsigned long) __entry->ino, __entry->ret,
284 __entry->pages_written, __entry->pages_skipped, 328 __entry->pages_written, __entry->pages_skipped,
285 __entry->encountered_congestion, __entry->more_io, 329 __entry->encountered_congestion, __entry->more_io,
286 __entry->no_nrwrite_index_update) 330 __entry->no_nrwrite_index_update)
@@ -309,8 +353,8 @@ TRACE_EVENT(ext4_da_write_begin,
309 ), 353 ),
310 354
311 TP_printk("dev %s ino %lu pos %llu len %u flags %u", 355 TP_printk("dev %s ino %lu pos %llu len %u flags %u",
312 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, 356 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
313 __entry->flags) 357 __entry->pos, __entry->len, __entry->flags)
314); 358);
315 359
316TRACE_EVENT(ext4_da_write_end, 360TRACE_EVENT(ext4_da_write_end,
@@ -336,8 +380,8 @@ TRACE_EVENT(ext4_da_write_end,
336 ), 380 ),
337 381
338 TP_printk("dev %s ino %lu pos %llu len %u copied %u", 382 TP_printk("dev %s ino %lu pos %llu len %u copied %u",
339 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, 383 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
340 __entry->copied) 384 __entry->pos, __entry->len, __entry->copied)
341); 385);
342 386
343TRACE_EVENT(ext4_discard_blocks, 387TRACE_EVENT(ext4_discard_blocks,
@@ -387,8 +431,8 @@ TRACE_EVENT(ext4_mb_new_inode_pa,
387 ), 431 ),
388 432
389 TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", 433 TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu",
390 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, 434 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
391 __entry->pa_len, __entry->pa_lstart) 435 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
392); 436);
393 437
394TRACE_EVENT(ext4_mb_new_group_pa, 438TRACE_EVENT(ext4_mb_new_group_pa,
@@ -415,8 +459,8 @@ TRACE_EVENT(ext4_mb_new_group_pa,
415 ), 459 ),
416 460
417 TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", 461 TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu",
418 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, 462 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
419 __entry->pa_len, __entry->pa_lstart) 463 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
420); 464);
421 465
422TRACE_EVENT(ext4_mb_release_inode_pa, 466TRACE_EVENT(ext4_mb_release_inode_pa,
@@ -442,8 +486,8 @@ TRACE_EVENT(ext4_mb_release_inode_pa,
442 ), 486 ),
443 487
444 TP_printk("dev %s ino %lu block %llu count %u", 488 TP_printk("dev %s ino %lu block %llu count %u",
445 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, 489 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
446 __entry->count) 490 __entry->block, __entry->count)
447); 491);
448 492
449TRACE_EVENT(ext4_mb_release_group_pa, 493TRACE_EVENT(ext4_mb_release_group_pa,
@@ -488,7 +532,7 @@ TRACE_EVENT(ext4_discard_preallocations,
488 ), 532 ),
489 533
490 TP_printk("dev %s ino %lu", 534 TP_printk("dev %s ino %lu",
491 jbd2_dev_to_name(__entry->dev), __entry->ino) 535 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
492); 536);
493 537
494TRACE_EVENT(ext4_mb_discard_preallocations, 538TRACE_EVENT(ext4_mb_discard_preallocations,
@@ -543,8 +587,8 @@ TRACE_EVENT(ext4_request_blocks,
543 ), 587 ),
544 588
545 TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", 589 TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
546 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, 590 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
547 __entry->len, 591 __entry->flags, __entry->len,
548 (unsigned long long) __entry->logical, 592 (unsigned long long) __entry->logical,
549 (unsigned long long) __entry->goal, 593 (unsigned long long) __entry->goal,
550 (unsigned long long) __entry->lleft, 594 (unsigned long long) __entry->lleft,
@@ -587,8 +631,8 @@ TRACE_EVENT(ext4_allocate_blocks,
587 ), 631 ),
588 632
589 TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", 633 TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
590 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, 634 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
591 __entry->len, __entry->block, 635 __entry->flags, __entry->len, __entry->block,
592 (unsigned long long) __entry->logical, 636 (unsigned long long) __entry->logical,
593 (unsigned long long) __entry->goal, 637 (unsigned long long) __entry->goal,
594 (unsigned long long) __entry->lleft, 638 (unsigned long long) __entry->lleft,
@@ -621,8 +665,8 @@ TRACE_EVENT(ext4_free_blocks,
621 ), 665 ),
622 666
623 TP_printk("dev %s ino %lu block %llu count %lu metadata %d", 667 TP_printk("dev %s ino %lu block %llu count %lu metadata %d",
624 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, 668 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
625 __entry->count, __entry->metadata) 669 __entry->block, __entry->count, __entry->metadata)
626); 670);
627 671
628TRACE_EVENT(ext4_sync_file, 672TRACE_EVENT(ext4_sync_file,
@@ -645,8 +689,8 @@ TRACE_EVENT(ext4_sync_file,
645 ), 689 ),
646 690
647 TP_printk("dev %s ino %ld parent %ld datasync %d ", 691 TP_printk("dev %s ino %ld parent %ld datasync %d ",
648 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->parent, 692 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
649 __entry->datasync) 693 (unsigned long) __entry->parent, __entry->datasync)
650); 694);
651 695
652TRACE_EVENT(ext4_sync_fs, 696TRACE_EVENT(ext4_sync_fs,
@@ -669,6 +713,30 @@ TRACE_EVENT(ext4_sync_fs,
669 __entry->wait) 713 __entry->wait)
670); 714);
671 715
716TRACE_EVENT(ext4_alloc_da_blocks,
717 TP_PROTO(struct inode *inode),
718
719 TP_ARGS(inode),
720
721 TP_STRUCT__entry(
722 __field( dev_t, dev )
723 __field( ino_t, ino )
724 __field( unsigned int, data_blocks )
725 __field( unsigned int, meta_blocks )
726 ),
727
728 TP_fast_assign(
729 __entry->dev = inode->i_sb->s_dev;
730 __entry->ino = inode->i_ino;
731 __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
732 __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
733 ),
734
735 TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u",
736 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
737 __entry->data_blocks, __entry->meta_blocks)
738);
739
672#endif /* _TRACE_EXT4_H */ 740#endif /* _TRACE_EXT4_H */
673 741
674/* This part must be outside protection */ 742/* This part must be outside protection */
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index 10813fa0c8d0..b851f0b4701c 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -159,7 +159,7 @@ TRACE_EVENT(jbd2_submit_inode_data,
159 ), 159 ),
160 160
161 TP_printk("dev %s ino %lu", 161 TP_printk("dev %s ino %lu",
162 jbd2_dev_to_name(__entry->dev), __entry->ino) 162 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
163); 163);
164 164
165#endif /* _TRACE_JBD2_H */ 165#endif /* _TRACE_JBD2_H */