aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2016-05-24 15:55:26 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-05-24 15:55:26 -0400
commit 0e01df100b6bf22a1de61b66657502a6454153c5 (patch)
tree aae8f9787efc3014696b3e5ae854c1cf9e472bdd
parent a56f489502e28caac56c8a0735549740f0ae0711 (diff)
parent 12735f881952c32b31bc4e433768f18489f79ec9 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:

"Fix a number of bugs, most notably a potential stale data exposure after a crash and a potential BUG_ON crash if a file has the data journalling flag enabled while it has dirty delayed allocation blocks that haven't been written yet. Also fix a potential crash in the new project quota code and a maliciously corrupted file system. In addition, fix some DAX-specific bugs, including when there is a transient ENOSPC situation and races between writes via direct I/O and an mmap'ed segment that could lead to lost I/O. Finally the usual set of miscellaneous cleanups"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (23 commits)
  ext4: pre-zero allocated blocks for DAX IO
  ext4: refactor direct IO code
  ext4: fix race in transient ENOSPC detection
  ext4: handle transient ENOSPC properly for DAX
  dax: call get_blocks() with create == 1 for write faults to unwritten extents
  ext4: remove unmeetable inconsisteny check from ext4_find_extent()
  jbd2: remove excess descriptions for handle_s
  ext4: remove unnecessary bio get/put
  ext4: silence UBSAN in ext4_mb_init()
  ext4: address UBSAN warning in mb_find_order_for_block()
  ext4: fix oops on corrupted filesystem
  ext4: fix check of dqget() return value in ext4_ioctl_setproject()
  ext4: clean up error handling when orphan list is corrupted
  ext4: fix hang when processing corrupted orphaned inode list
  ext4: remove trailing \n from ext4_warning/ext4_error calls
  ext4: fix races between changing inode journal mode and ext4_writepages
  ext4: handle unwritten or delalloc buffers before enabling data journaling
  ext4: fix jbd2 handle extension in ext4_ext_truncate_extend_restart()
  ext4: do not ask jbd2 to write data for delalloc buffers
  jbd2: add support for avoiding data writes during transaction commits
  ...
-rw-r--r-- fs/compat.c | 4
-rw-r--r-- fs/dax.c | 2
-rw-r--r-- fs/ext4/balloc.c | 3
-rw-r--r-- fs/ext4/dir.c | 5
-rw-r--r-- fs/ext4/ext4.h | 20
-rw-r--r-- fs/ext4/ext4_jbd2.h | 15
-rw-r--r-- fs/ext4/extents.c | 20
-rw-r--r-- fs/ext4/extents_status.c | 2
-rw-r--r-- fs/ext4/file.c | 6
-rw-r--r-- fs/ext4/ialloc.c | 59
-rw-r--r-- fs/ext4/indirect.c | 127
-rw-r--r-- fs/ext4/inline.c | 2
-rw-r--r-- fs/ext4/inode.c | 323
-rw-r--r-- fs/ext4/ioctl.c | 2
-rw-r--r-- fs/ext4/mballoc.c | 12
-rw-r--r-- fs/ext4/mmp.c | 4
-rw-r--r-- fs/ext4/move_extent.c | 2
-rw-r--r-- fs/ext4/namei.c | 9
-rw-r--r-- fs/ext4/page-io.c | 2
-rw-r--r-- fs/ext4/resize.c | 2
-rw-r--r-- fs/ext4/super.c | 4
-rw-r--r-- fs/jbd2/commit.c | 4
-rw-r--r-- fs/jbd2/journal.c | 3
-rw-r--r-- fs/jbd2/transaction.c | 22
-rw-r--r-- fs/ocfs2/journal.h | 2
-rw-r--r-- fs/readdir.c | 4
-rw-r--r-- include/linux/jbd2.h | 16
-rw-r--r-- kernel/locking/percpu-rwsem.c | 1
28 files changed, 364 insertions, 313 deletions
diff --git a/fs/compat.c b/fs/compat.c
index 8754e9aa14ad..be6e48b0a46c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
936 } 936 }
937 dirent = buf->previous; 937 dirent = buf->previous;
938 if (dirent) { 938 if (dirent) {
939 if (signal_pending(current))
940 return -EINTR;
939 if (__put_user(offset, &dirent->d_off)) 941 if (__put_user(offset, &dirent->d_off))
940 goto efault; 942 goto efault;
941 } 943 }
@@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name,
1020 dirent = buf->previous; 1022 dirent = buf->previous;
1021 1023
1022 if (dirent) { 1024 if (dirent) {
1025 if (signal_pending(current))
1026 return -EINTR;
1023 if (__put_user_unaligned(offset, &dirent->d_off)) 1027 if (__put_user_unaligned(offset, &dirent->d_off))
1024 goto efault; 1028 goto efault;
1025 } 1029 }
diff --git a/fs/dax.c b/fs/dax.c
index a345c168acaa..7d9df93b3a14 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -676,7 +676,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
676 if (error) 676 if (error)
677 goto unlock_page; 677 goto unlock_page;
678 678
679 if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { 679 if (!buffer_mapped(&bh) && !vmf->cow_page) {
680 if (vmf->flags & FAULT_FLAG_WRITE) { 680 if (vmf->flags & FAULT_FLAG_WRITE) {
681 error = get_block(inode, block, &bh, 1); 681 error = get_block(inode, block, &bh, 1);
682 count_vm_event(PGMAJFAULT); 682 count_vm_event(PGMAJFAULT);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fe1f50fe764f..3020fd70c392 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -610,7 +610,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
610 610
611 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); 611 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
612 612
613 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 613 jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
614 return 1;
614} 615}
615 616
616/* 617/*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 5d00bf060254..68323e3da3fa 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -150,6 +150,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
150 while (ctx->pos < inode->i_size) { 150 while (ctx->pos < inode->i_size) {
151 struct ext4_map_blocks map; 151 struct ext4_map_blocks map;
152 152
153 if (fatal_signal_pending(current)) {
154 err = -ERESTARTSYS;
155 goto errout;
156 }
157 cond_resched();
153 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); 158 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
154 map.m_len = 1; 159 map.m_len = 1;
155 err = ext4_map_blocks(NULL, inode, &map, 0); 160 err = ext4_map_blocks(NULL, inode, &map, 0);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 72f4c9e00e97..b84aa1ca480a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,6 +33,7 @@
33#include <linux/ratelimit.h> 33#include <linux/ratelimit.h>
34#include <crypto/hash.h> 34#include <crypto/hash.h>
35#include <linux/falloc.h> 35#include <linux/falloc.h>
36#include <linux/percpu-rwsem.h>
36#ifdef __KERNEL__ 37#ifdef __KERNEL__
37#include <linux/compat.h> 38#include <linux/compat.h>
38#endif 39#endif
@@ -581,6 +582,9 @@ enum {
581#define EXT4_GET_BLOCKS_ZERO 0x0200 582#define EXT4_GET_BLOCKS_ZERO 0x0200
582#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ 583#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
583 EXT4_GET_BLOCKS_ZERO) 584 EXT4_GET_BLOCKS_ZERO)
585 /* Caller will submit data before dropping transaction handle. This
586 * allows jbd2 to avoid submitting data before commit. */
587#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
584 588
585/* 589/*
586 * The bit position of these flags must not overlap with any of the 590 * The bit position of these flags must not overlap with any of the
@@ -1505,6 +1509,9 @@ struct ext4_sb_info {
1505 struct ratelimit_state s_err_ratelimit_state; 1509 struct ratelimit_state s_err_ratelimit_state;
1506 struct ratelimit_state s_warning_ratelimit_state; 1510 struct ratelimit_state s_warning_ratelimit_state;
1507 struct ratelimit_state s_msg_ratelimit_state; 1511 struct ratelimit_state s_msg_ratelimit_state;
1512
1513 /* Barrier between changing inodes' journal flags and writepages ops. */
1514 struct percpu_rw_semaphore s_journal_flag_rwsem;
1508}; 1515};
1509 1516
1510static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1517static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1549,7 +1556,6 @@ enum {
1549 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read 1556 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
1550 nolocking */ 1557 nolocking */
1551 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ 1558 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
1552 EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
1553 EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ 1559 EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
1554}; 1560};
1555 1561
@@ -2521,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
2521struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2527struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
2522int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, 2528int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
2523 struct buffer_head *bh_result, int create); 2529 struct buffer_head *bh_result, int create);
2524int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 2530int ext4_dax_get_block(struct inode *inode, sector_t iblock,
2525 struct buffer_head *bh_result, int create); 2531 struct buffer_head *bh_result, int create);
2526int ext4_get_block(struct inode *inode, sector_t iblock, 2532int ext4_get_block(struct inode *inode, sector_t iblock,
2527 struct buffer_head *bh_result, int create); 2533 struct buffer_head *bh_result, int create);
2528int ext4_dio_get_block(struct inode *inode, sector_t iblock, 2534int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2581,7 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
2581/* indirect.c */ 2587/* indirect.c */
2582extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 2588extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
2583 struct ext4_map_blocks *map, int flags); 2589 struct ext4_map_blocks *map, int flags);
2584extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
2585extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2590extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
2586extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); 2591extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
2587extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2592extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -3329,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
3329 } 3334 }
3330} 3335}
3331 3336
3337static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
3338{
3339 int blksize = 1 << inode->i_blkbits;
3340
3341 return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
3342}
3343
3332#endif /* __KERNEL__ */ 3344#endif /* __KERNEL__ */
3333 3345
3334#define EFSBADCRC EBADMSG /* Bad CRC detected */ 3346#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5f5846211095..09c1ef38cbe6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -359,10 +359,21 @@ static inline int ext4_journal_force_commit(journal_t *journal)
359 return 0; 359 return 0;
360} 360}
361 361
362static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 362static inline int ext4_jbd2_inode_add_write(handle_t *handle,
363 struct inode *inode)
363{ 364{
364 if (ext4_handle_valid(handle)) 365 if (ext4_handle_valid(handle))
365 return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); 366 return jbd2_journal_inode_add_write(handle,
367 EXT4_I(inode)->jinode);
368 return 0;
369}
370
371static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
372 struct inode *inode)
373{
374 if (ext4_handle_valid(handle))
375 return jbd2_journal_inode_add_wait(handle,
376 EXT4_I(inode)->jinode);
366 return 0; 377 return 0;
367} 378}
368 379
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 95bf4679ac54..2a2eef9c14e4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
120 120
121 if (!ext4_handle_valid(handle)) 121 if (!ext4_handle_valid(handle))
122 return 0; 122 return 0;
123 if (handle->h_buffer_credits > needed) 123 if (handle->h_buffer_credits >= needed)
124 return 0; 124 return 0;
125 err = ext4_journal_extend(handle, needed); 125 /*
126 * If we need to extend the journal get a few extra blocks
127 * while we're at it for efficiency's sake.
128 */
129 needed += 3;
130 err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
126 if (err <= 0) 131 if (err <= 0)
127 return err; 132 return err;
128 err = ext4_truncate_restart_trans(handle, inode, needed); 133 err = ext4_truncate_restart_trans(handle, inode, needed);
@@ -907,13 +912,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
907 912
908 eh = ext_block_hdr(bh); 913 eh = ext_block_hdr(bh);
909 ppos++; 914 ppos++;
910 if (unlikely(ppos > depth)) {
911 put_bh(bh);
912 EXT4_ERROR_INODE(inode,
913 "ppos %d > depth %d", ppos, depth);
914 ret = -EFSCORRUPTED;
915 goto err;
916 }
917 path[ppos].p_bh = bh; 915 path[ppos].p_bh = bh;
918 path[ppos].p_hdr = eh; 916 path[ppos].p_hdr = eh;
919 } 917 }
@@ -2583,7 +2581,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2583 } 2581 }
2584 } else 2582 } else
2585 ext4_error(sbi->s_sb, "strange request: removal(2) " 2583 ext4_error(sbi->s_sb, "strange request: removal(2) "
2586 "%u-%u from %u:%u\n", 2584 "%u-%u from %u:%u",
2587 from, to, le32_to_cpu(ex->ee_block), ee_len); 2585 from, to, le32_to_cpu(ex->ee_block), ee_len);
2588 return 0; 2586 return 0;
2589} 2587}
@@ -3738,7 +3736,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3738 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3736 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3739#ifdef EXT4_DEBUG 3737#ifdef EXT4_DEBUG
3740 ext4_warning("Inode (%ld) finished: extent logical block %llu," 3738 ext4_warning("Inode (%ld) finished: extent logical block %llu,"
3741 " len %u; IO logical block %llu, len %u\n", 3739 " len %u; IO logical block %llu, len %u",
3742 inode->i_ino, (unsigned long long)ee_block, ee_len, 3740 inode->i_ino, (unsigned long long)ee_block, ee_len,
3743 (unsigned long long)map->m_lblk, map->m_len); 3741 (unsigned long long)map->m_lblk, map->m_len);
3744#endif 3742#endif
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e38b987ac7f5..37e059202cd2 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
707 (status & EXTENT_STATUS_WRITTEN)) { 707 (status & EXTENT_STATUS_WRITTEN)) {
708 ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " 708 ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
709 " delayed and written which can potentially " 709 " delayed and written which can potentially "
710 " cause data loss.\n", lblk, len); 710 " cause data loss.", lblk, len);
711 WARN_ON(1); 711 WARN_ON(1);
712 } 712 }
713 713
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 00ff6912adb3..d478110c32a6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
202 if (IS_ERR(handle)) 202 if (IS_ERR(handle))
203 result = VM_FAULT_SIGBUS; 203 result = VM_FAULT_SIGBUS;
204 else 204 else
205 result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); 205 result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL);
206 206
207 if (write) { 207 if (write) {
208 if (!IS_ERR(handle)) 208 if (!IS_ERR(handle))
@@ -238,7 +238,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
238 result = VM_FAULT_SIGBUS; 238 result = VM_FAULT_SIGBUS;
239 else 239 else
240 result = __dax_pmd_fault(vma, addr, pmd, flags, 240 result = __dax_pmd_fault(vma, addr, pmd, flags,
241 ext4_dax_mmap_get_block, NULL); 241 ext4_dax_get_block, NULL);
242 242
243 if (write) { 243 if (write) {
244 if (!IS_ERR(handle)) 244 if (!IS_ERR(handle))
@@ -373,7 +373,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
373 if (ext4_encrypted_inode(d_inode(dir)) && 373 if (ext4_encrypted_inode(d_inode(dir)) &&
374 !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) { 374 !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) {
375 ext4_warning(inode->i_sb, 375 ext4_warning(inode->i_sb,
376 "Inconsistent encryption contexts: %lu/%lu\n", 376 "Inconsistent encryption contexts: %lu/%lu",
377 (unsigned long) d_inode(dir)->i_ino, 377 (unsigned long) d_inode(dir)->i_ino,
378 (unsigned long) inode->i_ino); 378 (unsigned long) inode->i_ino);
379 dput(dir); 379 dput(dir);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 237b877d316d..3da4cf8d18b6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1150,25 +1150,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1150 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); 1150 unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
1151 ext4_group_t block_group; 1151 ext4_group_t block_group;
1152 int bit; 1152 int bit;
1153 struct buffer_head *bitmap_bh; 1153 struct buffer_head *bitmap_bh = NULL;
1154 struct inode *inode = NULL; 1154 struct inode *inode = NULL;
1155 long err = -EIO; 1155 int err = -EFSCORRUPTED;
1156 1156
1157 /* Error cases - e2fsck has already cleaned up for us */ 1157 if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
1158 if (ino > max_ino) { 1158 goto bad_orphan;
1159 ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
1160 err = -EFSCORRUPTED;
1161 goto error;
1162 }
1163 1159
1164 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); 1160 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
1165 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); 1161 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
1166 bitmap_bh = ext4_read_inode_bitmap(sb, block_group); 1162 bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
1167 if (IS_ERR(bitmap_bh)) { 1163 if (IS_ERR(bitmap_bh)) {
1168 err = PTR_ERR(bitmap_bh); 1164 ext4_error(sb, "inode bitmap error %ld for orphan %lu",
1169 ext4_warning(sb, "inode bitmap error %ld for orphan %lu", 1165 ino, PTR_ERR(bitmap_bh));
1170 ino, err); 1166 return (struct inode *) bitmap_bh;
1171 goto error;
1172 } 1167 }
1173 1168
1174 /* Having the inode bit set should be a 100% indicator that this 1169 /* Having the inode bit set should be a 100% indicator that this
@@ -1179,15 +1174,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1179 goto bad_orphan; 1174 goto bad_orphan;
1180 1175
1181 inode = ext4_iget(sb, ino); 1176 inode = ext4_iget(sb, ino);
1182 if (IS_ERR(inode)) 1177 if (IS_ERR(inode)) {
1183 goto iget_failed; 1178 err = PTR_ERR(inode);
1179 ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
1180 ino, err);
1181 return inode;
1182 }
1184 1183
1185 /* 1184 /*
1186 * If the orphans has i_nlinks > 0 then it should be able to be 1185 * If the orphans has i_nlinks > 0 then it should be able to
1187 * truncated, otherwise it won't be removed from the orphan list 1186 * be truncated, otherwise it won't be removed from the orphan
1188 * during processing and an infinite loop will result. 1187 * list during processing and an infinite loop will result.
1188 * Similarly, it must not be a bad inode.
1189 */ 1189 */
1190 if (inode->i_nlink && !ext4_can_truncate(inode)) 1190 if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
1191 is_bad_inode(inode))
1191 goto bad_orphan; 1192 goto bad_orphan;
1192 1193
1193 if (NEXT_ORPHAN(inode) > max_ino) 1194 if (NEXT_ORPHAN(inode) > max_ino)
@@ -1195,29 +1196,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
1195 brelse(bitmap_bh); 1196 brelse(bitmap_bh);
1196 return inode; 1197 return inode;
1197 1198
1198iget_failed:
1199 err = PTR_ERR(inode);
1200 inode = NULL;
1201bad_orphan: 1199bad_orphan:
1202 ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); 1200 ext4_error(sb, "bad orphan inode %lu", ino);
1203 printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", 1201 if (bitmap_bh)
1204 bit, (unsigned long long)bitmap_bh->b_blocknr, 1202 printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
1205 ext4_test_bit(bit, bitmap_bh->b_data)); 1203 bit, (unsigned long long)bitmap_bh->b_blocknr,
1206 printk(KERN_WARNING "inode=%p\n", inode); 1204 ext4_test_bit(bit, bitmap_bh->b_data));
1207 if (inode) { 1205 if (inode) {
1208 printk(KERN_WARNING "is_bad_inode(inode)=%d\n", 1206 printk(KERN_ERR "is_bad_inode(inode)=%d\n",
1209 is_bad_inode(inode)); 1207 is_bad_inode(inode));
1210 printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", 1208 printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
1211 NEXT_ORPHAN(inode)); 1209 NEXT_ORPHAN(inode));
1212 printk(KERN_WARNING "max_ino=%lu\n", max_ino); 1210 printk(KERN_ERR "max_ino=%lu\n", max_ino);
1213 printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); 1211 printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
1214 /* Avoid freeing blocks if we got a bad deleted inode */ 1212 /* Avoid freeing blocks if we got a bad deleted inode */
1215 if (inode->i_nlink == 0) 1213 if (inode->i_nlink == 0)
1216 inode->i_blocks = 0; 1214 inode->i_blocks = 0;
1217 iput(inode); 1215 iput(inode);
1218 } 1216 }
1219 brelse(bitmap_bh); 1217 brelse(bitmap_bh);
1220error:
1221 return ERR_PTR(err); 1218 return ERR_PTR(err);
1222} 1219}
1223 1220
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 627b7e8f9ef3..bc15c2c17633 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -649,133 +649,6 @@ out:
649} 649}
650 650
651/* 651/*
652 * O_DIRECT for ext3 (or indirect map) based files
653 *
654 * If the O_DIRECT write will extend the file then add this inode to the
655 * orphan list. So recovery will truncate it back to the original size
656 * if the machine crashes during the write.
657 *
658 * If the O_DIRECT write is intantiating holes inside i_size and the machine
659 * crashes then stale disk data _may_ be exposed inside the file. But current
660 * VFS code falls back into buffered path in that case so we are safe.
661 */
662ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
663{
664 struct file *file = iocb->ki_filp;
665 struct inode *inode = file->f_mapping->host;
666 struct ext4_inode_info *ei = EXT4_I(inode);
667 loff_t offset = iocb->ki_pos;
668 handle_t *handle;
669 ssize_t ret;
670 int orphan = 0;
671 size_t count = iov_iter_count(iter);
672 int retries = 0;
673
674 if (iov_iter_rw(iter) == WRITE) {
675 loff_t final_size = offset + count;
676
677 if (final_size > inode->i_size) {
678 /* Credits for sb + inode write */
679 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
680 if (IS_ERR(handle)) {
681 ret = PTR_ERR(handle);
682 goto out;
683 }
684 ret = ext4_orphan_add(handle, inode);
685 if (ret) {
686 ext4_journal_stop(handle);
687 goto out;
688 }
689 orphan = 1;
690 ei->i_disksize = inode->i_size;
691 ext4_journal_stop(handle);
692 }
693 }
694
695retry:
696 if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
697 /*
698 * Nolock dioread optimization may be dynamically disabled
699 * via ext4_inode_block_unlocked_dio(). Check inode's state
700 * while holding extra i_dio_count ref.
701 */
702 inode_dio_begin(inode);
703 smp_mb();
704 if (unlikely(ext4_test_inode_state(inode,
705 EXT4_STATE_DIOREAD_LOCK))) {
706 inode_dio_end(inode);
707 goto locked;
708 }
709 if (IS_DAX(inode))
710 ret = dax_do_io(iocb, inode, iter,
711 ext4_dio_get_block, NULL, 0);
712 else
713 ret = __blockdev_direct_IO(iocb, inode,
714 inode->i_sb->s_bdev, iter,
715 ext4_dio_get_block,
716 NULL, NULL, 0);
717 inode_dio_end(inode);
718 } else {
719locked:
720 if (IS_DAX(inode))
721 ret = dax_do_io(iocb, inode, iter,
722 ext4_dio_get_block, NULL, DIO_LOCKING);
723 else
724 ret = blockdev_direct_IO(iocb, inode, iter,
725 ext4_dio_get_block);
726
727 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
728 loff_t isize = i_size_read(inode);
729 loff_t end = offset + count;
730
731 if (end > isize)
732 ext4_truncate_failed_write(inode);
733 }
734 }
735 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
736 goto retry;
737
738 if (orphan) {
739 int err;
740
741 /* Credits for sb + inode write */
742 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
743 if (IS_ERR(handle)) {
744 /* This is really bad luck. We've written the data
745 * but cannot extend i_size. Bail out and pretend
746 * the write failed... */
747 ret = PTR_ERR(handle);
748 if (inode->i_nlink)
749 ext4_orphan_del(NULL, inode);
750
751 goto out;
752 }
753 if (inode->i_nlink)
754 ext4_orphan_del(handle, inode);
755 if (ret > 0) {
756 loff_t end = offset + ret;
757 if (end > inode->i_size) {
758 ei->i_disksize = end;
759 i_size_write(inode, end);
760 /*
761 * We're going to return a positive `ret'
762 * here due to non-zero-length I/O, so there's
763 * no way of reporting error returns from
764 * ext4_mark_inode_dirty() to userspace. So
765 * ignore it.
766 */
767 ext4_mark_inode_dirty(handle, inode);
768 }
769 }
770 err = ext4_journal_stop(handle);
771 if (ret == 0)
772 ret = err;
773 }
774out:
775 return ret;
776}
777
778/*
779 * Calculate the number of metadata blocks need to reserve 652 * Calculate the number of metadata blocks need to reserve
780 * to allocate a new block at @lblocks for non extent file based file 653 * to allocate a new block at @lblocks for non extent file based file
781 */ 654 */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 7bc6c855cc18..ff7538c26992 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1780,7 +1780,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data)
1780 ext4_warning(dir->i_sb, 1780 ext4_warning(dir->i_sb,
1781 "bad inline directory (dir #%lu) - " 1781 "bad inline directory (dir #%lu) - "
1782 "inode %u, rec_len %u, name_len %d" 1782 "inode %u, rec_len %u, name_len %d"
1783 "inline size %d\n", 1783 "inline size %d",
1784 dir->i_ino, le32_to_cpu(de->inode), 1784 dir->i_ino, le32_to_cpu(de->inode),
1785 le16_to_cpu(de->rec_len), de->name_len, 1785 le16_to_cpu(de->rec_len), de->name_len,
1786 inline_size); 1786 inline_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 79b298d397b4..f7140ca66e3b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -684,6 +684,24 @@ out_sem:
684 ret = check_block_validity(inode, map); 684 ret = check_block_validity(inode, map);
685 if (ret != 0) 685 if (ret != 0)
686 return ret; 686 return ret;
687
688 /*
689 * Inodes with freshly allocated blocks where contents will be
690 * visible after transaction commit must be on transaction's
691 * ordered data list.
692 */
693 if (map->m_flags & EXT4_MAP_NEW &&
694 !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
695 !(flags & EXT4_GET_BLOCKS_ZERO) &&
696 !IS_NOQUOTA(inode) &&
697 ext4_should_order_data(inode)) {
698 if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
699 ret = ext4_jbd2_inode_add_wait(handle, inode);
700 else
701 ret = ext4_jbd2_inode_add_write(handle, inode);
702 if (ret)
703 return ret;
704 }
687 } 705 }
688 return retval; 706 return retval;
689} 707}
@@ -1289,15 +1307,6 @@ static int ext4_write_end(struct file *file,
1289 int i_size_changed = 0; 1307 int i_size_changed = 0;
1290 1308
1291 trace_ext4_write_end(inode, pos, len, copied); 1309 trace_ext4_write_end(inode, pos, len, copied);
1292 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
1293 ret = ext4_jbd2_file_inode(handle, inode);
1294 if (ret) {
1295 unlock_page(page);
1296 put_page(page);
1297 goto errout;
1298 }
1299 }
1300
1301 if (ext4_has_inline_data(inode)) { 1310 if (ext4_has_inline_data(inode)) {
1302 ret = ext4_write_inline_data_end(inode, pos, len, 1311 ret = ext4_write_inline_data_end(inode, pos, len,
1303 copied, page); 1312 copied, page);
@@ -2313,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2313 * the data was copied into the page cache. 2322 * the data was copied into the page cache.
2314 */ 2323 */
2315 get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 2324 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2316 EXT4_GET_BLOCKS_METADATA_NOFAIL; 2325 EXT4_GET_BLOCKS_METADATA_NOFAIL |
2326 EXT4_GET_BLOCKS_IO_SUBMIT;
2317 dioread_nolock = ext4_should_dioread_nolock(inode); 2327 dioread_nolock = ext4_should_dioread_nolock(inode);
2318 if (dioread_nolock) 2328 if (dioread_nolock)
2319 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2329 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2602,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
2602 struct blk_plug plug; 2612 struct blk_plug plug;
2603 bool give_up_on_write = false; 2613 bool give_up_on_write = false;
2604 2614
2615 percpu_down_read(&sbi->s_journal_flag_rwsem);
2605 trace_ext4_writepages(inode, wbc); 2616 trace_ext4_writepages(inode, wbc);
2606 2617
2607 if (dax_mapping(mapping)) 2618 if (dax_mapping(mapping)) {
2608 return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, 2619 ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
2609 wbc); 2620 wbc);
2621 goto out_writepages;
2622 }
2610 2623
2611 /* 2624 /*
2612 * No pages to write? This is mainly a kludge to avoid starting 2625 * No pages to write? This is mainly a kludge to avoid starting
@@ -2776,6 +2789,7 @@ retry:
2776out_writepages: 2789out_writepages:
2777 trace_ext4_writepages_result(inode, wbc, ret, 2790 trace_ext4_writepages_result(inode, wbc, ret,
2778 nr_to_write - wbc->nr_to_write); 2791 nr_to_write - wbc->nr_to_write);
2792 percpu_up_read(&sbi->s_journal_flag_rwsem);
2779 return ret; 2793 return ret;
2780} 2794}
2781 2795
@@ -3215,75 +3229,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3215} 3229}
3216 3230
3217#ifdef CONFIG_FS_DAX 3231#ifdef CONFIG_FS_DAX
3218int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3232/*
3219 struct buffer_head *bh_result, int create) 3233 * Get block function for DAX IO and mmap faults. It takes care of converting
3234 * unwritten extents to written ones and initializes new / converted blocks
3235 * to zeros.
3236 */
3237int ext4_dax_get_block(struct inode *inode, sector_t iblock,
3238 struct buffer_head *bh_result, int create)
3220{ 3239{
3221 int ret, err; 3240 int ret;
3222 int credits;
3223 struct ext4_map_blocks map;
3224 handle_t *handle = NULL;
3225 int flags = 0;
3226
3227 ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
3228 inode->i_ino, create);
3229 map.m_lblk = iblock;
3230 map.m_len = bh_result->b_size >> inode->i_blkbits;
3231 credits = ext4_chunk_trans_blocks(inode, map.m_len);
3232 if (create) {
3233 flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
3234 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
3235 if (IS_ERR(handle)) {
3236 ret = PTR_ERR(handle);
3237 return ret;
3238 }
3239 }
3240 3241
3241 ret = ext4_map_blocks(handle, inode, &map, flags); 3242 ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
3242 if (create) { 3243 if (!create)
3243 err = ext4_journal_stop(handle); 3244 return _ext4_get_block(inode, iblock, bh_result, 0);
3244 if (ret >= 0 && err < 0)
3245 ret = err;
3246 }
3247 if (ret <= 0)
3248 goto out;
3249 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
3250 int err2;
3251 3245
3252 /* 3246 ret = ext4_get_block_trans(inode, iblock, bh_result,
3253 * We are protected by i_mmap_sem so we know block cannot go 3247 EXT4_GET_BLOCKS_PRE_IO |
3254 * away from under us even though we dropped i_data_sem. 3248 EXT4_GET_BLOCKS_CREATE_ZERO);
3255 * Convert extent to written and write zeros there. 3249 if (ret < 0)
3256 * 3250 return ret;
3257 * Note: We may get here even when create == 0.
3258 */
3259 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
3260 if (IS_ERR(handle)) {
3261 ret = PTR_ERR(handle);
3262 goto out;
3263 }
3264 3251
3265 err = ext4_map_blocks(handle, inode, &map, 3252 if (buffer_unwritten(bh_result)) {
3266 EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
3267 if (err < 0)
3268 ret = err;
3269 err2 = ext4_journal_stop(handle);
3270 if (err2 < 0 && ret > 0)
3271 ret = err2;
3272 }
3273out:
3274 WARN_ON_ONCE(ret == 0 && create);
3275 if (ret > 0) {
3276 map_bh(bh_result, inode->i_sb, map.m_pblk);
3277 /* 3253 /*
3278 * At least for now we have to clear BH_New so that DAX code 3254 * We are protected by i_mmap_sem or i_mutex so we know block
3279 * doesn't attempt to zero blocks again in a racy way. 3255 * cannot go away from under us even though we dropped
3256 * i_data_sem. Convert extent to written and write zeros there.
3280 */ 3257 */
3281 map.m_flags &= ~EXT4_MAP_NEW; 3258 ret = ext4_get_block_trans(inode, iblock, bh_result,
3282 ext4_update_bh_state(bh_result, map.m_flags); 3259 EXT4_GET_BLOCKS_CONVERT |
3283 bh_result->b_size = map.m_len << inode->i_blkbits; 3260 EXT4_GET_BLOCKS_CREATE_ZERO);
3284 ret = 0; 3261 if (ret < 0)
3262 return ret;
3285 } 3263 }
3286 return ret; 3264 /*
3265 * At least for now we have to clear BH_New so that DAX code
3266 * doesn't attempt to zero blocks again in a racy way.
3267 */
3268 clear_buffer_new(bh_result);
3269 return 0;
3270}
3271#else
3272/* Just define empty function, it will never get called. */
3273int ext4_dax_get_block(struct inode *inode, sector_t iblock,
3274 struct buffer_head *bh_result, int create)
3275{
3276 BUG();
3277 return 0;
3287} 3278}
3288#endif 3279#endif
3289 3280
@@ -3316,7 +3307,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3316} 3307}
3317 3308
3318/* 3309/*
3319 * For ext4 extent files, ext4 will do direct-io write to holes, 3310 * Handling of direct IO writes.
3311 *
3312 * For ext4 extent files, ext4 will do direct-io write even to holes,
3320 * preallocated extents, and those writes that extend the file, no need to 3313 * preallocated extents, and those writes that extend the file, no need to
3321 * fall back to buffered IO. 3314 * fall back to buffered IO.
3322 * 3315 *
@@ -3334,10 +3327,11 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3334 * if the machine crashes during the write. 3327 * if the machine crashes during the write.
3335 * 3328 *
3336 */ 3329 */
3337static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 3330static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
3338{ 3331{
3339 struct file *file = iocb->ki_filp; 3332 struct file *file = iocb->ki_filp;
3340 struct inode *inode = file->f_mapping->host; 3333 struct inode *inode = file->f_mapping->host;
3334 struct ext4_inode_info *ei = EXT4_I(inode);
3341 ssize_t ret; 3335 ssize_t ret;
3342 loff_t offset = iocb->ki_pos; 3336 loff_t offset = iocb->ki_pos;
3343 size_t count = iov_iter_count(iter); 3337 size_t count = iov_iter_count(iter);
@@ -3345,10 +3339,25 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3345 get_block_t *get_block_func = NULL; 3339 get_block_t *get_block_func = NULL;
3346 int dio_flags = 0; 3340 int dio_flags = 0;
3347 loff_t final_size = offset + count; 3341 loff_t final_size = offset + count;
3342 int orphan = 0;
3343 handle_t *handle;
3348 3344
3349 /* Use the old path for reads and writes beyond i_size. */ 3345 if (final_size > inode->i_size) {
3350 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) 3346 /* Credits for sb + inode write */
3351 return ext4_ind_direct_IO(iocb, iter); 3347 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3348 if (IS_ERR(handle)) {
3349 ret = PTR_ERR(handle);
3350 goto out;
3351 }
3352 ret = ext4_orphan_add(handle, inode);
3353 if (ret) {
3354 ext4_journal_stop(handle);
3355 goto out;
3356 }
3357 orphan = 1;
3358 ei->i_disksize = inode->i_size;
3359 ext4_journal_stop(handle);
3360 }
3352 3361
3353 BUG_ON(iocb->private == NULL); 3362 BUG_ON(iocb->private == NULL);
3354 3363
@@ -3357,8 +3366,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3357 * conversion. This also disallows race between truncate() and 3366 * conversion. This also disallows race between truncate() and
3358 * overwrite DIO as i_dio_count needs to be incremented under i_mutex. 3367 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3359 */ 3368 */
3360 if (iov_iter_rw(iter) == WRITE) 3369 inode_dio_begin(inode);
3361 inode_dio_begin(inode);
3362 3370
3363 /* If we do an overwrite dio, i_mutex locking can be released */ 3371 /* If we do an overwrite dio, i_mutex locking can be released */
3364 overwrite = *((int *)iocb->private); 3372 overwrite = *((int *)iocb->private);
@@ -3367,7 +3375,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3367 inode_unlock(inode); 3375 inode_unlock(inode);
3368 3376
3369 /* 3377 /*
3370 * We could direct write to holes and fallocate. 3378 * For extent mapped files we could direct write to holes and fallocate.
3371 * 3379 *
3372 * Allocated blocks to fill the hole are marked as unwritten to prevent 3380 * Allocated blocks to fill the hole are marked as unwritten to prevent
3373 * parallel buffered read to expose the stale data before DIO complete 3381 * parallel buffered read to expose the stale data before DIO complete
@@ -3389,7 +3397,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3389 iocb->private = NULL; 3397 iocb->private = NULL;
3390 if (overwrite) 3398 if (overwrite)
3391 get_block_func = ext4_dio_get_block_overwrite; 3399 get_block_func = ext4_dio_get_block_overwrite;
3392 else if (is_sync_kiocb(iocb)) { 3400 else if (IS_DAX(inode)) {
3401 /*
3402 * We can avoid zeroing for aligned DAX writes beyond EOF. Other
3403 * writes need zeroing either because they can race with page
3404 * faults or because they use partial blocks.
3405 */
3406 if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
3407 ext4_aligned_io(inode, offset, count))
3408 get_block_func = ext4_dio_get_block;
3409 else
3410 get_block_func = ext4_dax_get_block;
3411 dio_flags = DIO_LOCKING;
3412 } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
3413 round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
3414 get_block_func = ext4_dio_get_block;
3415 dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
3416 } else if (is_sync_kiocb(iocb)) {
3393 get_block_func = ext4_dio_get_block_unwritten_sync; 3417 get_block_func = ext4_dio_get_block_unwritten_sync;
3394 dio_flags = DIO_LOCKING; 3418 dio_flags = DIO_LOCKING;
3395 } else { 3419 } else {
@@ -3399,10 +3423,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3399#ifdef CONFIG_EXT4_FS_ENCRYPTION 3423#ifdef CONFIG_EXT4_FS_ENCRYPTION
3400 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); 3424 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
3401#endif 3425#endif
3402 if (IS_DAX(inode)) 3426 if (IS_DAX(inode)) {
3403 ret = dax_do_io(iocb, inode, iter, get_block_func, 3427 ret = dax_do_io(iocb, inode, iter, get_block_func,
3404 ext4_end_io_dio, dio_flags); 3428 ext4_end_io_dio, dio_flags);
3405 else 3429 } else
3406 ret = __blockdev_direct_IO(iocb, inode, 3430 ret = __blockdev_direct_IO(iocb, inode,
3407 inode->i_sb->s_bdev, iter, 3431 inode->i_sb->s_bdev, iter,
3408 get_block_func, 3432 get_block_func,
@@ -3422,12 +3446,86 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3422 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3446 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3423 } 3447 }
3424 3448
3425 if (iov_iter_rw(iter) == WRITE) 3449 inode_dio_end(inode);
3426 inode_dio_end(inode);
3427 /* take i_mutex locking again if we do an overwrite dio */ 3450 /* take i_mutex locking again if we do an overwrite dio */
3428 if (overwrite) 3451 if (overwrite)
3429 inode_lock(inode); 3452 inode_lock(inode);
3430 3453
3454 if (ret < 0 && final_size > inode->i_size)
3455 ext4_truncate_failed_write(inode);
3456
3457 /* Handle extending of i_size after direct IO write */
3458 if (orphan) {
3459 int err;
3460
3461 /* Credits for sb + inode write */
3462 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3463 if (IS_ERR(handle)) {
3464 /* This is really bad luck. We've written the data
3465 * but cannot extend i_size. Bail out and pretend
3466 * the write failed... */
3467 ret = PTR_ERR(handle);
3468 if (inode->i_nlink)
3469 ext4_orphan_del(NULL, inode);
3470
3471 goto out;
3472 }
3473 if (inode->i_nlink)
3474 ext4_orphan_del(handle, inode);
3475 if (ret > 0) {
3476 loff_t end = offset + ret;
3477 if (end > inode->i_size) {
3478 ei->i_disksize = end;
3479 i_size_write(inode, end);
3480 /*
3481 * We're going to return a positive `ret'
3482 * here due to non-zero-length I/O, so there's
3483 * no way of reporting error returns from
3484 * ext4_mark_inode_dirty() to userspace. So
3485 * ignore it.
3486 */
3487 ext4_mark_inode_dirty(handle, inode);
3488 }
3489 }
3490 err = ext4_journal_stop(handle);
3491 if (ret == 0)
3492 ret = err;
3493 }
3494out:
3495 return ret;
3496}
3497
3498static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
3499{
3500 int unlocked = 0;
3501 struct inode *inode = iocb->ki_filp->f_mapping->host;
3502 ssize_t ret;
3503
3504 if (ext4_should_dioread_nolock(inode)) {
3505 /*
3506 * Nolock dioread optimization may be dynamically disabled
3507 * via ext4_inode_block_unlocked_dio(). Check inode's state
3508 * while holding extra i_dio_count ref.
3509 */
3510 inode_dio_begin(inode);
3511 smp_mb();
3512 if (unlikely(ext4_test_inode_state(inode,
3513 EXT4_STATE_DIOREAD_LOCK)))
3514 inode_dio_end(inode);
3515 else
3516 unlocked = 1;
3517 }
3518 if (IS_DAX(inode)) {
3519 ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block,
3520 NULL, unlocked ? 0 : DIO_LOCKING);
3521 } else {
3522 ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
3523 iter, ext4_dio_get_block,
3524 NULL, NULL,
3525 unlocked ? 0 : DIO_LOCKING);
3526 }
3527 if (unlocked)
3528 inode_dio_end(inode);
3431 return ret; 3529 return ret;
3432} 3530}
3433 3531
@@ -3455,10 +3553,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
3455 return 0; 3553 return 0;
3456 3554
3457 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); 3555 trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
3458 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3556 if (iov_iter_rw(iter) == READ)
3459 ret = ext4_ext_direct_IO(iocb, iter); 3557 ret = ext4_direct_IO_read(iocb, iter);
3460 else 3558 else
3461 ret = ext4_ind_direct_IO(iocb, iter); 3559 ret = ext4_direct_IO_write(iocb, iter);
3462 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); 3560 trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
3463 return ret; 3561 return ret;
3464} 3562}
@@ -3534,10 +3632,7 @@ void ext4_set_aops(struct inode *inode)
3534{ 3632{
3535 switch (ext4_inode_journal_mode(inode)) { 3633 switch (ext4_inode_journal_mode(inode)) {
3536 case EXT4_INODE_ORDERED_DATA_MODE: 3634 case EXT4_INODE_ORDERED_DATA_MODE:
3537 ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
3538 break;
3539 case EXT4_INODE_WRITEBACK_DATA_MODE: 3635 case EXT4_INODE_WRITEBACK_DATA_MODE:
3540 ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
3541 break; 3636 break;
3542 case EXT4_INODE_JOURNAL_DATA_MODE: 3637 case EXT4_INODE_JOURNAL_DATA_MODE:
3543 inode->i_mapping->a_ops = &ext4_journalled_aops; 3638 inode->i_mapping->a_ops = &ext4_journalled_aops;
@@ -3630,8 +3725,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
3630 } else { 3725 } else {
3631 err = 0; 3726 err = 0;
3632 mark_buffer_dirty(bh); 3727 mark_buffer_dirty(bh);
3633 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) 3728 if (ext4_should_order_data(inode))
3634 err = ext4_jbd2_file_inode(handle, inode); 3729 err = ext4_jbd2_inode_add_write(handle, inode);
3635 } 3730 }
3636 3731
3637unlock: 3732unlock:
@@ -5429,6 +5524,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5429 journal_t *journal; 5524 journal_t *journal;
5430 handle_t *handle; 5525 handle_t *handle;
5431 int err; 5526 int err;
5527 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5432 5528
5433 /* 5529 /*
5434 * We have to be very careful here: changing a data block's 5530 * We have to be very careful here: changing a data block's
@@ -5445,22 +5541,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5445 return 0; 5541 return 0;
5446 if (is_journal_aborted(journal)) 5542 if (is_journal_aborted(journal))
5447 return -EROFS; 5543 return -EROFS;
5448 /* We have to allocate physical blocks for delalloc blocks
5449 * before flushing journal. otherwise delalloc blocks can not
5450 * be allocated any more. even more truncate on delalloc blocks
5451 * could trigger BUG by flushing delalloc blocks in journal.
5452 * There is no delalloc block in non-journal data mode.
5453 */
5454 if (val && test_opt(inode->i_sb, DELALLOC)) {
5455 err = ext4_alloc_da_blocks(inode);
5456 if (err < 0)
5457 return err;
5458 }
5459 5544
5460 /* Wait for all existing dio workers */ 5545 /* Wait for all existing dio workers */
5461 ext4_inode_block_unlocked_dio(inode); 5546 ext4_inode_block_unlocked_dio(inode);
5462 inode_dio_wait(inode); 5547 inode_dio_wait(inode);
5463 5548
5549 /*
5550 * Before flushing the journal and switching inode's aops, we have
5551 * to flush all dirty data the inode has. There can be outstanding
5552 * delayed allocations, there can be unwritten extents created by
5553 * fallocate or buffered writes in dioread_nolock mode covered by
5554 * dirty data which can be converted only after flushing the dirty
5555 * data (and journalled aops don't know how to handle these cases).
5556 */
5557 if (val) {
5558 down_write(&EXT4_I(inode)->i_mmap_sem);
5559 err = filemap_write_and_wait(inode->i_mapping);
5560 if (err < 0) {
5561 up_write(&EXT4_I(inode)->i_mmap_sem);
5562 ext4_inode_resume_unlocked_dio(inode);
5563 return err;
5564 }
5565 }
5566
5567 percpu_down_write(&sbi->s_journal_flag_rwsem);
5464 jbd2_journal_lock_updates(journal); 5568 jbd2_journal_lock_updates(journal);
5465 5569
5466 /* 5570 /*
@@ -5477,6 +5581,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5477 err = jbd2_journal_flush(journal); 5581 err = jbd2_journal_flush(journal);
5478 if (err < 0) { 5582 if (err < 0) {
5479 jbd2_journal_unlock_updates(journal); 5583 jbd2_journal_unlock_updates(journal);
5584 percpu_up_write(&sbi->s_journal_flag_rwsem);
5480 ext4_inode_resume_unlocked_dio(inode); 5585 ext4_inode_resume_unlocked_dio(inode);
5481 return err; 5586 return err;
5482 } 5587 }
@@ -5485,6 +5590,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5485 ext4_set_aops(inode); 5590 ext4_set_aops(inode);
5486 5591
5487 jbd2_journal_unlock_updates(journal); 5592 jbd2_journal_unlock_updates(journal);
5593 percpu_up_write(&sbi->s_journal_flag_rwsem);
5594
5595 if (val)
5596 up_write(&EXT4_I(inode)->i_mmap_sem);
5488 ext4_inode_resume_unlocked_dio(inode); 5597 ext4_inode_resume_unlocked_dio(inode);
5489 5598
5490 /* Finally we can mark the inode as dirty. */ 5599 /* Finally we can mark the inode as dirty. */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7497f50cb293..28cc412852af 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -365,7 +365,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
365 struct dquot *transfer_to[MAXQUOTAS] = { }; 365 struct dquot *transfer_to[MAXQUOTAS] = { };
366 366
367 transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); 367 transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
368 if (transfer_to[PRJQUOTA]) { 368 if (!IS_ERR(transfer_to[PRJQUOTA])) {
369 err = __dquot_transfer(inode, transfer_to); 369 err = __dquot_transfer(inode, transfer_to);
370 dqput(transfer_to[PRJQUOTA]); 370 dqput(transfer_to[PRJQUOTA]);
371 if (err) 371 if (err)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index eeeade76012e..c1ab3ec30423 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1266,6 +1266,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1266static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1266static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1267{ 1267{
1268 int order = 1; 1268 int order = 1;
1269 int bb_incr = 1 << (e4b->bd_blkbits - 1);
1269 void *bb; 1270 void *bb;
1270 1271
1271 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); 1272 BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
@@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1278 /* this block is part of buddy of order 'order' */ 1279 /* this block is part of buddy of order 'order' */
1279 return order; 1280 return order;
1280 } 1281 }
1281 bb += 1 << (e4b->bd_blkbits - order); 1282 bb += bb_incr;
1283 bb_incr >>= 1;
1282 order++; 1284 order++;
1283 } 1285 }
1284 return 0; 1286 return 0;
@@ -2583,7 +2585,7 @@ int ext4_mb_init(struct super_block *sb)
2583{ 2585{
2584 struct ext4_sb_info *sbi = EXT4_SB(sb); 2586 struct ext4_sb_info *sbi = EXT4_SB(sb);
2585 unsigned i, j; 2587 unsigned i, j;
2586 unsigned offset; 2588 unsigned offset, offset_incr;
2587 unsigned max; 2589 unsigned max;
2588 int ret; 2590 int ret;
2589 2591
@@ -2612,11 +2614,13 @@ int ext4_mb_init(struct super_block *sb)
2612 2614
2613 i = 1; 2615 i = 1;
2614 offset = 0; 2616 offset = 0;
2617 offset_incr = 1 << (sb->s_blocksize_bits - 1);
2615 max = sb->s_blocksize << 2; 2618 max = sb->s_blocksize << 2;
2616 do { 2619 do {
2617 sbi->s_mb_offsets[i] = offset; 2620 sbi->s_mb_offsets[i] = offset;
2618 sbi->s_mb_maxs[i] = max; 2621 sbi->s_mb_maxs[i] = max;
2619 offset += 1 << (sb->s_blocksize_bits - i); 2622 offset += offset_incr;
2623 offset_incr = offset_incr >> 1;
2620 max = max >> 1; 2624 max = max >> 1;
2621 i++; 2625 i++;
2622 } while (i <= sb->s_blocksize_bits + 1); 2626 } while (i <= sb->s_blocksize_bits + 1);
@@ -4935,7 +4939,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4935 * boundary. 4939 * boundary.
4936 */ 4940 */
4937 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4941 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4938 ext4_warning(sb, "too much blocks added to group %u\n", 4942 ext4_warning(sb, "too much blocks added to group %u",
4939 block_group); 4943 block_group);
4940 err = -EINVAL; 4944 err = -EINVAL;
4941 goto error_return; 4945 goto error_return;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 24445275d330..23d436d6f8b8 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
121 __ext4_warning(sb, function, line, "%s", msg); 121 __ext4_warning(sb, function, line, "%s", msg);
122 __ext4_warning(sb, function, line, 122 __ext4_warning(sb, function, line,
123 "MMP failure info: last update time: %llu, last update " 123 "MMP failure info: last update time: %llu, last update "
124 "node: %s, last update device: %s\n", 124 "node: %s, last update device: %s",
125 (long long unsigned int) le64_to_cpu(mmp->mmp_time), 125 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
126 mmp->mmp_nodename, mmp->mmp_bdevname); 126 mmp->mmp_nodename, mmp->mmp_bdevname);
127} 127}
@@ -353,7 +353,7 @@ skip:
353 * wait for MMP interval and check mmp_seq. 353 * wait for MMP interval and check mmp_seq.
354 */ 354 */
355 if (schedule_timeout_interruptible(HZ * wait_time) != 0) { 355 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
356 ext4_warning(sb, "MMP startup interrupted, failing mount\n"); 356 ext4_warning(sb, "MMP startup interrupted, failing mount");
357 goto failed; 357 goto failed;
358 } 358 }
359 359
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 325cef48b39a..a920c5d29fac 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -400,7 +400,7 @@ data_copy:
400 400
401 /* Even in case of data=writeback it is reasonable to pin 401 /* Even in case of data=writeback it is reasonable to pin
402 * inode to transaction, to prevent unexpected data loss */ 402 * inode to transaction, to prevent unexpected data loss */
403 *err = ext4_jbd2_file_inode(handle, orig_inode); 403 *err = ext4_jbd2_inode_add_write(handle, orig_inode);
404 404
405unlock_pages: 405unlock_pages:
406 unlock_page(pagep[0]); 406 unlock_page(pagep[0]);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5611ec9348d7..ec4c39952e84 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1107,6 +1107,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
1107 } 1107 }
1108 1108
1109 while (1) { 1109 while (1) {
1110 if (fatal_signal_pending(current)) {
1111 err = -ERESTARTSYS;
1112 goto errout;
1113 }
1114 cond_resched();
1110 block = dx_get_block(frame->at); 1115 block = dx_get_block(frame->at);
1111 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, 1116 ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
1112 start_hash, start_minor_hash); 1117 start_hash, start_minor_hash);
@@ -1613,7 +1618,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1613 if (nokey) 1618 if (nokey)
1614 return ERR_PTR(-ENOKEY); 1619 return ERR_PTR(-ENOKEY);
1615 ext4_warning(inode->i_sb, 1620 ext4_warning(inode->i_sb,
1616 "Inconsistent encryption contexts: %lu/%lu\n", 1621 "Inconsistent encryption contexts: %lu/%lu",
1617 (unsigned long) dir->i_ino, 1622 (unsigned long) dir->i_ino,
1618 (unsigned long) inode->i_ino); 1623 (unsigned long) inode->i_ino);
1619 return ERR_PTR(-EPERM); 1624 return ERR_PTR(-EPERM);
@@ -2828,7 +2833,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2828 * list entries can cause panics at unmount time. 2833 * list entries can cause panics at unmount time.
2829 */ 2834 */
2830 mutex_lock(&sbi->s_orphan_lock); 2835 mutex_lock(&sbi->s_orphan_lock);
2831 list_del(&EXT4_I(inode)->i_orphan); 2836 list_del_init(&EXT4_I(inode)->i_orphan);
2832 mutex_unlock(&sbi->s_orphan_lock); 2837 mutex_unlock(&sbi->s_orphan_lock);
2833 } 2838 }
2834 } 2839 }
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index e4fc8ea45d78..2a01df9cc1c3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -342,9 +342,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
342 if (bio) { 342 if (bio) {
343 int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? 343 int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ?
344 WRITE_SYNC : WRITE; 344 WRITE_SYNC : WRITE;
345 bio_get(io->io_bio);
346 submit_bio(io_op, io->io_bio); 345 submit_bio(io_op, io->io_bio);
347 bio_put(io->io_bio);
348 } 346 }
349 io->io_bio = NULL; 347 io->io_bio = NULL;
350} 348}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 34038e3598d5..cf681004b196 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb)
41 */ 41 */
42 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 42 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
43 ext4_warning(sb, "There are errors in the filesystem, " 43 ext4_warning(sb, "There are errors in the filesystem, "
44 "so online resizing is not allowed\n"); 44 "so online resizing is not allowed");
45 return -EPERM; 45 return -EPERM;
46 } 46 }
47 47
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 304c712dbe12..20c5d52253b4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
859 percpu_counter_destroy(&sbi->s_freeinodes_counter); 859 percpu_counter_destroy(&sbi->s_freeinodes_counter);
860 percpu_counter_destroy(&sbi->s_dirs_counter); 860 percpu_counter_destroy(&sbi->s_dirs_counter);
861 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 861 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
862 percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
862 brelse(sbi->s_sbh); 863 brelse(sbi->s_sbh);
863#ifdef CONFIG_QUOTA 864#ifdef CONFIG_QUOTA
864 for (i = 0; i < EXT4_MAXQUOTAS; i++) 865 for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3930,6 +3931,9 @@ no_journal:
3930 if (!err) 3931 if (!err)
3931 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 3932 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
3932 GFP_KERNEL); 3933 GFP_KERNEL);
3934 if (!err)
3935 err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
3936
3933 if (err) { 3937 if (err) {
3934 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3938 ext4_msg(sb, KERN_ERR, "insufficient memory");
3935 goto failed_mount6; 3939 goto failed_mount6;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 2ad98d6e19f4..70078096117d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal,
219 219
220 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
221 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 221 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
222 if (!(jinode->i_flags & JI_WRITE_DATA))
223 continue;
222 mapping = jinode->i_vfs_inode->i_mapping; 224 mapping = jinode->i_vfs_inode->i_mapping;
223 jinode->i_flags |= JI_COMMIT_RUNNING; 225 jinode->i_flags |= JI_COMMIT_RUNNING;
224 spin_unlock(&journal->j_list_lock); 226 spin_unlock(&journal->j_list_lock);
@@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
256 /* For locking, see the comment in journal_submit_data_buffers() */ 258 /* For locking, see the comment in journal_submit_data_buffers() */
257 spin_lock(&journal->j_list_lock); 259 spin_lock(&journal->j_list_lock);
258 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 if (!(jinode->i_flags & JI_WAIT_DATA))
262 continue;
259 jinode->i_flags |= JI_COMMIT_RUNNING; 263 jinode->i_flags |= JI_COMMIT_RUNNING;
260 spin_unlock(&journal->j_list_lock); 264 spin_unlock(&journal->j_list_lock);
261 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 265 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 435f0b26ac20..b31852f76f46 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
94EXPORT_SYMBOL(jbd2_journal_invalidatepage); 94EXPORT_SYMBOL(jbd2_journal_invalidatepage);
95EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 95EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
96EXPORT_SYMBOL(jbd2_journal_force_commit); 96EXPORT_SYMBOL(jbd2_journal_force_commit);
97EXPORT_SYMBOL(jbd2_journal_file_inode); 97EXPORT_SYMBOL(jbd2_journal_inode_add_write);
98EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
98EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 99EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
99EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 100EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
100EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 101EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2c56c3e32194..1749519b362f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2462/* 2462/*
2463 * File inode in the inode list of the handle's transaction 2463 * File inode in the inode list of the handle's transaction
2464 */ 2464 */
2465int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) 2465static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
2466 unsigned long flags)
2466{ 2467{
2467 transaction_t *transaction = handle->h_transaction; 2468 transaction_t *transaction = handle->h_transaction;
2468 journal_t *journal; 2469 journal_t *journal;
@@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2487 * and if jinode->i_next_transaction == transaction, commit code 2488 * and if jinode->i_next_transaction == transaction, commit code
2488 * will only file the inode where we want it. 2489 * will only file the inode where we want it.
2489 */ 2490 */
2490 if (jinode->i_transaction == transaction || 2491 if ((jinode->i_transaction == transaction ||
2491 jinode->i_next_transaction == transaction) 2492 jinode->i_next_transaction == transaction) &&
2493 (jinode->i_flags & flags) == flags)
2492 return 0; 2494 return 0;
2493 2495
2494 spin_lock(&journal->j_list_lock); 2496 spin_lock(&journal->j_list_lock);
2495 2497 jinode->i_flags |= flags;
2498 /* Is inode already attached where we need it? */
2496 if (jinode->i_transaction == transaction || 2499 if (jinode->i_transaction == transaction ||
2497 jinode->i_next_transaction == transaction) 2500 jinode->i_next_transaction == transaction)
2498 goto done; 2501 goto done;
@@ -2523,6 +2526,17 @@ done:
2523 return 0; 2526 return 0;
2524} 2527}
2525 2528
2529int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
2530{
2531 return jbd2_journal_file_inode(handle, jinode,
2532 JI_WRITE_DATA | JI_WAIT_DATA);
2533}
2534
2535int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
2536{
2537 return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA);
2538}
2539
2526/* 2540/*
2527 * File truncate and transaction commit interact with each other in a 2541 * File truncate and transaction commit interact with each other in a
2528 * non-trivial way. If a transaction writing data block A is 2542 * non-trivial way. If a transaction writing data block A is
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index f4cd3c3e9fb7..497a4171ef61 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
619 619
620static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) 620static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
621{ 621{
622 return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); 622 return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
623} 623}
624 624
625static inline int ocfs2_begin_ordered_truncate(struct inode *inode, 625static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/fs/readdir.c b/fs/readdir.c
index a86c6c04b9bc..68ef06efe6bc 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -182,6 +182,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
182 } 182 }
183 dirent = buf->previous; 183 dirent = buf->previous;
184 if (dirent) { 184 if (dirent) {
185 if (signal_pending(current))
186 return -EINTR;
185 if (__put_user(offset, &dirent->d_off)) 187 if (__put_user(offset, &dirent->d_off))
186 goto efault; 188 goto efault;
187 } 189 }
@@ -261,6 +263,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
261 return -EINVAL; 263 return -EINVAL;
262 dirent = buf->previous; 264 dirent = buf->previous;
263 if (dirent) { 265 if (dirent) {
266 if (signal_pending(current))
267 return -EINTR;
264 if (__put_user(offset, &dirent->d_off)) 268 if (__put_user(offset, &dirent->d_off))
265 goto efault; 269 goto efault;
266 } 270 }
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index fd1083c46c61..efb232c5f668 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -403,11 +403,19 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
403 403
404/* Flags in jbd_inode->i_flags */ 404/* Flags in jbd_inode->i_flags */
405#define __JI_COMMIT_RUNNING 0 405#define __JI_COMMIT_RUNNING 0
406/* Commit of the inode data in progress. We use this flag to protect us from 406#define __JI_WRITE_DATA 1
407#define __JI_WAIT_DATA 2
408
409/*
410 * Commit of the inode data in progress. We use this flag to protect us from
407 * concurrent deletion of inode. We cannot use reference to inode for this 411 * concurrent deletion of inode. We cannot use reference to inode for this
408 * since we cannot afford doing last iput() on behalf of kjournald 412 * since we cannot afford doing last iput() on behalf of kjournald
409 */ 413 */
410#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) 414#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
415/* Write allocated dirty buffers in this inode before commit */
416#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
417/* Wait for outstanding data writes for this inode before commit */
418#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)
411 419
412/** 420/**
413 * struct jbd_inode is the structure linking inodes in ordered mode 421 * struct jbd_inode is the structure linking inodes in ordered mode
@@ -781,9 +789,6 @@ jbd2_time_diff(unsigned long start, unsigned long end)
781 * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the 789 * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the
782 * number that will fit in j_blocksize 790 * number that will fit in j_blocksize
783 * @j_last_sync_writer: most recent pid which did a synchronous write 791 * @j_last_sync_writer: most recent pid which did a synchronous write
784 * @j_history: Buffer storing the transactions statistics history
785 * @j_history_max: Maximum number of transactions in the statistics history
786 * @j_history_cur: Current number of transactions in the statistics history
787 * @j_history_lock: Protect the transactions statistics history 792 * @j_history_lock: Protect the transactions statistics history
788 * @j_proc_entry: procfs entry for the jbd statistics directory 793 * @j_proc_entry: procfs entry for the jbd statistics directory
789 * @j_stats: Overall statistics 794 * @j_stats: Overall statistics
@@ -1270,7 +1275,8 @@ extern int jbd2_journal_clear_err (journal_t *);
1270extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); 1275extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
1271extern int jbd2_journal_force_commit(journal_t *); 1276extern int jbd2_journal_force_commit(journal_t *);
1272extern int jbd2_journal_force_commit_nested(journal_t *); 1277extern int jbd2_journal_force_commit_nested(journal_t *);
1273extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); 1278extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
1279extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
1274extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, 1280extern int jbd2_journal_begin_ordered_truncate(journal_t *journal,
1275 struct jbd2_inode *inode, loff_t new_size); 1281 struct jbd2_inode *inode, loff_t new_size);
1276extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); 1282extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0bb311c..bec0b647f9cc 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
37 free_percpu(brw->fast_read_ctr); 37 free_percpu(brw->fast_read_ctr);
38 brw->fast_read_ctr = NULL; /* catch use after free bugs */ 38 brw->fast_read_ctr = NULL; /* catch use after free bugs */
39} 39}
40EXPORT_SYMBOL_GPL(percpu_free_rwsem);
40 41
41/* 42/*
42 * This is the fast-path for down_read/up_read. If it succeeds we rely 43 * This is the fast-path for down_read/up_read. If it succeeds we rely