aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jan Kara <jack@suse.cz> 2008-02-10 01:08:38 -0500
committer Theodore Ts'o <tytso@mit.edu> 2008-02-10 01:08:38 -0500
commit 7fb5409df092589b86cc9412d926879cb572b7f0 (patch)
tree 8201f2fc124d34098776799f8cec89a8f8b4f8bb
parent 8009f9fb3067fef6c2ca0c16f6bac786ae28639d (diff)
ext4: Fix Direct I/O locking
We cannot start a transaction in ext4_direct_IO() and just let it last during the whole write, because dio_get_page() acquires mmap_sem, which ranks above transaction start (e.g. because we have the dependency chain mmap_sem->PageLock->journal_start, or because we update atime while holding mmap_sem), and thus deadlocks could happen. We solve the problem by starting a transaction separately for each ext4_get_block() call.

We *could* have a problem where we allocate a block and, before its data are written out, the machine crashes and thus we expose stale data. But that does not happen, because for hole-filling the generic code falls back to buffered writes, and for file extension we add the inode to the orphan list, so that in case of a crash, journal replay will truncate the inode back to its original size.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- fs/ext4/inode.c 107
1 files changed, 53 insertions, 54 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bbfabf876e78..7dd9b50d5ebc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -892,7 +892,16 @@ out:
892 return err; 892 return err;
893} 893}
894 894
895#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) 895/* Maximum number of blocks we map for direct IO at once. */
896#define DIO_MAX_BLOCKS 4096
897/*
898 * Number of credits we need for writing DIO_MAX_BLOCKS:
899 * We need sb + group descriptor + bitmap + inode -> 4
900 * For B blocks with A block pointers per block we need:
901 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
902 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
903 */
904#define DIO_CREDITS 25
896 905
897int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 906int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
898 unsigned long max_blocks, struct buffer_head *bh, 907 unsigned long max_blocks, struct buffer_head *bh,
@@ -939,49 +948,31 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
939 struct buffer_head *bh_result, int create) 948 struct buffer_head *bh_result, int create)
940{ 949{
941 handle_t *handle = ext4_journal_current_handle(); 950 handle_t *handle = ext4_journal_current_handle();
942 int ret = 0; 951 int ret = 0, started = 0;
943 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 952 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
944 953
945 if (!create) 954 if (create && !handle) {
946 goto get_block; /* A read */ 955 /* Direct IO write... */
947 956 if (max_blocks > DIO_MAX_BLOCKS)
948 if (max_blocks == 1) 957 max_blocks = DIO_MAX_BLOCKS;
949 goto get_block; /* A single block get */ 958 handle = ext4_journal_start(inode, DIO_CREDITS +
950 959 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
951 if (handle->h_transaction->t_state == T_LOCKED) { 960 if (IS_ERR(handle)) {
952 /*
953 * Huge direct-io writes can hold off commits for long
954 * periods of time. Let this commit run.
955 */
956 ext4_journal_stop(handle);
957 handle = ext4_journal_start(inode, DIO_CREDITS);
958 if (IS_ERR(handle))
959 ret = PTR_ERR(handle); 961 ret = PTR_ERR(handle);
960 goto get_block; 962 goto out;
961 }
962
963 if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
964 /*
965 * Getting low on buffer credits...
966 */
967 ret = ext4_journal_extend(handle, DIO_CREDITS);
968 if (ret > 0) {
969 /*
970 * Couldn't extend the transaction. Start a new one.
971 */
972 ret = ext4_journal_restart(handle, DIO_CREDITS);
973 } 963 }
964 started = 1;
974 } 965 }
975 966
976get_block: 967 ret = ext4_get_blocks_wrap(handle, inode, iblock,
977 if (ret == 0) {
978 ret = ext4_get_blocks_wrap(handle, inode, iblock,
979 max_blocks, bh_result, create, 0); 968 max_blocks, bh_result, create, 0);
980 if (ret > 0) { 969 if (ret > 0) {
981 bh_result->b_size = (ret << inode->i_blkbits); 970 bh_result->b_size = (ret << inode->i_blkbits);
982 ret = 0; 971 ret = 0;
983 }
984 } 972 }
973 if (started)
974 ext4_journal_stop(handle);
975out:
985 return ret; 976 return ret;
986} 977}
987 978
@@ -1671,7 +1662,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
1671 * if the machine crashes during the write. 1662 * if the machine crashes during the write.
1672 * 1663 *
1673 * If the O_DIRECT write is intantiating holes inside i_size and the machine 1664 * If the O_DIRECT write is intantiating holes inside i_size and the machine
1674 * crashes then stale disk data _may_ be exposed inside the file. 1665 * crashes then stale disk data _may_ be exposed inside the file. But current
1666 * VFS code falls back into buffered path in that case so we are safe.
1675 */ 1667 */
1676static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 1668static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1677 const struct iovec *iov, loff_t offset, 1669 const struct iovec *iov, loff_t offset,
@@ -1680,7 +1672,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1680 struct file *file = iocb->ki_filp; 1672 struct file *file = iocb->ki_filp;
1681 struct inode *inode = file->f_mapping->host; 1673 struct inode *inode = file->f_mapping->host;
1682 struct ext4_inode_info *ei = EXT4_I(inode); 1674 struct ext4_inode_info *ei = EXT4_I(inode);
1683 handle_t *handle = NULL; 1675 handle_t *handle;
1684 ssize_t ret; 1676 ssize_t ret;
1685 int orphan = 0; 1677 int orphan = 0;
1686 size_t count = iov_length(iov, nr_segs); 1678 size_t count = iov_length(iov, nr_segs);
@@ -1688,17 +1680,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1688 if (rw == WRITE) { 1680 if (rw == WRITE) {
1689 loff_t final_size = offset + count; 1681 loff_t final_size = offset + count;
1690 1682
1691 handle = ext4_journal_start(inode, DIO_CREDITS);
1692 if (IS_ERR(handle)) {
1693 ret = PTR_ERR(handle);
1694 goto out;
1695 }
1696 if (final_size > inode->i_size) { 1683 if (final_size > inode->i_size) {
1684 /* Credits for sb + inode write */
1685 handle = ext4_journal_start(inode, 2);
1686 if (IS_ERR(handle)) {
1687 ret = PTR_ERR(handle);
1688 goto out;
1689 }
1697 ret = ext4_orphan_add(handle, inode); 1690 ret = ext4_orphan_add(handle, inode);
1698 if (ret) 1691 if (ret) {
1699 goto out_stop; 1692 ext4_journal_stop(handle);
1693 goto out;
1694 }
1700 orphan = 1; 1695 orphan = 1;
1701 ei->i_disksize = inode->i_size; 1696 ei->i_disksize = inode->i_size;
1697 ext4_journal_stop(handle);
1702 } 1698 }
1703 } 1699 }
1704 1700
@@ -1706,18 +1702,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1706 offset, nr_segs, 1702 offset, nr_segs,
1707 ext4_get_block, NULL); 1703 ext4_get_block, NULL);
1708 1704
1709 /* 1705 if (orphan) {
1710 * Reacquire the handle: ext4_get_block() can restart the transaction
1711 */
1712 handle = ext4_journal_current_handle();
1713
1714out_stop:
1715 if (handle) {
1716 int err; 1706 int err;
1717 1707
1718 if (orphan && inode->i_nlink) 1708 /* Credits for sb + inode write */
1709 handle = ext4_journal_start(inode, 2);
1710 if (IS_ERR(handle)) {
1711 /* This is really bad luck. We've written the data
1712 * but cannot extend i_size. Bail out and pretend
1713 * the write failed... */
1714 ret = PTR_ERR(handle);
1715 goto out;
1716 }
1717 if (inode->i_nlink)
1719 ext4_orphan_del(handle, inode); 1718 ext4_orphan_del(handle, inode);
1720 if (orphan && ret > 0) { 1719 if (ret > 0) {
1721 loff_t end = offset + ret; 1720 loff_t end = offset + ret;
1722 if (end > inode->i_size) { 1721 if (end > inode->i_size) {
1723 ei->i_disksize = end; 1722 ei->i_disksize = end;