ext4: Fix Direct I/O locking

We cannot start transaction in ext4_direct_IO() and just let it last during the whole write because dio_get_page() acquires mmap_sem which ranks above transaction start (e.g. because we have dependency chain mmap_sem->PageLock->journal_start, or because we update atime while holding mmap_sem) and thus deadlocks could happen.

We solve the problem by starting a transaction separately for each ext4_get_block() call.

We *could* have a problem that we allocate a block and before its data are written out the machine crashes and thus we expose stale data. But that does not happen because for hole-filling generic code falls back to buffered writes and for file extension, we add inode to orphan list and thus in case of crash, journal replay will truncate inode back to the original size.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
author: Jan Kara <jack@suse.cz> 2008-02-10 01:08:38 -0500
committer: Theodore Ts'o <tytso@mit.edu> 2008-02-10 01:08:38 -0500
commit: 7fb5409df092589b86cc9412d926879cb572b7f0 (patch)
tree: 8201f2fc124d34098776799f8cec89a8f8b4f8bb /fs/ext4
parent: 8009f9fb3067fef6c2ca0c16f6bac786ae28639d (diff)
1 files changed, 53 insertions, 54 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bbfabf876e78..7dd9b50d5ebc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -892,7 +892,16 @@ out:
        return err;
 }
-#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+/*
+ * Number of credits we need for writing DIO_MAX_BLOCKS:
+ * We need sb + group descriptor + bitmap + inode -> 4
+ * For B blocks with A block pointers per block we need:
+ * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
+ * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ */
+#define DIO_CREDITS 25
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                        unsigned long max_blocks, struct buffer_head *bh,
@@ -939,49 +948,31 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
 {
        handle_t *handle = ext4_journal_current_handle();
-        int ret = 0;
+        int ret = 0, started = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-        if (!create)
+        if (create && !handle) {
-                goto get_block;         /* A read */
+                /* Direct IO write... */
+                if (max_blocks > DIO_MAX_BLOCKS)
-        if (max_blocks == 1)
+                        max_blocks = DIO_MAX_BLOCKS;
-                goto get_block;         /* A single block get */
+                handle = ext4_journal_start(inode, DIO_CREDITS +
+                              2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
-        if (handle->h_transaction->t_state == T_LOCKED) {
+                if (IS_ERR(handle)) {
-                /*
-                 * Huge direct-io writes can hold off commits for long
-                 * periods of time.  Let this commit run.
-                 */
-                ext4_journal_stop(handle);
-                handle = ext4_journal_start(inode, DIO_CREDITS);
-                if (IS_ERR(handle))
                        ret = PTR_ERR(handle);
-                goto get_block;
+                        goto out;
-        }
-        if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
-                /*
-                 * Getting low on buffer credits...
-                 */
-                ret = ext4_journal_extend(handle, DIO_CREDITS);
-                if (ret > 0) {
-                        /*
-                         * Couldn't extend the transaction.  Start a new one.
-                         */
-                        ret = ext4_journal_restart(handle, DIO_CREDITS);
                }
+                started = 1;
        }
-get_block:
+        ret = ext4_get_blocks_wrap(handle, inode, iblock,
-        if (ret == 0) {
-                ret = ext4_get_blocks_wrap(handle, inode, iblock,
                                        max_blocks, bh_result, create, 0);
-                if (ret > 0) {
+        if (ret > 0) {
-                        bh_result->b_size = (ret << inode->i_blkbits);
+                bh_result->b_size = (ret << inode->i_blkbits);
-                        ret = 0;
+                ret = 0;
-                }
        }
+        if (started)
+                ext4_journal_stop(handle);
+out:
        return ret;
 }
@@ -1671,7 +1662,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file.
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
 */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
@@ -1680,7 +1672,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct ext4_inode_info *ei = EXT4_I(inode);
-        handle_t *handle = NULL;
+        handle_t *handle;
        ssize_t ret;
        int orphan = 0;
        size_t count = iov_length(iov, nr_segs);
@@ -1688,17 +1680,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        if (rw == WRITE) {
                loff_t final_size = offset + count;
-                handle = ext4_journal_start(inode, DIO_CREDITS);
-                if (IS_ERR(handle)) {
-                        ret = PTR_ERR(handle);
-                        goto out;
-                }
                if (final_size > inode->i_size) {
+                        /* Credits for sb + inode write */
+                        handle = ext4_journal_start(inode, 2);
+                        if (IS_ERR(handle)) {
+                                ret = PTR_ERR(handle);
+                                goto out;
+                        }
                        ret = ext4_orphan_add(handle, inode);
-                        if (ret)
+                        if (ret) {
-                                goto out_stop;
+                                ext4_journal_stop(handle);
+                                goto out;
+                        }
                        orphan = 1;
                        ei->i_disksize = inode->i_size;
+                        ext4_journal_stop(handle);
                }
        }
@@ -1706,18 +1702,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
                                 offset, nr_segs,
                                 ext4_get_block, NULL);
-        /*
+        if (orphan) {
-         * Reacquire the handle: ext4_get_block() can restart the transaction
-         */
-        handle = ext4_journal_current_handle();
-out_stop:
-        if (handle) {
                int err;
-                if (orphan && inode->i_nlink)
+                /* Credits for sb + inode write */
+                handle = ext4_journal_start(inode, 2);
+                if (IS_ERR(handle)) {
+                        /* This is really bad luck. We've written the data
+                         * but cannot extend i_size. Bail out and pretend
+                         * the write failed... */
+                        ret = PTR_ERR(handle);
+                        goto out;
+                }
+                if (inode->i_nlink)
                        ext4_orphan_del(handle, inode);
-                if (orphan && ret > 0) {
+                if (ret > 0) {
                        loff_t end = offset + ret;
                        if (end > inode->i_size) {
                                ei->i_disksize = end;
author	Jan Kara <jack@suse.cz>	2008-02-10 01:08:38 -0500
committer	Theodore Ts'o <tytso@mit.edu>	2008-02-10 01:08:38 -0500
commit	7fb5409df092589b86cc9412d926879cb572b7f0 (patch)
tree	8201f2fc124d34098776799f8cec89a8f8b4f8bb /fs/ext4
parent	8009f9fb3067fef6c2ca0c16f6bac786ae28639d (diff)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bbfabf876e78..7dd9b50d5ebc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -892,7 +892,16 @@ out:
892	return err;	892	return err;
893	}	893	}
894		894
895	#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)	895	/* Maximum number of blocks we map for direct IO at once. */
		896	#define DIO_MAX_BLOCKS 4096
		897	/*
		898	* Number of credits we need for writing DIO_MAX_BLOCKS:
		899	* We need sb + group descriptor + bitmap + inode -> 4
		900	* For B blocks with A block pointers per block we need:
		901	* 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
		902	* If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
		903	*/
		904	#define DIO_CREDITS 25
896		905
897	int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,	906	int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
898	unsigned long max_blocks, struct buffer_head *bh,	907	unsigned long max_blocks, struct buffer_head *bh,
@@ -939,49 +948,31 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
939	struct buffer_head *bh_result, int create)	948	struct buffer_head *bh_result, int create)
940	{	949	{
941	handle_t *handle = ext4_journal_current_handle();	950	handle_t *handle = ext4_journal_current_handle();
942	int ret = 0;	951	int ret = 0, started = 0;
943	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;	952	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
944		953
945	if (!create)	954	if (create && !handle) {
946	goto get_block; /* A read */	955	/* Direct IO write... */
947		956	if (max_blocks > DIO_MAX_BLOCKS)
948	if (max_blocks == 1)	957	max_blocks = DIO_MAX_BLOCKS;
949	goto get_block; /* A single block get */	958	handle = ext4_journal_start(inode, DIO_CREDITS +
950		959	2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
951	if (handle->h_transaction->t_state == T_LOCKED) {	960	if (IS_ERR(handle)) {
952	/*
953	* Huge direct-io writes can hold off commits for long
954	* periods of time. Let this commit run.
955	*/
956	ext4_journal_stop(handle);
957	handle = ext4_journal_start(inode, DIO_CREDITS);
958	if (IS_ERR(handle))
959	ret = PTR_ERR(handle);	961	ret = PTR_ERR(handle);
960	goto get_block;	962	goto out;
961	}
962
963	if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
964	/*
965	* Getting low on buffer credits...
966	*/
967	ret = ext4_journal_extend(handle, DIO_CREDITS);
968	if (ret > 0) {
969	/*
970	* Couldn't extend the transaction. Start a new one.
971	*/
972	ret = ext4_journal_restart(handle, DIO_CREDITS);
973	}	963	}
		964	started = 1;
974	}	965	}
975		966
976	get_block:	967	ret = ext4_get_blocks_wrap(handle, inode, iblock,
977	if (ret == 0) {
978	ret = ext4_get_blocks_wrap(handle, inode, iblock,
979	max_blocks, bh_result, create, 0);	968	max_blocks, bh_result, create, 0);
980	if (ret > 0) {	969	if (ret > 0) {
981	bh_result->b_size = (ret << inode->i_blkbits);	970	bh_result->b_size = (ret << inode->i_blkbits);
982	ret = 0;	971	ret = 0;
983	}
984	}	972	}
		973	if (started)
		974	ext4_journal_stop(handle);
		975	out:
985	return ret;	976	return ret;
986	}	977	}
987		978
@@ -1671,7 +1662,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
1671	* if the machine crashes during the write.	1662	* if the machine crashes during the write.
1672	*	1663	*
1673	* If the O_DIRECT write is intantiating holes inside i_size and the machine	1664	* If the O_DIRECT write is intantiating holes inside i_size and the machine
1674	* crashes then stale disk data _may_ be exposed inside the file.	1665	* crashes then stale disk data _may_ be exposed inside the file. But current
		1666	* VFS code falls back into buffered path in that case so we are safe.
1675	*/	1667	*/
1676	static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,	1668	static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1677	const struct iovec *iov, loff_t offset,	1669	const struct iovec *iov, loff_t offset,
@@ -1680,7 +1672,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1680	struct file *file = iocb->ki_filp;	1672	struct file *file = iocb->ki_filp;
1681	struct inode *inode = file->f_mapping->host;	1673	struct inode *inode = file->f_mapping->host;
1682	struct ext4_inode_info *ei = EXT4_I(inode);	1674	struct ext4_inode_info *ei = EXT4_I(inode);
1683	handle_t *handle = NULL;	1675	handle_t *handle;
1684	ssize_t ret;	1676	ssize_t ret;
1685	int orphan = 0;	1677	int orphan = 0;
1686	size_t count = iov_length(iov, nr_segs);	1678	size_t count = iov_length(iov, nr_segs);
@@ -1688,17 +1680,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1688	if (rw == WRITE) {	1680	if (rw == WRITE) {
1689	loff_t final_size = offset + count;	1681	loff_t final_size = offset + count;
1690		1682
1691	handle = ext4_journal_start(inode, DIO_CREDITS);
1692	if (IS_ERR(handle)) {
1693	ret = PTR_ERR(handle);
1694	goto out;
1695	}
1696	if (final_size > inode->i_size) {	1683	if (final_size > inode->i_size) {
		1684	/* Credits for sb + inode write */
		1685	handle = ext4_journal_start(inode, 2);
		1686	if (IS_ERR(handle)) {
		1687	ret = PTR_ERR(handle);
		1688	goto out;
		1689	}
1697	ret = ext4_orphan_add(handle, inode);	1690	ret = ext4_orphan_add(handle, inode);
1698	if (ret)	1691	if (ret) {
1699	goto out_stop;	1692	ext4_journal_stop(handle);
		1693	goto out;
		1694	}
1700	orphan = 1;	1695	orphan = 1;
1701	ei->i_disksize = inode->i_size;	1696	ei->i_disksize = inode->i_size;
		1697	ext4_journal_stop(handle);
1702	}	1698	}
1703	}	1699	}
1704		1700
@@ -1706,18 +1702,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
1706	offset, nr_segs,	1702	offset, nr_segs,
1707	ext4_get_block, NULL);	1703	ext4_get_block, NULL);
1708		1704
1709	/*	1705	if (orphan) {
1710	* Reacquire the handle: ext4_get_block() can restart the transaction
1711	*/
1712	handle = ext4_journal_current_handle();
1713
1714	out_stop:
1715	if (handle) {
1716	int err;	1706	int err;
1717		1707
1718	if (orphan && inode->i_nlink)	1708	/* Credits for sb + inode write */
		1709	handle = ext4_journal_start(inode, 2);
		1710	if (IS_ERR(handle)) {
		1711	/* This is really bad luck. We've written the data
		1712	* but cannot extend i_size. Bail out and pretend
		1713	* the write failed... */
		1714	ret = PTR_ERR(handle);
		1715	goto out;
		1716	}
		1717	if (inode->i_nlink)
1719	ext4_orphan_del(handle, inode);	1718	ext4_orphan_del(handle, inode);
1720	if (orphan && ret > 0) {	1719	if (ret > 0) {
1721	loff_t end = offset + ret;	1720	loff_t end = offset + ret;
1722	if (end > inode->i_size) {	1721	if (end > inode->i_size) {
1723	ei->i_disksize = end;	1722	ei->i_disksize = end;