about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jan Kara <jack@suse.cz> 2008-02-06 04:40:21 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-06 13:41:21 -0500
commit bd1939de9061dbc5cac44ffb4425aaf4c9b894f1 (patch)
tree 99cc44f2b92db9dba1391c81638f7755603c0199
parent d8fd66aaea7fe3e4f1ea044a563f129e3b9f05ff (diff)
ext3: fix lock inversion in direct IO
We cannot start transaction in ext3_direct_IO() and just let it last during the whole write because dio_get_page() acquires mmap_sem which ranks above transaction start (e.g. because we have dependency chain mmap_sem->PageLock->journal_start, or because we update atime while holding mmap_sem) and thus deadlocks could happen.

We solve the problem by starting a transaction separately for each ext3_get_block() call.

We *could* have a problem that we allocate a block and before its data are written out the machine crashes and thus we expose stale data. But that does not happen because for hole-filling generic code falls back to buffered writes and for file extension, we add inode to orphan list and thus in case of crash, journal replay will truncate inode back to the original size.

[akpm@linux-foundation.org: build fix]
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: <linux-ext4@vger.kernel.org>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/ext3/inode.c 106
1 files changed, 52 insertions, 54 deletions
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index a4f2d673d382..8a9ce2d09bde 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -939,55 +939,45 @@ out:
939 return err; 939 return err;
940} 940}
941 941
942#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32) 942/* Maximum number of blocks we map for direct IO at once. */
943#define DIO_MAX_BLOCKS 4096
944/*
945 * Number of credits we need for writing DIO_MAX_BLOCKS:
946 * We need sb + group descriptor + bitmap + inode -> 4
947 * For B blocks with A block pointers per block we need:
948 * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
949 * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
950 */
951#define DIO_CREDITS 25
943 952
944static int ext3_get_block(struct inode *inode, sector_t iblock, 953static int ext3_get_block(struct inode *inode, sector_t iblock,
945 struct buffer_head *bh_result, int create) 954 struct buffer_head *bh_result, int create)
946{ 955{
947 handle_t *handle = ext3_journal_current_handle(); 956 handle_t *handle = ext3_journal_current_handle();
948 int ret = 0; 957 int ret = 0, started = 0;
949 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 958 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
950 959
951 if (!create) 960 if (create && !handle) { /* Direct IO write... */
952 goto get_block; /* A read */ 961 if (max_blocks > DIO_MAX_BLOCKS)
953 962 max_blocks = DIO_MAX_BLOCKS;
954 if (max_blocks == 1) 963 handle = ext3_journal_start(inode, DIO_CREDITS +
955 goto get_block; /* A single block get */ 964 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb));
956 965 if (IS_ERR(handle)) {
957 if (handle->h_transaction->t_state == T_LOCKED) {
958 /*
959 * Huge direct-io writes can hold off commits for long
960 * periods of time. Let this commit run.
961 */
962 ext3_journal_stop(handle);
963 handle = ext3_journal_start(inode, DIO_CREDITS);
964 if (IS_ERR(handle))
965 ret = PTR_ERR(handle); 966 ret = PTR_ERR(handle);
966 goto get_block; 967 goto out;
967 }
968
969 if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
970 /*
971 * Getting low on buffer credits...
972 */
973 ret = ext3_journal_extend(handle, DIO_CREDITS);
974 if (ret > 0) {
975 /*
976 * Couldn't extend the transaction. Start a new one.
977 */
978 ret = ext3_journal_restart(handle, DIO_CREDITS);
979 } 968 }
969 started = 1;
980 } 970 }
981 971
982get_block: 972 ret = ext3_get_blocks_handle(handle, inode, iblock,
983 if (ret == 0) {
984 ret = ext3_get_blocks_handle(handle, inode, iblock,
985 max_blocks, bh_result, create, 0); 973 max_blocks, bh_result, create, 0);
986 if (ret > 0) { 974 if (ret > 0) {
987 bh_result->b_size = (ret << inode->i_blkbits); 975 bh_result->b_size = (ret << inode->i_blkbits);
988 ret = 0; 976 ret = 0;
989 }
990 } 977 }
978 if (started)
979 ext3_journal_stop(handle);
980out:
991 return ret; 981 return ret;
992} 982}
993 983
@@ -1678,7 +1668,8 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
1678 * if the machine crashes during the write. 1668 * if the machine crashes during the write.
1679 * 1669 *
1680 * If the O_DIRECT write is intantiating holes inside i_size and the machine 1670 * If the O_DIRECT write is intantiating holes inside i_size and the machine
1681 * crashes then stale disk data _may_ be exposed inside the file. 1671 * crashes then stale disk data _may_ be exposed inside the file. But current
1672 * VFS code falls back into buffered path in that case so we are safe.
1682 */ 1673 */
1683static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, 1674static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1684 const struct iovec *iov, loff_t offset, 1675 const struct iovec *iov, loff_t offset,
@@ -1687,7 +1678,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1687 struct file *file = iocb->ki_filp; 1678 struct file *file = iocb->ki_filp;
1688 struct inode *inode = file->f_mapping->host; 1679 struct inode *inode = file->f_mapping->host;
1689 struct ext3_inode_info *ei = EXT3_I(inode); 1680 struct ext3_inode_info *ei = EXT3_I(inode);
1690 handle_t *handle = NULL; 1681 handle_t *handle;
1691 ssize_t ret; 1682 ssize_t ret;
1692 int orphan = 0; 1683 int orphan = 0;
1693 size_t count = iov_length(iov, nr_segs); 1684 size_t count = iov_length(iov, nr_segs);
@@ -1695,17 +1686,21 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1695 if (rw == WRITE) { 1686 if (rw == WRITE) {
1696 loff_t final_size = offset + count; 1687 loff_t final_size = offset + count;
1697 1688
1698 handle = ext3_journal_start(inode, DIO_CREDITS);
1699 if (IS_ERR(handle)) {
1700 ret = PTR_ERR(handle);
1701 goto out;
1702 }
1703 if (final_size > inode->i_size) { 1689 if (final_size > inode->i_size) {
1690 /* Credits for sb + inode write */
1691 handle = ext3_journal_start(inode, 2);
1692 if (IS_ERR(handle)) {
1693 ret = PTR_ERR(handle);
1694 goto out;
1695 }
1704 ret = ext3_orphan_add(handle, inode); 1696 ret = ext3_orphan_add(handle, inode);
1705 if (ret) 1697 if (ret) {
1706 goto out_stop; 1698 ext3_journal_stop(handle);
1699 goto out;
1700 }
1707 orphan = 1; 1701 orphan = 1;
1708 ei->i_disksize = inode->i_size; 1702 ei->i_disksize = inode->i_size;
1703 ext3_journal_stop(handle);
1709 } 1704 }
1710 } 1705 }
1711 1706
@@ -1713,18 +1708,21 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1713 offset, nr_segs, 1708 offset, nr_segs,
1714 ext3_get_block, NULL); 1709 ext3_get_block, NULL);
1715 1710
1716 /* 1711 if (orphan) {
1717 * Reacquire the handle: ext3_get_block() can restart the transaction
1718 */
1719 handle = ext3_journal_current_handle();
1720
1721out_stop:
1722 if (handle) {
1723 int err; 1712 int err;
1724 1713
1725 if (orphan && inode->i_nlink) 1714 /* Credits for sb + inode write */
1715 handle = ext3_journal_start(inode, 2);
1716 if (IS_ERR(handle)) {
1717 /* This is really bad luck. We've written the data
1718 * but cannot extend i_size. Bail out and pretend
1719 * the write failed... */
1720 ret = PTR_ERR(handle);
1721 goto out;
1722 }
1723 if (inode->i_nlink)
1726 ext3_orphan_del(handle, inode); 1724 ext3_orphan_del(handle, inode);
1727 if (orphan && ret > 0) { 1725 if (ret > 0) {
1728 loff_t end = offset + ret; 1726 loff_t end = offset + ret;
1729 if (end > inode->i_size) { 1727 if (end > inode->i_size) {
1730 ei->i_disksize = end; 1728 ei->i_disksize = end;