aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDmitry Monakhov <dmonakhov@openvz.org>2012-09-26 12:52:07 -0400
committerTheodore Ts'o <tytso@mit.edu>2012-09-26 12:52:07 -0400
commitbb5574880574fea38c674942cf0360253a2d60fe (patch)
treeb343530fbe7e35ea88ea93b1f4d56be7506135d7 /fs
parentf066055a3449f0e5b0ae4f3ceab4445bead47638 (diff)
ext4: clean up online defrag bugs in move_extent_per_page()
Non-exhaustive list of bugs:
1) The uninitialized-extent optimization does not hold the page's lock and simply replaces branches; after that the writeback code goes crazy because the block mapping changed under its feet: kernel BUG at fs/ext4/inode.c:1434! (xfstests test 288)
2) An uninitialized extent may become initialized right after we drop i_data_sem, so the extent state must be rechecked.
3) Locked pages go uptodate via the following sequence: ->readpage(page); lock_page(page); use_that_page(page). But after readpage() someone may invalidate the page because it is uptodate and unlocked (the reclaimer does that). As a result: kernel BUG at include/linux/buffer_head.c:133!
4) We call write_begin() with an already-open transaction, which results in the following deadlock: ->move_extent_per_page() ->ext4_journal_start() -> holds journal transaction ->write_begin() ->ext4_da_write_begin() ->ext4_nonda_switch() ->writeback_inodes_sb_if_idle() --> will wait for journal_stop()
5) try_to_release_page() may fail, and it does fail if one of the page's buffer heads is pinned by the journal.
6) If we are about to change a page's mapping we MUST hold its lock during the entire remapping procedure; this is true for both pages (original and donor).
Fixes:
- Avoid (1) and (2) simply by temporarily dropping the uninitialized-extent handling optimization; this will be reimplemented later.
- Fix (3) by manually forcing the page to the uptodate state without dropping its lock.
- Fix (4) by rearranging the existing locking: from journal_start(); ->write_begin() to write_begin(); journal_extend().
- Fix (5) simply by checking the return value.
- Fix (6) by locking both pages (original and donor) during the extent swap, with the help of mext_page_double_lock().
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r--fs/ext4/move_extent.c253
1 file changed, 178 insertions, 75 deletions
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e2016f34b58f..c87a746450e5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -736,6 +736,118 @@ out:
736} 736}
737 737
738/** 738/**
739 * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
740 *
741 * @inode1: the inode structure
742 * @inode2: the inode structure
743 * @index: page index
744 * @page: result page vector
745 *
746 * Grab two locked pages for inode's by inode order
747 */
748static int
749mext_page_double_lock(struct inode *inode1, struct inode *inode2,
750 pgoff_t index, struct page *page[2])
751{
752 struct address_space *mapping[2];
753 unsigned fl = AOP_FLAG_NOFS;
754
755 BUG_ON(!inode1 || !inode2);
756 if (inode1 < inode2) {
757 mapping[0] = inode1->i_mapping;
758 mapping[1] = inode2->i_mapping;
759 } else {
760 mapping[0] = inode2->i_mapping;
761 mapping[1] = inode1->i_mapping;
762 }
763
764 page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
765 if (!page[0])
766 return -ENOMEM;
767
768 page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
769 if (!page[1]) {
770 unlock_page(page[0]);
771 page_cache_release(page[0]);
772 return -ENOMEM;
773 }
774
775 if (inode1 > inode2) {
776 struct page *tmp;
777 tmp = page[0];
778 page[0] = page[1];
779 page[1] = tmp;
780 }
781 return 0;
782}
783
784/* Force page buffers uptodate w/o dropping page's lock */
785static int
786mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
787{
788 struct inode *inode = page->mapping->host;
789 sector_t block;
790 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
791 unsigned int blocksize, block_start, block_end;
792 int i, err, nr = 0, partial = 0;
793 BUG_ON(!PageLocked(page));
794 BUG_ON(PageWriteback(page));
795
796 if (PageUptodate(page))
797 return 0;
798
799 blocksize = 1 << inode->i_blkbits;
800 if (!page_has_buffers(page))
801 create_empty_buffers(page, blocksize, 0);
802
803 head = page_buffers(page);
804 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
805 for (bh = head, block_start = 0; bh != head || !block_start;
806 block++, block_start = block_end, bh = bh->b_this_page) {
807 block_end = block_start + blocksize;
808 if (block_end <= from || block_start >= to) {
809 if (!buffer_uptodate(bh))
810 partial = 1;
811 continue;
812 }
813 if (buffer_uptodate(bh))
814 continue;
815 if (!buffer_mapped(bh)) {
816 int err = 0;
817 err = ext4_get_block(inode, block, bh, 0);
818 if (err) {
819 SetPageError(page);
820 return err;
821 }
822 if (!buffer_mapped(bh)) {
823 zero_user(page, block_start, blocksize);
824 if (!err)
825 set_buffer_uptodate(bh);
826 continue;
827 }
828 }
829 BUG_ON(nr >= MAX_BUF_PER_PAGE);
830 arr[nr++] = bh;
831 }
832 /* No io required */
833 if (!nr)
834 goto out;
835
836 for (i = 0; i < nr; i++) {
837 bh = arr[i];
838 if (!bh_uptodate_or_lock(bh)) {
839 err = bh_submit_read(bh);
840 if (err)
841 return err;
842 }
843 }
844out:
845 if (!partial)
846 SetPageUptodate(page);
847 return 0;
848}
849
850/**
739 * move_extent_per_page - Move extent data per page 851 * move_extent_per_page - Move extent data per page
740 * 852 *
741 * @o_filp: file structure of original file 853 * @o_filp: file structure of original file
@@ -757,26 +869,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
757 int block_len_in_page, int uninit, int *err) 869 int block_len_in_page, int uninit, int *err)
758{ 870{
759 struct inode *orig_inode = o_filp->f_dentry->d_inode; 871 struct inode *orig_inode = o_filp->f_dentry->d_inode;
760 struct address_space *mapping = orig_inode->i_mapping; 872 struct page *pagep[2] = {NULL, NULL};
761 struct buffer_head *bh;
762 struct page *page = NULL;
763 const struct address_space_operations *a_ops = mapping->a_ops;
764 handle_t *handle; 873 handle_t *handle;
765 ext4_lblk_t orig_blk_offset; 874 ext4_lblk_t orig_blk_offset;
766 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 875 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
767 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 876 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
768 unsigned int w_flags = 0; 877 unsigned int w_flags = 0;
769 unsigned int tmp_data_size, data_size, replaced_size; 878 unsigned int tmp_data_size, data_size, replaced_size;
770 void *fsdata; 879 int err2, jblocks, retries = 0;
771 int i, jblocks;
772 int err2 = 0;
773 int replaced_count = 0; 880 int replaced_count = 0;
881 int from = data_offset_in_page << orig_inode->i_blkbits;
774 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 882 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
775 883
776 /* 884 /*
777 * It needs twice the amount of ordinary journal buffers because 885 * It needs twice the amount of ordinary journal buffers because
778 * inode and donor_inode may change each different metadata blocks. 886 * inode and donor_inode may change each different metadata blocks.
779 */ 887 */
888again:
889 *err = 0;
780 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 890 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
781 handle = ext4_journal_start(orig_inode, jblocks); 891 handle = ext4_journal_start(orig_inode, jblocks);
782 if (IS_ERR(handle)) { 892 if (IS_ERR(handle)) {
@@ -790,19 +900,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
790 orig_blk_offset = orig_page_offset * blocks_per_page + 900 orig_blk_offset = orig_page_offset * blocks_per_page +
791 data_offset_in_page; 901 data_offset_in_page;
792 902
793 /*
794 * If orig extent is uninitialized one,
795 * it's not necessary force the page into memory
796 * and then force it to be written out again.
797 * Just swap data blocks between orig and donor.
798 */
799 if (uninit) {
800 replaced_count = mext_replace_branches(handle, orig_inode,
801 donor_inode, orig_blk_offset,
802 block_len_in_page, err);
803 goto out2;
804 }
805
806 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 903 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
807 904
808 /* Calculate data_size */ 905 /* Calculate data_size */
@@ -824,75 +921,81 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
824 921
825 replaced_size = data_size; 922 replaced_size = data_size;
826 923
827 *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, 924 *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
828 &page, &fsdata); 925 pagep);
829 if (unlikely(*err < 0)) 926 if (unlikely(*err < 0))
830 goto out; 927 goto stop_journal;
831 928
832 if (!PageUptodate(page)) { 929 *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
833 mapping->a_ops->readpage(o_filp, page); 930 if (*err)
834 lock_page(page); 931 goto unlock_pages;
932
933 /* At this point all buffers in range are uptodate, old mapping layout
934 * is no longer required, try to drop it now. */
935 if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
936 (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
937 *err = -EBUSY;
938 goto unlock_pages;
835 } 939 }
836 940
837 /*
838 * try_to_release_page() doesn't call releasepage in writeback mode.
839 * We should care about the order of writing to the same file
840 * by multiple move extent processes.
841 * It needs to call wait_on_page_writeback() to wait for the
842 * writeback of the page.
843 */
844 wait_on_page_writeback(page);
845
846 /* Release old bh and drop refs */
847 try_to_release_page(page, 0);
848
849 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 941 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
850 orig_blk_offset, block_len_in_page, 942 orig_blk_offset,
851 &err2); 943 block_len_in_page, err);
852 if (err2) { 944 if (*err) {
853 if (replaced_count) { 945 if (replaced_count) {
854 block_len_in_page = replaced_count; 946 block_len_in_page = replaced_count;
855 replaced_size = 947 replaced_size =
856 block_len_in_page << orig_inode->i_blkbits; 948 block_len_in_page << orig_inode->i_blkbits;
857 } else 949 } else
858 goto out; 950 goto unlock_pages;
859 } 951 }
952 /* Perform all necessary steps similar write_begin()/write_end()
953 * but keeping in mind that i_size will not change */
954 *err = __block_write_begin(pagep[0], from, from + replaced_size,
955 ext4_get_block);
956 if (!*err)
957 *err = block_commit_write(pagep[0], from, from + replaced_size);
860 958
861 if (!page_has_buffers(page)) 959 if (unlikely(*err < 0))
862 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 960 goto repair_branches;
863 961
864 bh = page_buffers(page); 962 /* Even in case of data=writeback it is reasonable to pin
865 for (i = 0; i < data_offset_in_page; i++) 963 * inode to transaction, to prevent unexpected data loss */
866 bh = bh->b_this_page; 964 *err = ext4_jbd2_file_inode(handle, orig_inode);
867 965
868 for (i = 0; i < block_len_in_page; i++) { 966unlock_pages:
869 *err = ext4_get_block(orig_inode, 967 unlock_page(pagep[0]);
870 (sector_t)(orig_blk_offset + i), bh, 0); 968 page_cache_release(pagep[0]);
871 if (*err < 0) 969 unlock_page(pagep[1]);
872 goto out; 970 page_cache_release(pagep[1]);
873 971stop_journal:
874 if (bh->b_this_page != NULL)
875 bh = bh->b_this_page;
876 }
877
878 *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
879 page, fsdata);
880 page = NULL;
881
882out:
883 if (unlikely(page)) {
884 if (PageLocked(page))
885 unlock_page(page);
886 page_cache_release(page);
887 ext4_journal_stop(handle);
888 }
889out2:
890 ext4_journal_stop(handle); 972 ext4_journal_stop(handle);
891 973 /* Buffer was busy because probably is pinned to journal transaction,
892 if (err2) 974 * force transaction commit may help to free it. */
893 *err = err2; 975 if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb,
894 976 &retries))
977 goto again;
895 return replaced_count; 978 return replaced_count;
979
980repair_branches:
981 /*
982 * This should never ever happen!
983 * Extents are swapped already, but we are not able to copy data.
984 * Try to swap extents to it's original places
985 */
986 double_down_write_data_sem(orig_inode, donor_inode);
987 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
988 orig_blk_offset,
989 block_len_in_page, &err2);
990 double_up_write_data_sem(orig_inode, donor_inode);
991 if (replaced_count != block_len_in_page) {
992 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
993 "Unable to copy data block,"
994 " data will be lost.");
995 *err = -EIO;
996 }
997 replaced_count = 0;
998 goto unlock_pages;
896} 999}
897 1000
898/** 1001/**