diff options
author | Dmitry Monakhov <dmonakhov@openvz.org> | 2012-09-26 12:52:07 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2012-09-26 12:52:07 -0400 |
commit | bb5574880574fea38c674942cf0360253a2d60fe (patch) | |
tree | b343530fbe7e35ea88ea93b1f4d56be7506135d7 /fs/ext4/move_extent.c | |
parent | f066055a3449f0e5b0ae4f3ceab4445bead47638 (diff) |
ext4: clean up online defrag bugs in move_extent_per_page()
Non-full list of bugs:
1) uninitialized extent optimization does not hold page's lock,
and simply replace brunches after that writeback code goes
crazy because block mapping changed under it's feets
kernel BUG at fs/ext4/inode.c:1434! ( 288'th xfstress)
2) uninitialized extent may became initialized right after we
drop i_data_sem, so extent state must be rechecked
3) Locked pages goes uptodate via following sequence:
->readpage(page); lock_page(page); use_that_page(page)
But after readpage() one may invalidate it because it is
uptodate and unlocked (reclaimer does that)
As result kernel bug at include/linux/buffer_head.c:133!
4) We call write_begin() with already opened stansaction which
result in following deadlock:
->move_extent_per_page()
->ext4_journal_start()-> hold journal transaction
->write_begin()
->ext4_da_write_begin()
->ext4_nonda_switch()
->writeback_inodes_sb_if_idle() --> will wait for journal_stop()
5) try_to_release_page() may fail and it does fail if one of page's bh was
pinned by journal
6) If we about to change page's mapping we MUST hold it's lock during entire
remapping procedure, this is true for both pages(original and donor one)
Fixes:
- Avoid (1) and (2) simply by temproraly drop uninitialized extent handling
optimization, this will be reimplemented later.
- Fix (3) by manually forcing page to uptodate state w/o dropping it's lock
- Fix (4) by rearranging existing locking:
from: journal_start(); ->write_begin
to: write_begin(); journal_extend()
- Fix (5) simply by checking retvalue
- Fix (6) by locking both (original and donor one) pages during extent swap
with help of mext_page_double_lock()
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/move_extent.c')
-rw-r--r-- | fs/ext4/move_extent.c | 253 |
1 files changed, 178 insertions, 75 deletions
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e2016f34b58f..c87a746450e5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -736,6 +736,118 @@ out: | |||
736 | } | 736 | } |
737 | 737 | ||
738 | /** | 738 | /** |
739 | * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 | ||
740 | * | ||
741 | * @inode1: the inode structure | ||
742 | * @inode2: the inode structure | ||
743 | * @index: page index | ||
744 | * @page: result page vector | ||
745 | * | ||
746 | * Grab two locked pages for inode's by inode order | ||
747 | */ | ||
748 | static int | ||
749 | mext_page_double_lock(struct inode *inode1, struct inode *inode2, | ||
750 | pgoff_t index, struct page *page[2]) | ||
751 | { | ||
752 | struct address_space *mapping[2]; | ||
753 | unsigned fl = AOP_FLAG_NOFS; | ||
754 | |||
755 | BUG_ON(!inode1 || !inode2); | ||
756 | if (inode1 < inode2) { | ||
757 | mapping[0] = inode1->i_mapping; | ||
758 | mapping[1] = inode2->i_mapping; | ||
759 | } else { | ||
760 | mapping[0] = inode2->i_mapping; | ||
761 | mapping[1] = inode1->i_mapping; | ||
762 | } | ||
763 | |||
764 | page[0] = grab_cache_page_write_begin(mapping[0], index, fl); | ||
765 | if (!page[0]) | ||
766 | return -ENOMEM; | ||
767 | |||
768 | page[1] = grab_cache_page_write_begin(mapping[1], index, fl); | ||
769 | if (!page[1]) { | ||
770 | unlock_page(page[0]); | ||
771 | page_cache_release(page[0]); | ||
772 | return -ENOMEM; | ||
773 | } | ||
774 | |||
775 | if (inode1 > inode2) { | ||
776 | struct page *tmp; | ||
777 | tmp = page[0]; | ||
778 | page[0] = page[1]; | ||
779 | page[1] = tmp; | ||
780 | } | ||
781 | return 0; | ||
782 | } | ||
783 | |||
784 | /* Force page buffers uptodate w/o dropping page's lock */ | ||
785 | static int | ||
786 | mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) | ||
787 | { | ||
788 | struct inode *inode = page->mapping->host; | ||
789 | sector_t block; | ||
790 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; | ||
791 | unsigned int blocksize, block_start, block_end; | ||
792 | int i, err, nr = 0, partial = 0; | ||
793 | BUG_ON(!PageLocked(page)); | ||
794 | BUG_ON(PageWriteback(page)); | ||
795 | |||
796 | if (PageUptodate(page)) | ||
797 | return 0; | ||
798 | |||
799 | blocksize = 1 << inode->i_blkbits; | ||
800 | if (!page_has_buffers(page)) | ||
801 | create_empty_buffers(page, blocksize, 0); | ||
802 | |||
803 | head = page_buffers(page); | ||
804 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
805 | for (bh = head, block_start = 0; bh != head || !block_start; | ||
806 | block++, block_start = block_end, bh = bh->b_this_page) { | ||
807 | block_end = block_start + blocksize; | ||
808 | if (block_end <= from || block_start >= to) { | ||
809 | if (!buffer_uptodate(bh)) | ||
810 | partial = 1; | ||
811 | continue; | ||
812 | } | ||
813 | if (buffer_uptodate(bh)) | ||
814 | continue; | ||
815 | if (!buffer_mapped(bh)) { | ||
816 | int err = 0; | ||
817 | err = ext4_get_block(inode, block, bh, 0); | ||
818 | if (err) { | ||
819 | SetPageError(page); | ||
820 | return err; | ||
821 | } | ||
822 | if (!buffer_mapped(bh)) { | ||
823 | zero_user(page, block_start, blocksize); | ||
824 | if (!err) | ||
825 | set_buffer_uptodate(bh); | ||
826 | continue; | ||
827 | } | ||
828 | } | ||
829 | BUG_ON(nr >= MAX_BUF_PER_PAGE); | ||
830 | arr[nr++] = bh; | ||
831 | } | ||
832 | /* No io required */ | ||
833 | if (!nr) | ||
834 | goto out; | ||
835 | |||
836 | for (i = 0; i < nr; i++) { | ||
837 | bh = arr[i]; | ||
838 | if (!bh_uptodate_or_lock(bh)) { | ||
839 | err = bh_submit_read(bh); | ||
840 | if (err) | ||
841 | return err; | ||
842 | } | ||
843 | } | ||
844 | out: | ||
845 | if (!partial) | ||
846 | SetPageUptodate(page); | ||
847 | return 0; | ||
848 | } | ||
849 | |||
850 | /** | ||
739 | * move_extent_per_page - Move extent data per page | 851 | * move_extent_per_page - Move extent data per page |
740 | * | 852 | * |
741 | * @o_filp: file structure of original file | 853 | * @o_filp: file structure of original file |
@@ -757,26 +869,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
757 | int block_len_in_page, int uninit, int *err) | 869 | int block_len_in_page, int uninit, int *err) |
758 | { | 870 | { |
759 | struct inode *orig_inode = o_filp->f_dentry->d_inode; | 871 | struct inode *orig_inode = o_filp->f_dentry->d_inode; |
760 | struct address_space *mapping = orig_inode->i_mapping; | 872 | struct page *pagep[2] = {NULL, NULL}; |
761 | struct buffer_head *bh; | ||
762 | struct page *page = NULL; | ||
763 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
764 | handle_t *handle; | 873 | handle_t *handle; |
765 | ext4_lblk_t orig_blk_offset; | 874 | ext4_lblk_t orig_blk_offset; |
766 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; | 875 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; |
767 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; | 876 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; |
768 | unsigned int w_flags = 0; | 877 | unsigned int w_flags = 0; |
769 | unsigned int tmp_data_size, data_size, replaced_size; | 878 | unsigned int tmp_data_size, data_size, replaced_size; |
770 | void *fsdata; | 879 | int err2, jblocks, retries = 0; |
771 | int i, jblocks; | ||
772 | int err2 = 0; | ||
773 | int replaced_count = 0; | 880 | int replaced_count = 0; |
881 | int from = data_offset_in_page << orig_inode->i_blkbits; | ||
774 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; | 882 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; |
775 | 883 | ||
776 | /* | 884 | /* |
777 | * It needs twice the amount of ordinary journal buffers because | 885 | * It needs twice the amount of ordinary journal buffers because |
778 | * inode and donor_inode may change each different metadata blocks. | 886 | * inode and donor_inode may change each different metadata blocks. |
779 | */ | 887 | */ |
888 | again: | ||
889 | *err = 0; | ||
780 | jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; | 890 | jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; |
781 | handle = ext4_journal_start(orig_inode, jblocks); | 891 | handle = ext4_journal_start(orig_inode, jblocks); |
782 | if (IS_ERR(handle)) { | 892 | if (IS_ERR(handle)) { |
@@ -790,19 +900,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
790 | orig_blk_offset = orig_page_offset * blocks_per_page + | 900 | orig_blk_offset = orig_page_offset * blocks_per_page + |
791 | data_offset_in_page; | 901 | data_offset_in_page; |
792 | 902 | ||
793 | /* | ||
794 | * If orig extent is uninitialized one, | ||
795 | * it's not necessary force the page into memory | ||
796 | * and then force it to be written out again. | ||
797 | * Just swap data blocks between orig and donor. | ||
798 | */ | ||
799 | if (uninit) { | ||
800 | replaced_count = mext_replace_branches(handle, orig_inode, | ||
801 | donor_inode, orig_blk_offset, | ||
802 | block_len_in_page, err); | ||
803 | goto out2; | ||
804 | } | ||
805 | |||
806 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; | 903 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; |
807 | 904 | ||
808 | /* Calculate data_size */ | 905 | /* Calculate data_size */ |
@@ -824,75 +921,81 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
824 | 921 | ||
825 | replaced_size = data_size; | 922 | replaced_size = data_size; |
826 | 923 | ||
827 | *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, | 924 | *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, |
828 | &page, &fsdata); | 925 | pagep); |
829 | if (unlikely(*err < 0)) | 926 | if (unlikely(*err < 0)) |
830 | goto out; | 927 | goto stop_journal; |
831 | 928 | ||
832 | if (!PageUptodate(page)) { | 929 | *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); |
833 | mapping->a_ops->readpage(o_filp, page); | 930 | if (*err) |
834 | lock_page(page); | 931 | goto unlock_pages; |
932 | |||
933 | /* At this point all buffers in range are uptodate, old mapping layout | ||
934 | * is no longer required, try to drop it now. */ | ||
935 | if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || | ||
936 | (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { | ||
937 | *err = -EBUSY; | ||
938 | goto unlock_pages; | ||
835 | } | 939 | } |
836 | 940 | ||
837 | /* | ||
838 | * try_to_release_page() doesn't call releasepage in writeback mode. | ||
839 | * We should care about the order of writing to the same file | ||
840 | * by multiple move extent processes. | ||
841 | * It needs to call wait_on_page_writeback() to wait for the | ||
842 | * writeback of the page. | ||
843 | */ | ||
844 | wait_on_page_writeback(page); | ||
845 | |||
846 | /* Release old bh and drop refs */ | ||
847 | try_to_release_page(page, 0); | ||
848 | |||
849 | replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, | 941 | replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, |
850 | orig_blk_offset, block_len_in_page, | 942 | orig_blk_offset, |
851 | &err2); | 943 | block_len_in_page, err); |
852 | if (err2) { | 944 | if (*err) { |
853 | if (replaced_count) { | 945 | if (replaced_count) { |
854 | block_len_in_page = replaced_count; | 946 | block_len_in_page = replaced_count; |
855 | replaced_size = | 947 | replaced_size = |
856 | block_len_in_page << orig_inode->i_blkbits; | 948 | block_len_in_page << orig_inode->i_blkbits; |
857 | } else | 949 | } else |
858 | goto out; | 950 | goto unlock_pages; |
859 | } | 951 | } |
952 | /* Perform all necessary steps similar write_begin()/write_end() | ||
953 | * but keeping in mind that i_size will not change */ | ||
954 | *err = __block_write_begin(pagep[0], from, from + replaced_size, | ||
955 | ext4_get_block); | ||
956 | if (!*err) | ||
957 | *err = block_commit_write(pagep[0], from, from + replaced_size); | ||
860 | 958 | ||
861 | if (!page_has_buffers(page)) | 959 | if (unlikely(*err < 0)) |
862 | create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); | 960 | goto repair_branches; |
863 | 961 | ||
864 | bh = page_buffers(page); | 962 | /* Even in case of data=writeback it is reasonable to pin |
865 | for (i = 0; i < data_offset_in_page; i++) | 963 | * inode to transaction, to prevent unexpected data loss */ |
866 | bh = bh->b_this_page; | 964 | *err = ext4_jbd2_file_inode(handle, orig_inode); |
867 | 965 | ||
868 | for (i = 0; i < block_len_in_page; i++) { | 966 | unlock_pages: |
869 | *err = ext4_get_block(orig_inode, | 967 | unlock_page(pagep[0]); |
870 | (sector_t)(orig_blk_offset + i), bh, 0); | 968 | page_cache_release(pagep[0]); |
871 | if (*err < 0) | 969 | unlock_page(pagep[1]); |
872 | goto out; | 970 | page_cache_release(pagep[1]); |
873 | 971 | stop_journal: | |
874 | if (bh->b_this_page != NULL) | ||
875 | bh = bh->b_this_page; | ||
876 | } | ||
877 | |||
878 | *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, | ||
879 | page, fsdata); | ||
880 | page = NULL; | ||
881 | |||
882 | out: | ||
883 | if (unlikely(page)) { | ||
884 | if (PageLocked(page)) | ||
885 | unlock_page(page); | ||
886 | page_cache_release(page); | ||
887 | ext4_journal_stop(handle); | ||
888 | } | ||
889 | out2: | ||
890 | ext4_journal_stop(handle); | 972 | ext4_journal_stop(handle); |
891 | 973 | /* Buffer was busy because probably is pinned to journal transaction, | |
892 | if (err2) | 974 | * force transaction commit may help to free it. */ |
893 | *err = err2; | 975 | if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, |
894 | 976 | &retries)) | |
977 | goto again; | ||
895 | return replaced_count; | 978 | return replaced_count; |
979 | |||
980 | repair_branches: | ||
981 | /* | ||
982 | * This should never ever happen! | ||
983 | * Extents are swapped already, but we are not able to copy data. | ||
984 | * Try to swap extents to it's original places | ||
985 | */ | ||
986 | double_down_write_data_sem(orig_inode, donor_inode); | ||
987 | replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, | ||
988 | orig_blk_offset, | ||
989 | block_len_in_page, &err2); | ||
990 | double_up_write_data_sem(orig_inode, donor_inode); | ||
991 | if (replaced_count != block_len_in_page) { | ||
992 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), | ||
993 | "Unable to copy data block," | ||
994 | " data will be lost."); | ||
995 | *err = -EIO; | ||
996 | } | ||
997 | replaced_count = 0; | ||
998 | goto unlock_pages; | ||
896 | } | 999 | } |
897 | 1000 | ||
898 | /** | 1001 | /** |