diff options
Diffstat (limited to 'fs/ext4/move_extent.c')
-rw-r--r-- | fs/ext4/move_extent.c | 520 |
1 files changed, 316 insertions, 204 deletions
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c5826c623e7a..292daeeed455 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -141,55 +141,21 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | |||
141 | } | 141 | } |
142 | 142 | ||
143 | /** | 143 | /** |
144 | * mext_check_null_inode - NULL check for two inodes | ||
145 | * | ||
146 | * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. | ||
147 | */ | ||
148 | static int | ||
149 | mext_check_null_inode(struct inode *inode1, struct inode *inode2, | ||
150 | const char *function, unsigned int line) | ||
151 | { | ||
152 | int ret = 0; | ||
153 | |||
154 | if (inode1 == NULL) { | ||
155 | __ext4_error(inode2->i_sb, function, line, | ||
156 | "Both inodes should not be NULL: " | ||
157 | "inode1 NULL inode2 %lu", inode2->i_ino); | ||
158 | ret = -EIO; | ||
159 | } else if (inode2 == NULL) { | ||
160 | __ext4_error(inode1->i_sb, function, line, | ||
161 | "Both inodes should not be NULL: " | ||
162 | "inode1 %lu inode2 NULL", inode1->i_ino); | ||
163 | ret = -EIO; | ||
164 | } | ||
165 | return ret; | ||
166 | } | ||
167 | |||
168 | /** | ||
169 | * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem | 144 | * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem |
170 | * | 145 | * |
171 | * @orig_inode: original inode structure | 146 | * Acquire write lock of i_data_sem of the two inodes |
172 | * @donor_inode: donor inode structure | ||
173 | * Acquire write lock of i_data_sem of the two inodes (orig and donor) by | ||
174 | * i_ino order. | ||
175 | */ | 147 | */ |
176 | static void | 148 | static void |
177 | double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) | 149 | double_down_write_data_sem(struct inode *first, struct inode *second) |
178 | { | 150 | { |
179 | struct inode *first = orig_inode, *second = donor_inode; | 151 | if (first < second) { |
152 | down_write(&EXT4_I(first)->i_data_sem); | ||
153 | down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); | ||
154 | } else { | ||
155 | down_write(&EXT4_I(second)->i_data_sem); | ||
156 | down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); | ||
180 | 157 | ||
181 | /* | ||
182 | * Use the inode number to provide the stable locking order instead | ||
183 | * of its address, because the C language doesn't guarantee you can | ||
184 | * compare pointers that don't come from the same array. | ||
185 | */ | ||
186 | if (donor_inode->i_ino < orig_inode->i_ino) { | ||
187 | first = donor_inode; | ||
188 | second = orig_inode; | ||
189 | } | 158 | } |
190 | |||
191 | down_write(&EXT4_I(first)->i_data_sem); | ||
192 | down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); | ||
193 | } | 159 | } |
194 | 160 | ||
195 | /** | 161 | /** |
@@ -604,9 +570,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | |||
604 | diff = donor_off - le32_to_cpu(tmp_dext->ee_block); | 570 | diff = donor_off - le32_to_cpu(tmp_dext->ee_block); |
605 | 571 | ||
606 | ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); | 572 | ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); |
607 | tmp_dext->ee_block = | 573 | le32_add_cpu(&tmp_dext->ee_block, diff); |
608 | cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); | 574 | le16_add_cpu(&tmp_dext->ee_len, -diff); |
609 | tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); | ||
610 | 575 | ||
611 | if (max_count < ext4_ext_get_actual_len(tmp_dext)) | 576 | if (max_count < ext4_ext_get_actual_len(tmp_dext)) |
612 | tmp_dext->ee_len = cpu_to_le16(max_count); | 577 | tmp_dext->ee_len = cpu_to_le16(max_count); |
@@ -629,6 +594,43 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | |||
629 | } | 594 | } |
630 | 595 | ||
631 | /** | 596 | /** |
597 | * mext_check_coverage - Check that all extents in range have the same type | ||
598 | * | ||
599 | * @inode: inode in question | ||
600 | * @from: block offset of inode | ||
601 | * @count: block count to be checked | ||
602 | * @uninit: extents expected to be uninitialized | ||
603 | * @err: pointer to save error value | ||
604 | * | ||
605 | * Return 1 if all extents in range have the expected type, and zero otherwise. | ||
606 | */ | ||
607 | static int | ||
608 | mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, | ||
609 | int uninit, int *err) | ||
610 | { | ||
611 | struct ext4_ext_path *path = NULL; | ||
612 | struct ext4_extent *ext; | ||
613 | ext4_lblk_t last = from + count; | ||
614 | while (from < last) { | ||
615 | *err = get_ext_path(inode, from, &path); | ||
616 | if (*err) | ||
617 | return 0; | ||
618 | ext = path[ext_depth(inode)].p_ext; | ||
619 | if (!ext) { | ||
620 | ext4_ext_drop_refs(path); | ||
621 | return 0; | ||
622 | } | ||
623 | if (uninit != ext4_ext_is_uninitialized(ext)) { | ||
624 | ext4_ext_drop_refs(path); | ||
625 | return 0; | ||
626 | } | ||
627 | from += ext4_ext_get_actual_len(ext); | ||
628 | ext4_ext_drop_refs(path); | ||
629 | } | ||
630 | return 1; | ||
631 | } | ||
632 | |||
633 | /** | ||
632 | * mext_replace_branches - Replace original extents with new extents | 634 | * mext_replace_branches - Replace original extents with new extents |
633 | * | 635 | * |
634 | * @handle: journal handle | 636 | * @handle: journal handle |
@@ -663,9 +665,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
663 | int replaced_count = 0; | 665 | int replaced_count = 0; |
664 | int dext_alen; | 666 | int dext_alen; |
665 | 667 | ||
666 | /* Protect extent trees against block allocations via delalloc */ | ||
667 | double_down_write_data_sem(orig_inode, donor_inode); | ||
668 | |||
669 | /* Get the original extent for the block "orig_off" */ | 668 | /* Get the original extent for the block "orig_off" */ |
670 | *err = get_ext_path(orig_inode, orig_off, &orig_path); | 669 | *err = get_ext_path(orig_inode, orig_off, &orig_path); |
671 | if (*err) | 670 | if (*err) |
@@ -764,12 +763,122 @@ out: | |||
764 | ext4_ext_invalidate_cache(orig_inode); | 763 | ext4_ext_invalidate_cache(orig_inode); |
765 | ext4_ext_invalidate_cache(donor_inode); | 764 | ext4_ext_invalidate_cache(donor_inode); |
766 | 765 | ||
767 | double_up_write_data_sem(orig_inode, donor_inode); | ||
768 | |||
769 | return replaced_count; | 766 | return replaced_count; |
770 | } | 767 | } |
771 | 768 | ||
772 | /** | 769 | /** |
770 | * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 | ||
771 | * | ||
772 | * @inode1: the inode structure | ||
773 | * @inode2: the inode structure | ||
774 | * @index: page index | ||
775 | * @page: result page vector | ||
776 | * | ||
777 | * Grab two locked pages for the inodes by inode order | ||
778 | */ | ||
779 | static int | ||
780 | mext_page_double_lock(struct inode *inode1, struct inode *inode2, | ||
781 | pgoff_t index, struct page *page[2]) | ||
782 | { | ||
783 | struct address_space *mapping[2]; | ||
784 | unsigned fl = AOP_FLAG_NOFS; | ||
785 | |||
786 | BUG_ON(!inode1 || !inode2); | ||
787 | if (inode1 < inode2) { | ||
788 | mapping[0] = inode1->i_mapping; | ||
789 | mapping[1] = inode2->i_mapping; | ||
790 | } else { | ||
791 | mapping[0] = inode2->i_mapping; | ||
792 | mapping[1] = inode1->i_mapping; | ||
793 | } | ||
794 | |||
795 | page[0] = grab_cache_page_write_begin(mapping[0], index, fl); | ||
796 | if (!page[0]) | ||
797 | return -ENOMEM; | ||
798 | |||
799 | page[1] = grab_cache_page_write_begin(mapping[1], index, fl); | ||
800 | if (!page[1]) { | ||
801 | unlock_page(page[0]); | ||
802 | page_cache_release(page[0]); | ||
803 | return -ENOMEM; | ||
804 | } | ||
805 | |||
806 | if (inode1 > inode2) { | ||
807 | struct page *tmp; | ||
808 | tmp = page[0]; | ||
809 | page[0] = page[1]; | ||
810 | page[1] = tmp; | ||
811 | } | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | /* Force page buffers uptodate w/o dropping page's lock */ | ||
816 | static int | ||
817 | mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) | ||
818 | { | ||
819 | struct inode *inode = page->mapping->host; | ||
820 | sector_t block; | ||
821 | struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; | ||
822 | unsigned int blocksize, block_start, block_end; | ||
823 | int i, err, nr = 0, partial = 0; | ||
824 | BUG_ON(!PageLocked(page)); | ||
825 | BUG_ON(PageWriteback(page)); | ||
826 | |||
827 | if (PageUptodate(page)) | ||
828 | return 0; | ||
829 | |||
830 | blocksize = 1 << inode->i_blkbits; | ||
831 | if (!page_has_buffers(page)) | ||
832 | create_empty_buffers(page, blocksize, 0); | ||
833 | |||
834 | head = page_buffers(page); | ||
835 | block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
836 | for (bh = head, block_start = 0; bh != head || !block_start; | ||
837 | block++, block_start = block_end, bh = bh->b_this_page) { | ||
838 | block_end = block_start + blocksize; | ||
839 | if (block_end <= from || block_start >= to) { | ||
840 | if (!buffer_uptodate(bh)) | ||
841 | partial = 1; | ||
842 | continue; | ||
843 | } | ||
844 | if (buffer_uptodate(bh)) | ||
845 | continue; | ||
846 | if (!buffer_mapped(bh)) { | ||
847 | int err = 0; | ||
848 | err = ext4_get_block(inode, block, bh, 0); | ||
849 | if (err) { | ||
850 | SetPageError(page); | ||
851 | return err; | ||
852 | } | ||
853 | if (!buffer_mapped(bh)) { | ||
854 | zero_user(page, block_start, blocksize); | ||
855 | if (!err) | ||
856 | set_buffer_uptodate(bh); | ||
857 | continue; | ||
858 | } | ||
859 | } | ||
860 | BUG_ON(nr >= MAX_BUF_PER_PAGE); | ||
861 | arr[nr++] = bh; | ||
862 | } | ||
863 | /* No io required */ | ||
864 | if (!nr) | ||
865 | goto out; | ||
866 | |||
867 | for (i = 0; i < nr; i++) { | ||
868 | bh = arr[i]; | ||
869 | if (!bh_uptodate_or_lock(bh)) { | ||
870 | err = bh_submit_read(bh); | ||
871 | if (err) | ||
872 | return err; | ||
873 | } | ||
874 | } | ||
875 | out: | ||
876 | if (!partial) | ||
877 | SetPageUptodate(page); | ||
878 | return 0; | ||
879 | } | ||
880 | |||
881 | /** | ||
773 | * move_extent_per_page - Move extent data per page | 882 | * move_extent_per_page - Move extent data per page |
774 | * | 883 | * |
775 | * @o_filp: file structure of original file | 884 | * @o_filp: file structure of original file |
@@ -791,26 +900,24 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
791 | int block_len_in_page, int uninit, int *err) | 900 | int block_len_in_page, int uninit, int *err) |
792 | { | 901 | { |
793 | struct inode *orig_inode = o_filp->f_dentry->d_inode; | 902 | struct inode *orig_inode = o_filp->f_dentry->d_inode; |
794 | struct address_space *mapping = orig_inode->i_mapping; | 903 | struct page *pagep[2] = {NULL, NULL}; |
795 | struct buffer_head *bh; | ||
796 | struct page *page = NULL; | ||
797 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
798 | handle_t *handle; | 904 | handle_t *handle; |
799 | ext4_lblk_t orig_blk_offset; | 905 | ext4_lblk_t orig_blk_offset; |
800 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; | 906 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; |
801 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; | 907 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; |
802 | unsigned int w_flags = 0; | 908 | unsigned int w_flags = 0; |
803 | unsigned int tmp_data_size, data_size, replaced_size; | 909 | unsigned int tmp_data_size, data_size, replaced_size; |
804 | void *fsdata; | 910 | int err2, jblocks, retries = 0; |
805 | int i, jblocks; | ||
806 | int err2 = 0; | ||
807 | int replaced_count = 0; | 911 | int replaced_count = 0; |
912 | int from = data_offset_in_page << orig_inode->i_blkbits; | ||
808 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; | 913 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; |
809 | 914 | ||
810 | /* | 915 | /* |
811 | * It needs twice the amount of ordinary journal buffers because | 916 | * It needs twice the amount of ordinary journal buffers because |
812 | * inode and donor_inode may change each different metadata blocks. | 917 | * inode and donor_inode may change each different metadata blocks. |
813 | */ | 918 | */ |
919 | again: | ||
920 | *err = 0; | ||
814 | jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; | 921 | jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; |
815 | handle = ext4_journal_start(orig_inode, jblocks); | 922 | handle = ext4_journal_start(orig_inode, jblocks); |
816 | if (IS_ERR(handle)) { | 923 | if (IS_ERR(handle)) { |
@@ -824,19 +931,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
824 | orig_blk_offset = orig_page_offset * blocks_per_page + | 931 | orig_blk_offset = orig_page_offset * blocks_per_page + |
825 | data_offset_in_page; | 932 | data_offset_in_page; |
826 | 933 | ||
827 | /* | ||
828 | * If orig extent is uninitialized one, | ||
829 | * it's not necessary force the page into memory | ||
830 | * and then force it to be written out again. | ||
831 | * Just swap data blocks between orig and donor. | ||
832 | */ | ||
833 | if (uninit) { | ||
834 | replaced_count = mext_replace_branches(handle, orig_inode, | ||
835 | donor_inode, orig_blk_offset, | ||
836 | block_len_in_page, err); | ||
837 | goto out2; | ||
838 | } | ||
839 | |||
840 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; | 934 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; |
841 | 935 | ||
842 | /* Calculate data_size */ | 936 | /* Calculate data_size */ |
@@ -858,75 +952,120 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
858 | 952 | ||
859 | replaced_size = data_size; | 953 | replaced_size = data_size; |
860 | 954 | ||
861 | *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, | 955 | *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, |
862 | &page, &fsdata); | 956 | pagep); |
863 | if (unlikely(*err < 0)) | 957 | if (unlikely(*err < 0)) |
864 | goto out; | 958 | goto stop_journal; |
865 | |||
866 | if (!PageUptodate(page)) { | ||
867 | mapping->a_ops->readpage(o_filp, page); | ||
868 | lock_page(page); | ||
869 | } | ||
870 | |||
871 | /* | 959 | /* |
872 | * try_to_release_page() doesn't call releasepage in writeback mode. | 960 | * If orig extent was uninitialized it can become initialized |
873 | * We should care about the order of writing to the same file | 961 | * at any time after i_data_sem was dropped, in order to |
874 | * by multiple move extent processes. | 962 | * serialize with delalloc we have recheck extent while we |
875 | * It needs to call wait_on_page_writeback() to wait for the | 963 | * hold page's lock, if it is still the case data copy is not |
876 | * writeback of the page. | 964 | * necessary, just swap data blocks between orig and donor. |
877 | */ | 965 | */ |
878 | wait_on_page_writeback(page); | 966 | if (uninit) { |
967 | double_down_write_data_sem(orig_inode, donor_inode); | ||
968 | /* If any of extents in range became initialized we have to | ||
969 | * fallback to data copying */ | ||
970 | uninit = mext_check_coverage(orig_inode, orig_blk_offset, | ||
971 | block_len_in_page, 1, err); | ||
972 | if (*err) | ||
973 | goto drop_data_sem; | ||
879 | 974 | ||
880 | /* Release old bh and drop refs */ | 975 | uninit &= mext_check_coverage(donor_inode, orig_blk_offset, |
881 | try_to_release_page(page, 0); | 976 | block_len_in_page, 1, err); |
977 | if (*err) | ||
978 | goto drop_data_sem; | ||
979 | |||
980 | if (!uninit) { | ||
981 | double_up_write_data_sem(orig_inode, donor_inode); | ||
982 | goto data_copy; | ||
983 | } | ||
984 | if ((page_has_private(pagep[0]) && | ||
985 | !try_to_release_page(pagep[0], 0)) || | ||
986 | (page_has_private(pagep[1]) && | ||
987 | !try_to_release_page(pagep[1], 0))) { | ||
988 | *err = -EBUSY; | ||
989 | goto drop_data_sem; | ||
990 | } | ||
991 | replaced_count = mext_replace_branches(handle, orig_inode, | ||
992 | donor_inode, orig_blk_offset, | ||
993 | block_len_in_page, err); | ||
994 | drop_data_sem: | ||
995 | double_up_write_data_sem(orig_inode, donor_inode); | ||
996 | goto unlock_pages; | ||
997 | } | ||
998 | data_copy: | ||
999 | *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); | ||
1000 | if (*err) | ||
1001 | goto unlock_pages; | ||
1002 | |||
1003 | /* At this point all buffers in range are uptodate, old mapping layout | ||
1004 | * is no longer required, try to drop it now. */ | ||
1005 | if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || | ||
1006 | (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { | ||
1007 | *err = -EBUSY; | ||
1008 | goto unlock_pages; | ||
1009 | } | ||
882 | 1010 | ||
883 | replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, | 1011 | replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, |
884 | orig_blk_offset, block_len_in_page, | 1012 | orig_blk_offset, |
885 | &err2); | 1013 | block_len_in_page, err); |
886 | if (err2) { | 1014 | if (*err) { |
887 | if (replaced_count) { | 1015 | if (replaced_count) { |
888 | block_len_in_page = replaced_count; | 1016 | block_len_in_page = replaced_count; |
889 | replaced_size = | 1017 | replaced_size = |
890 | block_len_in_page << orig_inode->i_blkbits; | 1018 | block_len_in_page << orig_inode->i_blkbits; |
891 | } else | 1019 | } else |
892 | goto out; | 1020 | goto unlock_pages; |
893 | } | 1021 | } |
1022 | /* Perform all necessary steps similar write_begin()/write_end() | ||
1023 | * but keeping in mind that i_size will not change */ | ||
1024 | *err = __block_write_begin(pagep[0], from, from + replaced_size, | ||
1025 | ext4_get_block); | ||
1026 | if (!*err) | ||
1027 | *err = block_commit_write(pagep[0], from, from + replaced_size); | ||
894 | 1028 | ||
895 | if (!page_has_buffers(page)) | 1029 | if (unlikely(*err < 0)) |
896 | create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); | 1030 | goto repair_branches; |
897 | 1031 | ||
898 | bh = page_buffers(page); | 1032 | /* Even in case of data=writeback it is reasonable to pin |
899 | for (i = 0; i < data_offset_in_page; i++) | 1033 | * inode to transaction, to prevent unexpected data loss */ |
900 | bh = bh->b_this_page; | 1034 | *err = ext4_jbd2_file_inode(handle, orig_inode); |
901 | 1035 | ||
902 | for (i = 0; i < block_len_in_page; i++) { | 1036 | unlock_pages: |
903 | *err = ext4_get_block(orig_inode, | 1037 | unlock_page(pagep[0]); |
904 | (sector_t)(orig_blk_offset + i), bh, 0); | 1038 | page_cache_release(pagep[0]); |
905 | if (*err < 0) | 1039 | unlock_page(pagep[1]); |
906 | goto out; | 1040 | page_cache_release(pagep[1]); |
907 | 1041 | stop_journal: | |
908 | if (bh->b_this_page != NULL) | ||
909 | bh = bh->b_this_page; | ||
910 | } | ||
911 | |||
912 | *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, | ||
913 | page, fsdata); | ||
914 | page = NULL; | ||
915 | |||
916 | out: | ||
917 | if (unlikely(page)) { | ||
918 | if (PageLocked(page)) | ||
919 | unlock_page(page); | ||
920 | page_cache_release(page); | ||
921 | ext4_journal_stop(handle); | ||
922 | } | ||
923 | out2: | ||
924 | ext4_journal_stop(handle); | 1042 | ext4_journal_stop(handle); |
925 | 1043 | /* Buffer was busy because probably is pinned to journal transaction, | |
926 | if (err2) | 1044 | * force transaction commit may help to free it. */ |
927 | *err = err2; | 1045 | if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, |
928 | 1046 | &retries)) | |
1047 | goto again; | ||
929 | return replaced_count; | 1048 | return replaced_count; |
1049 | |||
1050 | repair_branches: | ||
1051 | /* | ||
1052 | * This should never ever happen! | ||
1053 | * Extents are swapped already, but we are not able to copy data. | ||
1054 | * Try to swap extents to their original places | ||
1055 | */ | ||
1056 | double_down_write_data_sem(orig_inode, donor_inode); | ||
1057 | replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, | ||
1058 | orig_blk_offset, | ||
1059 | block_len_in_page, &err2); | ||
1060 | double_up_write_data_sem(orig_inode, donor_inode); | ||
1061 | if (replaced_count != block_len_in_page) { | ||
1062 | EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), | ||
1063 | "Unable to copy data block," | ||
1064 | " data will be lost."); | ||
1065 | *err = -EIO; | ||
1066 | } | ||
1067 | replaced_count = 0; | ||
1068 | goto unlock_pages; | ||
930 | } | 1069 | } |
931 | 1070 | ||
932 | /** | 1071 | /** |
@@ -969,14 +1108,6 @@ mext_check_arguments(struct inode *orig_inode, | |||
969 | return -EINVAL; | 1108 | return -EINVAL; |
970 | } | 1109 | } |
971 | 1110 | ||
972 | /* Files should be in the same ext4 FS */ | ||
973 | if (orig_inode->i_sb != donor_inode->i_sb) { | ||
974 | ext4_debug("ext4 move extent: The argument files " | ||
975 | "should be in same FS [ino:orig %lu, donor %lu]\n", | ||
976 | orig_inode->i_ino, donor_inode->i_ino); | ||
977 | return -EINVAL; | ||
978 | } | ||
979 | |||
980 | /* Ext4 move extent supports only extent based file */ | 1111 | /* Ext4 move extent supports only extent based file */ |
981 | if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { | 1112 | if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { |
982 | ext4_debug("ext4 move extent: orig file is not extents " | 1113 | ext4_debug("ext4 move extent: orig file is not extents " |
@@ -1002,7 +1133,6 @@ mext_check_arguments(struct inode *orig_inode, | |||
1002 | } | 1133 | } |
1003 | 1134 | ||
1004 | if ((orig_start >= EXT_MAX_BLOCKS) || | 1135 | if ((orig_start >= EXT_MAX_BLOCKS) || |
1005 | (donor_start >= EXT_MAX_BLOCKS) || | ||
1006 | (*len > EXT_MAX_BLOCKS) || | 1136 | (*len > EXT_MAX_BLOCKS) || |
1007 | (orig_start + *len >= EXT_MAX_BLOCKS)) { | 1137 | (orig_start + *len >= EXT_MAX_BLOCKS)) { |
1008 | ext4_debug("ext4 move extent: Can't handle over [%u] blocks " | 1138 | ext4_debug("ext4 move extent: Can't handle over [%u] blocks " |
@@ -1072,35 +1202,19 @@ mext_check_arguments(struct inode *orig_inode, | |||
1072 | * @inode1: the inode structure | 1202 | * @inode1: the inode structure |
1073 | * @inode2: the inode structure | 1203 | * @inode2: the inode structure |
1074 | * | 1204 | * |
1075 | * Lock two inodes' i_mutex by i_ino order. | 1205 | * Lock two inodes' i_mutex |
1076 | * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. | ||
1077 | */ | 1206 | */ |
1078 | static int | 1207 | static void |
1079 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | 1208 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) |
1080 | { | 1209 | { |
1081 | int ret = 0; | 1210 | BUG_ON(inode1 == inode2); |
1082 | 1211 | if (inode1 < inode2) { | |
1083 | BUG_ON(inode1 == NULL && inode2 == NULL); | ||
1084 | |||
1085 | ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); | ||
1086 | if (ret < 0) | ||
1087 | goto out; | ||
1088 | |||
1089 | if (inode1 == inode2) { | ||
1090 | mutex_lock(&inode1->i_mutex); | ||
1091 | goto out; | ||
1092 | } | ||
1093 | |||
1094 | if (inode1->i_ino < inode2->i_ino) { | ||
1095 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); | 1212 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); |
1096 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); | 1213 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); |
1097 | } else { | 1214 | } else { |
1098 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); | 1215 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); |
1099 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); | 1216 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); |
1100 | } | 1217 | } |
1101 | |||
1102 | out: | ||
1103 | return ret; | ||
1104 | } | 1218 | } |
1105 | 1219 | ||
1106 | /** | 1220 | /** |
@@ -1109,28 +1223,13 @@ out: | |||
1109 | * @inode1: the inode that is released first | 1223 | * @inode1: the inode that is released first |
1110 | * @inode2: the inode that is released second | 1224 | * @inode2: the inode that is released second |
1111 | * | 1225 | * |
1112 | * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. | ||
1113 | */ | 1226 | */ |
1114 | 1227 | ||
1115 | static int | 1228 | static void |
1116 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) | 1229 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) |
1117 | { | 1230 | { |
1118 | int ret = 0; | 1231 | mutex_unlock(&inode1->i_mutex); |
1119 | 1232 | mutex_unlock(&inode2->i_mutex); | |
1120 | BUG_ON(inode1 == NULL && inode2 == NULL); | ||
1121 | |||
1122 | ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); | ||
1123 | if (ret < 0) | ||
1124 | goto out; | ||
1125 | |||
1126 | if (inode1) | ||
1127 | mutex_unlock(&inode1->i_mutex); | ||
1128 | |||
1129 | if (inode2 && inode2 != inode1) | ||
1130 | mutex_unlock(&inode2->i_mutex); | ||
1131 | |||
1132 | out: | ||
1133 | return ret; | ||
1134 | } | 1233 | } |
1135 | 1234 | ||
1136 | /** | 1235 | /** |
@@ -1187,16 +1286,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1187 | ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; | 1286 | ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; |
1188 | ext4_lblk_t rest_blocks; | 1287 | ext4_lblk_t rest_blocks; |
1189 | pgoff_t orig_page_offset = 0, seq_end_page; | 1288 | pgoff_t orig_page_offset = 0, seq_end_page; |
1190 | int ret1, ret2, depth, last_extent = 0; | 1289 | int ret, depth, last_extent = 0; |
1191 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; | 1290 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; |
1192 | int data_offset_in_page; | 1291 | int data_offset_in_page; |
1193 | int block_len_in_page; | 1292 | int block_len_in_page; |
1194 | int uninit; | 1293 | int uninit; |
1195 | 1294 | ||
1196 | /* orig and donor should be different file */ | 1295 | if (orig_inode->i_sb != donor_inode->i_sb) { |
1197 | if (orig_inode->i_ino == donor_inode->i_ino) { | 1296 | ext4_debug("ext4 move extent: The argument files " |
1297 | "should be in same FS [ino:orig %lu, donor %lu]\n", | ||
1298 | orig_inode->i_ino, donor_inode->i_ino); | ||
1299 | return -EINVAL; | ||
1300 | } | ||
1301 | |||
1302 | /* orig and donor should be different inodes */ | ||
1303 | if (orig_inode == donor_inode) { | ||
1198 | ext4_debug("ext4 move extent: The argument files should not " | 1304 | ext4_debug("ext4 move extent: The argument files should not " |
1199 | "be same file [ino:orig %lu, donor %lu]\n", | 1305 | "be same inode [ino:orig %lu, donor %lu]\n", |
1200 | orig_inode->i_ino, donor_inode->i_ino); | 1306 | orig_inode->i_ino, donor_inode->i_ino); |
1201 | return -EINVAL; | 1307 | return -EINVAL; |
1202 | } | 1308 | } |
@@ -1208,18 +1314,27 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1208 | orig_inode->i_ino, donor_inode->i_ino); | 1314 | orig_inode->i_ino, donor_inode->i_ino); |
1209 | return -EINVAL; | 1315 | return -EINVAL; |
1210 | } | 1316 | } |
1211 | 1317 | /* TODO: It is a non-obvious task to swap blocks for inodes with full |
1318 | journaling enabled */ | ||
1319 | if (ext4_should_journal_data(orig_inode) || | ||
1320 | ext4_should_journal_data(donor_inode)) { | ||
1321 | return -EINVAL; | ||
1322 | } | ||
1212 | /* Protect orig and donor inodes against a truncate */ | 1323 | /* Protect orig and donor inodes against a truncate */ |
1213 | ret1 = mext_inode_double_lock(orig_inode, donor_inode); | 1324 | mext_inode_double_lock(orig_inode, donor_inode); |
1214 | if (ret1 < 0) | 1325 | |
1215 | return ret1; | 1326 | /* Wait for all existing dio workers */ |
1327 | ext4_inode_block_unlocked_dio(orig_inode); | ||
1328 | ext4_inode_block_unlocked_dio(donor_inode); | ||
1329 | inode_dio_wait(orig_inode); | ||
1330 | inode_dio_wait(donor_inode); | ||
1216 | 1331 | ||
1217 | /* Protect extent tree against block allocations via delalloc */ | 1332 | /* Protect extent tree against block allocations via delalloc */ |
1218 | double_down_write_data_sem(orig_inode, donor_inode); | 1333 | double_down_write_data_sem(orig_inode, donor_inode); |
1219 | /* Check the filesystem environment whether move_extent can be done */ | 1334 | /* Check the filesystem environment whether move_extent can be done */ |
1220 | ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, | 1335 | ret = mext_check_arguments(orig_inode, donor_inode, orig_start, |
1221 | donor_start, &len); | 1336 | donor_start, &len); |
1222 | if (ret1) | 1337 | if (ret) |
1223 | goto out; | 1338 | goto out; |
1224 | 1339 | ||
1225 | file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; | 1340 | file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; |
@@ -1227,13 +1342,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1227 | if (file_end < block_end) | 1342 | if (file_end < block_end) |
1228 | len -= block_end - file_end; | 1343 | len -= block_end - file_end; |
1229 | 1344 | ||
1230 | ret1 = get_ext_path(orig_inode, block_start, &orig_path); | 1345 | ret = get_ext_path(orig_inode, block_start, &orig_path); |
1231 | if (ret1) | 1346 | if (ret) |
1232 | goto out; | 1347 | goto out; |
1233 | 1348 | ||
1234 | /* Get path structure to check the hole */ | 1349 | /* Get path structure to check the hole */ |
1235 | ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); | 1350 | ret = get_ext_path(orig_inode, block_start, &holecheck_path); |
1236 | if (ret1) | 1351 | if (ret) |
1237 | goto out; | 1352 | goto out; |
1238 | 1353 | ||
1239 | depth = ext_depth(orig_inode); | 1354 | depth = ext_depth(orig_inode); |
@@ -1252,13 +1367,13 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1252 | last_extent = mext_next_extent(orig_inode, | 1367 | last_extent = mext_next_extent(orig_inode, |
1253 | holecheck_path, &ext_cur); | 1368 | holecheck_path, &ext_cur); |
1254 | if (last_extent < 0) { | 1369 | if (last_extent < 0) { |
1255 | ret1 = last_extent; | 1370 | ret = last_extent; |
1256 | goto out; | 1371 | goto out; |
1257 | } | 1372 | } |
1258 | last_extent = mext_next_extent(orig_inode, orig_path, | 1373 | last_extent = mext_next_extent(orig_inode, orig_path, |
1259 | &ext_dummy); | 1374 | &ext_dummy); |
1260 | if (last_extent < 0) { | 1375 | if (last_extent < 0) { |
1261 | ret1 = last_extent; | 1376 | ret = last_extent; |
1262 | goto out; | 1377 | goto out; |
1263 | } | 1378 | } |
1264 | seq_start = le32_to_cpu(ext_cur->ee_block); | 1379 | seq_start = le32_to_cpu(ext_cur->ee_block); |
@@ -1272,7 +1387,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1272 | if (le32_to_cpu(ext_cur->ee_block) > block_end) { | 1387 | if (le32_to_cpu(ext_cur->ee_block) > block_end) { |
1273 | ext4_debug("ext4 move extent: The specified range of file " | 1388 | ext4_debug("ext4 move extent: The specified range of file " |
1274 | "may be the hole\n"); | 1389 | "may be the hole\n"); |
1275 | ret1 = -EINVAL; | 1390 | ret = -EINVAL; |
1276 | goto out; | 1391 | goto out; |
1277 | } | 1392 | } |
1278 | 1393 | ||
@@ -1292,7 +1407,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1292 | last_extent = mext_next_extent(orig_inode, holecheck_path, | 1407 | last_extent = mext_next_extent(orig_inode, holecheck_path, |
1293 | &ext_cur); | 1408 | &ext_cur); |
1294 | if (last_extent < 0) { | 1409 | if (last_extent < 0) { |
1295 | ret1 = last_extent; | 1410 | ret = last_extent; |
1296 | break; | 1411 | break; |
1297 | } | 1412 | } |
1298 | add_blocks = ext4_ext_get_actual_len(ext_cur); | 1413 | add_blocks = ext4_ext_get_actual_len(ext_cur); |
@@ -1349,18 +1464,18 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1349 | orig_page_offset, | 1464 | orig_page_offset, |
1350 | data_offset_in_page, | 1465 | data_offset_in_page, |
1351 | block_len_in_page, uninit, | 1466 | block_len_in_page, uninit, |
1352 | &ret1); | 1467 | &ret); |
1353 | 1468 | ||
1354 | /* Count how many blocks we have exchanged */ | 1469 | /* Count how many blocks we have exchanged */ |
1355 | *moved_len += block_len_in_page; | 1470 | *moved_len += block_len_in_page; |
1356 | if (ret1 < 0) | 1471 | if (ret < 0) |
1357 | break; | 1472 | break; |
1358 | if (*moved_len > len) { | 1473 | if (*moved_len > len) { |
1359 | EXT4_ERROR_INODE(orig_inode, | 1474 | EXT4_ERROR_INODE(orig_inode, |
1360 | "We replaced blocks too much! " | 1475 | "We replaced blocks too much! " |
1361 | "sum of replaced: %llu requested: %llu", | 1476 | "sum of replaced: %llu requested: %llu", |
1362 | *moved_len, len); | 1477 | *moved_len, len); |
1363 | ret1 = -EIO; | 1478 | ret = -EIO; |
1364 | break; | 1479 | break; |
1365 | } | 1480 | } |
1366 | 1481 | ||
@@ -1374,22 +1489,22 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1374 | } | 1489 | } |
1375 | 1490 | ||
1376 | double_down_write_data_sem(orig_inode, donor_inode); | 1491 | double_down_write_data_sem(orig_inode, donor_inode); |
1377 | if (ret1 < 0) | 1492 | if (ret < 0) |
1378 | break; | 1493 | break; |
1379 | 1494 | ||
1380 | /* Decrease buffer counter */ | 1495 | /* Decrease buffer counter */ |
1381 | if (holecheck_path) | 1496 | if (holecheck_path) |
1382 | ext4_ext_drop_refs(holecheck_path); | 1497 | ext4_ext_drop_refs(holecheck_path); |
1383 | ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); | 1498 | ret = get_ext_path(orig_inode, seq_start, &holecheck_path); |
1384 | if (ret1) | 1499 | if (ret) |
1385 | break; | 1500 | break; |
1386 | depth = holecheck_path->p_depth; | 1501 | depth = holecheck_path->p_depth; |
1387 | 1502 | ||
1388 | /* Decrease buffer counter */ | 1503 | /* Decrease buffer counter */ |
1389 | if (orig_path) | 1504 | if (orig_path) |
1390 | ext4_ext_drop_refs(orig_path); | 1505 | ext4_ext_drop_refs(orig_path); |
1391 | ret1 = get_ext_path(orig_inode, seq_start, &orig_path); | 1506 | ret = get_ext_path(orig_inode, seq_start, &orig_path); |
1392 | if (ret1) | 1507 | if (ret) |
1393 | break; | 1508 | break; |
1394 | 1509 | ||
1395 | ext_cur = holecheck_path[depth].p_ext; | 1510 | ext_cur = holecheck_path[depth].p_ext; |
@@ -1412,12 +1527,9 @@ out: | |||
1412 | kfree(holecheck_path); | 1527 | kfree(holecheck_path); |
1413 | } | 1528 | } |
1414 | double_up_write_data_sem(orig_inode, donor_inode); | 1529 | double_up_write_data_sem(orig_inode, donor_inode); |
1415 | ret2 = mext_inode_double_unlock(orig_inode, donor_inode); | 1530 | ext4_inode_resume_unlocked_dio(orig_inode); |
1416 | 1531 | ext4_inode_resume_unlocked_dio(donor_inode); | |
1417 | if (ret1) | 1532 | mext_inode_double_unlock(orig_inode, donor_inode); |
1418 | return ret1; | ||
1419 | else if (ret2) | ||
1420 | return ret2; | ||
1421 | 1533 | ||
1422 | return 0; | 1534 | return ret; |
1423 | } | 1535 | } |