aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2011-02-28 09:52:08 -0500
committer Chris Mason <chris.mason@oracle.com> 2011-03-07 10:42:27 -0500
commit b1bf862e9dad431175a1174379476299dbfdc017 (patch)
tree a53811b2112fbe2a0106e3ac051fffc63872d317
parent ec29ed5b407d618a8128f5942aade9e1758aa14b (diff)
Btrfs: fix regressions in copy_from_user handling
Commit 914ee295af418e936ec20a08c1663eaabe4cd07a fixed deadlocks in btrfs_file_write where we would catch page faults on pages we had locked. But there were a few problems: 1) The x86-32 iov_iter_copy_from_user_atomic code always fails to copy data when the amount to copy is more than 4K and the offset to start copying from is not page aligned. The result was btrfs_file_write looping forever, retrying the iov_iter_copy_from_user_atomic. We deal with this by changing btrfs_file_write to drop down to single page copies when iov_iter_copy_from_user_atomic starts returning failure. 2) The btrfs_file_write code was leaking delalloc reservations when iov_iter_copy_from_user_atomic returned zero. The looping above would result in the entire filesystem running out of delalloc reservations and constantly trying to flush things to disk. 3) btrfs_file_write will lock down page cache pages, make sure any writeback is finished, do the copy_from_user and then release them. Before the loop runs we check the first and last pages in the write to see if they are only being partially modified. If the start or end of the write isn't aligned, we make sure the corresponding pages are up to date so that we don't introduce garbage into the file. With the copy_from_user changes, we're allowing the VM to reclaim the pages after a partial update from copy_from_user, but we're not making sure the page cache page is up to date when we loop around to resume the write. We deal with this by pushing the up to date checks down into the page prep code. This fits better with how the rest of file_write works. Signed-off-by: Chris Mason <chris.mason@oracle.com>; Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>; cc: stable@kernel.org
-rw-r--r-- fs/btrfs/file.c 101
1 files changed, 59 insertions, 42 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 65338a1d14ad..13664b315fe2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -762,6 +762,27 @@ out:
762} 762}
763 763
764/* 764/*
765 * on error we return an unlocked page and the error value
766 * on success we return a locked page and 0
767 */
768static int prepare_uptodate_page(struct page *page, u64 pos) /* bring a partially written page up to date before it is modified */
769{
770 int ret = 0;
771
772 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { /* only needed when pos is not page aligned and the page is not already up to date */
773 ret = btrfs_readpage(NULL, page); /* read the existing contents so the untouched part of the page is not garbage */
774 if (ret)
775 return ret; /* error path: per the contract above the page is returned unlocked -- presumably btrfs_readpage dropped the lock; TODO confirm */
776 lock_page(page); /* take the page lock again; success must return a locked page */
777 if (!PageUptodate(page)) { /* still not up to date after the read: treat as an I/O error */
778 unlock_page(page); /* honour the "unlocked page on error" contract */
779 return -EIO;
780 }
781 }
782 return 0; /* page is locked; no read was needed or the read succeeded */
783}
784
785/*
765 * this gets pages into the page cache and locks them down, it also properly 786 * this gets pages into the page cache and locks them down, it also properly
766 * waits for data=ordered extents to finish before allowing the pages to be 787 * waits for data=ordered extents to finish before allowing the pages to be
767 * modified. 788 * modified.
@@ -776,6 +797,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776 unsigned long index = pos >> PAGE_CACHE_SHIFT; 797 unsigned long index = pos >> PAGE_CACHE_SHIFT;
777 struct inode *inode = fdentry(file)->d_inode; 798 struct inode *inode = fdentry(file)->d_inode;
778 int err = 0; 799 int err = 0;
800 int faili = 0;
779 u64 start_pos; 801 u64 start_pos;
780 u64 last_pos; 802 u64 last_pos;
781 803
@@ -793,15 +815,24 @@ again:
793 for (i = 0; i < num_pages; i++) { 815 for (i = 0; i < num_pages; i++) {
794 pages[i] = grab_cache_page(inode->i_mapping, index + i); 816 pages[i] = grab_cache_page(inode->i_mapping, index + i);
795 if (!pages[i]) { 817 if (!pages[i]) {
796 int c; 818 faili = i - 1;
797 for (c = i - 1; c >= 0; c--) { 819 err = -ENOMEM;
798 unlock_page(pages[c]); 820 goto fail;
799 page_cache_release(pages[c]); 821 }
800 } 822
801 return -ENOMEM; 823 if (i == 0)
824 err = prepare_uptodate_page(pages[i], pos);
825 if (i == num_pages - 1)
826 err = prepare_uptodate_page(pages[i],
827 pos + write_bytes);
828 if (err) {
829 page_cache_release(pages[i]);
830 faili = i - 1;
831 goto fail;
802 } 832 }
803 wait_on_page_writeback(pages[i]); 833 wait_on_page_writeback(pages[i]);
804 } 834 }
835 err = 0;
805 if (start_pos < inode->i_size) { 836 if (start_pos < inode->i_size) {
806 struct btrfs_ordered_extent *ordered; 837 struct btrfs_ordered_extent *ordered;
807 lock_extent_bits(&BTRFS_I(inode)->io_tree, 838 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -841,6 +872,14 @@ again:
841 WARN_ON(!PageLocked(pages[i])); 872 WARN_ON(!PageLocked(pages[i]));
842 } 873 }
843 return 0; 874 return 0;
875fail:
876 while (faili >= 0) {
877 unlock_page(pages[faili]);
878 page_cache_release(pages[faili]);
879 faili--;
880 }
881 return err;
882
844} 883}
845 884
846static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 885static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -850,7 +889,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
850 struct file *file = iocb->ki_filp; 889 struct file *file = iocb->ki_filp;
851 struct inode *inode = fdentry(file)->d_inode; 890 struct inode *inode = fdentry(file)->d_inode;
852 struct btrfs_root *root = BTRFS_I(inode)->root; 891 struct btrfs_root *root = BTRFS_I(inode)->root;
853 struct page *pinned[2];
854 struct page **pages = NULL; 892 struct page **pages = NULL;
855 struct iov_iter i; 893 struct iov_iter i;
856 loff_t *ppos = &iocb->ki_pos; 894 loff_t *ppos = &iocb->ki_pos;
@@ -871,9 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
871 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 909 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
872 (file->f_flags & O_DIRECT)); 910 (file->f_flags & O_DIRECT));
873 911
874 pinned[0] = NULL;
875 pinned[1] = NULL;
876
877 start_pos = pos; 912 start_pos = pos;
878 913
879 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 914 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -961,32 +996,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
961 first_index = pos >> PAGE_CACHE_SHIFT; 996 first_index = pos >> PAGE_CACHE_SHIFT;
962 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 997 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
963 998
964 /*
965 * there are lots of better ways to do this, but this code
966 * makes sure the first and last page in the file range are
967 * up to date and ready for cow
968 */
969 if ((pos & (PAGE_CACHE_SIZE - 1))) {
970 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
971 if (!PageUptodate(pinned[0])) {
972 ret = btrfs_readpage(NULL, pinned[0]);
973 BUG_ON(ret);
974 wait_on_page_locked(pinned[0]);
975 } else {
976 unlock_page(pinned[0]);
977 }
978 }
979 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
980 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
981 if (!PageUptodate(pinned[1])) {
982 ret = btrfs_readpage(NULL, pinned[1]);
983 BUG_ON(ret);
984 wait_on_page_locked(pinned[1]);
985 } else {
986 unlock_page(pinned[1]);
987 }
988 }
989
990 while (iov_iter_count(&i) > 0) { 999 while (iov_iter_count(&i) > 0) {
991 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1000 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
992 size_t write_bytes = min(iov_iter_count(&i), 1001 size_t write_bytes = min(iov_iter_count(&i),
@@ -1023,8 +1032,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1023 1032
1024 copied = btrfs_copy_from_user(pos, num_pages, 1033 copied = btrfs_copy_from_user(pos, num_pages,
1025 write_bytes, pages, &i); 1034 write_bytes, pages, &i);
1026 dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> 1035
1027 PAGE_CACHE_SHIFT; 1036 /*
1037 * if we have trouble faulting in the pages, fall
1038 * back to one page at a time
1039 */
1040 if (copied < write_bytes)
1041 nrptrs = 1;
1042
1043 if (copied == 0)
1044 dirty_pages = 0;
1045 else
1046 dirty_pages = (copied + offset +
1047 PAGE_CACHE_SIZE - 1) >>
1048 PAGE_CACHE_SHIFT;
1028 1049
1029 if (num_pages > dirty_pages) { 1050 if (num_pages > dirty_pages) {
1030 if (copied > 0) 1051 if (copied > 0)
@@ -1068,10 +1089,6 @@ out:
1068 err = ret; 1089 err = ret;
1069 1090
1070 kfree(pages); 1091 kfree(pages);
1071 if (pinned[0])
1072 page_cache_release(pinned[0]);
1073 if (pinned[1])
1074 page_cache_release(pinned[1]);
1075 *ppos = pos; 1092 *ppos = pos;
1076 1093
1077 /* 1094 /*