Btrfs: fix regressions in copy_from_user handling

Commit 914ee295af418e936ec20a08c1663eaabe4cd07a fixed deadlocks in btrfs_file_write where we would catch page faults on pages we had locked. But, there were a few problems: 1) The x86-32 iov_iter_copy_from_user_atomic code always fails to copy data when the amount to copy is more than 4K and the offset to start copying from is not page aligned. The result was btrfs_file_write looping forever retrying the iov_iter_copy_from_user_atomic. We deal with this by changing btrfs_file_write to drop down to single page copies when iov_iter_copy_from_user_atomic starts returning failure. 2) The btrfs_file_write code was leaking delalloc reservations when iov_iter_copy_from_user_atomic returned zero. The looping above would result in the entire filesystem running out of delalloc reservations and constantly trying to flush things to disk. 3) btrfs_file_write will lock down page cache pages, make sure any writeback is finished, do the copy_from_user and then release them. Before the loop runs we check the first and last pages in the write to see if they are only being partially modified. If the start or end of the write isn't aligned, we make sure the corresponding pages are up to date so that we don't introduce garbage into the file. With the copy_from_user changes, we're allowing the VM to reclaim the pages after a partial update from copy_from_user, but we're not making sure the page cache page is up to date when we loop around to resume the write. We deal with this by pushing the up to date checks down into the page prep code. This fits better with how the rest of file_write works. Signed-off-by: Chris Mason <chris.mason@oracle.com> Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org> cc: stable@kernel.org
author: Chris Mason <chris.mason@oracle.com> 2011-02-28 09:52:08 -0500
committer: Chris Mason <chris.mason@oracle.com> 2011-03-07 10:42:27 -0500
commit: b1bf862e9dad431175a1174379476299dbfdc017 (patch)
tree: a53811b2112fbe2a0106e3ac051fffc63872d317 /fs
parent: ec29ed5b407d618a8128f5942aade9e1758aa14b (diff)
1 files changed, 59 insertions, 42 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 65338a1d14ad..13664b315fe2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -762,6 +762,27 @@ out:
 }
 /*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos)
+{
+        int ret = 0;
+        if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+                ret = btrfs_readpage(NULL, page);
+                if (ret)
+                        return ret;
+                lock_page(page);
+                if (!PageUptodate(page)) {
+                        unlock_page(page);
+                        return -EIO;
+                }
+        }
+        return 0;
+}
+/*
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
@@ -776,6 +797,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
        unsigned long index = pos >> PAGE_CACHE_SHIFT;
        struct inode *inode = fdentry(file)->d_inode;
        int err = 0;
+        int faili = 0;
        u64 start_pos;
        u64 last_pos;
@@ -793,15 +815,24 @@ again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = grab_cache_page(inode->i_mapping, index + i);
                if (!pages[i]) {
-                        int c;
+                        faili = i - 1;
-                        for (c = i - 1; c >= 0; c--) {
+                        err = -ENOMEM;
-                                unlock_page(pages[c]);
+                        goto fail;
-                                page_cache_release(pages[c]);
+                }
-                        }
-                        return -ENOMEM;
+                if (i == 0)
+                        err = prepare_uptodate_page(pages[i], pos);
+                if (i == num_pages - 1)
+                        err = prepare_uptodate_page(pages[i],
+                                                    pos + write_bytes);
+                if (err) {
+                        page_cache_release(pages[i]);
+                        faili = i - 1;
+                        goto fail;
                }
                wait_on_page_writeback(pages[i]);
        }
+        err = 0;
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -841,6 +872,14 @@ again:
                WARN_ON(!PageLocked(pages[i]));
        }
        return 0;
+fail:
+        while (faili >= 0) {
+                unlock_page(pages[faili]);
+                page_cache_release(pages[faili]);
+                faili--;
+        }
+        return err;
 }
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -850,7 +889,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = fdentry(file)->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct page *pinned[2];
        struct page **pages = NULL;
        struct iov_iter i;
        loff_t *ppos = &iocb->ki_pos;
@@ -871,9 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
                      (file->f_flags & O_DIRECT));
-        pinned[0] = NULL;
-        pinned[1] = NULL;
        start_pos = pos;
        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -961,32 +996,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        first_index = pos >> PAGE_CACHE_SHIFT;
        last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
-        /*
-         * there are lots of better ways to do this, but this code
-         * makes sure the first and last page in the file range are
-         * up to date and ready for cow
-         */
-        if ((pos & (PAGE_CACHE_SIZE - 1))) {
-                pinned[0] = grab_cache_page(inode->i_mapping, first_index);
-                if (!PageUptodate(pinned[0])) {
-                        ret = btrfs_readpage(NULL, pinned[0]);
-                        BUG_ON(ret);
-                        wait_on_page_locked(pinned[0]);
-                } else {
-                        unlock_page(pinned[0]);
-                }
-        }
-        if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
-                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
-                if (!PageUptodate(pinned[1])) {
-                        ret = btrfs_readpage(NULL, pinned[1]);
-                        BUG_ON(ret);
-                        wait_on_page_locked(pinned[1]);
-                } else {
-                        unlock_page(pinned[1]);
-                }
-        }
        while (iov_iter_count(&i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
                size_t write_bytes = min(iov_iter_count(&i),
@@ -1023,8 +1032,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                copied = btrfs_copy_from_user(pos, num_pages,
                                           write_bytes, pages, &i);
-                dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
-                                PAGE_CACHE_SHIFT;
+                /*
+                 * if we have trouble faulting in the pages, fall
+                 * back to one page at a time
+                 */
+                if (copied < write_bytes)
+                        nrptrs = 1;
+                if (copied == 0)
+                        dirty_pages = 0;
+                else
+                        dirty_pages = (copied + offset +
+                                       PAGE_CACHE_SIZE - 1) >>
+                                       PAGE_CACHE_SHIFT;
                if (num_pages > dirty_pages) {
                        if (copied > 0)
@@ -1068,10 +1089,6 @@ out:
                err = ret;
        kfree(pages);
-        if (pinned[0])
-                page_cache_release(pinned[0]);
-        if (pinned[1])
-                page_cache_release(pinned[1]);
        *ppos = pos;
        /*
author	Chris Mason <chris.mason@oracle.com>	2011-02-28 09:52:08 -0500
committer	Chris Mason <chris.mason@oracle.com>	2011-03-07 10:42:27 -0500
commit	b1bf862e9dad431175a1174379476299dbfdc017 (patch)
tree	a53811b2112fbe2a0106e3ac051fffc63872d317 /fs
parent	ec29ed5b407d618a8128f5942aade9e1758aa14b (diff)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 65338a1d14ad..13664b315fe2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -762,6 +762,27 @@ out:
762	}	762	}
763		763
764	/*	764	/*
		765	* on error we return an unlocked page and the error value
		766	* on success we return a locked page and 0
		767	*/
		768	static int prepare_uptodate_page(struct page *page, u64 pos)
		769	{
		770	int ret = 0;
		771
		772	if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
		773	ret = btrfs_readpage(NULL, page);
		774	if (ret)
		775	return ret;
		776	lock_page(page);
		777	if (!PageUptodate(page)) {
		778	unlock_page(page);
		779	return -EIO;
		780	}
		781	}
		782	return 0;
		783	}
		784
		785	/*
765	* this gets pages into the page cache and locks them down, it also properly	786	* this gets pages into the page cache and locks them down, it also properly
766	* waits for data=ordered extents to finish before allowing the pages to be	787	* waits for data=ordered extents to finish before allowing the pages to be
767	* modified.	788	* modified.
@@ -776,6 +797,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776	unsigned long index = pos >> PAGE_CACHE_SHIFT;	797	unsigned long index = pos >> PAGE_CACHE_SHIFT;
777	struct inode *inode = fdentry(file)->d_inode;	798	struct inode *inode = fdentry(file)->d_inode;
778	int err = 0;	799	int err = 0;
		800	int faili = 0;
779	u64 start_pos;	801	u64 start_pos;
780	u64 last_pos;	802	u64 last_pos;
781		803
@@ -793,15 +815,24 @@ again:
793	for (i = 0; i < num_pages; i++) {	815	for (i = 0; i < num_pages; i++) {
794	pages[i] = grab_cache_page(inode->i_mapping, index + i);	816	pages[i] = grab_cache_page(inode->i_mapping, index + i);
795	if (!pages[i]) {	817	if (!pages[i]) {
796	int c;	818	faili = i - 1;
797	for (c = i - 1; c >= 0; c--) {	819	err = -ENOMEM;
798	unlock_page(pages[c]);	820	goto fail;
799	page_cache_release(pages[c]);	821	}
800	}	822
801	return -ENOMEM;	823	if (i == 0)
		824	err = prepare_uptodate_page(pages[i], pos);
		825	if (i == num_pages - 1)
		826	err = prepare_uptodate_page(pages[i],
		827	pos + write_bytes);
		828	if (err) {
		829	page_cache_release(pages[i]);
		830	faili = i - 1;
		831	goto fail;
802	}	832	}
803	wait_on_page_writeback(pages[i]);	833	wait_on_page_writeback(pages[i]);
804	}	834	}
		835	err = 0;
805	if (start_pos < inode->i_size) {	836	if (start_pos < inode->i_size) {
806	struct btrfs_ordered_extent *ordered;	837	struct btrfs_ordered_extent *ordered;
807	lock_extent_bits(&BTRFS_I(inode)->io_tree,	838	lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -841,6 +872,14 @@ again:
841	WARN_ON(!PageLocked(pages[i]));	872	WARN_ON(!PageLocked(pages[i]));
842	}	873	}
843	return 0;	874	return 0;
		875	fail:
		876	while (faili >= 0) {
		877	unlock_page(pages[faili]);
		878	page_cache_release(pages[faili]);
		879	faili--;
		880	}
		881	return err;
		882
844	}	883	}
845		884
846	static ssize_t btrfs_file_aio_write(struct kiocb *iocb,	885	static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -850,7 +889,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
850	struct file *file = iocb->ki_filp;	889	struct file *file = iocb->ki_filp;
851	struct inode *inode = fdentry(file)->d_inode;	890	struct inode *inode = fdentry(file)->d_inode;
852	struct btrfs_root *root = BTRFS_I(inode)->root;	891	struct btrfs_root *root = BTRFS_I(inode)->root;
853	struct page *pinned[2];
854	struct page **pages = NULL;	892	struct page **pages = NULL;
855	struct iov_iter i;	893	struct iov_iter i;
856	loff_t *ppos = &iocb->ki_pos;	894	loff_t *ppos = &iocb->ki_pos;
@@ -871,9 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
871	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||	909	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
872	(file->f_flags & O_DIRECT));	910	(file->f_flags & O_DIRECT));
873		911
874	pinned[0] = NULL;
875	pinned[1] = NULL;
876
877	start_pos = pos;	912	start_pos = pos;
878		913
879	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);	914	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -961,32 +996,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
961	first_index = pos >> PAGE_CACHE_SHIFT;	996	first_index = pos >> PAGE_CACHE_SHIFT;
962	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;	997	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
963		998
964	/*
965	* there are lots of better ways to do this, but this code
966	* makes sure the first and last page in the file range are
967	* up to date and ready for cow
968	*/
969	if ((pos & (PAGE_CACHE_SIZE - 1))) {
970	pinned[0] = grab_cache_page(inode->i_mapping, first_index);
971	if (!PageUptodate(pinned[0])) {
972	ret = btrfs_readpage(NULL, pinned[0]);
973	BUG_ON(ret);
974	wait_on_page_locked(pinned[0]);
975	} else {
976	unlock_page(pinned[0]);
977	}
978	}
979	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
980	pinned[1] = grab_cache_page(inode->i_mapping, last_index);
981	if (!PageUptodate(pinned[1])) {
982	ret = btrfs_readpage(NULL, pinned[1]);
983	BUG_ON(ret);
984	wait_on_page_locked(pinned[1]);
985	} else {
986	unlock_page(pinned[1]);
987	}
988	}
989
990	while (iov_iter_count(&i) > 0) {	999	while (iov_iter_count(&i) > 0) {
991	size_t offset = pos & (PAGE_CACHE_SIZE - 1);	1000	size_t offset = pos & (PAGE_CACHE_SIZE - 1);
992	size_t write_bytes = min(iov_iter_count(&i),	1001	size_t write_bytes = min(iov_iter_count(&i),
@@ -1023,8 +1032,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1023		1032
1024	copied = btrfs_copy_from_user(pos, num_pages,	1033	copied = btrfs_copy_from_user(pos, num_pages,
1025	write_bytes, pages, &i);	1034	write_bytes, pages, &i);
1026	dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>	1035
1027	PAGE_CACHE_SHIFT;	1036	/*
		1037	* if we have trouble faulting in the pages, fall
		1038	* back to one page at a time
		1039	*/
		1040	if (copied < write_bytes)
		1041	nrptrs = 1;
		1042
		1043	if (copied == 0)
		1044	dirty_pages = 0;
		1045	else
		1046	dirty_pages = (copied + offset +
		1047	PAGE_CACHE_SIZE - 1) >>
		1048	PAGE_CACHE_SHIFT;
1028		1049
1029	if (num_pages > dirty_pages) {	1050	if (num_pages > dirty_pages) {
1030	if (copied > 0)	1051	if (copied > 0)
@@ -1068,10 +1089,6 @@ out:
1068	err = ret;	1089	err = ret;
1069		1090
1070	kfree(pages);	1091	kfree(pages);
1071	if (pinned[0])
1072	page_cache_release(pinned[0]);
1073	if (pinned[1])
1074	page_cache_release(pinned[1]);
1075	*ppos = pos;	1092	*ppos = pos;
1076		1093
1077	/*	1094	/*