path: root/fs/buffer.c
author     Nick Piggin <npiggin@suse.de>                          2007-10-16 04:25:01 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-10-16 12:42:55 -0400
commit     afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch)
tree       4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /fs/buffer.c
parent     637aff46f94a754207c80c8c64bf1b74f24b967d (diff)
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
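For context, the pattern a filesystem follows to adopt the new hooks is small: its
->write_begin can simply forward to the block_write_begin() helper added in this patch,
and its ->write_end can be generic_write_end(). The sketch below is illustrative only and
is not part of the patch; the myfs_* names and myfs_get_block are hypothetical stand-ins
for a real filesystem's callbacks.

    #include <linux/fs.h>
    #include <linux/buffer_head.h>

    /*
     * Illustrative sketch only (not from this patch): minimal adoption of
     * the new write_begin/write_end aops by a block-based filesystem.
     * "myfs_get_block" is a hypothetical get_block_t implementation.
     */
    static int myfs_write_begin(struct file *file, struct address_space *mapping,
    			loff_t pos, unsigned len, unsigned flags,
    			struct page **pagep, void **fsdata)
    {
    	*pagep = NULL;	/* let block_write_begin() find and lock the page */
    	return block_write_begin(file, mapping, pos, len, flags,
    				 pagep, fsdata, myfs_get_block);
    }

    static const struct address_space_operations myfs_aops = {
    	/* readpage, writepage, etc. stay as before */
    	.write_begin	= myfs_write_begin,
    	.write_end	= generic_write_end,
    };

On a short copy from userspace, block_write_end() treats a write into a page that is not
uptodate as zero-length (see the comment in the new code below), so the generic write path
retries the remainder instead of exposing uninitialised data.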
Diffstat (limited to 'fs/buffer.c')
-rw-r--r--	fs/buffer.c	201
1 file changed, 169 insertions(+), 32 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover:
 	goto done;
 }
 
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, size;
+
+					start = max(from, block_start);
+					size = min(to, block_end) - start;
+
+					zero_user_page(page, start, size, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(page_zero_new_buffers);
+
 static int __block_prepare_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to, get_block_t *get_block)
 {
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (!err) {
-		bh = head;
-		do {
-			if (buffer_new(bh))
-				clear_buffer_new(bh);
-		} while ((bh = bh->b_this_page) != head);
-		return 0;
-	}
-	/* Error case: */
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale
-	 * data. If BH_New is set, we know that the block was newly
-	 * allocated in the above loop.
-	 */
-	bh = head;
-	block_start = 0;
-	do {
-		block_end = block_start+blocksize;
-		if (block_end <= from)
-			goto next_bh;
-		if (block_start >= to)
-			break;
-		if (buffer_new(bh)) {
-			clear_buffer_new(bh);
-			zero_user_page(page, block_start, bh->b_size, KM_USER0);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-		}
-next_bh:
-		block_start = block_end;
-		bh = bh->b_this_page;
-	} while (bh != head);
+	if (unlikely(err))
+		page_zero_new_buffers(page, from, to);
 	return err;
 }
 
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
+		clear_buffer_new(bh);
 	}
 
 	/*
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 
 /*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	struct inode *inode = mapping->host;
+	int status = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned start, end;
+	int ownpage = 0;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + len;
+
+	page = *pagep;
+	if (page == NULL) {
+		ownpage = 1;
+		page = __grab_cache_page(mapping, index);
+		if (!page) {
+			status = -ENOMEM;
+			goto out;
+		}
+		*pagep = page;
+	} else
+		BUG_ON(!PageLocked(page));
+
+	status = __block_prepare_write(inode, page, start, end, get_block);
+	if (unlikely(status)) {
+		ClearPageUptodate(page);
+
+		if (ownpage) {
+			unlock_page(page);
+			page_cache_release(page);
+			*pagep = NULL;
+
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size. Trim these off again. Don't need
+			 * i_size_read because we hold i_mutex.
+			 */
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		}
+		goto out;
+	}
+
+out:
+	return status;
+}
+EXPORT_SYMBOL(block_write_begin);
+
+int block_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (unlikely(copied < len)) {
+		/*
+		 * The buffers that were written will now be uptodate, so we
+		 * don't have to worry about a readpage reading them and
+		 * overwriting a partial write. However if we have encountered
+		 * a short write and only partially written into a buffer, it
+		 * will not be marked uptodate, so a readpage might come in and
+		 * destroy our partial write.
+		 *
+		 * Do the simplest thing, and just treat any short write to a
+		 * non uptodate page as a zero-length write, and force the
+		 * caller to redo the whole thing.
+		 */
+		if (!PageUptodate(page))
+			copied = 0;
+
+		page_zero_new_buffers(page, start+copied, start+len);
+	}
+	flush_dcache_page(page);
+
+	/* This could be a short (even 0-length) commit */
+	__block_commit_write(inode, page, start, start+copied);
+
+	return copied;
+}
+EXPORT_SYMBOL(block_write_end);
+
+int generic_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		mark_inode_dirty(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+EXPORT_SYMBOL(generic_write_end);
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and