diff options
author | Nick Piggin <npiggin@suse.de> | 2007-10-16 04:25:01 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-16 12:42:55 -0400 |
commit | afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch) | |
tree | 4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /fs/buffer.c | |
parent | 637aff46f94a754207c80c8c64bf1b74f24b967d (diff) |
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).
[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/buffer.c')
-rw-r--r-- | fs/buffer.c | 201 |
1 file changed, 169 insertions, 32 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover: | |||
1770 | goto done; | 1770 | goto done; |
1771 | } | 1771 | } |
1772 | 1772 | ||
1773 | /* | ||
1774 | * If a page has any new buffers, zero them out here, and mark them uptodate | ||
1775 | * and dirty so they'll be written out (in order to prevent uninitialised | ||
1776 | * block data from leaking). And clear the new bit. | ||
1777 | */ | ||
1778 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) | ||
1779 | { | ||
1780 | unsigned int block_start, block_end; | ||
1781 | struct buffer_head *head, *bh; | ||
1782 | |||
1783 | BUG_ON(!PageLocked(page)); | ||
1784 | if (!page_has_buffers(page)) | ||
1785 | return; | ||
1786 | |||
1787 | bh = head = page_buffers(page); | ||
1788 | block_start = 0; | ||
1789 | do { | ||
1790 | block_end = block_start + bh->b_size; | ||
1791 | |||
1792 | if (buffer_new(bh)) { | ||
1793 | if (block_end > from && block_start < to) { | ||
1794 | if (!PageUptodate(page)) { | ||
1795 | unsigned start, size; | ||
1796 | |||
1797 | start = max(from, block_start); | ||
1798 | size = min(to, block_end) - start; | ||
1799 | |||
1800 | zero_user_page(page, start, size, KM_USER0); | ||
1801 | set_buffer_uptodate(bh); | ||
1802 | } | ||
1803 | |||
1804 | clear_buffer_new(bh); | ||
1805 | mark_buffer_dirty(bh); | ||
1806 | } | ||
1807 | } | ||
1808 | |||
1809 | block_start = block_end; | ||
1810 | bh = bh->b_this_page; | ||
1811 | } while (bh != head); | ||
1812 | } | ||
1813 | EXPORT_SYMBOL(page_zero_new_buffers); | ||
1814 | |||
1773 | static int __block_prepare_write(struct inode *inode, struct page *page, | 1815 | static int __block_prepare_write(struct inode *inode, struct page *page, |
1774 | unsigned from, unsigned to, get_block_t *get_block) | 1816 | unsigned from, unsigned to, get_block_t *get_block) |
1775 | { | 1817 | { |
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page, | |||
1854 | if (!buffer_uptodate(*wait_bh)) | 1896 | if (!buffer_uptodate(*wait_bh)) |
1855 | err = -EIO; | 1897 | err = -EIO; |
1856 | } | 1898 | } |
1857 | if (!err) { | 1899 | if (unlikely(err)) |
1858 | bh = head; | 1900 | page_zero_new_buffers(page, from, to); |
1859 | do { | ||
1860 | if (buffer_new(bh)) | ||
1861 | clear_buffer_new(bh); | ||
1862 | } while ((bh = bh->b_this_page) != head); | ||
1863 | return 0; | ||
1864 | } | ||
1865 | /* Error case: */ | ||
1866 | /* | ||
1867 | * Zero out any newly allocated blocks to avoid exposing stale | ||
1868 | * data. If BH_New is set, we know that the block was newly | ||
1869 | * allocated in the above loop. | ||
1870 | */ | ||
1871 | bh = head; | ||
1872 | block_start = 0; | ||
1873 | do { | ||
1874 | block_end = block_start+blocksize; | ||
1875 | if (block_end <= from) | ||
1876 | goto next_bh; | ||
1877 | if (block_start >= to) | ||
1878 | break; | ||
1879 | if (buffer_new(bh)) { | ||
1880 | clear_buffer_new(bh); | ||
1881 | zero_user_page(page, block_start, bh->b_size, KM_USER0); | ||
1882 | set_buffer_uptodate(bh); | ||
1883 | mark_buffer_dirty(bh); | ||
1884 | } | ||
1885 | next_bh: | ||
1886 | block_start = block_end; | ||
1887 | bh = bh->b_this_page; | ||
1888 | } while (bh != head); | ||
1889 | return err; | 1901 | return err; |
1890 | } | 1902 | } |
1891 | 1903 | ||
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page, | |||
1910 | set_buffer_uptodate(bh); | 1922 | set_buffer_uptodate(bh); |
1911 | mark_buffer_dirty(bh); | 1923 | mark_buffer_dirty(bh); |
1912 | } | 1924 | } |
1925 | clear_buffer_new(bh); | ||
1913 | } | 1926 | } |
1914 | 1927 | ||
1915 | /* | 1928 | /* |
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page, | |||
1924 | } | 1937 | } |
1925 | 1938 | ||
1926 | /* | 1939 | /* |
1940 | * block_write_begin takes care of the basic task of block allocation and | ||
1941 | * bringing partial write blocks uptodate first. | ||
1942 | * | ||
1943 | * If *pagep is not NULL, then block_write_begin uses the locked page | ||
1944 | * at *pagep rather than allocating its own. In this case, the page will | ||
1945 | * not be unlocked or deallocated on failure. | ||
1946 | */ | ||
1947 | int block_write_begin(struct file *file, struct address_space *mapping, | ||
1948 | loff_t pos, unsigned len, unsigned flags, | ||
1949 | struct page **pagep, void **fsdata, | ||
1950 | get_block_t *get_block) | ||
1951 | { | ||
1952 | struct inode *inode = mapping->host; | ||
1953 | int status = 0; | ||
1954 | struct page *page; | ||
1955 | pgoff_t index; | ||
1956 | unsigned start, end; | ||
1957 | int ownpage = 0; | ||
1958 | |||
1959 | index = pos >> PAGE_CACHE_SHIFT; | ||
1960 | start = pos & (PAGE_CACHE_SIZE - 1); | ||
1961 | end = start + len; | ||
1962 | |||
1963 | page = *pagep; | ||
1964 | if (page == NULL) { | ||
1965 | ownpage = 1; | ||
1966 | page = __grab_cache_page(mapping, index); | ||
1967 | if (!page) { | ||
1968 | status = -ENOMEM; | ||
1969 | goto out; | ||
1970 | } | ||
1971 | *pagep = page; | ||
1972 | } else | ||
1973 | BUG_ON(!PageLocked(page)); | ||
1974 | |||
1975 | status = __block_prepare_write(inode, page, start, end, get_block); | ||
1976 | if (unlikely(status)) { | ||
1977 | ClearPageUptodate(page); | ||
1978 | |||
1979 | if (ownpage) { | ||
1980 | unlock_page(page); | ||
1981 | page_cache_release(page); | ||
1982 | *pagep = NULL; | ||
1983 | |||
1984 | /* | ||
1985 | * prepare_write() may have instantiated a few blocks | ||
1986 | * outside i_size. Trim these off again. Don't need | ||
1987 | * i_size_read because we hold i_mutex. | ||
1988 | */ | ||
1989 | if (pos + len > inode->i_size) | ||
1990 | vmtruncate(inode, inode->i_size); | ||
1991 | } | ||
1992 | goto out; | ||
1993 | } | ||
1994 | |||
1995 | out: | ||
1996 | return status; | ||
1997 | } | ||
1998 | EXPORT_SYMBOL(block_write_begin); | ||
1999 | |||
2000 | int block_write_end(struct file *file, struct address_space *mapping, | ||
2001 | loff_t pos, unsigned len, unsigned copied, | ||
2002 | struct page *page, void *fsdata) | ||
2003 | { | ||
2004 | struct inode *inode = mapping->host; | ||
2005 | unsigned start; | ||
2006 | |||
2007 | start = pos & (PAGE_CACHE_SIZE - 1); | ||
2008 | |||
2009 | if (unlikely(copied < len)) { | ||
2010 | /* | ||
2011 | * The buffers that were written will now be uptodate, so we | ||
2012 | * don't have to worry about a readpage reading them and | ||
2013 | * overwriting a partial write. However if we have encountered | ||
2014 | * a short write and only partially written into a buffer, it | ||
2015 | * will not be marked uptodate, so a readpage might come in and | ||
2016 | * destroy our partial write. | ||
2017 | * | ||
2018 | * Do the simplest thing, and just treat any short write to a | ||
2019 | * non uptodate page as a zero-length write, and force the | ||
2020 | * caller to redo the whole thing. | ||
2021 | */ | ||
2022 | if (!PageUptodate(page)) | ||
2023 | copied = 0; | ||
2024 | |||
2025 | page_zero_new_buffers(page, start+copied, start+len); | ||
2026 | } | ||
2027 | flush_dcache_page(page); | ||
2028 | |||
2029 | /* This could be a short (even 0-length) commit */ | ||
2030 | __block_commit_write(inode, page, start, start+copied); | ||
2031 | |||
2032 | return copied; | ||
2033 | } | ||
2034 | EXPORT_SYMBOL(block_write_end); | ||
2035 | |||
2036 | int generic_write_end(struct file *file, struct address_space *mapping, | ||
2037 | loff_t pos, unsigned len, unsigned copied, | ||
2038 | struct page *page, void *fsdata) | ||
2039 | { | ||
2040 | struct inode *inode = mapping->host; | ||
2041 | |||
2042 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | ||
2043 | |||
2044 | /* | ||
2045 | * No need to use i_size_read() here, the i_size | ||
2046 | * cannot change under us because we hold i_mutex. | ||
2047 | * | ||
2048 | * But it's important to update i_size while still holding page lock: | ||
2049 | * page writeout could otherwise come in and zero beyond i_size. | ||
2050 | */ | ||
2051 | if (pos+copied > inode->i_size) { | ||
2052 | i_size_write(inode, pos+copied); | ||
2053 | mark_inode_dirty(inode); | ||
2054 | } | ||
2055 | |||
2056 | unlock_page(page); | ||
2057 | page_cache_release(page); | ||
2058 | |||
2059 | return copied; | ||
2060 | } | ||
2061 | EXPORT_SYMBOL(generic_write_end); | ||
2062 | |||
2063 | /* | ||
1927 | * Generic "read page" function for block devices that have the normal | 2064 | * Generic "read page" function for block devices that have the normal |
1928 | * get_block functionality. This is most of the block device filesystems. | 2065 | * get_block functionality. This is most of the block device filesystems. |
1929 | * Reads the page asynchronously --- the unlock_buffer() and | 2066 | * Reads the page asynchronously --- the unlock_buffer() and |