Diffstat (limited to 'fs/buffer.c')
-rw-r--r--  fs/buffer.c  732
1 file changed, 471 insertions, 261 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 0e5ec371ce72..76403b1764c5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -110,10 +110,14 @@ static void buffer_io_error(struct buffer_head *bh)
110} 110}
111 111
112/* 112/*
113 * Default synchronous end-of-IO handler.. Just mark it up-to-date and 113 * End-of-IO handler helper function which does not touch the bh after
114 * unlock the buffer. This is what ll_rw_block uses too. 114 * unlocking it.
115 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
116 * a race there is benign: unlock_buffer() only use the bh's address for
117 * hashing after unlocking the buffer, so it doesn't actually touch the bh
118 * itself.
115 */ 119 */
116void end_buffer_read_sync(struct buffer_head *bh, int uptodate) 120static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
117{ 121{
118 if (uptodate) { 122 if (uptodate) {
119 set_buffer_uptodate(bh); 123 set_buffer_uptodate(bh);
@@ -122,6 +126,15 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
122 clear_buffer_uptodate(bh); 126 clear_buffer_uptodate(bh);
123 } 127 }
124 unlock_buffer(bh); 128 unlock_buffer(bh);
129}
130
131/*
132 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
133 * unlock the buffer. This is what ll_rw_block uses too.
134 */
135void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
136{
137 __end_buffer_read_notouch(bh, uptodate);
125 put_bh(bh); 138 put_bh(bh);
126} 139}
127 140
@@ -697,6 +710,8 @@ static int __set_page_dirty(struct page *page,
697 710
698 if (mapping_cap_account_dirty(mapping)) { 711 if (mapping_cap_account_dirty(mapping)) {
699 __inc_zone_page_state(page, NR_FILE_DIRTY); 712 __inc_zone_page_state(page, NR_FILE_DIRTY);
713 __inc_bdi_stat(mapping->backing_dev_info,
714 BDI_RECLAIMABLE);
700 task_io_account_write(PAGE_CACHE_SIZE); 715 task_io_account_write(PAGE_CACHE_SIZE);
701 } 716 }
702 radix_tree_tag_set(&mapping->page_tree, 717 radix_tree_tag_set(&mapping->page_tree,
@@ -1715,7 +1730,6 @@ done:
1715 * The page and buffer_heads can be released at any time from 1730 * The page and buffer_heads can be released at any time from
1716 * here on. 1731 * here on.
1717 */ 1732 */
1718 wbc->pages_skipped++; /* We didn't write this page */
1719 } 1733 }
1720 return err; 1734 return err;
1721 1735
@@ -1757,6 +1771,48 @@ recover:
1757 goto done; 1771 goto done;
1758} 1772}
1759 1773
1774/*
1775 * If a page has any new buffers, zero them out here, and mark them uptodate
1776 * and dirty so they'll be written out (in order to prevent uninitialised
1777 * block data from leaking). And clear the new bit.
1778 */
1779void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1780{
1781 unsigned int block_start, block_end;
1782 struct buffer_head *head, *bh;
1783
1784 BUG_ON(!PageLocked(page));
1785 if (!page_has_buffers(page))
1786 return;
1787
1788 bh = head = page_buffers(page);
1789 block_start = 0;
1790 do {
1791 block_end = block_start + bh->b_size;
1792
1793 if (buffer_new(bh)) {
1794 if (block_end > from && block_start < to) {
1795 if (!PageUptodate(page)) {
1796 unsigned start, size;
1797
1798 start = max(from, block_start);
1799 size = min(to, block_end) - start;
1800
1801 zero_user_page(page, start, size, KM_USER0);
1802 set_buffer_uptodate(bh);
1803 }
1804
1805 clear_buffer_new(bh);
1806 mark_buffer_dirty(bh);
1807 }
1808 }
1809
1810 block_start = block_end;
1811 bh = bh->b_this_page;
1812 } while (bh != head);
1813}
1814EXPORT_SYMBOL(page_zero_new_buffers);
1815
1760static int __block_prepare_write(struct inode *inode, struct page *page, 1816static int __block_prepare_write(struct inode *inode, struct page *page,
1761 unsigned from, unsigned to, get_block_t *get_block) 1817 unsigned from, unsigned to, get_block_t *get_block)
1762{ 1818{
@@ -1800,7 +1856,9 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
1800 unmap_underlying_metadata(bh->b_bdev, 1856 unmap_underlying_metadata(bh->b_bdev,
1801 bh->b_blocknr); 1857 bh->b_blocknr);
1802 if (PageUptodate(page)) { 1858 if (PageUptodate(page)) {
1859 clear_buffer_new(bh);
1803 set_buffer_uptodate(bh); 1860 set_buffer_uptodate(bh);
1861 mark_buffer_dirty(bh);
1804 continue; 1862 continue;
1805 } 1863 }
1806 if (block_end > to || block_start < from) { 1864 if (block_end > to || block_start < from) {
@@ -1839,38 +1897,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
1839 if (!buffer_uptodate(*wait_bh)) 1897 if (!buffer_uptodate(*wait_bh))
1840 err = -EIO; 1898 err = -EIO;
1841 } 1899 }
1842 if (!err) { 1900 if (unlikely(err))
1843 bh = head; 1901 page_zero_new_buffers(page, from, to);
1844 do {
1845 if (buffer_new(bh))
1846 clear_buffer_new(bh);
1847 } while ((bh = bh->b_this_page) != head);
1848 return 0;
1849 }
1850 /* Error case: */
1851 /*
1852 * Zero out any newly allocated blocks to avoid exposing stale
1853 * data. If BH_New is set, we know that the block was newly
1854 * allocated in the above loop.
1855 */
1856 bh = head;
1857 block_start = 0;
1858 do {
1859 block_end = block_start+blocksize;
1860 if (block_end <= from)
1861 goto next_bh;
1862 if (block_start >= to)
1863 break;
1864 if (buffer_new(bh)) {
1865 clear_buffer_new(bh);
1866 zero_user_page(page, block_start, bh->b_size, KM_USER0);
1867 set_buffer_uptodate(bh);
1868 mark_buffer_dirty(bh);
1869 }
1870next_bh:
1871 block_start = block_end;
1872 bh = bh->b_this_page;
1873 } while (bh != head);
1874 return err; 1902 return err;
1875} 1903}
1876 1904
@@ -1895,6 +1923,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1895 set_buffer_uptodate(bh); 1923 set_buffer_uptodate(bh);
1896 mark_buffer_dirty(bh); 1924 mark_buffer_dirty(bh);
1897 } 1925 }
1926 clear_buffer_new(bh);
1898 } 1927 }
1899 1928
1900 /* 1929 /*
@@ -1909,6 +1938,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1909} 1938}
1910 1939
1911/* 1940/*
1941 * block_write_begin takes care of the basic task of block allocation and
1942 * bringing partial write blocks uptodate first.
1943 *
1944 * If *pagep is not NULL, then block_write_begin uses the locked page
1945 * at *pagep rather than allocating its own. In this case, the page will
1946 * not be unlocked or deallocated on failure.
1947 */
1948int block_write_begin(struct file *file, struct address_space *mapping,
1949 loff_t pos, unsigned len, unsigned flags,
1950 struct page **pagep, void **fsdata,
1951 get_block_t *get_block)
1952{
1953 struct inode *inode = mapping->host;
1954 int status = 0;
1955 struct page *page;
1956 pgoff_t index;
1957 unsigned start, end;
1958 int ownpage = 0;
1959
1960 index = pos >> PAGE_CACHE_SHIFT;
1961 start = pos & (PAGE_CACHE_SIZE - 1);
1962 end = start + len;
1963
1964 page = *pagep;
1965 if (page == NULL) {
1966 ownpage = 1;
1967 page = __grab_cache_page(mapping, index);
1968 if (!page) {
1969 status = -ENOMEM;
1970 goto out;
1971 }
1972 *pagep = page;
1973 } else
1974 BUG_ON(!PageLocked(page));
1975
1976 status = __block_prepare_write(inode, page, start, end, get_block);
1977 if (unlikely(status)) {
1978 ClearPageUptodate(page);
1979
1980 if (ownpage) {
1981 unlock_page(page);
1982 page_cache_release(page);
1983 *pagep = NULL;
1984
1985 /*
1986 * prepare_write() may have instantiated a few blocks
1987 * outside i_size. Trim these off again. Don't need
1988 * i_size_read because we hold i_mutex.
1989 */
1990 if (pos + len > inode->i_size)
1991 vmtruncate(inode, inode->i_size);
1992 }
1993 goto out;
1994 }
1995
1996out:
1997 return status;
1998}
1999EXPORT_SYMBOL(block_write_begin);
2000
2001int block_write_end(struct file *file, struct address_space *mapping,
2002 loff_t pos, unsigned len, unsigned copied,
2003 struct page *page, void *fsdata)
2004{
2005 struct inode *inode = mapping->host;
2006 unsigned start;
2007
2008 start = pos & (PAGE_CACHE_SIZE - 1);
2009
2010 if (unlikely(copied < len)) {
2011 /*
2012 * The buffers that were written will now be uptodate, so we
2013 * don't have to worry about a readpage reading them and
2014 * overwriting a partial write. However if we have encountered
2015 * a short write and only partially written into a buffer, it
2016 * will not be marked uptodate, so a readpage might come in and
2017 * destroy our partial write.
2018 *
2019 * Do the simplest thing, and just treat any short write to a
2020 * non uptodate page as a zero-length write, and force the
2021 * caller to redo the whole thing.
2022 */
2023 if (!PageUptodate(page))
2024 copied = 0;
2025
2026 page_zero_new_buffers(page, start+copied, start+len);
2027 }
2028 flush_dcache_page(page);
2029
2030 /* This could be a short (even 0-length) commit */
2031 __block_commit_write(inode, page, start, start+copied);
2032
2033 return copied;
2034}
2035EXPORT_SYMBOL(block_write_end);
2036
2037int generic_write_end(struct file *file, struct address_space *mapping,
2038 loff_t pos, unsigned len, unsigned copied,
2039 struct page *page, void *fsdata)
2040{
2041 struct inode *inode = mapping->host;
2042
2043 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2044
2045 /*
2046 * No need to use i_size_read() here, the i_size
2047 * cannot change under us because we hold i_mutex.
2048 *
2049 * But it's important to update i_size while still holding page lock:
2050 * page writeout could otherwise come in and zero beyond i_size.
2051 */
2052 if (pos+copied > inode->i_size) {
2053 i_size_write(inode, pos+copied);
2054 mark_inode_dirty(inode);
2055 }
2056
2057 unlock_page(page);
2058 page_cache_release(page);
2059
2060 return copied;
2061}
2062EXPORT_SYMBOL(generic_write_end);
2063
2064/*
1912 * Generic "read page" function for block devices that have the normal 2065 * Generic "read page" function for block devices that have the normal
1913 * get_block functionality. This is most of the block device filesystems. 2066 * get_block functionality. This is most of the block device filesystems.
1914 * Reads the page asynchronously --- the unlock_buffer() and 2067 * Reads the page asynchronously --- the unlock_buffer() and
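Taken together, the hunk above gives block-based filesystems a drop-in ->write_begin/->write_end implementation. A minimal sketch of how such a filesystem might wire these helpers into its address_space_operations, assuming the new ->write_begin/->write_end methods introduced alongside this patch and an existing per-filesystem get_block_t; the myfs_* names are illustrative, not part of this patch:

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	*pagep = NULL;	/* let block_write_begin() grab and lock the page */
	return block_write_begin(file, mapping, pos, len, flags,
				pagep, fsdata, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,	/* assumed wrapper around block_read_full_page() */
	.writepage	= myfs_writepage,	/* assumed wrapper around block_write_full_page() */
	.sync_page	= block_sync_page,
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,	/* updates i_size, unlocks and releases the page */
};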
@@ -2004,14 +2157,14 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
2004} 2157}
2005 2158
2006/* utility function for filesystems that need to do work on expanding 2159/* utility function for filesystems that need to do work on expanding
2007 * truncates. Uses prepare/commit_write to allow the filesystem to 2160 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2008 * deal with the hole. 2161 * deal with the hole.
2009 */ 2162 */
2010static int __generic_cont_expand(struct inode *inode, loff_t size, 2163int generic_cont_expand_simple(struct inode *inode, loff_t size)
2011 pgoff_t index, unsigned int offset)
2012{ 2164{
2013 struct address_space *mapping = inode->i_mapping; 2165 struct address_space *mapping = inode->i_mapping;
2014 struct page *page; 2166 struct page *page;
2167 void *fsdata;
2015 unsigned long limit; 2168 unsigned long limit;
2016 int err; 2169 int err;
2017 2170
@@ -2024,140 +2177,115 @@ static int __generic_cont_expand(struct inode *inode, loff_t size,
2024 if (size > inode->i_sb->s_maxbytes) 2177 if (size > inode->i_sb->s_maxbytes)
2025 goto out; 2178 goto out;
2026 2179
2027 err = -ENOMEM; 2180 err = pagecache_write_begin(NULL, mapping, size, 0,
2028 page = grab_cache_page(mapping, index); 2181 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2029 if (!page) 2182 &page, &fsdata);
2030 goto out; 2183 if (err)
2031 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2032 if (err) {
2033 /*
2034 * ->prepare_write() may have instantiated a few blocks
2035 * outside i_size. Trim these off again.
2036 */
2037 unlock_page(page);
2038 page_cache_release(page);
2039 vmtruncate(inode, inode->i_size);
2040 goto out; 2184 goto out;
2041 }
2042 2185
2043 err = mapping->a_ops->commit_write(NULL, page, offset, offset); 2186 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2187 BUG_ON(err > 0);
2044 2188
2045 unlock_page(page);
2046 page_cache_release(page);
2047 if (err > 0)
2048 err = 0;
2049out: 2189out:
2050 return err; 2190 return err;
2051} 2191}
2052 2192
2053int generic_cont_expand(struct inode *inode, loff_t size) 2193int cont_expand_zero(struct file *file, struct address_space *mapping,
2194 loff_t pos, loff_t *bytes)
2054{ 2195{
2055 pgoff_t index; 2196 struct inode *inode = mapping->host;
2056 unsigned int offset; 2197 unsigned blocksize = 1 << inode->i_blkbits;
2198 struct page *page;
2199 void *fsdata;
2200 pgoff_t index, curidx;
2201 loff_t curpos;
2202 unsigned zerofrom, offset, len;
2203 int err = 0;
2057 2204
2058 offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ 2205 index = pos >> PAGE_CACHE_SHIFT;
2206 offset = pos & ~PAGE_CACHE_MASK;
2059 2207
2060 /* ugh. in prepare/commit_write, if from==to==start of block, we 2208 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2061 ** skip the prepare. make sure we never send an offset for the start 2209 zerofrom = curpos & ~PAGE_CACHE_MASK;
2062 ** of a block 2210 if (zerofrom & (blocksize-1)) {
2063 */ 2211 *bytes |= (blocksize-1);
2064 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 2212 (*bytes)++;
2065 /* caller must handle this extra byte. */ 2213 }
2066 offset++; 2214 len = PAGE_CACHE_SIZE - zerofrom;
2067 }
2068 index = size >> PAGE_CACHE_SHIFT;
2069 2215
2070 return __generic_cont_expand(inode, size, index, offset); 2216 err = pagecache_write_begin(file, mapping, curpos, len,
2071} 2217 AOP_FLAG_UNINTERRUPTIBLE,
2218 &page, &fsdata);
2219 if (err)
2220 goto out;
2221 zero_user_page(page, zerofrom, len, KM_USER0);
2222 err = pagecache_write_end(file, mapping, curpos, len, len,
2223 page, fsdata);
2224 if (err < 0)
2225 goto out;
2226 BUG_ON(err != len);
2227 err = 0;
2228 }
2072 2229
2073int generic_cont_expand_simple(struct inode *inode, loff_t size) 2230 /* page covers the boundary, find the boundary offset */
2074{ 2231 if (index == curidx) {
2075 loff_t pos = size - 1; 2232 zerofrom = curpos & ~PAGE_CACHE_MASK;
2076 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 2233 /* if we will expand the thing last block will be filled */
2077 unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; 2234 if (offset <= zerofrom) {
2235 goto out;
2236 }
2237 if (zerofrom & (blocksize-1)) {
2238 *bytes |= (blocksize-1);
2239 (*bytes)++;
2240 }
2241 len = offset - zerofrom;
2078 2242
2079 /* prepare/commit_write can handle even if from==to==start of block. */ 2243 err = pagecache_write_begin(file, mapping, curpos, len,
2080 return __generic_cont_expand(inode, size, index, offset); 2244 AOP_FLAG_UNINTERRUPTIBLE,
2245 &page, &fsdata);
2246 if (err)
2247 goto out;
2248 zero_user_page(page, zerofrom, len, KM_USER0);
2249 err = pagecache_write_end(file, mapping, curpos, len, len,
2250 page, fsdata);
2251 if (err < 0)
2252 goto out;
2253 BUG_ON(err != len);
2254 err = 0;
2255 }
2256out:
2257 return err;
2081} 2258}
2082 2259
2083/* 2260/*
2084 * For moronic filesystems that do not allow holes in file. 2261 * For moronic filesystems that do not allow holes in file.
2085 * We may have to extend the file. 2262 * We may have to extend the file.
2086 */ 2263 */
2087 2264int cont_write_begin(struct file *file, struct address_space *mapping,
2088int cont_prepare_write(struct page *page, unsigned offset, 2265 loff_t pos, unsigned len, unsigned flags,
2089 unsigned to, get_block_t *get_block, loff_t *bytes) 2266 struct page **pagep, void **fsdata,
2267 get_block_t *get_block, loff_t *bytes)
2090{ 2268{
2091 struct address_space *mapping = page->mapping;
2092 struct inode *inode = mapping->host; 2269 struct inode *inode = mapping->host;
2093 struct page *new_page;
2094 pgoff_t pgpos;
2095 long status;
2096 unsigned zerofrom;
2097 unsigned blocksize = 1 << inode->i_blkbits; 2270 unsigned blocksize = 1 << inode->i_blkbits;
2271 unsigned zerofrom;
2272 int err;
2098 2273
2099 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { 2274 err = cont_expand_zero(file, mapping, pos, bytes);
2100 status = -ENOMEM; 2275 if (err)
2101 new_page = grab_cache_page(mapping, pgpos); 2276 goto out;
2102 if (!new_page)
2103 goto out;
2104 /* we might sleep */
2105 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2106 unlock_page(new_page);
2107 page_cache_release(new_page);
2108 continue;
2109 }
2110 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2111 if (zerofrom & (blocksize-1)) {
2112 *bytes |= (blocksize-1);
2113 (*bytes)++;
2114 }
2115 status = __block_prepare_write(inode, new_page, zerofrom,
2116 PAGE_CACHE_SIZE, get_block);
2117 if (status)
2118 goto out_unmap;
2119 zero_user_page(new_page, zerofrom, PAGE_CACHE_SIZE - zerofrom,
2120 KM_USER0);
2121 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2122 unlock_page(new_page);
2123 page_cache_release(new_page);
2124 }
2125
2126 if (page->index < pgpos) {
2127 /* completely inside the area */
2128 zerofrom = offset;
2129 } else {
2130 /* page covers the boundary, find the boundary offset */
2131 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2132
2133 /* if we will expand the thing last block will be filled */
2134 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2135 *bytes |= (blocksize-1);
2136 (*bytes)++;
2137 }
2138 2277
2139 /* starting below the boundary? Nothing to zero out */ 2278 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2140 if (offset <= zerofrom) 2279 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2141 zerofrom = offset; 2280 *bytes |= (blocksize-1);
2142 } 2281 (*bytes)++;
2143 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2144 if (status)
2145 goto out1;
2146 if (zerofrom < offset) {
2147 zero_user_page(page, zerofrom, offset - zerofrom, KM_USER0);
2148 __block_commit_write(inode, page, zerofrom, offset);
2149 } 2282 }
2150 return 0;
2151out1:
2152 ClearPageUptodate(page);
2153 return status;
2154 2283
2155out_unmap: 2284 *pagep = NULL;
2156 ClearPageUptodate(new_page); 2285 err = block_write_begin(file, mapping, pos, len,
2157 unlock_page(new_page); 2286 flags, pagep, fsdata, get_block);
2158 page_cache_release(new_page);
2159out: 2287out:
2160 return status; 2288 return err;
2161} 2289}
2162 2290
2163int block_prepare_write(struct page *page, unsigned from, unsigned to, 2291int block_prepare_write(struct page *page, unsigned from, unsigned to,
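For a filesystem that cannot represent holes, the same ->write_begin hook would instead delegate to cont_write_begin(). A sketch under the assumption that the filesystem tracks the end of its zeroed-out region in a per-inode field (mmu_private below, reached through a hypothetical MYFS_I() helper; none of these names come from this patch):

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block,
				&MYFS_I(mapping->host)->mmu_private);
}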
@@ -2242,81 +2370,129 @@ out_unlock:
2242} 2370}
2243 2371
2244/* 2372/*
2245 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed 2373 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2246 * immediately, while under the page lock. So it needs a special end_io 2374 * immediately, while under the page lock. So it needs a special end_io
2247 * handler which does not touch the bh after unlocking it. 2375 * handler which does not touch the bh after unlocking it.
2248 *
2249 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2250 * a race there is benign: unlock_buffer() only use the bh's address for
2251 * hashing after unlocking the buffer, so it doesn't actually touch the bh
2252 * itself.
2253 */ 2376 */
2254static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) 2377static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2255{ 2378{
2256 if (uptodate) { 2379 __end_buffer_read_notouch(bh, uptodate);
2257 set_buffer_uptodate(bh); 2380}
2258 } else { 2381
2259 /* This happens, due to failed READA attempts. */ 2382/*
2260 clear_buffer_uptodate(bh); 2383 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2261 } 2384 * the page (converting it to circular linked list and taking care of page
2262 unlock_buffer(bh); 2385 * dirty races).
2386 */
2387static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2388{
2389 struct buffer_head *bh;
2390
2391 BUG_ON(!PageLocked(page));
2392
2393 spin_lock(&page->mapping->private_lock);
2394 bh = head;
2395 do {
2396 if (PageDirty(page))
2397 set_buffer_dirty(bh);
2398 if (!bh->b_this_page)
2399 bh->b_this_page = head;
2400 bh = bh->b_this_page;
2401 } while (bh != head);
2402 attach_page_buffers(page, head);
2403 spin_unlock(&page->mapping->private_lock);
2263} 2404}
2264 2405
2265/* 2406/*
2266 * On entry, the page is fully not uptodate. 2407 * On entry, the page is fully not uptodate.
2267 * On exit the page is fully uptodate in the areas outside (from,to) 2408 * On exit the page is fully uptodate in the areas outside (from,to)
2268 */ 2409 */
2269int nobh_prepare_write(struct page *page, unsigned from, unsigned to, 2410int nobh_write_begin(struct file *file, struct address_space *mapping,
2411 loff_t pos, unsigned len, unsigned flags,
2412 struct page **pagep, void **fsdata,
2270 get_block_t *get_block) 2413 get_block_t *get_block)
2271{ 2414{
2272 struct inode *inode = page->mapping->host; 2415 struct inode *inode = mapping->host;
2273 const unsigned blkbits = inode->i_blkbits; 2416 const unsigned blkbits = inode->i_blkbits;
2274 const unsigned blocksize = 1 << blkbits; 2417 const unsigned blocksize = 1 << blkbits;
2275 struct buffer_head map_bh; 2418 struct buffer_head *head, *bh;
2276 struct buffer_head *read_bh[MAX_BUF_PER_PAGE]; 2419 struct page *page;
2420 pgoff_t index;
2421 unsigned from, to;
2277 unsigned block_in_page; 2422 unsigned block_in_page;
2278 unsigned block_start; 2423 unsigned block_start, block_end;
2279 sector_t block_in_file; 2424 sector_t block_in_file;
2280 char *kaddr; 2425 char *kaddr;
2281 int nr_reads = 0; 2426 int nr_reads = 0;
2282 int i;
2283 int ret = 0; 2427 int ret = 0;
2284 int is_mapped_to_disk = 1; 2428 int is_mapped_to_disk = 1;
2285 2429
2430 index = pos >> PAGE_CACHE_SHIFT;
2431 from = pos & (PAGE_CACHE_SIZE - 1);
2432 to = from + len;
2433
2434 page = __grab_cache_page(mapping, index);
2435 if (!page)
2436 return -ENOMEM;
2437 *pagep = page;
2438 *fsdata = NULL;
2439
2440 if (page_has_buffers(page)) {
2441 unlock_page(page);
2442 page_cache_release(page);
2443 *pagep = NULL;
2444 return block_write_begin(file, mapping, pos, len, flags, pagep,
2445 fsdata, get_block);
2446 }
2447
2286 if (PageMappedToDisk(page)) 2448 if (PageMappedToDisk(page))
2287 return 0; 2449 return 0;
2288 2450
2451 /*
2452 * Allocate buffers so that we can keep track of state, and potentially
2453 * attach them to the page if an error occurs. In the common case of
2454 * no error, they will just be freed again without ever being attached
2455 * to the page (which is all OK, because we're under the page lock).
2456 *
2457 * Be careful: the buffer linked list is a NULL terminated one, rather
2458 * than the circular one we're used to.
2459 */
2460 head = alloc_page_buffers(page, blocksize, 0);
2461 if (!head) {
2462 ret = -ENOMEM;
2463 goto out_release;
2464 }
2465
2289 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 2466 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2290 map_bh.b_page = page;
2291 2467
2292 /* 2468 /*
2293 * We loop across all blocks in the page, whether or not they are 2469 * We loop across all blocks in the page, whether or not they are
2294 * part of the affected region. This is so we can discover if the 2470 * part of the affected region. This is so we can discover if the
2295 * page is fully mapped-to-disk. 2471 * page is fully mapped-to-disk.
2296 */ 2472 */
2297 for (block_start = 0, block_in_page = 0; 2473 for (block_start = 0, block_in_page = 0, bh = head;
2298 block_start < PAGE_CACHE_SIZE; 2474 block_start < PAGE_CACHE_SIZE;
2299 block_in_page++, block_start += blocksize) { 2475 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2300 unsigned block_end = block_start + blocksize;
2301 int create; 2476 int create;
2302 2477
2303 map_bh.b_state = 0; 2478 block_end = block_start + blocksize;
2479 bh->b_state = 0;
2304 create = 1; 2480 create = 1;
2305 if (block_start >= to) 2481 if (block_start >= to)
2306 create = 0; 2482 create = 0;
2307 map_bh.b_size = blocksize;
2308 ret = get_block(inode, block_in_file + block_in_page, 2483 ret = get_block(inode, block_in_file + block_in_page,
2309 &map_bh, create); 2484 bh, create);
2310 if (ret) 2485 if (ret)
2311 goto failed; 2486 goto failed;
2312 if (!buffer_mapped(&map_bh)) 2487 if (!buffer_mapped(bh))
2313 is_mapped_to_disk = 0; 2488 is_mapped_to_disk = 0;
2314 if (buffer_new(&map_bh)) 2489 if (buffer_new(bh))
2315 unmap_underlying_metadata(map_bh.b_bdev, 2490 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2316 map_bh.b_blocknr); 2491 if (PageUptodate(page)) {
2317 if (PageUptodate(page)) 2492 set_buffer_uptodate(bh);
2318 continue; 2493 continue;
2319 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) { 2494 }
2495 if (buffer_new(bh) || !buffer_mapped(bh)) {
2320 kaddr = kmap_atomic(page, KM_USER0); 2496 kaddr = kmap_atomic(page, KM_USER0);
2321 if (block_start < from) 2497 if (block_start < from)
2322 memset(kaddr+block_start, 0, from-block_start); 2498 memset(kaddr+block_start, 0, from-block_start);
@@ -2326,49 +2502,26 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2326 kunmap_atomic(kaddr, KM_USER0); 2502 kunmap_atomic(kaddr, KM_USER0);
2327 continue; 2503 continue;
2328 } 2504 }
2329 if (buffer_uptodate(&map_bh)) 2505 if (buffer_uptodate(bh))
2330 continue; /* reiserfs does this */ 2506 continue; /* reiserfs does this */
2331 if (block_start < from || block_end > to) { 2507 if (block_start < from || block_end > to) {
2332 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS); 2508 lock_buffer(bh);
2333 2509 bh->b_end_io = end_buffer_read_nobh;
2334 if (!bh) { 2510 submit_bh(READ, bh);
2335 ret = -ENOMEM; 2511 nr_reads++;
2336 goto failed;
2337 }
2338 bh->b_state = map_bh.b_state;
2339 atomic_set(&bh->b_count, 0);
2340 bh->b_this_page = NULL;
2341 bh->b_page = page;
2342 bh->b_blocknr = map_bh.b_blocknr;
2343 bh->b_size = blocksize;
2344 bh->b_data = (char *)(long)block_start;
2345 bh->b_bdev = map_bh.b_bdev;
2346 bh->b_private = NULL;
2347 read_bh[nr_reads++] = bh;
2348 } 2512 }
2349 } 2513 }
2350 2514
2351 if (nr_reads) { 2515 if (nr_reads) {
2352 struct buffer_head *bh;
2353
2354 /* 2516 /*
2355 * The page is locked, so these buffers are protected from 2517 * The page is locked, so these buffers are protected from
2356 * any VM or truncate activity. Hence we don't need to care 2518 * any VM or truncate activity. Hence we don't need to care
2357 * for the buffer_head refcounts. 2519 * for the buffer_head refcounts.
2358 */ 2520 */
2359 for (i = 0; i < nr_reads; i++) { 2521 for (bh = head; bh; bh = bh->b_this_page) {
2360 bh = read_bh[i];
2361 lock_buffer(bh);
2362 bh->b_end_io = end_buffer_read_nobh;
2363 submit_bh(READ, bh);
2364 }
2365 for (i = 0; i < nr_reads; i++) {
2366 bh = read_bh[i];
2367 wait_on_buffer(bh); 2522 wait_on_buffer(bh);
2368 if (!buffer_uptodate(bh)) 2523 if (!buffer_uptodate(bh))
2369 ret = -EIO; 2524 ret = -EIO;
2370 free_buffer_head(bh);
2371 read_bh[i] = NULL;
2372 } 2525 }
2373 if (ret) 2526 if (ret)
2374 goto failed; 2527 goto failed;
@@ -2377,44 +2530,70 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2377 if (is_mapped_to_disk) 2530 if (is_mapped_to_disk)
2378 SetPageMappedToDisk(page); 2531 SetPageMappedToDisk(page);
2379 2532
2533 *fsdata = head; /* to be released by nobh_write_end */
2534
2380 return 0; 2535 return 0;
2381 2536
2382failed: 2537failed:
2383 for (i = 0; i < nr_reads; i++) { 2538 BUG_ON(!ret);
2384 if (read_bh[i])
2385 free_buffer_head(read_bh[i]);
2386 }
2387
2388 /* 2539 /*
2389 * Error recovery is pretty slack. Clear the page and mark it dirty 2540 * Error recovery is a bit difficult. We need to zero out blocks that
2390 * so we'll later zero out any blocks which _were_ allocated. 2541 * were newly allocated, and dirty them to ensure they get written out.
2542 * Buffers need to be attached to the page at this point, otherwise
2543 * the handling of potential IO errors during writeout would be hard
2544 * (could try doing synchronous writeout, but what if that fails too?)
2391 */ 2545 */
2392 zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); 2546 attach_nobh_buffers(page, head);
2393 SetPageUptodate(page); 2547 page_zero_new_buffers(page, from, to);
2394 set_page_dirty(page); 2548
2549out_release:
2550 unlock_page(page);
2551 page_cache_release(page);
2552 *pagep = NULL;
2553
2554 if (pos + len > inode->i_size)
2555 vmtruncate(inode, inode->i_size);
2556
2395 return ret; 2557 return ret;
2396} 2558}
2397EXPORT_SYMBOL(nobh_prepare_write); 2559EXPORT_SYMBOL(nobh_write_begin);
2398 2560
2399/* 2561int nobh_write_end(struct file *file, struct address_space *mapping,
2400 * Make sure any changes to nobh_commit_write() are reflected in 2562 loff_t pos, unsigned len, unsigned copied,
2401 * nobh_truncate_page(), since it doesn't call commit_write(). 2563 struct page *page, void *fsdata)
2402 */
2403int nobh_commit_write(struct file *file, struct page *page,
2404 unsigned from, unsigned to)
2405{ 2564{
2406 struct inode *inode = page->mapping->host; 2565 struct inode *inode = page->mapping->host;
2407 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 2566 struct buffer_head *head = NULL;
2567 struct buffer_head *bh;
2568
2569 if (!PageMappedToDisk(page)) {
2570 if (unlikely(copied < len) && !page_has_buffers(page))
2571 attach_nobh_buffers(page, head);
2572 if (page_has_buffers(page))
2573 return generic_write_end(file, mapping, pos, len,
2574 copied, page, fsdata);
2575 }
2408 2576
2409 SetPageUptodate(page); 2577 SetPageUptodate(page);
2410 set_page_dirty(page); 2578 set_page_dirty(page);
2411 if (pos > inode->i_size) { 2579 if (pos+copied > inode->i_size) {
2412 i_size_write(inode, pos); 2580 i_size_write(inode, pos+copied);
2413 mark_inode_dirty(inode); 2581 mark_inode_dirty(inode);
2414 } 2582 }
2415 return 0; 2583
2584 unlock_page(page);
2585 page_cache_release(page);
2586
2587 head = fsdata;
2588 while (head) {
2589 bh = head;
2590 head = head->b_this_page;
2591 free_buffer_head(bh);
2592 }
2593
2594 return copied;
2416} 2595}
2417EXPORT_SYMBOL(nobh_commit_write); 2596EXPORT_SYMBOL(nobh_write_end);
2418 2597
2419/* 2598/*
2420 * nobh_writepage() - based on block_full_write_page() except 2599 * nobh_writepage() - based on block_full_write_page() except
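The converted nobh pair plugs into address_space_operations in the same way; a sketch, again with illustrative myfs_* names:

static int myfs_nobh_write_begin(struct file *file,
			struct address_space *mapping, loff_t pos,
			unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block);
}

static const struct address_space_operations myfs_nobh_aops = {
	.readpage	= myfs_readpage,	/* assumed wrapper, as above */
	.writepage	= myfs_nobh_writepage,	/* assumed wrapper around nobh_writepage() */
	.sync_page	= block_sync_page,
	.write_begin	= myfs_nobh_write_begin,
	.write_end	= nobh_write_end,
};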
@@ -2467,44 +2646,79 @@ out:
2467} 2646}
2468EXPORT_SYMBOL(nobh_writepage); 2647EXPORT_SYMBOL(nobh_writepage);
2469 2648
2470/* 2649int nobh_truncate_page(struct address_space *mapping,
2471 * This function assumes that ->prepare_write() uses nobh_prepare_write(). 2650 loff_t from, get_block_t *get_block)
2472 */
2473int nobh_truncate_page(struct address_space *mapping, loff_t from)
2474{ 2651{
2475 struct inode *inode = mapping->host;
2476 unsigned blocksize = 1 << inode->i_blkbits;
2477 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2652 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2478 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2653 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2479 unsigned to; 2654 unsigned blocksize;
2655 sector_t iblock;
2656 unsigned length, pos;
2657 struct inode *inode = mapping->host;
2480 struct page *page; 2658 struct page *page;
2481 const struct address_space_operations *a_ops = mapping->a_ops; 2659 struct buffer_head map_bh;
2482 int ret = 0; 2660 int err;
2483 2661
2484 if ((offset & (blocksize - 1)) == 0) 2662 blocksize = 1 << inode->i_blkbits;
2485 goto out; 2663 length = offset & (blocksize - 1);
2664
2665 /* Block boundary? Nothing to do */
2666 if (!length)
2667 return 0;
2668
2669 length = blocksize - length;
2670 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2486 2671
2487 ret = -ENOMEM;
2488 page = grab_cache_page(mapping, index); 2672 page = grab_cache_page(mapping, index);
2673 err = -ENOMEM;
2489 if (!page) 2674 if (!page)
2490 goto out; 2675 goto out;
2491 2676
2492 to = (offset + blocksize) & ~(blocksize - 1); 2677 if (page_has_buffers(page)) {
2493 ret = a_ops->prepare_write(NULL, page, offset, to); 2678has_buffers:
2494 if (ret == 0) { 2679 unlock_page(page);
2495 zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, 2680 page_cache_release(page);
2496 KM_USER0); 2681 return block_truncate_page(mapping, from, get_block);
2497 /* 2682 }
2498 * It would be more correct to call aops->commit_write() 2683
2499 * here, but this is more efficient. 2684 /* Find the buffer that contains "offset" */
2500 */ 2685 pos = blocksize;
2501 SetPageUptodate(page); 2686 while (offset >= pos) {
2502 set_page_dirty(page); 2687 iblock++;
2688 pos += blocksize;
2503 } 2689 }
2690
2691 err = get_block(inode, iblock, &map_bh, 0);
2692 if (err)
2693 goto unlock;
2694 /* unmapped? It's a hole - nothing to do */
2695 if (!buffer_mapped(&map_bh))
2696 goto unlock;
2697
2698 /* Ok, it's mapped. Make sure it's up-to-date */
2699 if (!PageUptodate(page)) {
2700 err = mapping->a_ops->readpage(NULL, page);
2701 if (err) {
2702 page_cache_release(page);
2703 goto out;
2704 }
2705 lock_page(page);
2706 if (!PageUptodate(page)) {
2707 err = -EIO;
2708 goto unlock;
2709 }
2710 if (page_has_buffers(page))
2711 goto has_buffers;
2712 }
2713 zero_user_page(page, offset, length, KM_USER0);
2714 set_page_dirty(page);
2715 err = 0;
2716
2717unlock:
2504 unlock_page(page); 2718 unlock_page(page);
2505 page_cache_release(page); 2719 page_cache_release(page);
2506out: 2720out:
2507 return ret; 2721 return err;
2508} 2722}
2509EXPORT_SYMBOL(nobh_truncate_page); 2723EXPORT_SYMBOL(nobh_truncate_page);
2510 2724
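nobh_truncate_page() now takes the filesystem's get_block directly instead of relying on a nobh-based ->prepare_write(). A sketch of a truncate path calling it, with assumed myfs_* names:

static void myfs_truncate(struct inode *inode)
{
	/* zero the partial block at the new EOF before freeing blocks */
	nobh_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);

	/* filesystem-specific block freeing would follow here */
}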
@@ -2634,13 +2848,10 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2634 return tmp.b_blocknr; 2848 return tmp.b_blocknr;
2635} 2849}
2636 2850
2637static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err) 2851static void end_bio_bh_io_sync(struct bio *bio, int err)
2638{ 2852{
2639 struct buffer_head *bh = bio->bi_private; 2853 struct buffer_head *bh = bio->bi_private;
2640 2854
2641 if (bio->bi_size)
2642 return 1;
2643
2644 if (err == -EOPNOTSUPP) { 2855 if (err == -EOPNOTSUPP) {
2645 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2856 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2646 set_bit(BH_Eopnotsupp, &bh->b_state); 2857 set_bit(BH_Eopnotsupp, &bh->b_state);
@@ -2648,7 +2859,6 @@ static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2648 2859
2649 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); 2860 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2650 bio_put(bio); 2861 bio_put(bio);
2651 return 0;
2652} 2862}
2653 2863
2654int submit_bh(int rw, struct buffer_head * bh) 2864int submit_bh(int rw, struct buffer_head * bh)
@@ -2960,7 +3170,8 @@ static void recalc_bh_state(void)
2960 3170
2961struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3171struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2962{ 3172{
2963 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); 3173 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep,
3174 set_migrateflags(gfp_flags, __GFP_RECLAIMABLE));
2964 if (ret) { 3175 if (ret) {
2965 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3176 INIT_LIST_HEAD(&ret->b_assoc_buffers);
2966 get_cpu_var(bh_accounting).nr++; 3177 get_cpu_var(bh_accounting).nr++;
@@ -3028,14 +3239,13 @@ EXPORT_SYMBOL(block_read_full_page);
3028EXPORT_SYMBOL(block_sync_page); 3239EXPORT_SYMBOL(block_sync_page);
3029EXPORT_SYMBOL(block_truncate_page); 3240EXPORT_SYMBOL(block_truncate_page);
3030EXPORT_SYMBOL(block_write_full_page); 3241EXPORT_SYMBOL(block_write_full_page);
3031EXPORT_SYMBOL(cont_prepare_write); 3242EXPORT_SYMBOL(cont_write_begin);
3032EXPORT_SYMBOL(end_buffer_read_sync); 3243EXPORT_SYMBOL(end_buffer_read_sync);
3033EXPORT_SYMBOL(end_buffer_write_sync); 3244EXPORT_SYMBOL(end_buffer_write_sync);
3034EXPORT_SYMBOL(file_fsync); 3245EXPORT_SYMBOL(file_fsync);
3035EXPORT_SYMBOL(fsync_bdev); 3246EXPORT_SYMBOL(fsync_bdev);
3036EXPORT_SYMBOL(generic_block_bmap); 3247EXPORT_SYMBOL(generic_block_bmap);
3037EXPORT_SYMBOL(generic_commit_write); 3248EXPORT_SYMBOL(generic_commit_write);
3038EXPORT_SYMBOL(generic_cont_expand);
3039EXPORT_SYMBOL(generic_cont_expand_simple); 3249EXPORT_SYMBOL(generic_cont_expand_simple);
3040EXPORT_SYMBOL(init_buffer); 3250EXPORT_SYMBOL(init_buffer);
3041EXPORT_SYMBOL(invalidate_bdev); 3251EXPORT_SYMBOL(invalidate_bdev);
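With generic_cont_expand() removed above, expanding truncates are expected to go through generic_cont_expand_simple(). A sketch of that path, again with assumed names:

static int myfs_setsize(struct inode *inode, loff_t newsize)
{
	int err = 0;

	if (newsize > inode->i_size)
		err = generic_cont_expand_simple(inode, newsize);
	return err;
}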