author		Nick Piggin <npiggin@suse.de>	2007-10-16 04:24:48 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:42:54 -0400
commit		a4b0672db3a698d0684ee6e54f44e2e162a3da1b (patch)
tree		d53c52b251856520b0fd02969bfcdd4b5d332da6
parent		68671f35fe8d785277118a333c88768a4f894917 (diff)
fs: fix nobh error handling
nobh mode error handling is not just pretty slack, it's wrong. One cannot zero out the whole page to ensure new blocks are zeroed, because it just brings the whole page "uptodate" with zeroes even if that may not be the correct uptodate data. Also, other parts of the page may already contain dirty data which would get lost by zeroing it out. Thirdly, the writeback of zeroes to the new blocks will also erase existing blocks. All these conditions are pagecache and/or filesystem corruption.

The problem comes about because we didn't keep track of which buffers actually are new or old. However it is not enough just to keep only this state, because at the point we start dirtying parts of the page (new blocks, with zeroes), the handling of IO errors becomes impossible without buffers, because the page may only be partially uptodate, in which case the page flags alone cannot capture the state of the parts of the page.

So allocate all buffers for the page upfront, but leave them unattached so that they don't pick up any other references and can be freed when we're done. If the error path is hit, then zero the new buffers as the regular buffer path does, then attach the buffers to the page so that it can actually be written out correctly and be subject to the normal IO error handling paths.

As an upshot, we save 1K of kernel stack on ia64 or powerpc 64K page systems.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
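The central trick here - allocate the buffer_heads up front as a NULL-terminated list, free them on the common success path, and only attach them to the page (closing the list into the usual circular form) if the error path is hit - can be sketched in plain C. The following is a hypothetical user-space analogue for illustration only; names such as alloc_bufs, attach_bufs and page_state are invented and are not the kernel API.

/*
 * Hypothetical user-space analogue of the patch's buffer handling:
 * a NULL-terminated list of per-block state, allocated up front,
 * freed on success, attached as a ring only on error.
 */
#include <stdio.h>
#include <stdlib.h>

struct buf {
	struct buf *next;	/* NULL-terminated until attached */
	int is_new;		/* block was freshly allocated */
};

struct page_state {
	struct buf *buffers;	/* circular list once attached, else NULL */
};

/* Allocate one tracking node per block; the list stays NULL-terminated. */
static struct buf *alloc_bufs(int nr_blocks)
{
	struct buf *head = NULL;

	for (int i = 0; i < nr_blocks; i++) {
		struct buf *b = calloc(1, sizeof(*b));
		if (!b) {
			while (head) {	/* unwind on allocation failure */
				struct buf *t = head;
				head = head->next;
				free(t);
			}
			return NULL;
		}
		b->next = head;
		head = b;
	}
	return head;
}

/* Success path: nothing was attached, so simply free the list. */
static void free_bufs(struct buf *head)
{
	while (head) {
		struct buf *b = head;
		head = head->next;
		free(b);
	}
}

/* Error path: close the list into a ring and hand it to the page. */
static void attach_bufs(struct page_state *page, struct buf *head)
{
	struct buf *b = head;

	while (b->next)
		b = b->next;
	b->next = head;		/* NULL-terminated -> circular */
	page->buffers = head;
}

int main(void)
{
	struct page_state page = { .buffers = NULL };
	struct buf *head = alloc_bufs(4);

	if (!head)
		return 1;
	/* pretend block mapping failed: attach so writeout can recover */
	attach_bufs(&page, head);
	printf("buffers attached: %s\n", page.buffers ? "yes" : "no");
	/* had the "mapping" succeeded, we would call free_bufs(head) instead */
	return 0;
}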
-rw-r--r--	fs/buffer.c	| 138
1 file changed, 82 insertions(+), 56 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index b144fc367b8b..09bb80c479d8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2274,51 +2274,64 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
 	const unsigned blocksize = 1 << blkbits;
-	struct buffer_head map_bh;
-	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
+	struct buffer_head *head, *bh;
 	unsigned block_in_page;
-	unsigned block_start;
+	unsigned block_start, block_end;
 	sector_t block_in_file;
 	char *kaddr;
 	int nr_reads = 0;
-	int i;
 	int ret = 0;
 	int is_mapped_to_disk = 1;
 
+	if (page_has_buffers(page))
+		return block_prepare_write(page, from, to, get_block);
+
 	if (PageMappedToDisk(page))
 		return 0;
 
+	/*
+	 * Allocate buffers so that we can keep track of state, and potentially
+	 * attach them to the page if an error occurs. In the common case of
+	 * no error, they will just be freed again without ever being attached
+	 * to the page (which is all OK, because we're under the page lock).
+	 *
+	 * Be careful: the buffer linked list is a NULL terminated one, rather
+	 * than the circular one we're used to.
+	 */
+	head = alloc_page_buffers(page, blocksize, 0);
+	if (!head)
+		return -ENOMEM;
+
 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
-	map_bh.b_page = page;
 
 	/*
 	 * We loop across all blocks in the page, whether or not they are
 	 * part of the affected region. This is so we can discover if the
 	 * page is fully mapped-to-disk.
 	 */
-	for (block_start = 0, block_in_page = 0;
+	for (block_start = 0, block_in_page = 0, bh = head;
 	     block_start < PAGE_CACHE_SIZE;
-	     block_in_page++, block_start += blocksize) {
-		unsigned block_end = block_start + blocksize;
+	     block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
 		int create;
 
-		map_bh.b_state = 0;
+		block_end = block_start + blocksize;
+		bh->b_state = 0;
 		create = 1;
 		if (block_start >= to)
 			create = 0;
-		map_bh.b_size = blocksize;
 		ret = get_block(inode, block_in_file + block_in_page,
-					&map_bh, create);
+					bh, create);
 		if (ret)
 			goto failed;
-		if (!buffer_mapped(&map_bh))
+		if (!buffer_mapped(bh))
 			is_mapped_to_disk = 0;
-		if (buffer_new(&map_bh))
-			unmap_underlying_metadata(map_bh.b_bdev,
-							map_bh.b_blocknr);
-		if (PageUptodate(page))
+		if (buffer_new(bh))
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		if (PageUptodate(page)) {
+			set_buffer_uptodate(bh);
 			continue;
-		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
+		}
+		if (buffer_new(bh) || !buffer_mapped(bh)) {
 			kaddr = kmap_atomic(page, KM_USER0);
 			if (block_start < from)
 				memset(kaddr+block_start, 0, from-block_start);
@@ -2328,49 +2341,26 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
 			kunmap_atomic(kaddr, KM_USER0);
 			continue;
 		}
-		if (buffer_uptodate(&map_bh))
+		if (buffer_uptodate(bh))
 			continue;	/* reiserfs does this */
 		if (block_start < from || block_end > to) {
-			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
-
-			if (!bh) {
-				ret = -ENOMEM;
-				goto failed;
-			}
-			bh->b_state = map_bh.b_state;
-			atomic_set(&bh->b_count, 0);
-			bh->b_this_page = NULL;
-			bh->b_page = page;
-			bh->b_blocknr = map_bh.b_blocknr;
-			bh->b_size = blocksize;
-			bh->b_data = (char *)(long)block_start;
-			bh->b_bdev = map_bh.b_bdev;
-			bh->b_private = NULL;
-			read_bh[nr_reads++] = bh;
+			lock_buffer(bh);
+			bh->b_end_io = end_buffer_read_nobh;
+			submit_bh(READ, bh);
+			nr_reads++;
 		}
 	}
 
 	if (nr_reads) {
-		struct buffer_head *bh;
-
 		/*
 		 * The page is locked, so these buffers are protected from
 		 * any VM or truncate activity. Hence we don't need to care
 		 * for the buffer_head refcounts.
 		 */
-		for (i = 0; i < nr_reads; i++) {
-			bh = read_bh[i];
-			lock_buffer(bh);
-			bh->b_end_io = end_buffer_read_nobh;
-			submit_bh(READ, bh);
-		}
-		for (i = 0; i < nr_reads; i++) {
-			bh = read_bh[i];
+		for (bh = head; bh; bh = bh->b_this_page) {
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh))
 				ret = -EIO;
-			free_buffer_head(bh);
-			read_bh[i] = NULL;
 		}
 		if (ret)
 			goto failed;
@@ -2379,21 +2369,54 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
 	if (is_mapped_to_disk)
 		SetPageMappedToDisk(page);
 
+	do {
+		bh = head;
+		head = head->b_this_page;
+		free_buffer_head(bh);
+	} while (head);
+
 	return 0;
 
 failed:
-	for (i = 0; i < nr_reads; i++) {
-		if (read_bh[i])
-			free_buffer_head(read_bh[i]);
-	}
-
 	/*
-	 * Error recovery is pretty slack. Clear the page and mark it dirty
-	 * so we'll later zero out any blocks which _were_ allocated.
+	 * Error recovery is a bit difficult. We need to zero out blocks that
+	 * were newly allocated, and dirty them to ensure they get written out.
+	 * Buffers need to be attached to the page at this point, otherwise
+	 * the handling of potential IO errors during writeout would be hard
+	 * (could try doing synchronous writeout, but what if that fails too?)
 	 */
-	zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
-	SetPageUptodate(page);
-	set_page_dirty(page);
+	spin_lock(&page->mapping->private_lock);
+	bh = head;
+	block_start = 0;
+	do {
+		if (PageUptodate(page))
+			set_buffer_uptodate(bh);
+		if (PageDirty(page))
+			set_buffer_dirty(bh);
+
+		block_end = block_start+blocksize;
+		if (block_end <= from)
+			goto next;
+		if (block_start >= to)
+			goto next;
+
+		if (buffer_new(bh)) {
+			clear_buffer_new(bh);
+			if (!buffer_uptodate(bh)) {
+				zero_user_page(page, block_start, bh->b_size, KM_USER0);
+				set_buffer_uptodate(bh);
+			}
+			mark_buffer_dirty(bh);
+		}
+next:
+		block_start = block_end;
+		if (!bh->b_this_page)
+			bh->b_this_page = head;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	attach_page_buffers(page, head);
+	spin_unlock(&page->mapping->private_lock);
+
 	return ret;
 }
 EXPORT_SYMBOL(nobh_prepare_write);
@@ -2408,6 +2431,9 @@ int nobh_commit_write(struct file *file, struct page *page,
 	struct inode *inode = page->mapping->host;
 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
+	if (page_has_buffers(page))
+		return generic_commit_write(file, page, from, to);
+
 	SetPageUptodate(page);
 	set_page_dirty(page);
 	if (pos > inode->i_size) {
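The error-path rule introduced above (zero and dirty only the newly allocated blocks, leave everything else in the page untouched, then let normal writeout and IO error handling take over) can likewise be sketched outside the kernel. This is a hypothetical user-space analogue only; block_state and recover_page are invented names, and the from/to range check performed by the real failed: path is omitted for brevity.

/*
 * Hypothetical sketch of the error-path recovery rule: only blocks that
 * were newly allocated get zeroed and dirtied; pre-existing data in the
 * "page" survives, unlike the old zero-the-whole-page behaviour.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE	4096
#define BLOCK_SIZE	1024
#define NR_BLOCKS	(PAGE_SIZE / BLOCK_SIZE)

struct block_state {
	int is_new;		/* freshly allocated by the block mapper */
	int uptodate;		/* contents already valid in the page    */
	int dirty;		/* must be written back                  */
};

/* On error, zero and dirty only the new, not-yet-uptodate blocks. */
static void recover_page(char page[PAGE_SIZE],
			 struct block_state blocks[NR_BLOCKS])
{
	for (int i = 0; i < NR_BLOCKS; i++) {
		if (!blocks[i].is_new)
			continue;	/* never touch pre-existing blocks */
		blocks[i].is_new = 0;
		if (!blocks[i].uptodate) {
			memset(page + i * BLOCK_SIZE, 0, BLOCK_SIZE);
			blocks[i].uptodate = 1;
		}
		blocks[i].dirty = 1;	/* ensure the zeroes reach disk */
	}
}

int main(void)
{
	char page[PAGE_SIZE];
	struct block_state blocks[NR_BLOCKS] = {
		{ .is_new = 0, .uptodate = 1 },	/* old data, must survive */
		{ .is_new = 1, .uptodate = 0 },	/* new block, gets zeroed */
		{ .is_new = 1, .uptodate = 0 },
		{ .is_new = 0, .uptodate = 1 },
	};

	memset(page, 'x', sizeof(page));	/* pretend existing data */
	recover_page(page, blocks);
	printf("block 0 byte: %c, block 1 byte: %d, block 1 dirty: %d\n",
	       page[0], page[BLOCK_SIZE], blocks[1].dirty);
	return 0;
}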