path: root/fs
author	Nick Piggin <npiggin@suse.de>	2007-10-16 04:24:48 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:42:54 -0400
commit	a4b0672db3a698d0684ee6e54f44e2e162a3da1b (patch)
tree	d53c52b251856520b0fd02969bfcdd4b5d332da6 /fs
parent	68671f35fe8d785277118a333c88768a4f894917 (diff)
fs: fix nobh error handling
nobh mode error handling is not just pretty slack, it's wrong.

One cannot zero out the whole page to ensure new blocks are zeroed, because it just brings the whole page "uptodate" with zeroes even if that may not be the correct uptodate data. Also, other parts of the page may already contain dirty data which would get lost by zeroing it out. Thirdly, the writeback of zeroes to the new blocks will also erase existing blocks. All these conditions are pagecache and/or filesystem corruption.

The problem comes about because we didn't keep track of which buffers actually are new or old. However it is not enough just to keep only this state, because at the point we start dirtying parts of the page (new blocks, with zeroes), the handling of IO errors becomes impossible without buffers, because the page may only be partially uptodate, in which case the page flags alone cannot capture the state of the parts of the page.

So allocate all buffers for the page upfront, but leave them unattached so that they don't pick up any other references and can be freed when we're done. If the error path is hit, then zero the new buffers as the regular buffer path does, then attach the buffers to the page so that it can actually be written out correctly and be subject to the normal IO error handling paths.

As an upshot, we save 1K of kernel stack on ia64 or powerpc 64K page systems.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
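For orientation, the error path that replaces the old "zero the whole page" recovery can be condensed roughly as in the sketch below. This is an illustration only, not code from the patch: the helper name nobh_attach_buffers_on_error is invented here, while every call it makes (alloc_page_buffers produces 'head', and attach_page_buffers, zero_user_page, mark_buffer_dirty and the buffer-state helpers are used on it) is taken from the actual diff further down.

#include <linux/buffer_head.h>
#include <linux/highmem.h>

/*
 * Illustrative sketch (hypothetical helper, not from the patch): roughly
 * what the new failure path in nobh_prepare_write() does once some blocks
 * have been handed to get_block().  'head' is the NULL-terminated buffer
 * list returned by alloc_page_buffers(); see the real code in the diff.
 */
static void nobh_attach_buffers_on_error(struct page *page,
		struct buffer_head *head, unsigned blocksize,
		unsigned from, unsigned to)
{
	struct buffer_head *bh = head;
	unsigned block_start = 0, block_end;

	spin_lock(&page->mapping->private_lock);
	do {
		/* inherit whatever page-wide state already exists */
		if (PageUptodate(page))
			set_buffer_uptodate(bh);
		if (PageDirty(page))
			set_buffer_dirty(bh);

		block_end = block_start + blocksize;
		/* only blocks inside [from, to) may have been newly allocated */
		if (block_end > from && block_start < to && buffer_new(bh)) {
			clear_buffer_new(bh);
			if (!buffer_uptodate(bh)) {
				/* zero the new block so stale disk contents
				 * never become visible through the pagecache */
				zero_user_page(page, block_start, bh->b_size,
						KM_USER0);
				set_buffer_uptodate(bh);
			}
			/* make sure the zeroed block gets written out */
			mark_buffer_dirty(bh);
		}

		block_start = block_end;
		if (!bh->b_this_page)		/* close the NULL-terminated  */
			bh->b_this_page = head;	/* list into the usual ring   */
		bh = bh->b_this_page;
	} while (bh != head);

	/* with buffers attached, normal writeback and its IO error handling
	 * now see correct per-block state */
	attach_page_buffers(page, head);
	spin_unlock(&page->mapping->private_lock);
}

On the success path the same list is simply walked and freed with free_buffer_head(), so in the common case the page never gains buffers at all.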
Diffstat (limited to 'fs')
-rw-r--r--	fs/buffer.c	138
1 file changed, 82 insertions(+), 56 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index b144fc367b8b..09bb80c479d8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2274,51 +2274,64 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
 	const unsigned blocksize = 1 << blkbits;
-	struct buffer_head map_bh;
-	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
+	struct buffer_head *head, *bh;
 	unsigned block_in_page;
-	unsigned block_start;
+	unsigned block_start, block_end;
 	sector_t block_in_file;
 	char *kaddr;
 	int nr_reads = 0;
-	int i;
 	int ret = 0;
 	int is_mapped_to_disk = 1;
 
+	if (page_has_buffers(page))
+		return block_prepare_write(page, from, to, get_block);
+
 	if (PageMappedToDisk(page))
 		return 0;
 
+	/*
+	 * Allocate buffers so that we can keep track of state, and potentially
+	 * attach them to the page if an error occurs. In the common case of
+	 * no error, they will just be freed again without ever being attached
+	 * to the page (which is all OK, because we're under the page lock).
+	 *
+	 * Be careful: the buffer linked list is a NULL terminated one, rather
+	 * than the circular one we're used to.
+	 */
+	head = alloc_page_buffers(page, blocksize, 0);
+	if (!head)
+		return -ENOMEM;
+
 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
-	map_bh.b_page = page;
 
 	/*
 	 * We loop across all blocks in the page, whether or not they are
 	 * part of the affected region. This is so we can discover if the
 	 * page is fully mapped-to-disk.
 	 */
-	for (block_start = 0, block_in_page = 0;
+	for (block_start = 0, block_in_page = 0, bh = head;
 		  block_start < PAGE_CACHE_SIZE;
-		  block_in_page++, block_start += blocksize) {
-		unsigned block_end = block_start + blocksize;
+		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
 		int create;
 
-		map_bh.b_state = 0;
+		block_end = block_start + blocksize;
+		bh->b_state = 0;
 		create = 1;
 		if (block_start >= to)
 			create = 0;
-		map_bh.b_size = blocksize;
 		ret = get_block(inode, block_in_file + block_in_page,
-					&map_bh, create);
+					bh, create);
 		if (ret)
 			goto failed;
-		if (!buffer_mapped(&map_bh))
+		if (!buffer_mapped(bh))
 			is_mapped_to_disk = 0;
-		if (buffer_new(&map_bh))
-			unmap_underlying_metadata(map_bh.b_bdev,
-						map_bh.b_blocknr);
-		if (PageUptodate(page))
+		if (buffer_new(bh))
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		if (PageUptodate(page)) {
+			set_buffer_uptodate(bh);
 			continue;
-		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
+		}
+		if (buffer_new(bh) || !buffer_mapped(bh)) {
 			kaddr = kmap_atomic(page, KM_USER0);
 			if (block_start < from)
 				memset(kaddr+block_start, 0, from-block_start);
@@ -2328,49 +2341,26 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
 			kunmap_atomic(kaddr, KM_USER0);
 			continue;
 		}
-		if (buffer_uptodate(&map_bh))
+		if (buffer_uptodate(bh))
 			continue;	/* reiserfs does this */
 		if (block_start < from || block_end > to) {
-			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
-
-			if (!bh) {
-				ret = -ENOMEM;
-				goto failed;
-			}
-			bh->b_state = map_bh.b_state;
-			atomic_set(&bh->b_count, 0);
-			bh->b_this_page = NULL;
-			bh->b_page = page;
-			bh->b_blocknr = map_bh.b_blocknr;
-			bh->b_size = blocksize;
-			bh->b_data = (char *)(long)block_start;
-			bh->b_bdev = map_bh.b_bdev;
-			bh->b_private = NULL;
-			read_bh[nr_reads++] = bh;
+			lock_buffer(bh);
+			bh->b_end_io = end_buffer_read_nobh;
+			submit_bh(READ, bh);
+			nr_reads++;
 		}
 	}
 
 	if (nr_reads) {
-		struct buffer_head *bh;
-
 		/*
 		 * The page is locked, so these buffers are protected from
 		 * any VM or truncate activity. Hence we don't need to care
 		 * for the buffer_head refcounts.
 		 */
-		for (i = 0; i < nr_reads; i++) {
-			bh = read_bh[i];
-			lock_buffer(bh);
-			bh->b_end_io = end_buffer_read_nobh;
-			submit_bh(READ, bh);
-		}
-		for (i = 0; i < nr_reads; i++) {
-			bh = read_bh[i];
+		for (bh = head; bh; bh = bh->b_this_page) {
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh))
 				ret = -EIO;
-			free_buffer_head(bh);
-			read_bh[i] = NULL;
 		}
 		if (ret)
 			goto failed;
@@ -2379,21 +2369,54 @@ int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
 	if (is_mapped_to_disk)
 		SetPageMappedToDisk(page);
 
+	do {
+		bh = head;
+		head = head->b_this_page;
+		free_buffer_head(bh);
+	} while (head);
+
 	return 0;
 
 failed:
-	for (i = 0; i < nr_reads; i++) {
-		if (read_bh[i])
-			free_buffer_head(read_bh[i]);
-	}
-
 	/*
-	 * Error recovery is pretty slack. Clear the page and mark it dirty
-	 * so we'll later zero out any blocks which _were_ allocated.
+	 * Error recovery is a bit difficult. We need to zero out blocks that
+	 * were newly allocated, and dirty them to ensure they get written out.
+	 * Buffers need to be attached to the page at this point, otherwise
+	 * the handling of potential IO errors during writeout would be hard
+	 * (could try doing synchronous writeout, but what if that fails too?)
 	 */
-	zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
-	SetPageUptodate(page);
-	set_page_dirty(page);
+	spin_lock(&page->mapping->private_lock);
+	bh = head;
+	block_start = 0;
+	do {
+		if (PageUptodate(page))
+			set_buffer_uptodate(bh);
+		if (PageDirty(page))
+			set_buffer_dirty(bh);
+
+		block_end = block_start+blocksize;
+		if (block_end <= from)
+			goto next;
+		if (block_start >= to)
+			goto next;
+
+		if (buffer_new(bh)) {
+			clear_buffer_new(bh);
+			if (!buffer_uptodate(bh)) {
+				zero_user_page(page, block_start, bh->b_size, KM_USER0);
+				set_buffer_uptodate(bh);
+			}
+			mark_buffer_dirty(bh);
+		}
+next:
+		block_start = block_end;
+		if (!bh->b_this_page)
+			bh->b_this_page = head;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	attach_page_buffers(page, head);
+	spin_unlock(&page->mapping->private_lock);
+
 	return ret;
 }
 EXPORT_SYMBOL(nobh_prepare_write);
@@ -2408,6 +2431,9 @@ int nobh_commit_write(struct file *file, struct page *page,
 	struct inode *inode = page->mapping->host;
 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
+	if (page_has_buffers(page))
+		return generic_commit_write(file, page, from, to);
+
 	SetPageUptodate(page);
 	set_page_dirty(page);
 	if (pos > inode->i_size) {