diff options
author | npiggin@suse.de <npiggin@suse.de> | 2010-05-26 11:05:33 -0400 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2010-05-27 22:15:33 -0400 |
commit | 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f (patch) | |
tree | e575d9c55e2a6ccc645dcb3ae2564de458b428f2 /fs/buffer.c | |
parent | 7000d3c424e5bb350e502a477fb0e1ed42f8b10e (diff) |
fs: introduce new truncate sequence
Introduce a new truncate calling sequence into fs/mm subsystems. Rather than
setattr > vmtruncate > truncate, have filesystems call their truncate sequence
from ->setattr if filesystem specific operations are required. vmtruncate is
deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced
previously should be used.
simple_setattr is introduced for simple in-ram filesystems to implement
the new truncate sequence. Eventually all filesystems should be converted
to implement a setattr, and the default code in notify_change should go
away.
simple_setsize is also introduced to perform just the ATTR_SIZE portion
of simple_setattr (i.e. changing i_size and trimming pagecache).
To implement the new truncate sequence:
- filesystem specific manipulations (e.g. freeing blocks) must be done in
the setattr method rather than ->truncate.
- vmtruncate cannot be used by core code to trim blocks past i_size in
the event of write failure after allocation, so this must be performed
in the fs code.
- convert usage of helpers block_write_begin, nobh_write_begin,
cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed
variants. These avoid calling vmtruncate to trim blocks (see previous).
- inode_setattr should not be used. generic_setattr is a new function
to be used to copy simple attributes into the generic inode.
- make use of the better opportunity to handle errors with the new sequence.
Big problem with the previous calling sequence: the filesystem is not called
until i_size has already changed. This means it is not allowed to fail the
call, and also it does not know what the previous i_size was. Also, generic
code calling vmtruncate to truncate allocated blocks in case of error had
no good way to return a meaningful error (or, for example, atomically handle
block deallocation).
Cc: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/buffer.c')
-rw-r--r-- | fs/buffer.c | 123 |
1 files changed, 98 insertions, 25 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25c..d54812b198e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, | |||
1949 | } | 1949 | } |
1950 | 1950 | ||
1951 | /* | 1951 | /* |
1952 | * block_write_begin takes care of the basic task of block allocation and | 1952 | * Filesystems implementing the new truncate sequence should use the |
1953 | * bringing partial write blocks uptodate first. | 1953 | * _newtrunc postfix variant which won't incorrectly call vmtruncate. |
1954 | * | 1954 | * The filesystem needs to handle block truncation upon failure. |
1955 | * If *pagep is not NULL, then block_write_begin uses the locked page | ||
1956 | * at *pagep rather than allocating its own. In this case, the page will | ||
1957 | * not be unlocked or deallocated on failure. | ||
1958 | */ | 1955 | */ |
1959 | int block_write_begin(struct file *file, struct address_space *mapping, | 1956 | int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, |
1960 | loff_t pos, unsigned len, unsigned flags, | 1957 | loff_t pos, unsigned len, unsigned flags, |
1961 | struct page **pagep, void **fsdata, | 1958 | struct page **pagep, void **fsdata, |
1962 | get_block_t *get_block) | 1959 | get_block_t *get_block) |
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, | |||
1992 | unlock_page(page); | 1989 | unlock_page(page); |
1993 | page_cache_release(page); | 1990 | page_cache_release(page); |
1994 | *pagep = NULL; | 1991 | *pagep = NULL; |
1995 | |||
1996 | /* | ||
1997 | * prepare_write() may have instantiated a few blocks | ||
1998 | * outside i_size. Trim these off again. Don't need | ||
1999 | * i_size_read because we hold i_mutex. | ||
2000 | */ | ||
2001 | if (pos + len > inode->i_size) | ||
2002 | vmtruncate(inode, inode->i_size); | ||
2003 | } | 1992 | } |
2004 | } | 1993 | } |
2005 | 1994 | ||
2006 | out: | 1995 | out: |
2007 | return status; | 1996 | return status; |
2008 | } | 1997 | } |
1998 | EXPORT_SYMBOL(block_write_begin_newtrunc); | ||
1999 | |||
2000 | /* | ||
2001 | * block_write_begin takes care of the basic task of block allocation and | ||
2002 | * bringing partial write blocks uptodate first. | ||
2003 | * | ||
2004 | * If *pagep is not NULL, then block_write_begin uses the locked page | ||
2005 | * at *pagep rather than allocating its own. In this case, the page will | ||
2006 | * not be unlocked or deallocated on failure. | ||
2007 | */ | ||
2008 | int block_write_begin(struct file *file, struct address_space *mapping, | ||
2009 | loff_t pos, unsigned len, unsigned flags, | ||
2010 | struct page **pagep, void **fsdata, | ||
2011 | get_block_t *get_block) | ||
2012 | { | ||
2013 | int ret; | ||
2014 | |||
2015 | ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, | ||
2016 | pagep, fsdata, get_block); | ||
2017 | |||
2018 | /* | ||
2019 | * prepare_write() may have instantiated a few blocks | ||
2020 | * outside i_size. Trim these off again. Don't need | ||
2021 | * i_size_read because we hold i_mutex. | ||
2022 | * | ||
2023 | * Filesystems which pass down their own page also cannot | ||
2024 | * call into vmtruncate here because it would lead to lock | ||
2025 | * inversion problems (*pagep is locked). This is a further | ||
2026 | * example of where the old truncate sequence is inadequate. | ||
2027 | */ | ||
2028 | if (unlikely(ret) && *pagep == NULL) { | ||
2029 | loff_t isize = mapping->host->i_size; | ||
2030 | if (pos + len > isize) | ||
2031 | vmtruncate(mapping->host, isize); | ||
2032 | } | ||
2033 | |||
2034 | return ret; | ||
2035 | } | ||
2009 | EXPORT_SYMBOL(block_write_begin); | 2036 | EXPORT_SYMBOL(block_write_begin); |
2010 | 2037 | ||
2011 | int block_write_end(struct file *file, struct address_space *mapping, | 2038 | int block_write_end(struct file *file, struct address_space *mapping, |
@@ -2324,7 +2351,7 @@ out: | |||
2324 | * For moronic filesystems that do not allow holes in file. | 2351 | * For moronic filesystems that do not allow holes in file. |
2325 | * We may have to extend the file. | 2352 | * We may have to extend the file. |
2326 | */ | 2353 | */ |
2327 | int cont_write_begin(struct file *file, struct address_space *mapping, | 2354 | int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, |
2328 | loff_t pos, unsigned len, unsigned flags, | 2355 | loff_t pos, unsigned len, unsigned flags, |
2329 | struct page **pagep, void **fsdata, | 2356 | struct page **pagep, void **fsdata, |
2330 | get_block_t *get_block, loff_t *bytes) | 2357 | get_block_t *get_block, loff_t *bytes) |
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, | |||
2345 | } | 2372 | } |
2346 | 2373 | ||
2347 | *pagep = NULL; | 2374 | *pagep = NULL; |
2348 | err = block_write_begin(file, mapping, pos, len, | 2375 | err = block_write_begin_newtrunc(file, mapping, pos, len, |
2349 | flags, pagep, fsdata, get_block); | 2376 | flags, pagep, fsdata, get_block); |
2350 | out: | 2377 | out: |
2351 | return err; | 2378 | return err; |
2352 | } | 2379 | } |
2380 | EXPORT_SYMBOL(cont_write_begin_newtrunc); | ||
2381 | |||
2382 | int cont_write_begin(struct file *file, struct address_space *mapping, | ||
2383 | loff_t pos, unsigned len, unsigned flags, | ||
2384 | struct page **pagep, void **fsdata, | ||
2385 | get_block_t *get_block, loff_t *bytes) | ||
2386 | { | ||
2387 | int ret; | ||
2388 | |||
2389 | ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, | ||
2390 | pagep, fsdata, get_block, bytes); | ||
2391 | if (unlikely(ret)) { | ||
2392 | loff_t isize = mapping->host->i_size; | ||
2393 | if (pos + len > isize) | ||
2394 | vmtruncate(mapping->host, isize); | ||
2395 | } | ||
2396 | |||
2397 | return ret; | ||
2398 | } | ||
2353 | EXPORT_SYMBOL(cont_write_begin); | 2399 | EXPORT_SYMBOL(cont_write_begin); |
2354 | 2400 | ||
2355 | int block_prepare_write(struct page *page, unsigned from, unsigned to, | 2401 | int block_prepare_write(struct page *page, unsigned from, unsigned to, |
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); | |||
2381 | * | 2427 | * |
2382 | * We are not allowed to take the i_mutex here so we have to play games to | 2428 | * We are not allowed to take the i_mutex here so we have to play games to |
2383 | * protect against truncate races as the page could now be beyond EOF. Because | 2429 | * protect against truncate races as the page could now be beyond EOF. Because |
2384 | * vmtruncate() writes the inode size before removing pages, once we have the | 2430 | * truncate writes the inode size before removing pages, once we have the |
2385 | * page lock we can determine safely if the page is beyond EOF. If it is not | 2431 | * page lock we can determine safely if the page is beyond EOF. If it is not |
2386 | * beyond EOF, then the page is guaranteed safe against truncation until we | 2432 | * beyond EOF, then the page is guaranteed safe against truncation until we |
2387 | * unlock the page. | 2433 | * unlock the page. |
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) | |||
2464 | } | 2510 | } |
2465 | 2511 | ||
2466 | /* | 2512 | /* |
2467 | * On entry, the page is fully not uptodate. | 2513 | * Filesystems implementing the new truncate sequence should use the |
2468 | * On exit the page is fully uptodate in the areas outside (from,to) | 2514 | * _newtrunc postfix variant which won't incorrectly call vmtruncate. |
2515 | * The filesystem needs to handle block truncation upon failure. | ||
2469 | */ | 2516 | */ |
2470 | int nobh_write_begin(struct file *file, struct address_space *mapping, | 2517 | int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, |
2471 | loff_t pos, unsigned len, unsigned flags, | 2518 | loff_t pos, unsigned len, unsigned flags, |
2472 | struct page **pagep, void **fsdata, | 2519 | struct page **pagep, void **fsdata, |
2473 | get_block_t *get_block) | 2520 | get_block_t *get_block) |
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, | |||
2500 | unlock_page(page); | 2547 | unlock_page(page); |
2501 | page_cache_release(page); | 2548 | page_cache_release(page); |
2502 | *pagep = NULL; | 2549 | *pagep = NULL; |
2503 | return block_write_begin(file, mapping, pos, len, flags, pagep, | 2550 | return block_write_begin_newtrunc(file, mapping, pos, len, |
2504 | fsdata, get_block); | 2551 | flags, pagep, fsdata, get_block); |
2505 | } | 2552 | } |
2506 | 2553 | ||
2507 | if (PageMappedToDisk(page)) | 2554 | if (PageMappedToDisk(page)) |
@@ -2605,8 +2652,34 @@ out_release: | |||
2605 | page_cache_release(page); | 2652 | page_cache_release(page); |
2606 | *pagep = NULL; | 2653 | *pagep = NULL; |
2607 | 2654 | ||
2608 | if (pos + len > inode->i_size) | 2655 | return ret; |
2609 | vmtruncate(inode, inode->i_size); | 2656 | } |
2657 | EXPORT_SYMBOL(nobh_write_begin_newtrunc); | ||
2658 | |||
2659 | /* | ||
2660 | * On entry, the page is fully not uptodate. | ||
2661 | * On exit the page is fully uptodate in the areas outside (from,to) | ||
2662 | */ | ||
2663 | int nobh_write_begin(struct file *file, struct address_space *mapping, | ||
2664 | loff_t pos, unsigned len, unsigned flags, | ||
2665 | struct page **pagep, void **fsdata, | ||
2666 | get_block_t *get_block) | ||
2667 | { | ||
2668 | int ret; | ||
2669 | |||
2670 | ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, | ||
2671 | pagep, fsdata, get_block); | ||
2672 | |||
2673 | /* | ||
2674 | * prepare_write() may have instantiated a few blocks | ||
2675 | * outside i_size. Trim these off again. Don't need | ||
2676 | * i_size_read because we hold i_mutex. | ||
2677 | */ | ||
2678 | if (unlikely(ret)) { | ||
2679 | loff_t isize = mapping->host->i_size; | ||
2680 | if (pos + len > isize) | ||
2681 | vmtruncate(mapping->host, isize); | ||
2682 | } | ||
2610 | 2683 | ||
2611 | return ret; | 2684 | return ret; |
2612 | } | 2685 | } |