diff options
author | npiggin@suse.de <npiggin@suse.de> | 2010-05-26 11:05:33 -0400 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2010-05-27 22:15:33 -0400 |
commit | 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f (patch) | |
tree | e575d9c55e2a6ccc645dcb3ae2564de458b428f2 | |
parent | 7000d3c424e5bb350e502a477fb0e1ed42f8b10e (diff) |
fs: introduce new truncate sequence
Introduce a new truncate calling sequence into fs/mm subsystems. Rather than
setattr > vmtruncate > truncate, have filesystems call their truncate sequence
from ->setattr if filesystem specific operations are required. vmtruncate is
deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced
previously should be used.
simple_setattr is introduced for simple in-ram filesystems to implement
the new truncate sequence. Eventually all filesystems should be converted
to implement a setattr, and the default code in notify_change should go
away.
simple_setsize is also introduced to perform just the ATTR_SIZE portion
of simple_setattr (ie. changing i_size and trimming pagecache).
To implement the new truncate sequence:
- filesystem specific manipulations (eg freeing blocks) must be done in
the setattr method rather than ->truncate.
- vmtruncate cannot be used by core code to trim blocks past i_size in
the event of write failure after allocation, so this must be performed
in the fs code.
- convert usage of helpers block_write_begin, nobh_write_begin,
cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed
variants. These avoid calling vmtruncate to trim blocks (see previous).
- inode_setattr should not be used. generic_setattr is a new function
to be used to copy simple attributes into the generic inode.
- make use of the better opportunity to handle errors with the new sequence.
Big problem with the previous calling sequence: the filesystem is not called
until i_size has already changed. This means it is not allowed to fail the
call, and also it does not know what the previous i_size was. Also, generic
code calling vmtruncate to truncate allocated blocks in case of error had
no good way to return a meaningful error (or, for example, atomically handle
block deallocation).
Cc: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r-- | Documentation/filesystems/vfs.txt | 7 | ||||
-rw-r--r-- | fs/attr.c | 50 | ||||
-rw-r--r-- | fs/buffer.c | 123 | ||||
-rw-r--r-- | fs/direct-io.c | 61 | ||||
-rw-r--r-- | fs/libfs.c | 76 | ||||
-rw-r--r-- | include/linux/buffer_head.h | 9 | ||||
-rw-r--r-- | include/linux/fs.h | 27 | ||||
-rw-r--r-- | mm/truncate.c | 10 |
8 files changed, 300 insertions, 63 deletions
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index d4f5731dcbbb..94677e7dcb13 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -401,11 +401,16 @@ otherwise noted. | |||
401 | started might not be in the page cache at the end of the | 401 | started might not be in the page cache at the end of the |
402 | walk). | 402 | walk). |
403 | 403 | ||
404 | truncate: called by the VFS to change the size of a file. The | 404 | truncate: Deprecated. This will not be called if ->setsize is defined. |
405 | Called by the VFS to change the size of a file. The | ||
405 | i_size field of the inode is set to the desired size by the | 406 | i_size field of the inode is set to the desired size by the |
406 | VFS before this method is called. This method is called by | 407 | VFS before this method is called. This method is called by |
407 | the truncate(2) system call and related functionality. | 408 | the truncate(2) system call and related functionality. |
408 | 409 | ||
410 | Note: ->truncate and vmtruncate are deprecated. Do not add new | ||
411 | instances/calls of these. Filesystems should be converted to do their | ||
412 | truncate sequence via ->setattr(). | ||
413 | |||
409 | permission: called by the VFS to check for access rights on a POSIX-like | 414 | permission: called by the VFS to check for access rights on a POSIX-like |
410 | filesystem. | 415 | filesystem. |
411 | 416 | ||
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok); | |||
67 | * @offset: the new size to assign to the inode | 67 | * @offset: the new size to assign to the inode |
68 | * @Returns: 0 on success, -ve errno on failure | 68 | * @Returns: 0 on success, -ve errno on failure |
69 | * | 69 | * |
70 | * inode_newsize_ok must be called with i_mutex held. | ||
71 | * | ||
70 | * inode_newsize_ok will check filesystem limits and ulimits to check that the | 72 | * inode_newsize_ok will check filesystem limits and ulimits to check that the |
71 | * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ | 73 | * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ |
72 | * when necessary. Caller must not proceed with inode size change if failure is | 74 | * when necessary. Caller must not proceed with inode size change if failure is |
73 | * returned. @inode must be a file (not directory), with appropriate | 75 | * returned. @inode must be a file (not directory), with appropriate |
74 | * permissions to allow truncate (inode_newsize_ok does NOT check these | 76 | * permissions to allow truncate (inode_newsize_ok does NOT check these |
75 | * conditions). | 77 | * conditions). |
76 | * | ||
77 | * inode_newsize_ok must be called with i_mutex held. | ||
78 | */ | 78 | */ |
79 | int inode_newsize_ok(const struct inode *inode, loff_t offset) | 79 | int inode_newsize_ok(const struct inode *inode, loff_t offset) |
80 | { | 80 | { |
@@ -104,17 +104,25 @@ out_big: | |||
104 | } | 104 | } |
105 | EXPORT_SYMBOL(inode_newsize_ok); | 105 | EXPORT_SYMBOL(inode_newsize_ok); |
106 | 106 | ||
107 | int inode_setattr(struct inode * inode, struct iattr * attr) | 107 | /** |
108 | * generic_setattr - copy simple metadata updates into the generic inode | ||
109 | * @inode: the inode to be updated | ||
110 | * @attr: the new attributes | ||
111 | * | ||
112 | * generic_setattr must be called with i_mutex held. | ||
113 | * | ||
114 | * generic_setattr updates the inode's metadata with that specified | ||
115 | * in attr. Noticeably missing is inode size update, which is more complex | ||
116 | * as it requires pagecache updates. See simple_setsize. | ||
117 | * | ||
118 | * The inode is not marked as dirty after this operation. The rationale is | ||
119 | * that for "simple" filesystems, the struct inode is the inode storage. | ||
120 | * The caller is free to mark the inode dirty afterwards if needed. | ||
121 | */ | ||
122 | void generic_setattr(struct inode *inode, const struct iattr *attr) | ||
108 | { | 123 | { |
109 | unsigned int ia_valid = attr->ia_valid; | 124 | unsigned int ia_valid = attr->ia_valid; |
110 | 125 | ||
111 | if (ia_valid & ATTR_SIZE && | ||
112 | attr->ia_size != i_size_read(inode)) { | ||
113 | int error = vmtruncate(inode, attr->ia_size); | ||
114 | if (error) | ||
115 | return error; | ||
116 | } | ||
117 | |||
118 | if (ia_valid & ATTR_UID) | 126 | if (ia_valid & ATTR_UID) |
119 | inode->i_uid = attr->ia_uid; | 127 | inode->i_uid = attr->ia_uid; |
120 | if (ia_valid & ATTR_GID) | 128 | if (ia_valid & ATTR_GID) |
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr) | |||
135 | mode &= ~S_ISGID; | 143 | mode &= ~S_ISGID; |
136 | inode->i_mode = mode; | 144 | inode->i_mode = mode; |
137 | } | 145 | } |
146 | } | ||
147 | EXPORT_SYMBOL(generic_setattr); | ||
148 | |||
149 | /* | ||
150 | * note this function is deprecated, the new truncate sequence should be | ||
151 | * used instead -- see eg. simple_setsize, generic_setattr. | ||
152 | */ | ||
153 | int inode_setattr(struct inode *inode, const struct iattr *attr) | ||
154 | { | ||
155 | unsigned int ia_valid = attr->ia_valid; | ||
156 | |||
157 | if (ia_valid & ATTR_SIZE && | ||
158 | attr->ia_size != i_size_read(inode)) { | ||
159 | int error; | ||
160 | |||
161 | error = vmtruncate(inode, attr->ia_size); | ||
162 | if (error) | ||
163 | return error; | ||
164 | } | ||
165 | |||
166 | generic_setattr(inode, attr); | ||
167 | |||
138 | mark_inode_dirty(inode); | 168 | mark_inode_dirty(inode); |
139 | 169 | ||
140 | return 0; | 170 | return 0; |
diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25c..d54812b198e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, | |||
1949 | } | 1949 | } |
1950 | 1950 | ||
1951 | /* | 1951 | /* |
1952 | * block_write_begin takes care of the basic task of block allocation and | 1952 | * Filesystems implementing the new truncate sequence should use the |
1953 | * bringing partial write blocks uptodate first. | 1953 | * _newtrunc postfix variant which won't incorrectly call vmtruncate. |
1954 | * | 1954 | * The filesystem needs to handle block truncation upon failure. |
1955 | * If *pagep is not NULL, then block_write_begin uses the locked page | ||
1956 | * at *pagep rather than allocating its own. In this case, the page will | ||
1957 | * not be unlocked or deallocated on failure. | ||
1958 | */ | 1955 | */ |
1959 | int block_write_begin(struct file *file, struct address_space *mapping, | 1956 | int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, |
1960 | loff_t pos, unsigned len, unsigned flags, | 1957 | loff_t pos, unsigned len, unsigned flags, |
1961 | struct page **pagep, void **fsdata, | 1958 | struct page **pagep, void **fsdata, |
1962 | get_block_t *get_block) | 1959 | get_block_t *get_block) |
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, | |||
1992 | unlock_page(page); | 1989 | unlock_page(page); |
1993 | page_cache_release(page); | 1990 | page_cache_release(page); |
1994 | *pagep = NULL; | 1991 | *pagep = NULL; |
1995 | |||
1996 | /* | ||
1997 | * prepare_write() may have instantiated a few blocks | ||
1998 | * outside i_size. Trim these off again. Don't need | ||
1999 | * i_size_read because we hold i_mutex. | ||
2000 | */ | ||
2001 | if (pos + len > inode->i_size) | ||
2002 | vmtruncate(inode, inode->i_size); | ||
2003 | } | 1992 | } |
2004 | } | 1993 | } |
2005 | 1994 | ||
2006 | out: | 1995 | out: |
2007 | return status; | 1996 | return status; |
2008 | } | 1997 | } |
1998 | EXPORT_SYMBOL(block_write_begin_newtrunc); | ||
1999 | |||
2000 | /* | ||
2001 | * block_write_begin takes care of the basic task of block allocation and | ||
2002 | * bringing partial write blocks uptodate first. | ||
2003 | * | ||
2004 | * If *pagep is not NULL, then block_write_begin uses the locked page | ||
2005 | * at *pagep rather than allocating its own. In this case, the page will | ||
2006 | * not be unlocked or deallocated on failure. | ||
2007 | */ | ||
2008 | int block_write_begin(struct file *file, struct address_space *mapping, | ||
2009 | loff_t pos, unsigned len, unsigned flags, | ||
2010 | struct page **pagep, void **fsdata, | ||
2011 | get_block_t *get_block) | ||
2012 | { | ||
2013 | int ret; | ||
2014 | |||
2015 | ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, | ||
2016 | pagep, fsdata, get_block); | ||
2017 | |||
2018 | /* | ||
2019 | * prepare_write() may have instantiated a few blocks | ||
2020 | * outside i_size. Trim these off again. Don't need | ||
2021 | * i_size_read because we hold i_mutex. | ||
2022 | * | ||
2023 | * Filesystems which pass down their own page also cannot | ||
2024 | * call into vmtruncate here because it would lead to lock | ||
2025 | * inversion problems (*pagep is locked). This is a further | ||
2026 | * example of where the old truncate sequence is inadequate. | ||
2027 | */ | ||
2028 | if (unlikely(ret) && *pagep == NULL) { | ||
2029 | loff_t isize = mapping->host->i_size; | ||
2030 | if (pos + len > isize) | ||
2031 | vmtruncate(mapping->host, isize); | ||
2032 | } | ||
2033 | |||
2034 | return ret; | ||
2035 | } | ||
2009 | EXPORT_SYMBOL(block_write_begin); | 2036 | EXPORT_SYMBOL(block_write_begin); |
2010 | 2037 | ||
2011 | int block_write_end(struct file *file, struct address_space *mapping, | 2038 | int block_write_end(struct file *file, struct address_space *mapping, |
@@ -2324,7 +2351,7 @@ out: | |||
2324 | * For moronic filesystems that do not allow holes in file. | 2351 | * For moronic filesystems that do not allow holes in file. |
2325 | * We may have to extend the file. | 2352 | * We may have to extend the file. |
2326 | */ | 2353 | */ |
2327 | int cont_write_begin(struct file *file, struct address_space *mapping, | 2354 | int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, |
2328 | loff_t pos, unsigned len, unsigned flags, | 2355 | loff_t pos, unsigned len, unsigned flags, |
2329 | struct page **pagep, void **fsdata, | 2356 | struct page **pagep, void **fsdata, |
2330 | get_block_t *get_block, loff_t *bytes) | 2357 | get_block_t *get_block, loff_t *bytes) |
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, | |||
2345 | } | 2372 | } |
2346 | 2373 | ||
2347 | *pagep = NULL; | 2374 | *pagep = NULL; |
2348 | err = block_write_begin(file, mapping, pos, len, | 2375 | err = block_write_begin_newtrunc(file, mapping, pos, len, |
2349 | flags, pagep, fsdata, get_block); | 2376 | flags, pagep, fsdata, get_block); |
2350 | out: | 2377 | out: |
2351 | return err; | 2378 | return err; |
2352 | } | 2379 | } |
2380 | EXPORT_SYMBOL(cont_write_begin_newtrunc); | ||
2381 | |||
2382 | int cont_write_begin(struct file *file, struct address_space *mapping, | ||
2383 | loff_t pos, unsigned len, unsigned flags, | ||
2384 | struct page **pagep, void **fsdata, | ||
2385 | get_block_t *get_block, loff_t *bytes) | ||
2386 | { | ||
2387 | int ret; | ||
2388 | |||
2389 | ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, | ||
2390 | pagep, fsdata, get_block, bytes); | ||
2391 | if (unlikely(ret)) { | ||
2392 | loff_t isize = mapping->host->i_size; | ||
2393 | if (pos + len > isize) | ||
2394 | vmtruncate(mapping->host, isize); | ||
2395 | } | ||
2396 | |||
2397 | return ret; | ||
2398 | } | ||
2353 | EXPORT_SYMBOL(cont_write_begin); | 2399 | EXPORT_SYMBOL(cont_write_begin); |
2354 | 2400 | ||
2355 | int block_prepare_write(struct page *page, unsigned from, unsigned to, | 2401 | int block_prepare_write(struct page *page, unsigned from, unsigned to, |
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); | |||
2381 | * | 2427 | * |
2382 | * We are not allowed to take the i_mutex here so we have to play games to | 2428 | * We are not allowed to take the i_mutex here so we have to play games to |
2383 | * protect against truncate races as the page could now be beyond EOF. Because | 2429 | * protect against truncate races as the page could now be beyond EOF. Because |
2384 | * vmtruncate() writes the inode size before removing pages, once we have the | 2430 | * truncate writes the inode size before removing pages, once we have the |
2385 | * page lock we can determine safely if the page is beyond EOF. If it is not | 2431 | * page lock we can determine safely if the page is beyond EOF. If it is not |
2386 | * beyond EOF, then the page is guaranteed safe against truncation until we | 2432 | * beyond EOF, then the page is guaranteed safe against truncation until we |
2387 | * unlock the page. | 2433 | * unlock the page. |
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) | |||
2464 | } | 2510 | } |
2465 | 2511 | ||
2466 | /* | 2512 | /* |
2467 | * On entry, the page is fully not uptodate. | 2513 | * Filesystems implementing the new truncate sequence should use the |
2468 | * On exit the page is fully uptodate in the areas outside (from,to) | 2514 | * _newtrunc postfix variant which won't incorrectly call vmtruncate. |
2515 | * The filesystem needs to handle block truncation upon failure. | ||
2469 | */ | 2516 | */ |
2470 | int nobh_write_begin(struct file *file, struct address_space *mapping, | 2517 | int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, |
2471 | loff_t pos, unsigned len, unsigned flags, | 2518 | loff_t pos, unsigned len, unsigned flags, |
2472 | struct page **pagep, void **fsdata, | 2519 | struct page **pagep, void **fsdata, |
2473 | get_block_t *get_block) | 2520 | get_block_t *get_block) |
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, | |||
2500 | unlock_page(page); | 2547 | unlock_page(page); |
2501 | page_cache_release(page); | 2548 | page_cache_release(page); |
2502 | *pagep = NULL; | 2549 | *pagep = NULL; |
2503 | return block_write_begin(file, mapping, pos, len, flags, pagep, | 2550 | return block_write_begin_newtrunc(file, mapping, pos, len, |
2504 | fsdata, get_block); | 2551 | flags, pagep, fsdata, get_block); |
2505 | } | 2552 | } |
2506 | 2553 | ||
2507 | if (PageMappedToDisk(page)) | 2554 | if (PageMappedToDisk(page)) |
@@ -2605,8 +2652,34 @@ out_release: | |||
2605 | page_cache_release(page); | 2652 | page_cache_release(page); |
2606 | *pagep = NULL; | 2653 | *pagep = NULL; |
2607 | 2654 | ||
2608 | if (pos + len > inode->i_size) | 2655 | return ret; |
2609 | vmtruncate(inode, inode->i_size); | 2656 | } |
2657 | EXPORT_SYMBOL(nobh_write_begin_newtrunc); | ||
2658 | |||
2659 | /* | ||
2660 | * On entry, the page is fully not uptodate. | ||
2661 | * On exit the page is fully uptodate in the areas outside (from,to) | ||
2662 | */ | ||
2663 | int nobh_write_begin(struct file *file, struct address_space *mapping, | ||
2664 | loff_t pos, unsigned len, unsigned flags, | ||
2665 | struct page **pagep, void **fsdata, | ||
2666 | get_block_t *get_block) | ||
2667 | { | ||
2668 | int ret; | ||
2669 | |||
2670 | ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, | ||
2671 | pagep, fsdata, get_block); | ||
2672 | |||
2673 | /* | ||
2674 | * prepare_write() may have instantiated a few blocks | ||
2675 | * outside i_size. Trim these off again. Don't need | ||
2676 | * i_size_read because we hold i_mutex. | ||
2677 | */ | ||
2678 | if (unlikely(ret)) { | ||
2679 | loff_t isize = mapping->host->i_size; | ||
2680 | if (pos + len > isize) | ||
2681 | vmtruncate(mapping->host, isize); | ||
2682 | } | ||
2610 | 2683 | ||
2611 | return ret; | 2684 | return ret; |
2612 | } | 2685 | } |
diff --git a/fs/direct-io.c b/fs/direct-io.c index da111aacb46e..7600aacf531d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -1134,27 +1134,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |||
1134 | return ret; | 1134 | return ret; |
1135 | } | 1135 | } |
1136 | 1136 | ||
1137 | /* | ||
1138 | * This is a library function for use by filesystem drivers. | ||
1139 | * | ||
1140 | * The locking rules are governed by the flags parameter: | ||
1141 | * - if the flags value contains DIO_LOCKING we use a fancy locking | ||
1142 | * scheme for dumb filesystems. | ||
1143 | * For writes this function is called under i_mutex and returns with | ||
1144 | * i_mutex held, for reads, i_mutex is not held on entry, but it is | ||
1145 | * taken and dropped again before returning. | ||
1146 | * For reads and writes i_alloc_sem is taken in shared mode and released | ||
1147 | * on I/O completion (which may happen asynchronously after returning to | ||
1148 | * the caller). | ||
1149 | * | ||
1150 | * - if the flags value does NOT contain DIO_LOCKING we don't use any | ||
1151 | * internal locking but rather rely on the filesystem to synchronize | ||
1152 | * direct I/O reads/writes versus each other and truncate. | ||
1153 | * For reads and writes both i_mutex and i_alloc_sem are not held on | ||
1154 | * entry and are never taken. | ||
1155 | */ | ||
1156 | ssize_t | 1137 | ssize_t |
1157 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 1138 | __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, |
1158 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 1139 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
1159 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 1140 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
1160 | dio_submit_t submit_io, int flags) | 1141 | dio_submit_t submit_io, int flags) |
@@ -1247,9 +1228,46 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1247 | nr_segs, blkbits, get_block, end_io, | 1228 | nr_segs, blkbits, get_block, end_io, |
1248 | submit_io, dio); | 1229 | submit_io, dio); |
1249 | 1230 | ||
1231 | out: | ||
1232 | return retval; | ||
1233 | } | ||
1234 | EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc); | ||
1235 | |||
1236 | /* | ||
1237 | * This is a library function for use by filesystem drivers. | ||
1238 | * | ||
1239 | * The locking rules are governed by the flags parameter: | ||
1240 | * - if the flags value contains DIO_LOCKING we use a fancy locking | ||
1241 | * scheme for dumb filesystems. | ||
1242 | * For writes this function is called under i_mutex and returns with | ||
1243 | * i_mutex held, for reads, i_mutex is not held on entry, but it is | ||
1244 | * taken and dropped again before returning. | ||
1245 | * For reads and writes i_alloc_sem is taken in shared mode and released | ||
1246 | * on I/O completion (which may happen asynchronously after returning to | ||
1247 | * the caller). | ||
1248 | * | ||
1249 | * - if the flags value does NOT contain DIO_LOCKING we don't use any | ||
1250 | * internal locking but rather rely on the filesystem to synchronize | ||
1251 | * direct I/O reads/writes versus each other and truncate. | ||
1252 | * For reads and writes both i_mutex and i_alloc_sem are not held on | ||
1253 | * entry and are never taken. | ||
1254 | */ | ||
1255 | ssize_t | ||
1256 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | ||
1257 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | ||
1258 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | ||
1259 | dio_submit_t submit_io, int flags) | ||
1260 | { | ||
1261 | ssize_t retval; | ||
1262 | |||
1263 | retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, | ||
1264 | offset, nr_segs, get_block, end_io, submit_io, flags); | ||
1250 | /* | 1265 | /* |
1251 | * In case of error extending write may have instantiated a few | 1266 | * In case of error extending write may have instantiated a few |
1252 | * blocks outside i_size. Trim these off again for DIO_LOCKING. | 1267 | * blocks outside i_size. Trim these off again for DIO_LOCKING. |
1268 | * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in | ||
1269 | * their own manner. This is a further example of where the old | ||
1270 | * truncate sequence is inadequate. | ||
1253 | * | 1271 | * |
1254 | * NOTE: filesystems with their own locking have to handle this | 1272 | * NOTE: filesystems with their own locking have to handle this |
1255 | * on their own. | 1273 | * on their own. |
@@ -1257,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1257 | if (flags & DIO_LOCKING) { | 1275 | if (flags & DIO_LOCKING) { |
1258 | if (unlikely((rw & WRITE) && retval < 0)) { | 1276 | if (unlikely((rw & WRITE) && retval < 0)) { |
1259 | loff_t isize = i_size_read(inode); | 1277 | loff_t isize = i_size_read(inode); |
1278 | loff_t end = offset + iov_length(iov, nr_segs); | ||
1279 | |||
1260 | if (end > isize) | 1280 | if (end > isize) |
1261 | vmtruncate(inode, isize); | 1281 | vmtruncate(inode, isize); |
1262 | } | 1282 | } |
1263 | } | 1283 | } |
1264 | 1284 | ||
1265 | out: | ||
1266 | return retval; | 1285 | return retval; |
1267 | } | 1286 | } |
1268 | EXPORT_SYMBOL(__blockdev_direct_IO); | 1287 | EXPORT_SYMBOL(__blockdev_direct_IO); |
diff --git a/fs/libfs.c b/fs/libfs.c index b84d0a7a2204..09e1016eb774 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/mount.h> | 9 | #include <linux/mount.h> |
10 | #include <linux/vfs.h> | 10 | #include <linux/vfs.h> |
11 | #include <linux/quotaops.h> | ||
11 | #include <linux/mutex.h> | 12 | #include <linux/mutex.h> |
12 | #include <linux/exportfs.h> | 13 | #include <linux/exportfs.h> |
13 | #include <linux/writeback.h> | 14 | #include <linux/writeback.h> |
@@ -325,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
325 | return 0; | 326 | return 0; |
326 | } | 327 | } |
327 | 328 | ||
329 | /** | ||
330 | * simple_setsize - handle core mm and vfs requirements for file size change | ||
331 | * @inode: inode | ||
332 | * @newsize: new file size | ||
333 | * | ||
334 | * Returns 0 on success, -error on failure. | ||
335 | * | ||
336 | * simple_setsize must be called with i_mutex held. | ||
337 | * | ||
338 | * simple_setsize will check that the requested new size is OK (see | ||
339 | * inode_newsize_ok), and then will perform the necessary i_size update | ||
340 | * and pagecache truncation (if necessary). It will typically be called | ||
341 | * from the filesystem's setattr function when ATTR_SIZE is passed in. | ||
342 | * | ||
343 | * The inode itself must have correct permissions and attributes to allow | ||
344 | * i_size to be changed; this function then just checks that the new size | ||
345 | * requested is valid. | ||
346 | * | ||
347 | * In the case of simple in-memory filesystems with inodes stored solely | ||
348 | * in the inode cache, and file data in the pagecache, nothing more needs | ||
349 | * to be done to satisfy a truncate request. Filesystems with on-disk | ||
350 | * blocks for example will need to free them in the case of truncate, in | ||
351 | * that case it may be easier not to use simple_setsize (but each of its | ||
352 | * components will likely be required at some point to update pagecache | ||
353 | * and inode etc). | ||
354 | */ | ||
355 | int simple_setsize(struct inode *inode, loff_t newsize) | ||
356 | { | ||
357 | loff_t oldsize; | ||
358 | int error; | ||
359 | |||
360 | error = inode_newsize_ok(inode, newsize); | ||
361 | if (error) | ||
362 | return error; | ||
363 | |||
364 | oldsize = inode->i_size; | ||
365 | i_size_write(inode, newsize); | ||
366 | truncate_pagecache(inode, oldsize, newsize); | ||
367 | |||
368 | return error; | ||
369 | } | ||
370 | EXPORT_SYMBOL(simple_setsize); | ||
371 | |||
372 | /** | ||
373 | * simple_setattr - setattr for simple in-memory filesystem | ||
374 | * @dentry: dentry | ||
375 | * @iattr: iattr structure | ||
376 | * | ||
377 | * Returns 0 on success, -error on failure. | ||
378 | * | ||
379 | * simple_setattr implements setattr for an in-memory filesystem which | ||
380 | * does not store its own file data or metadata (eg. uses the page cache | ||
381 | * and inode cache as its data store). | ||
382 | */ | ||
383 | int simple_setattr(struct dentry *dentry, struct iattr *iattr) | ||
384 | { | ||
385 | struct inode *inode = dentry->d_inode; | ||
386 | int error; | ||
387 | |||
388 | error = inode_change_ok(inode, iattr); | ||
389 | if (error) | ||
390 | return error; | ||
391 | |||
392 | if (iattr->ia_valid & ATTR_SIZE) { | ||
393 | error = simple_setsize(inode, iattr->ia_size); | ||
394 | if (error) | ||
395 | return error; | ||
396 | } | ||
397 | |||
398 | generic_setattr(inode, iattr); | ||
399 | |||
400 | return error; | ||
401 | } | ||
402 | EXPORT_SYMBOL(simple_setattr); | ||
403 | |||
328 | int simple_readpage(struct file *file, struct page *page) | 404 | int simple_readpage(struct file *file, struct page *page) |
329 | { | 405 | { |
330 | clear_highpage(page); | 406 | clear_highpage(page); |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 05e5f5996216..1b9ba193b789 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -203,6 +203,9 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, | |||
203 | int block_read_full_page(struct page*, get_block_t*); | 203 | int block_read_full_page(struct page*, get_block_t*); |
204 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, | 204 | int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, |
205 | unsigned long from); | 205 | unsigned long from); |
206 | int block_write_begin_newtrunc(struct file *, struct address_space *, | ||
207 | loff_t, unsigned, unsigned, | ||
208 | struct page **, void **, get_block_t*); | ||
206 | int block_write_begin(struct file *, struct address_space *, | 209 | int block_write_begin(struct file *, struct address_space *, |
207 | loff_t, unsigned, unsigned, | 210 | loff_t, unsigned, unsigned, |
208 | struct page **, void **, get_block_t*); | 211 | struct page **, void **, get_block_t*); |
@@ -214,6 +217,9 @@ int generic_write_end(struct file *, struct address_space *, | |||
214 | struct page *, void *); | 217 | struct page *, void *); |
215 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); | 218 | void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); |
216 | int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); | 219 | int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); |
220 | int cont_write_begin_newtrunc(struct file *, struct address_space *, loff_t, | ||
221 | unsigned, unsigned, struct page **, void **, | ||
222 | get_block_t *, loff_t *); | ||
217 | int cont_write_begin(struct file *, struct address_space *, loff_t, | 223 | int cont_write_begin(struct file *, struct address_space *, loff_t, |
218 | unsigned, unsigned, struct page **, void **, | 224 | unsigned, unsigned, struct page **, void **, |
219 | get_block_t *, loff_t *); | 225 | get_block_t *, loff_t *); |
@@ -225,6 +231,9 @@ void block_sync_page(struct page *); | |||
225 | sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); | 231 | sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); |
226 | int block_truncate_page(struct address_space *, loff_t, get_block_t *); | 232 | int block_truncate_page(struct address_space *, loff_t, get_block_t *); |
227 | int file_fsync(struct file *, int); | 233 | int file_fsync(struct file *, int); |
234 | int nobh_write_begin_newtrunc(struct file *, struct address_space *, | ||
235 | loff_t, unsigned, unsigned, | ||
236 | struct page **, void **, get_block_t*); | ||
228 | int nobh_write_begin(struct file *, struct address_space *, | 237 | int nobh_write_begin(struct file *, struct address_space *, |
229 | loff_t, unsigned, unsigned, | 238 | loff_t, unsigned, unsigned, |
230 | struct page **, void **, get_block_t*); | 239 | struct page **, void **, get_block_t*); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index acf6c52a50dd..3428393942a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -2257,6 +2257,10 @@ typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, | |||
2257 | loff_t file_offset); | 2257 | loff_t file_offset); |
2258 | void dio_end_io(struct bio *bio, int error); | 2258 | void dio_end_io(struct bio *bio, int error); |
2259 | 2259 | ||
2260 | ssize_t __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, | ||
2261 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | ||
2262 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | ||
2263 | dio_submit_t submit_io, int lock_type); | ||
2260 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | 2264 | ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
2261 | struct block_device *bdev, const struct iovec *iov, loff_t offset, | 2265 | struct block_device *bdev, const struct iovec *iov, loff_t offset, |
2262 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, | 2266 | unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, |
@@ -2270,6 +2274,24 @@ enum { | |||
2270 | DIO_SKIP_HOLES = 0x02, | 2274 | DIO_SKIP_HOLES = 0x02, |
2271 | }; | 2275 | }; |
2272 | 2276 | ||
2277 | static inline ssize_t blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, | ||
2278 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | ||
2279 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | ||
2280 | dio_iodone_t end_io) | ||
2281 | { | ||
2282 | return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, | ||
2283 | nr_segs, get_block, end_io, NULL, | ||
2284 | DIO_LOCKING | DIO_SKIP_HOLES); | ||
2285 | } | ||
2286 | |||
2287 | static inline ssize_t blockdev_direct_IO_no_locking_newtrunc(int rw, struct kiocb *iocb, | ||
2288 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | ||
2289 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | ||
2290 | dio_iodone_t end_io) | ||
2291 | { | ||
2292 | return __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov, offset, | ||
2293 | nr_segs, get_block, end_io, NULL, 0); | ||
2294 | } | ||
2273 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, | 2295 | static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, |
2274 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, | 2296 | struct inode *inode, struct block_device *bdev, const struct iovec *iov, |
2275 | loff_t offset, unsigned long nr_segs, get_block_t get_block, | 2297 | loff_t offset, unsigned long nr_segs, get_block_t get_block, |
@@ -2342,12 +2364,14 @@ extern int dcache_dir_open(struct inode *, struct file *); | |||
2342 | extern int dcache_dir_close(struct inode *, struct file *); | 2364 | extern int dcache_dir_close(struct inode *, struct file *); |
2343 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); | 2365 | extern loff_t dcache_dir_lseek(struct file *, loff_t, int); |
2344 | extern int dcache_readdir(struct file *, void *, filldir_t); | 2366 | extern int dcache_readdir(struct file *, void *, filldir_t); |
2367 | extern int simple_setattr(struct dentry *, struct iattr *); | ||
2345 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 2368 | extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
2346 | extern int simple_statfs(struct dentry *, struct kstatfs *); | 2369 | extern int simple_statfs(struct dentry *, struct kstatfs *); |
2347 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); | 2370 | extern int simple_link(struct dentry *, struct inode *, struct dentry *); |
2348 | extern int simple_unlink(struct inode *, struct dentry *); | 2371 | extern int simple_unlink(struct inode *, struct dentry *); |
2349 | extern int simple_rmdir(struct inode *, struct dentry *); | 2372 | extern int simple_rmdir(struct inode *, struct dentry *); |
2350 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); | 2373 | extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); |
2374 | extern int simple_setsize(struct inode *, loff_t); | ||
2351 | extern int noop_fsync(struct file *, int); | 2375 | extern int noop_fsync(struct file *, int); |
2352 | extern int simple_empty(struct dentry *); | 2376 | extern int simple_empty(struct dentry *); |
2353 | extern int simple_readpage(struct file *file, struct page *page); | 2377 | extern int simple_readpage(struct file *file, struct page *page); |
@@ -2384,7 +2408,8 @@ extern int buffer_migrate_page(struct address_space *, | |||
2384 | 2408 | ||
2385 | extern int inode_change_ok(const struct inode *, struct iattr *); | 2409 | extern int inode_change_ok(const struct inode *, struct iattr *); |
2386 | extern int inode_newsize_ok(const struct inode *, loff_t offset); | 2410 | extern int inode_newsize_ok(const struct inode *, loff_t offset); |
2387 | extern int __must_check inode_setattr(struct inode *, struct iattr *); | 2411 | extern int __must_check inode_setattr(struct inode *, const struct iattr *); |
2412 | extern void generic_setattr(struct inode *inode, const struct iattr *attr); | ||
2388 | 2413 | ||
2389 | extern void file_update_time(struct file *file); | 2414 | extern void file_update_time(struct file *file); |
2390 | 2415 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index f42675a3615d..937571b8b233 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -548,18 +548,18 @@ EXPORT_SYMBOL(truncate_pagecache); | |||
548 | * NOTE! We have to be ready to update the memory sharing | 548 | * NOTE! We have to be ready to update the memory sharing |
549 | * between the file and the memory map for a potential last | 549 | * between the file and the memory map for a potential last |
550 | * incomplete page. Ugly, but necessary. | 550 | * incomplete page. Ugly, but necessary. |
551 | * | ||
552 | * This function is deprecated and simple_setsize or truncate_pagecache | ||
553 | * should be used instead. | ||
551 | */ | 554 | */ |
552 | int vmtruncate(struct inode *inode, loff_t offset) | 555 | int vmtruncate(struct inode *inode, loff_t offset) |
553 | { | 556 | { |
554 | loff_t oldsize; | ||
555 | int error; | 557 | int error; |
556 | 558 | ||
557 | error = inode_newsize_ok(inode, offset); | 559 | error = simple_setsize(inode, offset); |
558 | if (error) | 560 | if (error) |
559 | return error; | 561 | return error; |
560 | oldsize = inode->i_size; | 562 | |
561 | i_size_write(inode, offset); | ||
562 | truncate_pagecache(inode, oldsize, offset); | ||
563 | if (inode->i_op->truncate) | 563 | if (inode->i_op->truncate) |
564 | inode->i_op->truncate(inode); | 564 | inode->i_op->truncate(inode); |
565 | 565 | ||