diff options
author | Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | 2009-06-14 17:58:45 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-06-14 17:58:45 -0400 |
commit | 43ce1d23b43330634507a049b55c36e91d27282e (patch) | |
tree | d7532d7c0bea60b7bfe29dad51b4b92122acdd5a /fs/ext4 | |
parent | c364b22c9580a885e0f8c0d0f9710d67dc448958 (diff) |
ext4: Fix mmap/truncate race when blocksize < pagesize && !nodelalloc
This patch fixes the mmap/truncate race that was fixed for delayed
allocation by merging ext4_{journalled,normal,da}_writepage() into
ext4_writepage().
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/inode.c | 234 |
1 file changed, 57 insertions, 177 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1275f34589c7..97c48b5b0578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -47,6 +47,10 @@ | |||
47 | 47 | ||
48 | #define MPAGE_DA_EXTENT_TAIL 0x01 | 48 | #define MPAGE_DA_EXTENT_TAIL 0x01 |
49 | 49 | ||
50 | static int __ext4_journalled_writepage(struct page *page, | ||
51 | struct writeback_control *wbc, | ||
52 | unsigned int len); | ||
53 | |||
50 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | 54 | static inline int ext4_begin_ordered_truncate(struct inode *inode, |
51 | loff_t new_size) | 55 | loff_t new_size) |
52 | { | 56 | { |
@@ -2392,7 +2396,7 @@ static int __mpage_da_writepage(struct page *page, | |||
2392 | * We need to try to allocate | 2396 | * We need to try to allocate |
2393 | * unmapped blocks in the same page. | 2397 | * unmapped blocks in the same page. |
2394 | * Otherwise we won't make progress | 2398 | * Otherwise we won't make progress |
2395 | * with the page in ext4_da_writepage | 2399 | * with the page in ext4_writepage |
2396 | */ | 2400 | */ |
2397 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { | 2401 | if (ext4_bh_delay_or_unwritten(NULL, bh)) { |
2398 | mpage_add_bh_to_extent(mpd, logical, | 2402 | mpage_add_bh_to_extent(mpd, logical, |
@@ -2519,13 +2523,47 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, | |||
2519 | } | 2523 | } |
2520 | 2524 | ||
2521 | /* | 2525 | /* |
2526 | * Note that we don't need to start a transaction unless we're journaling data | ||
2527 | * because we should have holes filled from ext4_page_mkwrite(). We even don't | ||
2528 | * need to file the inode to the transaction's list in ordered mode because if | ||
2529 | * we are writing back data added by write(), the inode is already there and if | ||
2530 | * we are writing back data modified via mmap(), noone guarantees in which | ||
2531 | * transaction the data will hit the disk. In case we are journaling data, we | ||
2532 | * cannot start transaction directly because transaction start ranks above page | ||
2533 | * lock so we have to do some magic. | ||
2534 | * | ||
2522 | * This function can get called via... | 2535 | * This function can get called via... |
2523 | * - ext4_da_writepages after taking page lock (have journal handle) | 2536 | * - ext4_da_writepages after taking page lock (have journal handle) |
2524 | * - journal_submit_inode_data_buffers (no journal handle) | 2537 | * - journal_submit_inode_data_buffers (no journal handle) |
2525 | * - shrink_page_list via pdflush (no journal handle) | 2538 | * - shrink_page_list via pdflush (no journal handle) |
2526 | * - grab_page_cache when doing write_begin (have journal handle) | 2539 | * - grab_page_cache when doing write_begin (have journal handle) |
2540 | * | ||
2541 | * We don't do any block allocation in this function. If we have page with | ||
2542 | * multiple blocks we need to write those buffer_heads that are mapped. This | ||
2543 | * is important for mmaped based write. So if we do with blocksize 1K | ||
2544 | * truncate(f, 1024); | ||
2545 | * a = mmap(f, 0, 4096); | ||
2546 | * a[0] = 'a'; | ||
2547 | * truncate(f, 4096); | ||
2548 | * we have in the page first buffer_head mapped via page_mkwrite call back | ||
2549 | * but other bufer_heads would be unmapped but dirty(dirty done via the | ||
2550 | * do_wp_page). So writepage should write the first block. If we modify | ||
2551 | * the mmap area beyond 1024 we will again get a page_fault and the | ||
2552 | * page_mkwrite callback will do the block allocation and mark the | ||
2553 | * buffer_heads mapped. | ||
2554 | * | ||
2555 | * We redirty the page if we have any buffer_heads that is either delay or | ||
2556 | * unwritten in the page. | ||
2557 | * | ||
2558 | * We can get recursively called as show below. | ||
2559 | * | ||
2560 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | ||
2561 | * ext4_writepage() | ||
2562 | * | ||
2563 | * But since we don't do any block allocation we should not deadlock. | ||
2564 | * Page also have the dirty flag cleared so we don't get recurive page_lock. | ||
2527 | */ | 2565 | */ |
2528 | static int ext4_da_writepage(struct page *page, | 2566 | static int ext4_writepage(struct page *page, |
2529 | struct writeback_control *wbc) | 2567 | struct writeback_control *wbc) |
2530 | { | 2568 | { |
2531 | int ret = 0; | 2569 | int ret = 0; |
@@ -2534,7 +2572,7 @@ static int ext4_da_writepage(struct page *page, | |||
2534 | struct buffer_head *page_bufs; | 2572 | struct buffer_head *page_bufs; |
2535 | struct inode *inode = page->mapping->host; | 2573 | struct inode *inode = page->mapping->host; |
2536 | 2574 | ||
2537 | trace_ext4_da_writepage(inode, page); | 2575 | trace_ext4_writepage(inode, page); |
2538 | size = i_size_read(inode); | 2576 | size = i_size_read(inode); |
2539 | if (page->index == size >> PAGE_CACHE_SHIFT) | 2577 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2540 | len = size & ~PAGE_CACHE_MASK; | 2578 | len = size & ~PAGE_CACHE_MASK; |
@@ -2596,6 +2634,15 @@ static int ext4_da_writepage(struct page *page, | |||
2596 | block_commit_write(page, 0, len); | 2634 | block_commit_write(page, 0, len); |
2597 | } | 2635 | } |
2598 | 2636 | ||
2637 | if (PageChecked(page) && ext4_should_journal_data(inode)) { | ||
2638 | /* | ||
2639 | * It's mmapped pagecache. Add buffers and journal it. There | ||
2640 | * doesn't seem much point in redirtying the page here. | ||
2641 | */ | ||
2642 | ClearPageChecked(page); | ||
2643 | return __ext4_journalled_writepage(page, wbc, len); | ||
2644 | } | ||
2645 | |||
2599 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | 2646 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) |
2600 | ret = nobh_writepage(page, noalloc_get_block_write, wbc); | 2647 | ret = nobh_writepage(page, noalloc_get_block_write, wbc); |
2601 | else | 2648 | else |
@@ -3135,112 +3182,10 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) | |||
3135 | return 0; | 3182 | return 0; |
3136 | } | 3183 | } |
3137 | 3184 | ||
3138 | /* | ||
3139 | * Note that we don't need to start a transaction unless we're journaling data | ||
3140 | * because we should have holes filled from ext4_page_mkwrite(). We even don't | ||
3141 | * need to file the inode to the transaction's list in ordered mode because if | ||
3142 | * we are writing back data added by write(), the inode is already there and if | ||
3143 | * we are writing back data modified via mmap(), noone guarantees in which | ||
3144 | * transaction the data will hit the disk. In case we are journaling data, we | ||
3145 | * cannot start transaction directly because transaction start ranks above page | ||
3146 | * lock so we have to do some magic. | ||
3147 | * | ||
3148 | * In all journaling modes block_write_full_page() will start the I/O. | ||
3149 | * | ||
3150 | * Problem: | ||
3151 | * | ||
3152 | * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> | ||
3153 | * ext4_writepage() | ||
3154 | * | ||
3155 | * Similar for: | ||
3156 | * | ||
3157 | * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... | ||
3158 | * | ||
3159 | * Same applies to ext4_get_block(). We will deadlock on various things like | ||
3160 | * lock_journal and i_data_sem | ||
3161 | * | ||
3162 | * Setting PF_MEMALLOC here doesn't work - too many internal memory | ||
3163 | * allocations fail. | ||
3164 | * | ||
3165 | * 16May01: If we're reentered then journal_current_handle() will be | ||
3166 | * non-zero. We simply *return*. | ||
3167 | * | ||
3168 | * 1 July 2001: @@@ FIXME: | ||
3169 | * In journalled data mode, a data buffer may be metadata against the | ||
3170 | * current transaction. But the same file is part of a shared mapping | ||
3171 | * and someone does a writepage() on it. | ||
3172 | * | ||
3173 | * We will move the buffer onto the async_data list, but *after* it has | ||
3174 | * been dirtied. So there's a small window where we have dirty data on | ||
3175 | * BJ_Metadata. | ||
3176 | * | ||
3177 | * Note that this only applies to the last partial page in the file. The | ||
3178 | * bit which block_write_full_page() uses prepare/commit for. (That's | ||
3179 | * broken code anyway: it's wrong for msync()). | ||
3180 | * | ||
3181 | * It's a rare case: affects the final partial page, for journalled data | ||
3182 | * where the file is subject to bith write() and writepage() in the same | ||
3183 | * transction. To fix it we'll need a custom block_write_full_page(). | ||
3184 | * We'll probably need that anyway for journalling writepage() output. | ||
3185 | * | ||
3186 | * We don't honour synchronous mounts for writepage(). That would be | ||
3187 | * disastrous. Any write() or metadata operation will sync the fs for | ||
3188 | * us. | ||
3189 | * | ||
3190 | */ | ||
3191 | static int __ext4_normal_writepage(struct page *page, | ||
3192 | struct writeback_control *wbc) | ||
3193 | { | ||
3194 | struct inode *inode = page->mapping->host; | ||
3195 | |||
3196 | if (test_opt(inode->i_sb, NOBH)) | ||
3197 | return nobh_writepage(page, noalloc_get_block_write, wbc); | ||
3198 | else | ||
3199 | return block_write_full_page(page, noalloc_get_block_write, | ||
3200 | wbc); | ||
3201 | } | ||
3202 | |||
3203 | static int ext4_normal_writepage(struct page *page, | ||
3204 | struct writeback_control *wbc) | ||
3205 | { | ||
3206 | struct inode *inode = page->mapping->host; | ||
3207 | loff_t size = i_size_read(inode); | ||
3208 | loff_t len; | ||
3209 | |||
3210 | trace_ext4_normal_writepage(inode, page); | ||
3211 | J_ASSERT(PageLocked(page)); | ||
3212 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
3213 | len = size & ~PAGE_CACHE_MASK; | ||
3214 | else | ||
3215 | len = PAGE_CACHE_SIZE; | ||
3216 | |||
3217 | if (page_has_buffers(page)) { | ||
3218 | /* if page has buffers it should all be mapped | ||
3219 | * and allocated. If there are not buffers attached | ||
3220 | * to the page we know the page is dirty but it lost | ||
3221 | * buffers. That means that at some moment in time | ||
3222 | * after write_begin() / write_end() has been called | ||
3223 | * all buffers have been clean and thus they must have been | ||
3224 | * written at least once. So they are all mapped and we can | ||
3225 | * happily proceed with mapping them and writing the page. | ||
3226 | */ | ||
3227 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
3228 | ext4_bh_delay_or_unwritten)); | ||
3229 | } | ||
3230 | |||
3231 | if (!ext4_journal_current_handle()) | ||
3232 | return __ext4_normal_writepage(page, wbc); | ||
3233 | |||
3234 | redirty_page_for_writepage(wbc, page); | ||
3235 | unlock_page(page); | ||
3236 | return 0; | ||
3237 | } | ||
3238 | |||
3239 | static int __ext4_journalled_writepage(struct page *page, | 3185 | static int __ext4_journalled_writepage(struct page *page, |
3240 | struct writeback_control *wbc) | 3186 | struct writeback_control *wbc, |
3187 | unsigned int len) | ||
3241 | { | 3188 | { |
3242 | loff_t size; | ||
3243 | unsigned int len; | ||
3244 | struct address_space *mapping = page->mapping; | 3189 | struct address_space *mapping = page->mapping; |
3245 | struct inode *inode = mapping->host; | 3190 | struct inode *inode = mapping->host; |
3246 | struct buffer_head *page_bufs; | 3191 | struct buffer_head *page_bufs; |
@@ -3248,16 +3193,8 @@ static int __ext4_journalled_writepage(struct page *page, | |||
3248 | int ret = 0; | 3193 | int ret = 0; |
3249 | int err; | 3194 | int err; |
3250 | 3195 | ||
3251 | size = i_size_read(inode); | ||
3252 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
3253 | len = size & ~PAGE_CACHE_MASK; | ||
3254 | else | ||
3255 | len = PAGE_CACHE_SIZE; | ||
3256 | ret = block_prepare_write(page, 0, len, noalloc_get_block_write); | ||
3257 | if (ret != 0) | ||
3258 | goto out_unlock; | ||
3259 | |||
3260 | page_bufs = page_buffers(page); | 3196 | page_bufs = page_buffers(page); |
3197 | BUG_ON(!page_bufs); | ||
3261 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); | 3198 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); |
3262 | /* As soon as we unlock the page, it can go away, but we have | 3199 | /* As soon as we unlock the page, it can go away, but we have |
3263 | * references to buffers so we are safe */ | 3200 | * references to buffers so we are safe */ |
@@ -3282,67 +3219,10 @@ static int __ext4_journalled_writepage(struct page *page, | |||
3282 | 3219 | ||
3283 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); | 3220 | walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); |
3284 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | 3221 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; |
3285 | goto out; | ||
3286 | |||
3287 | out_unlock: | ||
3288 | unlock_page(page); | ||
3289 | out: | 3222 | out: |
3290 | return ret; | 3223 | return ret; |
3291 | } | 3224 | } |
3292 | 3225 | ||
3293 | static int ext4_journalled_writepage(struct page *page, | ||
3294 | struct writeback_control *wbc) | ||
3295 | { | ||
3296 | struct inode *inode = page->mapping->host; | ||
3297 | loff_t size = i_size_read(inode); | ||
3298 | loff_t len; | ||
3299 | |||
3300 | trace_ext4_journalled_writepage(inode, page); | ||
3301 | J_ASSERT(PageLocked(page)); | ||
3302 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
3303 | len = size & ~PAGE_CACHE_MASK; | ||
3304 | else | ||
3305 | len = PAGE_CACHE_SIZE; | ||
3306 | |||
3307 | if (page_has_buffers(page)) { | ||
3308 | /* if page has buffers it should all be mapped | ||
3309 | * and allocated. If there are not buffers attached | ||
3310 | * to the page we know the page is dirty but it lost | ||
3311 | * buffers. That means that at some moment in time | ||
3312 | * after write_begin() / write_end() has been called | ||
3313 | * all buffers have been clean and thus they must have been | ||
3314 | * written at least once. So they are all mapped and we can | ||
3315 | * happily proceed with mapping them and writing the page. | ||
3316 | */ | ||
3317 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
3318 | ext4_bh_delay_or_unwritten)); | ||
3319 | } | ||
3320 | |||
3321 | if (ext4_journal_current_handle()) | ||
3322 | goto no_write; | ||
3323 | |||
3324 | if (PageChecked(page)) { | ||
3325 | /* | ||
3326 | * It's mmapped pagecache. Add buffers and journal it. There | ||
3327 | * doesn't seem much point in redirtying the page here. | ||
3328 | */ | ||
3329 | ClearPageChecked(page); | ||
3330 | return __ext4_journalled_writepage(page, wbc); | ||
3331 | } else { | ||
3332 | /* | ||
3333 | * It may be a page full of checkpoint-mode buffers. We don't | ||
3334 | * really know unless we go poke around in the buffer_heads. | ||
3335 | * But block_write_full_page will do the right thing. | ||
3336 | */ | ||
3337 | return block_write_full_page(page, noalloc_get_block_write, | ||
3338 | wbc); | ||
3339 | } | ||
3340 | no_write: | ||
3341 | redirty_page_for_writepage(wbc, page); | ||
3342 | unlock_page(page); | ||
3343 | return 0; | ||
3344 | } | ||
3345 | |||
3346 | static int ext4_readpage(struct file *file, struct page *page) | 3226 | static int ext4_readpage(struct file *file, struct page *page) |
3347 | { | 3227 | { |
3348 | return mpage_readpage(page, ext4_get_block); | 3228 | return mpage_readpage(page, ext4_get_block); |
@@ -3489,7 +3369,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
3489 | static const struct address_space_operations ext4_ordered_aops = { | 3369 | static const struct address_space_operations ext4_ordered_aops = { |
3490 | .readpage = ext4_readpage, | 3370 | .readpage = ext4_readpage, |
3491 | .readpages = ext4_readpages, | 3371 | .readpages = ext4_readpages, |
3492 | .writepage = ext4_normal_writepage, | 3372 | .writepage = ext4_writepage, |
3493 | .sync_page = block_sync_page, | 3373 | .sync_page = block_sync_page, |
3494 | .write_begin = ext4_write_begin, | 3374 | .write_begin = ext4_write_begin, |
3495 | .write_end = ext4_ordered_write_end, | 3375 | .write_end = ext4_ordered_write_end, |
@@ -3504,7 +3384,7 @@ static const struct address_space_operations ext4_ordered_aops = { | |||
3504 | static const struct address_space_operations ext4_writeback_aops = { | 3384 | static const struct address_space_operations ext4_writeback_aops = { |
3505 | .readpage = ext4_readpage, | 3385 | .readpage = ext4_readpage, |
3506 | .readpages = ext4_readpages, | 3386 | .readpages = ext4_readpages, |
3507 | .writepage = ext4_normal_writepage, | 3387 | .writepage = ext4_writepage, |
3508 | .sync_page = block_sync_page, | 3388 | .sync_page = block_sync_page, |
3509 | .write_begin = ext4_write_begin, | 3389 | .write_begin = ext4_write_begin, |
3510 | .write_end = ext4_writeback_write_end, | 3390 | .write_end = ext4_writeback_write_end, |
@@ -3519,7 +3399,7 @@ static const struct address_space_operations ext4_writeback_aops = { | |||
3519 | static const struct address_space_operations ext4_journalled_aops = { | 3399 | static const struct address_space_operations ext4_journalled_aops = { |
3520 | .readpage = ext4_readpage, | 3400 | .readpage = ext4_readpage, |
3521 | .readpages = ext4_readpages, | 3401 | .readpages = ext4_readpages, |
3522 | .writepage = ext4_journalled_writepage, | 3402 | .writepage = ext4_writepage, |
3523 | .sync_page = block_sync_page, | 3403 | .sync_page = block_sync_page, |
3524 | .write_begin = ext4_write_begin, | 3404 | .write_begin = ext4_write_begin, |
3525 | .write_end = ext4_journalled_write_end, | 3405 | .write_end = ext4_journalled_write_end, |
@@ -3533,7 +3413,7 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
3533 | static const struct address_space_operations ext4_da_aops = { | 3413 | static const struct address_space_operations ext4_da_aops = { |
3534 | .readpage = ext4_readpage, | 3414 | .readpage = ext4_readpage, |
3535 | .readpages = ext4_readpages, | 3415 | .readpages = ext4_readpages, |
3536 | .writepage = ext4_da_writepage, | 3416 | .writepage = ext4_writepage, |
3537 | .writepages = ext4_da_writepages, | 3417 | .writepages = ext4_da_writepages, |
3538 | .sync_page = block_sync_page, | 3418 | .sync_page = block_sync_page, |
3539 | .write_begin = ext4_da_write_begin, | 3419 | .write_begin = ext4_da_write_begin, |