aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>2009-06-14 17:58:45 -0400
committerTheodore Ts'o <tytso@mit.edu>2009-06-14 17:58:45 -0400
commit43ce1d23b43330634507a049b55c36e91d27282e (patch)
treed7532d7c0bea60b7bfe29dad51b4b92122acdd5a
parentc364b22c9580a885e0f8c0d0f9710d67dc448958 (diff)
ext4: Fix mmap/truncate race when blocksize < pagesize && !nodelalloc
This patch fixes the mmap/truncate race that was fixed for delayed allocation by merging ext4_{journalled,normal,da}_writepage() into ext4_writepage().

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/ext4/inode.c234
-rw-r--r--include/trace/events/ext4.h45
2 files changed, 58 insertions, 221 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1275f34589c7..97c48b5b0578 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,6 +47,10 @@
47 47
48#define MPAGE_DA_EXTENT_TAIL 0x01 48#define MPAGE_DA_EXTENT_TAIL 0x01
49 49
50static int __ext4_journalled_writepage(struct page *page,
51 struct writeback_control *wbc,
52 unsigned int len);
53
50static inline int ext4_begin_ordered_truncate(struct inode *inode, 54static inline int ext4_begin_ordered_truncate(struct inode *inode,
51 loff_t new_size) 55 loff_t new_size)
52{ 56{
@@ -2392,7 +2396,7 @@ static int __mpage_da_writepage(struct page *page,
2392 * We need to try to allocate 2396 * We need to try to allocate
2393 * unmapped blocks in the same page. 2397 * unmapped blocks in the same page.
2394 * Otherwise we won't make progress 2398 * Otherwise we won't make progress
2395 * with the page in ext4_da_writepage 2399 * with the page in ext4_writepage
2396 */ 2400 */
2397 if (ext4_bh_delay_or_unwritten(NULL, bh)) { 2401 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2398 mpage_add_bh_to_extent(mpd, logical, 2402 mpage_add_bh_to_extent(mpd, logical,
@@ -2519,13 +2523,47 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2519} 2523}
2520 2524
2521/* 2525/*
2526 * Note that we don't need to start a transaction unless we're journaling data
2527 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2528 * need to file the inode to the transaction's list in ordered mode because if
2529 * we are writing back data added by write(), the inode is already there and if
2530 * we are writing back data modified via mmap(), no one guarantees in which
2531 * transaction the data will hit the disk. In case we are journaling data, we
2532 * cannot start transaction directly because transaction start ranks above page
2533 * lock so we have to do some magic.
2534 *
2522 * This function can get called via... 2535 * This function can get called via...
2523 * - ext4_da_writepages after taking page lock (have journal handle) 2536 * - ext4_da_writepages after taking page lock (have journal handle)
2524 * - journal_submit_inode_data_buffers (no journal handle) 2537 * - journal_submit_inode_data_buffers (no journal handle)
2525 * - shrink_page_list via pdflush (no journal handle) 2538 * - shrink_page_list via pdflush (no journal handle)
2526 * - grab_page_cache when doing write_begin (have journal handle) 2539 * - grab_page_cache when doing write_begin (have journal handle)
2540 *
2541 * We don't do any block allocation in this function. If we have page with
2542 * multiple blocks we need to write those buffer_heads that are mapped. This
2543 * is important for mmap-based writes. So if, with blocksize 1K, we do
2544 * truncate(f, 1024);
2545 * a = mmap(f, 0, 4096);
2546 * a[0] = 'a';
2547 * truncate(f, 4096);
2548 * we have in the page first buffer_head mapped via page_mkwrite call back
2549 * but other buffer_heads would be unmapped but dirty (dirtying done via the
2550 * do_wp_page). So writepage should write the first block. If we modify
2551 * the mmap area beyond 1024 we will again get a page_fault and the
2552 * page_mkwrite callback will do the block allocation and mark the
2553 * buffer_heads mapped.
2554 *
2555 * We redirty the page if we have any buffer_heads that are either delayed or
2556 * unwritten in the page.
2557 *
2558 * We can get recursively called as shown below.
2559 *
2560 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
2561 * ext4_writepage()
2562 *
2563 * But since we don't do any block allocation we should not deadlock.
2564 * The page also has the dirty flag cleared, so we don't get a recursive page_lock.
2527 */ 2565 */
2528static int ext4_da_writepage(struct page *page, 2566static int ext4_writepage(struct page *page,
2529 struct writeback_control *wbc) 2567 struct writeback_control *wbc)
2530{ 2568{
2531 int ret = 0; 2569 int ret = 0;
@@ -2534,7 +2572,7 @@ static int ext4_da_writepage(struct page *page,
2534 struct buffer_head *page_bufs; 2572 struct buffer_head *page_bufs;
2535 struct inode *inode = page->mapping->host; 2573 struct inode *inode = page->mapping->host;
2536 2574
2537 trace_ext4_da_writepage(inode, page); 2575 trace_ext4_writepage(inode, page);
2538 size = i_size_read(inode); 2576 size = i_size_read(inode);
2539 if (page->index == size >> PAGE_CACHE_SHIFT) 2577 if (page->index == size >> PAGE_CACHE_SHIFT)
2540 len = size & ~PAGE_CACHE_MASK; 2578 len = size & ~PAGE_CACHE_MASK;
@@ -2596,6 +2634,15 @@ static int ext4_da_writepage(struct page *page,
2596 block_commit_write(page, 0, len); 2634 block_commit_write(page, 0, len);
2597 } 2635 }
2598 2636
2637 if (PageChecked(page) && ext4_should_journal_data(inode)) {
2638 /*
2639 * It's mmapped pagecache. Add buffers and journal it. There
2640 * doesn't seem much point in redirtying the page here.
2641 */
2642 ClearPageChecked(page);
2643 return __ext4_journalled_writepage(page, wbc, len);
2644 }
2645
2599 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2646 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2600 ret = nobh_writepage(page, noalloc_get_block_write, wbc); 2647 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2601 else 2648 else
@@ -3135,112 +3182,10 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
3135 return 0; 3182 return 0;
3136} 3183}
3137 3184
3138/*
3139 * Note that we don't need to start a transaction unless we're journaling data
3140 * because we should have holes filled from ext4_page_mkwrite(). We even don't
3141 * need to file the inode to the transaction's list in ordered mode because if
3142 * we are writing back data added by write(), the inode is already there and if
3143 * we are writing back data modified via mmap(), noone guarantees in which
3144 * transaction the data will hit the disk. In case we are journaling data, we
3145 * cannot start transaction directly because transaction start ranks above page
3146 * lock so we have to do some magic.
3147 *
3148 * In all journaling modes block_write_full_page() will start the I/O.
3149 *
3150 * Problem:
3151 *
3152 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
3153 * ext4_writepage()
3154 *
3155 * Similar for:
3156 *
3157 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
3158 *
3159 * Same applies to ext4_get_block(). We will deadlock on various things like
3160 * lock_journal and i_data_sem
3161 *
3162 * Setting PF_MEMALLOC here doesn't work - too many internal memory
3163 * allocations fail.
3164 *
3165 * 16May01: If we're reentered then journal_current_handle() will be
3166 * non-zero. We simply *return*.
3167 *
3168 * 1 July 2001: @@@ FIXME:
3169 * In journalled data mode, a data buffer may be metadata against the
3170 * current transaction. But the same file is part of a shared mapping
3171 * and someone does a writepage() on it.
3172 *
3173 * We will move the buffer onto the async_data list, but *after* it has
3174 * been dirtied. So there's a small window where we have dirty data on
3175 * BJ_Metadata.
3176 *
3177 * Note that this only applies to the last partial page in the file. The
3178 * bit which block_write_full_page() uses prepare/commit for. (That's
3179 * broken code anyway: it's wrong for msync()).
3180 *
3181 * It's a rare case: affects the final partial page, for journalled data
3182 * where the file is subject to bith write() and writepage() in the same
3183 * transction. To fix it we'll need a custom block_write_full_page().
3184 * We'll probably need that anyway for journalling writepage() output.
3185 *
3186 * We don't honour synchronous mounts for writepage(). That would be
3187 * disastrous. Any write() or metadata operation will sync the fs for
3188 * us.
3189 *
3190 */
3191static int __ext4_normal_writepage(struct page *page,
3192 struct writeback_control *wbc)
3193{
3194 struct inode *inode = page->mapping->host;
3195
3196 if (test_opt(inode->i_sb, NOBH))
3197 return nobh_writepage(page, noalloc_get_block_write, wbc);
3198 else
3199 return block_write_full_page(page, noalloc_get_block_write,
3200 wbc);
3201}
3202
3203static int ext4_normal_writepage(struct page *page,
3204 struct writeback_control *wbc)
3205{
3206 struct inode *inode = page->mapping->host;
3207 loff_t size = i_size_read(inode);
3208 loff_t len;
3209
3210 trace_ext4_normal_writepage(inode, page);
3211 J_ASSERT(PageLocked(page));
3212 if (page->index == size >> PAGE_CACHE_SHIFT)
3213 len = size & ~PAGE_CACHE_MASK;
3214 else
3215 len = PAGE_CACHE_SIZE;
3216
3217 if (page_has_buffers(page)) {
3218 /* if page has buffers it should all be mapped
3219 * and allocated. If there are not buffers attached
3220 * to the page we know the page is dirty but it lost
3221 * buffers. That means that at some moment in time
3222 * after write_begin() / write_end() has been called
3223 * all buffers have been clean and thus they must have been
3224 * written at least once. So they are all mapped and we can
3225 * happily proceed with mapping them and writing the page.
3226 */
3227 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3228 ext4_bh_delay_or_unwritten));
3229 }
3230
3231 if (!ext4_journal_current_handle())
3232 return __ext4_normal_writepage(page, wbc);
3233
3234 redirty_page_for_writepage(wbc, page);
3235 unlock_page(page);
3236 return 0;
3237}
3238
3239static int __ext4_journalled_writepage(struct page *page, 3185static int __ext4_journalled_writepage(struct page *page,
3240 struct writeback_control *wbc) 3186 struct writeback_control *wbc,
3187 unsigned int len)
3241{ 3188{
3242 loff_t size;
3243 unsigned int len;
3244 struct address_space *mapping = page->mapping; 3189 struct address_space *mapping = page->mapping;
3245 struct inode *inode = mapping->host; 3190 struct inode *inode = mapping->host;
3246 struct buffer_head *page_bufs; 3191 struct buffer_head *page_bufs;
@@ -3248,16 +3193,8 @@ static int __ext4_journalled_writepage(struct page *page,
3248 int ret = 0; 3193 int ret = 0;
3249 int err; 3194 int err;
3250 3195
3251 size = i_size_read(inode);
3252 if (page->index == size >> PAGE_CACHE_SHIFT)
3253 len = size & ~PAGE_CACHE_MASK;
3254 else
3255 len = PAGE_CACHE_SIZE;
3256 ret = block_prepare_write(page, 0, len, noalloc_get_block_write);
3257 if (ret != 0)
3258 goto out_unlock;
3259
3260 page_bufs = page_buffers(page); 3196 page_bufs = page_buffers(page);
3197 BUG_ON(!page_bufs);
3261 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 3198 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
3262 /* As soon as we unlock the page, it can go away, but we have 3199 /* As soon as we unlock the page, it can go away, but we have
3263 * references to buffers so we are safe */ 3200 * references to buffers so we are safe */
@@ -3282,67 +3219,10 @@ static int __ext4_journalled_writepage(struct page *page,
3282 3219
3283 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 3220 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
3284 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 3221 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
3285 goto out;
3286
3287out_unlock:
3288 unlock_page(page);
3289out: 3222out:
3290 return ret; 3223 return ret;
3291} 3224}
3292 3225
3293static int ext4_journalled_writepage(struct page *page,
3294 struct writeback_control *wbc)
3295{
3296 struct inode *inode = page->mapping->host;
3297 loff_t size = i_size_read(inode);
3298 loff_t len;
3299
3300 trace_ext4_journalled_writepage(inode, page);
3301 J_ASSERT(PageLocked(page));
3302 if (page->index == size >> PAGE_CACHE_SHIFT)
3303 len = size & ~PAGE_CACHE_MASK;
3304 else
3305 len = PAGE_CACHE_SIZE;
3306
3307 if (page_has_buffers(page)) {
3308 /* if page has buffers it should all be mapped
3309 * and allocated. If there are not buffers attached
3310 * to the page we know the page is dirty but it lost
3311 * buffers. That means that at some moment in time
3312 * after write_begin() / write_end() has been called
3313 * all buffers have been clean and thus they must have been
3314 * written at least once. So they are all mapped and we can
3315 * happily proceed with mapping them and writing the page.
3316 */
3317 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3318 ext4_bh_delay_or_unwritten));
3319 }
3320
3321 if (ext4_journal_current_handle())
3322 goto no_write;
3323
3324 if (PageChecked(page)) {
3325 /*
3326 * It's mmapped pagecache. Add buffers and journal it. There
3327 * doesn't seem much point in redirtying the page here.
3328 */
3329 ClearPageChecked(page);
3330 return __ext4_journalled_writepage(page, wbc);
3331 } else {
3332 /*
3333 * It may be a page full of checkpoint-mode buffers. We don't
3334 * really know unless we go poke around in the buffer_heads.
3335 * But block_write_full_page will do the right thing.
3336 */
3337 return block_write_full_page(page, noalloc_get_block_write,
3338 wbc);
3339 }
3340no_write:
3341 redirty_page_for_writepage(wbc, page);
3342 unlock_page(page);
3343 return 0;
3344}
3345
3346static int ext4_readpage(struct file *file, struct page *page) 3226static int ext4_readpage(struct file *file, struct page *page)
3347{ 3227{
3348 return mpage_readpage(page, ext4_get_block); 3228 return mpage_readpage(page, ext4_get_block);
@@ -3489,7 +3369,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
3489static const struct address_space_operations ext4_ordered_aops = { 3369static const struct address_space_operations ext4_ordered_aops = {
3490 .readpage = ext4_readpage, 3370 .readpage = ext4_readpage,
3491 .readpages = ext4_readpages, 3371 .readpages = ext4_readpages,
3492 .writepage = ext4_normal_writepage, 3372 .writepage = ext4_writepage,
3493 .sync_page = block_sync_page, 3373 .sync_page = block_sync_page,
3494 .write_begin = ext4_write_begin, 3374 .write_begin = ext4_write_begin,
3495 .write_end = ext4_ordered_write_end, 3375 .write_end = ext4_ordered_write_end,
@@ -3504,7 +3384,7 @@ static const struct address_space_operations ext4_ordered_aops = {
3504static const struct address_space_operations ext4_writeback_aops = { 3384static const struct address_space_operations ext4_writeback_aops = {
3505 .readpage = ext4_readpage, 3385 .readpage = ext4_readpage,
3506 .readpages = ext4_readpages, 3386 .readpages = ext4_readpages,
3507 .writepage = ext4_normal_writepage, 3387 .writepage = ext4_writepage,
3508 .sync_page = block_sync_page, 3388 .sync_page = block_sync_page,
3509 .write_begin = ext4_write_begin, 3389 .write_begin = ext4_write_begin,
3510 .write_end = ext4_writeback_write_end, 3390 .write_end = ext4_writeback_write_end,
@@ -3519,7 +3399,7 @@ static const struct address_space_operations ext4_writeback_aops = {
3519static const struct address_space_operations ext4_journalled_aops = { 3399static const struct address_space_operations ext4_journalled_aops = {
3520 .readpage = ext4_readpage, 3400 .readpage = ext4_readpage,
3521 .readpages = ext4_readpages, 3401 .readpages = ext4_readpages,
3522 .writepage = ext4_journalled_writepage, 3402 .writepage = ext4_writepage,
3523 .sync_page = block_sync_page, 3403 .sync_page = block_sync_page,
3524 .write_begin = ext4_write_begin, 3404 .write_begin = ext4_write_begin,
3525 .write_end = ext4_journalled_write_end, 3405 .write_end = ext4_journalled_write_end,
@@ -3533,7 +3413,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3533static const struct address_space_operations ext4_da_aops = { 3413static const struct address_space_operations ext4_da_aops = {
3534 .readpage = ext4_readpage, 3414 .readpage = ext4_readpage,
3535 .readpages = ext4_readpages, 3415 .readpages = ext4_readpages,
3536 .writepage = ext4_da_writepage, 3416 .writepage = ext4_writepage,
3537 .writepages = ext4_da_writepages, 3417 .writepages = ext4_da_writepages,
3538 .sync_page = block_sync_page, 3418 .sync_page = block_sync_page,
3539 .write_begin = ext4_da_write_begin, 3419 .write_begin = ext4_da_write_begin,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index b456fb0a3c57..dfbc9b0edc88 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -190,7 +190,7 @@ TRACE_EVENT(ext4_journalled_write_end,
190 __entry->copied) 190 __entry->copied)
191); 191);
192 192
193TRACE_EVENT(ext4_da_writepage, 193TRACE_EVENT(ext4_writepage,
194 TP_PROTO(struct inode *inode, struct page *page), 194 TP_PROTO(struct inode *inode, struct page *page),
195 195
196 TP_ARGS(inode, page), 196 TP_ARGS(inode, page),
@@ -342,49 +342,6 @@ TRACE_EVENT(ext4_da_write_end,
342 __entry->copied) 342 __entry->copied)
343); 343);
344 344
345TRACE_EVENT(ext4_normal_writepage,
346 TP_PROTO(struct inode *inode, struct page *page),
347
348 TP_ARGS(inode, page),
349
350 TP_STRUCT__entry(
351 __field( dev_t, dev )
352 __field( ino_t, ino )
353 __field( pgoff_t, index )
354 ),
355
356 TP_fast_assign(
357 __entry->dev = inode->i_sb->s_dev;
358 __entry->ino = inode->i_ino;
359 __entry->index = page->index;
360 ),
361
362 TP_printk("dev %s ino %lu page_index %lu",
363 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index)
364);
365
366TRACE_EVENT(ext4_journalled_writepage,
367 TP_PROTO(struct inode *inode, struct page *page),
368
369 TP_ARGS(inode, page),
370
371 TP_STRUCT__entry(
372 __field( dev_t, dev )
373 __field( ino_t, ino )
374 __field( pgoff_t, index )
375
376 ),
377
378 TP_fast_assign(
379 __entry->dev = inode->i_sb->s_dev;
380 __entry->ino = inode->i_ino;
381 __entry->index = page->index;
382 ),
383
384 TP_printk("dev %s ino %lu page_index %lu",
385 jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index)
386);
387
388TRACE_EVENT(ext4_discard_blocks, 345TRACE_EVENT(ext4_discard_blocks,
389 TP_PROTO(struct super_block *sb, unsigned long long blk, 346 TP_PROTO(struct super_block *sb, unsigned long long blk,
390 unsigned long long count), 347 unsigned long long count),