aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2012-05-03 08:48:03 -0400
committerFengguang Wu <fengguang.wu@intel.com>2012-05-06 01:43:41 -0400
commit169ebd90131b2ffca74bb2dbe7eeacd39fb83714 (patch)
tree513136e110970da388680947c225ef8ca05a6dd7 /fs/fs-writeback.c
parentdbd5768f87ff6fb0a4fe09c4d7b6c4a24de99430 (diff)
writeback: Avoid iput() from flusher thread
Doing iput() from flusher thread (writeback_sb_inodes()) can create problems because iput() can do a lot of work - for example truncate the inode if it's the last iput on unlinked file. Some filesystems depend on flusher thread progressing (e.g. because they need to flush delay allocated blocks to reduce allocation uncertainty) and so flusher thread doing truncate creates interesting dependencies and possibilities for deadlocks. We get rid of iput() in flusher thread by using the fact that I_SYNC inode flag effectively pins the inode in memory. So if we take care to either hold i_lock or have I_SYNC set, we can get away without taking inode reference in writeback_sb_inodes(). As a side effect of these changes, we also fix possible use-after-free in wb_writeback() because inode_wait_for_writeback() call could try to reacquire i_lock on the inode that was already free. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--fs/fs-writeback.c66
1 files changed, 53 insertions, 13 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5f2c68289610..8d2fb8c88cf3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -326,9 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
326} 326}
327 327
328/* 328/*
329 * Wait for writeback on an inode to complete. 329 * Wait for writeback on an inode to complete. Called with i_lock held.
330 * Caller must make sure inode cannot go away when we drop i_lock.
330 */ 331 */
331static void inode_wait_for_writeback(struct inode *inode) 332static void __inode_wait_for_writeback(struct inode *inode)
333 __releases(inode->i_lock)
334 __acquires(inode->i_lock)
332{ 335{
333 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 336 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
334 wait_queue_head_t *wqh; 337 wait_queue_head_t *wqh;
@@ -342,6 +345,36 @@ static void inode_wait_for_writeback(struct inode *inode)
342} 345}
343 346
344/* 347/*
348 * Wait for writeback on an inode to complete. Caller must have inode pinned.
349 */
350void inode_wait_for_writeback(struct inode *inode)
351{
352 spin_lock(&inode->i_lock);
353 __inode_wait_for_writeback(inode);
354 spin_unlock(&inode->i_lock);
355}
356
357/*
358 * Sleep until I_SYNC is cleared. This function must be called with i_lock
359 * held and drops it. It is aimed for callers not holding any inode reference
360 * so once i_lock is dropped, inode can go away.
361 */
362static void inode_sleep_on_writeback(struct inode *inode)
363 __releases(inode->i_lock)
364{
365 DEFINE_WAIT(wait);
366 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
367 int sleep;
368
369 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
370 sleep = inode->i_state & I_SYNC;
371 spin_unlock(&inode->i_lock);
372 if (sleep)
373 schedule();
374 finish_wait(wqh, &wait);
375}
376
377/*
345 * Find proper writeback list for the inode depending on its current state and 378 * Find proper writeback list for the inode depending on its current state and
346 * possibly also change of its state while we were doing writeback. Here we 379 * possibly also change of its state while we were doing writeback. Here we
347 * handle things such as livelock prevention or fairness of writeback among 380 * handle things such as livelock prevention or fairness of writeback among
@@ -479,9 +512,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
479 if (wbc->sync_mode != WB_SYNC_ALL) 512 if (wbc->sync_mode != WB_SYNC_ALL)
480 goto out; 513 goto out;
481 /* 514 /*
482 * It's a data-integrity sync. We must wait. 515 * It's a data-integrity sync. We must wait. Since callers hold
516 * inode reference or inode has I_WILL_FREE set, it cannot go
517 * away under us.
483 */ 518 */
484 inode_wait_for_writeback(inode); 519 __inode_wait_for_writeback(inode);
485 } 520 }
486 WARN_ON(inode->i_state & I_SYNC); 521 WARN_ON(inode->i_state & I_SYNC);
487 /* 522 /*
@@ -620,20 +655,28 @@ static long writeback_sb_inodes(struct super_block *sb,
620 } 655 }
621 spin_unlock(&wb->list_lock); 656 spin_unlock(&wb->list_lock);
622 657
623 __iget(inode);
624 /* 658 /*
625 * We already requeued the inode if it had I_SYNC set and we 659 * We already requeued the inode if it had I_SYNC set and we
626 * are doing WB_SYNC_NONE writeback. So this catches only the 660 * are doing WB_SYNC_NONE writeback. So this catches only the
627 * WB_SYNC_ALL case. 661 * WB_SYNC_ALL case.
628 */ 662 */
629 if (inode->i_state & I_SYNC) 663 if (inode->i_state & I_SYNC) {
630 inode_wait_for_writeback(inode); 664 /* Wait for I_SYNC. This function drops i_lock... */
665 inode_sleep_on_writeback(inode);
666 /* Inode may be gone, start again */
667 continue;
668 }
631 inode->i_state |= I_SYNC; 669 inode->i_state |= I_SYNC;
632 spin_unlock(&inode->i_lock); 670 spin_unlock(&inode->i_lock);
671
633 write_chunk = writeback_chunk_size(wb->bdi, work); 672 write_chunk = writeback_chunk_size(wb->bdi, work);
634 wbc.nr_to_write = write_chunk; 673 wbc.nr_to_write = write_chunk;
635 wbc.pages_skipped = 0; 674 wbc.pages_skipped = 0;
636 675
676 /*
677 * We use I_SYNC to pin the inode in memory. While it is set
678 * evict_inode() will wait so the inode cannot be freed.
679 */
637 __writeback_single_inode(inode, wb, &wbc); 680 __writeback_single_inode(inode, wb, &wbc);
638 681
639 work->nr_pages -= write_chunk - wbc.nr_to_write; 682 work->nr_pages -= write_chunk - wbc.nr_to_write;
@@ -645,10 +688,7 @@ static long writeback_sb_inodes(struct super_block *sb,
645 requeue_inode(inode, wb, &wbc); 688 requeue_inode(inode, wb, &wbc);
646 inode_sync_complete(inode); 689 inode_sync_complete(inode);
647 spin_unlock(&inode->i_lock); 690 spin_unlock(&inode->i_lock);
648 spin_unlock(&wb->list_lock); 691 cond_resched_lock(&wb->list_lock);
649 iput(inode);
650 cond_resched();
651 spin_lock(&wb->list_lock);
652 /* 692 /*
653 * bail out to wb_writeback() often enough to check 693 * bail out to wb_writeback() often enough to check
654 * background threshold and other termination conditions. 694 * background threshold and other termination conditions.
@@ -843,8 +883,8 @@ static long wb_writeback(struct bdi_writeback *wb,
843 inode = wb_inode(wb->b_more_io.prev); 883 inode = wb_inode(wb->b_more_io.prev);
844 spin_lock(&inode->i_lock); 884 spin_lock(&inode->i_lock);
845 spin_unlock(&wb->list_lock); 885 spin_unlock(&wb->list_lock);
846 inode_wait_for_writeback(inode); 886 /* This function drops i_lock... */
847 spin_unlock(&inode->i_lock); 887 inode_sleep_on_writeback(inode);
848 spin_lock(&wb->list_lock); 888 spin_lock(&wb->list_lock);
849 } 889 }
850 } 890 }