aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJan Kara <jack@suse.cz>2012-05-03 08:48:03 -0400
committerFengguang Wu <fengguang.wu@intel.com>2012-05-06 01:43:41 -0400
commit169ebd90131b2ffca74bb2dbe7eeacd39fb83714 (patch)
tree513136e110970da388680947c225ef8ca05a6dd7
parentdbd5768f87ff6fb0a4fe09c4d7b6c4a24de99430 (diff)
writeback: Avoid iput() from flusher thread
Doing iput() from flusher thread (writeback_sb_inodes()) can create problems because iput() can do a lot of work - for example truncate the inode if it's the last iput on unlinked file. Some filesystems depend on flusher thread progressing (e.g. because they need to flush delay allocated blocks to reduce allocation uncertainty) and so flusher thread doing truncate creates interesting dependencies and possibilities for deadlocks. We get rid of iput() in flusher thread by using the fact that I_SYNC inode flag effectively pins the inode in memory. So if we take care to either hold i_lock or have I_SYNC set, we can get away without taking inode reference in writeback_sb_inodes(). As a side effect of these changes, we also fix possible use-after-free in wb_writeback() because inode_wait_for_writeback() call could try to reacquire i_lock on the inode that was already free. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
-rw-r--r--fs/fs-writeback.c66
-rw-r--r--fs/inode.c8
-rw-r--r--include/linux/fs.h7
-rw-r--r--include/linux/writeback.h7
4 files changed, 65 insertions, 23 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5f2c68289610..8d2fb8c88cf3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -326,9 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
326} 326}
327 327
328/* 328/*
329 * Wait for writeback on an inode to complete. 329 * Wait for writeback on an inode to complete. Called with i_lock held.
330 * Caller must make sure inode cannot go away when we drop i_lock.
330 */ 331 */
331static void inode_wait_for_writeback(struct inode *inode) 332static void __inode_wait_for_writeback(struct inode *inode)
333 __releases(inode->i_lock)
334 __acquires(inode->i_lock)
332{ 335{
333 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 336 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
334 wait_queue_head_t *wqh; 337 wait_queue_head_t *wqh;
@@ -342,6 +345,36 @@ static void inode_wait_for_writeback(struct inode *inode)
342} 345}
343 346
344/* 347/*
348 * Wait for writeback on an inode to complete. Caller must have inode pinned.
349 */
350void inode_wait_for_writeback(struct inode *inode)
351{
352 spin_lock(&inode->i_lock);
353 __inode_wait_for_writeback(inode);
354 spin_unlock(&inode->i_lock);
355}
356
357/*
358 * Sleep until I_SYNC is cleared. This function must be called with i_lock
359 * held and drops it. It is aimed for callers not holding any inode reference
360 * so once i_lock is dropped, inode can go away.
361 */
362static void inode_sleep_on_writeback(struct inode *inode)
363 __releases(inode->i_lock)
364{
365 DEFINE_WAIT(wait);
366 wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
367 int sleep;
368
369 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
370 sleep = inode->i_state & I_SYNC;
371 spin_unlock(&inode->i_lock);
372 if (sleep)
373 schedule();
374 finish_wait(wqh, &wait);
375}
376
377/*
345 * Find proper writeback list for the inode depending on its current state and 378 * Find proper writeback list for the inode depending on its current state and
346 * possibly also change of its state while we were doing writeback. Here we 379 * possibly also change of its state while we were doing writeback. Here we
347 * handle things such as livelock prevention or fairness of writeback among 380 * handle things such as livelock prevention or fairness of writeback among
@@ -479,9 +512,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
479 if (wbc->sync_mode != WB_SYNC_ALL) 512 if (wbc->sync_mode != WB_SYNC_ALL)
480 goto out; 513 goto out;
481 /* 514 /*
482 * It's a data-integrity sync. We must wait. 515 * It's a data-integrity sync. We must wait. Since callers hold
516 * inode reference or inode has I_WILL_FREE set, it cannot go
517 * away under us.
483 */ 518 */
484 inode_wait_for_writeback(inode); 519 __inode_wait_for_writeback(inode);
485 } 520 }
486 WARN_ON(inode->i_state & I_SYNC); 521 WARN_ON(inode->i_state & I_SYNC);
487 /* 522 /*
@@ -620,20 +655,28 @@ static long writeback_sb_inodes(struct super_block *sb,
620 } 655 }
621 spin_unlock(&wb->list_lock); 656 spin_unlock(&wb->list_lock);
622 657
623 __iget(inode);
624 /* 658 /*
625 * We already requeued the inode if it had I_SYNC set and we 659 * We already requeued the inode if it had I_SYNC set and we
626 * are doing WB_SYNC_NONE writeback. So this catches only the 660 * are doing WB_SYNC_NONE writeback. So this catches only the
627 * WB_SYNC_ALL case. 661 * WB_SYNC_ALL case.
628 */ 662 */
629 if (inode->i_state & I_SYNC) 663 if (inode->i_state & I_SYNC) {
630 inode_wait_for_writeback(inode); 664 /* Wait for I_SYNC. This function drops i_lock... */
665 inode_sleep_on_writeback(inode);
666 /* Inode may be gone, start again */
667 continue;
668 }
631 inode->i_state |= I_SYNC; 669 inode->i_state |= I_SYNC;
632 spin_unlock(&inode->i_lock); 670 spin_unlock(&inode->i_lock);
671
633 write_chunk = writeback_chunk_size(wb->bdi, work); 672 write_chunk = writeback_chunk_size(wb->bdi, work);
634 wbc.nr_to_write = write_chunk; 673 wbc.nr_to_write = write_chunk;
635 wbc.pages_skipped = 0; 674 wbc.pages_skipped = 0;
636 675
676 /*
677 * We use I_SYNC to pin the inode in memory. While it is set
678 * evict_inode() will wait so the inode cannot be freed.
679 */
637 __writeback_single_inode(inode, wb, &wbc); 680 __writeback_single_inode(inode, wb, &wbc);
638 681
639 work->nr_pages -= write_chunk - wbc.nr_to_write; 682 work->nr_pages -= write_chunk - wbc.nr_to_write;
@@ -645,10 +688,7 @@ static long writeback_sb_inodes(struct super_block *sb,
645 requeue_inode(inode, wb, &wbc); 688 requeue_inode(inode, wb, &wbc);
646 inode_sync_complete(inode); 689 inode_sync_complete(inode);
647 spin_unlock(&inode->i_lock); 690 spin_unlock(&inode->i_lock);
648 spin_unlock(&wb->list_lock); 691 cond_resched_lock(&wb->list_lock);
649 iput(inode);
650 cond_resched();
651 spin_lock(&wb->list_lock);
652 /* 692 /*
653 * bail out to wb_writeback() often enough to check 693 * bail out to wb_writeback() often enough to check
654 * background threshold and other termination conditions. 694 * background threshold and other termination conditions.
@@ -843,8 +883,8 @@ static long wb_writeback(struct bdi_writeback *wb,
843 inode = wb_inode(wb->b_more_io.prev); 883 inode = wb_inode(wb->b_more_io.prev);
844 spin_lock(&inode->i_lock); 884 spin_lock(&inode->i_lock);
845 spin_unlock(&wb->list_lock); 885 spin_unlock(&wb->list_lock);
846 inode_wait_for_writeback(inode); 886 /* This function drops i_lock... */
847 spin_unlock(&inode->i_lock); 887 inode_sleep_on_writeback(inode);
848 spin_lock(&wb->list_lock); 888 spin_lock(&wb->list_lock);
849 } 889 }
850 } 890 }
diff --git a/fs/inode.c b/fs/inode.c
index 02c0fa5e16a4..f4e145016611 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -530,7 +530,13 @@ static void evict(struct inode *inode)
530 530
531 inode_sb_list_del(inode); 531 inode_sb_list_del(inode);
532 532
533 inode_sync_wait(inode); 533 /*
534 * Wait for flusher thread to be done with the inode so that filesystem
535 * does not start destroying it while writeback is still running. Since
536 * the inode has I_FREEING set, flusher thread won't start new work on
537 * the inode. We just have to wait for running writeback to finish.
538 */
539 inode_wait_for_writeback(inode);
534 540
535 if (op->evict_inode) { 541 if (op->evict_inode) {
536 op->evict_inode(inode); 542 op->evict_inode(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c79316c79ee3..1c71e7f4d234 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1753,9 +1753,10 @@ struct super_operations {
1753 * anew. Other functions will just ignore such inodes, 1753 * anew. Other functions will just ignore such inodes,
1754 * if appropriate. I_NEW is used for waiting. 1754 * if appropriate. I_NEW is used for waiting.
1755 * 1755 *
1756 * I_SYNC Synchonized write of dirty inode data. The bits is 1756 * I_SYNC Writeback of inode is running. The bit is set during
1757 * set during data writeback, and cleared with a wakeup 1757 * data writeback, and cleared with a wakeup on the bit
1758 * on the bit address once it is done. 1758 * address once it is done. The bit is also used to pin
1759 * the inode in memory for flusher thread.
1759 * 1760 *
1760 * I_REFERENCED Marks the inode as recently references on the LRU list. 1761 * I_REFERENCED Marks the inode as recently references on the LRU list.
1761 * 1762 *
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3309736ff059..6d0a0fcd80e7 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -95,6 +95,7 @@ long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
95 enum wb_reason reason); 95 enum wb_reason reason);
96long wb_do_writeback(struct bdi_writeback *wb, int force_wait); 96long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
97void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); 97void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
98void inode_wait_for_writeback(struct inode *inode);
98 99
99/* writeback.h requires fs.h; it, too, is not included from here. */ 100/* writeback.h requires fs.h; it, too, is not included from here. */
100static inline void wait_on_inode(struct inode *inode) 101static inline void wait_on_inode(struct inode *inode)
@@ -102,12 +103,6 @@ static inline void wait_on_inode(struct inode *inode)
102 might_sleep(); 103 might_sleep();
103 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); 104 wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE);
104} 105}
105static inline void inode_sync_wait(struct inode *inode)
106{
107 might_sleep();
108 wait_on_bit(&inode->i_state, __I_SYNC, inode_wait,
109 TASK_UNINTERRUPTIBLE);
110}
111 106
112 107
113/* 108/*