diff options
author | Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> | 2015-02-05 15:25:20 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-05 16:35:29 -0500 |
commit | 7ef3ff2fea8bf5e4a21cef47ad87710a3d0fdb52 (patch) | |
tree | fae90c5009ed6bf5009cedf9646c0a4edd71304c /fs/nilfs2 | |
parent | 81cca6fb1271b91f06b008976ea132bec2f14d86 (diff) |
nilfs2: fix deadlock of segment constructor over I_SYNC flag
Nilfs2 eventually hangs in a stress test with fsstress program. This
issue was caused by the following deadlock over I_SYNC flag between
nilfs_segctor_thread() and writeback_sb_inodes():
nilfs_segctor_thread()
nilfs_segctor_thread_construct()
nilfs_segctor_unlock()
nilfs_dispose_list()
iput()
iput_final()
evict()
inode_wait_for_writeback() * wait for I_SYNC flag
writeback_sb_inodes()
* set I_SYNC flag on inode->i_state
__writeback_single_inode()
do_writepages()
nilfs_writepages()
nilfs_construct_dsync_segment()
nilfs_segctor_sync()
* wait for completion of segment constructor
inode_sync_complete()
* clear I_SYNC flag after __writeback_single_inode() completed
writeback_sb_inodes() calls do_writepages() for dirty inodes after
setting I_SYNC flag on inode->i_state. do_writepages() in turn calls
nilfs_writepages(), which can run segment constructor and wait for its
completion. On the other hand, segment constructor calls iput(), which
can call evict() and wait for the I_SYNC flag on
inode_wait_for_writeback().
Since segment constructor doesn't know when I_SYNC will be set, it
cannot know whether iput() will block or not unless inode->i_nlink has a
non-zero count. We can prevent evict() from being called in iput() by
implementing sop->drop_inode(), but it's not preferable to leave inodes
with i_nlink == 0 for long periods because it even defers file
truncation and inode deallocation. So, this instead resolves the
deadlock by calling iput() asynchronously with a workqueue for inodes
with i_nlink == 0.
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/nilfs2')
-rw-r--r-- | fs/nilfs2/nilfs.h | 2 | ||||
-rw-r--r-- | fs/nilfs2/segment.c | 44 | ||||
-rw-r--r-- | fs/nilfs2/segment.h | 5 |
3 files changed, 44 insertions, 7 deletions
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 91093cd74f0d..385704027575 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h | |||
@@ -141,7 +141,6 @@ enum { | |||
141 | * @ti_save: Backup of journal_info field of task_struct | 141 | * @ti_save: Backup of journal_info field of task_struct |
142 | * @ti_flags: Flags | 142 | * @ti_flags: Flags |
143 | * @ti_count: Nest level | 143 | * @ti_count: Nest level |
144 | * @ti_garbage: List of inode to be put when releasing semaphore | ||
145 | */ | 144 | */ |
146 | struct nilfs_transaction_info { | 145 | struct nilfs_transaction_info { |
147 | u32 ti_magic; | 146 | u32 ti_magic; |
@@ -150,7 +149,6 @@ struct nilfs_transaction_info { | |||
150 | one of other filesystems has a bug. */ | 149 | one of other filesystems has a bug. */ |
151 | unsigned short ti_flags; | 150 | unsigned short ti_flags; |
152 | unsigned short ti_count; | 151 | unsigned short ti_count; |
153 | struct list_head ti_garbage; | ||
154 | }; | 152 | }; |
155 | 153 | ||
156 | /* ti_magic */ | 154 | /* ti_magic */ |
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7ef18fc656c2..469086b9f99b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c | |||
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb, | |||
305 | ti->ti_count = 0; | 305 | ti->ti_count = 0; |
306 | ti->ti_save = cur_ti; | 306 | ti->ti_save = cur_ti; |
307 | ti->ti_magic = NILFS_TI_MAGIC; | 307 | ti->ti_magic = NILFS_TI_MAGIC; |
308 | INIT_LIST_HEAD(&ti->ti_garbage); | ||
309 | current->journal_info = ti; | 308 | current->journal_info = ti; |
310 | 309 | ||
311 | for (;;) { | 310 | for (;;) { |
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb) | |||
332 | 331 | ||
333 | up_write(&nilfs->ns_segctor_sem); | 332 | up_write(&nilfs->ns_segctor_sem); |
334 | current->journal_info = ti->ti_save; | 333 | current->journal_info = ti->ti_save; |
335 | if (!list_empty(&ti->ti_garbage)) | ||
336 | nilfs_dispose_list(nilfs, &ti->ti_garbage, 0); | ||
337 | } | 334 | } |
338 | 335 | ||
339 | static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, | 336 | static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, |
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs, | |||
746 | } | 743 | } |
747 | } | 744 | } |
748 | 745 | ||
746 | static void nilfs_iput_work_func(struct work_struct *work) | ||
747 | { | ||
748 | struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info, | ||
749 | sc_iput_work); | ||
750 | struct the_nilfs *nilfs = sci->sc_super->s_fs_info; | ||
751 | |||
752 | nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0); | ||
753 | } | ||
754 | |||
749 | static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, | 755 | static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, |
750 | struct nilfs_root *root) | 756 | struct nilfs_root *root) |
751 | { | 757 | { |
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, | |||
1900 | static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, | 1906 | static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, |
1901 | struct the_nilfs *nilfs) | 1907 | struct the_nilfs *nilfs) |
1902 | { | 1908 | { |
1903 | struct nilfs_transaction_info *ti = current->journal_info; | ||
1904 | struct nilfs_inode_info *ii, *n; | 1909 | struct nilfs_inode_info *ii, *n; |
1910 | int defer_iput = false; | ||
1905 | 1911 | ||
1906 | spin_lock(&nilfs->ns_inode_lock); | 1912 | spin_lock(&nilfs->ns_inode_lock); |
1907 | list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { | 1913 | list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { |
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, | |||
1912 | clear_bit(NILFS_I_BUSY, &ii->i_state); | 1918 | clear_bit(NILFS_I_BUSY, &ii->i_state); |
1913 | brelse(ii->i_bh); | 1919 | brelse(ii->i_bh); |
1914 | ii->i_bh = NULL; | 1920 | ii->i_bh = NULL; |
1915 | list_move_tail(&ii->i_dirty, &ti->ti_garbage); | 1921 | list_del_init(&ii->i_dirty); |
1922 | if (!ii->vfs_inode.i_nlink) { | ||
1923 | /* | ||
1924 | * Defer calling iput() to avoid a deadlock | ||
1925 | * over I_SYNC flag for inodes with i_nlink == 0 | ||
1926 | */ | ||
1927 | list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); | ||
1928 | defer_iput = true; | ||
1929 | } else { | ||
1930 | spin_unlock(&nilfs->ns_inode_lock); | ||
1931 | iput(&ii->vfs_inode); | ||
1932 | spin_lock(&nilfs->ns_inode_lock); | ||
1933 | } | ||
1916 | } | 1934 | } |
1917 | spin_unlock(&nilfs->ns_inode_lock); | 1935 | spin_unlock(&nilfs->ns_inode_lock); |
1936 | |||
1937 | if (defer_iput) | ||
1938 | schedule_work(&sci->sc_iput_work); | ||
1918 | } | 1939 | } |
1919 | 1940 | ||
1920 | /* | 1941 | /* |
@@ -2583,6 +2604,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, | |||
2583 | INIT_LIST_HEAD(&sci->sc_segbufs); | 2604 | INIT_LIST_HEAD(&sci->sc_segbufs); |
2584 | INIT_LIST_HEAD(&sci->sc_write_logs); | 2605 | INIT_LIST_HEAD(&sci->sc_write_logs); |
2585 | INIT_LIST_HEAD(&sci->sc_gc_inodes); | 2606 | INIT_LIST_HEAD(&sci->sc_gc_inodes); |
2607 | INIT_LIST_HEAD(&sci->sc_iput_queue); | ||
2608 | INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); | ||
2586 | init_timer(&sci->sc_timer); | 2609 | init_timer(&sci->sc_timer); |
2587 | 2610 | ||
2588 | sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; | 2611 | sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; |
@@ -2609,6 +2632,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) | |||
2609 | ret = nilfs_segctor_construct(sci, SC_LSEG_SR); | 2632 | ret = nilfs_segctor_construct(sci, SC_LSEG_SR); |
2610 | nilfs_transaction_unlock(sci->sc_super); | 2633 | nilfs_transaction_unlock(sci->sc_super); |
2611 | 2634 | ||
2635 | flush_work(&sci->sc_iput_work); | ||
2636 | |||
2612 | } while (ret && retrycount-- > 0); | 2637 | } while (ret && retrycount-- > 0); |
2613 | } | 2638 | } |
2614 | 2639 | ||
@@ -2633,6 +2658,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) | |||
2633 | || sci->sc_seq_request != sci->sc_seq_done); | 2658 | || sci->sc_seq_request != sci->sc_seq_done); |
2634 | spin_unlock(&sci->sc_state_lock); | 2659 | spin_unlock(&sci->sc_state_lock); |
2635 | 2660 | ||
2661 | if (flush_work(&sci->sc_iput_work)) | ||
2662 | flag = true; | ||
2663 | |||
2636 | if (flag || !nilfs_segctor_confirm(sci)) | 2664 | if (flag || !nilfs_segctor_confirm(sci)) |
2637 | nilfs_segctor_write_out(sci); | 2665 | nilfs_segctor_write_out(sci); |
2638 | 2666 | ||
@@ -2642,6 +2670,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) | |||
2642 | nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); | 2670 | nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); |
2643 | } | 2671 | } |
2644 | 2672 | ||
2673 | if (!list_empty(&sci->sc_iput_queue)) { | ||
2674 | nilfs_warning(sci->sc_super, __func__, | ||
2675 | "iput queue is not empty\n"); | ||
2676 | nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); | ||
2677 | } | ||
2678 | |||
2645 | WARN_ON(!list_empty(&sci->sc_segbufs)); | 2679 | WARN_ON(!list_empty(&sci->sc_segbufs)); |
2646 | WARN_ON(!list_empty(&sci->sc_write_logs)); | 2680 | WARN_ON(!list_empty(&sci->sc_write_logs)); |
2647 | 2681 | ||
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 38a1d0013314..a48d6de1e02c 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/types.h> | 26 | #include <linux/types.h> |
27 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/workqueue.h> | ||
29 | #include <linux/nilfs2_fs.h> | 30 | #include <linux/nilfs2_fs.h> |
30 | #include "nilfs.h" | 31 | #include "nilfs.h" |
31 | 32 | ||
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer { | |||
92 | * @sc_nblk_inc: Block count of current generation | 93 | * @sc_nblk_inc: Block count of current generation |
93 | * @sc_dirty_files: List of files to be written | 94 | * @sc_dirty_files: List of files to be written |
94 | * @sc_gc_inodes: List of GC inodes having blocks to be written | 95 | * @sc_gc_inodes: List of GC inodes having blocks to be written |
96 | * @sc_iput_queue: list of inodes for which iput should be done | ||
97 | * @sc_iput_work: work struct to defer iput call | ||
95 | * @sc_freesegs: array of segment numbers to be freed | 98 | * @sc_freesegs: array of segment numbers to be freed |
96 | * @sc_nfreesegs: number of segments on @sc_freesegs | 99 | * @sc_nfreesegs: number of segments on @sc_freesegs |
97 | * @sc_dsync_inode: inode whose data pages are written for a sync operation | 100 | * @sc_dsync_inode: inode whose data pages are written for a sync operation |
@@ -135,6 +138,8 @@ struct nilfs_sc_info { | |||
135 | 138 | ||
136 | struct list_head sc_dirty_files; | 139 | struct list_head sc_dirty_files; |
137 | struct list_head sc_gc_inodes; | 140 | struct list_head sc_gc_inodes; |
141 | struct list_head sc_iput_queue; | ||
142 | struct work_struct sc_iput_work; | ||
138 | 143 | ||
139 | __u64 *sc_freesegs; | 144 | __u64 *sc_freesegs; |
140 | size_t sc_nfreesegs; | 145 | size_t sc_nfreesegs; |