aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2009-03-12 17:31:38 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-03-12 19:20:24 -0400
commit7ef0d7377cb287e08f3ae94cebc919448e1f5dff (patch)
tree3ab288db22eb17e76b5db1d9b8c6f7517570632f /fs/fs-writeback.c
parentf272b7bc447553410dde691aa31fc531adf9c175 (diff)
fs: new inode i_state corruption fix
There was a report of a data corruption http://lkml.org/lkml/2008/11/14/121. There is a script included to reproduce the problem. During testing, I encountered a number of strange things with ext3, so I tried ext2 to attempt to reduce complexity of the problem. I found that fsstress would quickly hang in wait_on_inode, waiting for I_LOCK to be cleared, even though instrumentation showed that unlock_new_inode had already been called for that inode. This points to memory scribble, or synchronisation problme. i_state of I_NEW inodes is not protected by inode_lock because other processes are not supposed to touch them until I_LOCK (and I_NEW) is cleared. Adding WARN_ON(inode->i_state & I_NEW) to sites where we modify i_state revealed that generic_sync_sb_inodes is picking up new inodes from the inode lists and passing them to __writeback_single_inode without waiting for I_NEW. Subsequently modifying i_state causes corruption. In my case it would look like this: CPU0 CPU1 unlock_new_inode() __sync_single_inode() reg <- inode->i_state reg -> reg & ~(I_LOCK|I_NEW) reg <- inode->i_state reg -> inode->i_state reg -> reg | I_SYNC reg -> inode->i_state Non-atomic RMW on CPU1 overwrites CPU0 store and sets I_LOCK|I_NEW again. Fix for this is rather than wait for I_NEW inodes, just skip over them: inodes concurrently being created are not subject to data integrity operations, and should not significantly contribute to dirty memory either. After this change, I'm unable to reproduce any of the added warnings or hangs after ~1hour of running. Previously, the new warnings would start immediately and hang would happen in under 5 minutes. I'm also testing on ext3 now, and so far no problems there either. I don't know whether this fixes the problem reported above, but it fixes a real problem for me. Cc: "Jorge Boncompte [DTI2]" <jorge@dti2.net> Reported-by: Adrian Hunter <ext-adrian.hunter@nokia.com> Cc: Jan Kara <jack@suse.cz> Cc: <stable@kernel.org> Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--fs/fs-writeback.c9
1 files changed, 8 insertions, 1 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62fd17f..e3fe9918faaf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
274 int ret; 274 int ret;
275 275
276 BUG_ON(inode->i_state & I_SYNC); 276 BUG_ON(inode->i_state & I_SYNC);
277 WARN_ON(inode->i_state & I_NEW);
277 278
278 /* Set I_SYNC, reset I_DIRTY */ 279 /* Set I_SYNC, reset I_DIRTY */
279 dirty = inode->i_state & I_DIRTY; 280 dirty = inode->i_state & I_DIRTY;
@@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
298 } 299 }
299 300
300 spin_lock(&inode_lock); 301 spin_lock(&inode_lock);
302 WARN_ON(inode->i_state & I_NEW);
301 inode->i_state &= ~I_SYNC; 303 inode->i_state &= ~I_SYNC;
302 if (!(inode->i_state & I_FREEING)) { 304 if (!(inode->i_state & I_FREEING)) {
303 if (!(inode->i_state & I_DIRTY) && 305 if (!(inode->i_state & I_DIRTY) &&
@@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
470 break; 472 break;
471 } 473 }
472 474
475 if (inode->i_state & I_NEW) {
476 requeue_io(inode);
477 continue;
478 }
479
473 if (wbc->nonblocking && bdi_write_congested(bdi)) { 480 if (wbc->nonblocking && bdi_write_congested(bdi)) {
474 wbc->encountered_congestion = 1; 481 wbc->encountered_congestion = 1;
475 if (!sb_is_blkdev_sb(sb)) 482 if (!sb_is_blkdev_sb(sb))
@@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping; 539 struct address_space *mapping;
533 540
534 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
535 continue; 542 continue;
536 mapping = inode->i_mapping; 543 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0) 544 if (mapping->nrpages == 0)