aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2009-03-12 17:31:38 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-03-12 19:20:24 -0400
commit7ef0d7377cb287e08f3ae94cebc919448e1f5dff (patch)
tree3ab288db22eb17e76b5db1d9b8c6f7517570632f
parentf272b7bc447553410dde691aa31fc531adf9c175 (diff)
fs: new inode i_state corruption fix
There was a report of a data corruption http://lkml.org/lkml/2008/11/14/121. There is a script included to reproduce the problem. During testing, I encountered a number of strange things with ext3, so I tried ext2 to attempt to reduce complexity of the problem. I found that fsstress would quickly hang in wait_on_inode, waiting for I_LOCK to be cleared, even though instrumentation showed that unlock_new_inode had already been called for that inode. This points to memory scribble, or synchronisation problme. i_state of I_NEW inodes is not protected by inode_lock because other processes are not supposed to touch them until I_LOCK (and I_NEW) is cleared. Adding WARN_ON(inode->i_state & I_NEW) to sites where we modify i_state revealed that generic_sync_sb_inodes is picking up new inodes from the inode lists and passing them to __writeback_single_inode without waiting for I_NEW. Subsequently modifying i_state causes corruption. In my case it would look like this: CPU0 CPU1 unlock_new_inode() __sync_single_inode() reg <- inode->i_state reg -> reg & ~(I_LOCK|I_NEW) reg <- inode->i_state reg -> inode->i_state reg -> reg | I_SYNC reg -> inode->i_state Non-atomic RMW on CPU1 overwrites CPU0 store and sets I_LOCK|I_NEW again. Fix for this is rather than wait for I_NEW inodes, just skip over them: inodes concurrently being created are not subject to data integrity operations, and should not significantly contribute to dirty memory either. After this change, I'm unable to reproduce any of the added warnings or hangs after ~1hour of running. Previously, the new warnings would start immediately and hang would happen in under 5 minutes. I'm also testing on ext3 now, and so far no problems there either. I don't know whether this fixes the problem reported above, but it fixes a real problem for me. Cc: "Jorge Boncompte [DTI2]" <jorge@dti2.net> Reported-by: Adrian Hunter <ext-adrian.hunter@nokia.com> Cc: Jan Kara <jack@suse.cz> Cc: <stable@kernel.org> Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/fs-writeback.c9
-rw-r--r--fs/inode.c7
2 files changed, 15 insertions, 1 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62fd17f..e3fe9918faaf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
274 int ret; 274 int ret;
275 275
276 BUG_ON(inode->i_state & I_SYNC); 276 BUG_ON(inode->i_state & I_SYNC);
277 WARN_ON(inode->i_state & I_NEW);
277 278
278 /* Set I_SYNC, reset I_DIRTY */ 279 /* Set I_SYNC, reset I_DIRTY */
279 dirty = inode->i_state & I_DIRTY; 280 dirty = inode->i_state & I_DIRTY;
@@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
298 } 299 }
299 300
300 spin_lock(&inode_lock); 301 spin_lock(&inode_lock);
302 WARN_ON(inode->i_state & I_NEW);
301 inode->i_state &= ~I_SYNC; 303 inode->i_state &= ~I_SYNC;
302 if (!(inode->i_state & I_FREEING)) { 304 if (!(inode->i_state & I_FREEING)) {
303 if (!(inode->i_state & I_DIRTY) && 305 if (!(inode->i_state & I_DIRTY) &&
@@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
470 break; 472 break;
471 } 473 }
472 474
475 if (inode->i_state & I_NEW) {
476 requeue_io(inode);
477 continue;
478 }
479
473 if (wbc->nonblocking && bdi_write_congested(bdi)) { 480 if (wbc->nonblocking && bdi_write_congested(bdi)) {
474 wbc->encountered_congestion = 1; 481 wbc->encountered_congestion = 1;
475 if (!sb_is_blkdev_sb(sb)) 482 if (!sb_is_blkdev_sb(sb))
@@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 538 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping; 539 struct address_space *mapping;
533 540
534 if (inode->i_state & (I_FREEING|I_WILL_FREE)) 541 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
535 continue; 542 continue;
536 mapping = inode->i_mapping; 543 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0) 544 if (mapping->nrpages == 0)
diff --git a/fs/inode.c b/fs/inode.c
index 913ab2d9a5d1..826fb0b9d1c3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -359,6 +359,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
359 invalidate_inode_buffers(inode); 359 invalidate_inode_buffers(inode);
360 if (!atomic_read(&inode->i_count)) { 360 if (!atomic_read(&inode->i_count)) {
361 list_move(&inode->i_list, dispose); 361 list_move(&inode->i_list, dispose);
362 WARN_ON(inode->i_state & I_NEW);
362 inode->i_state |= I_FREEING; 363 inode->i_state |= I_FREEING;
363 count++; 364 count++;
364 continue; 365 continue;
@@ -460,6 +461,7 @@ static void prune_icache(int nr_to_scan)
460 continue; 461 continue;
461 } 462 }
462 list_move(&inode->i_list, &freeable); 463 list_move(&inode->i_list, &freeable);
464 WARN_ON(inode->i_state & I_NEW);
463 inode->i_state |= I_FREEING; 465 inode->i_state |= I_FREEING;
464 nr_pruned++; 466 nr_pruned++;
465 } 467 }
@@ -656,6 +658,7 @@ void unlock_new_inode(struct inode *inode)
656 * just created it (so there can be no old holders 658 * just created it (so there can be no old holders
657 * that haven't tested I_LOCK). 659 * that haven't tested I_LOCK).
658 */ 660 */
661 WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
659 inode->i_state &= ~(I_LOCK|I_NEW); 662 inode->i_state &= ~(I_LOCK|I_NEW);
660 wake_up_inode(inode); 663 wake_up_inode(inode);
661} 664}
@@ -1145,6 +1148,7 @@ void generic_delete_inode(struct inode *inode)
1145 1148
1146 list_del_init(&inode->i_list); 1149 list_del_init(&inode->i_list);
1147 list_del_init(&inode->i_sb_list); 1150 list_del_init(&inode->i_sb_list);
1151 WARN_ON(inode->i_state & I_NEW);
1148 inode->i_state |= I_FREEING; 1152 inode->i_state |= I_FREEING;
1149 inodes_stat.nr_inodes--; 1153 inodes_stat.nr_inodes--;
1150 spin_unlock(&inode_lock); 1154 spin_unlock(&inode_lock);
@@ -1186,16 +1190,19 @@ static void generic_forget_inode(struct inode *inode)
1186 spin_unlock(&inode_lock); 1190 spin_unlock(&inode_lock);
1187 return; 1191 return;
1188 } 1192 }
1193 WARN_ON(inode->i_state & I_NEW);
1189 inode->i_state |= I_WILL_FREE; 1194 inode->i_state |= I_WILL_FREE;
1190 spin_unlock(&inode_lock); 1195 spin_unlock(&inode_lock);
1191 write_inode_now(inode, 1); 1196 write_inode_now(inode, 1);
1192 spin_lock(&inode_lock); 1197 spin_lock(&inode_lock);
1198 WARN_ON(inode->i_state & I_NEW);
1193 inode->i_state &= ~I_WILL_FREE; 1199 inode->i_state &= ~I_WILL_FREE;
1194 inodes_stat.nr_unused--; 1200 inodes_stat.nr_unused--;
1195 hlist_del_init(&inode->i_hash); 1201 hlist_del_init(&inode->i_hash);
1196 } 1202 }
1197 list_del_init(&inode->i_list); 1203 list_del_init(&inode->i_list);
1198 list_del_init(&inode->i_sb_list); 1204 list_del_init(&inode->i_sb_list);
1205 WARN_ON(inode->i_state & I_NEW);
1199 inode->i_state |= I_FREEING; 1206 inode->i_state |= I_FREEING;
1200 inodes_stat.nr_inodes--; 1207 inodes_stat.nr_inodes--;
1201 spin_unlock(&inode_lock); 1208 spin_unlock(&inode_lock);