aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSteven Rostedt <rostedt@goodmis.org>2014-06-09 14:06:07 -0400
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2014-07-09 19:37:29 -0400
commit485d44022a152c0254dd63445fdb81c4194cbf0e (patch)
treed67126d760940e40207746345a17ccfbfb73f883
parentb76fc285337b6b256e9ba20a40cfd043f70c27af (diff)
debugfs: Fix corrupted loop in debugfs_remove_recursive
[ I'm currently running my tests on it now, and so far, after a few hours it has yet to blow up. I'll run it for 24 hours which it never succeeded in the past. ] The tracing code has a way to make directories within the debugfs file system as well as deleting them using mkdir/rmdir in the instance directory. This is very limited in functionality, such as there is no renames, and the parent directory "instance" can not be modified. The tracing code creates the instance directory from the debugfs code and then replaces the dentry->d_inode->i_op with its own to allow for mkdir/rmdir to work. When these are called, the d_entry and inode locks need to be released to call the instance creation and deletion code. That code has its own accounting and locking to serialize everything to prevent multiple users from causing harm. As the parent "instance" directory can not be modified this simplifies things. I created a stress test that creates several threads that randomly creates and deletes directories thousands of times a second. The code stood up to this test and I submitted it a while ago. Recently I added a new test that adds readers to the mix. While the instance directories were being added and deleted, readers would read from these directories and even enable tracing within them. This test was able to trigger a bug: general protection fault: 0000 [#1] PREEMPT SMP Modules linked in: ... CPU: 3 PID: 17789 Comm: rmdir Tainted: G W 3.15.0-rc2-test+ #41 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 task: ffff88003786ca60 ti: ffff880077018000 task.ti: ffff880077018000 RIP: 0010:[<ffffffff811ed5eb>] [<ffffffff811ed5eb>] debugfs_remove_recursive+0x1bd/0x367 RSP: 0018:ffff880077019df8 EFLAGS: 00010246 RAX: 0000000000000002 RBX: ffff88006f0fe490 RCX: 0000000000000000 RDX: dead000000100058 RSI: 0000000000000246 RDI: ffff88003786d454 RBP: ffff88006f0fe640 R08: 0000000000000628 R09: 0000000000000000 R10: 0000000000000628 R11: ffff8800795110a0 R12: ffff88006f0fe640 R13: ffff88006f0fe640 R14: ffffffff81817d0b R15: ffffffff818188b7 FS: 00007ff13ae24700(0000) GS:ffff88007d580000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000003054ec7be0 CR3: 0000000076d51000 CR4: 00000000000007e0 Stack: ffff88007a41ebe0 dead000000100058 00000000fffffffe ffff88006f0fe640 0000000000000000 ffff88006f0fe678 ffff88007a41ebe0 ffff88003793a000 00000000fffffffe ffffffff810bde82 ffff88006f0fe640 ffff88007a41eb28 Call Trace: [<ffffffff810bde82>] ? instance_rmdir+0x15b/0x1de [<ffffffff81132e2d>] ? vfs_rmdir+0x80/0xd3 [<ffffffff81132f51>] ? do_rmdir+0xd1/0x139 [<ffffffff8124ad9e>] ? trace_hardirqs_on_thunk+0x3a/0x3c [<ffffffff814fea62>] ? system_call_fastpath+0x16/0x1b Code: fe ff ff 48 8d 75 30 48 89 df e8 c9 fd ff ff 85 c0 75 13 48 c7 c6 b8 cc d2 81 48 c7 c7 b0 cc d2 81 e8 8c 7a f5 ff 48 8b 54 24 08 <48> 8b 82 a8 00 00 00 48 89 d3 48 2d a8 00 00 00 48 89 44 24 08 RIP [<ffffffff811ed5eb>] debugfs_remove_recursive+0x1bd/0x367 RSP <ffff880077019df8> It took a while, but every time it triggered, it was always in the same place: list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { Where the child->d_u.d_child seemed to be corrupted. I added lots of trace_printk()s to see what was wrong, and sure enough, it was always the child's d_u.d_child field. I looked around to see what touches it and noticed that in __dentry_kill() which calls dentry_free(): static void dentry_free(struct dentry *dentry) { /* if dentry was never visible to RCU, immediate free is OK */ if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } I also noticed that __dentry_kill() unlinks the child->d_u.child under the parent->d_lock spin_lock. Looking back at the loop in debugfs_remove_recursive() it never takes the parent->d_lock to do the list walk. Adding more tracing, I was able to prove this was the issue: ftrace-t-15385 1.... 246662024us : dentry_kill <ffffffff81138b91>: free ffff88006d573600 rmdir-15409 2.... 246662024us : debugfs_remove_recursive <ffffffff811ec7e5>: child=ffff88006d573600 next=dead000000100058 The dentry_kill freed ffff88006d573600 just as the remove recursive was walking it. In order to fix this, the list walk needs to be modified a bit to take the parent->d_lock. The safe version is no longer necessary, as every time we remove a child, the parent->d_lock must be released and the list walk must start over. Each time a child is removed, even though it may still be on the list, it should be skipped by the first check in the loop: if (!debugfs_positive(child)) continue; Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--fs/debugfs/inode.c33
1 files changed, 26 insertions, 7 deletions
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 8c41b52da358..16a46b6a6fee 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
534 */ 534 */
535void debugfs_remove_recursive(struct dentry *dentry) 535void debugfs_remove_recursive(struct dentry *dentry)
536{ 536{
537 struct dentry *child, *next, *parent; 537 struct dentry *child, *parent;
538 538
539 if (IS_ERR_OR_NULL(dentry)) 539 if (IS_ERR_OR_NULL(dentry))
540 return; 540 return;
@@ -546,30 +546,49 @@ void debugfs_remove_recursive(struct dentry *dentry)
546 parent = dentry; 546 parent = dentry;
547 down: 547 down:
548 mutex_lock(&parent->d_inode->i_mutex); 548 mutex_lock(&parent->d_inode->i_mutex);
549 list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { 549 loop:
550 /*
551 * The parent->d_subdirs is protected by the d_lock. Outside that
552 * lock, the child can be unlinked and set to be freed which can
553 * use the d_u.d_child as the rcu head and corrupt this list.
554 */
555 spin_lock(&parent->d_lock);
556 list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
550 if (!debugfs_positive(child)) 557 if (!debugfs_positive(child))
551 continue; 558 continue;
552 559
553 /* perhaps simple_empty(child) makes more sense */ 560 /* perhaps simple_empty(child) makes more sense */
554 if (!list_empty(&child->d_subdirs)) { 561 if (!list_empty(&child->d_subdirs)) {
562 spin_unlock(&parent->d_lock);
555 mutex_unlock(&parent->d_inode->i_mutex); 563 mutex_unlock(&parent->d_inode->i_mutex);
556 parent = child; 564 parent = child;
557 goto down; 565 goto down;
558 } 566 }
559 up: 567
568 spin_unlock(&parent->d_lock);
569
560 if (!__debugfs_remove(child, parent)) 570 if (!__debugfs_remove(child, parent))
561 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 571 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
572
573 /*
574 * The parent->d_lock protects agaist child from unlinking
575 * from d_subdirs. When releasing the parent->d_lock we can
576 * no longer trust that the next pointer is valid.
577 * Restart the loop. We'll skip this one with the
578 * debugfs_positive() check.
579 */
580 goto loop;
562 } 581 }
582 spin_unlock(&parent->d_lock);
563 583
564 mutex_unlock(&parent->d_inode->i_mutex); 584 mutex_unlock(&parent->d_inode->i_mutex);
565 child = parent; 585 child = parent;
566 parent = parent->d_parent; 586 parent = parent->d_parent;
567 mutex_lock(&parent->d_inode->i_mutex); 587 mutex_lock(&parent->d_inode->i_mutex);
568 588
569 if (child != dentry) { 589 if (child != dentry)
570 next = list_next_entry(child, d_u.d_child); 590 /* go up */
571 goto up; 591 goto loop;
572 }
573 592
574 if (!__debugfs_remove(child, parent)) 593 if (!__debugfs_remove(child, parent))
575 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 594 simple_release_fs(&debugfs_mount, &debugfs_mount_count);