author	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-08 21:13:49 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-08 21:13:49 -0400
commit	e5c832d5558826cc6e9a24746cfdec8e7780063a (patch)
tree	2b40ee4754dc80b81018ac91282ade4bdcd3c562 /fs
parent	0d98439ea3c6ffb2af931f6de4480e744634e2c5 (diff)
vfs: fix dentry RCU to refcounting possibly sleeping dput()
This is the fix that the last two commits indirectly led up to - making
sure that we don't call dput() in a bad context on the dentries we've
looked up in RCU mode after the sequence count validation fails.
This basically expands d_rcu_to_refcount() into the callers, and then
fixes the callers to delay the dput() in the failure case until _after_
we've dropped all locks and are no longer in an RCU-locked region.
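For reference, this is the helper being removed (quoted from the diff
below, with one annotation added); the problem is that its failure path
calls dput(), which may sleep, while the caller may still be inside the
RCU-walk section:

	static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
	{
		if (likely(lockref_get_not_dead(&dentry->d_lockref))) {
			if (!read_seqcount_retry(validate, seq))
				return 0;
			dput(dentry);	/* may sleep - caller can still be in RCU walk */
		}
		return -ECHILD;
	}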
The case of 'complete_walk()' was trivial: its failure case already did
the unlock_rcu_walk() directly after the call to d_rcu_to_refcount(), so
the change is just a pure expansion of the helper, with a trivial
movement of the resulting dput() to after 'unlock_rcu_walk()'.
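Schematically, the expanded complete_walk() now does the lockref and
seqcount checks inline, and only calls dput() once it has left RCU mode
(a condensed view of the last hunk of the diff below, comments added,
surrounding LOOKUP_RCU handling omitted):

	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
		unlock_rcu_walk();
		return -ECHILD;
	}
	if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
		unlock_rcu_walk();	/* drop out of RCU mode first ... */
		dput(dentry);		/* ... so this dput() is free to sleep */
		return -ECHILD;
	}
	mntget(nd->path.mnt);
	unlock_rcu_walk();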
In contrast, the unlazy_walk() case was much more complicated, because
not only does it convert two different dentries from RCU to reference
counting, but it also used to not call unlock_rcu_walk() at all, instead
just returning an error and letting the caller clean everything up in
"terminate_walk()".
Happily, one of the dentries in question (called "parent" inside
unlazy_walk()) is the dentry of "nd->path", which terminate_walk() wants
to hold a refcount to anyway in the non-RCU case.
So what the new and improved unlazy_walk() does is first turn that
dentry into a refcounted one; once that is set up, the error cases can
continue to use the terminate_walk() helper for cleanup, but now in
non-RCU mode. That makes it possible to drop out of RCU mode before
doing the dput() if we actually hit the sequence number failure case.
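Putting it together, the new unlazy_walk() ordering looks roughly like
this (condensed from the diff below, with the BUG_ON, the root
revalidation details and the unlock_and_drop_dentry label elided):

	if (!lockref_get_not_dead(&parent->d_lockref))
		return -ECHILD;		/* still RCU: terminate_walk() cleans up */
	mntget(nd->path.mnt);
	nd->flags &= ~LOOKUP_RCU;	/* errors below clean up as non-RCU */

	if (!dentry) {
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			goto out;
	} else {
		if (!lockref_get_not_dead(&dentry->d_lockref))
			goto out;
		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
			goto drop_dentry;
	}
	/* ... revalidate and path_get() the root if needed ... */
	unlock_rcu_walk();
	return 0;

drop_dentry:
	unlock_rcu_walk();		/* leave RCU mode before ... */
	dput(dentry);			/* ... the possibly sleeping dput() */
	return -ECHILD;
out:
	unlock_rcu_walk();
	return -ECHILD;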
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--	fs/namei.c	102
1 file changed, 49 insertions, 53 deletions
diff --git a/fs/namei.c b/fs/namei.c
index cc4bcfaa8624..56e4f4d537d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -494,37 +494,6 @@ static inline void unlock_rcu_walk(void)
 	br_read_unlock(&vfsmount_lock);
 }
 
-/*
- * When we move over from the RCU domain to properly refcounted
- * long-lived dentries, we need to check the sequence numbers
- * we got before lookup very carefully.
- *
- * We cannot blindly increment a dentry refcount - even if it
- * is not locked - if it is zero, because it may have gone
- * through the final d_kill() logic already.
- *
- * So for a zero refcount, we need to get the spinlock (which is
- * safe even for a dead dentry because the de-allocation is
- * RCU-delayed), and check the sequence count under the lock.
- *
- * Once we have checked the sequence count, we know it is live,
- * and since we hold the spinlock it cannot die from under us.
- *
- * In contrast, if the reference count wasn't zero, we can just
- * increment the lockref without having to take the spinlock.
- * Even if the sequence number ends up being stale, we haven't
- * gone through the final dput() and killed the dentry yet.
- */
-static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
-{
-	if (likely(lockref_get_not_dead(&dentry->d_lockref))) {
-		if (!read_seqcount_retry(validate, seq))
-			return 0;
-		dput(dentry);
-	}
-	return -ECHILD;
-}
-
 /**
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
@@ -539,16 +508,29 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
-	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-		want_root = 1;
-		spin_lock(&fs->lock);
-		if (nd->root.mnt != fs->root.mnt ||
-		    nd->root.dentry != fs->root.dentry)
-			goto err_root;
-	}
+
+	/*
+	 * Get a reference to the parent first: we're
+	 * going to make "path_put(nd->path)" valid in
+	 * non-RCU context for "terminate_walk()".
+	 *
+	 * If this doesn't work, return immediately with
+	 * RCU walking still active (and then we will do
+	 * the RCU walk cleanup in terminate_walk()).
+	 */
+	if (!lockref_get_not_dead(&parent->d_lockref))
+		return -ECHILD;
+
+	/*
+	 * After the mntget(), we terminate_walk() will do
+	 * the right thing for non-RCU mode, and all our
+	 * subsequent exit cases should unlock_rcu_walk()
+	 * before returning.
+	 */
+	mntget(nd->path.mnt);
+	nd->flags &= ~LOOKUP_RCU;
 
 	/*
 	 * For a negative lookup, the lookup sequence point is the parents
@@ -562,30 +544,39 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	 * be valid if the child sequence number is still valid.
 	 */
 	if (!dentry) {
-		if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
-			goto err_root;
+		if (read_seqcount_retry(&parent->d_seq, nd->seq))
+			goto out;
 		BUG_ON(nd->inode != parent->d_inode);
 	} else {
-		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
-			goto err_root;
-		if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
-			goto err_parent;
+		if (!lockref_get_not_dead(&dentry->d_lockref))
+			goto out;
+		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+			goto drop_dentry;
 	}
-	if (want_root) {
+
+	/*
+	 * Sequence counts matched. Now make sure that the root is
+	 * still valid and get it if required.
+	 */
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		spin_lock(&fs->lock);
+		if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
+			goto unlock_and_drop_dentry;
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
-	mntget(nd->path.mnt);
 
 	unlock_rcu_walk();
-	nd->flags &= ~LOOKUP_RCU;
 	return 0;
 
-err_parent:
+unlock_and_drop_dentry:
+	spin_unlock(&fs->lock);
+drop_dentry:
+	unlock_rcu_walk();
 	dput(dentry);
-err_root:
-	if (want_root)
-		spin_unlock(&fs->lock);
+	return -ECHILD;
+out:
+	unlock_rcu_walk();
 	return -ECHILD;
 }
 
@@ -614,10 +605,15 @@ static int complete_walk(struct nameidata *nd)
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
 
-		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
+		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
 			unlock_rcu_walk();
 			return -ECHILD;
 		}
+		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
+			unlock_rcu_walk();
+			dput(dentry);
+			return -ECHILD;
+		}
 		mntget(nd->path.mnt);
 		unlock_rcu_walk();
 	}