author	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-08 21:13:49 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-08 21:13:49 -0400
commit	e5c832d5558826cc6e9a24746cfdec8e7780063a (patch)
tree	2b40ee4754dc80b81018ac91282ade4bdcd3c562 /fs
parent	0d98439ea3c6ffb2af931f6de4480e744634e2c5 (diff)
vfs: fix dentry RCU to refcounting possibly sleeping dput()
This is the fix that the last two commits indirectly led up to - making
sure that we don't call dput() in a bad context on the dentries we've
looked up in RCU mode after the sequence count validation fails.
This basically expands d_rcu_to_refcount() into the callers, and then
fixes the callers to delay the dput() in the failure case until _after_
we've dropped all locks and are no longer in an RCU-locked region.
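For reference, this is the helper being removed (quoted from the diff
below, with one annotation added); the problem is that its failure path
calls dput(), which may sleep, while the caller may still be inside the
RCU-walk section:

	static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
	{
		if (likely(lockref_get_not_dead(&dentry->d_lockref))) {
			if (!read_seqcount_retry(validate, seq))
				return 0;
			dput(dentry);	/* may sleep - caller can still be in RCU walk */
		}
		return -ECHILD;
	}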
The case of 'complete_walk()' was trivial: its failure case already did
the unlock_rcu_walk() directly after the call to d_rcu_to_refcount(), so
the change is just a pure expansion of the helper, with a trivial
movement of the resulting dput() to after 'unlock_rcu_walk()'.
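Schematically, the expanded complete_walk() now does the lockref and
seqcount checks inline, and only calls dput() once it has left RCU mode
(a condensed view of the last hunk of the diff below, comments added,
surrounding LOOKUP_RCU handling omitted):

	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
		unlock_rcu_walk();
		return -ECHILD;
	}
	if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
		unlock_rcu_walk();	/* drop out of RCU mode first ... */
		dput(dentry);		/* ... so this dput() is free to sleep */
		return -ECHILD;
	}
	mntget(nd->path.mnt);
	unlock_rcu_walk();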
In contrast, the unlazy_walk() case was much more complicated, because
not only does it convert two different dentries from RCU to reference
counting, but it also used to not call unlock_rcu_walk() at all, instead
just returning an error and letting the caller clean everything up in
"terminate_walk()".
Happily, one of the dentries in question (called "parent" inside
unlazy_walk()) is the dentry of "nd->path", which terminate_walk() wants
to hold a refcount to anyway in the non-RCU case.
So what the new and improved unlazy_walk() does is first turn that
dentry into a refcounted one; once that is set up, the error cases can
continue to use the terminate_walk() helper for cleanup, but now in
non-RCU mode. That makes it possible to drop out of RCU mode before
doing the dput() if we actually hit the sequence number failure case.
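Putting it together, the new unlazy_walk() ordering looks roughly like
this (condensed from the diff below, with the BUG_ON, the root
revalidation details and the unlock_and_drop_dentry label elided):

	if (!lockref_get_not_dead(&parent->d_lockref))
		return -ECHILD;		/* still RCU: terminate_walk() cleans up */
	mntget(nd->path.mnt);
	nd->flags &= ~LOOKUP_RCU;	/* errors below clean up as non-RCU */

	if (!dentry) {
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			goto out;
	} else {
		if (!lockref_get_not_dead(&dentry->d_lockref))
			goto out;
		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
			goto drop_dentry;
	}
	/* ... revalidate and path_get() the root if needed ... */
	unlock_rcu_walk();
	return 0;

drop_dentry:
	unlock_rcu_walk();		/* leave RCU mode before ... */
	dput(dentry);			/* ... the possibly sleeping dput() */
	return -ECHILD;
out:
	unlock_rcu_walk();
	return -ECHILD;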
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--	fs/namei.c	102
1 file changed, 49 insertions, 53 deletions
diff --git a/fs/namei.c b/fs/namei.c
index cc4bcfaa8624..56e4f4d537d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -494,37 +494,6 @@ static inline void unlock_rcu_walk(void)
 	br_read_unlock(&vfsmount_lock);
 }
 
-/*
- * When we move over from the RCU domain to properly refcounted
- * long-lived dentries, we need to check the sequence numbers
- * we got before lookup very carefully.
- *
- * We cannot blindly increment a dentry refcount - even if it
- * is not locked - if it is zero, because it may have gone
- * through the final d_kill() logic already.
- *
- * So for a zero refcount, we need to get the spinlock (which is
- * safe even for a dead dentry because the de-allocation is
- * RCU-delayed), and check the sequence count under the lock.
- *
- * Once we have checked the sequence count, we know it is live,
- * and since we hold the spinlock it cannot die from under us.
- *
- * In contrast, if the reference count wasn't zero, we can just
- * increment the lockref without having to take the spinlock.
- * Even if the sequence number ends up being stale, we haven't
- * gone through the final dput() and killed the dentry yet.
- */
-static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
-{
-	if (likely(lockref_get_not_dead(&dentry->d_lockref))) {
-		if (!read_seqcount_retry(validate, seq))
-			return 0;
-		dput(dentry);
-	}
-	return -ECHILD;
-}
-
 /**
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
@@ -539,16 +508,29 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
-	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
-		want_root = 1;
-		spin_lock(&fs->lock);
-		if (nd->root.mnt != fs->root.mnt ||
-		    nd->root.dentry != fs->root.dentry)
-			goto err_root;
-	}
+
+	/*
+	 * Get a reference to the parent first: we're
+	 * going to make "path_put(nd->path)" valid in
+	 * non-RCU context for "terminate_walk()".
+	 *
+	 * If this doesn't work, return immediately with
+	 * RCU walking still active (and then we will do
+	 * the RCU walk cleanup in terminate_walk()).
+	 */
+	if (!lockref_get_not_dead(&parent->d_lockref))
+		return -ECHILD;
+
+	/*
+	 * After the mntget(), we terminate_walk() will do
+	 * the right thing for non-RCU mode, and all our
+	 * subsequent exit cases should unlock_rcu_walk()
+	 * before returning.
+	 */
+	mntget(nd->path.mnt);
+	nd->flags &= ~LOOKUP_RCU;
 
 	/*
 	 * For a negative lookup, the lookup sequence point is the parents
@@ -562,30 +544,39 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 	 * be valid if the child sequence number is still valid.
 	 */
 	if (!dentry) {
-		if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
-			goto err_root;
+		if (read_seqcount_retry(&parent->d_seq, nd->seq))
+			goto out;
 		BUG_ON(nd->inode != parent->d_inode);
 	} else {
-		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
-			goto err_root;
-		if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
-			goto err_parent;
+		if (!lockref_get_not_dead(&dentry->d_lockref))
+			goto out;
+		if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+			goto drop_dentry;
 	}
-	if (want_root) {
+
+	/*
+	 * Sequence counts matched. Now make sure that the root is
+	 * still valid and get it if required.
+	 */
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		spin_lock(&fs->lock);
+		if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
+			goto unlock_and_drop_dentry;
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
-	mntget(nd->path.mnt);
 
 	unlock_rcu_walk();
-	nd->flags &= ~LOOKUP_RCU;
 	return 0;
 
-err_parent:
+unlock_and_drop_dentry:
+	spin_unlock(&fs->lock);
+drop_dentry:
+	unlock_rcu_walk();
 	dput(dentry);
-err_root:
-	if (want_root)
-		spin_unlock(&fs->lock);
+	return -ECHILD;
+out:
+	unlock_rcu_walk();
 	return -ECHILD;
 }
 
@@ -614,10 +605,15 @@ static int complete_walk(struct nameidata *nd)
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
 
-		if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
+		if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
 			unlock_rcu_walk();
 			return -ECHILD;
 		}
+		if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
+			unlock_rcu_walk();
+			dput(dentry);
+			return -ECHILD;
+		}
 		mntget(nd->path.mnt);
 		unlock_rcu_walk();
 	}