aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2014-02-03 14:02:56 -0500
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2014-02-07 18:42:40 -0500
commit35beab0635f3cdd475e3c11a304b866c25b76fcf (patch)
tree23505ba741ef2933c93514f89f2d646212c67802 /fs
parentabd54f028ec30976d6e797e7474ec91d96186a0c (diff)
kernfs: restructure removal path to fix possible premature return
The recursive nature of kernfs_remove() means that, even if kernfs_remove() is not allowed to be called multiple times on the same node, there may be race conditions between removal of parent and its descendants. While we can claim that kernfs_remove() shouldn't be called on one of the descendants while the removal of an ancestor is in progress, such rule is unnecessarily restrictive and very difficult to enforce. It's better to simply allow invoking kernfs_remove() as the caller sees fit as long as the caller ensures that the node is accessible. The current behavior in such situations is broken. Whoever enters removal path first takes the node off the hierarchy and then deactivates. Following removers either return as soon as it notices that it's not the first one or can't even find the target node as it has already been removed from the hierarchy. In both cases, the following removers may finish prematurely while the nodes which should be removed and drained are still being processed by the first one. This patch restructures so that multiple removers, whether through recursion or direction invocation, always follow the following rules. * When there are multiple concurrent removers, only one puts the base ref. * Regardless of which one puts the base ref, all removers are blocked until the target node is fully deactivated and removed. To achieve the above, removal path now first marks all descendants including self REMOVED and then deactivates and unlinks leftmost descendant one-by-one. kernfs_deactivate() is called directly from __kernfs_removal() and drops and regrabs kernfs_mutex for each descendant to drain active refs. As this means that multiple removers can enter kernfs_deactivate() for the same node, the function is updated so that it can handle multiple deactivators of the same node - only one actually deactivates but all wait till drain completion. The restructured removal path guarantees that a removed node gets unlinked only after the node is deactivated and drained. Combined with proper multiple deactivator handling, this guarantees that any invocation of kernfs_remove() returns only after the node itself and all its descendants are deactivated, drained and removed. v2: Draining separated into a separate loop (used to be in the same loop as unlink) and done from __kernfs_deactivate(). This is to allow exposing deactivation as a separate interface later. Root node removal was broken in v1 patch. Fixed. v3: Revert most of v2 except for root node removal fix and simplification of KERNFS_REMOVED setting loop. v4: Refreshed on top of ("kernfs: make kernfs_deactivate() honor KERNFS_LOCKDEP flag"). Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/kernfs/dir.c129
1 files changed, 68 insertions, 61 deletions
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2193d30156ef..3ac93737174a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -106,18 +106,24 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
106 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree 106 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
107 * @kn: kernfs_node of interest 107 * @kn: kernfs_node of interest
108 * 108 *
109 * Unlink @kn from its sibling rbtree which starts from 109 * Try to unlink @kn from its sibling rbtree which starts from
110 * kn->parent->dir.children. 110 * kn->parent->dir.children. Returns %true if @kn was actually
111 * removed, %false if @kn wasn't on the rbtree.
111 * 112 *
112 * Locking: 113 * Locking:
113 * mutex_lock(kernfs_mutex) 114 * mutex_lock(kernfs_mutex)
114 */ 115 */
115static void kernfs_unlink_sibling(struct kernfs_node *kn) 116static bool kernfs_unlink_sibling(struct kernfs_node *kn)
116{ 117{
118 if (RB_EMPTY_NODE(&kn->rb))
119 return false;
120
117 if (kernfs_type(kn) == KERNFS_DIR) 121 if (kernfs_type(kn) == KERNFS_DIR)
118 kn->parent->dir.subdirs--; 122 kn->parent->dir.subdirs--;
119 123
120 rb_erase(&kn->rb, &kn->parent->dir.children); 124 rb_erase(&kn->rb, &kn->parent->dir.children);
125 RB_CLEAR_NODE(&kn->rb);
126 return true;
121} 127}
122 128
123/** 129/**
@@ -171,26 +177,34 @@ void kernfs_put_active(struct kernfs_node *kn)
171 * kernfs_deactivate - deactivate kernfs_node 177 * kernfs_deactivate - deactivate kernfs_node
172 * @kn: kernfs_node to deactivate 178 * @kn: kernfs_node to deactivate
173 * 179 *
174 * Deny new active references and drain existing ones. 180 * Deny new active references and drain existing ones. Mutiple
181 * removers may invoke this function concurrently on @kn and all will
182 * return after deactivation and draining are complete.
175 */ 183 */
176static void kernfs_deactivate(struct kernfs_node *kn) 184static void kernfs_deactivate(struct kernfs_node *kn)
185 __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
177{ 186{
178 struct kernfs_root *root = kernfs_root(kn); 187 struct kernfs_root *root = kernfs_root(kn);
179 188
189 lockdep_assert_held(&kernfs_mutex);
180 BUG_ON(!(kn->flags & KERNFS_REMOVED)); 190 BUG_ON(!(kn->flags & KERNFS_REMOVED));
181 191
182 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) 192 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
183 return; 193 return;
184 194
185 if (kn->flags & KERNFS_LOCKDEP) 195 /* only the first invocation on @kn should deactivate it */
186 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); 196 if (atomic_read(&kn->active) >= 0)
197 atomic_add(KN_DEACTIVATED_BIAS, &kn->active);
187 198
188 atomic_add(KN_DEACTIVATED_BIAS, &kn->active); 199 mutex_unlock(&kernfs_mutex);
189 200
190 if ((kn->flags & KERNFS_LOCKDEP) && 201 if (kn->flags & KERNFS_LOCKDEP) {
191 atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) 202 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
192 lock_contended(&kn->dep_map, _RET_IP_); 203 if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
204 lock_contended(&kn->dep_map, _RET_IP_);
205 }
193 206
207 /* but everyone should wait for draining */
194 wait_event(root->deactivate_waitq, 208 wait_event(root->deactivate_waitq,
195 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); 209 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
196 210
@@ -198,6 +212,8 @@ static void kernfs_deactivate(struct kernfs_node *kn)
198 lock_acquired(&kn->dep_map, _RET_IP_); 212 lock_acquired(&kn->dep_map, _RET_IP_);
199 rwsem_release(&kn->dep_map, 1, _RET_IP_); 213 rwsem_release(&kn->dep_map, 1, _RET_IP_);
200 } 214 }
215
216 mutex_lock(&kernfs_mutex);
201} 217}
202 218
203/** 219/**
@@ -347,6 +363,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
347 363
348 atomic_set(&kn->count, 1); 364 atomic_set(&kn->count, 1);
349 atomic_set(&kn->active, 0); 365 atomic_set(&kn->active, 0);
366 RB_CLEAR_NODE(&kn->rb);
350 367
351 kn->name = name; 368 kn->name = name;
352 kn->mode = mode; 369 kn->mode = mode;
@@ -454,49 +471,6 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
454} 471}
455 472
456/** 473/**
457 * kernfs_remove_one - remove kernfs_node from parent
458 * @acxt: addrm context to use
459 * @kn: kernfs_node to be removed
460 *
461 * Mark @kn removed and drop nlink of parent inode if @kn is a
462 * directory. @kn is unlinked from the children list.
463 *
464 * This function should be called between calls to
465 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
466 * passed the same @acxt as passed to kernfs_addrm_start().
467 *
468 * LOCKING:
469 * Determined by kernfs_addrm_start().
470 */
471static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
472 struct kernfs_node *kn)
473{
474 struct kernfs_iattrs *ps_iattr;
475
476 /*
477 * Removal can be called multiple times on the same node. Only the
478 * first invocation is effective and puts the base ref.
479 */
480 if (kn->flags & KERNFS_REMOVED)
481 return;
482
483 if (kn->parent) {
484 kernfs_unlink_sibling(kn);
485
486 /* Update timestamps on the parent */
487 ps_iattr = kn->parent->iattr;
488 if (ps_iattr) {
489 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
490 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
491 }
492 }
493
494 kn->flags |= KERNFS_REMOVED;
495 kn->u.removed_list = acxt->removed;
496 acxt->removed = kn;
497}
498
499/**
500 * kernfs_addrm_finish - finish up kernfs_node add/remove 474 * kernfs_addrm_finish - finish up kernfs_node add/remove
501 * @acxt: addrm context to finish up 475 * @acxt: addrm context to finish up
502 * 476 *
@@ -519,7 +493,6 @@ void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
519 493
520 acxt->removed = kn->u.removed_list; 494 acxt->removed = kn->u.removed_list;
521 495
522 kernfs_deactivate(kn);
523 kernfs_unmap_bin_file(kn); 496 kernfs_unmap_bin_file(kn);
524 kernfs_put(kn); 497 kernfs_put(kn);
525 } 498 }
@@ -828,20 +801,54 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
828static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, 801static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
829 struct kernfs_node *kn) 802 struct kernfs_node *kn)
830{ 803{
831 struct kernfs_node *pos, *next; 804 struct kernfs_node *pos;
805
806 lockdep_assert_held(&kernfs_mutex);
832 807
833 if (!kn) 808 if (!kn)
834 return; 809 return;
835 810
836 pr_debug("kernfs %s: removing\n", kn->name); 811 pr_debug("kernfs %s: removing\n", kn->name);
837 812
838 next = NULL; 813 /* disable lookup and node creation under @kn */
814 pos = NULL;
815 while ((pos = kernfs_next_descendant_post(pos, kn)))
816 pos->flags |= KERNFS_REMOVED;
817
818 /* deactivate and unlink the subtree node-by-node */
839 do { 819 do {
840 pos = next; 820 pos = kernfs_leftmost_descendant(kn);
841 next = kernfs_next_descendant_post(pos, kn); 821
842 if (pos) 822 /*
843 kernfs_remove_one(acxt, pos); 823 * kernfs_deactivate() drops kernfs_mutex temporarily and
844 } while (next); 824 * @pos's base ref could have been put by someone else by
825 * the time the function returns. Make sure it doesn't go
826 * away underneath us.
827 */
828 kernfs_get(pos);
829
830 kernfs_deactivate(pos);
831
832 /*
833 * kernfs_unlink_sibling() succeeds once per node. Use it
834 * to decide who's responsible for cleanups.
835 */
836 if (!pos->parent || kernfs_unlink_sibling(pos)) {
837 struct kernfs_iattrs *ps_iattr =
838 pos->parent ? pos->parent->iattr : NULL;
839
840 /* update timestamps on the parent */
841 if (ps_iattr) {
842 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
843 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
844 }
845
846 pos->u.removed_list = acxt->removed;
847 acxt->removed = pos;
848 }
849
850 kernfs_put(pos);
851 } while (pos != kn);
845} 852}
846 853
847/** 854/**