author     Tejun Heo <tj@kernel.org>      2012-03-05 16:15:21 -0500
committer  Jens Axboe <axboe@kernel.dk>   2012-03-06 15:27:24 -0500
commit     9f13ef678efd977487fc0c2e489f17c9a8c67a3e (patch)
tree       e58a2dd153ad24b2ea173d5dfb575c507e1f7589 /block
parent     e8989fae38d9831c72b20375a206a919ca468c52 (diff)
blkcg: use double locking instead of RCU for blkg synchronization
blkgs are chained from both blkcgs and request_queues and thus
subject to two locks - blkcg->lock and q->queue_lock. As both blkcg
and q can go away at any time, locking during removal is tricky. It
is currently solved by wrapping removal inside RCU, which makes the
synchronization complex: there are three locks to worry about - the
outer RCU, the q lock and the blkcg lock - and this leads to nasty,
subtle complications like conditional synchronize_rcu() on queue
exit paths.

For all other paths, the blkcg lock is naturally nested inside the q
lock; the only exception is the blkcg removal path, which is a very
cold path and can be implemented as clumsy but conceptually-simple
reverse double lock dancing.
This patch updates the blkg removal path such that blkgs are removed
while holding both the q and blkcg locks, which is trivial for the
request queue exit path - blkg_destroy_all(). The blkcg removal path,
blkiocg_pre_destroy(), implements reverse double lock dancing
essentially identical to ioc_release_fn(), as sketched below.
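For illustration only - this is a sketch of the dancing pattern,
mirroring the blkiocg_pre_destroy() hunk below rather than extra code
in this patch, with the (soon to be removed) RCU protection of
blkg->q elided:

	spin_lock_irq(&blkcg->lock);
	while (!hlist_empty(&blkcg->blkg_list)) {
		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
						struct blkio_group, blkcg_node);

		/*
		 * blkcg->lock nests inside q->queue_lock, so blocking on
		 * the q lock while holding blkcg->lock would invert the
		 * established order - trylock and back off on failure.
		 */
		if (spin_trylock(blkg->q->queue_lock)) {
			blkg_destroy(blkg);	/* both locks held */
			spin_unlock(blkg->q->queue_lock);
		} else {
			spin_unlock_irq(&blkcg->lock);
			cpu_relax();
			spin_lock_irq(&blkcg->lock);
		}
	}
	spin_unlock_irq(&blkcg->lock);

The trylock keeps this cold path from deadlocking: on failure we drop
our lock and retry, and the holder of q->queue_lock is expected to
make forward progress and release it.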
This simplifies blkg locking - there are no half-dead blkgs to worry
about. The now-unnecessary RCU annotations will be removed by the
next patch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block')

-rw-r--r--  block/blk-cgroup.c  136
-rw-r--r--  block/blk-cgroup.h    4
-rw-r--r--  block/cfq.h          10

3 files changed, 51 insertions(+), 99 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index cad5f15cf49b..e9e3b038c702 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -620,32 +620,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
-static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	hlist_del_init_rcu(&blkg->blkcg_node);
-}
-
-/*
- * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
- * indicating that blk_group was unhashed by the time we got to it.
- */
-int blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	struct blkio_cgroup *blkcg = blkg->blkcg;
-	unsigned long flags;
-	int ret = 1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	if (!hlist_unhashed(&blkg->blkcg_node)) {
-		__blkiocg_del_blkio_group(blkg);
-		ret = 0;
-	}
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
-
 /* called under rcu_read_lock(). */
 struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				struct request_queue *q)
@@ -663,12 +637,16 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
 static void blkg_destroy(struct blkio_group *blkg)
 {
 	struct request_queue *q = blkg->q;
+	struct blkio_cgroup *blkcg = blkg->blkcg;
 
 	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&blkcg->lock);
 
 	/* Something wrong if we are trying to remove same group twice */
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
+	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 	list_del_init(&blkg->q_node);
+	hlist_del_init_rcu(&blkg->blkcg_node);
 
 	WARN_ON_ONCE(q->nr_blkgs <= 0);
 	q->nr_blkgs--;
@@ -713,45 +691,33 @@ void update_root_blkg_pd(struct request_queue *q, enum blkio_policy_id plid)
 }
 EXPORT_SYMBOL_GPL(update_root_blkg_pd);
 
+/**
+ * blkg_destroy_all - destroy all blkgs associated with a request_queue
+ * @q: request_queue of interest
+ * @destroy_root: whether to destroy root blkg or not
+ *
+ * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
+ * destroyed; otherwise, root blkg is left alone.
+ */
 void blkg_destroy_all(struct request_queue *q, bool destroy_root)
 {
 	struct blkio_group *blkg, *n;
 
-	while (true) {
-		bool done = true;
-
-		spin_lock_irq(q->queue_lock);
-
-		list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
-			/* skip root? */
-			if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
-				continue;
-
-			/*
-			 * If cgroup removal path got to blk_group first
-			 * and removed it from cgroup list, then it will
-			 * take care of destroying cfqg also.
-			 */
-			if (!blkiocg_del_blkio_group(blkg))
-				blkg_destroy(blkg);
-			else
-				done = false;
-		}
+	spin_lock_irq(q->queue_lock);
 
-		spin_unlock_irq(q->queue_lock);
+	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+		struct blkio_cgroup *blkcg = blkg->blkcg;
 
-		/*
-		 * Group list may not be empty if we raced cgroup removal
-		 * and lost.  cgroup removal is guaranteed to make forward
-		 * progress and retrying after a while is enough.  This
-		 * ugliness is scheduled to be removed after locking
-		 * update.
-		 */
-		if (done)
-			break;
+		/* skip root? */
+		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
+			continue;
 
-		msleep(10);	/* just some random duration I like */
+		spin_lock(&blkcg->lock);
+		blkg_destroy(blkg);
+		spin_unlock(&blkcg->lock);
 	}
+
+	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blkg_destroy_all);
 
@@ -1600,45 +1566,45 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 			ARRAY_SIZE(blkio_files));
 }
 
+/**
+ * blkiocg_pre_destroy - cgroup pre_destroy callback
+ * @subsys: cgroup subsys
+ * @cgroup: cgroup of interest
+ *
+ * This function is called when @cgroup is about to go away and responsible
+ * for shooting down all blkgs associated with @cgroup.  blkgs should be
+ * removed while holding both q and blkcg locks.  As blkcg lock is nested
+ * inside q lock, this function performs reverse double lock dancing.
+ *
+ * This is the blkcg counterpart of ioc_release_fn().
+ */
 static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 			       struct cgroup *cgroup)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	unsigned long flags;
-	struct blkio_group *blkg;
-	struct request_queue *q;
 
 	rcu_read_lock();
+	spin_lock_irq(&blkcg->lock);
 
-	do {
-		spin_lock_irqsave(&blkcg->lock, flags);
+	while (!hlist_empty(&blkcg->blkg_list)) {
+		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
+						struct blkio_group, blkcg_node);
+		struct request_queue *q = rcu_dereference(blkg->q);
 
-		if (hlist_empty(&blkcg->blkg_list)) {
-			spin_unlock_irqrestore(&blkcg->lock, flags);
-			break;
+		if (spin_trylock(q->queue_lock)) {
+			blkg_destroy(blkg);
+			spin_unlock(q->queue_lock);
+		} else {
+			spin_unlock_irq(&blkcg->lock);
+			rcu_read_unlock();
+			cpu_relax();
+			rcu_read_lock();
+			spin_lock(&blkcg->lock);
 		}
+	}
 
-		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
-				   blkcg_node);
-		q = rcu_dereference(blkg->q);
-		__blkiocg_del_blkio_group(blkg);
-
-		spin_unlock_irqrestore(&blkcg->lock, flags);
-
-		/*
-		 * This blkio_group is being unlinked as associated cgroup is
-		 * going away. Let all the IO controlling policies know about
-		 * this event.
-		 */
-		spin_lock(&blkio_list_lock);
-		spin_lock_irqsave(q->queue_lock, flags);
-		blkg_destroy(blkg);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		spin_unlock(&blkio_list_lock);
-	} while (1);
-
+	spin_unlock_irq(&blkcg->lock);
 	rcu_read_unlock();
-
 	return 0;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 6e8ee86a2870..df73040a6a5f 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -380,7 +380,6 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
-extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				       struct request_queue *q);
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
@@ -416,9 +415,6 @@ cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 static inline struct blkio_cgroup *
 task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
 
-static inline int
-blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
-
 static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 					      void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
diff --git a/block/cfq.h b/block/cfq.h
index 5584e1b63ca8..c8b15ef57e5d 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -79,11 +79,6 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 							direction, sync);
 }
 
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return blkiocg_del_blkio_group(blkg);
-}
-
 #else /* CFQ_GROUP_IOSCHED */
 static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
 						   struct blkio_policy_type *pol,
@@ -119,10 +114,5 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, uint64_t start_time,
 			uint64_t io_start_time, bool direction, bool sync) { }
 
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return 0;
-}
-
 #endif /* CFQ_GROUP_IOSCHED */
 #endif