author     Vivek Goyal <vgoyal@redhat.com>        2011-05-19 15:38:23 -0400
committer  Jens Axboe <jaxboe@fusionio.com>       2011-05-20 14:34:52 -0400
commit     f469a7b4d5b1d1d053200a9015fd25d59c057f49
tree       2c68c0689e40955b186e350b15d44d0b260f4655 /block
parent     56edf7d75db5b14d628b46623c414ffbeed68d7f
blk-cgroup: Allow sleeping while dynamically allocating a group
Currently, all cfq_group and throtl_group allocations happen while we are holding ->queue_lock, where sleeping is not allowed.

Soon we will move to per-cpu stats and will also need to allocate the per-group stats. Since alloc_percpu() can sleep, it cannot be called from atomic context, so we need to drop ->queue_lock, allocate the group, retake the lock and continue processing.

In the throttling code, the queue DEAD flag is checked again after reacquiring the lock to make sure the driver did not call blk_cleanup_queue() in the meantime.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
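The locking dance described above (look up under the lock, drop the lock for the sleeping allocation, retake the lock, then re-check both for a racing allocator and for teardown) is easy to get subtly wrong, so here is a minimal userspace sketch of the same idea. It uses pthreads and invented names (struct cache, cache_get); none of this appears in the kernel sources, and allocation failure is simply reported rather than falling back to a root group as the patch does.

/*
 * Userspace sketch of the "drop lock, allocate, retake lock, re-check"
 * pattern. All names are illustrative and hypothetical; build with:
 *   cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct cache {
        pthread_mutex_t lock;   /* stand-in for ->queue_lock */
        int dead;               /* stand-in for QUEUE_FLAG_DEAD */
        void *group;            /* lazily allocated per-group object */
};

/* Return the group, or NULL if the owner was torn down or alloc failed. */
static void *cache_get(struct cache *c)
{
        void *g, *newg;

        pthread_mutex_lock(&c->lock);
        g = c->group;
        pthread_mutex_unlock(&c->lock);
        if (g)
                return g;       /* fast path: already allocated */

        /* The allocation may sleep, so do it with no lock held. */
        newg = malloc(128);
        if (!newg)
                return NULL;    /* the patch falls back to the root group here */

        pthread_mutex_lock(&c->lock);
        if (c->dead) {          /* re-check: owner was torn down meanwhile */
                pthread_mutex_unlock(&c->lock);
                free(newg);
                return NULL;
        }
        if (c->group)           /* re-check: another thread won the race */
                free(newg);
        else
                c->group = newg;        /* we won: publish our allocation */
        g = c->group;
        pthread_mutex_unlock(&c->lock);
        return g;
}

int main(void)
{
        struct cache c = { .lock = PTHREAD_MUTEX_INITIALIZER };

        printf("group at %p\n", cache_get(&c));
        free(c.group);
        return 0;
}

The diffs below apply exactly this shape to throtl_get_tg() and cfq_get_cfqg(); the throttling side additionally pins the queue with blk_get_queue()/blk_put_queue() around the allocation and returns ERR_PTR(-ENODEV) if the queue died, while the CFQ side assumes the queue stays alive.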
Diffstat (limited to 'block')
 -rw-r--r--  block/blk-core.c     |   3
 -rw-r--r--  block/blk-throttle.c | 141
 -rw-r--r--  block/cfq-iosched.c  | 128
 3 files changed, 205 insertions(+), 67 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 3fe00a14822a..9e8e297374b9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1550,7 +1550,8 @@ static inline void __generic_make_request(struct bio *bio)
                         goto end_io;
                 }
 
-                blk_throtl_bio(q, &bio);
+                if (blk_throtl_bio(q, &bio))
+                        goto end_io;
 
                 /*
                  * If bio = NULL, bio has been throttled and will be submitted
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index fa9a900c1254..c201967b33cd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -188,8 +188,40 @@ throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
         td->nr_undestroyed_grps++;
 }
 
-static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
-                        struct blkio_cgroup *blkcg)
+static void throtl_init_add_tg_lists(struct throtl_data *td,
+                        struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+{
+        struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+        unsigned int major, minor;
+
+        /* Add group onto cgroup list */
+        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+        blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
+                                MKDEV(major, minor), BLKIO_POLICY_THROTL);
+
+        tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
+        tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
+        tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
+        tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
+
+        throtl_add_group_to_td_list(td, tg);
+}
+
+/* Should be called without queue lock and outside of rcu period */
+static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+{
+        struct throtl_grp *tg = NULL;
+
+        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+        if (!tg)
+                return NULL;
+
+        throtl_init_group(tg);
+        return tg;
+}
+
+static struct
+throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
         struct throtl_grp *tg = NULL;
         void *key = td;
@@ -197,12 +229,6 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
         unsigned int major, minor;
 
         /*
-         * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
-         * tree of blkg (instead of traversing through hash list all
-         * the time.
-         */
-
-        /*
          * This is the common case when there are no blkio cgroups.
          * Avoid lookup in this case
          */
@@ -215,43 +241,83 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
         if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
                 tg->blkg.dev = MKDEV(major, minor);
-                goto done;
         }
 
-        if (tg)
-                goto done;
-
-        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-        if (!tg)
-                goto done;
-
-        throtl_init_group(tg);
-
-        /* Add group onto cgroup list */
-        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-        blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
-                                MKDEV(major, minor), BLKIO_POLICY_THROTL);
-
-        tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
-        tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
-        tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
-        tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
-
-        throtl_add_group_to_td_list(td, tg);
-done:
         return tg;
 }
 
+/*
+ * This function returns with queue lock unlocked in case of error, like
+ * request queue is no more
+ */
 static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 {
-        struct throtl_grp *tg = NULL;
+        struct throtl_grp *tg = NULL, *__tg = NULL;
         struct blkio_cgroup *blkcg;
+        struct request_queue *q = td->queue;
 
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-        tg = throtl_find_alloc_tg(td, blkcg);
-        if (!tg)
+        tg = throtl_find_tg(td, blkcg);
+        if (tg) {
+                rcu_read_unlock();
+                return tg;
+        }
+
+        /*
+         * Need to allocate a group. Allocation of group also needs allocation
+         * of per cpu stats which in-turn takes a mutex() and can block. Hence
+         * we need to drop rcu lock and queue_lock before we call alloc
+         *
+         * Take the request queue reference to make sure queue does not
+         * go away once we return from allocation.
+         */
+        blk_get_queue(q);
+        rcu_read_unlock();
+        spin_unlock_irq(q->queue_lock);
+
+        tg = throtl_alloc_tg(td);
+        /*
+         * We might have slept in group allocation. Make sure queue is not
+         * dead
+         */
+        if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+                blk_put_queue(q);
+                if (tg)
+                        kfree(tg);
+
+                return ERR_PTR(-ENODEV);
+        }
+        blk_put_queue(q);
+
+        /* Group allocated and queue is still alive. take the lock */
+        spin_lock_irq(q->queue_lock);
+
+        /*
+         * Initialize the new group. After sleeping, read the blkcg again.
+         */
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+
+        /*
+         * If some other thread already allocated the group while we were
+         * not holding queue lock, free up the group
+         */
+        __tg = throtl_find_tg(td, blkcg);
+
+        if (__tg) {
+                kfree(tg);
+                rcu_read_unlock();
+                return __tg;
+        }
+
+        /* Group allocation failed. Account the IO to root group */
+        if (!tg) {
                 tg = &td->root_tg;
+                return tg;
+        }
+
+        throtl_init_add_tg_lists(td, tg, blkcg);
         rcu_read_unlock();
         return tg;
 }
@@ -1014,6 +1080,15 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
         spin_lock_irq(q->queue_lock);
         tg = throtl_get_tg(td);
 
+        if (IS_ERR(tg)) {
+                if (PTR_ERR(tg) == -ENODEV) {
+                        /*
+                         * Queue is gone. No queue lock held here.
+                         */
+                        return -ENODEV;
+                }
+        }
+
         if (tg->nr_queued[rw]) {
                 /*
                  * There is already another bio queued in same dir. No
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e2e6719832e1..606020fe93f3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1016,28 +1016,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
         cfqg->needs_update = true;
 }
 
-static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
-                        struct blkio_cgroup *blkcg)
+static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
+                        struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
 {
-        struct cfq_group *cfqg = NULL;
-        void *key = cfqd;
-        int i, j;
-        struct cfq_rb_root *st;
         struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
         unsigned int major, minor;
 
-        cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-        if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+        /*
+         * Add group onto cgroup list. It might happen that bdi->dev is
+         * not initialized yet. Initialize this new group without major
+         * and minor info and this info will be filled in once a new thread
+         * comes for IO.
+         */
+        if (bdi->dev) {
                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-                cfqg->blkg.dev = MKDEV(major, minor);
-                goto done;
-        }
-        if (cfqg)
-                goto done;
+                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                        (void *)cfqd, MKDEV(major, minor));
+        } else
+                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                        (void *)cfqd, 0);
+
+        cfqd->nr_blkcg_linked_grps++;
+        cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+
+        /* Add group on cfqd list */
+        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+/*
+ * Should be called from sleepable context. No request queue lock as per
+ * cpu stats are allocated dynamically and alloc_percpu needs to be called
+ * from sleepable context.
+ */
+static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
+{
+        struct cfq_group *cfqg = NULL;
+        int i, j;
+        struct cfq_rb_root *st;
 
         cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
         if (!cfqg)
-                goto done;
+                return NULL;
 
         for_each_cfqg_st(cfqg, i, j, st)
                 *st = CFQ_RB_ROOT;
@@ -1050,28 +1069,31 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
          * or cgroup deletion path depending on who is exiting first.
          */
         cfqg->ref = 1;
+        return cfqg;
+}
+
+static struct cfq_group *
+cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+{
+        struct cfq_group *cfqg = NULL;
+        void *key = cfqd;
+        struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+        unsigned int major, minor;
 
         /*
-         * Add group onto cgroup list. It might happen that bdi->dev is
-         * not initialized yet. Initialize this new group without major
-         * and minor info and this info will be filled in once a new thread
-         * comes for IO. See code above.
+         * This is the common case when there are no blkio cgroups.
+         * Avoid lookup in this case
          */
-        if (bdi->dev) {
-                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                        MKDEV(major, minor));
-        } else
-                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                        0);
-
-        cfqd->nr_blkcg_linked_grps++;
-        cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+        if (blkcg == &blkio_root_cgroup)
+                cfqg = &cfqd->root_group;
+        else
+                cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
 
-        /* Add group on cfqd list */
-        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+        if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+                cfqg->blkg.dev = MKDEV(major, minor);
+        }
 
-done:
         return cfqg;
 }
 
@@ -1082,13 +1104,53 @@ done:
 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 {
         struct blkio_cgroup *blkcg;
-        struct cfq_group *cfqg = NULL;
+        struct cfq_group *cfqg = NULL, *__cfqg = NULL;
+        struct request_queue *q = cfqd->queue;
+
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+        cfqg = cfq_find_cfqg(cfqd, blkcg);
+        if (cfqg) {
+                rcu_read_unlock();
+                return cfqg;
+        }
+
+        /*
+         * Need to allocate a group. Allocation of group also needs allocation
+         * of per cpu stats which in-turn takes a mutex() and can block. Hence
+         * we need to drop rcu lock and queue_lock before we call alloc.
+         *
+         * Not taking any queue reference here and assuming that queue is
+         * around by the time we return. CFQ queue allocation code does
+         * the same. It might be racy though.
+         */
+
+        rcu_read_unlock();
+        spin_unlock_irq(q->queue_lock);
+
+        cfqg = cfq_alloc_cfqg(cfqd);
+
+        spin_lock_irq(q->queue_lock);
 
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-        cfqg = cfq_find_alloc_cfqg(cfqd, blkcg);
+
+        /*
+         * If some other thread already allocated the group while we were
+         * not holding queue lock, free up the group
+         */
+        __cfqg = cfq_find_cfqg(cfqd, blkcg);
+
+        if (__cfqg) {
+                kfree(cfqg);
+                rcu_read_unlock();
+                return __cfqg;
+        }
+
         if (!cfqg)
                 cfqg = &cfqd->root_group;
+
+        cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
         rcu_read_unlock();
         return cfqg;
 }