Diffstat (limited to 'block/blk-throttle.c')

 block/blk-throttle.c | 313
 1 file changed, 237 insertions(+), 76 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 252a81a306f7..a62be8d0dc1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -78,6 +78,8 @@ struct throtl_grp {
 
         /* Some throttle limits got updated for the group */
         int limits_changed;
+
+        struct rcu_head rcu_head;
 };
 
 struct throtl_data
@@ -88,7 +90,7 @@ struct throtl_data
         /* service tree for active throtl groups */
         struct throtl_rb_root tg_service_tree;
 
-        struct throtl_grp root_tg;
+        struct throtl_grp *root_tg;
         struct request_queue *queue;
 
         /* Total Number of queued bios on READ and WRITE lists */
@@ -151,56 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
         return tg;
 }
 
-static void throtl_put_tg(struct throtl_grp *tg)
+static void throtl_free_tg(struct rcu_head *head)
 {
-        BUG_ON(atomic_read(&tg->ref) <= 0);
-        if (!atomic_dec_and_test(&tg->ref))
-                return;
+        struct throtl_grp *tg;
+
+        tg = container_of(head, struct throtl_grp, rcu_head);
+        free_percpu(tg->blkg.stats_cpu);
         kfree(tg);
 }
 
-static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
-                        struct blkio_cgroup *blkcg)
+static void throtl_put_tg(struct throtl_grp *tg)
 {
-        struct throtl_grp *tg = NULL;
-        void *key = td;
-        struct backing_dev_info *bdi = &td->queue->backing_dev_info;
-        unsigned int major, minor;
+        BUG_ON(atomic_read(&tg->ref) <= 0);
+        if (!atomic_dec_and_test(&tg->ref))
+                return;
 
         /*
-         * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
-         * tree of blkg (instead of traversing through hash list all
-         * the time.
+         * A group is freed in rcu manner. But having an rcu lock does not
+         * mean that one can access all the fields of blkg and assume these
+         * are valid. For example, don't try to follow throtl_data and
+         * request queue links.
+         *
+         * Having a reference to blkg under an rcu allows acess to only
+         * values local to groups like group stats and group rate limits
         */
+        call_rcu(&tg->rcu_head, throtl_free_tg);
+}
 
-        /*
-         * This is the common case when there are no blkio cgroups.
-         * Avoid lookup in this case
-         */
-        if (blkcg == &blkio_root_cgroup)
-                tg = &td->root_tg;
-        else
-                tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
-
-        /* Fill in device details for root group */
-        if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-                tg->blkg.dev = MKDEV(major, minor);
-                goto done;
-        }
-
-        if (tg)
-                goto done;
-
-        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-        if (!tg)
-                goto done;
-
+static void throtl_init_group(struct throtl_grp *tg)
+{
         INIT_HLIST_NODE(&tg->tg_node);
         RB_CLEAR_NODE(&tg->rb_node);
         bio_list_init(&tg->bio_lists[0]);
         bio_list_init(&tg->bio_lists[1]);
-        td->limits_changed = false;
+        tg->limits_changed = false;
+
+        /* Practically unlimited BW */
+        tg->bps[0] = tg->bps[1] = -1;
+        tg->iops[0] = tg->iops[1] = -1;
 
         /*
          * Take the initial reference that will be released on destroy
@@ -209,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
          * exit or cgroup deletion path depending on who is exiting first.
          */
         atomic_set(&tg->ref, 1);
+}
+
+/* Should be called with rcu read lock held (needed for blkcg) */
+static void
+throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
+{
+        hlist_add_head(&tg->tg_node, &td->tg_list);
+        td->nr_undestroyed_grps++;
+}
+
+static void
+__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+        struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+        unsigned int major, minor;
+
+        if (!tg || tg->blkg.dev)
+                return;
+
+        /*
+         * Fill in device details for a group which might not have been
+         * filled at group creation time as queue was being instantiated
+         * and driver had not attached a device yet
+         */
+        if (bdi->dev && dev_name(bdi->dev)) {
+                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+                tg->blkg.dev = MKDEV(major, minor);
+        }
+}
+
+/*
+ * Should be called with without queue lock held. Here queue lock will be
+ * taken rarely. It will be taken only once during life time of a group
+ * if need be
+ */
+static void
+throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+        if (!tg || tg->blkg.dev)
+                return;
+
+        spin_lock_irq(td->queue->queue_lock);
+        __throtl_tg_fill_dev_details(td, tg);
+        spin_unlock_irq(td->queue->queue_lock);
+}
+
+static void throtl_init_add_tg_lists(struct throtl_data *td,
+                        struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+{
+        __throtl_tg_fill_dev_details(td, tg);
 
         /* Add group onto cgroup list */
-        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
         blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
-                                MKDEV(major, minor), BLKIO_POLICY_THROTL);
+                                tg->blkg.dev, BLKIO_POLICY_THROTL);
 
         tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
         tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
         tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
         tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
 
-        hlist_add_head(&tg->tg_node, &td->tg_list);
-        td->nr_undestroyed_grps++;
-done:
+        throtl_add_group_to_td_list(td, tg);
+}
+
+/* Should be called without queue lock and outside of rcu period */
+static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+{
+        struct throtl_grp *tg = NULL;
+        int ret;
+
+        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+        if (!tg)
+                return NULL;
+
+        ret = blkio_alloc_blkg_stats(&tg->blkg);
+
+        if (ret) {
+                kfree(tg);
+                return NULL;
+        }
+
+        throtl_init_group(tg);
         return tg;
 }
 
-static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+static struct
+throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
         struct throtl_grp *tg = NULL;
+        void *key = td;
+
+        /*
+         * This is the common case when there are no blkio cgroups.
+         * Avoid lookup in this case
+         */
+        if (blkcg == &blkio_root_cgroup)
+                tg = td->root_tg;
+        else
+                tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+        __throtl_tg_fill_dev_details(td, tg);
+        return tg;
+}
+
+/*
+ * This function returns with queue lock unlocked in case of error, like
+ * request queue is no more
+ */
+static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+{
+        struct throtl_grp *tg = NULL, *__tg = NULL;
         struct blkio_cgroup *blkcg;
+        struct request_queue *q = td->queue;
 
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-        tg = throtl_find_alloc_tg(td, blkcg);
-        if (!tg)
-                tg = &td->root_tg;
+        tg = throtl_find_tg(td, blkcg);
+        if (tg) {
+                rcu_read_unlock();
+                return tg;
+        }
+
+        /*
+         * Need to allocate a group. Allocation of group also needs allocation
+         * of per cpu stats which in-turn takes a mutex() and can block. Hence
+         * we need to drop rcu lock and queue_lock before we call alloc
+         *
+         * Take the request queue reference to make sure queue does not
+         * go away once we return from allocation.
+         */
+        blk_get_queue(q);
+        rcu_read_unlock();
+        spin_unlock_irq(q->queue_lock);
+
+        tg = throtl_alloc_tg(td);
+        /*
+         * We might have slept in group allocation. Make sure queue is not
+         * dead
+         */
+        if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+                blk_put_queue(q);
+                if (tg)
+                        kfree(tg);
+
+                return ERR_PTR(-ENODEV);
+        }
+        blk_put_queue(q);
+
+        /* Group allocated and queue is still alive. take the lock */
+        spin_lock_irq(q->queue_lock);
+
+        /*
+         * Initialize the new group. After sleeping, read the blkcg again.
+         */
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+
+        /*
+         * If some other thread already allocated the group while we were
+         * not holding queue lock, free up the group
+         */
+        __tg = throtl_find_tg(td, blkcg);
+
+        if (__tg) {
+                kfree(tg);
+                rcu_read_unlock();
+                return __tg;
+        }
+
+        /* Group allocation failed. Account the IO to root group */
+        if (!tg) {
+                tg = td->root_tg;
+                return tg;
+        }
+
+        throtl_init_add_tg_lists(td, tg, blkcg);
         rcu_read_unlock();
         return tg;
 }
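
The new throtl_get_tg() above follows a pattern worth calling out: because allocating the per-cpu stats can sleep, it drops both the rcu read lock and the queue lock, allocates, retakes the queue lock, and then rechecks whether another task installed the group in the meantime, discarding its own copy if it lost the race. The userspace sketch below is illustrative only and not part of the patch; the names get_group and cached are invented, and a pthread mutex stands in for the queue lock.

/*
 * Illustrative userspace analogue of the "drop lock, allocate, relock,
 * recheck" pattern used by throtl_get_tg().  Build with -lpthread.
 */
#include <pthread.h>
#include <stdlib.h>

struct group { int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct group *cached;          /* shared pointer, protected by 'lock' */

struct group *get_group(int id)
{
        struct group *g, *tmp;

        pthread_mutex_lock(&lock);
        if (cached) {                 /* fast path: group already exists */
                g = cached;
                pthread_mutex_unlock(&lock);
                return g;
        }
        pthread_mutex_unlock(&lock);  /* allocation may block: drop the lock */

        tmp = malloc(sizeof(*tmp));
        if (!tmp)
                return NULL;
        tmp->id = id;

        pthread_mutex_lock(&lock);
        if (cached) {                 /* lost the race: free our copy */
                free(tmp);
                g = cached;
        } else {
                cached = tmp;         /* won the race: install the group */
                g = tmp;
        }
        pthread_mutex_unlock(&lock);
        return g;
}

The freeing side in the patch is the mirror image of this: throtl_put_tg() defers the actual kfree() through call_rcu(), so a reader that picked up the group pointer under rcu_read_lock() never sees freed memory.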
@@ -544,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
         return 0;
 }
 
+static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
+        if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
+                return 1;
+        return 0;
+}
+
 /*
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
@@ -608,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
         tg->bytes_disp[rw] += bio->bi_size;
         tg->io_disp[rw]++;
 
-        /*
-         * TODO: This will take blkg->stats_lock. Figure out a way
-         * to avoid this cost.
-         */
         blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
 }
 
@@ -989,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
         struct throtl_grp *tg;
         struct bio *bio = *biop;
         bool rw = bio_data_dir(bio), update_disptime = true;
+        struct blkio_cgroup *blkcg;
 
         if (bio->bi_rw & REQ_THROTTLED) {
                 bio->bi_rw &= ~REQ_THROTTLED;
                 return 0;
         }
 
+        /*
+         * A throtl_grp pointer retrieved under rcu can be used to access
+         * basic fields like stats and io rates. If a group has no rules,
+         * just update the dispatch stats in lockless manner and return.
+         */
+
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+        tg = throtl_find_tg(td, blkcg);
+        if (tg) {
+                throtl_tg_fill_dev_details(td, tg);
+
+                if (tg_no_rule_group(tg, rw)) {
+                        blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
+                                rw, bio->bi_rw & REQ_SYNC);
+                        rcu_read_unlock();
+                        return 0;
+                }
+        }
+        rcu_read_unlock();
+
+        /*
+         * Either group has not been allocated yet or it is not an unlimited
+         * IO group
+         */
+
         spin_lock_irq(q->queue_lock);
         tg = throtl_get_tg(td);
 
+        if (IS_ERR(tg)) {
+                if (PTR_ERR(tg) == -ENODEV) {
+                        /*
+                         * Queue is gone. No queue lock held here.
+                         */
+                        return -ENODEV;
+                }
+        }
+
         if (tg->nr_queued[rw]) {
                 /*
                  * There is already another bio queued in same dir. No
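
The blk_throtl_bio() hunk above adds a lockless fast path: if the group found under RCU has neither a bps nor an iops rule for this direction, the bio is only accounted in the dispatch stats and the queue lock is never taken. The sketch below conveys the same idea using C11 atomics in place of the kernel's per-cpu counters; it is not from the patch, and names such as tg_no_rule and account_fast_path are invented for illustration.

/* Minimal model of the "no rules -> lockless accounting" fast path. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct tg_stats {
        atomic_ullong bytes[2];       /* indexed by direction: READ=0, WRITE=1 */
        atomic_ullong ios[2];
};

struct tg_limits {
        uint64_t bps[2];              /* (uint64_t)-1 means "unlimited" */
        unsigned int iops[2];         /* (unsigned int)-1 means "unlimited" */
};

static bool tg_no_rule(const struct tg_limits *l, int rw)
{
        return l->bps[rw] == (uint64_t)-1 && l->iops[rw] == (unsigned int)-1;
}

/* Returns true if the bio was fully handled on the lockless path. */
static bool account_fast_path(struct tg_stats *s, const struct tg_limits *l,
                              int rw, unsigned int bytes)
{
        if (!tg_no_rule(l, rw))
                return false;         /* limits exist: take the slow, locked path */

        atomic_fetch_add(&s->bytes[rw], bytes);
        atomic_fetch_add(&s->ios[rw], 1);
        return true;
}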
@@ -1060,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q)
         INIT_HLIST_HEAD(&td->tg_list);
         td->tg_service_tree = THROTL_RB_ROOT;
         td->limits_changed = false;
+        INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 
-        /* Init root group */
-        tg = &td->root_tg;
-        INIT_HLIST_NODE(&tg->tg_node);
-        RB_CLEAR_NODE(&tg->rb_node);
-        bio_list_init(&tg->bio_lists[0]);
-        bio_list_init(&tg->bio_lists[1]);
-
-        /* Practically unlimited BW */
-        tg->bps[0] = tg->bps[1] = -1;
-        tg->iops[0] = tg->iops[1] = -1;
-        td->limits_changed = false;
+        /* alloc and Init root group. */
+        td->queue = q;
+        tg = throtl_alloc_tg(td);
 
-        /*
-         * Set root group reference to 2. One reference will be dropped when
-         * all groups on tg_list are being deleted during queue exit. Other
-         * reference will remain there as we don't want to delete this group
-         * as it is statically allocated and gets destroyed when throtl_data
-         * goes away.
-         */
-        atomic_set(&tg->ref, 2);
-        hlist_add_head(&tg->tg_node, &td->tg_list);
-        td->nr_undestroyed_grps++;
+        if (!tg) {
+                kfree(td);
+                return -ENOMEM;
+        }
 
-        INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+        td->root_tg = tg;
 
         rcu_read_lock();
-        blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
-                                        0, BLKIO_POLICY_THROTL);
+        throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
         rcu_read_unlock();
 
         /* Attach throtl data to request queue */
-        td->queue = q;
         q->td = td;
         return 0;
 }
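
With the root group now allocated dynamically, the ordering in blk_throtl_init() matters: td->queue must be set before throtl_alloc_tg() (which dereferences it), and a failed group allocation has to unwind the already-allocated throtl_data. The self-contained sketch below shows that two-stage init with cleanup as a userspace analogue; throttle_init, throttle_data, and root_group are invented names, not kernel APIs.

/* Two-stage initialization with unwind on partial failure. */
#include <errno.h>
#include <stdlib.h>

struct root_group { long bps[2]; };

struct throttle_data {
        void *queue;                    /* owner pointer, set before group alloc */
        struct root_group *root;
};

int throttle_init(void *queue, struct throttle_data **out)
{
        struct throttle_data *td;

        td = calloc(1, sizeof(*td));
        if (!td)
                return -ENOMEM;

        td->queue = queue;              /* group allocation may need the owner */
        td->root = calloc(1, sizeof(*td->root));
        if (!td->root) {
                free(td);               /* unwind the partially built object */
                return -ENOMEM;
        }

        /* "unlimited" defaults, mirroring tg->bps[0] = tg->bps[1] = -1 */
        td->root->bps[0] = td->root->bps[1] = -1;

        *out = td;
        return 0;
}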