author    Tejun Heo <tj@kernel.org>  2013-05-14 16:52:38 -0400
committer Tejun Heo <tj@kernel.org>  2013-05-14 16:52:38 -0400
commit    c5cc2070b45333f40a3f99319b83c8caeb62ec05 (patch)
tree      170f75af6b7cef304840e495c66f58c0bda5930e /block/blk-throttle.c
parent    2e48a530a3a7daebd0cc17866304a36d39b611de (diff)
blk-throttle: add throtl_qnode for dispatch fairness
With flat hierarchy, there's only a single level of dispatching happening and fairness beyond that point is the responsibility of the rest of the block layer and driver, which usually works out okay; however, with the planned hierarchy support, service_queue->bio_lists[] can be filled up by bios from a single source. While the limits would still be honored, it'd be very easy to starve IOs from siblings or children.

To avoid such starvation, this patch implements throtl_qnode and converts service_queue->bio_lists[] to lists of per-source qnodes which in turn contain the bios. For example, when a bio is dispatched from a child group, the bio doesn't get queued on ->bio_lists[] directly but first gets queued on the group's qnode, which in turn gets queued on service_queue->queued[]. When dispatching for the upper level, the ->queued[] list is consumed in round-robin order so that the dispatch window is consumed fairly by all IO sources.

There are two ways a bio can come to a throtl_grp - directly queued to the group or dispatched from a child. For the former, throtl_grp->qnode_on_self[rw] is used; for the latter, the child's ->qnode_on_parent[rw]. Note that this means that the child which is contributing a bio to its parent should stay pinned until all its bios are dispatched to its grandparent. This patch moves blkg refcounting from bio add/remove spots to qnode activation/deactivation so that the blkg containing an active qnode is always pinned. As a child pins its parent, this is sufficient for keeping the relevant sub-tree pinned while bios are in flight.

The starvation issue was spotted by Vivek Goyal.

v2: The original patch used the same throtl_grp->qnode_on_self/parent for reads and writes, causing bios to be queued incorrectly if there already are outstanding IOs in the other direction. They should be throtl_grp->qnode_on_self/parent[2] so that READs and WRITEs can use different qnodes. Spotted by Vivek Goyal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
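Not part of the commit: below is a minimal userspace sketch of the round-robin popping that throtl_pop_queued implements in the diff. The names (struct qnode, pop_queued) are hypothetical, strings stand in for bios, and a plain array replaces the list_head; the point is only to show that each source keeps its own node on the parent's queued list, a node that still has bios is rotated to the tail after every pop, and an emptied node is dropped (where the real code drops the blkg reference).

/*
 * Simplified userspace model of the qnode round-robin dispatch.
 * Illustrative only; not kernel API.
 */
#include <stdio.h>

#define MAX_BIOS 8

struct qnode {
	const char *src;            /* which group the bios came from */
	const char *bios[MAX_BIOS]; /* pending "bios" from that source */
	int head, tail;             /* simple FIFO indices */
};

/*
 * Pop one bio from the first qnode, then either drop the qnode (if it is
 * now empty) or rotate it to the tail so the next pop services another
 * source.
 */
static const char *pop_queued(struct qnode *queued[], int *nr)
{
	struct qnode *qn;
	const char *bio;
	int i;

	if (!*nr)
		return NULL;

	qn = queued[0];
	bio = qn->bios[qn->head++];

	/* shift the remaining qnodes forward */
	for (i = 1; i < *nr; i++)
		queued[i - 1] = queued[i];

	if (qn->head == qn->tail)
		(*nr)--;                /* source drained: drop it (kernel would blkg_put() here) */
	else
		queued[*nr - 1] = qn;   /* still has bios: move to the end, round-robin style */

	return bio;
}

int main(void)
{
	struct qnode a    = { "child-A", { "A1", "A2", "A3" }, 0, 3 };
	struct qnode b    = { "child-B", { "B1" }, 0, 1 };
	struct qnode self = { "self",    { "S1", "S2" }, 0, 2 };
	struct qnode *queued[] = { &a, &b, &self };
	int nr = 3;
	const char *bio;

	while ((bio = pop_queued(queued, &nr)))
		printf("%s ", bio);
	printf("\n");
	return 0;
}

Compiled and run, this prints "A1 B1 S1 A2 S2 A3": bios from the three sources are interleaved instead of one source draining the window first, which is the fairness property the qnode conversion provides.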
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r--  block/blk-throttle.c | 201
1 file changed, 176 insertions(+), 25 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index bc65077f6e43..541bd0dabb9a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -26,6 +26,35 @@ static struct blkcg_policy blkcg_policy_throtl;
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued.  When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * of a local or child group which can queue many bios at once filling up
+ * the list starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from.  When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's.  A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+	struct list_head	node;		/* service_queue->queued[] */
+	struct bio_list		bios;		/* queued bios */
+	struct throtl_grp	*tg;		/* tg this qnode belongs to */
+};
+
 struct throtl_service_queue {
 	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
 
@@ -33,7 +62,7 @@ struct throtl_service_queue {
 	 * Bios queued directly to this service_queue or dispatched from
 	 * children throtl_grp's.
 	 */
-	struct bio_list		bio_lists[2];	/* queued bios [READ/WRITE] */
+	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
 	unsigned int		nr_queued[2];	/* number of queued bios */
 
 	/*
@@ -76,6 +105,17 @@ struct throtl_grp {
 	struct throtl_service_queue service_queue;
 
 	/*
+	 * qnode_on_self is used when bios are directly queued to this
+	 * throtl_grp so that local bios compete fairly with bios
+	 * dispatched from children.  qnode_on_parent is used when bios are
+	 * dispatched from this throtl_grp into its parent and will compete
+	 * with the sibling qnode_on_parents and the parent's
+	 * qnode_on_self.
+	 */
+	struct throtl_qnode qnode_on_self[2];
+	struct throtl_qnode qnode_on_parent[2];
+
+	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
 	 * will unthrottle and is ready to dispatch more bio. It is used as
 	 * key to sort active groups in service tree.
@@ -250,12 +290,95 @@ alloc_stats:
 	goto alloc_stats;
 }
 
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+{
+	INIT_LIST_HEAD(&qn->node);
+	bio_list_init(&qn->bios);
+	qn->tg = tg;
+}
+
+/**
+ * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
+ * @bio: bio being added
+ * @qn: qnode to add bio to
+ * @queued: the service_queue->queued[] list @qn belongs to
+ *
+ * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * @qn->tg's reference count is bumped when @qn is activated.  See the
+ * comment on top of throtl_qnode definition for details.
+ */
+static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
+				 struct list_head *queued)
+{
+	bio_list_add(&qn->bios, bio);
+	if (list_empty(&qn->node)) {
+		list_add_tail(&qn->node, queued);
+		blkg_get(tg_to_blkg(qn->tg));
+	}
+}
+
+/**
+ * throtl_peek_queued - peek the first bio on a qnode list
+ * @queued: the qnode list to peek
+ */
+static struct bio *throtl_peek_queued(struct list_head *queued)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_peek(&qn->bios);
+	WARN_ON_ONCE(!bio);
+	return bio;
+}
+
+/**
+ * throtl_pop_queued - pop the first bio from a qnode list
+ * @queued: the qnode list to pop a bio from
+ * @tg_to_put: optional out argument for throtl_grp to put
+ *
+ * Pop the first bio from the qnode list @queued.  After popping, the first
+ * qnode is removed from @queued if empty or moved to the end of @queued so
+ * that the popping order is round-robin.
+ *
+ * When the first qnode is removed, its associated throtl_grp should be put
+ * too.  If @tg_to_put is NULL, this function automatically puts it;
+ * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
+ * responsible for putting it.
+ */
+static struct bio *throtl_pop_queued(struct list_head *queued,
+				     struct throtl_grp **tg_to_put)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_pop(&qn->bios);
+	WARN_ON_ONCE(!bio);
+
+	if (bio_list_empty(&qn->bios)) {
+		list_del_init(&qn->node);
+		if (tg_to_put)
+			*tg_to_put = qn->tg;
+		else
+			blkg_put(tg_to_blkg(qn->tg));
+	} else {
+		list_move_tail(&qn->node, queued);
+	}
+
+	return bio;
+}
+
 /* init a service_queue, assumes the caller zeroed it */
 static void throtl_service_queue_init(struct throtl_service_queue *sq,
 				      struct throtl_service_queue *parent_sq)
 {
-	bio_list_init(&sq->bio_lists[0]);
-	bio_list_init(&sq->bio_lists[1]);
+	INIT_LIST_HEAD(&sq->queued[0]);
+	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
 	sq->parent_sq = parent_sq;
 	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
@@ -272,8 +395,14 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	struct throtl_data *td = blkg->q->td;
 	unsigned long flags;
+	int rw;
 
 	throtl_service_queue_init(&tg->service_queue, &td->service_queue);
+	for (rw = READ; rw <= WRITE; rw++) {
+		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+	}
+
 	RB_CLEAR_NODE(&tg->rb_node);
 	tg->td = td;
 
@@ -715,7 +844,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * queued.
 	 */
 	BUG_ON(tg->service_queue.nr_queued[rw] &&
-	       bio != bio_list_peek(&tg->service_queue.bio_lists[rw]));
+	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 
 	/* If tg->bps = -1, then BW is unlimited */
 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -806,11 +935,24 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	}
 }
 
-static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
+/**
+ * throtl_add_bio_tg - add a bio to the specified throtl_grp
+ * @bio: bio to add
+ * @qn: qnode to use
+ * @tg: the target throtl_grp
+ *
+ * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
+ * tg->qnode_on_self[] is used.
+ */
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
+			      struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	bool rw = bio_data_dir(bio);
 
+	if (!qn)
+		qn = &tg->qnode_on_self[rw];
+
 	/*
 	 * If @tg doesn't currently have any bios queued in the same
 	 * direction, queueing @bio can change when @tg should be
@@ -820,9 +962,8 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
 	if (!sq->nr_queued[rw])
 		tg->flags |= THROTL_TG_WAS_EMPTY;
 
-	bio_list_add(&sq->bio_lists[rw], bio);
-	/* Take a bio reference on tg */
-	blkg_get(tg_to_blkg(tg));
+	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+
 	sq->nr_queued[rw]++;
 	throtl_enqueue_tg(tg);
 }
@@ -833,10 +974,10 @@ static void tg_update_disptime(struct throtl_grp *tg)
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 	struct bio *bio;
 
-	if ((bio = bio_list_peek(&sq->bio_lists[READ])))
+	if ((bio = throtl_peek_queued(&sq->queued[READ])))
 		tg_may_dispatch(tg, bio, &read_wait);
 
-	if ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 		tg_may_dispatch(tg, bio, &write_wait);
 
 	min_wait = min(read_wait, write_wait);
@@ -856,9 +997,16 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct throtl_service_queue *parent_sq = sq->parent_sq;
 	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
+	struct throtl_grp *tg_to_put = NULL;
 	struct bio *bio;
 
-	bio = bio_list_pop(&sq->bio_lists[rw]);
+	/*
+	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
+	 * from @tg may put its reference and @parent_sq might end up
+	 * getting released prematurely.  Remember the tg to put and put it
+	 * after @bio is transferred to @parent_sq.
+	 */
+	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
 	sq->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
@@ -871,17 +1019,18 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	 * responsible for issuing these bios.
 	 */
 	if (parent_tg) {
-		throtl_add_bio_tg(bio, parent_tg);
+		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
 	} else {
-		bio_list_add(&parent_sq->bio_lists[rw], bio);
+		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
+				     &parent_sq->queued[rw]);
 		BUG_ON(tg->td->nr_queued[rw] <= 0);
 		tg->td->nr_queued[rw]--;
 	}
 
 	throtl_trim_slice(tg, rw);
 
-	/* @bio is transferred to parent, drop its blkg reference */
-	blkg_put(tg_to_blkg(tg));
+	if (tg_to_put)
+		blkg_put(tg_to_blkg(tg_to_put));
 }
 
 static int throtl_dispatch_tg(struct throtl_grp *tg)
@@ -894,7 +1043,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 
 	/* Try to dispatch 75% READS and 25% WRITES */
 
-	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
@@ -904,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 			break;
 	}
 
-	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
@@ -1039,10 +1188,9 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	bio_list_init(&bio_list_on_stack);
 
 	spin_lock_irq(q->queue_lock);
-	for (rw = READ; rw <= WRITE; rw++) {
-		bio_list_merge(&bio_list_on_stack, &td_sq->bio_lists[rw]);
-		bio_list_init(&td_sq->bio_lists[rw]);
-	}
+	for (rw = READ; rw <= WRITE; rw++)
+		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+			bio_list_add(&bio_list_on_stack, bio);
 	spin_unlock_irq(q->queue_lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
@@ -1241,6 +1389,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
+	struct throtl_qnode *qn = NULL;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
@@ -1308,6 +1457,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		 * Climb up the ladder.  If we're already at the top, it
 		 * can be executed directly.
 		 */
+		qn = &tg->qnode_on_parent[rw];
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
 		if (!tg)
@@ -1323,7 +1473,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
-	throtl_add_bio_tg(bio, tg);
+	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
 
 	/*
@@ -1367,9 +1517,9 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
 
 		throtl_dequeue_tg(tg);
 
-		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
+		while ((bio = throtl_peek_queued(&sq->queued[READ])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
 	}
 }
@@ -1411,7 +1561,8 @@ void blk_throtl_drain(struct request_queue *q)
 
 	/* all bios now should be in td->service_queue, issue them */
 	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = bio_list_pop(&td->service_queue.bio_lists[rw])))
+		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+						NULL)))
 			generic_make_request(bio);
 
 	spin_lock_irq(q->queue_lock);