 block/blk-throttle.c | 201 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 176 insertions(+), 25 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index bc65077f6e43..541bd0dabb9a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -26,6 +26,35 @@ static struct blkcg_policy blkcg_policy_throtl;
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued.  When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * that a local or child group which can queue many bios at once will fill
+ * up the list, starving others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from.  When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's.  A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+	struct list_head	node;		/* service_queue->queued[] */
+	struct bio_list		bios;		/* queued bios */
+	struct throtl_grp	*tg;		/* tg this qnode belongs to */
+};
+
 struct throtl_service_queue {
 	struct throtl_service_queue *parent_sq;	/* the parent service_queue */
 
@@ -33,7 +62,7 @@ struct throtl_service_queue {
 	 * Bios queued directly to this service_queue or dispatched from
 	 * children throtl_grp's.
 	 */
-	struct bio_list		bio_lists[2];	/* queued bios [READ/WRITE] */
+	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
 	unsigned int		nr_queued[2];	/* number of queued bios */
 
 	/*
@@ -76,6 +105,17 @@ struct throtl_grp {
 	struct throtl_service_queue service_queue;
 
 	/*
+	 * qnode_on_self is used when bios are directly queued to this
+	 * throtl_grp so that local bios compete fairly with bios
+	 * dispatched from children.  qnode_on_parent is used when bios are
+	 * dispatched from this throtl_grp into its parent and will compete
+	 * with the sibling qnode_on_parents and the parent's
+	 * qnode_on_self.
+	 */
+	struct throtl_qnode qnode_on_self[2];
+	struct throtl_qnode qnode_on_parent[2];
+
+	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
 	 * will unthrottle and is ready to dispatch more bio. It is used as
 	 * key to sort active groups in service tree.
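A parent's queued[rw] list therefore holds at most one qnode_on_self (the parent's own bios) plus one qnode_on_parent per child that has dispatched upward, so each source gets exactly one round-robin slot. A hypothetical helper makes the selection rule explicit; it is not part of the patch, which open-codes the same choice in throtl_add_bio_tg() and tg_dispatch_one_bio() below:

	/* hypothetical, for illustration only -- not part of the patch */
	static struct throtl_qnode *tg_qnode_to_use(struct throtl_grp *bio_src_tg,
						    struct throtl_grp *target_tg,
						    bool rw)
	{
		if (bio_src_tg == target_tg)
			/* bio queued directly to its own group */
			return &target_tg->qnode_on_self[rw];
		/* bio dispatched upward from a child into @target_tg */
		return &bio_src_tg->qnode_on_parent[rw];
	}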
@@ -250,12 +290,95 @@ alloc_stats:
 		goto alloc_stats;
 }
 
+static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
+{
+	INIT_LIST_HEAD(&qn->node);
+	bio_list_init(&qn->bios);
+	qn->tg = tg;
+}
+
+/**
+ * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
+ * @bio: bio being added
+ * @qn: qnode to add bio to
+ * @queued: the service_queue->queued[] list @qn belongs to
+ *
+ * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * @qn->tg's reference count is bumped when @qn is activated.  See the
+ * comment on top of throtl_qnode definition for details.
+ */
+static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
+				 struct list_head *queued)
+{
+	bio_list_add(&qn->bios, bio);
+	if (list_empty(&qn->node)) {
+		list_add_tail(&qn->node, queued);
+		blkg_get(tg_to_blkg(qn->tg));
+	}
+}
+
+/**
+ * throtl_peek_queued - peek the first bio on a qnode list
+ * @queued: the qnode list to peek
+ */
+static struct bio *throtl_peek_queued(struct list_head *queued)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_peek(&qn->bios);
+	WARN_ON_ONCE(!bio);
+	return bio;
+}
+
+/**
+ * throtl_pop_queued - pop the first bio from a qnode list
+ * @queued: the qnode list to pop a bio from
+ * @tg_to_put: optional out argument for throtl_grp to put
+ *
+ * Pop the first bio from the qnode list @queued.  After popping, the first
+ * qnode is removed from @queued if empty or moved to the end of @queued so
+ * that the popping order is round-robin.
+ *
+ * When the first qnode is removed, its associated throtl_grp should be put
+ * too.  If @tg_to_put is NULL, this function automatically puts it;
+ * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
+ * responsible for putting it.
+ */
+static struct bio *throtl_pop_queued(struct list_head *queued,
+				     struct throtl_grp **tg_to_put)
+{
+	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+	struct bio *bio;
+
+	if (list_empty(queued))
+		return NULL;
+
+	bio = bio_list_pop(&qn->bios);
+	WARN_ON_ONCE(!bio);
+
+	if (bio_list_empty(&qn->bios)) {
+		list_del_init(&qn->node);
+		if (tg_to_put)
+			*tg_to_put = qn->tg;
+		else
+			blkg_put(tg_to_blkg(qn->tg));
+	} else {
+		list_move_tail(&qn->node, queued);
+	}
+
+	return bio;
+}
+
 /* init a service_queue, assumes the caller zeroed it */
 static void throtl_service_queue_init(struct throtl_service_queue *sq,
 				      struct throtl_service_queue *parent_sq)
 {
-	bio_list_init(&sq->bio_lists[0]);
-	bio_list_init(&sq->bio_lists[1]);
+	INIT_LIST_HEAD(&sq->queued[0]);
+	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
 	sq->parent_sq = parent_sq;
 	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
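The rotation in throtl_pop_queued() is the heart of the fairness guarantee. With qnode A holding bios a1 a2 a3 and qnode B holding b1 on the same queued[] list, successive pops yield a1, b1, a2, a3: A is moved to the tail after each pop while it stays non-empty, and B is deleted once emptied. Callers that only want to drain everything can pass NULL for @tg_to_put and let the function drop the blkg references itself — blk_throtl_drain() at the end of this patch uses exactly that pattern. A sketch, assuming queue_lock is held:

	/* sketch: drain both directions of a service_queue @sq */
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&sq->queued[rw], NULL)))
			generic_make_request(bio);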
@@ -272,8 +395,14 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	struct throtl_data *td = blkg->q->td;
 	unsigned long flags;
+	int rw;
 
 	throtl_service_queue_init(&tg->service_queue, &td->service_queue);
+	for (rw = READ; rw <= WRITE; rw++) {
+		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+	}
+
 	RB_CLEAR_NODE(&tg->rb_node);
 	tg->td = td;
 
@@ -715,7 +844,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	 * queued.
 	 */
 	BUG_ON(tg->service_queue.nr_queued[rw] &&
-	       bio != bio_list_peek(&tg->service_queue.bio_lists[rw]));
+	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
 
 	/* If tg->bps = -1, then BW is unlimited */
 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -806,11 +935,24 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	}
 }
 
-static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
+/**
+ * throtl_add_bio_tg - add a bio to the specified throtl_grp
+ * @bio: bio to add
+ * @qn: qnode to use
+ * @tg: the target throtl_grp
+ *
+ * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
+ * tg->qnode_on_self[] is used.
+ */
+static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
+			      struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	bool rw = bio_data_dir(bio);
 
+	if (!qn)
+		qn = &tg->qnode_on_self[rw];
+
 	/*
 	 * If @tg doesn't currently have any bios queued in the same
 	 * direction, queueing @bio can change when @tg should be
@@ -820,9 +962,8 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg)
 	if (!sq->nr_queued[rw])
 		tg->flags |= THROTL_TG_WAS_EMPTY;
 
-	bio_list_add(&sq->bio_lists[rw], bio);
-	/* Take a bio reference on tg */
-	blkg_get(tg_to_blkg(tg));
+	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+
 	sq->nr_queued[rw]++;
 	throtl_enqueue_tg(tg);
 }
@@ -833,10 +974,10 @@ static void tg_update_disptime(struct throtl_grp *tg)
 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
 	struct bio *bio;
 
-	if ((bio = bio_list_peek(&sq->bio_lists[READ])))
+	if ((bio = throtl_peek_queued(&sq->queued[READ])))
 		tg_may_dispatch(tg, bio, &read_wait);
 
-	if ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 		tg_may_dispatch(tg, bio, &write_wait);
 
 	min_wait = min(read_wait, write_wait);
@@ -856,9 +997,16 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct throtl_service_queue *parent_sq = sq->parent_sq;
 	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
+	struct throtl_grp *tg_to_put = NULL;
 	struct bio *bio;
 
-	bio = bio_list_pop(&sq->bio_lists[rw]);
+	/*
+	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
+	 * from @tg may put its reference and @parent_sq might end up
+	 * getting released prematurely.  Remember the tg to put and put it
+	 * after @bio is transferred to @parent_sq.
+	 */
+	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
 	sq->nr_queued[rw]--;
 
 	throtl_charge_bio(tg, bio);
@@ -871,17 +1019,18 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
 	 * responsible for issuing these bios.
 	 */
 	if (parent_tg) {
-		throtl_add_bio_tg(bio, parent_tg);
+		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
 	} else {
-		bio_list_add(&parent_sq->bio_lists[rw], bio);
+		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
+				     &parent_sq->queued[rw]);
 		BUG_ON(tg->td->nr_queued[rw] <= 0);
 		tg->td->nr_queued[rw]--;
 	}
 
 	throtl_trim_slice(tg, rw);
 
-	/* @bio is transferred to parent, drop its blkg reference */
-	blkg_put(tg_to_blkg(tg));
+	if (tg_to_put)
+		blkg_put(tg_to_blkg(tg_to_put));
 }
 
 static int throtl_dispatch_tg(struct throtl_grp *tg)
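The tg_to_put dance exists because of release ordering: popping the last bio off a qnode drops a throtl_grp reference, and that chain of references is what pins the blkg tree — including @parent_sq, which is embedded in the parent throtl_grp. A sketch of the hazard an eager put would create (this is what the patch avoids, not what it does):

	/* WRONG ordering, sketched for illustration */
	bio = throtl_pop_queued(&sq->queued[rw], NULL);	/* may blkg_put() right here */
	throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
			     &parent_sq->queued[rw]);	/* @parent_sq may already be gone */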
@@ -894,7 +1043,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 
 	/* Try to dispatch 75% READS and 25% WRITES */
 
-	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
@@ -904,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
 			break;
 	}
 
-	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
+	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
 	       tg_may_dispatch(tg, bio, NULL)) {
 
 		tg_dispatch_one_bio(tg, bio_data_dir(bio));
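For reference, the 75%/25% split comes from per-invocation dispatch quotas in throtl_dispatch_tg() that this patch leaves untouched; in the surrounding code (not shown in this diff) they are derived roughly as:

	unsigned int max_nr_reads = throtl_grp_quantum * 3 / 4;		/* 6 with the default quantum of 8 */
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;	/* 2 */

Each loop stops once its direction exhausts its quota, so a stream of reads cannot monopolize an entire dispatch round.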
@@ -1039,10 +1188,9 @@ void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	bio_list_init(&bio_list_on_stack);
 
 	spin_lock_irq(q->queue_lock);
-	for (rw = READ; rw <= WRITE; rw++) {
-		bio_list_merge(&bio_list_on_stack, &td_sq->bio_lists[rw]);
-		bio_list_init(&td_sq->bio_lists[rw]);
-	}
+	for (rw = READ; rw <= WRITE; rw++)
+		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+			bio_list_add(&bio_list_on_stack, bio);
 	spin_unlock_irq(q->queue_lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
@@ -1241,6 +1389,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
+	struct throtl_qnode *qn = NULL;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
@@ -1308,6 +1457,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 		 * Climb up the ladder.  If we're already at the top, it
 		 * can be executed directly.
 		 */
+		qn = &tg->qnode_on_parent[rw];
 		sq = sq->parent_sq;
 		tg = sq_to_tg(sq);
 		if (!tg)
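For context, this one-line addition sits inside blk_throtl_bio()'s walk up the hierarchy: while the bio is within limits at the current level it is charged and the walk continues upward, and @qn records which qnode_on_parent the bio should ride on if a higher level throttles it. The surrounding loop, elided from this hunk, looks approximately like:

	while (true) {
		if (sq->nr_queued[rw])			/* FIFO: bios already queued go first */
			break;
		if (!tg_may_dispatch(tg, bio, NULL))	/* over limit at this level: queue here */
			break;
		throtl_charge_bio(tg, bio);		/* within limits: charge and climb */
		throtl_trim_slice(tg, rw);
		qn = &tg->qnode_on_parent[rw];		/* the line added by this hunk */
		sq = sq->parent_sq;
		tg = sq_to_tg(sq);
		if (!tg)
			goto out_unlock;		/* reached the top: issue directly */
	}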
@@ -1323,7 +1473,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
-	throtl_add_bio_tg(bio, tg);
+	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
 
 	/*
@@ -1367,9 +1517,9 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
 
 		throtl_dequeue_tg(tg);
 
-		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
+		while ((bio = throtl_peek_queued(&sq->queued[READ])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
-		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
+		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
 			tg_dispatch_one_bio(tg, bio_data_dir(bio));
 	}
 }
@@ -1411,7 +1561,8 @@ void blk_throtl_drain(struct request_queue *q)
 
 	/* all bios now should be in td->service_queue, issue them */
 	for (rw = READ; rw <= WRITE; rw++)
-		while ((bio = bio_list_pop(&td->service_queue.bio_lists[rw])))
+		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+						NULL)))
 			generic_make_request(bio);
 
 	spin_lock_irq(q->queue_lock);