Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r-- | block/blk-throttle.c | 505
1 file changed, 200 insertions(+), 305 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b23193518ac7..c75a2636dd40 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
 
-/* Per-cpu group stats */
-struct tg_stats_cpu {
-	/* total bytes transferred */
-	struct blkg_rwstat service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat serviced;
-};
-
 struct throtl_grp {
 	/* must be the first member */
 	struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
-
-	/* Per cpu stats pointer */
-	struct tg_stats_cpu __percpu *stats_cpu;
-
-	/* List of tgs waiting for per cpu stats memory to be allocated */
-	struct list_head stats_alloc_node;
 };
 
 struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
 	struct work_struct dispatch_work;
 };
 
-/* list and work item to allocate percpu group stats */
-static DEFINE_SPINLOCK(tg_stats_alloc_lock);
-static LIST_HEAD(tg_stats_alloc_list);
-
-static void tg_stats_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
-
 static void throtl_pending_timer_fn(unsigned long arg);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 	return pd_to_blkg(&tg->pd);
 }
 
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
-	return blkg_to_tg(td->queue->root_blkg);
-}
-
 /**
  * sq_to_tg - return the throl_grp the specified service queue belongs to
  * @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 	}							\
 } while (0)
 
-static void tg_stats_init(struct tg_stats_cpu *tg_stats)
-{
-	blkg_rwstat_init(&tg_stats->service_bytes);
-	blkg_rwstat_init(&tg_stats->serviced);
-}
-
-/*
- * Worker for allocating per cpu stat for tgs. This is scheduled on the
- * system_wq once there are some groups on the alloc_list waiting for
- * allocation.
- */
-static void tg_stats_alloc_fn(struct work_struct *work)
-{
-	static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
-	struct delayed_work *dwork = to_delayed_work(work);
-	bool empty = false;
-
-alloc_stats:
-	if (!stats_cpu) {
-		int cpu;
-
-		stats_cpu = alloc_percpu(struct tg_stats_cpu);
-		if (!stats_cpu) {
-			/* allocation failed, try again after some time */
-			schedule_delayed_work(dwork, msecs_to_jiffies(10));
-			return;
-		}
-		for_each_possible_cpu(cpu)
-			tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
-	}
-
-	spin_lock_irq(&tg_stats_alloc_lock);
-
-	if (!list_empty(&tg_stats_alloc_list)) {
-		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
-							 struct throtl_grp,
-							 stats_alloc_node);
-		swap(tg->stats_cpu, stats_cpu);
-		list_del_init(&tg->stats_alloc_node);
-	}
-
-	empty = list_empty(&tg_stats_alloc_list);
-	spin_unlock_irq(&tg_stats_alloc_lock);
-	if (!empty)
-		goto alloc_stats;
-}
-
 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
 {
 	INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
 }
 
 /* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
-				      struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
 	INIT_LIST_HEAD(&sq->queued[0]);
 	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
-	sq->parent_sq = parent_sq;
 	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
 		    (unsigned long)sq);
 }
 
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
-	del_timer_sync(&sq->pending_timer);
+	struct throtl_grp *tg;
+	int rw;
+
+	tg = kzalloc_node(sizeof(*tg), gfp, node);
+	if (!tg)
+		return NULL;
+
+	throtl_service_queue_init(&tg->service_queue);
+
+	for (rw = READ; rw <= WRITE; rw++) {
+		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+	}
+
+	RB_CLEAR_NODE(&tg->rb_node);
+	tg->bps[READ] = -1;
+	tg->bps[WRITE] = -1;
+	tg->iops[READ] = -1;
+	tg->iops[WRITE] = -1;
+
+	return &tg->pd;
 }
 
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
 {
-	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct throtl_grp *tg = pd_to_tg(pd);
+	struct blkcg_gq *blkg = tg_to_blkg(tg);
 	struct throtl_data *td = blkg->q->td;
-	struct throtl_service_queue *parent_sq;
-	unsigned long flags;
-	int rw;
+	struct throtl_service_queue *sq = &tg->service_queue;
 
 	/*
 	 * If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	 * Limits of a group don't interact with limits of other groups
 	 * regardless of the position of the group in the hierarchy.
 	 */
-	parent_sq = &td->service_queue;
-
+	sq->parent_sq = &td->service_queue;
 	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
-		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
-	throtl_service_queue_init(&tg->service_queue, parent_sq);
-
-	for (rw = READ; rw <= WRITE; rw++) {
-		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
-	}
-
-	RB_CLEAR_NODE(&tg->rb_node);
+		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
-
-	tg->bps[READ] = -1;
-	tg->bps[WRITE] = -1;
-	tg->iops[READ] = -1;
-	tg->iops[WRITE] = -1;
-
-	/*
-	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
-	 * but percpu allocator can't be called from IO path. Queue tg on
-	 * tg_stats_alloc_list and allocate from work item.
-	 */
-	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
-	schedule_delayed_work(&tg_stats_alloc_work, 0);
-	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 }
 
 /*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 		(tg->bps[rw] != -1 || tg->iops[rw] != -1);
 }
 
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
 {
 	/*
 	 * We don't want new groups to escape the limits of its ancestors.
 	 * Update has_rules[] after a new group is brought online.
 	 */
-	tg_update_has_rules(blkg_to_tg(blkg));
-}
-
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-	unsigned long flags;
-
-	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-	list_del_init(&tg->stats_alloc_node);
-	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
-
-	free_percpu(tg->stats_cpu);
-
-	throtl_service_queue_exit(&tg->service_queue);
-}
-
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-	int cpu;
-
-	if (tg->stats_cpu == NULL)
-		return;
-
-	for_each_possible_cpu(cpu) {
-		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-		blkg_rwstat_reset(&sc->service_bytes);
-		blkg_rwstat_reset(&sc->serviced);
-	}
-}
-
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
-					   struct blkcg *blkcg)
-{
-	/*
-	 * This is the common case when there are no blkcgs. Avoid lookup
-	 * in this case
-	 */
-	if (blkcg == &blkcg_root)
-		return td_root_tg(td);
-
-	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
+	tg_update_has_rules(pd_to_tg(pd));
 }
 
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
-						  struct blkcg *blkcg)
+static void throtl_pd_free(struct blkg_policy_data *pd)
 {
-	struct request_queue *q = td->queue;
-	struct throtl_grp *tg = NULL;
-
-	/*
-	 * This is the common case when there are no blkcgs. Avoid lookup
-	 * in this case
-	 */
-	if (blkcg == &blkcg_root) {
-		tg = td_root_tg(td);
-	} else {
-		struct blkcg_gq *blkg;
-
-		blkg = blkg_lookup_create(blkcg, q);
-
-		/* if %NULL and @q is alive, fall back to root_tg */
-		if (!IS_ERR(blkg))
-			tg = blkg_to_tg(blkg);
-		else if (!blk_queue_dying(q))
-			tg = td_root_tg(td);
-	}
+	struct throtl_grp *tg = pd_to_tg(pd);
 
-	return tg;
+	del_timer_sync(&tg->service_queue.pending_timer);
+	kfree(tg);
 }
 
 static struct throtl_grp *
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
-					 int rw)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-	struct tg_stats_cpu *stats_cpu;
-	unsigned long flags;
-
-	/* If per cpu stats are not allocated yet, don't do any accounting. */
-	if (tg->stats_cpu == NULL)
-		return;
-
-	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
-	 */
-	local_irq_save(flags);
-
-	stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
-	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
-	local_irq_restore(flags);
-}
-
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	 * more than once as a throttled bio will go through blk-throtl the
 	 * second time when it eventually gets issued. Set it when a bio
 	 * is being charged to a tg.
-	 *
-	 * Dispatch stats aren't recursive and each @bio should only be
-	 * accounted by the @tg it was originally associated with. Let's
-	 * update the stats when setting REQ_THROTTLED for the first time
-	 * which is guaranteed to be for the @bio's original tg.
 	 */
-	if (!(bio->bi_rw & REQ_THROTTLED)) {
+	if (!(bio->bi_rw & REQ_THROTTLED))
 		bio->bi_rw |= REQ_THROTTLED;
-		throtl_update_dispatch_stats(tg_to_blkg(tg),
-					     bio->bi_iter.bi_size, bio->bi_rw);
-	}
 }
 
 /**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	}
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-				struct blkg_policy_data *pd, int off)
-{
-	struct throtl_grp *tg = pd_to_tg(pd);
-	struct blkg_rwstat rwstat = { }, tmp;
-	int i, cpu;
-
-	if (tg->stats_cpu == NULL)
-		return 0;
-
-	for_each_possible_cpu(cpu) {
-		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-		tmp = blkg_rwstat_read((void *)sc + off);
-		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			rwstat.cnt[i] += tmp.cnt[i];
-	}
-
-	return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
-			  &blkcg_policy_throtl, seq_cft(sf)->private, true);
-	return 0;
-}
-
 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
 			      int off)
 {
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }
 
-static ssize_t tg_set_conf(struct kernfs_open_file *of,
-			   char *buf, size_t nbytes, loff_t off, bool is_u64)
+static void tg_conf_updated(struct throtl_grp *tg)
 {
-	struct blkcg *blkcg = css_to_blkcg(of_css(of));
-	struct blkg_conf_ctx ctx;
-	struct throtl_grp *tg;
-	struct throtl_service_queue *sq;
-	struct blkcg_gq *blkg;
+	struct throtl_service_queue *sq = &tg->service_queue;
 	struct cgroup_subsys_state *pos_css;
-	int ret;
-
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
-	if (ret)
-		return ret;
-
-	tg = blkg_to_tg(ctx.blkg);
-	sq = &tg->service_queue;
-
-	if (!ctx.v)
-		ctx.v = -1;
-
-	if (is_u64)
-		*(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
-	else
-		*(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+	struct blkcg_gq *blkg;
 
 	throtl_log(&tg->service_queue,
 		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	 * restrictions in the whole hierarchy and allows them to bypass
 	 * blk-throttle.
 	 */
-	blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+	blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
 		tg_update_has_rules(blkg_to_tg(blkg));
 
 	/*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		tg_update_disptime(tg);
 		throtl_schedule_next_dispatch(sq->parent_sq, true);
 	}
+}
+
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+			   char *buf, size_t nbytes, loff_t off, bool is_u64)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	int ret;
+	u64 v;
 
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	if (sscanf(ctx.body, "%llu", &v) != 1)
+		goto out_finish;
+	if (!v)
+		v = -1;
+
+	tg = blkg_to_tg(ctx.blkg);
+
+	if (is_u64)
+		*(u64 *)((void *)tg + of_cft(of)->private) = v;
+	else
+		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
+
+	tg_conf_updated(tg);
+	ret = 0;
+out_finish:
 	blkg_conf_finish(&ctx);
-	return nbytes;
+	return ret ?: nbytes;
 }
 
 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
 	return tg_set_conf(of, buf, nbytes, off, false);
 }
 
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
 	{
 		.name = "throttle.read_bps_device",
 		.private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
 	},
 	{
 		.name = "throttle.io_service_bytes",
-		.private = offsetof(struct tg_stats_cpu, service_bytes),
-		.seq_show = tg_print_cpu_rwstat,
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_bytes,
 	},
 	{
 		.name = "throttle.io_serviced",
-		.private = offsetof(struct tg_stats_cpu, serviced),
-		.seq_show = tg_print_cpu_rwstat,
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_ios,
+	},
+	{ } /* terminate */
+};
+
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+			 int off)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+	const char *dname = blkg_dev_name(pd->blkg);
+	char bufs[4][21] = { "max", "max", "max", "max" };
+
+	if (!dname)
+		return 0;
+	if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+	    tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+		return 0;
+
+	if (tg->bps[READ] != -1)
+		snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+	if (tg->bps[WRITE] != -1)
+		snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+	if (tg->iops[READ] != -1)
+		snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+	if (tg->iops[WRITE] != -1)
+		snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+		   dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+	return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+			  char *buf, size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	u64 v[4];
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	tg = blkg_to_tg(ctx.blkg);
+
+	v[0] = tg->bps[READ];
+	v[1] = tg->bps[WRITE];
+	v[2] = tg->iops[READ];
+	v[3] = tg->iops[WRITE];
+
+	while (true) {
+		char tok[27]; /* wiops=18446744073709551616 */
+		char *p;
+		u64 val = -1;
+		int len;
+
+		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+			break;
+		if (tok[0] == '\0')
+			break;
+		ctx.body += len;
+
+		ret = -EINVAL;
+		p = tok;
+		strsep(&p, "=");
+		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+			goto out_finish;
+
+		ret = -ERANGE;
+		if (!val)
+			goto out_finish;
+
+		ret = -EINVAL;
+		if (!strcmp(tok, "rbps"))
+			v[0] = val;
+		else if (!strcmp(tok, "wbps"))
+			v[1] = val;
+		else if (!strcmp(tok, "riops"))
+			v[2] = min_t(u64, val, UINT_MAX);
+		else if (!strcmp(tok, "wiops"))
+			v[3] = min_t(u64, val, UINT_MAX);
+		else
+			goto out_finish;
+	}
+
+	tg->bps[READ] = v[0];
+	tg->bps[WRITE] = v[1];
+	tg->iops[READ] = v[2];
+	tg->iops[WRITE] = v[3];
+
+	tg_conf_updated(tg);
+	ret = 0;
+out_finish:
+	blkg_conf_finish(&ctx);
+	return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = tg_print_max,
+		.write = tg_set_max,
 	},
 	{ } /* terminate */
 };
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
-	.pd_size = sizeof(struct throtl_grp),
-	.cftypes = throtl_files,
+	.dfl_cftypes = throtl_files,
+	.legacy_cftypes = throtl_legacy_files,
 
+	.pd_alloc_fn = throtl_pd_alloc,
 	.pd_init_fn = throtl_pd_init,
 	.pd_online_fn = throtl_pd_online,
-	.pd_exit_fn = throtl_pd_exit,
-	.pd_reset_stats_fn = throtl_pd_reset_stats,
+	.pd_free_fn = throtl_pd_free,
 };
 
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+		    struct bio *bio)
 {
-	struct throtl_data *td = q->td;
 	struct throtl_qnode *qn = NULL;
-	struct throtl_grp *tg;
+	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
-	struct blkcg *blkcg;
 	bool throttled = false;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	/* see throtl_charge_bio() */
-	if (bio->bi_rw & REQ_THROTTLED)
+	if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
 		goto out;
 
-	/*
-	 * A throtl_grp pointer retrieved under rcu can be used to access
-	 * basic fields like stats and io rates. If a group has no rules,
-	 * just update the dispatch stats in lockless manner and return.
-	 */
-	rcu_read_lock();
-	blkcg = bio_blkcg(bio);
-	tg = throtl_lookup_tg(td, blkcg);
-	if (tg) {
-		if (!tg->has_rules[rw]) {
-			throtl_update_dispatch_stats(tg_to_blkg(tg),
-					bio->bi_iter.bi_size, bio->bi_rw);
-			goto out_unlock_rcu;
-		}
-	}
-
-	/*
-	 * Either group has not been allocated yet or it is not an unlimited
-	 * IO group
-	 */
 	spin_lock_irq(q->queue_lock);
-	tg = throtl_lookup_create_tg(td, blkcg);
-	if (unlikely(!tg))
+
+	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
 	sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
-	rcu_read_unlock();
 out:
 	/*
	 * As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
 		return -ENOMEM;
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
-	throtl_service_queue_init(&td->service_queue, NULL);
+	throtl_service_queue_init(&td->service_queue);
 
 	q->td = td;
 	td->queue = q;
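
Note on the callback wiring above: blkcg_policy_throtl now supplies pd_alloc_fn()/pd_free_fn() in place of the old fixed pd_size allocation plus pd_exit_fn(), so throtl_grp allocation and teardown live entirely inside blk-throttle.c. The fragment below is only an illustrative sketch of the order in which such hooks are exercised by a policy's caller; the wrapper functions and the GFP choice are assumptions made for the example, not code from this patch or from blk-cgroup.c.

/* Illustrative sketch only -- hypothetical caller, not blk-cgroup.c. */
static struct blkg_policy_data *example_create_pd(struct blkcg_policy *pol,
						  struct request_queue *q)
{
	/* pd_alloc_fn() replaces the old fixed-size pd_size allocation */
	struct blkg_policy_data *pd = pol->pd_alloc_fn(GFP_KERNEL, q->node);

	if (!pd)
		return NULL;
	/* in the real code the pd is linked to its blkg before init runs */
	if (pol->pd_init_fn)	/* e.g. throtl_pd_init(): parent_sq, tg->td */
		pol->pd_init_fn(pd);
	if (pol->pd_online_fn)	/* e.g. throtl_pd_online(): has_rules[] */
		pol->pd_online_fn(pd);
	return pd;
}

static void example_destroy_pd(struct blkcg_policy *pol,
			       struct blkg_policy_data *pd)
{
	/* pd_free_fn() both quiesces (del_timer_sync()) and frees the pd */
	if (pol->pd_free_fn)
		pol->pd_free_fn(pd);
}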