Diffstat (limited to 'block/blk-throttle.c')
 block/blk-throttle.c | 505
 1 file changed, 200 insertions(+), 305 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b23193518ac7..c75a2636dd40 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -83,14 +83,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 
-/* Per-cpu group stats */
-struct tg_stats_cpu {
-	/* total bytes transferred */
-	struct blkg_rwstat	service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat	serviced;
-};
-
 struct throtl_grp {
 	/* must be the first member */
 	struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
-
-	/* Per cpu stats pointer */
-	struct tg_stats_cpu __percpu *stats_cpu;
-
-	/* List of tgs waiting for per cpu stats memory to be allocated */
-	struct list_head stats_alloc_node;
 };
 
 struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
 	struct work_struct dispatch_work;
 };
 
-/* list and work item to allocate percpu group stats */
-static DEFINE_SPINLOCK(tg_stats_alloc_lock);
-static LIST_HEAD(tg_stats_alloc_list);
-
-static void tg_stats_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
-
 static void throtl_pending_timer_fn(unsigned long arg);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
 	return pd_to_blkg(&tg->pd);
 }
 
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
-	return blkg_to_tg(td->queue->root_blkg);
-}
-
 /**
  * sq_to_tg - return the throl_grp the specified service queue belongs to
  * @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
 	}								\
 } while (0)
 
-static void tg_stats_init(struct tg_stats_cpu *tg_stats)
-{
-	blkg_rwstat_init(&tg_stats->service_bytes);
-	blkg_rwstat_init(&tg_stats->serviced);
-}
-
-/*
- * Worker for allocating per cpu stat for tgs. This is scheduled on the
- * system_wq once there are some groups on the alloc_list waiting for
- * allocation.
- */
-static void tg_stats_alloc_fn(struct work_struct *work)
-{
-	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
-	struct delayed_work *dwork = to_delayed_work(work);
-	bool empty = false;
-
-alloc_stats:
-	if (!stats_cpu) {
-		int cpu;
-
-		stats_cpu = alloc_percpu(struct tg_stats_cpu);
-		if (!stats_cpu) {
-			/* allocation failed, try again after some time */
-			schedule_delayed_work(dwork, msecs_to_jiffies(10));
-			return;
-		}
-		for_each_possible_cpu(cpu)
-			tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
-	}
-
-	spin_lock_irq(&tg_stats_alloc_lock);
-
-	if (!list_empty(&tg_stats_alloc_list)) {
-		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
-							 struct throtl_grp,
-							 stats_alloc_node);
-		swap(tg->stats_cpu, stats_cpu);
-		list_del_init(&tg->stats_alloc_node);
-	}
-
-	empty = list_empty(&tg_stats_alloc_list);
-	spin_unlock_irq(&tg_stats_alloc_lock);
-	if (!empty)
-		goto alloc_stats;
-}
-
 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
 {
 	INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
 }
 
 /* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
-				      struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
 	INIT_LIST_HEAD(&sq->queued[0]);
 	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
-	sq->parent_sq = parent_sq;
 	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
 		    (unsigned long)sq);
 }
 
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
-	del_timer_sync(&sq->pending_timer);
+	struct throtl_grp *tg;
+	int rw;
+
+	tg = kzalloc_node(sizeof(*tg), gfp, node);
+	if (!tg)
+		return NULL;
+
+	throtl_service_queue_init(&tg->service_queue);
+
+	for (rw = READ; rw <= WRITE; rw++) {
+		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+	}
+
+	RB_CLEAR_NODE(&tg->rb_node);
+	tg->bps[READ] = -1;
+	tg->bps[WRITE] = -1;
+	tg->iops[READ] = -1;
+	tg->iops[WRITE] = -1;
+
+	return &tg->pd;
 }
 
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
 {
-	struct throtl_grp *tg = blkg_to_tg(blkg);
+	struct throtl_grp *tg = pd_to_tg(pd);
+	struct blkcg_gq *blkg = tg_to_blkg(tg);
 	struct throtl_data *td = blkg->q->td;
-	struct throtl_service_queue *parent_sq;
-	unsigned long flags;
-	int rw;
+	struct throtl_service_queue *sq = &tg->service_queue;
 
 	/*
 	 * If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
 	 * Limits of a group don't interact with limits of other groups
 	 * regardless of the position of the group in the hierarchy.
 	 */
-	parent_sq = &td->service_queue;
-
+	sq->parent_sq = &td->service_queue;
 	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
-		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
-	throtl_service_queue_init(&tg->service_queue, parent_sq);
-
-	for (rw = READ; rw <= WRITE; rw++) {
-		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
-	}
-
-	RB_CLEAR_NODE(&tg->rb_node);
+		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
-
-	tg->bps[READ] = -1;
-	tg->bps[WRITE] = -1;
-	tg->iops[READ] = -1;
-	tg->iops[WRITE] = -1;
-
-	/*
-	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
-	 * but percpu allocator can't be called from IO path. Queue tg on
-	 * tg_stats_alloc_list and allocate from work item.
-	 */
-	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
-	schedule_delayed_work(&tg_stats_alloc_work, 0);
-	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 }
 
 /*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 		(tg->bps[rw] != -1 || tg->iops[rw] != -1);
 }
 
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
 {
 	/*
 	 * We don't want new groups to escape the limits of its ancestors.
 	 * Update has_rules[] after a new group is brought online.
 	 */
-	tg_update_has_rules(blkg_to_tg(blkg));
-}
-
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-	unsigned long flags;
-
-	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-	list_del_init(&tg->stats_alloc_node);
-	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
-
-	free_percpu(tg->stats_cpu);
-
-	throtl_service_queue_exit(&tg->service_queue);
-}
-
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-	int cpu;
-
-	if (tg->stats_cpu == NULL)
-		return;
-
-	for_each_possible_cpu(cpu) {
-		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-		blkg_rwstat_reset(&sc->service_bytes);
-		blkg_rwstat_reset(&sc->serviced);
-	}
-}
-
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
-					   struct blkcg *blkcg)
-{
-	/*
-	 * This is the common case when there are no blkcgs. Avoid lookup
-	 * in this case
-	 */
-	if (blkcg == &blkcg_root)
-		return td_root_tg(td);
-
-	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
+	tg_update_has_rules(pd_to_tg(pd));
 }
 
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
-						  struct blkcg *blkcg)
+static void throtl_pd_free(struct blkg_policy_data *pd)
 {
-	struct request_queue *q = td->queue;
-	struct throtl_grp *tg = NULL;
-
-	/*
-	 * This is the common case when there are no blkcgs. Avoid lookup
-	 * in this case
-	 */
-	if (blkcg == &blkcg_root) {
-		tg = td_root_tg(td);
-	} else {
-		struct blkcg_gq *blkg;
-
-		blkg = blkg_lookup_create(blkcg, q);
-
-		/* if %NULL and @q is alive, fall back to root_tg */
-		if (!IS_ERR(blkg))
-			tg = blkg_to_tg(blkg);
-		else if (!blk_queue_dying(q))
-			tg = td_root_tg(td);
-	}
+	struct throtl_grp *tg = pd_to_tg(pd);
 
-	return tg;
+	del_timer_sync(&tg->service_queue.pending_timer);
+	kfree(tg);
 }
 
 static struct throtl_grp *
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
-					 int rw)
-{
-	struct throtl_grp *tg = blkg_to_tg(blkg);
-	struct tg_stats_cpu *stats_cpu;
-	unsigned long flags;
-
-	/* If per cpu stats are not allocated yet, don't do any accounting. */
-	if (tg->stats_cpu == NULL)
-		return;
-
-	/*
-	 * Disabling interrupts to provide mutual exclusion between two
-	 * writes on same cpu. It probably is not needed for 64bit. Not
-	 * optimizing that case yet.
-	 */
-	local_irq_save(flags);
-
-	stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
-	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
-	local_irq_restore(flags);
-}
-
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	 * more than once as a throttled bio will go through blk-throtl the
 	 * second time when it eventually gets issued. Set it when a bio
 	 * is being charged to a tg.
-	 *
-	 * Dispatch stats aren't recursive and each @bio should only be
-	 * accounted by the @tg it was originally associated with. Let's
-	 * update the stats when setting REQ_THROTTLED for the first time
-	 * which is guaranteed to be for the @bio's original tg.
 	 */
-	if (!(bio->bi_rw & REQ_THROTTLED)) {
+	if (!(bio->bi_rw & REQ_THROTTLED))
 		bio->bi_rw |= REQ_THROTTLED;
-		throtl_update_dispatch_stats(tg_to_blkg(tg),
-					     bio->bi_iter.bi_size, bio->bi_rw);
-	}
 }
 
 /**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	}
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-				struct blkg_policy_data *pd, int off)
-{
-	struct throtl_grp *tg = pd_to_tg(pd);
-	struct blkg_rwstat rwstat = { }, tmp;
-	int i, cpu;
-
-	if (tg->stats_cpu == NULL)
-		return 0;
-
-	for_each_possible_cpu(cpu) {
-		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-		tmp = blkg_rwstat_read((void *)sc + off);
-		for (i = 0; i < BLKG_RWSTAT_NR; i++)
-			rwstat.cnt[i] += tmp.cnt[i];
-	}
-
-	return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
-			  &blkcg_policy_throtl, seq_cft(sf)->private, true);
-	return 0;
-}
-
 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
 			      int off)
 {
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }
 
-static ssize_t tg_set_conf(struct kernfs_open_file *of,
-			   char *buf, size_t nbytes, loff_t off, bool is_u64)
+static void tg_conf_updated(struct throtl_grp *tg)
 {
-	struct blkcg *blkcg = css_to_blkcg(of_css(of));
-	struct blkg_conf_ctx ctx;
-	struct throtl_grp *tg;
-	struct throtl_service_queue *sq;
-	struct blkcg_gq *blkg;
+	struct throtl_service_queue *sq = &tg->service_queue;
 	struct cgroup_subsys_state *pos_css;
-	int ret;
-
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
-	if (ret)
-		return ret;
-
-	tg = blkg_to_tg(ctx.blkg);
-	sq = &tg->service_queue;
-
-	if (!ctx.v)
-		ctx.v = -1;
-
-	if (is_u64)
-		*(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
-	else
-		*(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+	struct blkcg_gq *blkg;
 
 	throtl_log(&tg->service_queue,
 		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	 * restrictions in the whole hierarchy and allows them to bypass
 	 * blk-throttle.
 	 */
-	blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+	blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
 		tg_update_has_rules(blkg_to_tg(blkg));
 
 	/*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		tg_update_disptime(tg);
 		throtl_schedule_next_dispatch(sq->parent_sq, true);
 	}
+}
+
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+			   char *buf, size_t nbytes, loff_t off, bool is_u64)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	int ret;
+	u64 v;
 
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	if (sscanf(ctx.body, "%llu", &v) != 1)
+		goto out_finish;
+	if (!v)
+		v = -1;
+
+	tg = blkg_to_tg(ctx.blkg);
+
+	if (is_u64)
+		*(u64 *)((void *)tg + of_cft(of)->private) = v;
+	else
+		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
+
+	tg_conf_updated(tg);
+	ret = 0;
+out_finish:
 	blkg_conf_finish(&ctx);
-	return nbytes;
+	return ret ?: nbytes;
 }
 
 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
 	return tg_set_conf(of, buf, nbytes, off, false);
 }
 
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
 	{
 		.name = "throttle.read_bps_device",
 		.private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
 	},
 	{
 		.name = "throttle.io_service_bytes",
-		.private = offsetof(struct tg_stats_cpu, service_bytes),
-		.seq_show = tg_print_cpu_rwstat,
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_bytes,
 	},
 	{
 		.name = "throttle.io_serviced",
-		.private = offsetof(struct tg_stats_cpu, serviced),
-		.seq_show = tg_print_cpu_rwstat,
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_ios,
+	},
+	{ }	/* terminate */
+};
+
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+			 int off)
+{
+	struct throtl_grp *tg = pd_to_tg(pd);
+	const char *dname = blkg_dev_name(pd->blkg);
+	char bufs[4][21] = { "max", "max", "max", "max" };
+
+	if (!dname)
+		return 0;
+	if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+	    tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+		return 0;
+
+	if (tg->bps[READ] != -1)
+		snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+	if (tg->bps[WRITE] != -1)
+		snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+	if (tg->iops[READ] != -1)
+		snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+	if (tg->iops[WRITE] != -1)
+		snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+		   dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+	return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+			  char *buf, size_t nbytes, loff_t off)
+{
+	struct blkcg *blkcg = css_to_blkcg(of_css(of));
+	struct blkg_conf_ctx ctx;
+	struct throtl_grp *tg;
+	u64 v[4];
+	int ret;
+
+	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+	if (ret)
+		return ret;
+
+	tg = blkg_to_tg(ctx.blkg);
+
+	v[0] = tg->bps[READ];
+	v[1] = tg->bps[WRITE];
+	v[2] = tg->iops[READ];
+	v[3] = tg->iops[WRITE];
+
+	while (true) {
+		char tok[27];	/* wiops=18446744073709551616 */
+		char *p;
+		u64 val = -1;
+		int len;
+
+		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+			break;
+		if (tok[0] == '\0')
+			break;
+		ctx.body += len;
+
+		ret = -EINVAL;
+		p = tok;
+		strsep(&p, "=");
+		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+			goto out_finish;
+
+		ret = -ERANGE;
+		if (!val)
+			goto out_finish;
+
+		ret = -EINVAL;
+		if (!strcmp(tok, "rbps"))
+			v[0] = val;
+		else if (!strcmp(tok, "wbps"))
+			v[1] = val;
+		else if (!strcmp(tok, "riops"))
+			v[2] = min_t(u64, val, UINT_MAX);
+		else if (!strcmp(tok, "wiops"))
+			v[3] = min_t(u64, val, UINT_MAX);
+		else
+			goto out_finish;
+	}
+
+	tg->bps[READ] = v[0];
+	tg->bps[WRITE] = v[1];
+	tg->iops[READ] = v[2];
+	tg->iops[WRITE] = v[3];
+
+	tg_conf_updated(tg);
+	ret = 0;
+out_finish:
+	blkg_conf_finish(&ctx);
+	return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = tg_print_max,
+		.write = tg_set_max,
 	},
 	{ }	/* terminate */
 };
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
-	.pd_size		= sizeof(struct throtl_grp),
-	.cftypes		= throtl_files,
+	.dfl_cftypes		= throtl_files,
+	.legacy_cftypes		= throtl_legacy_files,
 
+	.pd_alloc_fn		= throtl_pd_alloc,
 	.pd_init_fn		= throtl_pd_init,
 	.pd_online_fn		= throtl_pd_online,
-	.pd_exit_fn		= throtl_pd_exit,
-	.pd_reset_stats_fn	= throtl_pd_reset_stats,
+	.pd_free_fn		= throtl_pd_free,
 };
 
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+		    struct bio *bio)
 {
-	struct throtl_data *td = q->td;
 	struct throtl_qnode *qn = NULL;
-	struct throtl_grp *tg;
+	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
-	struct blkcg *blkcg;
 	bool throttled = false;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	/* see throtl_charge_bio() */
-	if (bio->bi_rw & REQ_THROTTLED)
+	if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
 		goto out;
 
-	/*
-	 * A throtl_grp pointer retrieved under rcu can be used to access
-	 * basic fields like stats and io rates. If a group has no rules,
-	 * just update the dispatch stats in lockless manner and return.
-	 */
-	rcu_read_lock();
-	blkcg = bio_blkcg(bio);
-	tg = throtl_lookup_tg(td, blkcg);
-	if (tg) {
-		if (!tg->has_rules[rw]) {
-			throtl_update_dispatch_stats(tg_to_blkg(tg),
-					bio->bi_iter.bi_size, bio->bi_rw);
-			goto out_unlock_rcu;
-		}
-	}
-
-	/*
-	 * Either group has not been allocated yet or it is not an unlimited
-	 * IO group
-	 */
 	spin_lock_irq(q->queue_lock);
-	tg = throtl_lookup_create_tg(td, blkcg);
-	if (unlikely(!tg))
+
+	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
 	sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
-	rcu_read_unlock();
 out:
 	/*
 	 * As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
 		return -ENOMEM;
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
-	throtl_service_queue_init(&td->service_queue, NULL);
+	throtl_service_queue_init(&td->service_queue);
 
 	q->td = td;
 	td->queue = q;