aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2013-01-09 11:05:11 -0500
committerTejun Heo <tj@kernel.org>2013-01-09 11:05:11 -0500
commit1d3650f713e7f6392b02fde450c5bae40291e65b (patch)
treebc1645e9ce6d762894b51505fc90253f9a9aed58 /block
parent7918ffb5b83e3373206ada84873c674fbddf61cc (diff)
cfq-iosched: implement hierarchy-ready cfq_group charge scaling
Currently, cfqg charges are scaled directly according to cfqg->weight. Regardless of the number of active cfqgs or the amount of active weights, a given weight value always scales charge the same way. This works fine as long as all cfqgs are treated equally regardless of their positions in the hierarchy, which is what cfq currently implements. It can't work in hierarchical settings because the interpretation of a given weight value depends on where the weight is located in the hierarchy. This patch reimplements cfqg charge scaling so that it can be used to support hierarchy properly. The scheme is fairly simple and light-weight. * When a cfqg is added to the service tree, v(disktime)weight is calculated. It walks up the tree to root calculating the fraction it has in the hierarchy. At each level, the fraction can be calculated as cfqg->weight / parent->level_weight By compounding these, the global fraction of vdisktime the cfqg has claim to - vfraction - can be determined. * When the cfqg needs to be charged, the charge is scaled inversely proportionally to the vfraction. The new scaling scheme uses the same CFQ_SERVICE_SHIFT for fixed point representation as before; however, the smallest scaling factor is now 1 (ie. 1 << CFQ_SERVICE_SHIFT). This is different from before where 1 was for CFQ_WEIGHT_DEFAULT and higher weight would result in smaller scaling factor. While this shifts the global scale of vdisktime a bit, it doesn't change the relative relationships among cfqgs and the scheduling result isn't different. cfq_group_notify_queue_add uses fixed CFQ_IDLE_DELAY when appending new cfqg to the service tree. The specific value of CFQ_IDLE_DELAY didn't have any relevance to vdisktime before and is unlikely to cause any visible behavior difference now especially as the scale shift isn't that large. As the new scheme now makes proper distinction between cfqg->weight and ->leaf_weight, reverse the weight aliasing for root cfqgs. For root, both weights are now mapped to ->leaf_weight instead of the other way around. Because we're still using cfqg_flat_parent(), this patch shouldn't change the scheduling behavior in any noticeable way. v2: Beefed up comments on vfraction as requested by Vivek. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Vivek Goyal <vgoyal@redhat.com>
Diffstat (limited to 'block')
-rw-r--r--block/cfq-iosched.c107
1 files changed, 77 insertions, 30 deletions
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7701c3fe49b9..b24acf66d5b5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -237,6 +237,18 @@ struct cfq_group {
237 unsigned int children_weight; 237 unsigned int children_weight;
238 238
239 /* 239 /*
240 * vfraction is the fraction of vdisktime that the tasks in this
241 * cfqg are entitled to. This is determined by compounding the
242 * ratios walking up from this cfqg to the root.
243 *
244 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
245 * vfractions on a service tree is approximately 1. The sum may
246 * deviate a bit due to rounding errors and fluctuations caused by
247 * cfqgs entering and leaving the service tree.
248 */
249 unsigned int vfraction;
250
251 /*
240 * There are two weights - (internal) weight is the weight of this 252 * There are two weights - (internal) weight is the weight of this
241 * cfqg against the sibling cfqgs. leaf_weight is the wight of 253 * cfqg against the sibling cfqgs. leaf_weight is the wight of
242 * this cfqg against the child cfqgs. For the root cfqg, both 254 * this cfqg against the child cfqgs. For the root cfqg, both
@@ -891,13 +903,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
891 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 903 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
892} 904}
893 905
894static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 906/**
907 * cfqg_scale_charge - scale disk time charge according to cfqg weight
908 * @charge: disk time being charged
909 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
910 *
911 * Scale @charge according to @vfraction, which is in range (0, 1]. The
912 * scaling is inversely proportional.
913 *
914 * scaled = charge / vfraction
915 *
916 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
917 */
918static inline u64 cfqg_scale_charge(unsigned long charge,
919 unsigned int vfraction)
895{ 920{
896 u64 d = delta << CFQ_SERVICE_SHIFT; 921 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
897 922
898 d = d * CFQ_WEIGHT_DEFAULT; 923 /* charge / vfraction */
899 do_div(d, cfqg->weight); 924 c <<= CFQ_SERVICE_SHIFT;
900 return d; 925 do_div(c, vfraction);
926 return c;
901} 927}
902 928
903static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 929static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
@@ -1237,7 +1263,9 @@ cfq_update_group_weight(struct cfq_group *cfqg)
1237static void 1263static void
1238cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 1264cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1239{ 1265{
1266 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
1240 struct cfq_group *pos = cfqg; 1267 struct cfq_group *pos = cfqg;
1268 struct cfq_group *parent;
1241 bool propagate; 1269 bool propagate;
1242 1270
1243 /* add to the service tree */ 1271 /* add to the service tree */
@@ -1248,22 +1276,34 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1248 st->total_weight += cfqg->weight; 1276 st->total_weight += cfqg->weight;
1249 1277
1250 /* 1278 /*
1251 * Activate @cfqg and propagate activation upwards until we meet an 1279 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1252 * already activated node or reach root. 1280 * entitled to. vfraction is calculated by walking the tree
1281 * towards the root calculating the fraction it has at each level.
1282 * The compounded ratio is how much vfraction @cfqg owns.
1283 *
1284 * Start with the proportion tasks in this cfqg has against active
1285 * children cfqgs - its leaf_weight against children_weight.
1253 */ 1286 */
1254 propagate = !pos->nr_active++; 1287 propagate = !pos->nr_active++;
1255 pos->children_weight += pos->leaf_weight; 1288 pos->children_weight += pos->leaf_weight;
1289 vfr = vfr * pos->leaf_weight / pos->children_weight;
1256 1290
1257 while (propagate) { 1291 /*
1258 struct cfq_group *parent = cfqg_flat_parent(pos); 1292 * Compound ->weight walking up the tree. Both activation and
1259 1293 * vfraction calculation are done in the same loop. Propagation
1260 if (!parent) 1294 * stops once an already activated node is met. vfraction
1261 break; 1295 * calculation should always continue to the root.
1262 1296 */
1263 propagate = !parent->nr_active++; 1297 while ((parent = cfqg_flat_parent(pos))) {
1264 parent->children_weight += pos->weight; 1298 if (propagate) {
1299 propagate = !parent->nr_active++;
1300 parent->children_weight += pos->weight;
1301 }
1302 vfr = vfr * pos->weight / parent->children_weight;
1265 pos = parent; 1303 pos = parent;
1266 } 1304 }
1305
1306 cfqg->vfraction = max_t(unsigned, vfr, 1);
1267} 1307}
1268 1308
1269static void 1309static void
@@ -1309,6 +1349,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1309 1349
1310 /* @pos has 0 nr_active at this point */ 1350 /* @pos has 0 nr_active at this point */
1311 WARN_ON_ONCE(pos->children_weight); 1351 WARN_ON_ONCE(pos->children_weight);
1352 pos->vfraction = 0;
1312 1353
1313 if (!parent) 1354 if (!parent)
1314 break; 1355 break;
@@ -1381,6 +1422,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1381 unsigned int used_sl, charge, unaccounted_sl = 0; 1422 unsigned int used_sl, charge, unaccounted_sl = 0;
1382 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 1423 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1383 - cfqg->service_tree_idle.count; 1424 - cfqg->service_tree_idle.count;
1425 unsigned int vfr;
1384 1426
1385 BUG_ON(nr_sync < 0); 1427 BUG_ON(nr_sync < 0);
1386 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); 1428 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
@@ -1390,10 +1432,15 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1390 else if (!cfq_cfqq_sync(cfqq) && !nr_sync) 1432 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1391 charge = cfqq->allocated_slice; 1433 charge = cfqq->allocated_slice;
1392 1434
1393 /* Can't update vdisktime while group is on service tree */ 1435 /*
1436 * Can't update vdisktime while on service tree and cfqg->vfraction
1437 * is valid only while on it. Cache vfr, leave the service tree,
1438 * update vdisktime and go back on. The re-addition to the tree
1439 * will also update the weights as necessary.
1440 */
1441 vfr = cfqg->vfraction;
1394 cfq_group_service_tree_del(st, cfqg); 1442 cfq_group_service_tree_del(st, cfqg);
1395 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 1443 cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1396 /* If a new weight was requested, update now, off tree */
1397 cfq_group_service_tree_add(st, cfqg); 1444 cfq_group_service_tree_add(st, cfqg);
1398 1445
1399 /* This group is being expired. Save the context */ 1446 /* This group is being expired. Save the context */
@@ -1669,44 +1716,44 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1669#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1716#endif /* CONFIG_DEBUG_BLK_CGROUP */
1670 1717
1671static struct cftype cfq_blkcg_files[] = { 1718static struct cftype cfq_blkcg_files[] = {
1719 /* on root, weight is mapped to leaf_weight */
1672 { 1720 {
1673 .name = "weight_device", 1721 .name = "weight_device",
1674 .read_seq_string = cfqg_print_weight_device, 1722 .flags = CFTYPE_ONLY_ON_ROOT,
1675 .write_string = cfqg_set_weight_device, 1723 .read_seq_string = cfqg_print_leaf_weight_device,
1724 .write_string = cfqg_set_leaf_weight_device,
1676 .max_write_len = 256, 1725 .max_write_len = 256,
1677 }, 1726 },
1678 { 1727 {
1679 .name = "weight", 1728 .name = "weight",
1680 .read_seq_string = cfq_print_weight, 1729 .flags = CFTYPE_ONLY_ON_ROOT,
1681 .write_u64 = cfq_set_weight, 1730 .read_seq_string = cfq_print_leaf_weight,
1731 .write_u64 = cfq_set_leaf_weight,
1682 }, 1732 },
1683 1733
1684 /* on root, leaf_weight is mapped to weight */ 1734 /* no such mapping necessary for !roots */
1685 { 1735 {
1686 .name = "leaf_weight_device", 1736 .name = "weight_device",
1687 .flags = CFTYPE_ONLY_ON_ROOT, 1737 .flags = CFTYPE_NOT_ON_ROOT,
1688 .read_seq_string = cfqg_print_weight_device, 1738 .read_seq_string = cfqg_print_weight_device,
1689 .write_string = cfqg_set_weight_device, 1739 .write_string = cfqg_set_weight_device,
1690 .max_write_len = 256, 1740 .max_write_len = 256,
1691 }, 1741 },
1692 { 1742 {
1693 .name = "leaf_weight", 1743 .name = "weight",
1694 .flags = CFTYPE_ONLY_ON_ROOT, 1744 .flags = CFTYPE_NOT_ON_ROOT,
1695 .read_seq_string = cfq_print_weight, 1745 .read_seq_string = cfq_print_weight,
1696 .write_u64 = cfq_set_weight, 1746 .write_u64 = cfq_set_weight,
1697 }, 1747 },
1698 1748
1699 /* no such mapping necessary for !roots */
1700 { 1749 {
1701 .name = "leaf_weight_device", 1750 .name = "leaf_weight_device",
1702 .flags = CFTYPE_NOT_ON_ROOT,
1703 .read_seq_string = cfqg_print_leaf_weight_device, 1751 .read_seq_string = cfqg_print_leaf_weight_device,
1704 .write_string = cfqg_set_leaf_weight_device, 1752 .write_string = cfqg_set_leaf_weight_device,
1705 .max_write_len = 256, 1753 .max_write_len = 256,
1706 }, 1754 },
1707 { 1755 {
1708 .name = "leaf_weight", 1756 .name = "leaf_weight",
1709 .flags = CFTYPE_NOT_ON_ROOT,
1710 .read_seq_string = cfq_print_leaf_weight, 1757 .read_seq_string = cfq_print_leaf_weight,
1711 .write_u64 = cfq_set_leaf_weight, 1758 .write_u64 = cfq_set_leaf_weight,
1712 }, 1759 },