diff options
author | Tejun Heo <tj@kernel.org> | 2013-01-09 11:05:11 -0500 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2013-01-09 11:05:11 -0500 |
commit | 1d3650f713e7f6392b02fde450c5bae40291e65b (patch) | |
tree | bc1645e9ce6d762894b51505fc90253f9a9aed58 /block/cfq-iosched.c | |
parent | 7918ffb5b83e3373206ada84873c674fbddf61cc (diff) |
cfq-iosched: implement hierarchy-ready cfq_group charge scaling
Currently, cfqg charges are scaled directly according to cfqg->weight.
Regardless of the number of active cfqgs or the amount of active
weights, a given weight value always scales charge the same way. This
works fine as long as all cfqgs are treated equally regardless of
their positions in the hierarchy, which is what cfq currently
implements. It can't work in hierarchical settings because the
interpretation of a given weight value depends on where the weight is
located in the hierarchy.
This patch reimplements cfqg charge scaling so that it can be used to
support hierarchy properly. The scheme is fairly simple and
light-weight.
* When a cfqg is added to the service tree, v(disktime)weight is
calculated. It walks up the tree to root calculating the fraction
it has in the hierarchy. At each level, the fraction can be
calculated as
cfqg->weight / parent->level_weight
By compounding these, the global fraction of vdisktime the cfqg has
claim to - vfraction - can be determined.
* When the cfqg needs to be charged, the charge is scaled inversely
proportionally to the vfraction.
The new scaling scheme uses the same CFQ_SERVICE_SHIFT for fixed point
representation as before; however, the smallest scaling factor is now
1 (ie. 1 << CFQ_SERVICE_SHIFT). This is different from before where 1
was for CFQ_WEIGHT_DEFAULT and higher weight would result in smaller
scaling factor.
While this shifts the global scale of vdisktime a bit, it doesn't
change the relative relationships among cfqgs and the scheduling
result isn't different.
cfq_group_notify_queue_add uses fixed CFQ_IDLE_DELAY when appending
new cfqg to the service tree. The specific value of CFQ_IDLE_DELAY
didn't have any relevance to vdisktime before and is unlikely to cause
any visible behavior difference now especially as the scale shift
isn't that large.
As the new scheme now makes proper distinction between cfqg->weight
and ->leaf_weight, reverse the weight aliasing for root cfqgs. For
root, both weights are now mapped to ->leaf_weight instead of the
other way around.
Because we're still using cfqg_flat_parent(), this patch shouldn't
change the scheduling behavior in any noticeable way.
v2: Beefed up comments on vfraction as requested by Vivek.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Diffstat (limited to 'block/cfq-iosched.c')
-rw-r--r-- | block/cfq-iosched.c | 107 |
1 files changed, 77 insertions, 30 deletions
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7701c3fe49b9..b24acf66d5b5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -237,6 +237,18 @@ struct cfq_group { | |||
237 | unsigned int children_weight; | 237 | unsigned int children_weight; |
238 | 238 | ||
239 | /* | 239 | /* |
240 | * vfraction is the fraction of vdisktime that the tasks in this | ||
241 | * cfqg are entitled to. This is determined by compounding the | ||
242 | * ratios walking up from this cfqg to the root. | ||
243 | * | ||
244 | * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all | ||
245 | * vfractions on a service tree is approximately 1. The sum may | ||
246 | * deviate a bit due to rounding errors and fluctuations caused by | ||
247 | * cfqgs entering and leaving the service tree. | ||
248 | */ | ||
249 | unsigned int vfraction; | ||
250 | |||
251 | /* | ||
240 | * There are two weights - (internal) weight is the weight of this | 252 | * There are two weights - (internal) weight is the weight of this |
241 | * cfqg against the sibling cfqgs. leaf_weight is the wight of | 253 | * cfqg against the sibling cfqgs. leaf_weight is the wight of |
242 | * this cfqg against the child cfqgs. For the root cfqg, both | 254 | * this cfqg against the child cfqgs. For the root cfqg, both |
@@ -891,13 +903,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
891 | return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); | 903 | return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); |
892 | } | 904 | } |
893 | 905 | ||
894 | static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) | 906 | /** |
907 | * cfqg_scale_charge - scale disk time charge according to cfqg weight | ||
908 | * @charge: disk time being charged | ||
909 | * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT | ||
910 | * | ||
911 | * Scale @charge according to @vfraction, which is in range (0, 1]. The | ||
912 | * scaling is inversely proportional. | ||
913 | * | ||
914 | * scaled = charge / vfraction | ||
915 | * | ||
916 | * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. | ||
917 | */ | ||
918 | static inline u64 cfqg_scale_charge(unsigned long charge, | ||
919 | unsigned int vfraction) | ||
895 | { | 920 | { |
896 | u64 d = delta << CFQ_SERVICE_SHIFT; | 921 | u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ |
897 | 922 | ||
898 | d = d * CFQ_WEIGHT_DEFAULT; | 923 | /* charge / vfraction */ |
899 | do_div(d, cfqg->weight); | 924 | c <<= CFQ_SERVICE_SHIFT; |
900 | return d; | 925 | do_div(c, vfraction); |
926 | return c; | ||
901 | } | 927 | } |
902 | 928 | ||
903 | static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) | 929 | static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) |
@@ -1237,7 +1263,9 @@ cfq_update_group_weight(struct cfq_group *cfqg) | |||
1237 | static void | 1263 | static void |
1238 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | 1264 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) |
1239 | { | 1265 | { |
1266 | unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ | ||
1240 | struct cfq_group *pos = cfqg; | 1267 | struct cfq_group *pos = cfqg; |
1268 | struct cfq_group *parent; | ||
1241 | bool propagate; | 1269 | bool propagate; |
1242 | 1270 | ||
1243 | /* add to the service tree */ | 1271 | /* add to the service tree */ |
@@ -1248,22 +1276,34 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
1248 | st->total_weight += cfqg->weight; | 1276 | st->total_weight += cfqg->weight; |
1249 | 1277 | ||
1250 | /* | 1278 | /* |
1251 | * Activate @cfqg and propagate activation upwards until we meet an | 1279 | * Activate @cfqg and calculate the portion of vfraction @cfqg is |
1252 | * already activated node or reach root. | 1280 | * entitled to. vfraction is calculated by walking the tree |
1281 | * towards the root calculating the fraction it has at each level. | ||
1282 | * The compounded ratio is how much vfraction @cfqg owns. | ||
1283 | * | ||
1284 | * Start with the proportion tasks in this cfqg has against active | ||
1285 | * children cfqgs - its leaf_weight against children_weight. | ||
1253 | */ | 1286 | */ |
1254 | propagate = !pos->nr_active++; | 1287 | propagate = !pos->nr_active++; |
1255 | pos->children_weight += pos->leaf_weight; | 1288 | pos->children_weight += pos->leaf_weight; |
1289 | vfr = vfr * pos->leaf_weight / pos->children_weight; | ||
1256 | 1290 | ||
1257 | while (propagate) { | 1291 | /* |
1258 | struct cfq_group *parent = cfqg_flat_parent(pos); | 1292 | * Compound ->weight walking up the tree. Both activation and |
1259 | 1293 | * vfraction calculation are done in the same loop. Propagation | |
1260 | if (!parent) | 1294 | * stops once an already activated node is met. vfraction |
1261 | break; | 1295 | * calculation should always continue to the root. |
1262 | 1296 | */ | |
1263 | propagate = !parent->nr_active++; | 1297 | while ((parent = cfqg_flat_parent(pos))) { |
1264 | parent->children_weight += pos->weight; | 1298 | if (propagate) { |
1299 | propagate = !parent->nr_active++; | ||
1300 | parent->children_weight += pos->weight; | ||
1301 | } | ||
1302 | vfr = vfr * pos->weight / parent->children_weight; | ||
1265 | pos = parent; | 1303 | pos = parent; |
1266 | } | 1304 | } |
1305 | |||
1306 | cfqg->vfraction = max_t(unsigned, vfr, 1); | ||
1267 | } | 1307 | } |
1268 | 1308 | ||
1269 | static void | 1309 | static void |
@@ -1309,6 +1349,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
1309 | 1349 | ||
1310 | /* @pos has 0 nr_active at this point */ | 1350 | /* @pos has 0 nr_active at this point */ |
1311 | WARN_ON_ONCE(pos->children_weight); | 1351 | WARN_ON_ONCE(pos->children_weight); |
1352 | pos->vfraction = 0; | ||
1312 | 1353 | ||
1313 | if (!parent) | 1354 | if (!parent) |
1314 | break; | 1355 | break; |
@@ -1381,6 +1422,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
1381 | unsigned int used_sl, charge, unaccounted_sl = 0; | 1422 | unsigned int used_sl, charge, unaccounted_sl = 0; |
1382 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) | 1423 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) |
1383 | - cfqg->service_tree_idle.count; | 1424 | - cfqg->service_tree_idle.count; |
1425 | unsigned int vfr; | ||
1384 | 1426 | ||
1385 | BUG_ON(nr_sync < 0); | 1427 | BUG_ON(nr_sync < 0); |
1386 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); | 1428 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); |
@@ -1390,10 +1432,15 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
1390 | else if (!cfq_cfqq_sync(cfqq) && !nr_sync) | 1432 | else if (!cfq_cfqq_sync(cfqq) && !nr_sync) |
1391 | charge = cfqq->allocated_slice; | 1433 | charge = cfqq->allocated_slice; |
1392 | 1434 | ||
1393 | /* Can't update vdisktime while group is on service tree */ | 1435 | /* |
1436 | * Can't update vdisktime while on service tree and cfqg->vfraction | ||
1437 | * is valid only while on it. Cache vfr, leave the service tree, | ||
1438 | * update vdisktime and go back on. The re-addition to the tree | ||
1439 | * will also update the weights as necessary. | ||
1440 | */ | ||
1441 | vfr = cfqg->vfraction; | ||
1394 | cfq_group_service_tree_del(st, cfqg); | 1442 | cfq_group_service_tree_del(st, cfqg); |
1395 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); | 1443 | cfqg->vdisktime += cfqg_scale_charge(charge, vfr); |
1396 | /* If a new weight was requested, update now, off tree */ | ||
1397 | cfq_group_service_tree_add(st, cfqg); | 1444 | cfq_group_service_tree_add(st, cfqg); |
1398 | 1445 | ||
1399 | /* This group is being expired. Save the context */ | 1446 | /* This group is being expired. Save the context */ |
@@ -1669,44 +1716,44 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, | |||
1669 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | 1716 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ |
1670 | 1717 | ||
1671 | static struct cftype cfq_blkcg_files[] = { | 1718 | static struct cftype cfq_blkcg_files[] = { |
1719 | /* on root, weight is mapped to leaf_weight */ | ||
1672 | { | 1720 | { |
1673 | .name = "weight_device", | 1721 | .name = "weight_device", |
1674 | .read_seq_string = cfqg_print_weight_device, | 1722 | .flags = CFTYPE_ONLY_ON_ROOT, |
1675 | .write_string = cfqg_set_weight_device, | 1723 | .read_seq_string = cfqg_print_leaf_weight_device, |
1724 | .write_string = cfqg_set_leaf_weight_device, | ||
1676 | .max_write_len = 256, | 1725 | .max_write_len = 256, |
1677 | }, | 1726 | }, |
1678 | { | 1727 | { |
1679 | .name = "weight", | 1728 | .name = "weight", |
1680 | .read_seq_string = cfq_print_weight, | 1729 | .flags = CFTYPE_ONLY_ON_ROOT, |
1681 | .write_u64 = cfq_set_weight, | 1730 | .read_seq_string = cfq_print_leaf_weight, |
1731 | .write_u64 = cfq_set_leaf_weight, | ||
1682 | }, | 1732 | }, |
1683 | 1733 | ||
1684 | /* on root, leaf_weight is mapped to weight */ | 1734 | /* no such mapping necessary for !roots */ |
1685 | { | 1735 | { |
1686 | .name = "leaf_weight_device", | 1736 | .name = "weight_device", |
1687 | .flags = CFTYPE_ONLY_ON_ROOT, | 1737 | .flags = CFTYPE_NOT_ON_ROOT, |
1688 | .read_seq_string = cfqg_print_weight_device, | 1738 | .read_seq_string = cfqg_print_weight_device, |
1689 | .write_string = cfqg_set_weight_device, | 1739 | .write_string = cfqg_set_weight_device, |
1690 | .max_write_len = 256, | 1740 | .max_write_len = 256, |
1691 | }, | 1741 | }, |
1692 | { | 1742 | { |
1693 | .name = "leaf_weight", | 1743 | .name = "weight", |
1694 | .flags = CFTYPE_ONLY_ON_ROOT, | 1744 | .flags = CFTYPE_NOT_ON_ROOT, |
1695 | .read_seq_string = cfq_print_weight, | 1745 | .read_seq_string = cfq_print_weight, |
1696 | .write_u64 = cfq_set_weight, | 1746 | .write_u64 = cfq_set_weight, |
1697 | }, | 1747 | }, |
1698 | 1748 | ||
1699 | /* no such mapping necessary for !roots */ | ||
1700 | { | 1749 | { |
1701 | .name = "leaf_weight_device", | 1750 | .name = "leaf_weight_device", |
1702 | .flags = CFTYPE_NOT_ON_ROOT, | ||
1703 | .read_seq_string = cfqg_print_leaf_weight_device, | 1751 | .read_seq_string = cfqg_print_leaf_weight_device, |
1704 | .write_string = cfqg_set_leaf_weight_device, | 1752 | .write_string = cfqg_set_leaf_weight_device, |
1705 | .max_write_len = 256, | 1753 | .max_write_len = 256, |
1706 | }, | 1754 | }, |
1707 | { | 1755 | { |
1708 | .name = "leaf_weight", | 1756 | .name = "leaf_weight", |
1709 | .flags = CFTYPE_NOT_ON_ROOT, | ||
1710 | .read_seq_string = cfq_print_leaf_weight, | 1757 | .read_seq_string = cfq_print_leaf_weight, |
1711 | .write_u64 = cfq_set_leaf_weight, | 1758 | .write_u64 = cfq_set_leaf_weight, |
1712 | }, | 1759 | }, |