diff options
| -rw-r--r-- | Documentation/block/cfq-iosched.txt | 58 | ||||
| -rw-r--r-- | Documentation/cgroups/blkio-controller.txt | 35 | ||||
| -rw-r--r-- | block/cfq-iosched.c | 21 |
3 files changed, 88 insertions, 26 deletions
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt index d89b4fe724d7..a5eb7d19a65d 100644 --- a/Documentation/block/cfq-iosched.txt +++ b/Documentation/block/cfq-iosched.txt | |||
| @@ -102,6 +102,64 @@ processing of request. Therefore, increasing the value can imporve the | |||
| 102 | performace although this can cause the latency of some I/O to increase due | 102 | performace although this can cause the latency of some I/O to increase due |
| 103 | to more number of requests. | 103 | to more number of requests. |
| 104 | 104 | ||
| 105 | CFQ Group scheduling | ||
| 106 | ==================== | ||
| 107 | |||
| 108 | CFQ supports blkio cgroup and has "blkio." prefixed files in each | ||
| 109 | blkio cgroup directory. It is weight-based and there are four knobs | ||
| 110 | for configuration - weight[_device] and leaf_weight[_device]. | ||
| 111 | Internal cgroup nodes (the ones with children) can also have tasks in | ||
| 112 | them, so the former two configure how much proportion the cgroup as a | ||
| 113 | whole is entitled to at its parent's level while the latter two | ||
| 114 | configure how much proportion the tasks in the cgroup have compared to | ||
| 115 | its direct children. | ||
| 116 | |||
| 117 | Another way to think about it is assuming that each internal node has | ||
| 118 | an implicit leaf child node which hosts all the tasks whose weight is | ||
| 119 | configured by leaf_weight[_device]. Let's assume a blkio hierarchy | ||
| 120 | composed of five cgroups - root, A, B, AA and AB - with the following | ||
| 121 | weights where the names represent the hierarchy. | ||
| 122 | |||
| 123 | weight leaf_weight | ||
| 124 | root : 125 125 | ||
| 125 | A : 500 750 | ||
| 126 | B : 250 500 | ||
| 127 | AA : 500 500 | ||
| 128 | AB : 1000 500 | ||
| 129 | |||
| 130 | root never has a parent, making its weight meaningless. For backward | ||
| 131 | compatibility, weight is always kept in sync with leaf_weight. B, AA | ||
| 132 | and AB have no children and thus their tasks have no child cgroups to | ||
| 133 | compete with. They always get 100% of what the cgroup won at the | ||
| 134 | parent level. Considering only the weights which matter, the hierarchy | ||
| 135 | looks like the following. | ||
| 136 | |||
| 137 | root | ||
| 138 | / | \ | ||
| 139 | A B leaf | ||
| 140 | 500 250 125 | ||
| 141 | / | \ | ||
| 142 | AA AB leaf | ||
| 143 | 500 1000 750 | ||
| 144 | |||
| 145 | If all cgroups have active IOs and are competing with each other, disk | ||
| 146 | time will be distributed like the following. | ||
| 147 | |||
| 148 | Distribution below root. The total active weight at this level is | ||
| 149 | A:500 + B:250 + root-leaf:125 = 875. | ||
| 150 | |||
| 151 | root-leaf : 125 / 875 =~ 14% | ||
| 152 | A : 500 / 875 =~ 57% | ||
| 153 | B(-leaf) : 250 / 875 =~ 28% | ||
| 154 | |||
| 155 | A has children and further distributes its 57% among the children and | ||
| 156 | the implicit leaf node. The total active weight at this level is | ||
| 157 | AA:500 + AB:1000 + A-leaf:750 = 2250. | ||
| 158 | |||
| 159 | A-leaf : ( 750 / 2250) * A =~ 19% | ||
| 160 | AA(-leaf) : ( 500 / 2250) * A =~ 12% | ||
| 161 | AB(-leaf) : (1000 / 2250) * A =~ 25% | ||
| 162 | |||
| 105 | CFQ IOPS Mode for group scheduling | 163 | CFQ IOPS Mode for group scheduling |
| 106 | =================================== | 164 | =================================== |
| 107 | Basic CFQ design is to provide priority based time slices. Higher priority | 165 | Basic CFQ design is to provide priority based time slices. Higher priority |
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index b4b1fb3a83f0..1b70843c574e 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt | |||
| @@ -94,13 +94,11 @@ Throttling/Upper Limit policy | |||
| 94 | 94 | ||
| 95 | Hierarchical Cgroups | 95 | Hierarchical Cgroups |
| 96 | ==================== | 96 | ==================== |
| 97 | - Currently none of the IO control policy supports hierarchical groups. But | 97 | - Currently only CFQ supports hierarchical groups. For throttling, |
| 98 | cgroup interface does allow creation of hierarchical cgroups and internally | 98 | cgroup interface does allow creation of hierarchical cgroups and |
| 99 | IO policies treat them as flat hierarchy. | 99 | internally it treats them as flat hierarchy. |
| 100 | 100 | ||
| 101 | So this patch will allow creation of cgroup hierarchcy but at the backend | 101 | If somebody created a hierarchy like the following. |
| 102 | everything will be treated as flat. So if somebody created a hierarchy like | ||
| 103 | as follows. | ||
| 104 | 102 | ||
| 105 | root | 103 | root |
| 106 | / \ | 104 | / \ |
| @@ -108,16 +106,20 @@ Hierarchical Cgroups | |||
| 108 | | | 106 | | |
| 109 | test3 | 107 | test3 |
| 110 | 108 | ||
| 111 | CFQ and throttling will practically treat all groups at same level. | 109 | CFQ will handle the hierarchy correctly but throttling will |
| 110 | practically treat all groups at same level. For details on CFQ | ||
| 111 | hierarchy support, refer to Documentation/block/cfq-iosched.txt. | ||
| 112 | Throttling will treat the hierarchy as if it looks like the | ||
| 113 | following. | ||
| 112 | 114 | ||
| 113 | pivot | 115 | pivot |
| 114 | / / \ \ | 116 | / / \ \ |
| 115 | root test1 test2 test3 | 117 | root test1 test2 test3 |
| 116 | 118 | ||
| 117 | Down the line we can implement hierarchical accounting/control support | 119 | Nesting cgroups, while allowed, isn't officially supported and blkio |
| 118 | and also introduce a new cgroup file "use_hierarchy" which will control | 120 | generates a warning when cgroups nest. Once throttling implements |
| 119 | whether cgroup hierarchy is viewed as flat or hierarchical by the policy.. | 121 | hierarchy support, hierarchy will be supported and the warning will |
| 120 | This is how memory controller also has implemented the things. | 122 | be removed. |
| 121 | 123 | ||
| 122 | Various user visible config options | 124 | Various user visible config options |
| 123 | =================================== | 125 | =================================== |
| @@ -172,6 +174,12 @@ Proportional weight policy files | |||
| 172 | dev weight | 174 | dev weight |
| 173 | 8:16 300 | 175 | 8:16 300 |
| 174 | 176 | ||
| 177 | - blkio.leaf_weight[_device] | ||
| 178 | - Equivalents of blkio.weight[_device] for the purpose of | ||
| 179 | deciding how much weight tasks in the given cgroup have while | ||
| 180 | competing with the cgroup's child cgroups. For details, | ||
| 181 | please refer to Documentation/block/cfq-iosched.txt. | ||
| 182 | |||
| 175 | - blkio.time | 183 | - blkio.time |
| 176 | - disk time allocated to cgroup per device in milliseconds. First | 184 | - disk time allocated to cgroup per device in milliseconds. First |
| 177 | two fields specify the major and minor number of the device and | 185 | two fields specify the major and minor number of the device and |
| @@ -279,6 +287,11 @@ Proportional weight policy files | |||
| 279 | and minor number of the device and third field specifies the number | 287 | and minor number of the device and third field specifies the number |
| 280 | of times a group was dequeued from a particular device. | 288 | of times a group was dequeued from a particular device. |
| 281 | 289 | ||
| 290 | - blkio.*_recursive | ||
| 291 | - Recursive version of various stats. These files show the | ||
| 292 | same information as their non-recursive counterparts but | ||
| 293 | include stats from all the descendant cgroups. | ||
| 294 | |||
| 282 | Throttling/Upper limit policy files | 295 | Throttling/Upper limit policy files |
| 283 | ----------------------------------- | 296 | ----------------------------------- |
| 284 | - blkio.throttle.read_bps_device | 297 | - blkio.throttle.read_bps_device |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee342826fd98..e8f31069b379 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
| @@ -606,20 +606,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) | |||
| 606 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); | 606 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); |
| 607 | } | 607 | } |
| 608 | 608 | ||
| 609 | /* | 609 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) |
| 610 | * Determine the parent cfqg for weight calculation. Currently, cfqg | ||
| 611 | * scheduling is flat and the root is the parent of everyone else. | ||
| 612 | */ | ||
| 613 | static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) | ||
| 614 | { | 610 | { |
| 615 | struct blkcg_gq *blkg = cfqg_to_blkg(cfqg); | 611 | struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; |
| 616 | struct cfq_group *root; | ||
| 617 | |||
| 618 | while (blkg->parent) | ||
| 619 | blkg = blkg->parent; | ||
| 620 | root = blkg_to_cfqg(blkg); | ||
| 621 | 612 | ||
| 622 | return root != cfqg ? root : NULL; | 613 | return pblkg ? blkg_to_cfqg(pblkg) : NULL; |
| 623 | } | 614 | } |
| 624 | 615 | ||
| 625 | static inline void cfqg_get(struct cfq_group *cfqg) | 616 | static inline void cfqg_get(struct cfq_group *cfqg) |
| @@ -722,7 +713,7 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | |||
| 722 | 713 | ||
| 723 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | 714 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ |
| 724 | 715 | ||
| 725 | static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) { return NULL; } | 716 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } |
| 726 | static inline void cfqg_get(struct cfq_group *cfqg) { } | 717 | static inline void cfqg_get(struct cfq_group *cfqg) { } |
| 727 | static inline void cfqg_put(struct cfq_group *cfqg) { } | 718 | static inline void cfqg_put(struct cfq_group *cfqg) { } |
| 728 | 719 | ||
| @@ -1290,7 +1281,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
| 1290 | * stops once an already activated node is met. vfraction | 1281 | * stops once an already activated node is met. vfraction |
| 1291 | * calculation should always continue to the root. | 1282 | * calculation should always continue to the root. |
| 1292 | */ | 1283 | */ |
| 1293 | while ((parent = cfqg_flat_parent(pos))) { | 1284 | while ((parent = cfqg_parent(pos))) { |
| 1294 | if (propagate) { | 1285 | if (propagate) { |
| 1295 | propagate = !parent->nr_active++; | 1286 | propagate = !parent->nr_active++; |
| 1296 | parent->children_weight += pos->weight; | 1287 | parent->children_weight += pos->weight; |
| @@ -1341,7 +1332,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
| 1341 | pos->children_weight -= pos->leaf_weight; | 1332 | pos->children_weight -= pos->leaf_weight; |
| 1342 | 1333 | ||
| 1343 | while (propagate) { | 1334 | while (propagate) { |
| 1344 | struct cfq_group *parent = cfqg_flat_parent(pos); | 1335 | struct cfq_group *parent = cfqg_parent(pos); |
| 1345 | 1336 | ||
| 1346 | /* @pos has 0 nr_active at this point */ | 1337 | /* @pos has 0 nr_active at this point */ |
| 1347 | WARN_ON_ONCE(pos->children_weight); | 1338 | WARN_ON_ONCE(pos->children_weight); |
