diff options
-rw-r--r-- | Documentation/block/cfq-iosched.txt | 58 | ||||
-rw-r--r-- | Documentation/cgroups/blkio-controller.txt | 35 | ||||
-rw-r--r-- | block/cfq-iosched.c | 21 |
3 files changed, 88 insertions, 26 deletions
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt index d89b4fe724d7..a5eb7d19a65d 100644 --- a/Documentation/block/cfq-iosched.txt +++ b/Documentation/block/cfq-iosched.txt | |||
@@ -102,6 +102,64 @@ processing of request. Therefore, increasing the value can improve the | |||
102 | performance although this can cause the latency of some I/O to increase due | 102 | performance although this can cause the latency of some I/O to increase due |
103 | to more number of requests. | 103 | to more number of requests. |
104 | 104 | ||
105 | CFQ Group scheduling | ||
106 | ==================== | ||
107 | |||
108 | CFQ supports blkio cgroup and has "blkio." prefixed files in each | ||
109 | blkio cgroup directory. It is weight-based and there are four knobs | ||
110 | for configuration - weight[_device] and leaf_weight[_device]. | ||
111 | Internal cgroup nodes (the ones with children) can also have tasks in | ||
112 | them, so the former two configure how much proportion the cgroup as a | ||
113 | whole is entitled to at its parent's level while the latter two | ||
114 | configure how much proportion the tasks in the cgroup have compared to | ||
115 | its direct children. | ||
116 | |||
117 | Another way to think about it is assuming that each internal node has | ||
118 | an implicit leaf child node which hosts all the tasks whose weight is | ||
119 | configured by leaf_weight[_device]. Let's assume a blkio hierarchy | ||
120 | composed of five cgroups - root, A, B, AA and AB - with the following | ||
121 | weights where the names represent the hierarchy. | ||
122 | |||
123 | weight leaf_weight | ||
124 | root : 125 125 | ||
125 | A : 500 750 | ||
126 | B : 250 500 | ||
127 | AA : 500 500 | ||
128 | AB : 1000 500 | ||
129 | |||
130 | root never has a parent, making its weight meaningless. For backward | ||
131 | compatibility, weight is always kept in sync with leaf_weight. B, AA | ||
132 | and AB have no children and thus their tasks have no child cgroups to | ||
133 | compete with. They always get 100% of what the cgroup won at the | ||
134 | parent level. Considering only the weights which matter, the hierarchy | ||
135 | looks like the following. | ||
136 | |||
137 | root | ||
138 | / | \ | ||
139 | A B leaf | ||
140 | 500 250 125 | ||
141 | / | \ | ||
142 | AA AB leaf | ||
143 | 500 1000 750 | ||
144 | |||
145 | If all cgroups have active IOs and are competing with each other, disk | ||
146 | time will be distributed like the following. | ||
147 | |||
148 | Distribution below root. The total active weight at this level is | ||
149 | A:500 + B:250 + root-leaf:125 = 875. | ||
150 | |||
151 | root-leaf : 125 / 875 =~ 14% | ||
152 | A : 500 / 875 =~ 57% | ||
153 | B(-leaf) : 250 / 875 =~ 28% | ||
154 | |||
155 | A has children and further distributes its 57% among the children and | ||
156 | the implicit leaf node. The total active weight at this level is | ||
157 | AA:500 + AB:1000 + A-leaf:750 = 2250. | ||
158 | |||
159 | A-leaf : ( 750 / 2250) * A =~ 19% | ||
160 | AA(-leaf) : ( 500 / 2250) * A =~ 12% | ||
161 | AB(-leaf) : (1000 / 2250) * A =~ 25% | ||
162 | |||
105 | CFQ IOPS Mode for group scheduling | 163 | CFQ IOPS Mode for group scheduling |
106 | =================================== | 164 | =================================== |
107 | Basic CFQ design is to provide priority based time slices. Higher priority | 165 | Basic CFQ design is to provide priority based time slices. Higher priority |
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index b4b1fb3a83f0..1b70843c574e 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt | |||
@@ -94,13 +94,11 @@ Throttling/Upper Limit policy | |||
94 | 94 | ||
95 | Hierarchical Cgroups | 95 | Hierarchical Cgroups |
96 | ==================== | 96 | ==================== |
97 | - Currently none of the IO control policy supports hierarchical groups. But | 97 | - Currently only CFQ supports hierarchical groups. For throttling, |
98 | cgroup interface does allow creation of hierarchical cgroups and internally | 98 | cgroup interface does allow creation of hierarchical cgroups and |
99 | IO policies treat them as flat hierarchy. | 99 | internally it treats them as flat hierarchy. |
100 | 100 | ||
101 | So this patch will allow creation of cgroup hierarchcy but at the backend | 101 | If somebody created a hierarchy as follows. |
102 | everything will be treated as flat. So if somebody created a hierarchy like | ||
103 | as follows. | ||
104 | 102 | ||
105 | root | 103 | root |
106 | / \ | 104 | / \ |
@@ -108,16 +106,20 @@ Hierarchical Cgroups | |||
108 | | | 106 | | |
109 | test3 | 107 | test3 |
110 | 108 | ||
111 | CFQ and throttling will practically treat all groups at same level. | 109 | CFQ will handle the hierarchy correctly but throttling will |
110 | practically treat all groups at same level. For details on CFQ | ||
111 | hierarchy support, refer to Documentation/block/cfq-iosched.txt. | ||
112 | Throttling will treat the hierarchy as if it looks like the | ||
113 | following. | ||
112 | 114 | ||
113 | pivot | 115 | pivot |
114 | / / \ \ | 116 | / / \ \ |
115 | root test1 test2 test3 | 117 | root test1 test2 test3 |
116 | 118 | ||
117 | Down the line we can implement hierarchical accounting/control support | 119 | Nesting cgroups, while allowed, isn't officially supported and blkio |
118 | and also introduce a new cgroup file "use_hierarchy" which will control | 120 | generates a warning when cgroups nest. Once throttling implements |
119 | whether cgroup hierarchy is viewed as flat or hierarchical by the policy.. | 121 | hierarchy support, hierarchy will be supported and the warning will |
120 | This is how memory controller also has implemented the things. | 122 | be removed. |
121 | 123 | ||
122 | Various user visible config options | 124 | Various user visible config options |
123 | =================================== | 125 | =================================== |
@@ -172,6 +174,12 @@ Proportional weight policy files | |||
172 | dev weight | 174 | dev weight |
173 | 8:16 300 | 175 | 8:16 300 |
174 | 176 | ||
177 | - blkio.leaf_weight[_device] | ||
178 | - Equivalents of blkio.weight[_device] for the purpose of | ||
179 | deciding how much weight tasks in the given cgroup have while | ||
180 | competing with the cgroup's child cgroups. For details, | ||
181 | please refer to Documentation/block/cfq-iosched.txt. | ||
182 | |||
175 | - blkio.time | 183 | - blkio.time |
176 | - disk time allocated to cgroup per device in milliseconds. First | 184 | - disk time allocated to cgroup per device in milliseconds. First |
177 | two fields specify the major and minor number of the device and | 185 | two fields specify the major and minor number of the device and |
@@ -279,6 +287,11 @@ Proportional weight policy files | |||
279 | and minor number of the device and third field specifies the number | 287 | and minor number of the device and third field specifies the number |
280 | of times a group was dequeued from a particular device. | 288 | of times a group was dequeued from a particular device. |
281 | 289 | ||
290 | - blkio.*_recursive | ||
291 | - Recursive version of various stats. These files show the | ||
292 | same information as their non-recursive counterparts but | ||
293 | include stats from all the descendant cgroups. | ||
294 | |||
282 | Throttling/Upper limit policy files | 295 | Throttling/Upper limit policy files |
283 | ----------------------------------- | 296 | ----------------------------------- |
284 | - blkio.throttle.read_bps_device | 297 | - blkio.throttle.read_bps_device |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee342826fd98..e8f31069b379 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -606,20 +606,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) | |||
606 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); | 606 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); |
607 | } | 607 | } |
608 | 608 | ||
609 | /* | 609 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) |
610 | * Determine the parent cfqg for weight calculation. Currently, cfqg | ||
611 | * scheduling is flat and the root is the parent of everyone else. | ||
612 | */ | ||
613 | static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) | ||
614 | { | 610 | { |
615 | struct blkcg_gq *blkg = cfqg_to_blkg(cfqg); | 611 | struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; |
616 | struct cfq_group *root; | ||
617 | |||
618 | while (blkg->parent) | ||
619 | blkg = blkg->parent; | ||
620 | root = blkg_to_cfqg(blkg); | ||
621 | 612 | ||
622 | return root != cfqg ? root : NULL; | 613 | return pblkg ? blkg_to_cfqg(pblkg) : NULL; |
623 | } | 614 | } |
624 | 615 | ||
625 | static inline void cfqg_get(struct cfq_group *cfqg) | 616 | static inline void cfqg_get(struct cfq_group *cfqg) |
@@ -722,7 +713,7 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | |||
722 | 713 | ||
723 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | 714 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ |
724 | 715 | ||
725 | static inline struct cfq_group *cfqg_flat_parent(struct cfq_group *cfqg) { return NULL; } | 716 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } |
726 | static inline void cfqg_get(struct cfq_group *cfqg) { } | 717 | static inline void cfqg_get(struct cfq_group *cfqg) { } |
727 | static inline void cfqg_put(struct cfq_group *cfqg) { } | 718 | static inline void cfqg_put(struct cfq_group *cfqg) { } |
728 | 719 | ||
@@ -1290,7 +1281,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
1290 | * stops once an already activated node is met. vfraction | 1281 | * stops once an already activated node is met. vfraction |
1291 | * calculation should always continue to the root. | 1282 | * calculation should always continue to the root. |
1292 | */ | 1283 | */ |
1293 | while ((parent = cfqg_flat_parent(pos))) { | 1284 | while ((parent = cfqg_parent(pos))) { |
1294 | if (propagate) { | 1285 | if (propagate) { |
1295 | propagate = !parent->nr_active++; | 1286 | propagate = !parent->nr_active++; |
1296 | parent->children_weight += pos->weight; | 1287 | parent->children_weight += pos->weight; |
@@ -1341,7 +1332,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
1341 | pos->children_weight -= pos->leaf_weight; | 1332 | pos->children_weight -= pos->leaf_weight; |
1342 | 1333 | ||
1343 | while (propagate) { | 1334 | while (propagate) { |
1344 | struct cfq_group *parent = cfqg_flat_parent(pos); | 1335 | struct cfq_group *parent = cfqg_parent(pos); |
1345 | 1336 | ||
1346 | /* @pos has 0 nr_active at this point */ | 1337 | /* @pos has 0 nr_active at this point */ |
1347 | WARN_ON_ONCE(pos->children_weight); | 1338 | WARN_ON_ONCE(pos->children_weight); |