-rw-r--r-- | Documentation/block/cfq-iosched.txt | 58
-rw-r--r-- | Documentation/cgroups/blkio-controller.txt | 35
-rw-r--r-- | block/Kconfig | 1
-rw-r--r-- | block/blk-cgroup.c | 277
-rw-r--r-- | block/blk-cgroup.h | 68
-rw-r--r-- | block/blk-core.c | 18
-rw-r--r-- | block/blk-exec.c | 4
-rw-r--r-- | block/blk-flush.c | 2
-rw-r--r-- | block/blk-lib.c | 6
-rw-r--r-- | block/blk-sysfs.c | 9
-rw-r--r-- | block/blk.h | 2
-rw-r--r-- | block/cfq-iosched.c | 629
-rw-r--r-- | block/elevator.c | 23
-rw-r--r-- | drivers/block/swim3.c | 5
-rw-r--r-- | drivers/md/dm.c | 1
-rw-r--r-- | drivers/md/raid5.c | 11
-rw-r--r-- | fs/bio.c | 2
-rw-r--r-- | fs/block_dev.c | 6
-rw-r--r-- | fs/buffer.c | 10
-rw-r--r-- | fs/fs-writeback.c | 16
-rw-r--r-- | include/linux/blkdev.h | 3
-rw-r--r-- | include/linux/blktrace_api.h | 1
-rw-r--r-- | include/linux/buffer_head.h | 2
-rw-r--r-- | include/linux/completion.h | 3
-rw-r--r-- | include/linux/elevator.h | 5
-rw-r--r-- | include/trace/events/block.h | 104
-rw-r--r-- | include/trace/events/writeback.h | 116
-rw-r--r-- | kernel/sched/core.c | 57
-rw-r--r-- | kernel/trace/blktrace.c | 28
-rw-r--r-- | mm/page-writeback.c | 2
30 files changed, 1246 insertions, 258 deletions
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt index d89b4fe724d7..a5eb7d19a65d 100644 --- a/Documentation/block/cfq-iosched.txt +++ b/Documentation/block/cfq-iosched.txt | |||
@@ -102,6 +102,64 @@ processing of request. Therefore, increasing the value can improve the | |||
102 | performance although this can cause the latency of some I/O to increase due | 102 | performance although this can cause the latency of some I/O to increase due |
103 | to a larger number of requests. | 103 | to a larger number of requests. |
104 | 104 | ||
105 | CFQ Group scheduling | ||
106 | ==================== | ||
107 | |||
108 | CFQ supports blkio cgroup and has "blkio." prefixed files in each | ||
109 | blkio cgroup directory. It is weight-based and there are four knobs | ||
110 | for configuration - weight[_device] and leaf_weight[_device]. | ||
111 | Internal cgroup nodes (the ones with children) can also have tasks in | ||
112 | them, so the former two configure what proportion the cgroup as a | ||
113 | whole is entitled to at its parent's level, while the latter two | ||
114 | configure what proportion the tasks in the cgroup have compared to | ||
115 | its direct children. | ||
116 | |||
117 | Another way to think about it is assuming that each internal node has | ||
118 | an implicit leaf child node which hosts all the tasks whose weight is | ||
119 | configured by leaf_weight[_device]. Let's assume a blkio hierarchy | ||
120 | composed of five cgroups - root, A, B, AA and AB - with the following | ||
121 | weights where the names represent the hierarchy. | ||
122 | |||
123 | weight leaf_weight | ||
124 | root : 125 125 | ||
125 | A : 500 750 | ||
126 | B : 250 500 | ||
127 | AA : 500 500 | ||
128 | AB : 1000 500 | ||
129 | |||
130 | root never has a parent, making its weight meaningless. For backward | ||
131 | compatibility, weight is always kept in sync with leaf_weight. B, AA | ||
132 | and AB have no children, and thus their tasks have no child cgroups to | ||
133 | compete with. They always get 100% of what the cgroup won at the | ||
134 | parent level. Considering only the weights which matter, the hierarchy | ||
135 | looks like the following. | ||
136 | |||
137 | root | ||
138 | / | \ | ||
139 | A B leaf | ||
140 | 500 250 125 | ||
141 | / | \ | ||
142 | AA AB leaf | ||
143 | 500 1000 750 | ||
144 | |||
145 | If all cgroups have active IOs and are competing with each other, disk | ||
146 | time will be distributed like the following. | ||
147 | |||
148 | Distribution below root. The total active weight at this level is | ||
149 | A:500 + B:250 + root-leaf:125 = 875. | ||
150 | |||
151 | root-leaf : 125 / 875 =~ 14% | ||
152 | A : 500 / 875 =~ 57% | ||
153 | B(-leaf) : 250 / 875 =~ 28% | ||
154 | |||
155 | A has children and further distributes its 57% among the children and | ||
156 | the implicit leaf node. The total active weight at this level is | ||
157 | AA:500 + AB:1000 + A-leaf:750 = 2250. | ||
158 | |||
159 | A-leaf : ( 750 / 2250) * A =~ 19% | ||
160 | AA(-leaf) : ( 500 / 2250) * A =~ 12% | ||
161 | AB(-leaf) : (1000 / 2250) * A =~ 25% | ||
162 | |||
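As a quick sanity check on the arithmetic above, here is a minimal userspace C sketch (not kernel code; all numbers come straight from the example weight table) that reproduces the distribution:

	/*
	 * Userspace sketch: reproduce the disk-time split from the example
	 * hierarchy above.  Nothing here is part of the kernel API.
	 */
	#include <stdio.h>

	int main(void)
	{
		/* level below root: A, B and root's implicit leaf */
		unsigned int a = 500, b = 250, root_leaf = 125;
		unsigned int top = a + b + root_leaf;		/* 875 */
		double share_a = (double)a / top;		/* ~57% */

		/* level below A: AA, AB and A's implicit leaf */
		unsigned int aa = 500, ab = 1000, a_leaf = 750;
		unsigned int sub = aa + ab + a_leaf;		/* 2250 */

		printf("root-leaf : %4.1f%%\n", 100.0 * root_leaf / top);
		printf("B         : %4.1f%%\n", 100.0 * b / top);
		printf("A-leaf    : %4.1f%%\n", 100.0 * share_a * a_leaf / sub);
		printf("AA        : %4.1f%%\n", 100.0 * share_a * aa / sub);
		printf("AB        : %4.1f%%\n", 100.0 * share_a * ab / sub);
		return 0;
	}

Run, this prints roughly 14.3, 28.6, 19.0, 12.7 and 25.4 percent, matching the truncated =~ figures above.
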
105 | CFQ IOPS Mode for group scheduling | 163 | CFQ IOPS Mode for group scheduling |
106 | =================================== | 164 | =================================== |
107 | Basic CFQ design is to provide priority based time slices. Higher priority | 165 | Basic CFQ design is to provide priority based time slices. Higher priority |
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index a794ce91a2d5..da272c8f44e7 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt | |||
@@ -94,13 +94,11 @@ Throttling/Upper Limit policy | |||
94 | 94 | ||
95 | Hierarchical Cgroups | 95 | Hierarchical Cgroups |
96 | ==================== | 96 | ==================== |
97 | - Currently none of the IO control policy supports hierarchical groups. But | 97 | - Currently only CFQ supports hierarchical groups. For throttling, |
98 | cgroup interface does allow creation of hierarchical cgroups and internally | 98 | cgroup interface does allow creation of hierarchical cgroups and |
99 | IO policies treat them as flat hierarchy. | 99 | internally it treats them as a flat hierarchy. |
100 | 100 | ||
101 | So this patch will allow creation of cgroup hierarchcy but at the backend | 101 | If somebody creates a hierarchy like the following. |
102 | everything will be treated as flat. So if somebody created a hierarchy like | ||
103 | as follows. | ||
104 | 102 | ||
105 | root | 103 | root |
106 | / \ | 104 | / \ |
@@ -108,16 +106,20 @@ Hierarchical Cgroups | |||
108 | | | 106 | | |
109 | test3 | 107 | test3 |
110 | 108 | ||
111 | CFQ and throttling will practically treat all groups at same level. | 109 | CFQ will handle the hierarchy correctly but throttling will |
110 | practically treat all groups at the same level. For details on CFQ | ||
111 | hierarchy support, refer to Documentation/block/cfq-iosched.txt. | ||
112 | Throttling will treat the hierarchy as if it looks like the | ||
113 | following. | ||
112 | 114 | ||
113 | pivot | 115 | pivot |
114 | / / \ \ | 116 | / / \ \ |
115 | root test1 test2 test3 | 117 | root test1 test2 test3 |
116 | 118 | ||
117 | Down the line we can implement hierarchical accounting/control support | 119 | Nesting cgroups, while allowed, isn't officially supported and blkio |
118 | and also introduce a new cgroup file "use_hierarchy" which will control | 120 | generates a warning when cgroups nest. Once throttling implements |
119 | whether cgroup hierarchy is viewed as flat or hierarchical by the policy.. | 121 | hierarchy support, nesting will be fully supported and the warning will |
120 | This is how memory controller also has implemented the things. | 122 | be removed. |
121 | 123 | ||
122 | Various user visible config options | 124 | Various user visible config options |
123 | =================================== | 125 | =================================== |
@@ -172,6 +174,12 @@ Proportional weight policy files | |||
172 | dev weight | 174 | dev weight |
173 | 8:16 300 | 175 | 8:16 300 |
174 | 176 | ||
177 | - blkio.leaf_weight[_device] | ||
178 | - Equivalents of blkio.weight[_device] for the purpose of | ||
179 | deciding how much weight the tasks in the given cgroup have while | ||
180 | competing with the cgroup's child cgroups. For details, | ||
181 | please refer to Documentation/block/cfq-iosched.txt. | ||
182 | |||
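For illustration, a minimal userspace sketch that sets the weights used in the cfq-iosched.txt example; the /sys/fs/cgroup/blkio mount point and the cgroup name "A" are assumptions about the local setup, not something this patch mandates (a shell echo into the same files works equally well):

	/* Userspace sketch: configure weight and leaf_weight for one cgroup. */
	#include <stdio.h>

	static int write_val(const char *path, int val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%d\n", val);
		return fclose(f);
	}

	int main(void)
	{
		write_val("/sys/fs/cgroup/blkio/A/blkio.weight", 500);
		write_val("/sys/fs/cgroup/blkio/A/blkio.leaf_weight", 750);
		return 0;
	}
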
175 | - blkio.time | 183 | - blkio.time |
176 | - disk time allocated to cgroup per device in milliseconds. First | 184 | - disk time allocated to cgroup per device in milliseconds. First |
177 | two fields specify the major and minor number of the device and | 185 | two fields specify the major and minor number of the device and |
@@ -279,6 +287,11 @@ Proportional weight policy files | |||
279 | and minor number of the device and third field specifies the number | 287 | and minor number of the device and third field specifies the number |
280 | of times a group was dequeued from a particular device. | 288 | of times a group was dequeued from a particular device. |
281 | 289 | ||
290 | - blkio.*_recursive | ||
291 | - Recursive version of various stats. These files show the | ||
292 | same information as their non-recursive counterparts but | ||
293 | include stats from all the descendant cgroups. | ||
294 | |||
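A matching read-side sketch for the recursive stats; again the mount point, the cgroup path, and the choice of blkio.io_service_bytes_recursive as the example file are assumptions about the local setup:

	/* Userspace sketch: dump one recursive stat file for cgroup "A". */
	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/sys/fs/cgroup/blkio/A/blkio.io_service_bytes_recursive",
				"r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* "major:minor op value" rows */
		return fclose(f);
	}
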
282 | Throttling/Upper limit policy files | 295 | Throttling/Upper limit policy files |
283 | ----------------------------------- | 296 | ----------------------------------- |
284 | - blkio.throttle.read_bps_device | 297 | - blkio.throttle.read_bps_device |
diff --git a/block/Kconfig b/block/Kconfig index 4a85ccf8d4cf..a7e40a7c8214 100644 --- a/block/Kconfig +++ b/block/Kconfig | |||
@@ -4,7 +4,6 @@ | |||
4 | menuconfig BLOCK | 4 | menuconfig BLOCK |
5 | bool "Enable the block layer" if EXPERT | 5 | bool "Enable the block layer" if EXPERT |
6 | default y | 6 | default y |
7 | select PERCPU_RWSEM | ||
8 | help | 7 | help |
9 | Provide block layer support for the kernel. | 8 | Provide block layer support for the kernel. |
10 | 9 | ||
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8bdebb6781e1..b2b9837f9dd3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
@@ -26,11 +26,32 @@ | |||
26 | 26 | ||
27 | static DEFINE_MUTEX(blkcg_pol_mutex); | 27 | static DEFINE_MUTEX(blkcg_pol_mutex); |
28 | 28 | ||
29 | struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; | 29 | struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, |
30 | .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; | ||
30 | EXPORT_SYMBOL_GPL(blkcg_root); | 31 | EXPORT_SYMBOL_GPL(blkcg_root); |
31 | 32 | ||
32 | static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; | 33 | static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; |
33 | 34 | ||
35 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | ||
36 | struct request_queue *q, bool update_hint); | ||
37 | |||
38 | /** | ||
39 | * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants | ||
40 | * @d_blkg: loop cursor pointing to the current descendant | ||
41 | * @pos_cgrp: used for iteration | ||
42 | * @p_blkg: target blkg to walk descendants of | ||
43 | * | ||
44 | * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU | ||
45 | * read locked. If called under either blkcg or queue lock, the iteration | ||
46 | * is guaranteed to include all and only online blkgs. The caller may | ||
47 | * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip | ||
48 | * subtree. | ||
49 | */ | ||
50 | #define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ | ||
51 | cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ | ||
52 | if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ | ||
53 | (p_blkg)->q, false))) | ||
54 | |||
34 | static bool blkcg_policy_enabled(struct request_queue *q, | 55 | static bool blkcg_policy_enabled(struct request_queue *q, |
35 | const struct blkcg_policy *pol) | 56 | const struct blkcg_policy *pol) |
36 | { | 57 | { |
@@ -112,9 +133,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
112 | 133 | ||
113 | blkg->pd[i] = pd; | 134 | blkg->pd[i] = pd; |
114 | pd->blkg = blkg; | 135 | pd->blkg = blkg; |
136 | pd->plid = i; | ||
115 | 137 | ||
116 | /* invoke per-policy init */ | 138 | /* invoke per-policy init */ |
117 | if (blkcg_policy_enabled(blkg->q, pol)) | 139 | if (pol->pd_init_fn) |
118 | pol->pd_init_fn(blkg); | 140 | pol->pd_init_fn(blkg); |
119 | } | 141 | } |
120 | 142 | ||
@@ -125,8 +147,19 @@ err_free: | |||
125 | return NULL; | 147 | return NULL; |
126 | } | 148 | } |
127 | 149 | ||
150 | /** | ||
151 | * __blkg_lookup - internal version of blkg_lookup() | ||
152 | * @blkcg: blkcg of interest | ||
153 | * @q: request_queue of interest | ||
154 | * @update_hint: whether to update lookup hint with the result or not | ||
155 | * | ||
156 | * This is the internal version and shouldn't be used by policy | ||
157 | * implementations. Looks up blkgs for the @blkcg - @q pair regardless of | ||
158 | * @q's bypass state. If @update_hint is %true, the caller should be | ||
159 | * holding @q->queue_lock and lookup hint is updated on success. | ||
160 | */ | ||
128 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | 161 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, |
129 | struct request_queue *q) | 162 | struct request_queue *q, bool update_hint) |
130 | { | 163 | { |
131 | struct blkcg_gq *blkg; | 164 | struct blkcg_gq *blkg; |
132 | 165 | ||
@@ -135,14 +168,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | |||
135 | return blkg; | 168 | return blkg; |
136 | 169 | ||
137 | /* | 170 | /* |
138 | * Hint didn't match. Look up from the radix tree. Note that we | 171 | * Hint didn't match. Look up from the radix tree. Note that the |
139 | * may not be holding queue_lock and thus are not sure whether | 172 | * hint can only be updated under queue_lock as otherwise @blkg |
140 | * @blkg from blkg_tree has already been removed or not, so we | 173 | * could have already been removed from blkg_tree. The caller is |
141 | * can't update hint to the lookup result. Leave it to the caller. | 174 | * responsible for grabbing queue_lock if @update_hint. |
142 | */ | 175 | */ |
143 | blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); | 176 | blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); |
144 | if (blkg && blkg->q == q) | 177 | if (blkg && blkg->q == q) { |
178 | if (update_hint) { | ||
179 | lockdep_assert_held(q->queue_lock); | ||
180 | rcu_assign_pointer(blkcg->blkg_hint, blkg); | ||
181 | } | ||
145 | return blkg; | 182 | return blkg; |
183 | } | ||
146 | 184 | ||
147 | return NULL; | 185 | return NULL; |
148 | } | 186 | } |
@@ -162,7 +200,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) | |||
162 | 200 | ||
163 | if (unlikely(blk_queue_bypass(q))) | 201 | if (unlikely(blk_queue_bypass(q))) |
164 | return NULL; | 202 | return NULL; |
165 | return __blkg_lookup(blkcg, q); | 203 | return __blkg_lookup(blkcg, q, false); |
166 | } | 204 | } |
167 | EXPORT_SYMBOL_GPL(blkg_lookup); | 205 | EXPORT_SYMBOL_GPL(blkg_lookup); |
168 | 206 | ||
@@ -170,75 +208,129 @@ EXPORT_SYMBOL_GPL(blkg_lookup); | |||
170 | * If @new_blkg is %NULL, this function tries to allocate a new one as | 208 | * If @new_blkg is %NULL, this function tries to allocate a new one as |
171 | * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. | 209 | * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. |
172 | */ | 210 | */ |
173 | static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, | 211 | static struct blkcg_gq *blkg_create(struct blkcg *blkcg, |
174 | struct request_queue *q, | 212 | struct request_queue *q, |
175 | struct blkcg_gq *new_blkg) | 213 | struct blkcg_gq *new_blkg) |
176 | { | 214 | { |
177 | struct blkcg_gq *blkg; | 215 | struct blkcg_gq *blkg; |
178 | int ret; | 216 | int i, ret; |
179 | 217 | ||
180 | WARN_ON_ONCE(!rcu_read_lock_held()); | 218 | WARN_ON_ONCE(!rcu_read_lock_held()); |
181 | lockdep_assert_held(q->queue_lock); | 219 | lockdep_assert_held(q->queue_lock); |
182 | 220 | ||
183 | /* lookup and update hint on success, see __blkg_lookup() for details */ | ||
184 | blkg = __blkg_lookup(blkcg, q); | ||
185 | if (blkg) { | ||
186 | rcu_assign_pointer(blkcg->blkg_hint, blkg); | ||
187 | goto out_free; | ||
188 | } | ||
189 | |||
190 | /* blkg holds a reference to blkcg */ | 221 | /* blkg holds a reference to blkcg */ |
191 | if (!css_tryget(&blkcg->css)) { | 222 | if (!css_tryget(&blkcg->css)) { |
192 | blkg = ERR_PTR(-EINVAL); | 223 | ret = -EINVAL; |
193 | goto out_free; | 224 | goto err_free_blkg; |
194 | } | 225 | } |
195 | 226 | ||
196 | /* allocate */ | 227 | /* allocate */ |
197 | if (!new_blkg) { | 228 | if (!new_blkg) { |
198 | new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); | 229 | new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); |
199 | if (unlikely(!new_blkg)) { | 230 | if (unlikely(!new_blkg)) { |
200 | blkg = ERR_PTR(-ENOMEM); | 231 | ret = -ENOMEM; |
201 | goto out_put; | 232 | goto err_put_css; |
202 | } | 233 | } |
203 | } | 234 | } |
204 | blkg = new_blkg; | 235 | blkg = new_blkg; |
205 | 236 | ||
206 | /* insert */ | 237 | /* link parent and insert */ |
238 | if (blkcg_parent(blkcg)) { | ||
239 | blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); | ||
240 | if (WARN_ON_ONCE(!blkg->parent)) { | ||
241 | blkg = ERR_PTR(-EINVAL); | ||
242 | goto err_put_css; | ||
243 | } | ||
244 | blkg_get(blkg->parent); | ||
245 | } | ||
246 | |||
207 | spin_lock(&blkcg->lock); | 247 | spin_lock(&blkcg->lock); |
208 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); | 248 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); |
209 | if (likely(!ret)) { | 249 | if (likely(!ret)) { |
210 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | 250 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
211 | list_add(&blkg->q_node, &q->blkg_list); | 251 | list_add(&blkg->q_node, &q->blkg_list); |
252 | |||
253 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | ||
254 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
255 | |||
256 | if (blkg->pd[i] && pol->pd_online_fn) | ||
257 | pol->pd_online_fn(blkg); | ||
258 | } | ||
212 | } | 259 | } |
260 | blkg->online = true; | ||
213 | spin_unlock(&blkcg->lock); | 261 | spin_unlock(&blkcg->lock); |
214 | 262 | ||
215 | if (!ret) | 263 | if (!ret) |
216 | return blkg; | 264 | return blkg; |
217 | 265 | ||
218 | blkg = ERR_PTR(ret); | 266 | /* @blkg failed to initialize fully, use the usual release path */ |
219 | out_put: | 267 | blkg_put(blkg); |
268 | return ERR_PTR(ret); | ||
269 | |||
270 | err_put_css: | ||
220 | css_put(&blkcg->css); | 271 | css_put(&blkcg->css); |
221 | out_free: | 272 | err_free_blkg: |
222 | blkg_free(new_blkg); | 273 | blkg_free(new_blkg); |
223 | return blkg; | 274 | return ERR_PTR(ret); |
224 | } | 275 | } |
225 | 276 | ||
277 | /** | ||
278 | * blkg_lookup_create - lookup blkg, try to create one if not there | ||
279 | * @blkcg: blkcg of interest | ||
280 | * @q: request_queue of interest | ||
281 | * | ||
282 | * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to | ||
283 | * create one. blkg creation is performed recursively from blkcg_root such | ||
284 | * that all non-root blkg's have access to the parent blkg. This function | ||
285 | * should be called under RCU read lock and @q->queue_lock. | ||
286 | * | ||
287 | * Returns pointer to the looked up or created blkg on success, ERR_PTR() | ||
288 | * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not | ||
289 | * dead and bypassing, returns ERR_PTR(-EBUSY). | ||
290 | */ | ||
226 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | 291 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, |
227 | struct request_queue *q) | 292 | struct request_queue *q) |
228 | { | 293 | { |
294 | struct blkcg_gq *blkg; | ||
295 | |||
296 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
297 | lockdep_assert_held(q->queue_lock); | ||
298 | |||
229 | /* | 299 | /* |
230 | * This could be the first entry point of blkcg implementation and | 300 | * This could be the first entry point of blkcg implementation and |
231 | * we shouldn't allow anything to go through for a bypassing queue. | 301 | * we shouldn't allow anything to go through for a bypassing queue. |
232 | */ | 302 | */ |
233 | if (unlikely(blk_queue_bypass(q))) | 303 | if (unlikely(blk_queue_bypass(q))) |
234 | return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); | 304 | return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); |
235 | return __blkg_lookup_create(blkcg, q, NULL); | 305 | |
306 | blkg = __blkg_lookup(blkcg, q, true); | ||
307 | if (blkg) | ||
308 | return blkg; | ||
309 | |||
310 | /* | ||
311 | * Create blkgs walking down from blkcg_root to @blkcg, so that all | ||
312 | * non-root blkgs have access to their parents. | ||
313 | */ | ||
314 | while (true) { | ||
315 | struct blkcg *pos = blkcg; | ||
316 | struct blkcg *parent = blkcg_parent(blkcg); | ||
317 | |||
318 | while (parent && !__blkg_lookup(parent, q, false)) { | ||
319 | pos = parent; | ||
320 | parent = blkcg_parent(parent); | ||
321 | } | ||
322 | |||
323 | blkg = blkg_create(pos, q, NULL); | ||
324 | if (pos == blkcg || IS_ERR(blkg)) | ||
325 | return blkg; | ||
326 | } | ||
236 | } | 327 | } |
237 | EXPORT_SYMBOL_GPL(blkg_lookup_create); | 328 | EXPORT_SYMBOL_GPL(blkg_lookup_create); |
238 | 329 | ||
239 | static void blkg_destroy(struct blkcg_gq *blkg) | 330 | static void blkg_destroy(struct blkcg_gq *blkg) |
240 | { | 331 | { |
241 | struct blkcg *blkcg = blkg->blkcg; | 332 | struct blkcg *blkcg = blkg->blkcg; |
333 | int i; | ||
242 | 334 | ||
243 | lockdep_assert_held(blkg->q->queue_lock); | 335 | lockdep_assert_held(blkg->q->queue_lock); |
244 | lockdep_assert_held(&blkcg->lock); | 336 | lockdep_assert_held(&blkcg->lock); |
@@ -247,6 +339,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) | |||
247 | WARN_ON_ONCE(list_empty(&blkg->q_node)); | 339 | WARN_ON_ONCE(list_empty(&blkg->q_node)); |
248 | WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); | 340 | WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); |
249 | 341 | ||
342 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | ||
343 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
344 | |||
345 | if (blkg->pd[i] && pol->pd_offline_fn) | ||
346 | pol->pd_offline_fn(blkg); | ||
347 | } | ||
348 | blkg->online = false; | ||
349 | |||
250 | radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); | 350 | radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); |
251 | list_del_init(&blkg->q_node); | 351 | list_del_init(&blkg->q_node); |
252 | hlist_del_init_rcu(&blkg->blkcg_node); | 352 | hlist_del_init_rcu(&blkg->blkcg_node); |
@@ -301,8 +401,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head) | |||
301 | 401 | ||
302 | void __blkg_release(struct blkcg_gq *blkg) | 402 | void __blkg_release(struct blkcg_gq *blkg) |
303 | { | 403 | { |
304 | /* release the extra blkcg reference this blkg has been holding */ | 404 | /* release the blkcg and parent blkg refs this blkg has been holding */ |
305 | css_put(&blkg->blkcg->css); | 405 | css_put(&blkg->blkcg->css); |
406 | if (blkg->parent) | ||
407 | blkg_put(blkg->parent); | ||
306 | 408 | ||
307 | /* | 409 | /* |
308 | * A group is freed in rcu manner. But having an rcu lock does not | 410 | * A group is freed in rcu manner. But having an rcu lock does not |
@@ -401,8 +503,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg) | |||
401 | * | 503 | * |
402 | * This function invokes @prfill on each blkg of @blkcg if pd for the | 504 | * This function invokes @prfill on each blkg of @blkcg if pd for the |
403 | * policy specified by @pol exists. @prfill is invoked with @sf, the | 505 | * policy specified by @pol exists. @prfill is invoked with @sf, the |
404 | * policy data and @data. If @show_total is %true, the sum of the return | 506 | * policy data and @data and the matching queue lock held. If @show_total |
405 | * values from @prfill is printed with "Total" label at the end. | 507 | * is %true, the sum of the return values from @prfill is printed with |
508 | * "Total" label at the end. | ||
406 | * | 509 | * |
407 | * This is to be used to construct print functions for | 510 | * This is to be used to construct print functions for |
408 | * cftype->read_seq_string method. | 511 | * cftype->read_seq_string method. |
@@ -416,11 +519,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, | |||
416 | struct blkcg_gq *blkg; | 519 | struct blkcg_gq *blkg; |
417 | u64 total = 0; | 520 | u64 total = 0; |
418 | 521 | ||
419 | spin_lock_irq(&blkcg->lock); | 522 | rcu_read_lock(); |
420 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) | 523 | hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { |
524 | spin_lock_irq(blkg->q->queue_lock); | ||
421 | if (blkcg_policy_enabled(blkg->q, pol)) | 525 | if (blkcg_policy_enabled(blkg->q, pol)) |
422 | total += prfill(sf, blkg->pd[pol->plid], data); | 526 | total += prfill(sf, blkg->pd[pol->plid], data); |
423 | spin_unlock_irq(&blkcg->lock); | 527 | spin_unlock_irq(blkg->q->queue_lock); |
528 | } | ||
529 | rcu_read_unlock(); | ||
424 | 530 | ||
425 | if (show_total) | 531 | if (show_total) |
426 | seq_printf(sf, "Total %llu\n", (unsigned long long)total); | 532 | seq_printf(sf, "Total %llu\n", (unsigned long long)total); |
@@ -479,6 +585,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
479 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); | 585 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); |
480 | return v; | 586 | return v; |
481 | } | 587 | } |
588 | EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); | ||
482 | 589 | ||
483 | /** | 590 | /** |
484 | * blkg_prfill_stat - prfill callback for blkg_stat | 591 | * blkg_prfill_stat - prfill callback for blkg_stat |
@@ -512,6 +619,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | |||
512 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); | 619 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); |
513 | 620 | ||
514 | /** | 621 | /** |
622 | * blkg_stat_recursive_sum - collect hierarchical blkg_stat | ||
623 | * @pd: policy private data of interest | ||
624 | * @off: offset to the blkg_stat in @pd | ||
625 | * | ||
626 | * Collect the blkg_stat specified by @off from @pd and all its online | ||
627 | * descendants and return the sum. The caller must be holding the queue | ||
628 | * lock for online tests. | ||
629 | */ | ||
630 | u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) | ||
631 | { | ||
632 | struct blkcg_policy *pol = blkcg_policy[pd->plid]; | ||
633 | struct blkcg_gq *pos_blkg; | ||
634 | struct cgroup *pos_cgrp; | ||
635 | u64 sum; | ||
636 | |||
637 | lockdep_assert_held(pd->blkg->q->queue_lock); | ||
638 | |||
639 | sum = blkg_stat_read((void *)pd + off); | ||
640 | |||
641 | rcu_read_lock(); | ||
642 | blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { | ||
643 | struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); | ||
644 | struct blkg_stat *stat = (void *)pos_pd + off; | ||
645 | |||
646 | if (pos_blkg->online) | ||
647 | sum += blkg_stat_read(stat); | ||
648 | } | ||
649 | rcu_read_unlock(); | ||
650 | |||
651 | return sum; | ||
652 | } | ||
653 | EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); | ||
654 | |||
655 | /** | ||
656 | * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat | ||
657 | * @pd: policy private data of interest | ||
658 | * @off: offset to the blkg_rwstat in @pd | ||
659 | * | ||
660 | * Collect the blkg_rwstat specified by @off from @pd and all its online | ||
661 | * descendants and return the sum. The caller must be holding the queue | ||
662 | * lock for online tests. | ||
663 | */ | ||
664 | struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, | ||
665 | int off) | ||
666 | { | ||
667 | struct blkcg_policy *pol = blkcg_policy[pd->plid]; | ||
668 | struct blkcg_gq *pos_blkg; | ||
669 | struct cgroup *pos_cgrp; | ||
670 | struct blkg_rwstat sum; | ||
671 | int i; | ||
672 | |||
673 | lockdep_assert_held(pd->blkg->q->queue_lock); | ||
674 | |||
675 | sum = blkg_rwstat_read((void *)pd + off); | ||
676 | |||
677 | rcu_read_lock(); | ||
678 | blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { | ||
679 | struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); | ||
680 | struct blkg_rwstat *rwstat = (void *)pos_pd + off; | ||
681 | struct blkg_rwstat tmp; | ||
682 | |||
683 | if (!pos_blkg->online) | ||
684 | continue; | ||
685 | |||
686 | tmp = blkg_rwstat_read(rwstat); | ||
687 | |||
688 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | ||
689 | sum.cnt[i] += tmp.cnt[i]; | ||
690 | } | ||
691 | rcu_read_unlock(); | ||
692 | |||
693 | return sum; | ||
694 | } | ||
695 | EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); | ||
696 | |||
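As a usage note, a policy would typically expose these sums through a prfill callback. A hedged sketch, using only helpers visible in this patch (the callback name is illustrative and the surrounding cftype wiring is omitted):

	/*
	 * Hypothetical policy-side prfill callback: sum the blkg_rwstat at
	 * @off across the subtree and print it.  Only uses
	 * blkg_rwstat_recursive_sum() and __blkg_prfill_rwstat() from this
	 * patch; the cftype glue is assumed.
	 */
	static u64 prfill_rwstat_recursive(struct seq_file *sf,
					   struct blkg_policy_data *pd, int off)
	{
		struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off);

		return __blkg_prfill_rwstat(sf, pd, &sum);
	}
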
697 | /** | ||
515 | * blkg_conf_prep - parse and prepare for per-blkg config update | 698 | * blkg_conf_prep - parse and prepare for per-blkg config update |
516 | * @blkcg: target block cgroup | 699 | * @blkcg: target block cgroup |
517 | * @pol: target policy | 700 | * @pol: target policy |
@@ -656,6 +839,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) | |||
656 | return ERR_PTR(-ENOMEM); | 839 | return ERR_PTR(-ENOMEM); |
657 | 840 | ||
658 | blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; | 841 | blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; |
842 | blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; | ||
659 | blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ | 843 | blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ |
660 | done: | 844 | done: |
661 | spin_lock_init(&blkcg->lock); | 845 | spin_lock_init(&blkcg->lock); |
@@ -775,7 +959,7 @@ int blkcg_activate_policy(struct request_queue *q, | |||
775 | const struct blkcg_policy *pol) | 959 | const struct blkcg_policy *pol) |
776 | { | 960 | { |
777 | LIST_HEAD(pds); | 961 | LIST_HEAD(pds); |
778 | struct blkcg_gq *blkg; | 962 | struct blkcg_gq *blkg, *new_blkg; |
779 | struct blkg_policy_data *pd, *n; | 963 | struct blkg_policy_data *pd, *n; |
780 | int cnt = 0, ret; | 964 | int cnt = 0, ret; |
781 | bool preloaded; | 965 | bool preloaded; |
@@ -784,19 +968,27 @@ int blkcg_activate_policy(struct request_queue *q, | |||
784 | return 0; | 968 | return 0; |
785 | 969 | ||
786 | /* preallocations for root blkg */ | 970 | /* preallocations for root blkg */ |
787 | blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); | 971 | new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); |
788 | if (!blkg) | 972 | if (!new_blkg) |
789 | return -ENOMEM; | 973 | return -ENOMEM; |
790 | 974 | ||
791 | preloaded = !radix_tree_preload(GFP_KERNEL); | 975 | preloaded = !radix_tree_preload(GFP_KERNEL); |
792 | 976 | ||
793 | blk_queue_bypass_start(q); | 977 | blk_queue_bypass_start(q); |
794 | 978 | ||
795 | /* make sure the root blkg exists and count the existing blkgs */ | 979 | /* |
980 | * Make sure the root blkg exists and count the existing blkgs. As | ||
981 | * @q is bypassing at this point, blkg_lookup_create() can't be | ||
982 | * used. Open code it. | ||
983 | */ | ||
796 | spin_lock_irq(q->queue_lock); | 984 | spin_lock_irq(q->queue_lock); |
797 | 985 | ||
798 | rcu_read_lock(); | 986 | rcu_read_lock(); |
799 | blkg = __blkg_lookup_create(&blkcg_root, q, blkg); | 987 | blkg = __blkg_lookup(&blkcg_root, q, false); |
988 | if (blkg) | ||
989 | blkg_free(new_blkg); | ||
990 | else | ||
991 | blkg = blkg_create(&blkcg_root, q, new_blkg); | ||
800 | rcu_read_unlock(); | 992 | rcu_read_unlock(); |
801 | 993 | ||
802 | if (preloaded) | 994 | if (preloaded) |
@@ -844,6 +1036,7 @@ int blkcg_activate_policy(struct request_queue *q, | |||
844 | 1036 | ||
845 | blkg->pd[pol->plid] = pd; | 1037 | blkg->pd[pol->plid] = pd; |
846 | pd->blkg = blkg; | 1038 | pd->blkg = blkg; |
1039 | pd->plid = pol->plid; | ||
847 | pol->pd_init_fn(blkg); | 1040 | pol->pd_init_fn(blkg); |
848 | 1041 | ||
849 | spin_unlock(&blkg->blkcg->lock); | 1042 | spin_unlock(&blkg->blkcg->lock); |
@@ -890,6 +1083,8 @@ void blkcg_deactivate_policy(struct request_queue *q, | |||
890 | /* grab blkcg lock too while removing @pd from @blkg */ | 1083 | /* grab blkcg lock too while removing @pd from @blkg */ |
891 | spin_lock(&blkg->blkcg->lock); | 1084 | spin_lock(&blkg->blkcg->lock); |
892 | 1085 | ||
1086 | if (pol->pd_offline_fn) | ||
1087 | pol->pd_offline_fn(blkg); | ||
893 | if (pol->pd_exit_fn) | 1088 | if (pol->pd_exit_fn) |
894 | pol->pd_exit_fn(blkg); | 1089 | pol->pd_exit_fn(blkg); |
895 | 1090 | ||
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 24597309e23d..f2b292925ccd 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
@@ -54,6 +54,7 @@ struct blkcg { | |||
54 | 54 | ||
55 | /* TODO: per-policy storage in blkcg */ | 55 | /* TODO: per-policy storage in blkcg */ |
56 | unsigned int cfq_weight; /* belongs to cfq */ | 56 | unsigned int cfq_weight; /* belongs to cfq */ |
57 | unsigned int cfq_leaf_weight; | ||
57 | }; | 58 | }; |
58 | 59 | ||
59 | struct blkg_stat { | 60 | struct blkg_stat { |
@@ -80,8 +81,9 @@ struct blkg_rwstat { | |||
80 | * beginning and pd_size can't be smaller than pd. | 81 | * beginning and pd_size can't be smaller than pd. |
81 | */ | 82 | */ |
82 | struct blkg_policy_data { | 83 | struct blkg_policy_data { |
83 | /* the blkg this per-policy data belongs to */ | 84 | /* the blkg and policy id this per-policy data belongs to */ |
84 | struct blkcg_gq *blkg; | 85 | struct blkcg_gq *blkg; |
86 | int plid; | ||
85 | 87 | ||
86 | /* used during policy activation */ | 88 | /* used during policy activation */ |
87 | struct list_head alloc_node; | 89 | struct list_head alloc_node; |
@@ -94,17 +96,27 @@ struct blkcg_gq { | |||
94 | struct list_head q_node; | 96 | struct list_head q_node; |
95 | struct hlist_node blkcg_node; | 97 | struct hlist_node blkcg_node; |
96 | struct blkcg *blkcg; | 98 | struct blkcg *blkcg; |
99 | |||
100 | /* all non-root blkcg_gq's are guaranteed to have access to parent */ | ||
101 | struct blkcg_gq *parent; | ||
102 | |||
97 | /* request allocation list for this blkcg-q pair */ | 103 | /* request allocation list for this blkcg-q pair */ |
98 | struct request_list rl; | 104 | struct request_list rl; |
105 | |||
99 | /* reference count */ | 106 | /* reference count */ |
100 | int refcnt; | 107 | int refcnt; |
101 | 108 | ||
109 | /* is this blkg online? protected by both blkcg and q locks */ | ||
110 | bool online; | ||
111 | |||
102 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; | 112 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; |
103 | 113 | ||
104 | struct rcu_head rcu_head; | 114 | struct rcu_head rcu_head; |
105 | }; | 115 | }; |
106 | 116 | ||
107 | typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); | 117 | typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); |
118 | typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); | ||
119 | typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); | ||
108 | typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); | 120 | typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); |
109 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); | 121 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); |
110 | 122 | ||
@@ -117,6 +129,8 @@ struct blkcg_policy { | |||
117 | 129 | ||
118 | /* operations */ | 130 | /* operations */ |
119 | blkcg_pol_init_pd_fn *pd_init_fn; | 131 | blkcg_pol_init_pd_fn *pd_init_fn; |
132 | blkcg_pol_online_pd_fn *pd_online_fn; | ||
133 | blkcg_pol_offline_pd_fn *pd_offline_fn; | ||
120 | blkcg_pol_exit_pd_fn *pd_exit_fn; | 134 | blkcg_pol_exit_pd_fn *pd_exit_fn; |
121 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; | 135 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; |
122 | }; | 136 | }; |
@@ -150,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); | |||
150 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | 164 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, |
151 | int off); | 165 | int off); |
152 | 166 | ||
167 | u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); | ||
168 | struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, | ||
169 | int off); | ||
170 | |||
153 | struct blkg_conf_ctx { | 171 | struct blkg_conf_ctx { |
154 | struct gendisk *disk; | 172 | struct gendisk *disk; |
155 | struct blkcg_gq *blkg; | 173 | struct blkcg_gq *blkg; |
@@ -181,6 +199,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) | |||
181 | } | 199 | } |
182 | 200 | ||
183 | /** | 201 | /** |
202 | * blkcg_parent - get the parent of a blkcg | ||
203 | * @blkcg: blkcg of interest | ||
204 | * | ||
205 | * Return the parent blkcg of @blkcg. Can be called anytime. | ||
206 | */ | ||
207 | static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) | ||
208 | { | ||
209 | struct cgroup *pcg = blkcg->css.cgroup->parent; | ||
210 | |||
211 | return pcg ? cgroup_to_blkcg(pcg) : NULL; | ||
212 | } | ||
213 | |||
214 | /** | ||
184 | * blkg_to_pdata - get policy private data | 215 | * blkg_to_pdata - get policy private data |
185 | * @blkg: blkg of interest | 216 | * @blkg: blkg of interest |
186 | * @pol: policy of interest | 217 | * @pol: policy of interest |
@@ -387,6 +418,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat) | |||
387 | } | 418 | } |
388 | 419 | ||
389 | /** | 420 | /** |
421 | * blkg_stat_merge - merge a blkg_stat into another | ||
422 | * @to: the destination blkg_stat | ||
423 | * @from: the source | ||
424 | * | ||
425 | * Add @from's count to @to. | ||
426 | */ | ||
427 | static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) | ||
428 | { | ||
429 | blkg_stat_add(to, blkg_stat_read(from)); | ||
430 | } | ||
431 | |||
432 | /** | ||
390 | * blkg_rwstat_add - add a value to a blkg_rwstat | 433 | * blkg_rwstat_add - add a value to a blkg_rwstat |
391 | * @rwstat: target blkg_rwstat | 434 | * @rwstat: target blkg_rwstat |
392 | * @rw: mask of REQ_{WRITE|SYNC} | 435 | * @rw: mask of REQ_{WRITE|SYNC} |
@@ -434,14 +477,14 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) | |||
434 | } | 477 | } |
435 | 478 | ||
436 | /** | 479 | /** |
437 | * blkg_rwstat_sum - read the total count of a blkg_rwstat | 480 | * blkg_rwstat_total - read the total count of a blkg_rwstat |
438 | * @rwstat: blkg_rwstat to read | 481 | * @rwstat: blkg_rwstat to read |
439 | * | 482 | * |
440 | * Return the total count of @rwstat regardless of the IO direction. This | 483 | * Return the total count of @rwstat regardless of the IO direction. This |
441 | * function can be called without synchronization and takes care of u64 | 484 | * function can be called without synchronization and takes care of u64 |
442 | * atomicity. | 485 | * atomicity. |
443 | */ | 486 | */ |
444 | static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) | 487 | static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) |
445 | { | 488 | { |
446 | struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); | 489 | struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); |
447 | 490 | ||
@@ -457,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) | |||
457 | memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); | 500 | memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); |
458 | } | 501 | } |
459 | 502 | ||
503 | /** | ||
504 | * blkg_rwstat_merge - merge a blkg_rwstat into another | ||
505 | * @to: the destination blkg_rwstat | ||
506 | * @from: the source | ||
507 | * | ||
508 | * Add @from's counts to @to. | ||
509 | */ | ||
510 | static inline void blkg_rwstat_merge(struct blkg_rwstat *to, | ||
511 | struct blkg_rwstat *from) | ||
512 | { | ||
513 | struct blkg_rwstat v = blkg_rwstat_read(from); | ||
514 | int i; | ||
515 | |||
516 | u64_stats_update_begin(&to->syncp); | ||
517 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | ||
518 | to->cnt[i] += v.cnt[i]; | ||
519 | u64_stats_update_end(&to->syncp); | ||
520 | } | ||
521 | |||
460 | #else /* CONFIG_BLK_CGROUP */ | 522 | #else /* CONFIG_BLK_CGROUP */ |
461 | 523 | ||
462 | struct cgroup; | 524 | struct cgroup; |
diff --git a/block/blk-core.c b/block/blk-core.c index 277134cb5d32..074b758efc42 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -39,7 +39,6 @@ | |||
39 | 39 | ||
40 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); | 40 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); |
41 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); | 41 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); |
42 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); | ||
43 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); | 42 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); |
44 | 43 | ||
45 | DEFINE_IDA(blk_queue_ida); | 44 | DEFINE_IDA(blk_queue_ida); |
@@ -1348,7 +1347,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | |||
1348 | if (!ll_back_merge_fn(q, req, bio)) | 1347 | if (!ll_back_merge_fn(q, req, bio)) |
1349 | return false; | 1348 | return false; |
1350 | 1349 | ||
1351 | trace_block_bio_backmerge(q, bio); | 1350 | trace_block_bio_backmerge(q, req, bio); |
1352 | 1351 | ||
1353 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | 1352 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) |
1354 | blk_rq_set_mixed_merge(req); | 1353 | blk_rq_set_mixed_merge(req); |
@@ -1370,7 +1369,7 @@ static bool bio_attempt_front_merge(struct request_queue *q, | |||
1370 | if (!ll_front_merge_fn(q, req, bio)) | 1369 | if (!ll_front_merge_fn(q, req, bio)) |
1371 | return false; | 1370 | return false; |
1372 | 1371 | ||
1373 | trace_block_bio_frontmerge(q, bio); | 1372 | trace_block_bio_frontmerge(q, req, bio); |
1374 | 1373 | ||
1375 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | 1374 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) |
1376 | blk_rq_set_mixed_merge(req); | 1375 | blk_rq_set_mixed_merge(req); |
@@ -1553,13 +1552,6 @@ get_rq: | |||
1553 | if (list_empty(&plug->list)) | 1552 | if (list_empty(&plug->list)) |
1554 | trace_block_plug(q); | 1553 | trace_block_plug(q); |
1555 | else { | 1554 | else { |
1556 | if (!plug->should_sort) { | ||
1557 | struct request *__rq; | ||
1558 | |||
1559 | __rq = list_entry_rq(plug->list.prev); | ||
1560 | if (__rq->q != q) | ||
1561 | plug->should_sort = 1; | ||
1562 | } | ||
1563 | if (request_count >= BLK_MAX_REQUEST_COUNT) { | 1555 | if (request_count >= BLK_MAX_REQUEST_COUNT) { |
1564 | blk_flush_plug_list(plug, false); | 1556 | blk_flush_plug_list(plug, false); |
1565 | trace_block_plug(q); | 1557 | trace_block_plug(q); |
@@ -2890,7 +2882,6 @@ void blk_start_plug(struct blk_plug *plug) | |||
2890 | plug->magic = PLUG_MAGIC; | 2882 | plug->magic = PLUG_MAGIC; |
2891 | INIT_LIST_HEAD(&plug->list); | 2883 | INIT_LIST_HEAD(&plug->list); |
2892 | INIT_LIST_HEAD(&plug->cb_list); | 2884 | INIT_LIST_HEAD(&plug->cb_list); |
2893 | plug->should_sort = 0; | ||
2894 | 2885 | ||
2895 | /* | 2886 | /* |
2896 | * If this is a nested plug, don't actually assign it. It will be | 2887 | * If this is a nested plug, don't actually assign it. It will be |
@@ -2992,10 +2983,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | |||
2992 | 2983 | ||
2993 | list_splice_init(&plug->list, &list); | 2984 | list_splice_init(&plug->list, &list); |
2994 | 2985 | ||
2995 | if (plug->should_sort) { | 2986 | list_sort(NULL, &list, plug_rq_cmp); |
2996 | list_sort(NULL, &list, plug_rq_cmp); | ||
2997 | plug->should_sort = 0; | ||
2998 | } | ||
2999 | 2987 | ||
3000 | q = NULL; | 2988 | q = NULL; |
3001 | depth = 0; | 2989 | depth = 0; |
diff --git a/block/blk-exec.c b/block/blk-exec.c index c88202f973d9..e70621396129 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c | |||
@@ -121,9 +121,9 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | |||
121 | /* Prevent hang_check timer from firing at us during very long I/O */ | 121 | /* Prevent hang_check timer from firing at us during very long I/O */ |
122 | hang_check = sysctl_hung_task_timeout_secs; | 122 | hang_check = sysctl_hung_task_timeout_secs; |
123 | if (hang_check) | 123 | if (hang_check) |
124 | while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); | 124 | while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); |
125 | else | 125 | else |
126 | wait_for_completion(&wait); | 126 | wait_for_completion_io(&wait); |
127 | 127 | ||
128 | if (rq->errors) | 128 | if (rq->errors) |
129 | err = -EIO; | 129 | err = -EIO; |
diff --git a/block/blk-flush.c b/block/blk-flush.c index 720ad607ff91..db8f1b507857 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c | |||
@@ -436,7 +436,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | |||
436 | 436 | ||
437 | bio_get(bio); | 437 | bio_get(bio); |
438 | submit_bio(WRITE_FLUSH, bio); | 438 | submit_bio(WRITE_FLUSH, bio); |
439 | wait_for_completion(&wait); | 439 | wait_for_completion_io(&wait); |
440 | 440 | ||
441 | /* | 441 | /* |
442 | * The driver must store the error location in ->bi_sector, if | 442 | * The driver must store the error location in ->bi_sector, if |
diff --git a/block/blk-lib.c b/block/blk-lib.c index b3a1f2b70b31..d6f50d572565 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -126,7 +126,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
126 | 126 | ||
127 | /* Wait for bios in-flight */ | 127 | /* Wait for bios in-flight */ |
128 | if (!atomic_dec_and_test(&bb.done)) | 128 | if (!atomic_dec_and_test(&bb.done)) |
129 | wait_for_completion(&wait); | 129 | wait_for_completion_io(&wait); |
130 | 130 | ||
131 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | 131 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
132 | ret = -EIO; | 132 | ret = -EIO; |
@@ -200,7 +200,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, | |||
200 | 200 | ||
201 | /* Wait for bios in-flight */ | 201 | /* Wait for bios in-flight */ |
202 | if (!atomic_dec_and_test(&bb.done)) | 202 | if (!atomic_dec_and_test(&bb.done)) |
203 | wait_for_completion(&wait); | 203 | wait_for_completion_io(&wait); |
204 | 204 | ||
205 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | 205 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
206 | ret = -ENOTSUPP; | 206 | ret = -ENOTSUPP; |
@@ -262,7 +262,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | |||
262 | 262 | ||
263 | /* Wait for bios in-flight */ | 263 | /* Wait for bios in-flight */ |
264 | if (!atomic_dec_and_test(&bb.done)) | 264 | if (!atomic_dec_and_test(&bb.done)) |
265 | wait_for_completion(&wait); | 265 | wait_for_completion_io(&wait); |
266 | 266 | ||
267 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | 267 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
268 | /* One of bios in the batch was completed with error.*/ | 268 | /* One of bios in the batch was completed with error.*/ |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 788147797a79..6206a934eb8c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -497,6 +497,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, | |||
497 | return res; | 497 | return res; |
498 | } | 498 | } |
499 | 499 | ||
500 | static void blk_free_queue_rcu(struct rcu_head *rcu_head) | ||
501 | { | ||
502 | struct request_queue *q = container_of(rcu_head, struct request_queue, | ||
503 | rcu_head); | ||
504 | kmem_cache_free(blk_requestq_cachep, q); | ||
505 | } | ||
506 | |||
500 | /** | 507 | /** |
501 | * blk_release_queue: - release a &struct request_queue when it is no longer needed | 508 | * blk_release_queue: - release a &struct request_queue when it is no longer needed |
502 | * @kobj: the kobj belonging to the request queue to be released | 509 | * @kobj: the kobj belonging to the request queue to be released |
@@ -538,7 +545,7 @@ static void blk_release_queue(struct kobject *kobj) | |||
538 | bdi_destroy(&q->backing_dev_info); | 545 | bdi_destroy(&q->backing_dev_info); |
539 | 546 | ||
540 | ida_simple_remove(&blk_queue_ida, q->id); | 547 | ida_simple_remove(&blk_queue_ida, q->id); |
541 | kmem_cache_free(blk_requestq_cachep, q); | 548 | call_rcu(&q->rcu_head, blk_free_queue_rcu); |
542 | } | 549 | } |
543 | 550 | ||
544 | static const struct sysfs_ops queue_sysfs_ops = { | 551 | static const struct sysfs_ops queue_sysfs_ops = { |
diff --git a/block/blk.h b/block/blk.h index 47fdfdd41520..e837b8f619b7 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -61,7 +61,7 @@ static inline void blk_clear_rq_complete(struct request *rq) | |||
61 | /* | 61 | /* |
62 | * Internal elevator interface | 62 | * Internal elevator interface |
63 | */ | 63 | */ |
64 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) | 64 | #define ELV_ON_HASH(rq) hash_hashed(&(rq)->hash) |
65 | 65 | ||
66 | void blk_insert_flush(struct request *rq); | 66 | void blk_insert_flush(struct request *rq); |
67 | void blk_abort_flushes(struct request_queue *q); | 67 | void blk_abort_flushes(struct request_queue *q); |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ec52807cdd09..4f0ade74cfd0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -85,7 +85,6 @@ struct cfq_rb_root { | |||
85 | struct rb_root rb; | 85 | struct rb_root rb; |
86 | struct rb_node *left; | 86 | struct rb_node *left; |
87 | unsigned count; | 87 | unsigned count; |
88 | unsigned total_weight; | ||
89 | u64 min_vdisktime; | 88 | u64 min_vdisktime; |
90 | struct cfq_ttime ttime; | 89 | struct cfq_ttime ttime; |
91 | }; | 90 | }; |
@@ -155,7 +154,7 @@ struct cfq_queue { | |||
155 | * First index in the service_trees. | 154 | * First index in the service_trees. |
156 | * IDLE is handled separately, so it has negative index | 155 | * IDLE is handled separately, so it has negative index |
157 | */ | 156 | */ |
158 | enum wl_prio_t { | 157 | enum wl_class_t { |
159 | BE_WORKLOAD = 0, | 158 | BE_WORKLOAD = 0, |
160 | RT_WORKLOAD = 1, | 159 | RT_WORKLOAD = 1, |
161 | IDLE_WORKLOAD = 2, | 160 | IDLE_WORKLOAD = 2, |
@@ -223,10 +222,45 @@ struct cfq_group { | |||
223 | 222 | ||
224 | /* group service_tree key */ | 223 | /* group service_tree key */ |
225 | u64 vdisktime; | 224 | u64 vdisktime; |
225 | |||
226 | /* | ||
227 | * The number of active cfqgs and sum of their weights under this | ||
228 | * cfqg. This covers this cfqg's leaf_weight and all children's | ||
229 | * weights, but does not cover weights of further descendants. | ||
230 | * | ||
231 | * If a cfqg is on the service tree, it's active. An active cfqg | ||
232 | * also activates its parent and contributes to the children_weight | ||
233 | * of the parent. | ||
234 | */ | ||
235 | int nr_active; | ||
236 | unsigned int children_weight; | ||
237 | |||
238 | /* | ||
239 | * vfraction is the fraction of vdisktime that the tasks in this | ||
240 | * cfqg are entitled to. This is determined by compounding the | ||
241 | * ratios walking up from this cfqg to the root. | ||
242 | * | ||
243 | * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all | ||
244 | * vfractions on a service tree is approximately 1. The sum may | ||
245 | * deviate a bit due to rounding errors and fluctuations caused by | ||
246 | * cfqgs entering and leaving the service tree. | ||
247 | */ | ||
248 | unsigned int vfraction; | ||
249 | |||
250 | /* | ||
251 | * There are two weights - (internal) weight is the weight of this | ||
252 | * cfqg against the sibling cfqgs. leaf_weight is the weight of | ||
253 | * this cfqg against the child cfqgs. For the root cfqg, both | ||
254 | * weights are kept in sync for backward compatibility. | ||
255 | */ | ||
226 | unsigned int weight; | 256 | unsigned int weight; |
227 | unsigned int new_weight; | 257 | unsigned int new_weight; |
228 | unsigned int dev_weight; | 258 | unsigned int dev_weight; |
229 | 259 | ||
260 | unsigned int leaf_weight; | ||
261 | unsigned int new_leaf_weight; | ||
262 | unsigned int dev_leaf_weight; | ||
263 | |||
230 | /* number of cfqq currently on this group */ | 264 | /* number of cfqq currently on this group */ |
231 | int nr_cfqq; | 265 | int nr_cfqq; |
232 | 266 | ||
@@ -248,14 +282,15 @@ struct cfq_group { | |||
248 | struct cfq_rb_root service_trees[2][3]; | 282 | struct cfq_rb_root service_trees[2][3]; |
249 | struct cfq_rb_root service_tree_idle; | 283 | struct cfq_rb_root service_tree_idle; |
250 | 284 | ||
251 | unsigned long saved_workload_slice; | 285 | unsigned long saved_wl_slice; |
252 | enum wl_type_t saved_workload; | 286 | enum wl_type_t saved_wl_type; |
253 | enum wl_prio_t saved_serving_prio; | 287 | enum wl_class_t saved_wl_class; |
254 | 288 | ||
255 | /* number of requests that are on the dispatch list or inside driver */ | 289 | /* number of requests that are on the dispatch list or inside driver */ |
256 | int dispatched; | 290 | int dispatched; |
257 | struct cfq_ttime ttime; | 291 | struct cfq_ttime ttime; |
258 | struct cfqg_stats stats; | 292 | struct cfqg_stats stats; /* stats for this cfqg */ |
293 | struct cfqg_stats dead_stats; /* stats pushed from dead children */ | ||
259 | }; | 294 | }; |
260 | 295 | ||
261 | struct cfq_io_cq { | 296 | struct cfq_io_cq { |
@@ -280,8 +315,8 @@ struct cfq_data { | |||
280 | /* | 315 | /* |
281 | * The priority currently being served | 316 | * The priority currently being served |
282 | */ | 317 | */ |
283 | enum wl_prio_t serving_prio; | 318 | enum wl_class_t serving_wl_class; |
284 | enum wl_type_t serving_type; | 319 | enum wl_type_t serving_wl_type; |
285 | unsigned long workload_expires; | 320 | unsigned long workload_expires; |
286 | struct cfq_group *serving_group; | 321 | struct cfq_group *serving_group; |
287 | 322 | ||
@@ -353,17 +388,17 @@ struct cfq_data { | |||
353 | 388 | ||
354 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | 389 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); |
355 | 390 | ||
356 | static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, | 391 | static struct cfq_rb_root *st_for(struct cfq_group *cfqg, |
357 | enum wl_prio_t prio, | 392 | enum wl_class_t class, |
358 | enum wl_type_t type) | 393 | enum wl_type_t type) |
359 | { | 394 | { |
360 | if (!cfqg) | 395 | if (!cfqg) |
361 | return NULL; | 396 | return NULL; |
362 | 397 | ||
363 | if (prio == IDLE_WORKLOAD) | 398 | if (class == IDLE_WORKLOAD) |
364 | return &cfqg->service_tree_idle; | 399 | return &cfqg->service_tree_idle; |
365 | 400 | ||
366 | return &cfqg->service_trees[prio][type]; | 401 | return &cfqg->service_trees[class][type]; |
367 | } | 402 | } |
368 | 403 | ||
369 | enum cfqq_state_flags { | 404 | enum cfqq_state_flags { |
@@ -502,7 +537,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) | |||
502 | { | 537 | { |
503 | struct cfqg_stats *stats = &cfqg->stats; | 538 | struct cfqg_stats *stats = &cfqg->stats; |
504 | 539 | ||
505 | if (blkg_rwstat_sum(&stats->queued)) | 540 | if (blkg_rwstat_total(&stats->queued)) |
506 | return; | 541 | return; |
507 | 542 | ||
508 | /* | 543 | /* |
@@ -546,7 +581,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) | |||
546 | struct cfqg_stats *stats = &cfqg->stats; | 581 | struct cfqg_stats *stats = &cfqg->stats; |
547 | 582 | ||
548 | blkg_stat_add(&stats->avg_queue_size_sum, | 583 | blkg_stat_add(&stats->avg_queue_size_sum, |
549 | blkg_rwstat_sum(&stats->queued)); | 584 | blkg_rwstat_total(&stats->queued)); |
550 | blkg_stat_add(&stats->avg_queue_size_samples, 1); | 585 | blkg_stat_add(&stats->avg_queue_size_samples, 1); |
551 | cfqg_stats_update_group_wait_time(stats); | 586 | cfqg_stats_update_group_wait_time(stats); |
552 | } | 587 | } |
@@ -572,6 +607,13 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) | |||
572 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); | 607 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); |
573 | } | 608 | } |
574 | 609 | ||
610 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) | ||
611 | { | ||
612 | struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; | ||
613 | |||
614 | return pblkg ? blkg_to_cfqg(pblkg) : NULL; | ||
615 | } | ||
616 | |||
575 | static inline void cfqg_get(struct cfq_group *cfqg) | 617 | static inline void cfqg_get(struct cfq_group *cfqg) |
576 | { | 618 | { |
577 | return blkg_get(cfqg_to_blkg(cfqg)); | 619 | return blkg_get(cfqg_to_blkg(cfqg)); |
@@ -586,8 +628,9 @@ static inline void cfqg_put(struct cfq_group *cfqg) | |||
586 | char __pbuf[128]; \ | 628 | char __pbuf[128]; \ |
587 | \ | 629 | \ |
588 | blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ | 630 | blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ |
589 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ | 631 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ |
590 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | 632 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ |
633 | cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ | ||
591 | __pbuf, ##args); \ | 634 | __pbuf, ##args); \ |
592 | } while (0) | 635 | } while (0) |
593 | 636 | ||
@@ -646,11 +689,9 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | |||
646 | io_start_time - start_time); | 689 | io_start_time - start_time); |
647 | } | 690 | } |
648 | 691 | ||
649 | static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | 692 | /* @stats = 0 */ |
693 | static void cfqg_stats_reset(struct cfqg_stats *stats) | ||
650 | { | 694 | { |
651 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | ||
652 | struct cfqg_stats *stats = &cfqg->stats; | ||
653 | |||
654 | /* queued stats shouldn't be cleared */ | 695 | /* queued stats shouldn't be cleared */ |
655 | blkg_rwstat_reset(&stats->service_bytes); | 696 | blkg_rwstat_reset(&stats->service_bytes); |
656 | blkg_rwstat_reset(&stats->serviced); | 697 | blkg_rwstat_reset(&stats->serviced); |
@@ -669,13 +710,58 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | |||
669 | #endif | 710 | #endif |
670 | } | 711 | } |
671 | 712 | ||
713 | /* @to += @from */ | ||
714 | static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) | ||
715 | { | ||
716 | /* queued stats aren't merged; they track requests still in flight */ | ||
717 | blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); | ||
718 | blkg_rwstat_merge(&to->serviced, &from->serviced); | ||
719 | blkg_rwstat_merge(&to->merged, &from->merged); | ||
720 | blkg_rwstat_merge(&to->service_time, &from->service_time); | ||
721 | blkg_rwstat_merge(&to->wait_time, &from->wait_time); | ||
722 | blkg_stat_merge(&to->time, &from->time); | ||
723 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
724 | blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); | ||
725 | blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); | ||
726 | blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); | ||
727 | blkg_stat_merge(&to->dequeue, &from->dequeue); | ||
728 | blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); | ||
729 | blkg_stat_merge(&to->idle_time, &from->idle_time); | ||
730 | blkg_stat_merge(&to->empty_time, &from->empty_time); | ||
731 | #endif | ||
732 | } | ||
733 | |||
734 | /* | ||
735 | * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' | ||
736 | * recursive stats can still account for the amount used by this cfqg after | ||
737 | * it's gone. | ||
738 | */ | ||
739 | static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) | ||
740 | { | ||
741 | struct cfq_group *parent = cfqg_parent(cfqg); | ||
742 | |||
743 | lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); | ||
744 | |||
745 | if (unlikely(!parent)) | ||
746 | return; | ||
747 | |||
748 | cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); | ||
749 | cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); | ||
750 | cfqg_stats_reset(&cfqg->stats); | ||
751 | cfqg_stats_reset(&cfqg->dead_stats); | ||
752 | } | ||
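The transfer keeps recursive totals monotonic across cgroup removal: both the dying group's live counters and whatever it had already inherited fold into the parent's dead_stats. For a single counter the bookkeeping reduces to the sketch below (standalone, names invented):

	struct counters { long serviced; };

	/* mirrors cfqg_stats_xfer_dead() for one field: the parent's
	 * dead pile absorbs the child's live and dead piles */
	static void xfer_dead_demo(struct counters *parent_dead,
				   struct counters *live, struct counters *dead)
	{
		parent_dead->serviced += live->serviced + dead->serviced;
		live->serviced = 0;
		dead->serviced = 0;
	}

The recursive sum a parent reports is unchanged by the operation; the child simply stops being iterated.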
753 | |||
672 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | 754 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ |
673 | 755 | ||
756 | static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } | ||
674 | static inline void cfqg_get(struct cfq_group *cfqg) { } | 757 | static inline void cfqg_get(struct cfq_group *cfqg) { } |
675 | static inline void cfqg_put(struct cfq_group *cfqg) { } | 758 | static inline void cfqg_put(struct cfq_group *cfqg) { } |
676 | 759 | ||
677 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 760 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
678 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) | 761 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ |
762 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | ||
763 | cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ | ||
764 | ##args) | ||
679 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) | 765 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) |
680 | 766 | ||
681 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | 767 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, |
@@ -732,7 +818,7 @@ static inline bool iops_mode(struct cfq_data *cfqd) | |||
732 | return false; | 818 | return false; |
733 | } | 819 | } |
734 | 820 | ||
735 | static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) | 821 | static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) |
736 | { | 822 | { |
737 | if (cfq_class_idle(cfqq)) | 823 | if (cfq_class_idle(cfqq)) |
738 | return IDLE_WORKLOAD; | 824 | return IDLE_WORKLOAD; |
@@ -751,23 +837,23 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) | |||
751 | return SYNC_WORKLOAD; | 837 | return SYNC_WORKLOAD; |
752 | } | 838 | } |
753 | 839 | ||
754 | static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, | 840 | static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, |
755 | struct cfq_data *cfqd, | 841 | struct cfq_data *cfqd, |
756 | struct cfq_group *cfqg) | 842 | struct cfq_group *cfqg) |
757 | { | 843 | { |
758 | if (wl == IDLE_WORKLOAD) | 844 | if (wl_class == IDLE_WORKLOAD) |
759 | return cfqg->service_tree_idle.count; | 845 | return cfqg->service_tree_idle.count; |
760 | 846 | ||
761 | return cfqg->service_trees[wl][ASYNC_WORKLOAD].count | 847 | return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + |
762 | + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count | 848 | cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + |
763 | + cfqg->service_trees[wl][SYNC_WORKLOAD].count; | 849 | cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; |
764 | } | 850 | } |
765 | 851 | ||
766 | static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, | 852 | static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, |
767 | struct cfq_group *cfqg) | 853 | struct cfq_group *cfqg) |
768 | { | 854 | { |
769 | return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count | 855 | return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + |
770 | + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; | 856 | cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; |
771 | } | 857 | } |
772 | 858 | ||
773 | static void cfq_dispatch_insert(struct request_queue *, struct request *); | 859 | static void cfq_dispatch_insert(struct request_queue *, struct request *); |
@@ -847,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
847 | return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); | 933 | return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); |
848 | } | 934 | } |
849 | 935 | ||
850 | static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) | 936 | /** |
937 | * cfqg_scale_charge - scale disk time charge according to cfqg weight | ||
938 | * @charge: disk time being charged | ||
939 | * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT | ||
940 | * | ||
941 | * Scale @charge according to @vfraction, which is in range (0, 1]. The | ||
942 | * scaling is inversely proportional. | ||
943 | * | ||
944 | * scaled = charge / vfraction | ||
945 | * | ||
946 | * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. | ||
947 | */ | ||
948 | static inline u64 cfqg_scale_charge(unsigned long charge, | ||
949 | unsigned int vfraction) | ||
851 | { | 950 | { |
852 | u64 d = delta << CFQ_SERVICE_SHIFT; | 951 | u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ |
853 | 952 | ||
854 | d = d * CFQ_WEIGHT_DEFAULT; | 953 | /* charge / vfraction */ |
855 | do_div(d, cfqg->weight); | 954 | c <<= CFQ_SERVICE_SHIFT; |
856 | return d; | 955 | do_div(c, vfraction); |
956 | return c; | ||
857 | } | 957 | } |
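A userspace spot-check of the fixed-point arithmetic; CFQ_SERVICE_SHIFT is 12 in this file, and the charge and vfraction values below are invented:

	#include <stdio.h>
	#include <stdint.h>

	#define CFQ_SERVICE_SHIFT 12

	/* mirror of cfqg_scale_charge(); plain division stands in for do_div() */
	static uint64_t scale_charge(unsigned long charge, unsigned int vfraction)
	{
		uint64_t c = (uint64_t)charge << CFQ_SERVICE_SHIFT;

		c <<= CFQ_SERVICE_SHIFT;	/* keep the quotient in fixed point */
		return c / vfraction;
	}

	int main(void)
	{
		unsigned int quarter = 1 << (CFQ_SERVICE_SHIFT - 2); /* vfraction 0.25 */

		/* a group owning 1/4 of the device is charged 4x vdisktime: 400 */
		printf("%llu\n", (unsigned long long)
		       (scale_charge(100, quarter) >> CFQ_SERVICE_SHIFT));
		return 0;
	}
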
858 | 958 | ||
859 | static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) | 959 | static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) |
@@ -909,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, | |||
909 | static inline unsigned | 1009 | static inline unsigned |
910 | cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) | 1010 | cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) |
911 | { | 1011 | { |
912 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 1012 | return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; |
913 | |||
914 | return cfqd->cfq_target_latency * cfqg->weight / st->total_weight; | ||
915 | } | 1013 | } |
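cfq_group_slice() now scales the latency budget by vfraction directly, which stays correct in a hierarchy, instead of the old weight/total_weight ratio that only made sense for a flat tree. With invented numbers (300-jiffy target latency, vfraction 1/3, shift of 12):

	/* 300 * (4096 / 3) >> 12 == 99, roughly a third of the budget */
	enum { demo_group_slice = 300 * (4096 / 3) >> 12 };
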
916 | 1014 | ||
917 | static inline unsigned | 1015 | static inline unsigned |
@@ -1178,20 +1276,61 @@ static void | |||
1178 | cfq_update_group_weight(struct cfq_group *cfqg) | 1276 | cfq_update_group_weight(struct cfq_group *cfqg) |
1179 | { | 1277 | { |
1180 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | 1278 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); |
1279 | |||
1181 | if (cfqg->new_weight) { | 1280 | if (cfqg->new_weight) { |
1182 | cfqg->weight = cfqg->new_weight; | 1281 | cfqg->weight = cfqg->new_weight; |
1183 | cfqg->new_weight = 0; | 1282 | cfqg->new_weight = 0; |
1184 | } | 1283 | } |
1284 | |||
1285 | if (cfqg->new_leaf_weight) { | ||
1286 | cfqg->leaf_weight = cfqg->new_leaf_weight; | ||
1287 | cfqg->new_leaf_weight = 0; | ||
1288 | } | ||
1185 | } | 1289 | } |
1186 | 1290 | ||
1187 | static void | 1291 | static void |
1188 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | 1292 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) |
1189 | { | 1293 | { |
1294 | unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ | ||
1295 | struct cfq_group *pos = cfqg; | ||
1296 | struct cfq_group *parent; | ||
1297 | bool propagate; | ||
1298 | |||
1299 | /* add to the service tree */ | ||
1190 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | 1300 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); |
1191 | 1301 | ||
1192 | cfq_update_group_weight(cfqg); | 1302 | cfq_update_group_weight(cfqg); |
1193 | __cfq_group_service_tree_add(st, cfqg); | 1303 | __cfq_group_service_tree_add(st, cfqg); |
1194 | st->total_weight += cfqg->weight; | 1304 | |
1305 | /* | ||
1306 | * Activate @cfqg and calculate the portion of vfraction @cfqg is | ||
1307 | * entitled to. vfraction is calculated by walking the tree | ||
1308 | * towards the root calculating the fraction it has at each level. | ||
1309 | * The compounded ratio is how much vfraction @cfqg owns. | ||
1310 | * | ||
1311 | * Start with the proportion the tasks in this cfqg have against active | ||
1312 | * children cfqgs - its leaf_weight against children_weight. | ||
1313 | */ | ||
1314 | propagate = !pos->nr_active++; | ||
1315 | pos->children_weight += pos->leaf_weight; | ||
1316 | vfr = vfr * pos->leaf_weight / pos->children_weight; | ||
1317 | |||
1318 | /* | ||
1319 | * Compound ->weight walking up the tree. Both activation and | ||
1320 | * vfraction calculation are done in the same loop. Propagation | ||
1321 | * stops once an already activated node is met. vfraction | ||
1322 | * calculation should always continue to the root. | ||
1323 | */ | ||
1324 | while ((parent = cfqg_parent(pos))) { | ||
1325 | if (propagate) { | ||
1326 | propagate = !parent->nr_active++; | ||
1327 | parent->children_weight += pos->weight; | ||
1328 | } | ||
1329 | vfr = vfr * pos->weight / parent->children_weight; | ||
1330 | pos = parent; | ||
1331 | } | ||
1332 | |||
1333 | cfqg->vfraction = max_t(unsigned, vfr, 1); | ||
1195 | } | 1334 | } |
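Since the loop multiplies one ratio per level, a group's vfraction is the product of its share at every ancestor. A standalone rendering with invented weights, for a group whose tasks own half of its own level and whose subtree owns a third of its parent's:

	#include <stdio.h>

	#define SHIFT 12

	int main(void)
	{
		unsigned vfr = 1 << SHIFT;

		vfr = vfr * 500 / 1000;	/* leaf_weight vs own children_weight */
		vfr = vfr * 500 / 1500;	/* weight vs parent's children_weight */
		printf("vfraction = %u/%u\n", vfr, 1 << SHIFT); /* 682/4096 ~ 1/6 */
		return 0;
	}
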
1196 | 1335 | ||
1197 | static void | 1336 | static void |
@@ -1222,7 +1361,32 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
1222 | static void | 1361 | static void |
1223 | cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) | 1362 | cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) |
1224 | { | 1363 | { |
1225 | st->total_weight -= cfqg->weight; | 1364 | struct cfq_group *pos = cfqg; |
1365 | bool propagate; | ||
1366 | |||
1367 | /* | ||
1368 | * Undo activation from cfq_group_service_tree_add(). Deactivate | ||
1369 | * @cfqg and propagate deactivation upwards. | ||
1370 | */ | ||
1371 | propagate = !--pos->nr_active; | ||
1372 | pos->children_weight -= pos->leaf_weight; | ||
1373 | |||
1374 | while (propagate) { | ||
1375 | struct cfq_group *parent = cfqg_parent(pos); | ||
1376 | |||
1377 | /* @pos has 0 nr_active at this point */ | ||
1378 | WARN_ON_ONCE(pos->children_weight); | ||
1379 | pos->vfraction = 0; | ||
1380 | |||
1381 | if (!parent) | ||
1382 | break; | ||
1383 | |||
1384 | propagate = !--parent->nr_active; | ||
1385 | parent->children_weight -= pos->weight; | ||
1386 | pos = parent; | ||
1387 | } | ||
1388 | |||
1389 | /* remove from the service tree */ | ||
1226 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | 1390 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) |
1227 | cfq_rb_erase(&cfqg->rb_node, st); | 1391 | cfq_rb_erase(&cfqg->rb_node, st); |
1228 | } | 1392 | } |
@@ -1241,7 +1405,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
1241 | 1405 | ||
1242 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | 1406 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); |
1243 | cfq_group_service_tree_del(st, cfqg); | 1407 | cfq_group_service_tree_del(st, cfqg); |
1244 | cfqg->saved_workload_slice = 0; | 1408 | cfqg->saved_wl_slice = 0; |
1245 | cfqg_stats_update_dequeue(cfqg); | 1409 | cfqg_stats_update_dequeue(cfqg); |
1246 | } | 1410 | } |
1247 | 1411 | ||
@@ -1284,6 +1448,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
1284 | unsigned int used_sl, charge, unaccounted_sl = 0; | 1448 | unsigned int used_sl, charge, unaccounted_sl = 0; |
1285 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) | 1449 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) |
1286 | - cfqg->service_tree_idle.count; | 1450 | - cfqg->service_tree_idle.count; |
1451 | unsigned int vfr; | ||
1287 | 1452 | ||
1288 | BUG_ON(nr_sync < 0); | 1453 | BUG_ON(nr_sync < 0); |
1289 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); | 1454 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); |
@@ -1293,20 +1458,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
1293 | else if (!cfq_cfqq_sync(cfqq) && !nr_sync) | 1458 | else if (!cfq_cfqq_sync(cfqq) && !nr_sync) |
1294 | charge = cfqq->allocated_slice; | 1459 | charge = cfqq->allocated_slice; |
1295 | 1460 | ||
1296 | /* Can't update vdisktime while group is on service tree */ | 1461 | /* |
1462 | * Can't update vdisktime while on service tree and cfqg->vfraction | ||
1463 | * is valid only while on it. Cache vfr, leave the service tree, | ||
1464 | * update vdisktime and go back on. The re-addition to the tree | ||
1465 | * will also update the weights as necessary. | ||
1466 | */ | ||
1467 | vfr = cfqg->vfraction; | ||
1297 | cfq_group_service_tree_del(st, cfqg); | 1468 | cfq_group_service_tree_del(st, cfqg); |
1298 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); | 1469 | cfqg->vdisktime += cfqg_scale_charge(charge, vfr); |
1299 | /* If a new weight was requested, update now, off tree */ | ||
1300 | cfq_group_service_tree_add(st, cfqg); | 1470 | cfq_group_service_tree_add(st, cfqg); |
1301 | 1471 | ||
1302 | /* This group is being expired. Save the context */ | 1472 | /* This group is being expired. Save the context */ |
1303 | if (time_after(cfqd->workload_expires, jiffies)) { | 1473 | if (time_after(cfqd->workload_expires, jiffies)) { |
1304 | cfqg->saved_workload_slice = cfqd->workload_expires | 1474 | cfqg->saved_wl_slice = cfqd->workload_expires |
1305 | - jiffies; | 1475 | - jiffies; |
1306 | cfqg->saved_workload = cfqd->serving_type; | 1476 | cfqg->saved_wl_type = cfqd->serving_wl_type; |
1307 | cfqg->saved_serving_prio = cfqd->serving_prio; | 1477 | cfqg->saved_wl_class = cfqd->serving_wl_class; |
1308 | } else | 1478 | } else |
1309 | cfqg->saved_workload_slice = 0; | 1479 | cfqg->saved_wl_slice = 0; |
1310 | 1480 | ||
1311 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, | 1481 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, |
1312 | st->min_vdisktime); | 1482 | st->min_vdisktime); |
@@ -1344,6 +1514,52 @@ static void cfq_pd_init(struct blkcg_gq *blkg) | |||
1344 | 1514 | ||
1345 | cfq_init_cfqg_base(cfqg); | 1515 | cfq_init_cfqg_base(cfqg); |
1346 | cfqg->weight = blkg->blkcg->cfq_weight; | 1516 | cfqg->weight = blkg->blkcg->cfq_weight; |
1517 | cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; | ||
1518 | } | ||
1519 | |||
1520 | static void cfq_pd_offline(struct blkcg_gq *blkg) | ||
1521 | { | ||
1522 | /* | ||
1523 | * @blkg is going offline and will be ignored by | ||
1524 | * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so | ||
1525 | * that they don't get lost. If IOs complete after this point, the | ||
1526 | * stats for them will be lost. Oh well... | ||
1527 | */ | ||
1528 | cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); | ||
1529 | } | ||
1530 | |||
1531 | /* offset delta from cfqg->stats to cfqg->dead_stats */ | ||
1532 | static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - | ||
1533 | offsetof(struct cfq_group, stats); | ||
1534 | |||
1535 | /* to be used by recursive prfill, sums live and dead stats recursively */ | ||
1536 | static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) | ||
1537 | { | ||
1538 | u64 sum = 0; | ||
1539 | |||
1540 | sum += blkg_stat_recursive_sum(pd, off); | ||
1541 | sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); | ||
1542 | return sum; | ||
1543 | } | ||
1544 | |||
1545 | /* to be used by recursive prfill, sums live and dead rwstats recursively */ | ||
1546 | static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, | ||
1547 | int off) | ||
1548 | { | ||
1549 | struct blkg_rwstat a, b; | ||
1550 | |||
1551 | a = blkg_rwstat_recursive_sum(pd, off); | ||
1552 | b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); | ||
1553 | blkg_rwstat_merge(&a, &b); | ||
1554 | return a; | ||
1555 | } | ||
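Both helpers lean on stats and dead_stats being identically laid out inside struct cfq_group, so one offset delta converts a live-stat offset into the matching dead-stat offset. The pattern in standalone form (types invented):

	#include <stdio.h>
	#include <stddef.h>

	struct demo_stats { long time; };
	struct demo_group { struct demo_stats stats, dead_stats; };

	static const int delta = offsetof(struct demo_group, dead_stats) -
				 offsetof(struct demo_group, stats);

	static long read_at(struct demo_group *g, int off)
	{
		return *(long *)((char *)g + off);
	}

	int main(void)
	{
		struct demo_group g = { { 10 }, { 32 } };
		int off = offsetof(struct demo_group, stats.time);

		/* live + dead, as cfqg_stat_pd_recursive_sum() does: 42 */
		printf("%ld\n", read_at(&g, off) + read_at(&g, off + delta));
		return 0;
	}
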
1556 | |||
1557 | static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | ||
1558 | { | ||
1559 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | ||
1560 | |||
1561 | cfqg_stats_reset(&cfqg->stats); | ||
1562 | cfqg_stats_reset(&cfqg->dead_stats); | ||
1347 | } | 1563 | } |
1348 | 1564 | ||
1349 | /* | 1565 | /* |
@@ -1400,6 +1616,26 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, | |||
1400 | return 0; | 1616 | return 0; |
1401 | } | 1617 | } |
1402 | 1618 | ||
1619 | static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, | ||
1620 | struct blkg_policy_data *pd, int off) | ||
1621 | { | ||
1622 | struct cfq_group *cfqg = pd_to_cfqg(pd); | ||
1623 | |||
1624 | if (!cfqg->dev_leaf_weight) | ||
1625 | return 0; | ||
1626 | return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); | ||
1627 | } | ||
1628 | |||
1629 | static int cfqg_print_leaf_weight_device(struct cgroup *cgrp, | ||
1630 | struct cftype *cft, | ||
1631 | struct seq_file *sf) | ||
1632 | { | ||
1633 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), | ||
1634 | cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, | ||
1635 | false); | ||
1636 | return 0; | ||
1637 | } | ||
1638 | |||
1403 | static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, | 1639 | static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, |
1404 | struct seq_file *sf) | 1640 | struct seq_file *sf) |
1405 | { | 1641 | { |
@@ -1407,8 +1643,16 @@ static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, | |||
1407 | return 0; | 1643 | return 0; |
1408 | } | 1644 | } |
1409 | 1645 | ||
1410 | static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, | 1646 | static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, |
1411 | const char *buf) | 1647 | struct seq_file *sf) |
1648 | { | ||
1649 | seq_printf(sf, "%u\n", | ||
1650 | cgroup_to_blkcg(cgrp)->cfq_leaf_weight); | ||
1651 | return 0; | ||
1652 | } | ||
1653 | |||
1654 | static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, | ||
1655 | const char *buf, bool is_leaf_weight) | ||
1412 | { | 1656 | { |
1413 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1657 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1414 | struct blkg_conf_ctx ctx; | 1658 | struct blkg_conf_ctx ctx; |
@@ -1422,8 +1666,13 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, | |||
1422 | ret = -EINVAL; | 1666 | ret = -EINVAL; |
1423 | cfqg = blkg_to_cfqg(ctx.blkg); | 1667 | cfqg = blkg_to_cfqg(ctx.blkg); |
1424 | if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { | 1668 | if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { |
1425 | cfqg->dev_weight = ctx.v; | 1669 | if (!is_leaf_weight) { |
1426 | cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; | 1670 | cfqg->dev_weight = ctx.v; |
1671 | cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; | ||
1672 | } else { | ||
1673 | cfqg->dev_leaf_weight = ctx.v; | ||
1674 | cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; | ||
1675 | } | ||
1427 | ret = 0; | 1676 | ret = 0; |
1428 | } | 1677 | } |
1429 | 1678 | ||
@@ -1431,7 +1680,20 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, | |||
1431 | return ret; | 1680 | return ret; |
1432 | } | 1681 | } |
1433 | 1682 | ||
1434 | static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1683 | static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, |
1684 | const char *buf) | ||
1685 | { | ||
1686 | return __cfqg_set_weight_device(cgrp, cft, buf, false); | ||
1687 | } | ||
1688 | |||
1689 | static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, | ||
1690 | const char *buf) | ||
1691 | { | ||
1692 | return __cfqg_set_weight_device(cgrp, cft, buf, true); | ||
1693 | } | ||
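These handlers back the new per-device knobs; from userspace each one is a write of "major:minor value", with 0 clearing the override back to the group default. A hypothetical configuration write (the cgroup path and device numbers are invented):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/fs/cgroup/blkio/grp/blkio.leaf_weight_device",
				"w");

		if (!f)
			return 1;
		fprintf(f, "8:16 500\n");	/* device 8:16 -> leaf weight 500 */
		return fclose(f) ? 1 : 0;
	}
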
1694 | |||
1695 | static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, | ||
1696 | bool is_leaf_weight) | ||
1435 | { | 1697 | { |
1436 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1698 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1437 | struct blkcg_gq *blkg; | 1699 | struct blkcg_gq *blkg; |
@@ -1440,19 +1702,41 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) | |||
1440 | return -EINVAL; | 1702 | return -EINVAL; |
1441 | 1703 | ||
1442 | spin_lock_irq(&blkcg->lock); | 1704 | spin_lock_irq(&blkcg->lock); |
1443 | blkcg->cfq_weight = (unsigned int)val; | 1705 | |
1706 | if (!is_leaf_weight) | ||
1707 | blkcg->cfq_weight = val; | ||
1708 | else | ||
1709 | blkcg->cfq_leaf_weight = val; | ||
1444 | 1710 | ||
1445 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { | 1711 | hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { |
1446 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | 1712 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); |
1447 | 1713 | ||
1448 | if (cfqg && !cfqg->dev_weight) | 1714 | if (!cfqg) |
1449 | cfqg->new_weight = blkcg->cfq_weight; | 1715 | continue; |
1716 | |||
1717 | if (!is_leaf_weight) { | ||
1718 | if (!cfqg->dev_weight) | ||
1719 | cfqg->new_weight = blkcg->cfq_weight; | ||
1720 | } else { | ||
1721 | if (!cfqg->dev_leaf_weight) | ||
1722 | cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; | ||
1723 | } | ||
1450 | } | 1724 | } |
1451 | 1725 | ||
1452 | spin_unlock_irq(&blkcg->lock); | 1726 | spin_unlock_irq(&blkcg->lock); |
1453 | return 0; | 1727 | return 0; |
1454 | } | 1728 | } |
1455 | 1729 | ||
1730 | static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
1731 | { | ||
1732 | return __cfq_set_weight(cgrp, cft, val, false); | ||
1733 | } | ||
1734 | |||
1735 | static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
1736 | { | ||
1737 | return __cfq_set_weight(cgrp, cft, val, true); | ||
1738 | } | ||
1739 | |||
1456 | static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, | 1740 | static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, |
1457 | struct seq_file *sf) | 1741 | struct seq_file *sf) |
1458 | { | 1742 | { |
@@ -1473,6 +1757,42 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, | |||
1473 | return 0; | 1757 | return 0; |
1474 | } | 1758 | } |
1475 | 1759 | ||
1760 | static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, | ||
1761 | struct blkg_policy_data *pd, int off) | ||
1762 | { | ||
1763 | u64 sum = cfqg_stat_pd_recursive_sum(pd, off); | ||
1764 | |||
1765 | return __blkg_prfill_u64(sf, pd, sum); | ||
1766 | } | ||
1767 | |||
1768 | static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, | ||
1769 | struct blkg_policy_data *pd, int off) | ||
1770 | { | ||
1771 | struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); | ||
1772 | |||
1773 | return __blkg_prfill_rwstat(sf, pd, &sum); | ||
1774 | } | ||
1775 | |||
1776 | static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, | ||
1777 | struct seq_file *sf) | ||
1778 | { | ||
1779 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | ||
1780 | |||
1781 | blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, | ||
1782 | &blkcg_policy_cfq, cft->private, false); | ||
1783 | return 0; | ||
1784 | } | ||
1785 | |||
1786 | static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, | ||
1787 | struct seq_file *sf) | ||
1788 | { | ||
1789 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | ||
1790 | |||
1791 | blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, | ||
1792 | &blkcg_policy_cfq, cft->private, true); | ||
1793 | return 0; | ||
1794 | } | ||
1795 | |||
1476 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1796 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1477 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, | 1797 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, |
1478 | struct blkg_policy_data *pd, int off) | 1798 | struct blkg_policy_data *pd, int off) |
@@ -1502,17 +1822,49 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, | |||
1502 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | 1822 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ |
1503 | 1823 | ||
1504 | static struct cftype cfq_blkcg_files[] = { | 1824 | static struct cftype cfq_blkcg_files[] = { |
1825 | /* on root, weight is mapped to leaf_weight */ | ||
1826 | { | ||
1827 | .name = "weight_device", | ||
1828 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
1829 | .read_seq_string = cfqg_print_leaf_weight_device, | ||
1830 | .write_string = cfqg_set_leaf_weight_device, | ||
1831 | .max_write_len = 256, | ||
1832 | }, | ||
1833 | { | ||
1834 | .name = "weight", | ||
1835 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
1836 | .read_seq_string = cfq_print_leaf_weight, | ||
1837 | .write_u64 = cfq_set_leaf_weight, | ||
1838 | }, | ||
1839 | |||
1840 | /* no such mapping necessary for !roots */ | ||
1505 | { | 1841 | { |
1506 | .name = "weight_device", | 1842 | .name = "weight_device", |
1843 | .flags = CFTYPE_NOT_ON_ROOT, | ||
1507 | .read_seq_string = cfqg_print_weight_device, | 1844 | .read_seq_string = cfqg_print_weight_device, |
1508 | .write_string = cfqg_set_weight_device, | 1845 | .write_string = cfqg_set_weight_device, |
1509 | .max_write_len = 256, | 1846 | .max_write_len = 256, |
1510 | }, | 1847 | }, |
1511 | { | 1848 | { |
1512 | .name = "weight", | 1849 | .name = "weight", |
1850 | .flags = CFTYPE_NOT_ON_ROOT, | ||
1513 | .read_seq_string = cfq_print_weight, | 1851 | .read_seq_string = cfq_print_weight, |
1514 | .write_u64 = cfq_set_weight, | 1852 | .write_u64 = cfq_set_weight, |
1515 | }, | 1853 | }, |
1854 | |||
1855 | { | ||
1856 | .name = "leaf_weight_device", | ||
1857 | .read_seq_string = cfqg_print_leaf_weight_device, | ||
1858 | .write_string = cfqg_set_leaf_weight_device, | ||
1859 | .max_write_len = 256, | ||
1860 | }, | ||
1861 | { | ||
1862 | .name = "leaf_weight", | ||
1863 | .read_seq_string = cfq_print_leaf_weight, | ||
1864 | .write_u64 = cfq_set_leaf_weight, | ||
1865 | }, | ||
1866 | |||
1867 | /* statistics covering only the tasks in the cfqg */ | ||
1516 | { | 1868 | { |
1517 | .name = "time", | 1869 | .name = "time", |
1518 | .private = offsetof(struct cfq_group, stats.time), | 1870 | .private = offsetof(struct cfq_group, stats.time), |
@@ -1553,6 +1905,48 @@ static struct cftype cfq_blkcg_files[] = { | |||
1553 | .private = offsetof(struct cfq_group, stats.queued), | 1905 | .private = offsetof(struct cfq_group, stats.queued), |
1554 | .read_seq_string = cfqg_print_rwstat, | 1906 | .read_seq_string = cfqg_print_rwstat, |
1555 | }, | 1907 | }, |
1908 | |||
1909 | /* the same statistics which cover the cfqg and its descendants */ | ||
1910 | { | ||
1911 | .name = "time_recursive", | ||
1912 | .private = offsetof(struct cfq_group, stats.time), | ||
1913 | .read_seq_string = cfqg_print_stat_recursive, | ||
1914 | }, | ||
1915 | { | ||
1916 | .name = "sectors_recursive", | ||
1917 | .private = offsetof(struct cfq_group, stats.sectors), | ||
1918 | .read_seq_string = cfqg_print_stat_recursive, | ||
1919 | }, | ||
1920 | { | ||
1921 | .name = "io_service_bytes_recursive", | ||
1922 | .private = offsetof(struct cfq_group, stats.service_bytes), | ||
1923 | .read_seq_string = cfqg_print_rwstat_recursive, | ||
1924 | }, | ||
1925 | { | ||
1926 | .name = "io_serviced_recursive", | ||
1927 | .private = offsetof(struct cfq_group, stats.serviced), | ||
1928 | .read_seq_string = cfqg_print_rwstat_recursive, | ||
1929 | }, | ||
1930 | { | ||
1931 | .name = "io_service_time_recursive", | ||
1932 | .private = offsetof(struct cfq_group, stats.service_time), | ||
1933 | .read_seq_string = cfqg_print_rwstat_recursive, | ||
1934 | }, | ||
1935 | { | ||
1936 | .name = "io_wait_time_recursive", | ||
1937 | .private = offsetof(struct cfq_group, stats.wait_time), | ||
1938 | .read_seq_string = cfqg_print_rwstat_recursive, | ||
1939 | }, | ||
1940 | { | ||
1941 | .name = "io_merged_recursive", | ||
1942 | .private = offsetof(struct cfq_group, stats.merged), | ||
1943 | .read_seq_string = cfqg_print_rwstat_recursive, | ||
1944 | }, | ||
1945 | { | ||
1946 | .name = "io_queued_recursive", | ||
1947 | .private = offsetof(struct cfq_group, stats.queued), | ||
1948 | .read_seq_string = cfqg_print_rwstat_recursive, | ||
1949 | }, | ||
1556 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1950 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1557 | { | 1951 | { |
1558 | .name = "avg_queue_size", | 1952 | .name = "avg_queue_size", |
@@ -1611,15 +2005,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1611 | struct rb_node **p, *parent; | 2005 | struct rb_node **p, *parent; |
1612 | struct cfq_queue *__cfqq; | 2006 | struct cfq_queue *__cfqq; |
1613 | unsigned long rb_key; | 2007 | unsigned long rb_key; |
1614 | struct cfq_rb_root *service_tree; | 2008 | struct cfq_rb_root *st; |
1615 | int left; | 2009 | int left; |
1616 | int new_cfqq = 1; | 2010 | int new_cfqq = 1; |
1617 | 2011 | ||
1618 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), | 2012 | st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); |
1619 | cfqq_type(cfqq)); | ||
1620 | if (cfq_class_idle(cfqq)) { | 2013 | if (cfq_class_idle(cfqq)) { |
1621 | rb_key = CFQ_IDLE_DELAY; | 2014 | rb_key = CFQ_IDLE_DELAY; |
1622 | parent = rb_last(&service_tree->rb); | 2015 | parent = rb_last(&st->rb); |
1623 | if (parent && parent != &cfqq->rb_node) { | 2016 | if (parent && parent != &cfqq->rb_node) { |
1624 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); | 2017 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); |
1625 | rb_key += __cfqq->rb_key; | 2018 | rb_key += __cfqq->rb_key; |
@@ -1637,7 +2030,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1637 | cfqq->slice_resid = 0; | 2030 | cfqq->slice_resid = 0; |
1638 | } else { | 2031 | } else { |
1639 | rb_key = -HZ; | 2032 | rb_key = -HZ; |
1640 | __cfqq = cfq_rb_first(service_tree); | 2033 | __cfqq = cfq_rb_first(st); |
1641 | rb_key += __cfqq ? __cfqq->rb_key : jiffies; | 2034 | rb_key += __cfqq ? __cfqq->rb_key : jiffies; |
1642 | } | 2035 | } |
1643 | 2036 | ||
@@ -1646,8 +2039,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1646 | /* | 2039 | /* |
1647 | * same position, nothing more to do | 2040 | * same position, nothing more to do |
1648 | */ | 2041 | */ |
1649 | if (rb_key == cfqq->rb_key && | 2042 | if (rb_key == cfqq->rb_key && cfqq->service_tree == st) |
1650 | cfqq->service_tree == service_tree) | ||
1651 | return; | 2043 | return; |
1652 | 2044 | ||
1653 | cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); | 2045 | cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); |
@@ -1656,11 +2048,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1656 | 2048 | ||
1657 | left = 1; | 2049 | left = 1; |
1658 | parent = NULL; | 2050 | parent = NULL; |
1659 | cfqq->service_tree = service_tree; | 2051 | cfqq->service_tree = st; |
1660 | p = &service_tree->rb.rb_node; | 2052 | p = &st->rb.rb_node; |
1661 | while (*p) { | 2053 | while (*p) { |
1662 | struct rb_node **n; | ||
1663 | |||
1664 | parent = *p; | 2054 | parent = *p; |
1665 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); | 2055 | __cfqq = rb_entry(parent, struct cfq_queue, rb_node); |
1666 | 2056 | ||
@@ -1668,22 +2058,20 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1668 | * sort by key, that represents service time. | 2058 | * sort by key, that represents service time. |
1669 | */ | 2059 | */ |
1670 | if (time_before(rb_key, __cfqq->rb_key)) | 2060 | if (time_before(rb_key, __cfqq->rb_key)) |
1671 | n = &(*p)->rb_left; | 2061 | p = &parent->rb_left; |
1672 | else { | 2062 | else { |
1673 | n = &(*p)->rb_right; | 2063 | p = &parent->rb_right; |
1674 | left = 0; | 2064 | left = 0; |
1675 | } | 2065 | } |
1676 | |||
1677 | p = n; | ||
1678 | } | 2066 | } |
1679 | 2067 | ||
1680 | if (left) | 2068 | if (left) |
1681 | service_tree->left = &cfqq->rb_node; | 2069 | st->left = &cfqq->rb_node; |
1682 | 2070 | ||
1683 | cfqq->rb_key = rb_key; | 2071 | cfqq->rb_key = rb_key; |
1684 | rb_link_node(&cfqq->rb_node, parent, p); | 2072 | rb_link_node(&cfqq->rb_node, parent, p); |
1685 | rb_insert_color(&cfqq->rb_node, &service_tree->rb); | 2073 | rb_insert_color(&cfqq->rb_node, &st->rb); |
1686 | service_tree->count++; | 2074 | st->count++; |
1687 | if (add_front || !new_cfqq) | 2075 | if (add_front || !new_cfqq) |
1688 | return; | 2076 | return; |
1689 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); | 2077 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); |
@@ -2029,8 +2417,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, | |||
2029 | struct cfq_queue *cfqq) | 2417 | struct cfq_queue *cfqq) |
2030 | { | 2418 | { |
2031 | if (cfqq) { | 2419 | if (cfqq) { |
2032 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", | 2420 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", |
2033 | cfqd->serving_prio, cfqd->serving_type); | 2421 | cfqd->serving_wl_class, cfqd->serving_wl_type); |
2034 | cfqg_stats_update_avg_queue_size(cfqq->cfqg); | 2422 | cfqg_stats_update_avg_queue_size(cfqq->cfqg); |
2035 | cfqq->slice_start = 0; | 2423 | cfqq->slice_start = 0; |
2036 | cfqq->dispatch_start = jiffies; | 2424 | cfqq->dispatch_start = jiffies; |
@@ -2116,19 +2504,18 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) | |||
2116 | */ | 2504 | */ |
2117 | static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) | 2505 | static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) |
2118 | { | 2506 | { |
2119 | struct cfq_rb_root *service_tree = | 2507 | struct cfq_rb_root *st = st_for(cfqd->serving_group, |
2120 | service_tree_for(cfqd->serving_group, cfqd->serving_prio, | 2508 | cfqd->serving_wl_class, cfqd->serving_wl_type); |
2121 | cfqd->serving_type); | ||
2122 | 2509 | ||
2123 | if (!cfqd->rq_queued) | 2510 | if (!cfqd->rq_queued) |
2124 | return NULL; | 2511 | return NULL; |
2125 | 2512 | ||
2126 | /* There is nothing to dispatch */ | 2513 | /* There is nothing to dispatch */ |
2127 | if (!service_tree) | 2514 | if (!st) |
2128 | return NULL; | 2515 | return NULL; |
2129 | if (RB_EMPTY_ROOT(&service_tree->rb)) | 2516 | if (RB_EMPTY_ROOT(&st->rb)) |
2130 | return NULL; | 2517 | return NULL; |
2131 | return cfq_rb_first(service_tree); | 2518 | return cfq_rb_first(st); |
2132 | } | 2519 | } |
2133 | 2520 | ||
2134 | static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) | 2521 | static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) |
@@ -2284,17 +2671,17 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, | |||
2284 | 2671 | ||
2285 | static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 2672 | static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
2286 | { | 2673 | { |
2287 | enum wl_prio_t prio = cfqq_prio(cfqq); | 2674 | enum wl_class_t wl_class = cfqq_class(cfqq); |
2288 | struct cfq_rb_root *service_tree = cfqq->service_tree; | 2675 | struct cfq_rb_root *st = cfqq->service_tree; |
2289 | 2676 | ||
2290 | BUG_ON(!service_tree); | 2677 | BUG_ON(!st); |
2291 | BUG_ON(!service_tree->count); | 2678 | BUG_ON(!st->count); |
2292 | 2679 | ||
2293 | if (!cfqd->cfq_slice_idle) | 2680 | if (!cfqd->cfq_slice_idle) |
2294 | return false; | 2681 | return false; |
2295 | 2682 | ||
2296 | /* We never do for idle class queues. */ | 2683 | /* We never do for idle class queues. */ |
2297 | if (prio == IDLE_WORKLOAD) | 2684 | if (wl_class == IDLE_WORKLOAD) |
2298 | return false; | 2685 | return false; |
2299 | 2686 | ||
2300 | /* We do for queues that were marked with idle window flag. */ | 2687 | /* We do for queues that were marked with idle window flag. */ |
@@ -2306,11 +2693,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2306 | * Otherwise, we do only if they are the last ones | 2693 | * Otherwise, we do only if they are the last ones |
2307 | * in their service tree. | 2694 | * in their service tree. |
2308 | */ | 2695 | */ |
2309 | if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) && | 2696 | if (st->count == 1 && cfq_cfqq_sync(cfqq) && |
2310 | !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false)) | 2697 | !cfq_io_thinktime_big(cfqd, &st->ttime, false)) |
2311 | return true; | 2698 | return true; |
2312 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", | 2699 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count); |
2313 | service_tree->count); | ||
2314 | return false; | 2700 | return false; |
2315 | } | 2701 | } |
2316 | 2702 | ||
@@ -2493,8 +2879,8 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) | |||
2493 | } | 2879 | } |
2494 | } | 2880 | } |
2495 | 2881 | ||
2496 | static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, | 2882 | static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, |
2497 | struct cfq_group *cfqg, enum wl_prio_t prio) | 2883 | struct cfq_group *cfqg, enum wl_class_t wl_class) |
2498 | { | 2884 | { |
2499 | struct cfq_queue *queue; | 2885 | struct cfq_queue *queue; |
2500 | int i; | 2886 | int i; |
@@ -2504,7 +2890,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, | |||
2504 | 2890 | ||
2505 | for (i = 0; i <= SYNC_WORKLOAD; ++i) { | 2891 | for (i = 0; i <= SYNC_WORKLOAD; ++i) { |
2506 | /* select the one with lowest rb_key */ | 2892 | /* select the one with lowest rb_key */ |
2507 | queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); | 2893 | queue = cfq_rb_first(st_for(cfqg, wl_class, i)); |
2508 | if (queue && | 2894 | if (queue && |
2509 | (!key_valid || time_before(queue->rb_key, lowest_key))) { | 2895 | (!key_valid || time_before(queue->rb_key, lowest_key))) { |
2510 | lowest_key = queue->rb_key; | 2896 | lowest_key = queue->rb_key; |
@@ -2516,26 +2902,27 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, | |||
2516 | return cur_best; | 2902 | return cur_best; |
2517 | } | 2903 | } |
2518 | 2904 | ||
2519 | static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | 2905 | static void |
2906 | choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
2520 | { | 2907 | { |
2521 | unsigned slice; | 2908 | unsigned slice; |
2522 | unsigned count; | 2909 | unsigned count; |
2523 | struct cfq_rb_root *st; | 2910 | struct cfq_rb_root *st; |
2524 | unsigned group_slice; | 2911 | unsigned group_slice; |
2525 | enum wl_prio_t original_prio = cfqd->serving_prio; | 2912 | enum wl_class_t original_class = cfqd->serving_wl_class; |
2526 | 2913 | ||
2527 | /* Choose next priority. RT > BE > IDLE */ | 2914 | /* Choose next priority. RT > BE > IDLE */ |
2528 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) | 2915 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) |
2529 | cfqd->serving_prio = RT_WORKLOAD; | 2916 | cfqd->serving_wl_class = RT_WORKLOAD; |
2530 | else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) | 2917 | else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) |
2531 | cfqd->serving_prio = BE_WORKLOAD; | 2918 | cfqd->serving_wl_class = BE_WORKLOAD; |
2532 | else { | 2919 | else { |
2533 | cfqd->serving_prio = IDLE_WORKLOAD; | 2920 | cfqd->serving_wl_class = IDLE_WORKLOAD; |
2534 | cfqd->workload_expires = jiffies + 1; | 2921 | cfqd->workload_expires = jiffies + 1; |
2535 | return; | 2922 | return; |
2536 | } | 2923 | } |
2537 | 2924 | ||
2538 | if (original_prio != cfqd->serving_prio) | 2925 | if (original_class != cfqd->serving_wl_class) |
2539 | goto new_workload; | 2926 | goto new_workload; |
2540 | 2927 | ||
2541 | /* | 2928 | /* |
@@ -2543,7 +2930,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2543 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload | 2930 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload |
2544 | * expiration time | 2931 | * expiration time |
2545 | */ | 2932 | */ |
2546 | st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); | 2933 | st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); |
2547 | count = st->count; | 2934 | count = st->count; |
2548 | 2935 | ||
2549 | /* | 2936 | /* |
@@ -2554,9 +2941,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2554 | 2941 | ||
2555 | new_workload: | 2942 | new_workload: |
2556 | /* otherwise select new workload type */ | 2943 | /* otherwise select new workload type */ |
2557 | cfqd->serving_type = | 2944 | cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, |
2558 | cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); | 2945 | cfqd->serving_wl_class); |
2559 | st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); | 2946 | st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); |
2560 | count = st->count; | 2947 | count = st->count; |
2561 | 2948 | ||
2562 | /* | 2949 | /* |
@@ -2567,10 +2954,11 @@ new_workload: | |||
2567 | group_slice = cfq_group_slice(cfqd, cfqg); | 2954 | group_slice = cfq_group_slice(cfqd, cfqg); |
2568 | 2955 | ||
2569 | slice = group_slice * count / | 2956 | slice = group_slice * count / |
2570 | max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], | 2957 | max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], |
2571 | cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); | 2958 | cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, |
2959 | cfqg)); | ||
2572 | 2960 | ||
2573 | if (cfqd->serving_type == ASYNC_WORKLOAD) { | 2961 | if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { |
2574 | unsigned int tmp; | 2962 | unsigned int tmp; |
2575 | 2963 | ||
2576 | /* | 2964 | /* |
@@ -2616,14 +3004,14 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd) | |||
2616 | cfqd->serving_group = cfqg; | 3004 | cfqd->serving_group = cfqg; |
2617 | 3005 | ||
2618 | /* Restore the workload type data */ | 3006 | /* Restore the workload type data */ |
2619 | if (cfqg->saved_workload_slice) { | 3007 | if (cfqg->saved_wl_slice) { |
2620 | cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; | 3008 | cfqd->workload_expires = jiffies + cfqg->saved_wl_slice; |
2621 | cfqd->serving_type = cfqg->saved_workload; | 3009 | cfqd->serving_wl_type = cfqg->saved_wl_type; |
2622 | cfqd->serving_prio = cfqg->saved_serving_prio; | 3010 | cfqd->serving_wl_class = cfqg->saved_wl_class; |
2623 | } else | 3011 | } else |
2624 | cfqd->workload_expires = jiffies - 1; | 3012 | cfqd->workload_expires = jiffies - 1; |
2625 | 3013 | ||
2626 | choose_service_tree(cfqd, cfqg); | 3014 | choose_wl_class_and_type(cfqd, cfqg); |
2627 | } | 3015 | } |
2628 | 3016 | ||
2629 | /* | 3017 | /* |
@@ -3205,6 +3593,8 @@ retry: | |||
3205 | spin_lock_irq(cfqd->queue->queue_lock); | 3593 | spin_lock_irq(cfqd->queue->queue_lock); |
3206 | if (new_cfqq) | 3594 | if (new_cfqq) |
3207 | goto retry; | 3595 | goto retry; |
3596 | else | ||
3597 | return &cfqd->oom_cfqq; | ||
3208 | } else { | 3598 | } else { |
3209 | cfqq = kmem_cache_alloc_node(cfq_pool, | 3599 | cfqq = kmem_cache_alloc_node(cfq_pool, |
3210 | gfp_mask | __GFP_ZERO, | 3600 | gfp_mask | __GFP_ZERO, |
@@ -3402,7 +3792,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3402 | return true; | 3792 | return true; |
3403 | 3793 | ||
3404 | /* Allow preemption only if we are idling on sync-noidle tree */ | 3794 | /* Allow preemption only if we are idling on sync-noidle tree */ |
3405 | if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && | 3795 | if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && |
3406 | cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && | 3796 | cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && |
3407 | new_cfqq->service_tree->count == 2 && | 3797 | new_cfqq->service_tree->count == 2 && |
3408 | RB_EMPTY_ROOT(&cfqq->sort_list)) | 3798 | RB_EMPTY_ROOT(&cfqq->sort_list)) |
@@ -3454,7 +3844,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
3454 | * doesn't happen | 3844 | * doesn't happen |
3455 | */ | 3845 | */ |
3456 | if (old_type != cfqq_type(cfqq)) | 3846 | if (old_type != cfqq_type(cfqq)) |
3457 | cfqq->cfqg->saved_workload_slice = 0; | 3847 | cfqq->cfqg->saved_wl_slice = 0; |
3458 | 3848 | ||
3459 | /* | 3849 | /* |
3460 | * Put the new queue at the front of the current list, | 3850 |
@@ -3636,16 +4026,17 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) | |||
3636 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; | 4026 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; |
3637 | 4027 | ||
3638 | if (sync) { | 4028 | if (sync) { |
3639 | struct cfq_rb_root *service_tree; | 4029 | struct cfq_rb_root *st; |
3640 | 4030 | ||
3641 | RQ_CIC(rq)->ttime.last_end_request = now; | 4031 | RQ_CIC(rq)->ttime.last_end_request = now; |
3642 | 4032 | ||
3643 | if (cfq_cfqq_on_rr(cfqq)) | 4033 | if (cfq_cfqq_on_rr(cfqq)) |
3644 | service_tree = cfqq->service_tree; | 4034 | st = cfqq->service_tree; |
3645 | else | 4035 | else |
3646 | service_tree = service_tree_for(cfqq->cfqg, | 4036 | st = st_for(cfqq->cfqg, cfqq_class(cfqq), |
3647 | cfqq_prio(cfqq), cfqq_type(cfqq)); | 4037 | cfqq_type(cfqq)); |
3648 | service_tree->ttime.last_end_request = now; | 4038 | |
4039 | st->ttime.last_end_request = now; | ||
3649 | if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) | 4040 | if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) |
3650 | cfqd->last_delayed_sync = now; | 4041 | cfqd->last_delayed_sync = now; |
3651 | } | 4042 | } |
@@ -3992,6 +4383,7 @@ static int cfq_init_queue(struct request_queue *q) | |||
3992 | cfq_init_cfqg_base(cfqd->root_group); | 4383 | cfq_init_cfqg_base(cfqd->root_group); |
3993 | #endif | 4384 | #endif |
3994 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; | 4385 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; |
4386 | cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; | ||
3995 | 4387 | ||
3996 | /* | 4388 | /* |
3997 | * Not strictly needed (since RB_ROOT just clears the node and we | 4389 | * Not strictly needed (since RB_ROOT just clears the node and we |
@@ -4176,6 +4568,7 @@ static struct blkcg_policy blkcg_policy_cfq = { | |||
4176 | .cftypes = cfq_blkcg_files, | 4568 | .cftypes = cfq_blkcg_files, |
4177 | 4569 | ||
4178 | .pd_init_fn = cfq_pd_init, | 4570 | .pd_init_fn = cfq_pd_init, |
4571 | .pd_offline_fn = cfq_pd_offline, | ||
4179 | .pd_reset_stats_fn = cfq_pd_reset_stats, | 4572 | .pd_reset_stats_fn = cfq_pd_reset_stats, |
4180 | }; | 4573 | }; |
4181 | #endif | 4574 | #endif |
diff --git a/block/elevator.c b/block/elevator.c index d0acb31cc083..a0ffdd943c98 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -46,11 +46,6 @@ static LIST_HEAD(elv_list); | |||
46 | /* | 46 | /* |
47 | * Merge hash stuff. | 47 | * Merge hash stuff. |
48 | */ | 48 | */ |
49 | static const int elv_hash_shift = 6; | ||
50 | #define ELV_HASH_BLOCK(sec) ((sec) >> 3) | ||
51 | #define ELV_HASH_FN(sec) \ | ||
52 | (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) | ||
53 | #define ELV_HASH_ENTRIES (1 << elv_hash_shift) | ||
54 | #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) | 49 | #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) |
55 | 50 | ||
56 | /* | 51 | /* |
@@ -158,7 +153,6 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, | |||
158 | struct elevator_type *e) | 153 | struct elevator_type *e) |
159 | { | 154 | { |
160 | struct elevator_queue *eq; | 155 | struct elevator_queue *eq; |
161 | int i; | ||
162 | 156 | ||
163 | eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); | 157 | eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node); |
164 | if (unlikely(!eq)) | 158 | if (unlikely(!eq)) |
@@ -167,14 +161,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, | |||
167 | eq->type = e; | 161 | eq->type = e; |
168 | kobject_init(&eq->kobj, &elv_ktype); | 162 | kobject_init(&eq->kobj, &elv_ktype); |
169 | mutex_init(&eq->sysfs_lock); | 163 | mutex_init(&eq->sysfs_lock); |
170 | 164 | hash_init(eq->hash); | |
171 | eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, | ||
172 | GFP_KERNEL, q->node); | ||
173 | if (!eq->hash) | ||
174 | goto err; | ||
175 | |||
176 | for (i = 0; i < ELV_HASH_ENTRIES; i++) | ||
177 | INIT_HLIST_HEAD(&eq->hash[i]); | ||
178 | 165 | ||
179 | return eq; | 166 | return eq; |
180 | err: | 167 | err: |
@@ -189,7 +176,6 @@ static void elevator_release(struct kobject *kobj) | |||
189 | 176 | ||
190 | e = container_of(kobj, struct elevator_queue, kobj); | 177 | e = container_of(kobj, struct elevator_queue, kobj); |
191 | elevator_put(e->type); | 178 | elevator_put(e->type); |
192 | kfree(e->hash); | ||
193 | kfree(e); | 179 | kfree(e); |
194 | } | 180 | } |
195 | 181 | ||
@@ -261,7 +247,7 @@ EXPORT_SYMBOL(elevator_exit); | |||
261 | 247 | ||
262 | static inline void __elv_rqhash_del(struct request *rq) | 248 | static inline void __elv_rqhash_del(struct request *rq) |
263 | { | 249 | { |
264 | hlist_del_init(&rq->hash); | 250 | hash_del(&rq->hash); |
265 | } | 251 | } |
266 | 252 | ||
267 | static void elv_rqhash_del(struct request_queue *q, struct request *rq) | 253 | static void elv_rqhash_del(struct request_queue *q, struct request *rq) |
@@ -275,7 +261,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq) | |||
275 | struct elevator_queue *e = q->elevator; | 261 | struct elevator_queue *e = q->elevator; |
276 | 262 | ||
277 | BUG_ON(ELV_ON_HASH(rq)); | 263 | BUG_ON(ELV_ON_HASH(rq)); |
278 | hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); | 264 | hash_add(e->hash, &rq->hash, rq_hash_key(rq)); |
279 | } | 265 | } |
280 | 266 | ||
281 | static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) | 267 | static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) |
@@ -287,11 +273,10 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) | |||
287 | static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) | 273 | static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) |
288 | { | 274 | { |
289 | struct elevator_queue *e = q->elevator; | 275 | struct elevator_queue *e = q->elevator; |
290 | struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; | ||
291 | struct hlist_node *next; | 276 | struct hlist_node *next; |
292 | struct request *rq; | 277 | struct request *rq; |
293 | 278 | ||
294 | hlist_for_each_entry_safe(rq, next, hash_list, hash) { | 279 | hash_for_each_possible_safe(e->hash, rq, next, hash, offset) { |
295 | BUG_ON(!ELV_ON_HASH(rq)); | 280 | BUG_ON(!ELV_ON_HASH(rq)); |
296 | 281 | ||
297 | if (unlikely(!rq_mergeable(rq))) { | 282 | if (unlikely(!rq_mergeable(rq))) { |
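The conversion drops the open-coded bucket array in favor of the generic linux/hashtable.h helpers, which hash the key internally; the real table now lives in struct elevator_queue (declared in elevator.h, also touched by this series). A module-context sketch of the API shape, with invented names:

	#include <linux/hashtable.h>
	#include <linux/types.h>

	struct demo_req {
		sector_t key;
		struct hlist_node hash;
	};

	static DEFINE_HASHTABLE(demo_table, 6);	/* 1 << 6 buckets */

	static void demo_insert_then_remove(struct demo_req *rq, sector_t key)
	{
		struct demo_req *pos;
		struct hlist_node *next;

		hash_add(demo_table, &rq->hash, rq->key);
		hash_for_each_possible_safe(demo_table, pos, next, hash, key)
			if (pos->key == key)
				hash_del(&pos->hash);
	}
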
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 57763c54363a..758f2ac878cf 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c | |||
@@ -1090,10 +1090,13 @@ static const struct block_device_operations floppy_fops = { | |||
1090 | static void swim3_mb_event(struct macio_dev* mdev, int mb_state) | 1090 | static void swim3_mb_event(struct macio_dev* mdev, int mb_state) |
1091 | { | 1091 | { |
1092 | struct floppy_state *fs = macio_get_drvdata(mdev); | 1092 | struct floppy_state *fs = macio_get_drvdata(mdev); |
1093 | struct swim3 __iomem *sw = fs->swim3; | 1093 | struct swim3 __iomem *sw; |
1094 | 1094 | ||
1095 | if (!fs) | 1095 | if (!fs) |
1096 | return; | 1096 | return; |
1097 | |||
1098 | sw = fs->swim3; | ||
1099 | |||
1097 | if (mb_state != MB_FD) | 1100 | if (mb_state != MB_FD) |
1098 | return; | 1101 | return; |
1099 | 1102 | ||
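The swim3 change is a use-before-check fix: fs was dereferenced in its initializer before the !fs test, which oopses on NULL and also lets the compiler legally delete the later check. The safe ordering, in miniature:

	struct demo_state { int *regs; };

	static int demo_event(struct demo_state *fs)
	{
		int *regs;

		if (!fs)		/* check first ... */
			return -1;
		regs = fs->regs;	/* ... dereference only afterwards */
		return regs ? 0 : -1;
	}
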
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e67a4be0080d..bb2cd3ce9b0f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -626,7 +626,6 @@ static void dec_pending(struct dm_io *io, int error) | |||
626 | queue_io(md, bio); | 626 | queue_io(md, bio); |
627 | } else { | 627 | } else { |
628 | /* done with normal IO or empty flush */ | 628 | /* done with normal IO or empty flush */ |
629 | trace_block_bio_complete(md->queue, bio, io_error); | ||
630 | bio_endio(bio, io_error); | 629 | bio_endio(bio, io_error); |
631 | } | 630 | } |
632 | } | 631 | } |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 697f026cb318..5af2d2709081 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -184,8 +184,6 @@ static void return_io(struct bio *return_bi) | |||
184 | return_bi = bi->bi_next; | 184 | return_bi = bi->bi_next; |
185 | bi->bi_next = NULL; | 185 | bi->bi_next = NULL; |
186 | bi->bi_size = 0; | 186 | bi->bi_size = 0; |
187 | trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), | ||
188 | bi, 0); | ||
189 | bio_endio(bi, 0); | 187 | bio_endio(bi, 0); |
190 | bi = return_bi; | 188 | bi = return_bi; |
191 | } | 189 | } |
@@ -3916,8 +3914,6 @@ static void raid5_align_endio(struct bio *bi, int error) | |||
3916 | rdev_dec_pending(rdev, conf->mddev); | 3914 | rdev_dec_pending(rdev, conf->mddev); |
3917 | 3915 | ||
3918 | if (!error && uptodate) { | 3916 | if (!error && uptodate) { |
3919 | trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), | ||
3920 | raid_bi, 0); | ||
3921 | bio_endio(raid_bi, 0); | 3917 | bio_endio(raid_bi, 0); |
3922 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 3918 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
3923 | wake_up(&conf->wait_for_stripe); | 3919 | wake_up(&conf->wait_for_stripe); |
@@ -4376,8 +4372,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4376 | if ( rw == WRITE ) | 4372 | if ( rw == WRITE ) |
4377 | md_write_end(mddev); | 4373 | md_write_end(mddev); |
4378 | 4374 | ||
4379 | trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), | ||
4380 | bi, 0); | ||
4381 | bio_endio(bi, 0); | 4375 | bio_endio(bi, 0); |
4382 | } | 4376 | } |
4383 | } | 4377 | } |
@@ -4754,11 +4748,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
4754 | handled++; | 4748 | handled++; |
4755 | } | 4749 | } |
4756 | remaining = raid5_dec_bi_active_stripes(raid_bio); | 4750 | remaining = raid5_dec_bi_active_stripes(raid_bio); |
4757 | if (remaining == 0) { | 4751 | if (remaining == 0) |
4758 | trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), | ||
4759 | raid_bio, 0); | ||
4760 | bio_endio(raid_bio, 0); | 4752 | bio_endio(raid_bio, 0); |
4761 | } | ||
4762 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 4753 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
4763 | wake_up(&conf->wait_for_stripe); | 4754 | wake_up(&conf->wait_for_stripe); |
4764 | return handled; | 4755 | return handled; |
diff --git a/fs/bio.c b/fs/bio.c --- a/fs/bio.c +++ b/fs/bio.c | |||
@@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error) | |||
1428 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | 1428 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1429 | error = -EIO; | 1429 | error = -EIO; |
1430 | 1430 | ||
1431 | trace_block_bio_complete(bio, error); | ||
1432 | |||
1431 | if (bio->bi_end_io) | 1433 | if (bio->bi_end_io) |
1432 | bio->bi_end_io(bio, error); | 1434 | bio->bi_end_io(bio, error); |
1433 | } | 1435 | } |
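With trace_block_bio_complete() emitted from bio_endio() itself, the per-driver trace calls deleted from dm and raid5 above become redundant, and stacked devices no longer need hand-rolled completion tracing at each layer. A hedged sketch of what a stacking driver's completion callback reduces to; my_clone_endio is a hypothetical name:

#include <linux/bio.h>

/* Hypothetical endio for a cloned bio in a stacking driver. */
static void my_clone_endio(struct bio *clone, int error)
{
        struct bio *orig = clone->bi_private;

        bio_put(clone);
        /*
         * bio_endio() fires block_bio_complete generically now, so no
         * driver-side trace_block_bio_complete() call is needed here.
         */
        bio_endio(orig, error);
}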
diff --git a/fs/block_dev.c b/fs/block_dev.c index 53f5fae5cfbe..aea605c98ba6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -1033,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) | |||
1033 | { | 1033 | { |
1034 | unsigned bsize = bdev_logical_block_size(bdev); | 1034 | unsigned bsize = bdev_logical_block_size(bdev); |
1035 | 1035 | ||
1036 | bdev->bd_inode->i_size = size; | 1036 | mutex_lock(&bdev->bd_inode->i_mutex); |
1037 | i_size_write(bdev->bd_inode, size); | ||
1038 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
1037 | while (bsize < PAGE_CACHE_SIZE) { | 1039 | while (bsize < PAGE_CACHE_SIZE) { |
1038 | if (size & bsize) | 1040 | if (size & bsize) |
1039 | break; | 1041 | break; |
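i_size_write() is not a plain store: on 32-bit SMP it bumps a seqcount so that i_size_read() can detect and retry a torn 64-bit update, and it requires the caller to serialize writers. The hunk above adds that serialization to bd_set_size() with i_mutex. A hedged sketch of the general pattern (my_set_inode_size is a hypothetical helper):

#include <linux/fs.h>

/* Hypothetical helper: update an inode's size with proper serialization. */
static void my_set_inode_size(struct inode *inode, loff_t size)
{
        mutex_lock(&inode->i_mutex);    /* writer-side exclusion for the seqcount */
        i_size_write(inode, size);
        mutex_unlock(&inode->i_mutex);
        /* readers call i_size_read(), which retries if it raced with us */
}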
@@ -1118,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) | |||
1118 | } | 1120 | } |
1119 | } | 1121 | } |
1120 | 1122 | ||
1121 | if (!ret && !bdev->bd_openers) { | 1123 | if (!ret) { |
1122 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); | 1124 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); |
1123 | bdi = blk_get_backing_dev_info(bdev); | 1125 | bdi = blk_get_backing_dev_info(bdev); |
1124 | if (bdi == NULL) | 1126 | if (bdi == NULL) |
diff --git a/fs/buffer.c b/fs/buffer.c index 8e18281b4077..b4dcb34c9635 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
42 | #include <linux/mpage.h> | 42 | #include <linux/mpage.h> |
43 | #include <linux/bit_spinlock.h> | 43 | #include <linux/bit_spinlock.h> |
44 | #include <trace/events/block.h> | ||
44 | 45 | ||
45 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); | 46 | static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); |
46 | 47 | ||
@@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) | |||
53 | } | 54 | } |
54 | EXPORT_SYMBOL(init_buffer); | 55 | EXPORT_SYMBOL(init_buffer); |
55 | 56 | ||
57 | inline void touch_buffer(struct buffer_head *bh) | ||
58 | { | ||
59 | trace_block_touch_buffer(bh); | ||
60 | mark_page_accessed(bh->b_page); | ||
61 | } | ||
62 | EXPORT_SYMBOL(touch_buffer); | ||
63 | |||
56 | static int sleep_on_buffer(void *word) | 64 | static int sleep_on_buffer(void *word) |
57 | { | 65 | { |
58 | io_schedule(); | 66 | io_schedule(); |
@@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh) | |||
1113 | { | 1121 | { |
1114 | WARN_ON_ONCE(!buffer_uptodate(bh)); | 1122 | WARN_ON_ONCE(!buffer_uptodate(bh)); |
1115 | 1123 | ||
1124 | trace_block_dirty_buffer(bh); | ||
1125 | |||
1116 | /* | 1126 | /* |
1117 | * Very *carefully* optimize the it-is-already-dirty case. | 1127 | * Very *carefully* optimize the it-is-already-dirty case. |
1118 | * | 1128 | * |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 310972b72a66..359494ea1bde 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) | |||
318 | 318 | ||
319 | static int write_inode(struct inode *inode, struct writeback_control *wbc) | 319 | static int write_inode(struct inode *inode, struct writeback_control *wbc) |
320 | { | 320 | { |
321 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) | 321 | int ret; |
322 | return inode->i_sb->s_op->write_inode(inode, wbc); | 322 | |
323 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { | ||
324 | trace_writeback_write_inode_start(inode, wbc); | ||
325 | ret = inode->i_sb->s_op->write_inode(inode, wbc); | ||
326 | trace_writeback_write_inode(inode, wbc); | ||
327 | return ret; | ||
328 | } | ||
323 | return 0; | 329 | return 0; |
324 | } | 330 | } |
325 | 331 | ||
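The write_inode hunk above brackets the filesystem's ->write_inode call with a start/end tracepoint pair, so tracers see both edges and can derive how long the method ran per inode. A hedged sketch of the same bracketing pattern with stand-in hooks (trace_my_op_start/trace_my_op_end are hypothetical, not real events):

#include <linux/fs.h>
#include <linux/writeback.h>

/* Stand-ins for TRACE_EVENT-generated hooks; hypothetical names. */
static inline void trace_my_op_start(struct inode *inode,
                                     struct writeback_control *wbc) { }
static inline void trace_my_op_end(struct inode *inode,
                                   struct writeback_control *wbc) { }

static int my_call_op(struct inode *inode, struct writeback_control *wbc)
{
        int ret;

        trace_my_op_start(inode, wbc);  /* edge 1: about to call in */
        ret = inode->i_sb->s_op->write_inode(inode, wbc);
        trace_my_op_end(inode, wbc);    /* edge 2: call returned */
        return ret;
}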
@@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
450 | 456 | ||
451 | WARN_ON(!(inode->i_state & I_SYNC)); | 457 | WARN_ON(!(inode->i_state & I_SYNC)); |
452 | 458 | ||
459 | trace_writeback_single_inode_start(inode, wbc, nr_to_write); | ||
460 | |||
453 | ret = do_writepages(mapping, wbc); | 461 | ret = do_writepages(mapping, wbc); |
454 | 462 | ||
455 | /* | 463 | /* |
@@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) | |||
1150 | * dirty the inode itself | 1158 | * dirty the inode itself |
1151 | */ | 1159 | */ |
1152 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 1160 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
1161 | trace_writeback_dirty_inode_start(inode, flags); | ||
1162 | |||
1153 | if (sb->s_op->dirty_inode) | 1163 | if (sb->s_op->dirty_inode) |
1154 | sb->s_op->dirty_inode(inode, flags); | 1164 | sb->s_op->dirty_inode(inode, flags); |
1165 | |||
1166 | trace_writeback_dirty_inode(inode, flags); | ||
1155 | } | 1167 | } |
1156 | 1168 | ||
1157 | /* | 1169 | /* |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f94bc83011ed..78feda9bbae2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/gfp.h> | 19 | #include <linux/gfp.h> |
20 | #include <linux/bsg.h> | 20 | #include <linux/bsg.h> |
21 | #include <linux/smp.h> | 21 | #include <linux/smp.h> |
22 | #include <linux/rcupdate.h> | ||
22 | 23 | ||
23 | #include <asm/scatterlist.h> | 24 | #include <asm/scatterlist.h> |
24 | 25 | ||
@@ -437,6 +438,7 @@ struct request_queue { | |||
437 | /* Throttle data */ | 438 | /* Throttle data */ |
438 | struct throtl_data *td; | 439 | struct throtl_data *td; |
439 | #endif | 440 | #endif |
441 | struct rcu_head rcu_head; | ||
440 | }; | 442 | }; |
441 | 443 | ||
442 | #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ | 444 | #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ |
@@ -974,7 +976,6 @@ struct blk_plug { | |||
974 | unsigned long magic; /* detect uninitialized use-cases */ | 976 | unsigned long magic; /* detect uninitialized use-cases */ |
975 | struct list_head list; /* requests */ | 977 | struct list_head list; /* requests */ |
976 | struct list_head cb_list; /* md requires an unplug callback */ | 978 | struct list_head cb_list; /* md requires an unplug callback */ |
977 | unsigned int should_sort; /* list to be sorted before flushing? */ | ||
978 | }; | 979 | }; |
979 | #define BLK_MAX_REQUEST_COUNT 16 | 980 | #define BLK_MAX_REQUEST_COUNT 16 |
980 | 981 | ||
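Adding an rcu_head to struct request_queue is the standard hook for deferring the final free until an RCU grace period has passed, so code that dereferences a queue under rcu_read_lock() cannot see it freed mid-use. A hedged sketch of the idiom; my_free_queue_rcu and my_release_queue are hypothetical names, and the sketch assumes the queue slab cache (blk_requestq_cachep, declared in block/blk.h) is visible:

#include <linux/blkdev.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical RCU callback: runs after a grace period has elapsed. */
static void my_free_queue_rcu(struct rcu_head *rcu_head)
{
        struct request_queue *q =
                container_of(rcu_head, struct request_queue, rcu_head);

        kmem_cache_free(blk_requestq_cachep, q);  /* assumes block/blk.h */
}

/* At release time, defer the free instead of calling it directly. */
static void my_release_queue(struct request_queue *q)
{
        call_rcu(&q->rcu_head, my_free_queue_rcu);
}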
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 7c2e030e72f1..0ea61e07a91c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h | |||
@@ -12,6 +12,7 @@ | |||
12 | 12 | ||
13 | struct blk_trace { | 13 | struct blk_trace { |
14 | int trace_state; | 14 | int trace_state; |
15 | bool rq_based; | ||
15 | struct rchan *rchan; | 16 | struct rchan *rchan; |
16 | unsigned long __percpu *sequence; | 17 | unsigned long __percpu *sequence; |
17 | unsigned char __percpu *msg_data; | 18 | unsigned char __percpu *msg_data; |
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 458f497738a4..5afc4f94d110 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h | |||
@@ -126,7 +126,6 @@ BUFFER_FNS(Write_EIO, write_io_error) | |||
126 | BUFFER_FNS(Unwritten, unwritten) | 126 | BUFFER_FNS(Unwritten, unwritten) |
127 | 127 | ||
128 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) | 128 | #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) |
129 | #define touch_buffer(bh) mark_page_accessed(bh->b_page) | ||
130 | 129 | ||
131 | /* If we *know* page->private refers to buffer_heads */ | 130 | /* If we *know* page->private refers to buffer_heads */ |
132 | #define page_buffers(page) \ | 131 | #define page_buffers(page) \ |
@@ -142,6 +141,7 @@ BUFFER_FNS(Unwritten, unwritten) | |||
142 | 141 | ||
143 | void mark_buffer_dirty(struct buffer_head *bh); | 142 | void mark_buffer_dirty(struct buffer_head *bh); |
144 | void init_buffer(struct buffer_head *, bh_end_io_t *, void *); | 143 | void init_buffer(struct buffer_head *, bh_end_io_t *, void *); |
144 | void touch_buffer(struct buffer_head *bh); | ||
145 | void set_bh_page(struct buffer_head *bh, | 145 | void set_bh_page(struct buffer_head *bh, |
146 | struct page *page, unsigned long offset); | 146 | struct page *page, unsigned long offset); |
147 | int try_to_free_buffers(struct page *); | 147 | int try_to_free_buffers(struct page *); |
diff --git a/include/linux/completion.h b/include/linux/completion.h index 51494e6b5548..33f0280fd533 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h | |||
@@ -77,10 +77,13 @@ static inline void init_completion(struct completion *x) | |||
77 | } | 77 | } |
78 | 78 | ||
79 | extern void wait_for_completion(struct completion *); | 79 | extern void wait_for_completion(struct completion *); |
80 | extern void wait_for_completion_io(struct completion *); | ||
80 | extern int wait_for_completion_interruptible(struct completion *x); | 81 | extern int wait_for_completion_interruptible(struct completion *x); |
81 | extern int wait_for_completion_killable(struct completion *x); | 82 | extern int wait_for_completion_killable(struct completion *x); |
82 | extern unsigned long wait_for_completion_timeout(struct completion *x, | 83 | extern unsigned long wait_for_completion_timeout(struct completion *x, |
83 | unsigned long timeout); | 84 | unsigned long timeout); |
85 | extern unsigned long wait_for_completion_io_timeout(struct completion *x, | ||
86 | unsigned long timeout); | ||
84 | extern long wait_for_completion_interruptible_timeout( | 87 | extern long wait_for_completion_interruptible_timeout( |
85 | struct completion *x, unsigned long timeout); | 88 | struct completion *x, unsigned long timeout); |
86 | extern long wait_for_completion_killable_timeout( | 89 | extern long wait_for_completion_killable_timeout( |
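The new _io variants behave like the plain ones but sleep via io_schedule_timeout(), so the waiting task is accounted as blocked on I/O (iowait) rather than idle. A hedged usage sketch; my_submit_and_wait is a hypothetical helper and assumes some I/O path eventually calls complete(done):

#include <linux/completion.h>
#include <linux/errno.h>

/* Hypothetical helper: wait for an I/O event with iowait accounting. */
static int my_submit_and_wait(struct completion *done, unsigned long tmo)
{
        /* ... submit the I/O that will call complete(done) ... */

        if (!wait_for_completion_io_timeout(done, tmo))
                return -ETIMEDOUT;      /* a zero return means the timeout expired */

        return 0;               /* completed; the wait was charged as iowait */
}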
diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 186620631750..acd0312d46fb 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _LINUX_ELEVATOR_H | 2 | #define _LINUX_ELEVATOR_H |
3 | 3 | ||
4 | #include <linux/percpu.h> | 4 | #include <linux/percpu.h> |
5 | #include <linux/hashtable.h> | ||
5 | 6 | ||
6 | #ifdef CONFIG_BLOCK | 7 | #ifdef CONFIG_BLOCK |
7 | 8 | ||
@@ -96,6 +97,8 @@ struct elevator_type | |||
96 | struct list_head list; | 97 | struct list_head list; |
97 | }; | 98 | }; |
98 | 99 | ||
100 | #define ELV_HASH_BITS 6 | ||
101 | |||
99 | /* | 102 | /* |
100 | * each queue has an elevator_queue associated with it | 103 | * each queue has an elevator_queue associated with it |
101 | */ | 104 | */ |
@@ -105,8 +108,8 @@ struct elevator_queue | |||
105 | void *elevator_data; | 108 | void *elevator_data; |
106 | struct kobject kobj; | 109 | struct kobject kobj; |
107 | struct mutex sysfs_lock; | 110 | struct mutex sysfs_lock; |
108 | struct hlist_head *hash; | ||
109 | unsigned int registered:1; | 111 | unsigned int registered:1; |
112 | DECLARE_HASHTABLE(hash, ELV_HASH_BITS); | ||
110 | }; | 113 | }; |
111 | 114 | ||
112 | /* | 115 | /* |
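DECLARE_HASHTABLE(hash, ELV_HASH_BITS) embeds a fixed array of 2^6 = 64 hlist_head buckets directly in elevator_queue, replacing the separately allocated hash and open-coded bucket arithmetic with the linux/hashtable.h helpers. A hedged sketch of that API with a made-up struct my_item (iterator signature as in current kernels):

#include <linux/hashtable.h>

struct my_table {
        DECLARE_HASHTABLE(hash, 6);     /* 64 buckets, embedded in the struct */
};

struct my_item {
        unsigned long key;
        struct hlist_node node;
};

static struct my_item *my_lookup(struct my_table *t, unsigned long key)
{
        struct my_item *cur;

        /* walk only the bucket that 'key' hashes to */
        hash_for_each_possible(t->hash, cur, node, key)
                if (cur->key == key)
                        return cur;
        return NULL;
}

static void my_example(struct my_table *t, struct my_item *item)
{
        hash_init(t->hash);                        /* zero every bucket */
        hash_add(t->hash, &item->node, item->key); /* insert, keyed by item->key */
        hash_del(&item->node);                     /* unlink again */
}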
diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 05c5e61f0a7c..9961726523d0 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h | |||
@@ -6,10 +6,61 @@ | |||
6 | 6 | ||
7 | #include <linux/blktrace_api.h> | 7 | #include <linux/blktrace_api.h> |
8 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
9 | #include <linux/buffer_head.h> | ||
9 | #include <linux/tracepoint.h> | 10 | #include <linux/tracepoint.h> |
10 | 11 | ||
11 | #define RWBS_LEN 8 | 12 | #define RWBS_LEN 8 |
12 | 13 | ||
14 | DECLARE_EVENT_CLASS(block_buffer, | ||
15 | |||
16 | TP_PROTO(struct buffer_head *bh), | ||
17 | |||
18 | TP_ARGS(bh), | ||
19 | |||
20 | TP_STRUCT__entry ( | ||
21 | __field( dev_t, dev ) | ||
22 | __field( sector_t, sector ) | ||
23 | __field( size_t, size ) | ||
24 | ), | ||
25 | |||
26 | TP_fast_assign( | ||
27 | __entry->dev = bh->b_bdev->bd_dev; | ||
28 | __entry->sector = bh->b_blocknr; | ||
29 | __entry->size = bh->b_size; | ||
30 | ), | ||
31 | |||
32 | TP_printk("%d,%d sector=%llu size=%zu", | ||
33 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
34 | (unsigned long long)__entry->sector, __entry->size | ||
35 | ) | ||
36 | ); | ||
37 | |||
38 | /** | ||
39 | * block_touch_buffer - mark a buffer accessed | ||
40 | * @bh: buffer_head being touched | ||
41 | * | ||
42 | * Called from touch_buffer(). | ||
43 | */ | ||
44 | DEFINE_EVENT(block_buffer, block_touch_buffer, | ||
45 | |||
46 | TP_PROTO(struct buffer_head *bh), | ||
47 | |||
48 | TP_ARGS(bh) | ||
49 | ); | ||
50 | |||
51 | /** | ||
52 | * block_dirty_buffer - mark a buffer dirty | ||
53 | * @bh: buffer_head being dirtied | ||
54 | * | ||
55 | * Called from mark_buffer_dirty(). | ||
56 | */ | ||
57 | DEFINE_EVENT(block_buffer, block_dirty_buffer, | ||
58 | |||
59 | TP_PROTO(struct buffer_head *bh), | ||
60 | |||
61 | TP_ARGS(bh) | ||
62 | ); | ||
63 | |||
13 | DECLARE_EVENT_CLASS(block_rq_with_error, | 64 | DECLARE_EVENT_CLASS(block_rq_with_error, |
14 | 65 | ||
15 | TP_PROTO(struct request_queue *q, struct request *rq), | 66 | TP_PROTO(struct request_queue *q, struct request *rq), |
@@ -206,7 +257,6 @@ TRACE_EVENT(block_bio_bounce, | |||
206 | 257 | ||
207 | /** | 258 | /** |
208 | * block_bio_complete - completed all work on the block operation | 259 | * block_bio_complete - completed all work on the block operation |
209 | * @q: queue holding the block operation | ||
210 | * @bio: block operation completed | 260 | * @bio: block operation completed |
211 | * @error: io error value | 261 | * @error: io error value |
212 | * | 262 | * |
@@ -215,9 +265,9 @@ TRACE_EVENT(block_bio_bounce, | |||
215 | */ | 265 | */ |
216 | TRACE_EVENT(block_bio_complete, | 266 | TRACE_EVENT(block_bio_complete, |
217 | 267 | ||
218 | TP_PROTO(struct request_queue *q, struct bio *bio, int error), | 268 | TP_PROTO(struct bio *bio, int error), |
219 | 269 | ||
220 | TP_ARGS(q, bio, error), | 270 | TP_ARGS(bio, error), |
221 | 271 | ||
222 | TP_STRUCT__entry( | 272 | TP_STRUCT__entry( |
223 | __field( dev_t, dev ) | 273 | __field( dev_t, dev ) |
@@ -228,7 +278,8 @@ TRACE_EVENT(block_bio_complete, | |||
228 | ), | 278 | ), |
229 | 279 | ||
230 | TP_fast_assign( | 280 | TP_fast_assign( |
231 | __entry->dev = bio->bi_bdev->bd_dev; | 281 | __entry->dev = bio->bi_bdev ? |
282 | bio->bi_bdev->bd_dev : 0; | ||
232 | __entry->sector = bio->bi_sector; | 283 | __entry->sector = bio->bi_sector; |
233 | __entry->nr_sector = bio->bi_size >> 9; | 284 | __entry->nr_sector = bio->bi_size >> 9; |
234 | __entry->error = error; | 285 | __entry->error = error; |
@@ -241,11 +292,11 @@ TRACE_EVENT(block_bio_complete, | |||
241 | __entry->nr_sector, __entry->error) | 292 | __entry->nr_sector, __entry->error) |
242 | ); | 293 | ); |
243 | 294 | ||
244 | DECLARE_EVENT_CLASS(block_bio, | 295 | DECLARE_EVENT_CLASS(block_bio_merge, |
245 | 296 | ||
246 | TP_PROTO(struct request_queue *q, struct bio *bio), | 297 | TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), |
247 | 298 | ||
248 | TP_ARGS(q, bio), | 299 | TP_ARGS(q, rq, bio), |
249 | 300 | ||
250 | TP_STRUCT__entry( | 301 | TP_STRUCT__entry( |
251 | __field( dev_t, dev ) | 302 | __field( dev_t, dev ) |
@@ -272,31 +323,33 @@ DECLARE_EVENT_CLASS(block_bio, | |||
272 | /** | 323 | /** |
273 | * block_bio_backmerge - merging block operation to the end of an existing operation | 324 | * block_bio_backmerge - merging block operation to the end of an existing operation |
274 | * @q: queue holding operation | 325 | * @q: queue holding operation |
326 | * @rq: request bio is being merged into | ||
275 | * @bio: new block operation to merge | 327 | * @bio: new block operation to merge |
276 | * | 328 | * |
277 | * Merging block request @bio to the end of an existing block request | 329 | * Merging block request @bio to the end of an existing block request |
278 | * in queue @q. | 330 | * in queue @q. |
279 | */ | 331 | */ |
280 | DEFINE_EVENT(block_bio, block_bio_backmerge, | 332 | DEFINE_EVENT(block_bio_merge, block_bio_backmerge, |
281 | 333 | ||
282 | TP_PROTO(struct request_queue *q, struct bio *bio), | 334 | TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), |
283 | 335 | ||
284 | TP_ARGS(q, bio) | 336 | TP_ARGS(q, rq, bio) |
285 | ); | 337 | ); |
286 | 338 | ||
287 | /** | 339 | /** |
288 | * block_bio_frontmerge - merging block operation to the beginning of an existing operation | 340 | * block_bio_frontmerge - merging block operation to the beginning of an existing operation |
289 | * @q: queue holding operation | 341 | * @q: queue holding operation |
342 | * @rq: request bio is being merged into | ||
290 | * @bio: new block operation to merge | 343 | * @bio: new block operation to merge |
291 | * | 344 | * |
292 | * Merging block IO operation @bio to the beginning of an existing block | 345 | * Merging block IO operation @bio to the beginning of an existing block |
293 | * operation in queue @q. | 346 | * operation in queue @q. |
294 | */ | 347 | */ |
295 | DEFINE_EVENT(block_bio, block_bio_frontmerge, | 348 | DEFINE_EVENT(block_bio_merge, block_bio_frontmerge, |
296 | 349 | ||
297 | TP_PROTO(struct request_queue *q, struct bio *bio), | 350 | TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), |
298 | 351 | ||
299 | TP_ARGS(q, bio) | 352 | TP_ARGS(q, rq, bio) |
300 | ); | 353 | ); |
301 | 354 | ||
302 | /** | 355 | /** |
@@ -306,11 +359,32 @@ DEFINE_EVENT(block_bio, block_bio_frontmerge, | |||
306 | * | 359 | * |
307 | * About to place the block IO operation @bio into queue @q. | 360 | * About to place the block IO operation @bio into queue @q. |
308 | */ | 361 | */ |
309 | DEFINE_EVENT(block_bio, block_bio_queue, | 362 | TRACE_EVENT(block_bio_queue, |
310 | 363 | ||
311 | TP_PROTO(struct request_queue *q, struct bio *bio), | 364 | TP_PROTO(struct request_queue *q, struct bio *bio), |
312 | 365 | ||
313 | TP_ARGS(q, bio) | 366 | TP_ARGS(q, bio), |
367 | |||
368 | TP_STRUCT__entry( | ||
369 | __field( dev_t, dev ) | ||
370 | __field( sector_t, sector ) | ||
371 | __field( unsigned int, nr_sector ) | ||
372 | __array( char, rwbs, RWBS_LEN ) | ||
373 | __array( char, comm, TASK_COMM_LEN ) | ||
374 | ), | ||
375 | |||
376 | TP_fast_assign( | ||
377 | __entry->dev = bio->bi_bdev->bd_dev; | ||
378 | __entry->sector = bio->bi_sector; | ||
379 | __entry->nr_sector = bio->bi_size >> 9; | ||
380 | blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); | ||
381 | memcpy(__entry->comm, current->comm, TASK_COMM_LEN); | ||
382 | ), | ||
383 | |||
384 | TP_printk("%d,%d %s %llu + %u [%s]", | ||
385 | MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, | ||
386 | (unsigned long long)__entry->sector, | ||
387 | __entry->nr_sector, __entry->comm) | ||
314 | ); | 388 | ); |
315 | 389 | ||
316 | DECLARE_EVENT_CLASS(block_get_rq, | 390 | DECLARE_EVENT_CLASS(block_get_rq, |
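Because block_bio_complete now carries only (bio, error), every consumer's probe signature changes with it; blktrace's probe further below is the in-tree example. A hedged sketch of attaching a probe from a module, assuming the tracepoint is exported to modules as the previous dm/raid5 callers suggest:

#include <linux/module.h>
#include <linux/tracepoint.h>
#include <trace/events/block.h>

/* Probe matching the new proto: (void *data, struct bio *bio, int error). */
static void my_probe(void *ignore, struct bio *bio, int error)
{
        /* runs at every bio completion; must be fast and atomic-safe */
}

static int __init my_init(void)
{
        return register_trace_block_bio_complete(my_probe, NULL);
}

static void __exit my_exit(void)
{
        unregister_trace_block_bio_complete(my_probe, NULL);
        tracepoint_synchronize_unregister();    /* wait out in-flight probes */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");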
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b453d92c2253..6a16fd2e70ed 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h | |||
@@ -32,6 +32,115 @@ | |||
32 | 32 | ||
33 | struct wb_writeback_work; | 33 | struct wb_writeback_work; |
34 | 34 | ||
35 | TRACE_EVENT(writeback_dirty_page, | ||
36 | |||
37 | TP_PROTO(struct page *page, struct address_space *mapping), | ||
38 | |||
39 | TP_ARGS(page, mapping), | ||
40 | |||
41 | TP_STRUCT__entry ( | ||
42 | __array(char, name, 32) | ||
43 | __field(unsigned long, ino) | ||
44 | __field(pgoff_t, index) | ||
45 | ), | ||
46 | |||
47 | TP_fast_assign( | ||
48 | strncpy(__entry->name, | ||
49 | mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32); | ||
50 | __entry->ino = mapping ? mapping->host->i_ino : 0; | ||
51 | __entry->index = page->index; | ||
52 | ), | ||
53 | |||
54 | TP_printk("bdi %s: ino=%lu index=%lu", | ||
55 | __entry->name, | ||
56 | __entry->ino, | ||
57 | __entry->index | ||
58 | ) | ||
59 | ); | ||
60 | |||
61 | DECLARE_EVENT_CLASS(writeback_dirty_inode_template, | ||
62 | |||
63 | TP_PROTO(struct inode *inode, int flags), | ||
64 | |||
65 | TP_ARGS(inode, flags), | ||
66 | |||
67 | TP_STRUCT__entry ( | ||
68 | __array(char, name, 32) | ||
69 | __field(unsigned long, ino) | ||
70 | __field(unsigned long, flags) | ||
71 | ), | ||
72 | |||
73 | TP_fast_assign( | ||
74 | struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; | ||
75 | |||
76 | /* may be called for files on pseudo FSes w/ unregistered bdi */ | ||
77 | strncpy(__entry->name, | ||
78 | bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); | ||
79 | __entry->ino = inode->i_ino; | ||
80 | __entry->flags = flags; | ||
81 | ), | ||
82 | |||
83 | TP_printk("bdi %s: ino=%lu flags=%s", | ||
84 | __entry->name, | ||
85 | __entry->ino, | ||
86 | show_inode_state(__entry->flags) | ||
87 | ) | ||
88 | ); | ||
89 | |||
90 | DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, | ||
91 | |||
92 | TP_PROTO(struct inode *inode, int flags), | ||
93 | |||
94 | TP_ARGS(inode, flags) | ||
95 | ); | ||
96 | |||
97 | DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, | ||
98 | |||
99 | TP_PROTO(struct inode *inode, int flags), | ||
100 | |||
101 | TP_ARGS(inode, flags) | ||
102 | ); | ||
103 | |||
104 | DECLARE_EVENT_CLASS(writeback_write_inode_template, | ||
105 | |||
106 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), | ||
107 | |||
108 | TP_ARGS(inode, wbc), | ||
109 | |||
110 | TP_STRUCT__entry ( | ||
111 | __array(char, name, 32) | ||
112 | __field(unsigned long, ino) | ||
113 | __field(int, sync_mode) | ||
114 | ), | ||
115 | |||
116 | TP_fast_assign( | ||
117 | strncpy(__entry->name, | ||
118 | dev_name(inode->i_mapping->backing_dev_info->dev), 32); | ||
119 | __entry->ino = inode->i_ino; | ||
120 | __entry->sync_mode = wbc->sync_mode; | ||
121 | ), | ||
122 | |||
123 | TP_printk("bdi %s: ino=%lu sync_mode=%d", | ||
124 | __entry->name, | ||
125 | __entry->ino, | ||
126 | __entry->sync_mode | ||
127 | ) | ||
128 | ); | ||
129 | |||
130 | DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, | ||
131 | |||
132 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), | ||
133 | |||
134 | TP_ARGS(inode, wbc) | ||
135 | ); | ||
136 | |||
137 | DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, | ||
138 | |||
139 | TP_PROTO(struct inode *inode, struct writeback_control *wbc), | ||
140 | |||
141 | TP_ARGS(inode, wbc) | ||
142 | ); | ||
143 | |||
35 | DECLARE_EVENT_CLASS(writeback_work_class, | 144 | DECLARE_EVENT_CLASS(writeback_work_class, |
36 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), | 145 | TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), |
37 | TP_ARGS(bdi, work), | 146 | TP_ARGS(bdi, work), |
@@ -479,6 +588,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, | |||
479 | ) | 588 | ) |
480 | ); | 589 | ); |
481 | 590 | ||
591 | DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, | ||
592 | TP_PROTO(struct inode *inode, | ||
593 | struct writeback_control *wbc, | ||
594 | unsigned long nr_to_write), | ||
595 | TP_ARGS(inode, wbc, nr_to_write) | ||
596 | ); | ||
597 | |||
482 | DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, | 598 | DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, |
483 | TP_PROTO(struct inode *inode, | 599 | TP_PROTO(struct inode *inode, |
484 | struct writeback_control *wbc, | 600 | struct writeback_control *wbc, |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 12af4270c9c1..7f12624a393c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -3258,7 +3258,8 @@ void complete_all(struct completion *x) | |||
3258 | EXPORT_SYMBOL(complete_all); | 3258 | EXPORT_SYMBOL(complete_all); |
3259 | 3259 | ||
3260 | static inline long __sched | 3260 | static inline long __sched |
3261 | do_wait_for_common(struct completion *x, long timeout, int state) | 3261 | do_wait_for_common(struct completion *x, |
3262 | long (*action)(long), long timeout, int state) | ||
3262 | { | 3263 | { |
3263 | if (!x->done) { | 3264 | if (!x->done) { |
3264 | DECLARE_WAITQUEUE(wait, current); | 3265 | DECLARE_WAITQUEUE(wait, current); |
@@ -3271,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
3271 | } | 3272 | } |
3272 | __set_current_state(state); | 3273 | __set_current_state(state); |
3273 | spin_unlock_irq(&x->wait.lock); | 3274 | spin_unlock_irq(&x->wait.lock); |
3274 | timeout = schedule_timeout(timeout); | 3275 | timeout = action(timeout); |
3275 | spin_lock_irq(&x->wait.lock); | 3276 | spin_lock_irq(&x->wait.lock); |
3276 | } while (!x->done && timeout); | 3277 | } while (!x->done && timeout); |
3277 | __remove_wait_queue(&x->wait, &wait); | 3278 | __remove_wait_queue(&x->wait, &wait); |
@@ -3282,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
3282 | return timeout ?: 1; | 3283 | return timeout ?: 1; |
3283 | } | 3284 | } |
3284 | 3285 | ||
3285 | static long __sched | 3286 | static inline long __sched |
3286 | wait_for_common(struct completion *x, long timeout, int state) | 3287 | __wait_for_common(struct completion *x, |
3288 | long (*action)(long), long timeout, int state) | ||
3287 | { | 3289 | { |
3288 | might_sleep(); | 3290 | might_sleep(); |
3289 | 3291 | ||
3290 | spin_lock_irq(&x->wait.lock); | 3292 | spin_lock_irq(&x->wait.lock); |
3291 | timeout = do_wait_for_common(x, timeout, state); | 3293 | timeout = do_wait_for_common(x, action, timeout, state); |
3292 | spin_unlock_irq(&x->wait.lock); | 3294 | spin_unlock_irq(&x->wait.lock); |
3293 | return timeout; | 3295 | return timeout; |
3294 | } | 3296 | } |
3295 | 3297 | ||
3298 | static long __sched | ||
3299 | wait_for_common(struct completion *x, long timeout, int state) | ||
3300 | { | ||
3301 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
3302 | } | ||
3303 | |||
3304 | static long __sched | ||
3305 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
3306 | { | ||
3307 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
3308 | } | ||
3309 | |||
3296 | /** | 3310 | /** |
3297 | * wait_for_completion: - waits for completion of a task | 3311 | * wait_for_completion: - waits for completion of a task |
3298 | * @x: holds the state of this particular completion | 3312 | * @x: holds the state of this particular completion |
@@ -3329,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) | |||
3329 | EXPORT_SYMBOL(wait_for_completion_timeout); | 3343 | EXPORT_SYMBOL(wait_for_completion_timeout); |
3330 | 3344 | ||
3331 | /** | 3345 | /** |
3346 | * wait_for_completion_io: - waits for completion of a task | ||
3347 | * @x: holds the state of this particular completion | ||
3348 | * | ||
3349 | * This waits to be signaled for completion of a specific task. It is NOT | ||
3350 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
3351 | * for IO. | ||
3352 | */ | ||
3353 | void __sched wait_for_completion_io(struct completion *x) | ||
3354 | { | ||
3355 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
3356 | } | ||
3357 | EXPORT_SYMBOL(wait_for_completion_io); | ||
3358 | |||
3359 | /** | ||
3360 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
3361 | * @x: holds the state of this particular completion | ||
3362 | * @timeout: timeout value in jiffies | ||
3363 | * | ||
3364 | * This waits for either a completion of a specific task to be signaled or for a | ||
3365 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
3366 | * interruptible. The caller is accounted as waiting for IO. | ||
3367 | * | ||
3368 | * The return value is 0 if timed out, and positive (at least 1, or number of | ||
3369 | * jiffies left till timeout) if completed. | ||
3370 | */ | ||
3371 | unsigned long __sched | ||
3372 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
3373 | { | ||
3374 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
3375 | } | ||
3376 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
3377 | |||
3378 | /** | ||
3332 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | 3379 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) |
3333 | * @x: holds the state of this particular completion | 3380 | * @x: holds the state of this particular completion |
3334 | * | 3381 | * |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 71259e2b6b61..9e5b8c272eec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore, | |||
739 | struct request_queue *q, | 739 | struct request_queue *q, |
740 | struct request *rq) | 740 | struct request *rq) |
741 | { | 741 | { |
742 | struct blk_trace *bt = q->blk_trace; | ||
743 | |||
744 | /* if control ever passes through here, it's a request based driver */ | ||
745 | if (unlikely(bt && !bt->rq_based)) | ||
746 | bt->rq_based = true; | ||
747 | |||
742 | blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); | 748 | blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); |
743 | } | 749 | } |
744 | 750 | ||
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore, | |||
774 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); | 780 | blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); |
775 | } | 781 | } |
776 | 782 | ||
777 | static void blk_add_trace_bio_complete(void *ignore, | 783 | static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) |
778 | struct request_queue *q, struct bio *bio, | ||
779 | int error) | ||
780 | { | 784 | { |
785 | struct request_queue *q; | ||
786 | struct blk_trace *bt; | ||
787 | |||
788 | if (!bio->bi_bdev) | ||
789 | return; | ||
790 | |||
791 | q = bdev_get_queue(bio->bi_bdev); | ||
792 | bt = q->blk_trace; | ||
793 | |||
794 | /* | ||
795 | * Request based drivers will generate both rq and bio completions. | ||
796 | * Ignore bio ones. | ||
797 | */ | ||
798 | if (likely(!bt) || bt->rq_based) | ||
799 | return; | ||
800 | |||
781 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); | 801 | blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); |
782 | } | 802 | } |
783 | 803 | ||
784 | static void blk_add_trace_bio_backmerge(void *ignore, | 804 | static void blk_add_trace_bio_backmerge(void *ignore, |
785 | struct request_queue *q, | 805 | struct request_queue *q, |
806 | struct request *rq, | ||
786 | struct bio *bio) | 807 | struct bio *bio) |
787 | { | 808 | { |
788 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); | 809 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); |
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore, | |||
790 | 811 | ||
791 | static void blk_add_trace_bio_frontmerge(void *ignore, | 812 | static void blk_add_trace_bio_frontmerge(void *ignore, |
792 | struct request_queue *q, | 813 | struct request_queue *q, |
814 | struct request *rq, | ||
793 | struct bio *bio) | 815 | struct bio *bio) |
794 | { | 816 | { |
795 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); | 817 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cdc377c456c0..742c40583159 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1986,6 +1986,8 @@ int __set_page_dirty_no_writeback(struct page *page) | |||
1986 | */ | 1986 | */ |
1987 | void account_page_dirtied(struct page *page, struct address_space *mapping) | 1987 | void account_page_dirtied(struct page *page, struct address_space *mapping) |
1988 | { | 1988 | { |
1989 | trace_writeback_dirty_page(page, mapping); | ||
1990 | |||
1989 | if (mapping_cap_account_dirty(mapping)) { | 1991 | if (mapping_cap_account_dirty(mapping)) { |
1990 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1992 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1991 | __inc_zone_page_state(page, NR_DIRTIED); | 1993 | __inc_zone_page_state(page, NR_DIRTIED); |