author    Linus Torvalds <torvalds@linux-foundation.org>  2013-02-28 15:52:24 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-02-28 15:52:24 -0500
commit    ee89f81252179dcbf6cd65bd48299f5e52292d88 (patch)
tree      805846cd12821f84cfe619d44c9e3e36e0b0f9e6
parent    21f3b24da9328415792efc780f50b9f434c12465 (diff)
parent    de33127d8d3f1d570aad8c2223cd81b206636bc1 (diff)
Merge branch 'for-3.9/core' of git://git.kernel.dk/linux-block
Pull block IO core bits from Jens Axboe:
 "Below are the core block IO bits for 3.9. It was delayed a few days
  since my workstation kept crashing every 2-8h after pulling it into
  current -git, but turns out it is a bug in the new pstate code (divide
  by zero, will report separately). In any case, it contains:

   - The big cfq/blkcg update from Tejun and Vivek.

   - Additional block and writeback tracepoints from Tejun.

   - Improvement of the should-sort (based on queues) logic in the plug
     flushing.

   - _io() variants of the wait_for_completion() interface, using
     io_schedule() instead of schedule() to contribute to io wait
     properly.

   - Various little fixes.

  You'll get two trivial merge conflicts, which should be easy enough to
  fix up"

Fix up the trivial conflicts due to hlist traversal cleanups (commit
b67bfe0d42ca: "hlist: drop the node parameter from iterators").

* 'for-3.9/core' of git://git.kernel.dk/linux-block: (39 commits)
  block: remove redundant check to bd_openers()
  block: use i_size_write() in bd_set_size()
  cfq: fix lock imbalance with failed allocations
  drivers/block/swim3.c: fix null pointer dereference
  block: don't select PERCPU_RWSEM
  block: account iowait time when waiting for completion of IO request
  sched: add wait_for_completion_io[_timeout]
  writeback: add more tracepoints
  block: add block_{touch|dirty}_buffer tracepoint
  buffer: make touch_buffer() an exported function
  block: add @req to bio_{front|back}_merge tracepoints
  block: add missing block_bio_complete() tracepoint
  block: Remove should_sort judgement when flush blk_plug
  block,elevator: use new hashtable implementation
  cfq-iosched: add hierarchical cfq_group statistics
  cfq-iosched: collect stats from dead cfqgs
  cfq-iosched: separate out cfqg_stats_reset() from cfq_pd_reset_stats()
  blkcg: make blkcg_print_blkgs() grab q locks instead of blkcg lock
  block: RCU free request_queue
  blkcg: implement blkg_[rw]stat_recursive_sum() and blkg_[rw]stat_merge()
  ...
-rw-r--r--  Documentation/block/cfq-iosched.txt         |  58
-rw-r--r--  Documentation/cgroups/blkio-controller.txt  |  35
-rw-r--r--  block/Kconfig                               |   1
-rw-r--r--  block/blk-cgroup.c                          | 277
-rw-r--r--  block/blk-cgroup.h                          |  68
-rw-r--r--  block/blk-core.c                            |  18
-rw-r--r--  block/blk-exec.c                            |   4
-rw-r--r--  block/blk-flush.c                           |   2
-rw-r--r--  block/blk-lib.c                             |   6
-rw-r--r--  block/blk-sysfs.c                           |   9
-rw-r--r--  block/blk.h                                 |   2
-rw-r--r--  block/cfq-iosched.c                         | 629
-rw-r--r--  block/elevator.c                            |  23
-rw-r--r--  drivers/block/swim3.c                       |   5
-rw-r--r--  drivers/md/dm.c                             |   1
-rw-r--r--  drivers/md/raid5.c                          |  11
-rw-r--r--  fs/bio.c                                    |   2
-rw-r--r--  fs/block_dev.c                              |   6
-rw-r--r--  fs/buffer.c                                 |  10
-rw-r--r--  fs/fs-writeback.c                           |  16
-rw-r--r--  include/linux/blkdev.h                      |   3
-rw-r--r--  include/linux/blktrace_api.h                |   1
-rw-r--r--  include/linux/buffer_head.h                 |   2
-rw-r--r--  include/linux/completion.h                  |   3
-rw-r--r--  include/linux/elevator.h                    |   5
-rw-r--r--  include/trace/events/block.h                | 104
-rw-r--r--  include/trace/events/writeback.h            | 116
-rw-r--r--  kernel/sched/core.c                         |  57
-rw-r--r--  kernel/trace/blktrace.c                     |  28
-rw-r--r--  mm/page-writeback.c                         |   2
30 files changed, 1246 insertions, 258 deletions
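
The _io() wait primitives called out in the commit message are what the blk-flush.c and blk-lib.c hunks below switch to. As a rough, self-contained sketch of the calling pattern (not code from this series; flush_done() and submit_flush_and_wait() are illustrative names), a submitter replaces wait_for_completion() with wait_for_completion_io() so the sleep is charged to iowait:

```c
/*
 * Sketch only: mirrors the pattern blkdev_issue_flush() switches to in
 * the blk-flush.c hunk below.  flush_done() and submit_flush_and_wait()
 * are made-up names, not part of this patch set.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

static void flush_done(struct bio *bio, int error)
{
	complete(bio->bi_private);	/* wake up the submitter */
}

static int submit_flush_and_wait(struct block_device *bdev)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 0);
	if (!bio)
		return -ENOMEM;
	bio->bi_end_io = flush_done;
	bio->bi_bdev = bdev;
	bio->bi_private = &wait;

	submit_bio(WRITE_FLUSH, bio);

	/*
	 * The _io variant sleeps via io_schedule(), so the time spent
	 * waiting is accounted as iowait rather than plain sleep.
	 */
	wait_for_completion_io(&wait);

	bio_put(bio);
	return 0;
}
```

The only advertised difference is the accounting; the completion semantics are unchanged.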
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index d89b4fe724d7..a5eb7d19a65d 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -102,6 +102,64 @@ processing of request. Therefore, increasing the value can imporve the
 performace although this can cause the latency of some I/O to increase due
 to more number of requests.
 
+CFQ Group scheduling
+====================
+
+CFQ supports blkio cgroup and has "blkio." prefixed files in each
+blkio cgroup directory. It is weight-based and there are four knobs
+for configuration - weight[_device] and leaf_weight[_device].
+Internal cgroup nodes (the ones with children) can also have tasks in
+them, so the former two configure how much proportion the cgroup as a
+whole is entitled to at its parent's level while the latter two
+configure how much proportion the tasks in the cgroup have compared to
+its direct children.
+
+Another way to think about it is assuming that each internal node has
+an implicit leaf child node which hosts all the tasks whose weight is
+configured by leaf_weight[_device]. Let's assume a blkio hierarchy
+composed of five cgroups - root, A, B, AA and AB - with the following
+weights where the names represent the hierarchy.
+
+        weight leaf_weight
+ root :  125    125
+ A    :  500    750
+ B    :  250    500
+ AA   :  500    500
+ AB   : 1000    500
+
+root never has a parent making its weight is meaningless. For backward
+compatibility, weight is always kept in sync with leaf_weight. B, AA
+and AB have no child and thus its tasks have no children cgroup to
+compete with. They always get 100% of what the cgroup won at the
+parent level. Considering only the weights which matter, the hierarchy
+looks like the following.
+
+          root
+        /   |   \
+       A    B    leaf
+      500  250   125
+     /   |   \
+    AA   AB   leaf
+   500  1000  750
+
+If all cgroups have active IOs and competing with each other, disk
+time will be distributed like the following.
+
+Distribution below root. The total active weight at this level is
+A:500 + B:250 + C:125 = 875.
+
+ root-leaf : 125 / 875      =~ 14%
+ A         : 500 / 875      =~ 57%
+ B(-leaf)  : 250 / 875      =~ 28%
+
+A has children and further distributes its 57% among the children and
+the implicit leaf node. The total active weight at this level is
+AA:500 + AB:1000 + A-leaf:750 = 2250.
+
+ A-leaf    : ( 750 / 2250) * A =~ 19%
+ AA(-leaf) : ( 500 / 2250) * A =~ 12%
+ AB(-leaf) : (1000 / 2250) * A =~ 25%
+
 CFQ IOPS Mode for group scheduling
 ===================================
 Basic CFQ design is to provide priority based time slices. Higher priority
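
The percentages quoted in the added documentation follow from dividing each weight by the total active weight at its level and, for A's children, multiplying by A's share. A small user-space sketch (not part of the patch) that reproduces the numbers:

```c
/* Standalone sketch reproducing the distribution example above. */
#include <stdio.h>

int main(void)
{
	/* active weights below root: A, B and root's implicit leaf */
	double A = 500, B = 250, root_leaf = 125;
	double at_root = A + B + root_leaf;		/* 875 */

	/* active weights below A: AA, AB and A's implicit leaf */
	double AA = 500, AB = 1000, A_leaf = 750;
	double below_A = AA + AB + A_leaf;		/* 2250 */
	double A_share = A / at_root;			/* A's ~57% of disk time */

	printf("root-leaf : %.0f%%\n", 100 * root_leaf / at_root);
	printf("A         : %.0f%%\n", 100 * A / at_root);
	printf("B(-leaf)  : %.0f%%\n", 100 * B / at_root);
	printf("A-leaf    : %.0f%%\n", 100 * A_share * A_leaf / below_A);
	printf("AA(-leaf) : %.0f%%\n", 100 * A_share * AA / below_A);
	printf("AB(-leaf) : %.0f%%\n", 100 * A_share * AB / below_A);
	return 0;
}
```

The document truncates rather than rounds, which is why it quotes 28% and 12% where this sketch prints 29% and 13%.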
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index a794ce91a2d5..da272c8f44e7 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,13 +94,11 @@ Throttling/Upper Limit policy
 
 Hierarchical Cgroups
 ====================
-- Currently none of the IO control policy supports hierarchical groups. But
-  cgroup interface does allow creation of hierarchical cgroups and internally
-  IO policies treat them as flat hierarchy.
+- Currently only CFQ supports hierarchical groups. For throttling,
+  cgroup interface does allow creation of hierarchical cgroups and
+  internally it treats them as flat hierarchy.
 
-  So this patch will allow creation of cgroup hierarchcy but at the backend
-  everything will be treated as flat. So if somebody created a hierarchy like
-  as follows.
+  If somebody created a hierarchy like as follows.
 
 			root
 			/  \
@@ -108,16 +106,20 @@ Hierarchical Cgroups
 			|
 			test3
 
-	CFQ and throttling will practically treat all groups at same level.
+	CFQ will handle the hierarchy correctly but and throttling will
+	practically treat all groups at same level. For details on CFQ
+	hierarchy support, refer to Documentation/block/cfq-iosched.txt.
+	Throttling will treat the hierarchy as if it looks like the
+	following.
 
 				pivot
 			     /  /   \  \
 			root  test1 test2  test3
 
-	Down the line we can implement hierarchical accounting/control support
-	and also introduce a new cgroup file "use_hierarchy" which will control
-	whether cgroup hierarchy is viewed as flat or hierarchical by the policy..
-	This is how memory controller also has implemented the things.
+	Nesting cgroups, while allowed, isn't officially supported and blkio
+	genereates warning when cgroups nest. Once throttling implements
+	hierarchy support, hierarchy will be supported and the warning will
+	be removed.
 
 Various user visible config options
 ===================================
@@ -172,6 +174,12 @@ Proportional weight policy files
 	  dev     weight
 	  8:16    300
 
+- blkio.leaf_weight[_device]
+	- Equivalents of blkio.weight[_device] for the purpose of
+	  deciding how much weight tasks in the given cgroup has while
+	  competing with the cgroup's child cgroups. For details,
+	  please refer to Documentation/block/cfq-iosched.txt.
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -279,6 +287,11 @@ Proportional weight policy files
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.*_recursive
+	- Recursive version of various stats. These files show the
+	  same information as their non-recursive counterparts but
+	  include stats from all the descendant cgroups.
+
 Throttling/Upper limit policy files
 -----------------------------------
 - blkio.throttle.read_bps_device
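
As a usage illustration of the new leaf_weight knob documented above (not part of the patch): the blkio control files are plain files under the controller's cgroupfs mount, so they can be written from user space. The /sys/fs/cgroup/blkio mount point and the group name "A" below are assumptions:

```c
/* Sketch: set a cgroup's leaf weight via the cgroup filesystem. */
#include <stdio.h>

static int set_leaf_weight(const char *cgroup_path, unsigned int weight)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/blkio.leaf_weight", cgroup_path);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%u\n", weight);
	return fclose(f);
}

int main(void)
{
	/* assumed mount point and group layout */
	return set_leaf_weight("/sys/fs/cgroup/blkio/A", 750);
}
```

leaf_weight accepts the same values as blkio.weight, per the "Equivalents of blkio.weight[_device]" description above.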
diff --git a/block/Kconfig b/block/Kconfig
index 4a85ccf8d4cf..a7e40a7c8214 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,7 +4,6 @@
 menuconfig BLOCK
 	bool "Enable the block layer" if EXPERT
 	default y
-	select PERCPU_RWSEM
 	help
 	 Provide block layer support for the kernel.
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8bdebb6781e1..b2b9837f9dd3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -26,11 +26,32 @@
26 26
27static DEFINE_MUTEX(blkcg_pol_mutex); 27static DEFINE_MUTEX(blkcg_pol_mutex);
28 28
29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; 29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
30 .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
30EXPORT_SYMBOL_GPL(blkcg_root); 31EXPORT_SYMBOL_GPL(blkcg_root);
31 32
32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
33 34
35static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
36 struct request_queue *q, bool update_hint);
37
38/**
39 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
40 * @d_blkg: loop cursor pointing to the current descendant
41 * @pos_cgrp: used for iteration
42 * @p_blkg: target blkg to walk descendants of
43 *
44 * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
45 * read locked. If called under either blkcg or queue lock, the iteration
46 * is guaranteed to include all and only online blkgs. The caller may
47 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
48 * subtree.
49 */
50#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
51 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
52 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
53 (p_blkg)->q, false)))
54
34static bool blkcg_policy_enabled(struct request_queue *q, 55static bool blkcg_policy_enabled(struct request_queue *q,
35 const struct blkcg_policy *pol) 56 const struct blkcg_policy *pol)
36{ 57{
@@ -112,9 +133,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
112 133
113 blkg->pd[i] = pd; 134 blkg->pd[i] = pd;
114 pd->blkg = blkg; 135 pd->blkg = blkg;
136 pd->plid = i;
115 137
116 /* invoke per-policy init */ 138 /* invoke per-policy init */
117 if (blkcg_policy_enabled(blkg->q, pol)) 139 if (pol->pd_init_fn)
118 pol->pd_init_fn(blkg); 140 pol->pd_init_fn(blkg);
119 } 141 }
120 142
@@ -125,8 +147,19 @@ err_free:
125 return NULL; 147 return NULL;
126} 148}
127 149
150/**
151 * __blkg_lookup - internal version of blkg_lookup()
152 * @blkcg: blkcg of interest
153 * @q: request_queue of interest
154 * @update_hint: whether to update lookup hint with the result or not
155 *
156 * This is internal version and shouldn't be used by policy
157 * implementations. Looks up blkgs for the @blkcg - @q pair regardless of
158 * @q's bypass state. If @update_hint is %true, the caller should be
159 * holding @q->queue_lock and lookup hint is updated on success.
160 */
128static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 161static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
129 struct request_queue *q) 162 struct request_queue *q, bool update_hint)
130{ 163{
131 struct blkcg_gq *blkg; 164 struct blkcg_gq *blkg;
132 165
@@ -135,14 +168,19 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
135 return blkg; 168 return blkg;
136 169
137 /* 170 /*
138 * Hint didn't match. Look up from the radix tree. Note that we 171 * Hint didn't match. Look up from the radix tree. Note that the
139 * may not be holding queue_lock and thus are not sure whether 172 * hint can only be updated under queue_lock as otherwise @blkg
140 * @blkg from blkg_tree has already been removed or not, so we 173 * could have already been removed from blkg_tree. The caller is
141 * can't update hint to the lookup result. Leave it to the caller. 174 * responsible for grabbing queue_lock if @update_hint.
142 */ 175 */
143 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); 176 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
144 if (blkg && blkg->q == q) 177 if (blkg && blkg->q == q) {
178 if (update_hint) {
179 lockdep_assert_held(q->queue_lock);
180 rcu_assign_pointer(blkcg->blkg_hint, blkg);
181 }
145 return blkg; 182 return blkg;
183 }
146 184
147 return NULL; 185 return NULL;
148} 186}
@@ -162,7 +200,7 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
162 200
163 if (unlikely(blk_queue_bypass(q))) 201 if (unlikely(blk_queue_bypass(q)))
164 return NULL; 202 return NULL;
165 return __blkg_lookup(blkcg, q); 203 return __blkg_lookup(blkcg, q, false);
166} 204}
167EXPORT_SYMBOL_GPL(blkg_lookup); 205EXPORT_SYMBOL_GPL(blkg_lookup);
168 206
@@ -170,75 +208,129 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
170 * If @new_blkg is %NULL, this function tries to allocate a new one as 208 * If @new_blkg is %NULL, this function tries to allocate a new one as
171 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. 209 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
172 */ 210 */
173static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, 211static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
174 struct request_queue *q, 212 struct request_queue *q,
175 struct blkcg_gq *new_blkg) 213 struct blkcg_gq *new_blkg)
176{ 214{
177 struct blkcg_gq *blkg; 215 struct blkcg_gq *blkg;
178 int ret; 216 int i, ret;
179 217
180 WARN_ON_ONCE(!rcu_read_lock_held()); 218 WARN_ON_ONCE(!rcu_read_lock_held());
181 lockdep_assert_held(q->queue_lock); 219 lockdep_assert_held(q->queue_lock);
182 220
183 /* lookup and update hint on success, see __blkg_lookup() for details */
184 blkg = __blkg_lookup(blkcg, q);
185 if (blkg) {
186 rcu_assign_pointer(blkcg->blkg_hint, blkg);
187 goto out_free;
188 }
189
190 /* blkg holds a reference to blkcg */ 221 /* blkg holds a reference to blkcg */
191 if (!css_tryget(&blkcg->css)) { 222 if (!css_tryget(&blkcg->css)) {
192 blkg = ERR_PTR(-EINVAL); 223 ret = -EINVAL;
193 goto out_free; 224 goto err_free_blkg;
194 } 225 }
195 226
196 /* allocate */ 227 /* allocate */
197 if (!new_blkg) { 228 if (!new_blkg) {
198 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); 229 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
199 if (unlikely(!new_blkg)) { 230 if (unlikely(!new_blkg)) {
200 blkg = ERR_PTR(-ENOMEM); 231 ret = -ENOMEM;
201 goto out_put; 232 goto err_put_css;
202 } 233 }
203 } 234 }
204 blkg = new_blkg; 235 blkg = new_blkg;
205 236
206 /* insert */ 237 /* link parent and insert */
238 if (blkcg_parent(blkcg)) {
239 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
240 if (WARN_ON_ONCE(!blkg->parent)) {
241 blkg = ERR_PTR(-EINVAL);
242 goto err_put_css;
243 }
244 blkg_get(blkg->parent);
245 }
246
207 spin_lock(&blkcg->lock); 247 spin_lock(&blkcg->lock);
208 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 248 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
209 if (likely(!ret)) { 249 if (likely(!ret)) {
210 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 250 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
211 list_add(&blkg->q_node, &q->blkg_list); 251 list_add(&blkg->q_node, &q->blkg_list);
252
253 for (i = 0; i < BLKCG_MAX_POLS; i++) {
254 struct blkcg_policy *pol = blkcg_policy[i];
255
256 if (blkg->pd[i] && pol->pd_online_fn)
257 pol->pd_online_fn(blkg);
258 }
212 } 259 }
260 blkg->online = true;
213 spin_unlock(&blkcg->lock); 261 spin_unlock(&blkcg->lock);
214 262
215 if (!ret) 263 if (!ret)
216 return blkg; 264 return blkg;
217 265
218 blkg = ERR_PTR(ret); 266 /* @blkg failed fully initialized, use the usual release path */
219out_put: 267 blkg_put(blkg);
268 return ERR_PTR(ret);
269
270err_put_css:
220 css_put(&blkcg->css); 271 css_put(&blkcg->css);
221out_free: 272err_free_blkg:
222 blkg_free(new_blkg); 273 blkg_free(new_blkg);
223 return blkg; 274 return ERR_PTR(ret);
224} 275}
225 276
277/**
278 * blkg_lookup_create - lookup blkg, try to create one if not there
279 * @blkcg: blkcg of interest
280 * @q: request_queue of interest
281 *
282 * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
283 * create one. blkg creation is performed recursively from blkcg_root such
284 * that all non-root blkg's have access to the parent blkg. This function
285 * should be called under RCU read lock and @q->queue_lock.
286 *
287 * Returns pointer to the looked up or created blkg on success, ERR_PTR()
288 * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not
289 * dead and bypassing, returns ERR_PTR(-EBUSY).
290 */
226struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 291struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
227 struct request_queue *q) 292 struct request_queue *q)
228{ 293{
294 struct blkcg_gq *blkg;
295
296 WARN_ON_ONCE(!rcu_read_lock_held());
297 lockdep_assert_held(q->queue_lock);
298
229 /* 299 /*
230 * This could be the first entry point of blkcg implementation and 300 * This could be the first entry point of blkcg implementation and
231 * we shouldn't allow anything to go through for a bypassing queue. 301 * we shouldn't allow anything to go through for a bypassing queue.
232 */ 302 */
233 if (unlikely(blk_queue_bypass(q))) 303 if (unlikely(blk_queue_bypass(q)))
234 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); 304 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
235 return __blkg_lookup_create(blkcg, q, NULL); 305
306 blkg = __blkg_lookup(blkcg, q, true);
307 if (blkg)
308 return blkg;
309
310 /*
311 * Create blkgs walking down from blkcg_root to @blkcg, so that all
312 * non-root blkgs have access to their parents.
313 */
314 while (true) {
315 struct blkcg *pos = blkcg;
316 struct blkcg *parent = blkcg_parent(blkcg);
317
318 while (parent && !__blkg_lookup(parent, q, false)) {
319 pos = parent;
320 parent = blkcg_parent(parent);
321 }
322
323 blkg = blkg_create(pos, q, NULL);
324 if (pos == blkcg || IS_ERR(blkg))
325 return blkg;
326 }
236} 327}
237EXPORT_SYMBOL_GPL(blkg_lookup_create); 328EXPORT_SYMBOL_GPL(blkg_lookup_create);
238 329
239static void blkg_destroy(struct blkcg_gq *blkg) 330static void blkg_destroy(struct blkcg_gq *blkg)
240{ 331{
241 struct blkcg *blkcg = blkg->blkcg; 332 struct blkcg *blkcg = blkg->blkcg;
333 int i;
242 334
243 lockdep_assert_held(blkg->q->queue_lock); 335 lockdep_assert_held(blkg->q->queue_lock);
244 lockdep_assert_held(&blkcg->lock); 336 lockdep_assert_held(&blkcg->lock);
@@ -247,6 +339,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
247 WARN_ON_ONCE(list_empty(&blkg->q_node)); 339 WARN_ON_ONCE(list_empty(&blkg->q_node));
248 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); 340 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
249 341
342 for (i = 0; i < BLKCG_MAX_POLS; i++) {
343 struct blkcg_policy *pol = blkcg_policy[i];
344
345 if (blkg->pd[i] && pol->pd_offline_fn)
346 pol->pd_offline_fn(blkg);
347 }
348 blkg->online = false;
349
250 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); 350 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
251 list_del_init(&blkg->q_node); 351 list_del_init(&blkg->q_node);
252 hlist_del_init_rcu(&blkg->blkcg_node); 352 hlist_del_init_rcu(&blkg->blkcg_node);
@@ -301,8 +401,10 @@ static void blkg_rcu_free(struct rcu_head *rcu_head)
301 401
302void __blkg_release(struct blkcg_gq *blkg) 402void __blkg_release(struct blkcg_gq *blkg)
303{ 403{
304 /* release the extra blkcg reference this blkg has been holding */ 404 /* release the blkcg and parent blkg refs this blkg has been holding */
305 css_put(&blkg->blkcg->css); 405 css_put(&blkg->blkcg->css);
406 if (blkg->parent)
407 blkg_put(blkg->parent);
306 408
307 /* 409 /*
308 * A group is freed in rcu manner. But having an rcu lock does not 410 * A group is freed in rcu manner. But having an rcu lock does not
@@ -401,8 +503,9 @@ static const char *blkg_dev_name(struct blkcg_gq *blkg)
401 * 503 *
402 * This function invokes @prfill on each blkg of @blkcg if pd for the 504 * This function invokes @prfill on each blkg of @blkcg if pd for the
403 * policy specified by @pol exists. @prfill is invoked with @sf, the 505 * policy specified by @pol exists. @prfill is invoked with @sf, the
404 * policy data and @data. If @show_total is %true, the sum of the return 506 * policy data and @data and the matching queue lock held. If @show_total
405 * values from @prfill is printed with "Total" label at the end. 507 * is %true, the sum of the return values from @prfill is printed with
508 * "Total" label at the end.
406 * 509 *
407 * This is to be used to construct print functions for 510 * This is to be used to construct print functions for
408 * cftype->read_seq_string method. 511 * cftype->read_seq_string method.
@@ -416,11 +519,14 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
416 struct blkcg_gq *blkg; 519 struct blkcg_gq *blkg;
417 u64 total = 0; 520 u64 total = 0;
418 521
419 spin_lock_irq(&blkcg->lock); 522 rcu_read_lock();
420 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) 523 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
524 spin_lock_irq(blkg->q->queue_lock);
421 if (blkcg_policy_enabled(blkg->q, pol)) 525 if (blkcg_policy_enabled(blkg->q, pol))
422 total += prfill(sf, blkg->pd[pol->plid], data); 526 total += prfill(sf, blkg->pd[pol->plid], data);
423 spin_unlock_irq(&blkcg->lock); 527 spin_unlock_irq(blkg->q->queue_lock);
528 }
529 rcu_read_unlock();
424 530
425 if (show_total) 531 if (show_total)
426 seq_printf(sf, "Total %llu\n", (unsigned long long)total); 532 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
@@ -479,6 +585,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
479 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); 585 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
480 return v; 586 return v;
481} 587}
588EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
482 589
483/** 590/**
484 * blkg_prfill_stat - prfill callback for blkg_stat 591 * blkg_prfill_stat - prfill callback for blkg_stat
@@ -512,6 +619,82 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
512EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); 619EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
513 620
514/** 621/**
622 * blkg_stat_recursive_sum - collect hierarchical blkg_stat
623 * @pd: policy private data of interest
624 * @off: offset to the blkg_stat in @pd
625 *
626 * Collect the blkg_stat specified by @off from @pd and all its online
627 * descendants and return the sum. The caller must be holding the queue
628 * lock for online tests.
629 */
630u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
631{
632 struct blkcg_policy *pol = blkcg_policy[pd->plid];
633 struct blkcg_gq *pos_blkg;
634 struct cgroup *pos_cgrp;
635 u64 sum;
636
637 lockdep_assert_held(pd->blkg->q->queue_lock);
638
639 sum = blkg_stat_read((void *)pd + off);
640
641 rcu_read_lock();
642 blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
643 struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
644 struct blkg_stat *stat = (void *)pos_pd + off;
645
646 if (pos_blkg->online)
647 sum += blkg_stat_read(stat);
648 }
649 rcu_read_unlock();
650
651 return sum;
652}
653EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
654
655/**
656 * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
657 * @pd: policy private data of interest
658 * @off: offset to the blkg_stat in @pd
659 *
660 * Collect the blkg_rwstat specified by @off from @pd and all its online
661 * descendants and return the sum. The caller must be holding the queue
662 * lock for online tests.
663 */
664struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
665 int off)
666{
667 struct blkcg_policy *pol = blkcg_policy[pd->plid];
668 struct blkcg_gq *pos_blkg;
669 struct cgroup *pos_cgrp;
670 struct blkg_rwstat sum;
671 int i;
672
673 lockdep_assert_held(pd->blkg->q->queue_lock);
674
675 sum = blkg_rwstat_read((void *)pd + off);
676
677 rcu_read_lock();
678 blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
679 struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
680 struct blkg_rwstat *rwstat = (void *)pos_pd + off;
681 struct blkg_rwstat tmp;
682
683 if (!pos_blkg->online)
684 continue;
685
686 tmp = blkg_rwstat_read(rwstat);
687
688 for (i = 0; i < BLKG_RWSTAT_NR; i++)
689 sum.cnt[i] += tmp.cnt[i];
690 }
691 rcu_read_unlock();
692
693 return sum;
694}
695EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
696
697/**
515 * blkg_conf_prep - parse and prepare for per-blkg config update 698 * blkg_conf_prep - parse and prepare for per-blkg config update
516 * @blkcg: target block cgroup 699 * @blkcg: target block cgroup
517 * @pol: target policy 700 * @pol: target policy
@@ -656,6 +839,7 @@ static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
656 return ERR_PTR(-ENOMEM); 839 return ERR_PTR(-ENOMEM);
657 840
658 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; 841 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
842 blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
659 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ 843 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
660done: 844done:
661 spin_lock_init(&blkcg->lock); 845 spin_lock_init(&blkcg->lock);
@@ -775,7 +959,7 @@ int blkcg_activate_policy(struct request_queue *q,
775 const struct blkcg_policy *pol) 959 const struct blkcg_policy *pol)
776{ 960{
777 LIST_HEAD(pds); 961 LIST_HEAD(pds);
778 struct blkcg_gq *blkg; 962 struct blkcg_gq *blkg, *new_blkg;
779 struct blkg_policy_data *pd, *n; 963 struct blkg_policy_data *pd, *n;
780 int cnt = 0, ret; 964 int cnt = 0, ret;
781 bool preloaded; 965 bool preloaded;
@@ -784,19 +968,27 @@ int blkcg_activate_policy(struct request_queue *q,
784 return 0; 968 return 0;
785 969
786 /* preallocations for root blkg */ 970 /* preallocations for root blkg */
787 blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); 971 new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
788 if (!blkg) 972 if (!new_blkg)
789 return -ENOMEM; 973 return -ENOMEM;
790 974
791 preloaded = !radix_tree_preload(GFP_KERNEL); 975 preloaded = !radix_tree_preload(GFP_KERNEL);
792 976
793 blk_queue_bypass_start(q); 977 blk_queue_bypass_start(q);
794 978
795 /* make sure the root blkg exists and count the existing blkgs */ 979 /*
980 * Make sure the root blkg exists and count the existing blkgs. As
981 * @q is bypassing at this point, blkg_lookup_create() can't be
982 * used. Open code it.
983 */
796 spin_lock_irq(q->queue_lock); 984 spin_lock_irq(q->queue_lock);
797 985
798 rcu_read_lock(); 986 rcu_read_lock();
799 blkg = __blkg_lookup_create(&blkcg_root, q, blkg); 987 blkg = __blkg_lookup(&blkcg_root, q, false);
988 if (blkg)
989 blkg_free(new_blkg);
990 else
991 blkg = blkg_create(&blkcg_root, q, new_blkg);
800 rcu_read_unlock(); 992 rcu_read_unlock();
801 993
802 if (preloaded) 994 if (preloaded)
@@ -844,6 +1036,7 @@ int blkcg_activate_policy(struct request_queue *q,
844 1036
845 blkg->pd[pol->plid] = pd; 1037 blkg->pd[pol->plid] = pd;
846 pd->blkg = blkg; 1038 pd->blkg = blkg;
1039 pd->plid = pol->plid;
847 pol->pd_init_fn(blkg); 1040 pol->pd_init_fn(blkg);
848 1041
849 spin_unlock(&blkg->blkcg->lock); 1042 spin_unlock(&blkg->blkcg->lock);
@@ -890,6 +1083,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
890 /* grab blkcg lock too while removing @pd from @blkg */ 1083 /* grab blkcg lock too while removing @pd from @blkg */
891 spin_lock(&blkg->blkcg->lock); 1084 spin_lock(&blkg->blkcg->lock);
892 1085
1086 if (pol->pd_offline_fn)
1087 pol->pd_offline_fn(blkg);
893 if (pol->pd_exit_fn) 1088 if (pol->pd_exit_fn)
894 pol->pd_exit_fn(blkg); 1089 pol->pd_exit_fn(blkg);
895 1090
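
The new blkg_stat_recursive_sum()/blkg_rwstat_recursive_sum() helpers above are what back the blkio.*_recursive files. A hedged sketch of how a policy's show callback might plug them into blkcg_print_blkgs(); the two function names are illustrative, the sketch assumes the existing __blkg_prfill_u64() helper, and the offsetof() trick relies on the policy data sitting at the start of cfq_group (as the "pd ... at beginning" comment in blk-cgroup.h notes):

```c
/* Illustrative call pattern only, not the exact functions added by this series. */
static u64 cfqg_prfill_time_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	/* this blkg's stat plus all of its online descendants' */
	u64 sum = blkg_stat_recursive_sum(pd, off);

	return __blkg_prfill_u64(sf, pd, sum);
}

static int cfqg_print_time_recursive(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *sf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

	/* prfill runs with each blkg's queue_lock held, see the comment above */
	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_time_recursive,
			  &blkcg_policy_cfq,
			  offsetof(struct cfq_group, stats.time), false);
	return 0;
}
```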
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 24597309e23d..f2b292925ccd 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -54,6 +54,7 @@ struct blkcg {
 
 	/* TODO: per-policy storage in blkcg */
 	unsigned int			cfq_weight;	/* belongs to cfq */
+	unsigned int			cfq_leaf_weight;
 };
 
 struct blkg_stat {
@@ -80,8 +81,9 @@ struct blkg_rwstat {
  * beginning and pd_size can't be smaller than pd.
  */
 struct blkg_policy_data {
-	/* the blkg this per-policy data belongs to */
+	/* the blkg and policy id this per-policy data belongs to */
 	struct blkcg_gq			*blkg;
+	int				plid;
 
 	/* used during policy activation */
 	struct list_head		alloc_node;
@@ -94,17 +96,27 @@ struct blkcg_gq {
 	struct list_head		q_node;
 	struct hlist_node		blkcg_node;
 	struct blkcg			*blkcg;
+
+	/* all non-root blkcg_gq's are guaranteed to have access to parent */
+	struct blkcg_gq			*parent;
+
 	/* request allocation list for this blkcg-q pair */
 	struct request_list		rl;
+
 	/* reference count */
 	int				refcnt;
 
+	/* is this blkg online? protected by both blkcg and q locks */
+	bool				online;
+
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head			rcu_head;
 };
 
 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
 
@@ -117,6 +129,8 @@ struct blkcg_policy {
 
 	/* operations */
 	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_online_pd_fn		*pd_online_fn;
+	blkcg_pol_offline_pd_fn		*pd_offline_fn;
 	blkcg_pol_exit_pd_fn		*pd_exit_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
@@ -150,6 +164,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
 
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+					     int off);
+
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
 	struct blkcg_gq			*blkg;
@@ -181,6 +199,19 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 }
 
 /**
+ * blkcg_parent - get the parent of a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * Return the parent blkcg of @blkcg.  Can be called anytime.
+ */
+static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+{
+	struct cgroup *pcg = blkcg->css.cgroup->parent;
+
+	return pcg ? cgroup_to_blkcg(pcg) : NULL;
+}
+
+/**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
  * @pol: policy of interest
@@ -387,6 +418,18 @@ static inline void blkg_stat_reset(struct blkg_stat *stat)
 }
 
 /**
+ * blkg_stat_merge - merge a blkg_stat into another
+ * @to: the destination blkg_stat
+ * @from: the source
+ *
+ * Add @from's count to @to.
+ */
+static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+{
+	blkg_stat_add(to, blkg_stat_read(from));
+}
+
+/**
  * blkg_rwstat_add - add a value to a blkg_rwstat
  * @rwstat: target blkg_rwstat
  * @rw: mask of REQ_{WRITE|SYNC}
@@ -434,14 +477,14 @@ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 }
 
 /**
- * blkg_rwstat_sum - read the total count of a blkg_rwstat
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
  * Return the total count of @rwstat regardless of the IO direction. This
  * function can be called without synchronization and takes care of u64
  * atomicity.
  */
-static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
 	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
@@ -457,6 +500,25 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
 }
 
+/**
+ * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's counts to @to.
+ */
+static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+				     struct blkg_rwstat *from)
+{
+	struct blkg_rwstat v = blkg_rwstat_read(from);
+	int i;
+
+	u64_stats_update_begin(&to->syncp);
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		to->cnt[i] += v.cnt[i];
+	u64_stats_update_end(&to->syncp);
+}
+
 #else	/* CONFIG_BLK_CGROUP */
 
 struct cgroup;
diff --git a/block/blk-core.c b/block/blk-core.c
index 277134cb5d32..074b758efc42 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,7 +39,6 @@
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
-EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
 
 DEFINE_IDA(blk_queue_ida);
@@ -1348,7 +1347,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 	if (!ll_back_merge_fn(q, req, bio))
 		return false;
 
-	trace_block_bio_backmerge(q, bio);
+	trace_block_bio_backmerge(q, req, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -1370,7 +1369,7 @@ static bool bio_attempt_front_merge(struct request_queue *q,
 	if (!ll_front_merge_fn(q, req, bio))
 		return false;
 
-	trace_block_bio_frontmerge(q, bio);
+	trace_block_bio_frontmerge(q, req, bio);
 
 	if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
 		blk_rq_set_mixed_merge(req);
@@ -1553,13 +1552,6 @@ get_rq:
 	if (list_empty(&plug->list))
 		trace_block_plug(q);
 	else {
-		if (!plug->should_sort) {
-			struct request *__rq;
-
-			__rq = list_entry_rq(plug->list.prev);
-			if (__rq->q != q)
-				plug->should_sort = 1;
-		}
 		if (request_count >= BLK_MAX_REQUEST_COUNT) {
 			blk_flush_plug_list(plug, false);
 			trace_block_plug(q);
@@ -2890,7 +2882,6 @@ void blk_start_plug(struct blk_plug *plug)
 	plug->magic = PLUG_MAGIC;
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->cb_list);
-	plug->should_sort = 0;
 
 	/*
 	 * If this is a nested plug, don't actually assign it. It will be
@@ -2992,10 +2983,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 	list_splice_init(&plug->list, &list);
 
-	if (plug->should_sort) {
-		list_sort(NULL, &list, plug_rq_cmp);
-		plug->should_sort = 0;
-	}
+	list_sort(NULL, &list, plug_rq_cmp);
 
 	q = NULL;
 	depth = 0;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index c88202f973d9..e70621396129 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -121,9 +121,9 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 	/* Prevent hang_check timer from firing at us during very long I/O */
 	hang_check = sysctl_hung_task_timeout_secs;
 	if (hang_check)
-		while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
+		while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
 	else
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (rq->errors)
 		err = -EIO;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 720ad607ff91..db8f1b507857 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -436,7 +436,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 
 	bio_get(bio);
 	submit_bio(WRITE_FLUSH, bio);
-	wait_for_completion(&wait);
+	wait_for_completion_io(&wait);
 
 	/*
 	 * The driver must store the error location in ->bi_sector, if
diff --git a/block/blk-lib.c b/block/blk-lib.c
index b3a1f2b70b31..d6f50d572565 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -126,7 +126,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		ret = -EIO;
@@ -200,7 +200,7 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		ret = -ENOTSUPP;
@@ -262,7 +262,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 
 	/* Wait for bios in-flight */
 	if (!atomic_dec_and_test(&bb.done))
-		wait_for_completion(&wait);
+		wait_for_completion_io(&wait);
 
 	if (!test_bit(BIO_UPTODATE, &bb.flags))
 		/* One of bios in the batch was completed with error.*/
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 788147797a79..6206a934eb8c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -497,6 +497,13 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 	return res;
 }
 
+static void blk_free_queue_rcu(struct rcu_head *rcu_head)
+{
+	struct request_queue *q = container_of(rcu_head, struct request_queue,
+					       rcu_head);
+	kmem_cache_free(blk_requestq_cachep, q);
+}
+
 /**
  * blk_release_queue: - release a &struct request_queue when it is no longer needed
  * @kobj: the kobj belonging to the request queue to be released
@@ -538,7 +545,7 @@ static void blk_release_queue(struct kobject *kobj)
 	bdi_destroy(&q->backing_dev_info);
 
 	ida_simple_remove(&blk_queue_ida, q->id);
-	kmem_cache_free(blk_requestq_cachep, q);
+	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
 
 static const struct sysfs_ops queue_sysfs_ops = {
diff --git a/block/blk.h b/block/blk.h
index 47fdfdd41520..e837b8f619b7 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -61,7 +61,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
 /*
  * Internal elevator interface
  */
-#define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
+#define ELV_ON_HASH(rq)		hash_hashed(&(rq)->hash)
 
 void blk_insert_flush(struct request *rq);
 void blk_abort_flushes(struct request_queue *q);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ec52807cdd09..4f0ade74cfd0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -85,7 +85,6 @@ struct cfq_rb_root {
85 struct rb_root rb; 85 struct rb_root rb;
86 struct rb_node *left; 86 struct rb_node *left;
87 unsigned count; 87 unsigned count;
88 unsigned total_weight;
89 u64 min_vdisktime; 88 u64 min_vdisktime;
90 struct cfq_ttime ttime; 89 struct cfq_ttime ttime;
91}; 90};
@@ -155,7 +154,7 @@ struct cfq_queue {
155 * First index in the service_trees. 154 * First index in the service_trees.
156 * IDLE is handled separately, so it has negative index 155 * IDLE is handled separately, so it has negative index
157 */ 156 */
158enum wl_prio_t { 157enum wl_class_t {
159 BE_WORKLOAD = 0, 158 BE_WORKLOAD = 0,
160 RT_WORKLOAD = 1, 159 RT_WORKLOAD = 1,
161 IDLE_WORKLOAD = 2, 160 IDLE_WORKLOAD = 2,
@@ -223,10 +222,45 @@ struct cfq_group {
223 222
224 /* group service_tree key */ 223 /* group service_tree key */
225 u64 vdisktime; 224 u64 vdisktime;
225
226 /*
227 * The number of active cfqgs and sum of their weights under this
228 * cfqg. This covers this cfqg's leaf_weight and all children's
229 * weights, but does not cover weights of further descendants.
230 *
231 * If a cfqg is on the service tree, it's active. An active cfqg
232 * also activates its parent and contributes to the children_weight
233 * of the parent.
234 */
235 int nr_active;
236 unsigned int children_weight;
237
238 /*
239 * vfraction is the fraction of vdisktime that the tasks in this
240 * cfqg are entitled to. This is determined by compounding the
241 * ratios walking up from this cfqg to the root.
242 *
243 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
244 * vfractions on a service tree is approximately 1. The sum may
245 * deviate a bit due to rounding errors and fluctuations caused by
246 * cfqgs entering and leaving the service tree.
247 */
248 unsigned int vfraction;
249
250 /*
251 * There are two weights - (internal) weight is the weight of this
252 * cfqg against the sibling cfqgs. leaf_weight is the wight of
253 * this cfqg against the child cfqgs. For the root cfqg, both
254 * weights are kept in sync for backward compatibility.
255 */
226 unsigned int weight; 256 unsigned int weight;
227 unsigned int new_weight; 257 unsigned int new_weight;
228 unsigned int dev_weight; 258 unsigned int dev_weight;
229 259
260 unsigned int leaf_weight;
261 unsigned int new_leaf_weight;
262 unsigned int dev_leaf_weight;
263
230 /* number of cfqq currently on this group */ 264 /* number of cfqq currently on this group */
231 int nr_cfqq; 265 int nr_cfqq;
232 266
@@ -248,14 +282,15 @@ struct cfq_group {
248 struct cfq_rb_root service_trees[2][3]; 282 struct cfq_rb_root service_trees[2][3];
249 struct cfq_rb_root service_tree_idle; 283 struct cfq_rb_root service_tree_idle;
250 284
251 unsigned long saved_workload_slice; 285 unsigned long saved_wl_slice;
252 enum wl_type_t saved_workload; 286 enum wl_type_t saved_wl_type;
253 enum wl_prio_t saved_serving_prio; 287 enum wl_class_t saved_wl_class;
254 288
255 /* number of requests that are on the dispatch list or inside driver */ 289 /* number of requests that are on the dispatch list or inside driver */
256 int dispatched; 290 int dispatched;
257 struct cfq_ttime ttime; 291 struct cfq_ttime ttime;
258 struct cfqg_stats stats; 292 struct cfqg_stats stats; /* stats for this cfqg */
293 struct cfqg_stats dead_stats; /* stats pushed from dead children */
259}; 294};
260 295
261struct cfq_io_cq { 296struct cfq_io_cq {
@@ -280,8 +315,8 @@ struct cfq_data {
280 /* 315 /*
281 * The priority currently being served 316 * The priority currently being served
282 */ 317 */
283 enum wl_prio_t serving_prio; 318 enum wl_class_t serving_wl_class;
284 enum wl_type_t serving_type; 319 enum wl_type_t serving_wl_type;
285 unsigned long workload_expires; 320 unsigned long workload_expires;
286 struct cfq_group *serving_group; 321 struct cfq_group *serving_group;
287 322
@@ -353,17 +388,17 @@ struct cfq_data {
353 388
354static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 389static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
355 390
356static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 391static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
357 enum wl_prio_t prio, 392 enum wl_class_t class,
358 enum wl_type_t type) 393 enum wl_type_t type)
359{ 394{
360 if (!cfqg) 395 if (!cfqg)
361 return NULL; 396 return NULL;
362 397
363 if (prio == IDLE_WORKLOAD) 398 if (class == IDLE_WORKLOAD)
364 return &cfqg->service_tree_idle; 399 return &cfqg->service_tree_idle;
365 400
366 return &cfqg->service_trees[prio][type]; 401 return &cfqg->service_trees[class][type];
367} 402}
368 403
369enum cfqq_state_flags { 404enum cfqq_state_flags {
@@ -502,7 +537,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
502{ 537{
503 struct cfqg_stats *stats = &cfqg->stats; 538 struct cfqg_stats *stats = &cfqg->stats;
504 539
505 if (blkg_rwstat_sum(&stats->queued)) 540 if (blkg_rwstat_total(&stats->queued))
506 return; 541 return;
507 542
508 /* 543 /*
@@ -546,7 +581,7 @@ static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
546 struct cfqg_stats *stats = &cfqg->stats; 581 struct cfqg_stats *stats = &cfqg->stats;
547 582
548 blkg_stat_add(&stats->avg_queue_size_sum, 583 blkg_stat_add(&stats->avg_queue_size_sum,
549 blkg_rwstat_sum(&stats->queued)); 584 blkg_rwstat_total(&stats->queued));
550 blkg_stat_add(&stats->avg_queue_size_samples, 1); 585 blkg_stat_add(&stats->avg_queue_size_samples, 1);
551 cfqg_stats_update_group_wait_time(stats); 586 cfqg_stats_update_group_wait_time(stats);
552} 587}
@@ -572,6 +607,13 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
572 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); 607 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
573} 608}
574 609
610static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
611{
612 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
613
614 return pblkg ? blkg_to_cfqg(pblkg) : NULL;
615}
616
575static inline void cfqg_get(struct cfq_group *cfqg) 617static inline void cfqg_get(struct cfq_group *cfqg)
576{ 618{
577 return blkg_get(cfqg_to_blkg(cfqg)); 619 return blkg_get(cfqg_to_blkg(cfqg));
@@ -586,8 +628,9 @@ static inline void cfqg_put(struct cfq_group *cfqg)
586 char __pbuf[128]; \ 628 char __pbuf[128]; \
587 \ 629 \
588 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ 630 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
589 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 631 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \
590 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 632 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
633 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
591 __pbuf, ##args); \ 634 __pbuf, ##args); \
592} while (0) 635} while (0)
593 636
@@ -646,11 +689,9 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
646 io_start_time - start_time); 689 io_start_time - start_time);
647} 690}
648 691
649static void cfq_pd_reset_stats(struct blkcg_gq *blkg) 692/* @stats = 0 */
693static void cfqg_stats_reset(struct cfqg_stats *stats)
650{ 694{
651 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
652 struct cfqg_stats *stats = &cfqg->stats;
653
654 /* queued stats shouldn't be cleared */ 695 /* queued stats shouldn't be cleared */
655 blkg_rwstat_reset(&stats->service_bytes); 696 blkg_rwstat_reset(&stats->service_bytes);
656 blkg_rwstat_reset(&stats->serviced); 697 blkg_rwstat_reset(&stats->serviced);
@@ -669,13 +710,58 @@ static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
669#endif 710#endif
670} 711}
671 712
713/* @to += @from */
714static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
715{
716 /* queued stats shouldn't be cleared */
717 blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
718 blkg_rwstat_merge(&to->serviced, &from->serviced);
719 blkg_rwstat_merge(&to->merged, &from->merged);
720 blkg_rwstat_merge(&to->service_time, &from->service_time);
721 blkg_rwstat_merge(&to->wait_time, &from->wait_time);
722 blkg_stat_merge(&from->time, &from->time);
723#ifdef CONFIG_DEBUG_BLK_CGROUP
724 blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
725 blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
726 blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
727 blkg_stat_merge(&to->dequeue, &from->dequeue);
728 blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
729 blkg_stat_merge(&to->idle_time, &from->idle_time);
730 blkg_stat_merge(&to->empty_time, &from->empty_time);
731#endif
732}
733
734/*
735 * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
736 * recursive stats can still account for the amount used by this cfqg after
737 * it's gone.
738 */
739static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
740{
741 struct cfq_group *parent = cfqg_parent(cfqg);
742
743 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
744
745 if (unlikely(!parent))
746 return;
747
748 cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
749 cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
750 cfqg_stats_reset(&cfqg->stats);
751 cfqg_stats_reset(&cfqg->dead_stats);
752}
753
672#else /* CONFIG_CFQ_GROUP_IOSCHED */ 754#else /* CONFIG_CFQ_GROUP_IOSCHED */
673 755
756static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
674static inline void cfqg_get(struct cfq_group *cfqg) { } 757static inline void cfqg_get(struct cfq_group *cfqg) { }
675static inline void cfqg_put(struct cfq_group *cfqg) { } 758static inline void cfqg_put(struct cfq_group *cfqg) { }
676 759
677#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 760#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
678 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 761 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
762 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
763 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
764 ##args)
679#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 765#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
680 766
681static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, 767static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
@@ -732,7 +818,7 @@ static inline bool iops_mode(struct cfq_data *cfqd)
732 return false; 818 return false;
733} 819}
734 820
735static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 821static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
736{ 822{
737 if (cfq_class_idle(cfqq)) 823 if (cfq_class_idle(cfqq))
738 return IDLE_WORKLOAD; 824 return IDLE_WORKLOAD;
@@ -751,23 +837,23 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
751 return SYNC_WORKLOAD; 837 return SYNC_WORKLOAD;
752} 838}
753 839
754static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 840static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
755 struct cfq_data *cfqd, 841 struct cfq_data *cfqd,
756 struct cfq_group *cfqg) 842 struct cfq_group *cfqg)
757{ 843{
758 if (wl == IDLE_WORKLOAD) 844 if (wl_class == IDLE_WORKLOAD)
759 return cfqg->service_tree_idle.count; 845 return cfqg->service_tree_idle.count;
760 846
761 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 847 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
762 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 848 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
763 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 849 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
764} 850}
765 851
766static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 852static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
767 struct cfq_group *cfqg) 853 struct cfq_group *cfqg)
768{ 854{
769 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 855 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
770 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 856 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
771} 857}
772 858
773static void cfq_dispatch_insert(struct request_queue *, struct request *); 859static void cfq_dispatch_insert(struct request_queue *, struct request *);
@@ -847,13 +933,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 
-static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+/**
+ * cfqg_scale_charge - scale disk time charge according to cfqg weight
+ * @charge: disk time being charged
+ * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
+ *
+ * Scale @charge according to @vfraction, which is in range (0, 1].  The
+ * scaling is inversely proportional.
+ *
+ *	scaled = charge / vfraction
+ *
+ * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
+ */
+static inline u64 cfqg_scale_charge(unsigned long charge,
+				    unsigned int vfraction)
 {
-	u64 d = delta << CFQ_SERVICE_SHIFT;
+	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */
 
-	d = d * CFQ_WEIGHT_DEFAULT;
-	do_div(d, cfqg->weight);
-	return d;
+	/* charge / vfraction */
+	c <<= CFQ_SERVICE_SHIFT;
+	do_div(c, vfraction);
+	return c;
 }
 
 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
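The fixed-point arithmetic in cfqg_scale_charge() is easier to follow with concrete numbers. Below is a minimal user-space sketch of the same calculation, assuming CFQ_SERVICE_SHIFT is 12 (the exact value only affects precision, not the idea) and with do_div() replaced by plain division:

#include <stdio.h>
#include <stdint.h>

#define CFQ_SERVICE_SHIFT 12	/* assumed value; only precision depends on it */

/* user-space rendition of cfqg_scale_charge(); do_div() becomes plain '/' */
static uint64_t scale_charge(unsigned long charge, unsigned int vfraction)
{
	uint64_t c = (uint64_t)charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */

	c <<= CFQ_SERVICE_SHIFT;	/* pre-scale so dividing by vfraction keeps precision */
	return c / vfraction;		/* result is (charge / vfraction) in fixed point */
}

int main(void)
{
	/* a group entitled to 1/4 of the device: vfraction = 0.25 in fixed point */
	unsigned int vfr = (1 << CFQ_SERVICE_SHIFT) / 4;
	uint64_t scaled = scale_charge(8, vfr);		/* charge 8 jiffies of disk time */

	/* prints 32: the charge is inflated 4x, so vdisktime advances 4x faster */
	printf("%llu\n", (unsigned long long)(scaled >> CFQ_SERVICE_SHIFT));
	return 0;
}

A group owning a quarter of the device thus sees its 8 jiffies of disk time scaled to 32, pushing its vdisktime ahead four times as fast as a group that owns the whole device.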
@@ -909,9 +1009,7 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-
-	return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
+	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
 }
 
 static inline unsigned
@@ -1178,20 +1276,61 @@ static void
1178cfq_update_group_weight(struct cfq_group *cfqg) 1276cfq_update_group_weight(struct cfq_group *cfqg)
1179{ 1277{
1180 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1278 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1279
1181 if (cfqg->new_weight) { 1280 if (cfqg->new_weight) {
1182 cfqg->weight = cfqg->new_weight; 1281 cfqg->weight = cfqg->new_weight;
1183 cfqg->new_weight = 0; 1282 cfqg->new_weight = 0;
1184 } 1283 }
1284
1285 if (cfqg->new_leaf_weight) {
1286 cfqg->leaf_weight = cfqg->new_leaf_weight;
1287 cfqg->new_leaf_weight = 0;
1288 }
1185} 1289}
1186 1290
1187static void 1291static void
1188cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 1292cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1189{ 1293{
1294 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
1295 struct cfq_group *pos = cfqg;
1296 struct cfq_group *parent;
1297 bool propagate;
1298
1299 /* add to the service tree */
1190 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1300 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1191 1301
1192 cfq_update_group_weight(cfqg); 1302 cfq_update_group_weight(cfqg);
1193 __cfq_group_service_tree_add(st, cfqg); 1303 __cfq_group_service_tree_add(st, cfqg);
1194 st->total_weight += cfqg->weight; 1304
1305 /*
1306 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1307 * entitled to. vfraction is calculated by walking the tree
1308 * towards the root calculating the fraction it has at each level.
1309 * The compounded ratio is how much vfraction @cfqg owns.
1310 *
1311 * Start with the proportion tasks in this cfqg has against active
1312 * children cfqgs - its leaf_weight against children_weight.
1313 */
1314 propagate = !pos->nr_active++;
1315 pos->children_weight += pos->leaf_weight;
1316 vfr = vfr * pos->leaf_weight / pos->children_weight;
1317
1318 /*
1319 * Compound ->weight walking up the tree. Both activation and
1320 * vfraction calculation are done in the same loop. Propagation
1321 * stops once an already activated node is met. vfraction
1322 * calculation should always continue to the root.
1323 */
1324 while ((parent = cfqg_parent(pos))) {
1325 if (propagate) {
1326 propagate = !parent->nr_active++;
1327 parent->children_weight += pos->weight;
1328 }
1329 vfr = vfr * pos->weight / parent->children_weight;
1330 pos = parent;
1331 }
1332
1333 cfqg->vfraction = max_t(unsigned, vfr, 1);
1195} 1334}
1196 1335
1197static void 1336static void
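The compounded vfraction computed in the loop above is simply a product of one ratio per level of the hierarchy. A stand-alone sketch that mirrors the calculation for a hypothetical two-level hierarchy (the toy struct below is a stand-in, not the kernel's cfq_group):

#include <stdio.h>

#define CFQ_SERVICE_SHIFT 12	/* assumed; matches the fixed point used above */

struct grp {
	struct grp *parent;
	unsigned int weight;		/* share against sibling groups */
	unsigned int leaf_weight;	/* share of this group's own tasks */
	unsigned int children_weight;	/* active children's weights + own leaf_weight */
};

/* mirrors the vfr compounding in cfq_group_service_tree_add() */
static unsigned int vfraction(struct grp *g)
{
	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;	/* start with 1 */
	struct grp *pos = g;

	vfr = vfr * pos->leaf_weight / pos->children_weight;
	while (pos->parent) {
		vfr = vfr * pos->weight / pos->parent->children_weight;
		pos = pos->parent;
	}
	return vfr;
}

int main(void)
{
	/* root: two active children of weight 500 each plus its own tasks */
	struct grp root = { NULL, 0, 500, 1500 };
	/* child A: its own tasks compete against one active sub-group of weight 300 */
	struct grp a = { &root, 500, 300, 600 };

	/* 300/600 * 500/1500 = 1/6 of the device for A's own tasks */
	printf("%.3f\n", (double)vfraction(&a) / (1 << CFQ_SERVICE_SHIFT));
	return 0;
}

With these numbers, group A's own tasks end up entitled to roughly a sixth of the device, which is exactly the fraction the service-tree code charges against when A is served.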
@@ -1222,7 +1361,32 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1222static void 1361static void
1223cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) 1362cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1224{ 1363{
1225 st->total_weight -= cfqg->weight; 1364 struct cfq_group *pos = cfqg;
1365 bool propagate;
1366
1367 /*
1368 * Undo activation from cfq_group_service_tree_add(). Deactivate
1369 * @cfqg and propagate deactivation upwards.
1370 */
1371 propagate = !--pos->nr_active;
1372 pos->children_weight -= pos->leaf_weight;
1373
1374 while (propagate) {
1375 struct cfq_group *parent = cfqg_parent(pos);
1376
1377 /* @pos has 0 nr_active at this point */
1378 WARN_ON_ONCE(pos->children_weight);
1379 pos->vfraction = 0;
1380
1381 if (!parent)
1382 break;
1383
1384 propagate = !--parent->nr_active;
1385 parent->children_weight -= pos->weight;
1386 pos = parent;
1387 }
1388
1389 /* remove from the service tree */
1226 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 1390 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1227 cfq_rb_erase(&cfqg->rb_node, st); 1391 cfq_rb_erase(&cfqg->rb_node, st);
1228} 1392}
@@ -1241,7 +1405,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1241 1405
1242 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 1406 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1243 cfq_group_service_tree_del(st, cfqg); 1407 cfq_group_service_tree_del(st, cfqg);
1244 cfqg->saved_workload_slice = 0; 1408 cfqg->saved_wl_slice = 0;
1245 cfqg_stats_update_dequeue(cfqg); 1409 cfqg_stats_update_dequeue(cfqg);
1246} 1410}
1247 1411
@@ -1284,6 +1448,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1284 unsigned int used_sl, charge, unaccounted_sl = 0; 1448 unsigned int used_sl, charge, unaccounted_sl = 0;
1285 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 1449 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1286 - cfqg->service_tree_idle.count; 1450 - cfqg->service_tree_idle.count;
1451 unsigned int vfr;
1287 1452
1288 BUG_ON(nr_sync < 0); 1453 BUG_ON(nr_sync < 0);
1289 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); 1454 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
@@ -1293,20 +1458,25 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1293 else if (!cfq_cfqq_sync(cfqq) && !nr_sync) 1458 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1294 charge = cfqq->allocated_slice; 1459 charge = cfqq->allocated_slice;
1295 1460
1296 /* Can't update vdisktime while group is on service tree */ 1461 /*
1462 * Can't update vdisktime while on service tree and cfqg->vfraction
1463 * is valid only while on it. Cache vfr, leave the service tree,
1464 * update vdisktime and go back on. The re-addition to the tree
1465 * will also update the weights as necessary.
1466 */
1467 vfr = cfqg->vfraction;
1297 cfq_group_service_tree_del(st, cfqg); 1468 cfq_group_service_tree_del(st, cfqg);
1298 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 1469 cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1299 /* If a new weight was requested, update now, off tree */
1300 cfq_group_service_tree_add(st, cfqg); 1470 cfq_group_service_tree_add(st, cfqg);
1301 1471
1302 /* This group is being expired. Save the context */ 1472 /* This group is being expired. Save the context */
1303 if (time_after(cfqd->workload_expires, jiffies)) { 1473 if (time_after(cfqd->workload_expires, jiffies)) {
1304 cfqg->saved_workload_slice = cfqd->workload_expires 1474 cfqg->saved_wl_slice = cfqd->workload_expires
1305 - jiffies; 1475 - jiffies;
1306 cfqg->saved_workload = cfqd->serving_type; 1476 cfqg->saved_wl_type = cfqd->serving_wl_type;
1307 cfqg->saved_serving_prio = cfqd->serving_prio; 1477 cfqg->saved_wl_class = cfqd->serving_wl_class;
1308 } else 1478 } else
1309 cfqg->saved_workload_slice = 0; 1479 cfqg->saved_wl_slice = 0;
1310 1480
1311 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 1481 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1312 st->min_vdisktime); 1482 st->min_vdisktime);
@@ -1344,6 +1514,52 @@ static void cfq_pd_init(struct blkcg_gq *blkg)
1344 1514
1345 cfq_init_cfqg_base(cfqg); 1515 cfq_init_cfqg_base(cfqg);
1346 cfqg->weight = blkg->blkcg->cfq_weight; 1516 cfqg->weight = blkg->blkcg->cfq_weight;
1517 cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
1518}
1519
1520static void cfq_pd_offline(struct blkcg_gq *blkg)
1521{
1522 /*
1523 * @blkg is going offline and will be ignored by
1524 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
1525 * that they don't get lost. If IOs complete after this point, the
1526 * stats for them will be lost. Oh well...
1527 */
1528 cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
1529}
1530
1531/* offset delta from cfqg->stats to cfqg->dead_stats */
1532static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
1533 offsetof(struct cfq_group, stats);
1534
1535/* to be used by recursive prfill, sums live and dead stats recursively */
1536static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
1537{
1538 u64 sum = 0;
1539
1540 sum += blkg_stat_recursive_sum(pd, off);
1541 sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
1542 return sum;
1543}
1544
1545/* to be used by recursive prfill, sums live and dead rwstats recursively */
1546static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
1547 int off)
1548{
1549 struct blkg_rwstat a, b;
1550
1551 a = blkg_rwstat_recursive_sum(pd, off);
1552 b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
1553 blkg_rwstat_merge(&a, &b);
1554 return a;
1555}
1556
1557static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
1558{
1559 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1560
1561 cfqg_stats_reset(&cfqg->stats);
1562 cfqg_stats_reset(&cfqg->dead_stats);
1347} 1563}
1348 1564
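The dead_stats_off_delta trick above works because stats and dead_stats are two instances of the same structure inside the same containing object, so a single constant shifts any per-field offset from the live copy to its dead twin. A stand-alone illustration of the idea (the struct layout here is a stand-in, not the real cfqg_stats):

#include <stdio.h>
#include <stddef.h>

struct stats {
	unsigned long time;
	unsigned long sectors;
};

struct group {
	struct stats stats;		/* live counters */
	struct stats dead_stats;	/* counters inherited from dead children */
};

static const int dead_off_delta = offsetof(struct group, dead_stats) -
				  offsetof(struct group, stats);

/* read an unsigned long counter given its offset from the start of the group */
static unsigned long read_counter(struct group *g, int off)
{
	return *(unsigned long *)((char *)g + off);
}

int main(void)
{
	struct group g = { { 10, 100 }, { 3, 30 } };
	int off = offsetof(struct group, stats) + offsetof(struct stats, sectors);

	/* same 'off' reads the live field; off + delta reads the dead twin: 100 + 30 */
	printf("%lu\n", read_counter(&g, off) + read_counter(&g, off + dead_off_delta));
	return 0;
}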
1349/* 1565/*
@@ -1400,6 +1616,26 @@ static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1400 return 0; 1616 return 0;
1401} 1617}
1402 1618
1619static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1620 struct blkg_policy_data *pd, int off)
1621{
1622 struct cfq_group *cfqg = pd_to_cfqg(pd);
1623
1624 if (!cfqg->dev_leaf_weight)
1625 return 0;
1626 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1627}
1628
1629static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
1630 struct cftype *cft,
1631 struct seq_file *sf)
1632{
1633 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1634 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
1635 false);
1636 return 0;
1637}
1638
1403static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, 1639static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1404 struct seq_file *sf) 1640 struct seq_file *sf)
1405{ 1641{
@@ -1407,8 +1643,16 @@ static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1407 return 0; 1643 return 0;
1408} 1644}
1409 1645
1410static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, 1646static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
1411 const char *buf) 1647 struct seq_file *sf)
1648{
1649 seq_printf(sf, "%u\n",
1650 cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
1651 return 0;
1652}
1653
1654static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1655 const char *buf, bool is_leaf_weight)
1412{ 1656{
1413 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1657 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1414 struct blkg_conf_ctx ctx; 1658 struct blkg_conf_ctx ctx;
@@ -1422,8 +1666,13 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1422 ret = -EINVAL; 1666 ret = -EINVAL;
1423 cfqg = blkg_to_cfqg(ctx.blkg); 1667 cfqg = blkg_to_cfqg(ctx.blkg);
1424 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { 1668 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1425 cfqg->dev_weight = ctx.v; 1669 if (!is_leaf_weight) {
1426 cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; 1670 cfqg->dev_weight = ctx.v;
1671 cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
1672 } else {
1673 cfqg->dev_leaf_weight = ctx.v;
1674 cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
1675 }
1427 ret = 0; 1676 ret = 0;
1428 } 1677 }
1429 1678
@@ -1431,7 +1680,20 @@ static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1431 return ret; 1680 return ret;
1432} 1681}
1433 1682
1434static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) 1683static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1684 const char *buf)
1685{
1686 return __cfqg_set_weight_device(cgrp, cft, buf, false);
1687}
1688
1689static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
1690 const char *buf)
1691{
1692 return __cfqg_set_weight_device(cgrp, cft, buf, true);
1693}
1694
1695static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
1696 bool is_leaf_weight)
1435{ 1697{
1436 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1698 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1437 struct blkcg_gq *blkg; 1699 struct blkcg_gq *blkg;
@@ -1440,19 +1702,41 @@ static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1440 return -EINVAL; 1702 return -EINVAL;
1441 1703
1442 spin_lock_irq(&blkcg->lock); 1704 spin_lock_irq(&blkcg->lock);
1443 blkcg->cfq_weight = (unsigned int)val; 1705
1706 if (!is_leaf_weight)
1707 blkcg->cfq_weight = val;
1708 else
1709 blkcg->cfq_leaf_weight = val;
1444 1710
1445 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 1711 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1446 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1712 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1447 1713
1448 if (cfqg && !cfqg->dev_weight) 1714 if (!cfqg)
1449 cfqg->new_weight = blkcg->cfq_weight; 1715 continue;
1716
1717 if (!is_leaf_weight) {
1718 if (!cfqg->dev_weight)
1719 cfqg->new_weight = blkcg->cfq_weight;
1720 } else {
1721 if (!cfqg->dev_leaf_weight)
1722 cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
1723 }
1450 } 1724 }
1451 1725
1452 spin_unlock_irq(&blkcg->lock); 1726 spin_unlock_irq(&blkcg->lock);
1453 return 0; 1727 return 0;
1454} 1728}
1455 1729
1730static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1731{
1732 return __cfq_set_weight(cgrp, cft, val, false);
1733}
1734
1735static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1736{
1737 return __cfq_set_weight(cgrp, cft, val, true);
1738}
1739
1456static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, 1740static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1457 struct seq_file *sf) 1741 struct seq_file *sf)
1458{ 1742{
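The weight handlers above (__cfq_set_weight() and the cfqg_set_*weight_device() variants) sit behind blkio cgroup control files. A hedged user-space sketch of how they are exercised; the mount point and the blkio.weight / blkio.leaf_weight file names are assumptions derived from the cftype names, not verified paths:

#include <stdio.h>

/* cgroup path is hypothetical; adjust to where the blkio hierarchy is mounted */
#define GRP "/sys/fs/cgroup/blkio/grp_a/"

static int write_knob(const char *file, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), GRP "%s", file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%s\n", val);	/* a u64 write lands in __cfq_set_weight() */
	return fclose(f);
}

int main(void)
{
	/* share of grp_a against its sibling groups */
	write_knob("blkio.weight", "500");
	/* share of grp_a's own tasks against grp_a's child groups */
	write_knob("blkio.leaf_weight", "250");
	return 0;
}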
@@ -1473,6 +1757,42 @@ static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1473 return 0; 1757 return 0;
1474} 1758}
1475 1759
1760static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1761 struct blkg_policy_data *pd, int off)
1762{
1763 u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
1764
1765 return __blkg_prfill_u64(sf, pd, sum);
1766}
1767
1768static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1769 struct blkg_policy_data *pd, int off)
1770{
1771 struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
1772
1773 return __blkg_prfill_rwstat(sf, pd, &sum);
1774}
1775
1776static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
1777 struct seq_file *sf)
1778{
1779 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1780
1781 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
1782 &blkcg_policy_cfq, cft->private, false);
1783 return 0;
1784}
1785
1786static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
1787 struct seq_file *sf)
1788{
1789 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1790
1791 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
1792 &blkcg_policy_cfq, cft->private, true);
1793 return 0;
1794}
1795
1476#ifdef CONFIG_DEBUG_BLK_CGROUP 1796#ifdef CONFIG_DEBUG_BLK_CGROUP
1477static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, 1797static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1478 struct blkg_policy_data *pd, int off) 1798 struct blkg_policy_data *pd, int off)
@@ -1502,17 +1822,49 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1502#endif /* CONFIG_DEBUG_BLK_CGROUP */ 1822#endif /* CONFIG_DEBUG_BLK_CGROUP */
1503 1823
1504static struct cftype cfq_blkcg_files[] = { 1824static struct cftype cfq_blkcg_files[] = {
1825 /* on root, weight is mapped to leaf_weight */
1826 {
1827 .name = "weight_device",
1828 .flags = CFTYPE_ONLY_ON_ROOT,
1829 .read_seq_string = cfqg_print_leaf_weight_device,
1830 .write_string = cfqg_set_leaf_weight_device,
1831 .max_write_len = 256,
1832 },
1833 {
1834 .name = "weight",
1835 .flags = CFTYPE_ONLY_ON_ROOT,
1836 .read_seq_string = cfq_print_leaf_weight,
1837 .write_u64 = cfq_set_leaf_weight,
1838 },
1839
1840 /* no such mapping necessary for !roots */
1505 { 1841 {
1506 .name = "weight_device", 1842 .name = "weight_device",
1843 .flags = CFTYPE_NOT_ON_ROOT,
1507 .read_seq_string = cfqg_print_weight_device, 1844 .read_seq_string = cfqg_print_weight_device,
1508 .write_string = cfqg_set_weight_device, 1845 .write_string = cfqg_set_weight_device,
1509 .max_write_len = 256, 1846 .max_write_len = 256,
1510 }, 1847 },
1511 { 1848 {
1512 .name = "weight", 1849 .name = "weight",
1850 .flags = CFTYPE_NOT_ON_ROOT,
1513 .read_seq_string = cfq_print_weight, 1851 .read_seq_string = cfq_print_weight,
1514 .write_u64 = cfq_set_weight, 1852 .write_u64 = cfq_set_weight,
1515 }, 1853 },
1854
1855 {
1856 .name = "leaf_weight_device",
1857 .read_seq_string = cfqg_print_leaf_weight_device,
1858 .write_string = cfqg_set_leaf_weight_device,
1859 .max_write_len = 256,
1860 },
1861 {
1862 .name = "leaf_weight",
1863 .read_seq_string = cfq_print_leaf_weight,
1864 .write_u64 = cfq_set_leaf_weight,
1865 },
1866
1867 /* statistics, covers only the tasks in the cfqg */
1516 { 1868 {
1517 .name = "time", 1869 .name = "time",
1518 .private = offsetof(struct cfq_group, stats.time), 1870 .private = offsetof(struct cfq_group, stats.time),
@@ -1553,6 +1905,48 @@ static struct cftype cfq_blkcg_files[] = {
1553 .private = offsetof(struct cfq_group, stats.queued), 1905 .private = offsetof(struct cfq_group, stats.queued),
1554 .read_seq_string = cfqg_print_rwstat, 1906 .read_seq_string = cfqg_print_rwstat,
1555 }, 1907 },
1908
1909 /* the same statistics which cover the cfqg and its descendants */
1910 {
1911 .name = "time_recursive",
1912 .private = offsetof(struct cfq_group, stats.time),
1913 .read_seq_string = cfqg_print_stat_recursive,
1914 },
1915 {
1916 .name = "sectors_recursive",
1917 .private = offsetof(struct cfq_group, stats.sectors),
1918 .read_seq_string = cfqg_print_stat_recursive,
1919 },
1920 {
1921 .name = "io_service_bytes_recursive",
1922 .private = offsetof(struct cfq_group, stats.service_bytes),
1923 .read_seq_string = cfqg_print_rwstat_recursive,
1924 },
1925 {
1926 .name = "io_serviced_recursive",
1927 .private = offsetof(struct cfq_group, stats.serviced),
1928 .read_seq_string = cfqg_print_rwstat_recursive,
1929 },
1930 {
1931 .name = "io_service_time_recursive",
1932 .private = offsetof(struct cfq_group, stats.service_time),
1933 .read_seq_string = cfqg_print_rwstat_recursive,
1934 },
1935 {
1936 .name = "io_wait_time_recursive",
1937 .private = offsetof(struct cfq_group, stats.wait_time),
1938 .read_seq_string = cfqg_print_rwstat_recursive,
1939 },
1940 {
1941 .name = "io_merged_recursive",
1942 .private = offsetof(struct cfq_group, stats.merged),
1943 .read_seq_string = cfqg_print_rwstat_recursive,
1944 },
1945 {
1946 .name = "io_queued_recursive",
1947 .private = offsetof(struct cfq_group, stats.queued),
1948 .read_seq_string = cfqg_print_rwstat_recursive,
1949 },
1556#ifdef CONFIG_DEBUG_BLK_CGROUP 1950#ifdef CONFIG_DEBUG_BLK_CGROUP
1557 { 1951 {
1558 .name = "avg_queue_size", 1952 .name = "avg_queue_size",
@@ -1611,15 +2005,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1611 struct rb_node **p, *parent; 2005 struct rb_node **p, *parent;
1612 struct cfq_queue *__cfqq; 2006 struct cfq_queue *__cfqq;
1613 unsigned long rb_key; 2007 unsigned long rb_key;
1614 struct cfq_rb_root *service_tree; 2008 struct cfq_rb_root *st;
1615 int left; 2009 int left;
1616 int new_cfqq = 1; 2010 int new_cfqq = 1;
1617 2011
1618 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 2012 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
1619 cfqq_type(cfqq));
1620 if (cfq_class_idle(cfqq)) { 2013 if (cfq_class_idle(cfqq)) {
1621 rb_key = CFQ_IDLE_DELAY; 2014 rb_key = CFQ_IDLE_DELAY;
1622 parent = rb_last(&service_tree->rb); 2015 parent = rb_last(&st->rb);
1623 if (parent && parent != &cfqq->rb_node) { 2016 if (parent && parent != &cfqq->rb_node) {
1624 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2017 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1625 rb_key += __cfqq->rb_key; 2018 rb_key += __cfqq->rb_key;
@@ -1637,7 +2030,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1637 cfqq->slice_resid = 0; 2030 cfqq->slice_resid = 0;
1638 } else { 2031 } else {
1639 rb_key = -HZ; 2032 rb_key = -HZ;
1640 __cfqq = cfq_rb_first(service_tree); 2033 __cfqq = cfq_rb_first(st);
1641 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 2034 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1642 } 2035 }
1643 2036
@@ -1646,8 +2039,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1646 /* 2039 /*
1647 * same position, nothing more to do 2040 * same position, nothing more to do
1648 */ 2041 */
1649 if (rb_key == cfqq->rb_key && 2042 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
1650 cfqq->service_tree == service_tree)
1651 return; 2043 return;
1652 2044
1653 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 2045 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
@@ -1656,11 +2048,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1656 2048
1657 left = 1; 2049 left = 1;
1658 parent = NULL; 2050 parent = NULL;
1659 cfqq->service_tree = service_tree; 2051 cfqq->service_tree = st;
1660 p = &service_tree->rb.rb_node; 2052 p = &st->rb.rb_node;
1661 while (*p) { 2053 while (*p) {
1662 struct rb_node **n;
1663
1664 parent = *p; 2054 parent = *p;
1665 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 2055 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1666 2056
@@ -1668,22 +2058,20 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1668 * sort by key, that represents service time. 2058 * sort by key, that represents service time.
1669 */ 2059 */
1670 if (time_before(rb_key, __cfqq->rb_key)) 2060 if (time_before(rb_key, __cfqq->rb_key))
1671 n = &(*p)->rb_left; 2061 p = &parent->rb_left;
1672 else { 2062 else {
1673 n = &(*p)->rb_right; 2063 p = &parent->rb_right;
1674 left = 0; 2064 left = 0;
1675 } 2065 }
1676
1677 p = n;
1678 } 2066 }
1679 2067
1680 if (left) 2068 if (left)
1681 service_tree->left = &cfqq->rb_node; 2069 st->left = &cfqq->rb_node;
1682 2070
1683 cfqq->rb_key = rb_key; 2071 cfqq->rb_key = rb_key;
1684 rb_link_node(&cfqq->rb_node, parent, p); 2072 rb_link_node(&cfqq->rb_node, parent, p);
1685 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 2073 rb_insert_color(&cfqq->rb_node, &st->rb);
1686 service_tree->count++; 2074 st->count++;
1687 if (add_front || !new_cfqq) 2075 if (add_front || !new_cfqq)
1688 return; 2076 return;
1689 cfq_group_notify_queue_add(cfqd, cfqq->cfqg); 2077 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
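The descent loop above now uses the stock rbtree insertion idiom directly instead of going through a temporary pointer. For reference, here is the same pattern in isolation, as a sketch for a generic key-ordered tree rather than the cfq service tree:

#include <linux/rbtree.h>

struct knode {
	unsigned long key;
	struct rb_node rb;
};

static void knode_insert(struct rb_root *root, struct knode *new)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		struct knode *cur = rb_entry(*p, struct knode, rb);

		parent = *p;
		if (new->key < cur->key)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}

	rb_link_node(&new->rb, parent, p);	/* link at the leaf slot we found */
	rb_insert_color(&new->rb, root);	/* rebalance */
}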
@@ -2029,8 +2417,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
2029 struct cfq_queue *cfqq) 2417 struct cfq_queue *cfqq)
2030{ 2418{
2031 if (cfqq) { 2419 if (cfqq) {
2032 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 2420 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2033 cfqd->serving_prio, cfqd->serving_type); 2421 cfqd->serving_wl_class, cfqd->serving_wl_type);
2034 cfqg_stats_update_avg_queue_size(cfqq->cfqg); 2422 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2035 cfqq->slice_start = 0; 2423 cfqq->slice_start = 0;
2036 cfqq->dispatch_start = jiffies; 2424 cfqq->dispatch_start = jiffies;
@@ -2116,19 +2504,18 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2116 */ 2504 */
2117static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 2505static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2118{ 2506{
2119 struct cfq_rb_root *service_tree = 2507 struct cfq_rb_root *st = st_for(cfqd->serving_group,
2120 service_tree_for(cfqd->serving_group, cfqd->serving_prio, 2508 cfqd->serving_wl_class, cfqd->serving_wl_type);
2121 cfqd->serving_type);
2122 2509
2123 if (!cfqd->rq_queued) 2510 if (!cfqd->rq_queued)
2124 return NULL; 2511 return NULL;
2125 2512
2126 /* There is nothing to dispatch */ 2513 /* There is nothing to dispatch */
2127 if (!service_tree) 2514 if (!st)
2128 return NULL; 2515 return NULL;
2129 if (RB_EMPTY_ROOT(&service_tree->rb)) 2516 if (RB_EMPTY_ROOT(&st->rb))
2130 return NULL; 2517 return NULL;
2131 return cfq_rb_first(service_tree); 2518 return cfq_rb_first(st);
2132} 2519}
2133 2520
2134static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 2521static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
@@ -2284,17 +2671,17 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2284 2671
2285static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2672static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2286{ 2673{
2287 enum wl_prio_t prio = cfqq_prio(cfqq); 2674 enum wl_class_t wl_class = cfqq_class(cfqq);
2288 struct cfq_rb_root *service_tree = cfqq->service_tree; 2675 struct cfq_rb_root *st = cfqq->service_tree;
2289 2676
2290 BUG_ON(!service_tree); 2677 BUG_ON(!st);
2291 BUG_ON(!service_tree->count); 2678 BUG_ON(!st->count);
2292 2679
2293 if (!cfqd->cfq_slice_idle) 2680 if (!cfqd->cfq_slice_idle)
2294 return false; 2681 return false;
2295 2682
2296 /* We never do for idle class queues. */ 2683 /* We never do for idle class queues. */
2297 if (prio == IDLE_WORKLOAD) 2684 if (wl_class == IDLE_WORKLOAD)
2298 return false; 2685 return false;
2299 2686
2300 /* We do for queues that were marked with idle window flag. */ 2687 /* We do for queues that were marked with idle window flag. */
@@ -2306,11 +2693,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2306 * Otherwise, we do only if they are the last ones 2693 * Otherwise, we do only if they are the last ones
2307 * in their service tree. 2694 * in their service tree.
2308 */ 2695 */
2309 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) && 2696 if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2310 !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false)) 2697 !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2311 return true; 2698 return true;
2312 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 2699 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2313 service_tree->count);
2314 return false; 2700 return false;
2315} 2701}
2316 2702
@@ -2493,8 +2879,8 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2493 } 2879 }
2494} 2880}
2495 2881
2496static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 2882static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
2497 struct cfq_group *cfqg, enum wl_prio_t prio) 2883 struct cfq_group *cfqg, enum wl_class_t wl_class)
2498{ 2884{
2499 struct cfq_queue *queue; 2885 struct cfq_queue *queue;
2500 int i; 2886 int i;
@@ -2504,7 +2890,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2504 2890
2505 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 2891 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
2506 /* select the one with lowest rb_key */ 2892 /* select the one with lowest rb_key */
2507 queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); 2893 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
2508 if (queue && 2894 if (queue &&
2509 (!key_valid || time_before(queue->rb_key, lowest_key))) { 2895 (!key_valid || time_before(queue->rb_key, lowest_key))) {
2510 lowest_key = queue->rb_key; 2896 lowest_key = queue->rb_key;
@@ -2516,26 +2902,27 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
2516 return cur_best; 2902 return cur_best;
2517} 2903}
2518 2904
2519static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 2905static void
2906choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
2520{ 2907{
2521 unsigned slice; 2908 unsigned slice;
2522 unsigned count; 2909 unsigned count;
2523 struct cfq_rb_root *st; 2910 struct cfq_rb_root *st;
2524 unsigned group_slice; 2911 unsigned group_slice;
2525 enum wl_prio_t original_prio = cfqd->serving_prio; 2912 enum wl_class_t original_class = cfqd->serving_wl_class;
2526 2913
2527 /* Choose next priority. RT > BE > IDLE */ 2914 /* Choose next priority. RT > BE > IDLE */
2528 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2915 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2529 cfqd->serving_prio = RT_WORKLOAD; 2916 cfqd->serving_wl_class = RT_WORKLOAD;
2530 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 2917 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2531 cfqd->serving_prio = BE_WORKLOAD; 2918 cfqd->serving_wl_class = BE_WORKLOAD;
2532 else { 2919 else {
2533 cfqd->serving_prio = IDLE_WORKLOAD; 2920 cfqd->serving_wl_class = IDLE_WORKLOAD;
2534 cfqd->workload_expires = jiffies + 1; 2921 cfqd->workload_expires = jiffies + 1;
2535 return; 2922 return;
2536 } 2923 }
2537 2924
2538 if (original_prio != cfqd->serving_prio) 2925 if (original_class != cfqd->serving_wl_class)
2539 goto new_workload; 2926 goto new_workload;
2540 2927
2541 /* 2928 /*
@@ -2543,7 +2930,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2543 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2930 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2544 * expiration time 2931 * expiration time
2545 */ 2932 */
2546 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2933 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2547 count = st->count; 2934 count = st->count;
2548 2935
2549 /* 2936 /*
@@ -2554,9 +2941,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2554 2941
2555new_workload: 2942new_workload:
2556 /* otherwise select new workload type */ 2943 /* otherwise select new workload type */
2557 cfqd->serving_type = 2944 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
2558 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2945 cfqd->serving_wl_class);
2559 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2946 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
2560 count = st->count; 2947 count = st->count;
2561 2948
2562 /* 2949 /*
@@ -2567,10 +2954,11 @@ new_workload:
2567 group_slice = cfq_group_slice(cfqd, cfqg); 2954 group_slice = cfq_group_slice(cfqd, cfqg);
2568 2955
2569 slice = group_slice * count / 2956 slice = group_slice * count /
2570 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2957 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
2571 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2958 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
2959 cfqg));
2572 2960
2573 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2961 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
2574 unsigned int tmp; 2962 unsigned int tmp;
2575 2963
2576 /* 2964 /*
@@ -2616,14 +3004,14 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
2616 cfqd->serving_group = cfqg; 3004 cfqd->serving_group = cfqg;
2617 3005
2618 /* Restore the workload type data */ 3006 /* Restore the workload type data */
2619 if (cfqg->saved_workload_slice) { 3007 if (cfqg->saved_wl_slice) {
2620 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 3008 cfqd->workload_expires = jiffies + cfqg->saved_wl_slice;
2621 cfqd->serving_type = cfqg->saved_workload; 3009 cfqd->serving_wl_type = cfqg->saved_wl_type;
2622 cfqd->serving_prio = cfqg->saved_serving_prio; 3010 cfqd->serving_wl_class = cfqg->saved_wl_class;
2623 } else 3011 } else
2624 cfqd->workload_expires = jiffies - 1; 3012 cfqd->workload_expires = jiffies - 1;
2625 3013
2626 choose_service_tree(cfqd, cfqg); 3014 choose_wl_class_and_type(cfqd, cfqg);
2627} 3015}
2628 3016
2629/* 3017/*
@@ -3205,6 +3593,8 @@ retry:
3205 spin_lock_irq(cfqd->queue->queue_lock); 3593 spin_lock_irq(cfqd->queue->queue_lock);
3206 if (new_cfqq) 3594 if (new_cfqq)
3207 goto retry; 3595 goto retry;
3596 else
3597 return &cfqd->oom_cfqq;
3208 } else { 3598 } else {
3209 cfqq = kmem_cache_alloc_node(cfq_pool, 3599 cfqq = kmem_cache_alloc_node(cfq_pool,
3210 gfp_mask | __GFP_ZERO, 3600 gfp_mask | __GFP_ZERO,
@@ -3402,7 +3792,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3402 return true; 3792 return true;
3403 3793
3404 /* Allow preemption only if we are idling on sync-noidle tree */ 3794 /* Allow preemption only if we are idling on sync-noidle tree */
3405 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3795 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
3406 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3796 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3407 new_cfqq->service_tree->count == 2 && 3797 new_cfqq->service_tree->count == 2 &&
3408 RB_EMPTY_ROOT(&cfqq->sort_list)) 3798 RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3454,7 +3844,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3454 * doesn't happen 3844 * doesn't happen
3455 */ 3845 */
3456 if (old_type != cfqq_type(cfqq)) 3846 if (old_type != cfqq_type(cfqq))
3457 cfqq->cfqg->saved_workload_slice = 0; 3847 cfqq->cfqg->saved_wl_slice = 0;
3458 3848
3459 /* 3849 /*
3460 * Put the new queue at the front of the current list, 3850
@@ -3636,16 +4026,17 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3636 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 4026 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3637 4027
3638 if (sync) { 4028 if (sync) {
3639 struct cfq_rb_root *service_tree; 4029 struct cfq_rb_root *st;
3640 4030
3641 RQ_CIC(rq)->ttime.last_end_request = now; 4031 RQ_CIC(rq)->ttime.last_end_request = now;
3642 4032
3643 if (cfq_cfqq_on_rr(cfqq)) 4033 if (cfq_cfqq_on_rr(cfqq))
3644 service_tree = cfqq->service_tree; 4034 st = cfqq->service_tree;
3645 else 4035 else
3646 service_tree = service_tree_for(cfqq->cfqg, 4036 st = st_for(cfqq->cfqg, cfqq_class(cfqq),
3647 cfqq_prio(cfqq), cfqq_type(cfqq)); 4037 cfqq_type(cfqq));
3648 service_tree->ttime.last_end_request = now; 4038
4039 st->ttime.last_end_request = now;
3649 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 4040 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3650 cfqd->last_delayed_sync = now; 4041 cfqd->last_delayed_sync = now;
3651 } 4042 }
@@ -3992,6 +4383,7 @@ static int cfq_init_queue(struct request_queue *q)
3992 cfq_init_cfqg_base(cfqd->root_group); 4383 cfq_init_cfqg_base(cfqd->root_group);
3993#endif 4384#endif
3994 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; 4385 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
4386 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
3995 4387
3996 /* 4388 /*
3997 * Not strictly needed (since RB_ROOT just clears the node and we 4389 * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4176,6 +4568,7 @@ static struct blkcg_policy blkcg_policy_cfq = {
4176 .cftypes = cfq_blkcg_files, 4568 .cftypes = cfq_blkcg_files,
4177 4569
4178 .pd_init_fn = cfq_pd_init, 4570 .pd_init_fn = cfq_pd_init,
4571 .pd_offline_fn = cfq_pd_offline,
4179 .pd_reset_stats_fn = cfq_pd_reset_stats, 4572 .pd_reset_stats_fn = cfq_pd_reset_stats,
4180}; 4573};
4181#endif 4574#endif
diff --git a/block/elevator.c b/block/elevator.c
index d0acb31cc083..a0ffdd943c98 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -46,11 +46,6 @@ static LIST_HEAD(elv_list);
46/* 46/*
47 * Merge hash stuff. 47 * Merge hash stuff.
48 */ 48 */
49static const int elv_hash_shift = 6;
50#define ELV_HASH_BLOCK(sec) ((sec) >> 3)
51#define ELV_HASH_FN(sec) \
52 (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
53#define ELV_HASH_ENTRIES (1 << elv_hash_shift)
54#define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) 49#define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq))
55 50
56/* 51/*
@@ -158,7 +153,6 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
 						  struct elevator_type *e)
 {
 	struct elevator_queue *eq;
-	int i;
 
 	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (unlikely(!eq))
@@ -167,14 +161,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
 	eq->type = e;
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
-
-	eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
-					GFP_KERNEL, q->node);
-	if (!eq->hash)
-		goto err;
-
-	for (i = 0; i < ELV_HASH_ENTRIES; i++)
-		INIT_HLIST_HEAD(&eq->hash[i]);
+	hash_init(eq->hash);
 
 	return eq;
 err:
@@ -189,7 +176,6 @@ static void elevator_release(struct kobject *kobj)
189 176
190 e = container_of(kobj, struct elevator_queue, kobj); 177 e = container_of(kobj, struct elevator_queue, kobj);
191 elevator_put(e->type); 178 elevator_put(e->type);
192 kfree(e->hash);
193 kfree(e); 179 kfree(e);
194} 180}
195 181
@@ -261,7 +247,7 @@ EXPORT_SYMBOL(elevator_exit);
261 247
262static inline void __elv_rqhash_del(struct request *rq) 248static inline void __elv_rqhash_del(struct request *rq)
263{ 249{
264 hlist_del_init(&rq->hash); 250 hash_del(&rq->hash);
265} 251}
266 252
267static void elv_rqhash_del(struct request_queue *q, struct request *rq) 253static void elv_rqhash_del(struct request_queue *q, struct request *rq)
@@ -275,7 +261,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq)
275 struct elevator_queue *e = q->elevator; 261 struct elevator_queue *e = q->elevator;
276 262
277 BUG_ON(ELV_ON_HASH(rq)); 263 BUG_ON(ELV_ON_HASH(rq));
278 hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); 264 hash_add(e->hash, &rq->hash, rq_hash_key(rq));
279} 265}
280 266
281static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) 267static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
@@ -287,11 +273,10 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
287static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) 273static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
288{ 274{
289 struct elevator_queue *e = q->elevator; 275 struct elevator_queue *e = q->elevator;
290 struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
291 struct hlist_node *next; 276 struct hlist_node *next;
292 struct request *rq; 277 struct request *rq;
293 278
294 hlist_for_each_entry_safe(rq, next, hash_list, hash) { 279 hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
295 BUG_ON(!ELV_ON_HASH(rq)); 280 BUG_ON(!ELV_ON_HASH(rq));
296 281
297 if (unlikely(!rq_mergeable(rq))) { 282 if (unlikely(!rq_mergeable(rq))) {
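The elevator conversion above swaps the open-coded hlist array for the generic helpers in <linux/hashtable.h>, the same calls visible in the hunks (hash_init, hash_add, hash_del, hash_for_each_possible_safe). A condensed sketch of that API on its own, using a hypothetical table keyed by sector rather than the elevator code:

#include <linux/hashtable.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/errno.h>

#define TBL_BITS 6			/* 64 buckets, like ELV_HASH_BITS */

struct item {
	sector_t key;
	struct hlist_node node;
};

static DEFINE_HASHTABLE(tbl, TBL_BITS);	/* statically defined tables need no hash_init() */

static int add_item(sector_t key)
{
	struct item *it = kmalloc(sizeof(*it), GFP_KERNEL);

	if (!it)
		return -ENOMEM;
	it->key = key;
	hash_add(tbl, &it->node, key);	/* bucket is chosen from the key internally */
	return 0;
}

static struct item *find_item(sector_t key)
{
	struct item *it;

	/* walks only the one bucket the key hashes to */
	hash_for_each_possible(tbl, it, node, key)
		if (it->key == key)
			return it;
	return NULL;
}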
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 57763c54363a..758f2ac878cf 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1090,10 +1090,13 @@ static const struct block_device_operations floppy_fops = {
1090static void swim3_mb_event(struct macio_dev* mdev, int mb_state) 1090static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
1091{ 1091{
1092 struct floppy_state *fs = macio_get_drvdata(mdev); 1092 struct floppy_state *fs = macio_get_drvdata(mdev);
1093 struct swim3 __iomem *sw = fs->swim3; 1093 struct swim3 __iomem *sw;
1094 1094
1095 if (!fs) 1095 if (!fs)
1096 return; 1096 return;
1097
1098 sw = fs->swim3;
1099
1097 if (mb_state != MB_FD) 1100 if (mb_state != MB_FD)
1098 return; 1101 return;
1099 1102
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e67a4be0080d..bb2cd3ce9b0f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -626,7 +626,6 @@ static void dec_pending(struct dm_io *io, int error)
626 queue_io(md, bio); 626 queue_io(md, bio);
627 } else { 627 } else {
628 /* done with normal IO or empty flush */ 628 /* done with normal IO or empty flush */
629 trace_block_bio_complete(md->queue, bio, io_error);
630 bio_endio(bio, io_error); 629 bio_endio(bio, io_error);
631 } 630 }
632 } 631 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 697f026cb318..5af2d2709081 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -184,8 +184,6 @@ static void return_io(struct bio *return_bi)
184 return_bi = bi->bi_next; 184 return_bi = bi->bi_next;
185 bi->bi_next = NULL; 185 bi->bi_next = NULL;
186 bi->bi_size = 0; 186 bi->bi_size = 0;
187 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
188 bi, 0);
189 bio_endio(bi, 0); 187 bio_endio(bi, 0);
190 bi = return_bi; 188 bi = return_bi;
191 } 189 }
@@ -3916,8 +3914,6 @@ static void raid5_align_endio(struct bio *bi, int error)
3916 rdev_dec_pending(rdev, conf->mddev); 3914 rdev_dec_pending(rdev, conf->mddev);
3917 3915
3918 if (!error && uptodate) { 3916 if (!error && uptodate) {
3919 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
3920 raid_bi, 0);
3921 bio_endio(raid_bi, 0); 3917 bio_endio(raid_bi, 0);
3922 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3918 if (atomic_dec_and_test(&conf->active_aligned_reads))
3923 wake_up(&conf->wait_for_stripe); 3919 wake_up(&conf->wait_for_stripe);
@@ -4376,8 +4372,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4376 if ( rw == WRITE ) 4372 if ( rw == WRITE )
4377 md_write_end(mddev); 4373 md_write_end(mddev);
4378 4374
4379 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
4380 bi, 0);
4381 bio_endio(bi, 0); 4375 bio_endio(bi, 0);
4382 } 4376 }
4383} 4377}
@@ -4754,11 +4748,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4754 handled++; 4748 handled++;
4755 } 4749 }
4756 remaining = raid5_dec_bi_active_stripes(raid_bio); 4750 remaining = raid5_dec_bi_active_stripes(raid_bio);
4757 if (remaining == 0) { 4751 if (remaining == 0)
4758 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
4759 raid_bio, 0);
4760 bio_endio(raid_bio, 0); 4752 bio_endio(raid_bio, 0);
4761 }
4762 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4753 if (atomic_dec_and_test(&conf->active_aligned_reads))
4763 wake_up(&conf->wait_for_stripe); 4754 wake_up(&conf->wait_for_stripe);
4764 return handled; 4755 return handled;
diff --git a/fs/bio.c b/fs/bio.c
index b96fc6ce4855..bb5768f59b32 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1428,6 +1428,8 @@ void bio_endio(struct bio *bio, int error)
1428 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1428 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1429 error = -EIO; 1429 error = -EIO;
1430 1430
1431 trace_block_bio_complete(bio, error);
1432
1431 if (bio->bi_end_io) 1433 if (bio->bi_end_io)
1432 bio->bi_end_io(bio, error); 1434 bio->bi_end_io(bio, error);
1433} 1435}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 53f5fae5cfbe..aea605c98ba6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1033,7 +1033,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
1033{ 1033{
1034 unsigned bsize = bdev_logical_block_size(bdev); 1034 unsigned bsize = bdev_logical_block_size(bdev);
1035 1035
1036 bdev->bd_inode->i_size = size; 1036 mutex_lock(&bdev->bd_inode->i_mutex);
1037 i_size_write(bdev->bd_inode, size);
1038 mutex_unlock(&bdev->bd_inode->i_mutex);
1037 while (bsize < PAGE_CACHE_SIZE) { 1039 while (bsize < PAGE_CACHE_SIZE) {
1038 if (size & bsize) 1040 if (size & bsize)
1039 break; 1041 break;
@@ -1118,7 +1120,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		}
 	}
 
-	if (!ret && !bdev->bd_openers) {
+	if (!ret) {
 		bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
 		bdi = blk_get_backing_dev_info(bdev);
 		if (bdi == NULL)
diff --git a/fs/buffer.c b/fs/buffer.c
index 8e18281b4077..b4dcb34c9635 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/mpage.h> 42#include <linux/mpage.h>
43#include <linux/bit_spinlock.h> 43#include <linux/bit_spinlock.h>
44#include <trace/events/block.h>
44 45
45static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); 46static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 47
@@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
53} 54}
54EXPORT_SYMBOL(init_buffer); 55EXPORT_SYMBOL(init_buffer);
55 56
57inline void touch_buffer(struct buffer_head *bh)
58{
59 trace_block_touch_buffer(bh);
60 mark_page_accessed(bh->b_page);
61}
62EXPORT_SYMBOL(touch_buffer);
63
56static int sleep_on_buffer(void *word) 64static int sleep_on_buffer(void *word)
57{ 65{
58 io_schedule(); 66 io_schedule();
@@ -1113,6 +1121,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
1113{ 1121{
1114 WARN_ON_ONCE(!buffer_uptodate(bh)); 1122 WARN_ON_ONCE(!buffer_uptodate(bh));
1115 1123
1124 trace_block_dirty_buffer(bh);
1125
1116 /* 1126 /*
1117 * Very *carefully* optimize the it-is-already-dirty case. 1127 * Very *carefully* optimize the it-is-already-dirty case.
1118 * 1128 *
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 310972b72a66..359494ea1bde 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -318,8 +318,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
318 318
319static int write_inode(struct inode *inode, struct writeback_control *wbc) 319static int write_inode(struct inode *inode, struct writeback_control *wbc)
320{ 320{
321 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 321 int ret;
322 return inode->i_sb->s_op->write_inode(inode, wbc); 322
323 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
324 trace_writeback_write_inode_start(inode, wbc);
325 ret = inode->i_sb->s_op->write_inode(inode, wbc);
326 trace_writeback_write_inode(inode, wbc);
327 return ret;
328 }
323 return 0; 329 return 0;
324} 330}
325 331
@@ -450,6 +456,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
450 456
451 WARN_ON(!(inode->i_state & I_SYNC)); 457 WARN_ON(!(inode->i_state & I_SYNC));
452 458
459 trace_writeback_single_inode_start(inode, wbc, nr_to_write);
460
453 ret = do_writepages(mapping, wbc); 461 ret = do_writepages(mapping, wbc);
454 462
455 /* 463 /*
@@ -1150,8 +1158,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1150 * dirty the inode itself 1158 * dirty the inode itself
1151 */ 1159 */
1152 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1160 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1161 trace_writeback_dirty_inode_start(inode, flags);
1162
1153 if (sb->s_op->dirty_inode) 1163 if (sb->s_op->dirty_inode)
1154 sb->s_op->dirty_inode(inode, flags); 1164 sb->s_op->dirty_inode(inode, flags);
1165
1166 trace_writeback_dirty_inode(inode, flags);
1155 } 1167 }
1156 1168
1157 /* 1169 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94bc83011ed..78feda9bbae2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -19,6 +19,7 @@
19#include <linux/gfp.h> 19#include <linux/gfp.h>
20#include <linux/bsg.h> 20#include <linux/bsg.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/rcupdate.h>
22 23
23#include <asm/scatterlist.h> 24#include <asm/scatterlist.h>
24 25
@@ -437,6 +438,7 @@ struct request_queue {
437 /* Throttle data */ 438 /* Throttle data */
438 struct throtl_data *td; 439 struct throtl_data *td;
439#endif 440#endif
441 struct rcu_head rcu_head;
440}; 442};
441 443
442#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 444#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
@@ -974,7 +976,6 @@ struct blk_plug {
974 unsigned long magic; /* detect uninitialized use-cases */ 976 unsigned long magic; /* detect uninitialized use-cases */
975 struct list_head list; /* requests */ 977 struct list_head list; /* requests */
976 struct list_head cb_list; /* md requires an unplug callback */ 978 struct list_head cb_list; /* md requires an unplug callback */
977 unsigned int should_sort; /* list to be sorted before flushing? */
978}; 979};
979#define BLK_MAX_REQUEST_COUNT 16 980#define BLK_MAX_REQUEST_COUNT 16
980 981
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 7c2e030e72f1..0ea61e07a91c 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -12,6 +12,7 @@
12 12
13struct blk_trace { 13struct blk_trace {
14 int trace_state; 14 int trace_state;
15 bool rq_based;
15 struct rchan *rchan; 16 struct rchan *rchan;
16 unsigned long __percpu *sequence; 17 unsigned long __percpu *sequence;
17 unsigned char __percpu *msg_data; 18 unsigned char __percpu *msg_data;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 458f497738a4..5afc4f94d110 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -126,7 +126,6 @@ BUFFER_FNS(Write_EIO, write_io_error)
126BUFFER_FNS(Unwritten, unwritten) 126BUFFER_FNS(Unwritten, unwritten)
127 127
128#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) 128#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
129#define touch_buffer(bh) mark_page_accessed(bh->b_page)
130 129
131/* If we *know* page->private refers to buffer_heads */ 130/* If we *know* page->private refers to buffer_heads */
132#define page_buffers(page) \ 131#define page_buffers(page) \
@@ -142,6 +141,7 @@ BUFFER_FNS(Unwritten, unwritten)
142 141
143void mark_buffer_dirty(struct buffer_head *bh); 142void mark_buffer_dirty(struct buffer_head *bh);
144void init_buffer(struct buffer_head *, bh_end_io_t *, void *); 143void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
144void touch_buffer(struct buffer_head *bh);
145void set_bh_page(struct buffer_head *bh, 145void set_bh_page(struct buffer_head *bh,
146 struct page *page, unsigned long offset); 146 struct page *page, unsigned long offset);
147int try_to_free_buffers(struct page *); 147int try_to_free_buffers(struct page *);
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 51494e6b5548..33f0280fd533 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -77,10 +77,13 @@ static inline void init_completion(struct completion *x)
77} 77}
78 78
79extern void wait_for_completion(struct completion *); 79extern void wait_for_completion(struct completion *);
80extern void wait_for_completion_io(struct completion *);
80extern int wait_for_completion_interruptible(struct completion *x); 81extern int wait_for_completion_interruptible(struct completion *x);
81extern int wait_for_completion_killable(struct completion *x); 82extern int wait_for_completion_killable(struct completion *x);
82extern unsigned long wait_for_completion_timeout(struct completion *x, 83extern unsigned long wait_for_completion_timeout(struct completion *x,
83 unsigned long timeout); 84 unsigned long timeout);
85extern unsigned long wait_for_completion_io_timeout(struct completion *x,
86 unsigned long timeout);
84extern long wait_for_completion_interruptible_timeout( 87extern long wait_for_completion_interruptible_timeout(
85 struct completion *x, unsigned long timeout); 88 struct completion *x, unsigned long timeout);
86extern long wait_for_completion_killable_timeout( 89extern long wait_for_completion_killable_timeout(
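The new _io variants behave like their plain counterparts but account the sleep as iowait, which is what the block-layer callers converted elsewhere in this series want. A sketch of typical driver-side usage; my_dev and submit_my_request() are placeholders, not a real API:

#include <linux/completion.h>
#include <linux/errno.h>

/* 'my_dev' and submit_my_request() are stand-ins for a driver's own plumbing */
struct my_dev {
	int status;
};

extern int submit_my_request(struct my_dev *dev, struct completion *done);

/*
 * Wait synchronously for one request.  Using wait_for_completion_io()
 * instead of wait_for_completion() makes the sleep show up as iowait.
 */
static int my_sync_request(struct my_dev *dev)
{
	DECLARE_COMPLETION_ONSTACK(done);
	int err;

	err = submit_my_request(dev, &done);	/* completes 'done' from the IRQ path */
	if (err)
		return err;

	wait_for_completion_io(&done);
	return dev->status;
}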
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 186620631750..acd0312d46fb 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -2,6 +2,7 @@
2#define _LINUX_ELEVATOR_H 2#define _LINUX_ELEVATOR_H
3 3
4#include <linux/percpu.h> 4#include <linux/percpu.h>
5#include <linux/hashtable.h>
5 6
6#ifdef CONFIG_BLOCK 7#ifdef CONFIG_BLOCK
7 8
@@ -96,6 +97,8 @@ struct elevator_type
96 struct list_head list; 97 struct list_head list;
97}; 98};
98 99
100#define ELV_HASH_BITS 6
101
99/* 102/*
100 * each queue has an elevator_queue associated with it 103 * each queue has an elevator_queue associated with it
101 */ 104 */
@@ -105,8 +108,8 @@ struct elevator_queue
105 void *elevator_data; 108 void *elevator_data;
106 struct kobject kobj; 109 struct kobject kobj;
107 struct mutex sysfs_lock; 110 struct mutex sysfs_lock;
108 struct hlist_head *hash;
109 unsigned int registered:1; 111 unsigned int registered:1;
112 DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
110}; 113};
111 114
112/* 115/*
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 05c5e61f0a7c..9961726523d0 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -6,10 +6,61 @@
6 6
7#include <linux/blktrace_api.h> 7#include <linux/blktrace_api.h>
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/buffer_head.h>
9#include <linux/tracepoint.h> 10#include <linux/tracepoint.h>
10 11
11#define RWBS_LEN 8 12#define RWBS_LEN 8
12 13
14DECLARE_EVENT_CLASS(block_buffer,
15
16 TP_PROTO(struct buffer_head *bh),
17
18 TP_ARGS(bh),
19
20 TP_STRUCT__entry (
21 __field( dev_t, dev )
22 __field( sector_t, sector )
23 __field( size_t, size )
24 ),
25
26 TP_fast_assign(
27 __entry->dev = bh->b_bdev->bd_dev;
28 __entry->sector = bh->b_blocknr;
29 __entry->size = bh->b_size;
30 ),
31
32 TP_printk("%d,%d sector=%llu size=%zu",
33 MAJOR(__entry->dev), MINOR(__entry->dev),
34 (unsigned long long)__entry->sector, __entry->size
35 )
36);
37
38/**
39 * block_touch_buffer - mark a buffer accessed
40 * @bh: buffer_head being touched
41 *
42 * Called from touch_buffer().
43 */
44DEFINE_EVENT(block_buffer, block_touch_buffer,
45
46 TP_PROTO(struct buffer_head *bh),
47
48 TP_ARGS(bh)
49);
50
51/**
52 * block_dirty_buffer - mark a buffer dirty
53 * @bh: buffer_head being dirtied
54 *
55 * Called from mark_buffer_dirty().
56 */
57DEFINE_EVENT(block_buffer, block_dirty_buffer,
58
59 TP_PROTO(struct buffer_head *bh),
60
61 TP_ARGS(bh)
62);
63
13DECLARE_EVENT_CLASS(block_rq_with_error, 64DECLARE_EVENT_CLASS(block_rq_with_error,
14 65
15 TP_PROTO(struct request_queue *q, struct request *rq), 66 TP_PROTO(struct request_queue *q, struct request *rq),
@@ -206,7 +257,6 @@ TRACE_EVENT(block_bio_bounce,
206 257
207/** 258/**
208 * block_bio_complete - completed all work on the block operation 259 * block_bio_complete - completed all work on the block operation
209 * @q: queue holding the block operation
210 * @bio: block operation completed 260 * @bio: block operation completed
211 * @error: io error value 261 * @error: io error value
212 * 262 *
@@ -215,9 +265,9 @@ TRACE_EVENT(block_bio_bounce,
215 */ 265 */
216TRACE_EVENT(block_bio_complete, 266TRACE_EVENT(block_bio_complete,
217 267
218 TP_PROTO(struct request_queue *q, struct bio *bio, int error), 268 TP_PROTO(struct bio *bio, int error),
219 269
220 TP_ARGS(q, bio, error), 270 TP_ARGS(bio, error),
221 271
222 TP_STRUCT__entry( 272 TP_STRUCT__entry(
223 __field( dev_t, dev ) 273 __field( dev_t, dev )
@@ -228,7 +278,8 @@ TRACE_EVENT(block_bio_complete,
228 ), 278 ),
229 279
230 TP_fast_assign( 280 TP_fast_assign(
231 __entry->dev = bio->bi_bdev->bd_dev; 281 __entry->dev = bio->bi_bdev ?
282 bio->bi_bdev->bd_dev : 0;
232 __entry->sector = bio->bi_sector; 283 __entry->sector = bio->bi_sector;
233 __entry->nr_sector = bio->bi_size >> 9; 284 __entry->nr_sector = bio->bi_size >> 9;
234 __entry->error = error; 285 __entry->error = error;
@@ -241,11 +292,11 @@ TRACE_EVENT(block_bio_complete,
 		  __entry->nr_sector, __entry->error)
 );
 
-DECLARE_EVENT_CLASS(block_bio,
+DECLARE_EVENT_CLASS(block_bio_merge,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio),
+	TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),
 
-	TP_ARGS(q, bio),
+	TP_ARGS(q, rq, bio),
 
 	TP_STRUCT__entry(
 		__field( dev_t,		dev		)
@@ -272,31 +323,33 @@ DECLARE_EVENT_CLASS(block_bio,
 /**
  * block_bio_backmerge - merging block operation to the end of an existing operation
  * @q: queue holding operation
+ * @rq: request bio is being merged into
  * @bio: new block operation to merge
  *
  * Merging block request @bio to the end of an existing block request
  * in queue @q.
  */
-DEFINE_EVENT(block_bio, block_bio_backmerge,
+DEFINE_EVENT(block_bio_merge, block_bio_backmerge,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio),
+	TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),
 
-	TP_ARGS(q, bio)
+	TP_ARGS(q, rq, bio)
 );
 
 /**
  * block_bio_frontmerge - merging block operation to the beginning of an existing operation
  * @q: queue holding operation
+ * @rq: request bio is being merged into
  * @bio: new block operation to merge
  *
  * Merging block IO operation @bio to the beginning of an existing block
  * operation in queue @q.
  */
-DEFINE_EVENT(block_bio, block_bio_frontmerge,
+DEFINE_EVENT(block_bio_merge, block_bio_frontmerge,
 
-	TP_PROTO(struct request_queue *q, struct bio *bio),
+	TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio),
 
-	TP_ARGS(q, bio)
+	TP_ARGS(q, rq, bio)
 );
 
 /**
@@ -306,11 +359,32 @@ DEFINE_EVENT(block_bio, block_bio_frontmerge,
  *
  * About to place the block IO operation @bio into queue @q.
  */
-DEFINE_EVENT(block_bio, block_bio_queue,
+TRACE_EVENT(block_bio_queue,
 
 	TP_PROTO(struct request_queue *q, struct bio *bio),
 
-	TP_ARGS(q, bio)
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	RWBS_LEN	)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 DECLARE_EVENT_CLASS(block_get_rq,
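For orientation, a hedged sketch of how the two buffer events defined above are fired: this series hooks them from fs/buffer.c (touch_buffer() and mark_buffer_dirty()), and the call pattern is simply the trace_<event>() stub generated by DEFINE_EVENT(). my_note_buffer() below is an illustrative wrapper, not kernel API.

#include <linux/buffer_head.h>
#include <trace/events/block.h>

/* Illustrative wrapper only: fire the new buffer events for a buffer_head. */
static void my_note_buffer(struct buffer_head *bh)
{
	trace_block_touch_buffer(bh);		/* generated from DEFINE_EVENT(block_buffer, ...) */

	if (buffer_dirty(bh))			/* BUFFER_FNS() state test from buffer_head.h */
		trace_block_dirty_buffer(bh);
}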
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index b453d92c2253..6a16fd2e70ed 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -32,6 +32,115 @@
 
 struct wb_writeback_work;
 
+TRACE_EVENT(writeback_dirty_page,
+
+	TP_PROTO(struct page *page, struct address_space *mapping),
+
+	TP_ARGS(page, mapping),
+
+	TP_STRUCT__entry (
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(pgoff_t, index)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32);
+		__entry->ino = mapping ? mapping->host->i_ino : 0;
+		__entry->index = page->index;
+	),
+
+	TP_printk("bdi %s: ino=%lu index=%lu",
+		__entry->name,
+		__entry->ino,
+		__entry->index
+	)
+);
+
+DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags),
+
+	TP_STRUCT__entry (
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(unsigned long, flags)
+	),
+
+	TP_fast_assign(
+		struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+
+		/* may be called for files on pseudo FSes w/ unregistered bdi */
+		strncpy(__entry->name,
+			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+		__entry->ino = inode->i_ino;
+		__entry->flags = flags;
+	),
+
+	TP_printk("bdi %s: ino=%lu flags=%s",
+		__entry->name,
+		__entry->ino,
+		show_inode_state(__entry->flags)
+	)
+);
+
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags)
+);
+
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags)
+);
+
+DECLARE_EVENT_CLASS(writeback_write_inode_template,
+
+	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
+
+	TP_ARGS(inode, wbc),
+
+	TP_STRUCT__entry (
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(int, sync_mode)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+		__entry->ino = inode->i_ino;
+		__entry->sync_mode = wbc->sync_mode;
+	),
+
+	TP_printk("bdi %s: ino=%lu sync_mode=%d",
+		__entry->name,
+		__entry->ino,
+		__entry->sync_mode
+	)
+);
+
+DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,
+
+	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
+
+	TP_ARGS(inode, wbc)
+);
+
+DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
+
+	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
+
+	TP_ARGS(inode, wbc)
+);
+
 DECLARE_EVENT_CLASS(writeback_work_class,
 	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
 	TP_ARGS(bdi, work),
@@ -479,6 +588,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 	)
 );
 
+DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
+	TP_PROTO(struct inode *inode,
+		 struct writeback_control *wbc,
+		 unsigned long nr_to_write),
+	TP_ARGS(inode, wbc, nr_to_write)
+);
+
 DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
 	TP_PROTO(struct inode *inode,
 		 struct writeback_control *wbc,
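A hedged sketch of where the dirty-inode pair above is meant to fire: in this series the hooks live in fs/fs-writeback.c's __mark_inode_dirty(), with the _start variant emitted on entry and writeback_dirty_inode emitted once the inode has actually been dirtied. my_mark_inode_dirty() below is a stand-in showing only the tracing call pattern, not the real function body.

#include <linux/fs.h>
#include <trace/events/writeback.h>

/* Stand-in for __mark_inode_dirty(): shows only where the two events fire. */
static void my_mark_inode_dirty(struct inode *inode, int flags)
{
	trace_writeback_dirty_inode_start(inode, flags);	/* on entry */

	/* ... ->dirty_inode() callback, i_state update, b_dirty list move ... */

	trace_writeback_dirty_inode(inode, flags);		/* after the inode is dirtied */
}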
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 12af4270c9c1..7f12624a393c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3258,7 +3258,8 @@ void complete_all(struct completion *x)
3258EXPORT_SYMBOL(complete_all); 3258EXPORT_SYMBOL(complete_all);
3259 3259
3260static inline long __sched 3260static inline long __sched
3261do_wait_for_common(struct completion *x, long timeout, int state) 3261do_wait_for_common(struct completion *x,
3262 long (*action)(long), long timeout, int state)
3262{ 3263{
3263 if (!x->done) { 3264 if (!x->done) {
3264 DECLARE_WAITQUEUE(wait, current); 3265 DECLARE_WAITQUEUE(wait, current);
@@ -3271,7 +3272,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 			}
 			__set_current_state(state);
 			spin_unlock_irq(&x->wait.lock);
-			timeout = schedule_timeout(timeout);
+			timeout = action(timeout);
 			spin_lock_irq(&x->wait.lock);
 		} while (!x->done && timeout);
 		__remove_wait_queue(&x->wait, &wait);
@@ -3282,17 +3283,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 	return timeout ?: 1;
 }
 
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+		  long (*action)(long), long timeout, int state)
 {
 	might_sleep();
 
 	spin_lock_irq(&x->wait.lock);
-	timeout = do_wait_for_common(x, timeout, state);
+	timeout = do_wait_for_common(x, action, timeout, state);
 	spin_unlock_irq(&x->wait.lock);
 	return timeout;
 }
 
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+	return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+	return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
 /**
  * wait_for_completion: - waits for completion of a task
  * @x: holds the state of this particular completion
@@ -3329,6 +3343,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
 /**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
+
+/**
  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
  * @x: holds the state of this particular completion
  *
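A usage sketch for the new interface, assuming a block driver that submits work and sleeps until its completion handler runs; my_request, my_endio() and my_issue_and_wait() are placeholders, not kernel API. The only difference from the classic wait_for_completion() pattern is that the sleep is charged to iowait via io_schedule_timeout().

#include <linux/completion.h>

/* Placeholder request type carrying its own completion. */
struct my_request {
	struct completion done;
	int error;
};

/* Called from the driver's completion path (e.g. an IRQ handler). */
static void my_endio(struct my_request *rq, int error)
{
	rq->error = error;
	complete(&rq->done);			/* wakes the waiter below */
}

static int my_issue_and_wait(struct my_request *rq)
{
	init_completion(&rq->done);
	/* ... hand the request to the hardware, e.g. my_submit(rq) ... */

	/* Sleeps uninterruptibly and is accounted as waiting for IO,
	 * unlike plain wait_for_completion(). */
	wait_for_completion_io(&rq->done);
	return rq->error;
}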
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 71259e2b6b61..9e5b8c272eec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore,
 					 struct request_queue *q,
 					 struct request *rq)
 {
+	struct blk_trace *bt = q->blk_trace;
+
+	/* if control ever passes through here, it's a request based driver */
+	if (unlikely(bt && !bt->rq_based))
+		bt->rq_based = true;
+
 	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
 }
 
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore,
 	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
-static void blk_add_trace_bio_complete(void *ignore,
-				       struct request_queue *q, struct bio *bio,
-				       int error)
+static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
 {
+	struct request_queue *q;
+	struct blk_trace *bt;
+
+	if (!bio->bi_bdev)
+		return;
+
+	q = bdev_get_queue(bio->bi_bdev);
+	bt = q->blk_trace;
+
+	/*
+	 * Request based drivers will generate both rq and bio completions.
+	 * Ignore bio ones.
+	 */
+	if (likely(!bt) || bt->rq_based)
+		return;
+
 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request_queue *q,
+					struct request *rq,
 					struct bio *bio)
 {
 	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request_queue *q,
+					 struct request *rq,
 					 struct bio *bio)
 {
 	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
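For context, a hedged sketch of the bio-based completion path that reaches the reworked hook above: the block_bio_complete tracepoint no longer carries a request_queue, so blk_add_trace_bio_complete() recovers it from bio->bi_bdev, and completions coming from request-based drivers are filtered out via bt->rq_based. my_dev_irq_done() is a placeholder for a driver's own completion handler, not kernel API.

#include <linux/bio.h>

/* Placeholder completion handler for a bio based driver. */
static void my_dev_irq_done(struct bio *bio, int error)
{
	/* ... per-bio bookkeeping ... */
	bio_endio(bio, error);	/* completes the bio; the bio completion tracing runs from here */
}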
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cdc377c456c0..742c40583159 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1986,6 +1986,8 @@ int __set_page_dirty_no_writeback(struct page *page)
  */
 void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
+	trace_writeback_dirty_page(page, mapping);
+
 	if (mapping_cap_account_dirty(mapping)) {
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);