about | summary | refs | log | tree | commit | diff | stats
path: root/block
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-08-01 12:02:41 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-08-01 12:02:41 -0400
commit 8cf1a3fce0b95050b63d451c9d561da0da2aa4d6 (patch)
tree 0dc7f93474c3be601a5893900db1418dfd60ba5d /block
parent fcff06c438b60f415af5983efe92811d6aa02ad1 (diff)
parent 80799fbb7d10c30df78015b3fa21f7ffcfc0eb2c (diff)
Merge branch 'for-3.6/core' of git://git.kernel.dk/linux-block
Pull core block IO bits from Jens Axboe:
 "The most complicated part of this is the request allocation rework by
  Tejun, which has been queued up for a long time and has been in
  for-next ditto as well.

  There are a few commits from yesterday and today, mostly trivial and
  obvious fixes. So I'm pretty confident that it is sound. It's also
  smaller than usual."

* 'for-3.6/core' of git://git.kernel.dk/linux-block:
  block: remove dead func declaration
  block: add partition resize function to blkpg ioctl
  block: uninitialized ioc->nr_tasks triggers WARN_ON
  block: do not artificially constrain max_sectors for stacking drivers
  blkcg: implement per-blkg request allocation
  block: prepare for multiple request_lists
  block: add q->nr_rqs[] and move q->rq.elvpriv to q->nr_rqs_elvpriv
  blkcg: inline bio_blkcg() and friends
  block: allocate io_context upfront
  block: refactor get_request[_wait]()
  block: drop custom queue draining used by scsi_transport_{iscsi|fc}
  mempool: add @gfp_mask to mempool_create_node()
  blkcg: make root blkcg allocation use %GFP_KERNEL
  blkcg: __blkg_lookup_create() doesn't need radix preload
Diffstat (limited to 'block')
-rw-r--r-- block/blk-cgroup.c 139
-rw-r--r-- block/blk-cgroup.h 128
-rw-r--r-- block/blk-core.c 209
-rw-r--r-- block/blk-ioc.c 1
-rw-r--r-- block/blk-settings.c 3
-rw-r--r-- block/blk-sysfs.c 34
-rw-r--r-- block/blk-throttle.c 3
-rw-r--r-- block/blk.h 4
-rw-r--r-- block/bsg-lib.c 53
-rw-r--r-- block/genhd.c 20
-rw-r--r-- block/ioctl.c 59
-rw-r--r-- block/partition-generic.c 4
12 files changed, 423 insertions, 234 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7dee617358e..f3b44a65fc7a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -31,27 +31,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
31 31
32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
33 33
34struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
35{
36 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
37 struct blkcg, css);
38}
39EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
40
41static struct blkcg *task_blkcg(struct task_struct *tsk)
42{
43 return container_of(task_subsys_state(tsk, blkio_subsys_id),
44 struct blkcg, css);
45}
46
47struct blkcg *bio_blkcg(struct bio *bio)
48{
49 if (bio && bio->bi_css)
50 return container_of(bio->bi_css, struct blkcg, css);
51 return task_blkcg(current);
52}
53EXPORT_SYMBOL_GPL(bio_blkcg);
54
55static bool blkcg_policy_enabled(struct request_queue *q, 34static bool blkcg_policy_enabled(struct request_queue *q,
56 const struct blkcg_policy *pol) 35 const struct blkcg_policy *pol)
57{ 36{
@@ -84,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
84 kfree(pd); 63 kfree(pd);
85 } 64 }
86 65
66 blk_exit_rl(&blkg->rl);
87 kfree(blkg); 67 kfree(blkg);
88} 68}
89 69
@@ -91,16 +71,18 @@ static void blkg_free(struct blkcg_gq *blkg)
91 * blkg_alloc - allocate a blkg 71 * blkg_alloc - allocate a blkg
92 * @blkcg: block cgroup the new blkg is associated with 72 * @blkcg: block cgroup the new blkg is associated with
93 * @q: request_queue the new blkg is associated with 73 * @q: request_queue the new blkg is associated with
74 * @gfp_mask: allocation mask to use
94 * 75 *
95 * Allocate a new blkg assocating @blkcg and @q. 76 * Allocate a new blkg assocating @blkcg and @q.
96 */ 77 */
97static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) 78static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
79 gfp_t gfp_mask)
98{ 80{
99 struct blkcg_gq *blkg; 81 struct blkcg_gq *blkg;
100 int i; 82 int i;
101 83
102 /* alloc and init base part */ 84 /* alloc and init base part */
103 blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); 85 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
104 if (!blkg) 86 if (!blkg)
105 return NULL; 87 return NULL;
106 88
@@ -109,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
109 blkg->blkcg = blkcg; 91 blkg->blkcg = blkcg;
110 blkg->refcnt = 1; 92 blkg->refcnt = 1;
111 93
94 /* root blkg uses @q->root_rl, init rl only for !root blkgs */
95 if (blkcg != &blkcg_root) {
96 if (blk_init_rl(&blkg->rl, q, gfp_mask))
97 goto err_free;
98 blkg->rl.blkg = blkg;
99 }
100
112 for (i = 0; i < BLKCG_MAX_POLS; i++) { 101 for (i = 0; i < BLKCG_MAX_POLS; i++) {
113 struct blkcg_policy *pol = blkcg_policy[i]; 102 struct blkcg_policy *pol = blkcg_policy[i];
114 struct blkg_policy_data *pd; 103 struct blkg_policy_data *pd;
@@ -117,11 +106,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
117 continue; 106 continue;
118 107
119 /* alloc per-policy data and attach it to blkg */ 108 /* alloc per-policy data and attach it to blkg */
120 pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); 109 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
121 if (!pd) { 110 if (!pd)
122 blkg_free(blkg); 111 goto err_free;
123 return NULL;
124 }
125 112
126 blkg->pd[i] = pd; 113 blkg->pd[i] = pd;
127 pd->blkg = blkg; 114 pd->blkg = blkg;
@@ -132,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
132 } 119 }
133 120
134 return blkg; 121 return blkg;
122
123err_free:
124 blkg_free(blkg);
125 return NULL;
135} 126}
136 127
137static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 128static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -175,9 +166,13 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
175} 166}
176EXPORT_SYMBOL_GPL(blkg_lookup); 167EXPORT_SYMBOL_GPL(blkg_lookup);
177 168
169/*
170 * If @new_blkg is %NULL, this function tries to allocate a new one as
171 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
172 */
178static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, 173static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
179 struct request_queue *q) 174 struct request_queue *q,
180 __releases(q->queue_lock) __acquires(q->queue_lock) 175 struct blkcg_gq *new_blkg)
181{ 176{
182 struct blkcg_gq *blkg; 177 struct blkcg_gq *blkg;
183 int ret; 178 int ret;
@@ -189,24 +184,26 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
189 blkg = __blkg_lookup(blkcg, q); 184 blkg = __blkg_lookup(blkcg, q);
190 if (blkg) { 185 if (blkg) {
191 rcu_assign_pointer(blkcg->blkg_hint, blkg); 186 rcu_assign_pointer(blkcg->blkg_hint, blkg);
192 return blkg; 187 goto out_free;
193 } 188 }
194 189
195 /* blkg holds a reference to blkcg */ 190 /* blkg holds a reference to blkcg */
196 if (!css_tryget(&blkcg->css)) 191 if (!css_tryget(&blkcg->css)) {
197 return ERR_PTR(-EINVAL); 192 blkg = ERR_PTR(-EINVAL);
193 goto out_free;
194 }
198 195
199 /* allocate */ 196 /* allocate */
200 ret = -ENOMEM; 197 if (!new_blkg) {
201 blkg = blkg_alloc(blkcg, q); 198 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
202 if (unlikely(!blkg)) 199 if (unlikely(!new_blkg)) {
203 goto err_put; 200 blkg = ERR_PTR(-ENOMEM);
201 goto out_put;
202 }
203 }
204 blkg = new_blkg;
204 205
205 /* insert */ 206 /* insert */
206 ret = radix_tree_preload(GFP_ATOMIC);
207 if (ret)
208 goto err_free;
209
210 spin_lock(&blkcg->lock); 207 spin_lock(&blkcg->lock);
211 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 208 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
212 if (likely(!ret)) { 209 if (likely(!ret)) {
@@ -215,15 +212,15 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
215 } 212 }
216 spin_unlock(&blkcg->lock); 213 spin_unlock(&blkcg->lock);
217 214
218 radix_tree_preload_end();
219
220 if (!ret) 215 if (!ret)
221 return blkg; 216 return blkg;
222err_free: 217
223 blkg_free(blkg); 218 blkg = ERR_PTR(ret);
224err_put: 219out_put:
225 css_put(&blkcg->css); 220 css_put(&blkcg->css);
226 return ERR_PTR(ret); 221out_free:
222 blkg_free(new_blkg);
223 return blkg;
227} 224}
228 225
229struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 226struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
@@ -235,7 +232,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
235 */ 232 */
236 if (unlikely(blk_queue_bypass(q))) 233 if (unlikely(blk_queue_bypass(q)))
237 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); 234 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
238 return __blkg_lookup_create(blkcg, q); 235 return __blkg_lookup_create(blkcg, q, NULL);
239} 236}
240EXPORT_SYMBOL_GPL(blkg_lookup_create); 237EXPORT_SYMBOL_GPL(blkg_lookup_create);
241 238
@@ -313,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
313} 310}
314EXPORT_SYMBOL_GPL(__blkg_release); 311EXPORT_SYMBOL_GPL(__blkg_release);
315 312
313/*
314 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
315 * because the root blkg uses @q->root_rl instead of its own rl.
316 */
317struct request_list *__blk_queue_next_rl(struct request_list *rl,
318 struct request_queue *q)
319{
320 struct list_head *ent;
321 struct blkcg_gq *blkg;
322
323 /*
324 * Determine the current blkg list_head. The first entry is
325 * root_rl which is off @q->blkg_list and mapped to the head.
326 */
327 if (rl == &q->root_rl) {
328 ent = &q->blkg_list;
329 } else {
330 blkg = container_of(rl, struct blkcg_gq, rl);
331 ent = &blkg->q_node;
332 }
333
334 /* walk to the next list_head, skip root blkcg */
335 ent = ent->next;
336 if (ent == &q->root_blkg->q_node)
337 ent = ent->next;
338 if (ent == &q->blkg_list)
339 return NULL;
340
341 blkg = container_of(ent, struct blkcg_gq, q_node);
342 return &blkg->rl;
343}
344
316static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, 345static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
317 u64 val) 346 u64 val)
318{ 347{
@@ -734,24 +763,36 @@ int blkcg_activate_policy(struct request_queue *q,
734 struct blkcg_gq *blkg; 763 struct blkcg_gq *blkg;
735 struct blkg_policy_data *pd, *n; 764 struct blkg_policy_data *pd, *n;
736 int cnt = 0, ret; 765 int cnt = 0, ret;
766 bool preloaded;
737 767
738 if (blkcg_policy_enabled(q, pol)) 768 if (blkcg_policy_enabled(q, pol))
739 return 0; 769 return 0;
740 770
771 /* preallocations for root blkg */
772 blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
773 if (!blkg)
774 return -ENOMEM;
775
776 preloaded = !radix_tree_preload(GFP_KERNEL);
777
741 blk_queue_bypass_start(q); 778 blk_queue_bypass_start(q);
742 779
743 /* make sure the root blkg exists and count the existing blkgs */ 780 /* make sure the root blkg exists and count the existing blkgs */
744 spin_lock_irq(q->queue_lock); 781 spin_lock_irq(q->queue_lock);
745 782
746 rcu_read_lock(); 783 rcu_read_lock();
747 blkg = __blkg_lookup_create(&blkcg_root, q); 784 blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
748 rcu_read_unlock(); 785 rcu_read_unlock();
749 786
787 if (preloaded)
788 radix_tree_preload_end();
789
750 if (IS_ERR(blkg)) { 790 if (IS_ERR(blkg)) {
751 ret = PTR_ERR(blkg); 791 ret = PTR_ERR(blkg);
752 goto out_unlock; 792 goto out_unlock;
753 } 793 }
754 q->root_blkg = blkg; 794 q->root_blkg = blkg;
795 q->root_rl.blkg = blkg;
755 796
756 list_for_each_entry(blkg, &q->blkg_list, q_node) 797 list_for_each_entry(blkg, &q->blkg_list, q_node)
757 cnt++; 798 cnt++;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ac457ce7783..24597309e23d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,6 +17,7 @@
17#include <linux/u64_stats_sync.h> 17#include <linux/u64_stats_sync.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/blkdev.h>
20 21
21/* Max limits for throttle policy */ 22/* Max limits for throttle policy */
22#define THROTL_IOPS_MAX UINT_MAX 23#define THROTL_IOPS_MAX UINT_MAX
@@ -93,6 +94,8 @@ struct blkcg_gq {
93 struct list_head q_node; 94 struct list_head q_node;
94 struct hlist_node blkcg_node; 95 struct hlist_node blkcg_node;
95 struct blkcg *blkcg; 96 struct blkcg *blkcg;
97 /* request allocation list for this blkcg-q pair */
98 struct request_list rl;
96 /* reference count */ 99 /* reference count */
97 int refcnt; 100 int refcnt;
98 101
@@ -120,8 +123,6 @@ struct blkcg_policy {
120 123
121extern struct blkcg blkcg_root; 124extern struct blkcg blkcg_root;
122 125
123struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
124struct blkcg *bio_blkcg(struct bio *bio);
125struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); 126struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
126struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 127struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
127 struct request_queue *q); 128 struct request_queue *q);
@@ -160,6 +161,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
160void blkg_conf_finish(struct blkg_conf_ctx *ctx); 161void blkg_conf_finish(struct blkg_conf_ctx *ctx);
161 162
162 163
164static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
165{
166 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
167 struct blkcg, css);
168}
169
170static inline struct blkcg *task_blkcg(struct task_struct *tsk)
171{
172 return container_of(task_subsys_state(tsk, blkio_subsys_id),
173 struct blkcg, css);
174}
175
176static inline struct blkcg *bio_blkcg(struct bio *bio)
177{
178 if (bio && bio->bi_css)
179 return container_of(bio->bi_css, struct blkcg, css);
180 return task_blkcg(current);
181}
182
163/** 183/**
164 * blkg_to_pdata - get policy private data 184 * blkg_to_pdata - get policy private data
165 * @blkg: blkg of interest 185 * @blkg: blkg of interest
@@ -234,6 +254,95 @@ static inline void blkg_put(struct blkcg_gq *blkg)
234} 254}
235 255
236/** 256/**
257 * blk_get_rl - get request_list to use
258 * @q: request_queue of interest
259 * @bio: bio which will be attached to the allocated request (may be %NULL)
260 *
261 * The caller wants to allocate a request from @q to use for @bio. Find
262 * the request_list to use and obtain a reference on it. Should be called
263 * under queue_lock. This function is guaranteed to return non-%NULL
264 * request_list.
265 */
266static inline struct request_list *blk_get_rl(struct request_queue *q,
267 struct bio *bio)
268{
269 struct blkcg *blkcg;
270 struct blkcg_gq *blkg;
271
272 rcu_read_lock();
273
274 blkcg = bio_blkcg(bio);
275
276 /* bypass blkg lookup and use @q->root_rl directly for root */
277 if (blkcg == &blkcg_root)
278 goto root_rl;
279
280 /*
281 * Try to use blkg->rl. blkg lookup may fail under memory pressure
282 * or if either the blkcg or queue is going away. Fall back to
283 * root_rl in such cases.
284 */
285 blkg = blkg_lookup_create(blkcg, q);
286 if (unlikely(IS_ERR(blkg)))
287 goto root_rl;
288
289 blkg_get(blkg);
290 rcu_read_unlock();
291 return &blkg->rl;
292root_rl:
293 rcu_read_unlock();
294 return &q->root_rl;
295}
296
297/**
298 * blk_put_rl - put request_list
299 * @rl: request_list to put
300 *
301 * Put the reference acquired by blk_get_rl(). Should be called under
302 * queue_lock.
303 */
304static inline void blk_put_rl(struct request_list *rl)
305{
306 /* root_rl may not have blkg set */
307 if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
308 blkg_put(rl->blkg);
309}
310
311/**
312 * blk_rq_set_rl - associate a request with a request_list
313 * @rq: request of interest
314 * @rl: target request_list
315 *
316 * Associate @rq with @rl so that accounting and freeing can know the
317 * request_list @rq came from.
318 */
319static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
320{
321 rq->rl = rl;
322}
323
324/**
325 * blk_rq_rl - return the request_list a request came from
326 * @rq: request of interest
327 *
328 * Return the request_list @rq is allocated from.
329 */
330static inline struct request_list *blk_rq_rl(struct request *rq)
331{
332 return rq->rl;
333}
334
335struct request_list *__blk_queue_next_rl(struct request_list *rl,
336 struct request_queue *q);
337/**
338 * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
339 *
340 * Should be used under queue_lock.
341 */
342#define blk_queue_for_each_rl(rl, q) \
343 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
344
345/**
237 * blkg_stat_add - add a value to a blkg_stat 346 * blkg_stat_add - add a value to a blkg_stat
238 * @stat: target blkg_stat 347 * @stat: target blkg_stat
239 * @val: value to add 348 * @val: value to add
@@ -351,6 +460,7 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
351#else /* CONFIG_BLK_CGROUP */ 460#else /* CONFIG_BLK_CGROUP */
352 461
353struct cgroup; 462struct cgroup;
463struct blkcg;
354 464
355struct blkg_policy_data { 465struct blkg_policy_data {
356}; 466};
@@ -361,8 +471,6 @@ struct blkcg_gq {
361struct blkcg_policy { 471struct blkcg_policy {
362}; 472};
363 473
364static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
365static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
366static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 474static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
367static inline int blkcg_init_queue(struct request_queue *q) { return 0; } 475static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
368static inline void blkcg_drain_queue(struct request_queue *q) { } 476static inline void blkcg_drain_queue(struct request_queue *q) { }
@@ -374,6 +482,9 @@ static inline int blkcg_activate_policy(struct request_queue *q,
374static inline void blkcg_deactivate_policy(struct request_queue *q, 482static inline void blkcg_deactivate_policy(struct request_queue *q,
375 const struct blkcg_policy *pol) { } 483 const struct blkcg_policy *pol) { }
376 484
485static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
486static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
487
377static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 488static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
378 struct blkcg_policy *pol) { return NULL; } 489 struct blkcg_policy *pol) { return NULL; }
379static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } 490static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
@@ -381,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
381static inline void blkg_get(struct blkcg_gq *blkg) { } 492static inline void blkg_get(struct blkcg_gq *blkg) { }
382static inline void blkg_put(struct blkcg_gq *blkg) { } 493static inline void blkg_put(struct blkcg_gq *blkg) { }
383 494
495static inline struct request_list *blk_get_rl(struct request_queue *q,
496 struct bio *bio) { return &q->root_rl; }
497static inline void blk_put_rl(struct request_list *rl) { }
498static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
499static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
500
501#define blk_queue_for_each_rl(rl, q) \
502 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
503
384#endif /* CONFIG_BLK_CGROUP */ 504#endif /* CONFIG_BLK_CGROUP */
385#endif /* _BLK_CGROUP_H */ 505#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 93eb3e4f88ce..dd134d834d58 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -387,7 +387,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
387 if (!list_empty(&q->queue_head) && q->request_fn) 387 if (!list_empty(&q->queue_head) && q->request_fn)
388 __blk_run_queue(q); 388 __blk_run_queue(q);
389 389
390 drain |= q->rq.elvpriv; 390 drain |= q->nr_rqs_elvpriv;
391 391
392 /* 392 /*
393 * Unfortunately, requests are queued at and tracked from 393 * Unfortunately, requests are queued at and tracked from
@@ -397,7 +397,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
397 if (drain_all) { 397 if (drain_all) {
398 drain |= !list_empty(&q->queue_head); 398 drain |= !list_empty(&q->queue_head);
399 for (i = 0; i < 2; i++) { 399 for (i = 0; i < 2; i++) {
400 drain |= q->rq.count[i]; 400 drain |= q->nr_rqs[i];
401 drain |= q->in_flight[i]; 401 drain |= q->in_flight[i];
402 drain |= !list_empty(&q->flush_queue[i]); 402 drain |= !list_empty(&q->flush_queue[i]);
403 } 403 }
@@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
416 * left with hung waiters. We need to wake up those waiters. 416 * left with hung waiters. We need to wake up those waiters.
417 */ 417 */
418 if (q->request_fn) { 418 if (q->request_fn) {
419 struct request_list *rl;
420
419 spin_lock_irq(q->queue_lock); 421 spin_lock_irq(q->queue_lock);
420 for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) 422
421 wake_up_all(&q->rq.wait[i]); 423 blk_queue_for_each_rl(rl, q)
424 for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
425 wake_up_all(&rl->wait[i]);
426
422 spin_unlock_irq(q->queue_lock); 427 spin_unlock_irq(q->queue_lock);
423 } 428 }
424} 429}
@@ -517,28 +522,33 @@ void blk_cleanup_queue(struct request_queue *q)
517} 522}
518EXPORT_SYMBOL(blk_cleanup_queue); 523EXPORT_SYMBOL(blk_cleanup_queue);
519 524
520static int blk_init_free_list(struct request_queue *q) 525int blk_init_rl(struct request_list *rl, struct request_queue *q,
526 gfp_t gfp_mask)
521{ 527{
522 struct request_list *rl = &q->rq;
523
524 if (unlikely(rl->rq_pool)) 528 if (unlikely(rl->rq_pool))
525 return 0; 529 return 0;
526 530
531 rl->q = q;
527 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 532 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
528 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 533 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
529 rl->elvpriv = 0;
530 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); 534 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
531 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); 535 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
532 536
533 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 537 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
534 mempool_free_slab, request_cachep, q->node); 538 mempool_free_slab, request_cachep,
535 539 gfp_mask, q->node);
536 if (!rl->rq_pool) 540 if (!rl->rq_pool)
537 return -ENOMEM; 541 return -ENOMEM;
538 542
539 return 0; 543 return 0;
540} 544}
541 545
546void blk_exit_rl(struct request_list *rl)
547{
548 if (rl->rq_pool)
549 mempool_destroy(rl->rq_pool);
550}
551
542struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 552struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
543{ 553{
544 return blk_alloc_queue_node(gfp_mask, -1); 554 return blk_alloc_queue_node(gfp_mask, -1);
@@ -680,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
680 if (!q) 690 if (!q)
681 return NULL; 691 return NULL;
682 692
683 if (blk_init_free_list(q)) 693 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
684 return NULL; 694 return NULL;
685 695
686 q->request_fn = rfn; 696 q->request_fn = rfn;
@@ -722,15 +732,15 @@ bool blk_get_queue(struct request_queue *q)
722} 732}
723EXPORT_SYMBOL(blk_get_queue); 733EXPORT_SYMBOL(blk_get_queue);
724 734
725static inline void blk_free_request(struct request_queue *q, struct request *rq) 735static inline void blk_free_request(struct request_list *rl, struct request *rq)
726{ 736{
727 if (rq->cmd_flags & REQ_ELVPRIV) { 737 if (rq->cmd_flags & REQ_ELVPRIV) {
728 elv_put_request(q, rq); 738 elv_put_request(rl->q, rq);
729 if (rq->elv.icq) 739 if (rq->elv.icq)
730 put_io_context(rq->elv.icq->ioc); 740 put_io_context(rq->elv.icq->ioc);
731 } 741 }
732 742
733 mempool_free(rq, q->rq.rq_pool); 743 mempool_free(rq, rl->rq_pool);
734} 744}
735 745
736/* 746/*
@@ -767,18 +777,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
767 ioc->last_waited = jiffies; 777 ioc->last_waited = jiffies;
768} 778}
769 779
770static void __freed_request(struct request_queue *q, int sync) 780static void __freed_request(struct request_list *rl, int sync)
771{ 781{
772 struct request_list *rl = &q->rq; 782 struct request_queue *q = rl->q;
773 783
774 if (rl->count[sync] < queue_congestion_off_threshold(q)) 784 /*
785 * bdi isn't aware of blkcg yet. As all async IOs end up root
786 * blkcg anyway, just use root blkcg state.
787 */
788 if (rl == &q->root_rl &&
789 rl->count[sync] < queue_congestion_off_threshold(q))
775 blk_clear_queue_congested(q, sync); 790 blk_clear_queue_congested(q, sync);
776 791
777 if (rl->count[sync] + 1 <= q->nr_requests) { 792 if (rl->count[sync] + 1 <= q->nr_requests) {
778 if (waitqueue_active(&rl->wait[sync])) 793 if (waitqueue_active(&rl->wait[sync]))
779 wake_up(&rl->wait[sync]); 794 wake_up(&rl->wait[sync]);
780 795
781 blk_clear_queue_full(q, sync); 796 blk_clear_rl_full(rl, sync);
782 } 797 }
783} 798}
784 799
@@ -786,19 +801,20 @@ static void __freed_request(struct request_queue *q, int sync)
786 * A request has just been released. Account for it, update the full and 801 * A request has just been released. Account for it, update the full and
787 * congestion status, wake up any waiters. Called under q->queue_lock. 802 * congestion status, wake up any waiters. Called under q->queue_lock.
788 */ 803 */
789static void freed_request(struct request_queue *q, unsigned int flags) 804static void freed_request(struct request_list *rl, unsigned int flags)
790{ 805{
791 struct request_list *rl = &q->rq; 806 struct request_queue *q = rl->q;
792 int sync = rw_is_sync(flags); 807 int sync = rw_is_sync(flags);
793 808
809 q->nr_rqs[sync]--;
794 rl->count[sync]--; 810 rl->count[sync]--;
795 if (flags & REQ_ELVPRIV) 811 if (flags & REQ_ELVPRIV)
796 rl->elvpriv--; 812 q->nr_rqs_elvpriv--;
797 813
798 __freed_request(q, sync); 814 __freed_request(rl, sync);
799 815
800 if (unlikely(rl->starved[sync ^ 1])) 816 if (unlikely(rl->starved[sync ^ 1]))
801 __freed_request(q, sync ^ 1); 817 __freed_request(rl, sync ^ 1);
802} 818}
803 819
804/* 820/*
@@ -837,8 +853,8 @@ static struct io_context *rq_ioc(struct bio *bio)
837} 853}
838 854
839/** 855/**
840 * get_request - get a free request 856 * __get_request - get a free request
841 * @q: request_queue to allocate request from 857 * @rl: request list to allocate from
842 * @rw_flags: RW and SYNC flags 858 * @rw_flags: RW and SYNC flags
843 * @bio: bio to allocate request for (can be %NULL) 859 * @bio: bio to allocate request for (can be %NULL)
844 * @gfp_mask: allocation mask 860 * @gfp_mask: allocation mask
@@ -850,20 +866,16 @@ static struct io_context *rq_ioc(struct bio *bio)
850 * Returns %NULL on failure, with @q->queue_lock held. 866 * Returns %NULL on failure, with @q->queue_lock held.
851 * Returns !%NULL on success, with @q->queue_lock *not held*. 867 * Returns !%NULL on success, with @q->queue_lock *not held*.
852 */ 868 */
853static struct request *get_request(struct request_queue *q, int rw_flags, 869static struct request *__get_request(struct request_list *rl, int rw_flags,
854 struct bio *bio, gfp_t gfp_mask) 870 struct bio *bio, gfp_t gfp_mask)
855{ 871{
872 struct request_queue *q = rl->q;
856 struct request *rq; 873 struct request *rq;
857 struct request_list *rl = &q->rq; 874 struct elevator_type *et = q->elevator->type;
858 struct elevator_type *et; 875 struct io_context *ioc = rq_ioc(bio);
859 struct io_context *ioc;
860 struct io_cq *icq = NULL; 876 struct io_cq *icq = NULL;
861 const bool is_sync = rw_is_sync(rw_flags) != 0; 877 const bool is_sync = rw_is_sync(rw_flags) != 0;
862 bool retried = false;
863 int may_queue; 878 int may_queue;
864retry:
865 et = q->elevator->type;
866 ioc = rq_ioc(bio);
867 879
868 if (unlikely(blk_queue_dead(q))) 880 if (unlikely(blk_queue_dead(q)))
869 return NULL; 881 return NULL;
@@ -875,28 +887,14 @@ retry:
875 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 887 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
876 if (rl->count[is_sync]+1 >= q->nr_requests) { 888 if (rl->count[is_sync]+1 >= q->nr_requests) {
877 /* 889 /*
878 * We want ioc to record batching state. If it's
879 * not already there, creating a new one requires
880 * dropping queue_lock, which in turn requires
881 * retesting conditions to avoid queue hang.
882 */
883 if (!ioc && !retried) {
884 spin_unlock_irq(q->queue_lock);
885 create_io_context(gfp_mask, q->node);
886 spin_lock_irq(q->queue_lock);
887 retried = true;
888 goto retry;
889 }
890
891 /*
892 * The queue will fill after this allocation, so set 890 * The queue will fill after this allocation, so set
893 * it as full, and mark this process as "batching". 891 * it as full, and mark this process as "batching".
894 * This process will be allowed to complete a batch of 892 * This process will be allowed to complete a batch of
895 * requests, others will be blocked. 893 * requests, others will be blocked.
896 */ 894 */
897 if (!blk_queue_full(q, is_sync)) { 895 if (!blk_rl_full(rl, is_sync)) {
898 ioc_set_batching(q, ioc); 896 ioc_set_batching(q, ioc);
899 blk_set_queue_full(q, is_sync); 897 blk_set_rl_full(rl, is_sync);
900 } else { 898 } else {
901 if (may_queue != ELV_MQUEUE_MUST 899 if (may_queue != ELV_MQUEUE_MUST
902 && !ioc_batching(q, ioc)) { 900 && !ioc_batching(q, ioc)) {
@@ -909,7 +907,12 @@ retry:
909 } 907 }
910 } 908 }
911 } 909 }
912 blk_set_queue_congested(q, is_sync); 910 /*
911 * bdi isn't aware of blkcg yet. As all async IOs end up
912 * root blkcg anyway, just use root blkcg state.
913 */
914 if (rl == &q->root_rl)
915 blk_set_queue_congested(q, is_sync);
913 } 916 }
914 917
915 /* 918 /*
@@ -920,6 +923,7 @@ retry:
920 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 923 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
921 return NULL; 924 return NULL;
922 925
926 q->nr_rqs[is_sync]++;
923 rl->count[is_sync]++; 927 rl->count[is_sync]++;
924 rl->starved[is_sync] = 0; 928 rl->starved[is_sync] = 0;
925 929
@@ -935,7 +939,7 @@ retry:
935 */ 939 */
936 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { 940 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
937 rw_flags |= REQ_ELVPRIV; 941 rw_flags |= REQ_ELVPRIV;
938 rl->elvpriv++; 942 q->nr_rqs_elvpriv++;
939 if (et->icq_cache && ioc) 943 if (et->icq_cache && ioc)
940 icq = ioc_lookup_icq(ioc, q); 944 icq = ioc_lookup_icq(ioc, q);
941 } 945 }
@@ -945,22 +949,19 @@ retry:
945 spin_unlock_irq(q->queue_lock); 949 spin_unlock_irq(q->queue_lock);
946 950
947 /* allocate and init request */ 951 /* allocate and init request */
948 rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 952 rq = mempool_alloc(rl->rq_pool, gfp_mask);
949 if (!rq) 953 if (!rq)
950 goto fail_alloc; 954 goto fail_alloc;
951 955
952 blk_rq_init(q, rq); 956 blk_rq_init(q, rq);
957 blk_rq_set_rl(rq, rl);
953 rq->cmd_flags = rw_flags | REQ_ALLOCED; 958 rq->cmd_flags = rw_flags | REQ_ALLOCED;
954 959
955 /* init elvpriv */ 960 /* init elvpriv */
956 if (rw_flags & REQ_ELVPRIV) { 961 if (rw_flags & REQ_ELVPRIV) {
957 if (unlikely(et->icq_cache && !icq)) { 962 if (unlikely(et->icq_cache && !icq)) {
958 create_io_context(gfp_mask, q->node); 963 if (ioc)
959 ioc = rq_ioc(bio); 964 icq = ioc_create_icq(ioc, q, gfp_mask);
960 if (!ioc)
961 goto fail_elvpriv;
962
963 icq = ioc_create_icq(ioc, q, gfp_mask);
964 if (!icq) 965 if (!icq)
965 goto fail_elvpriv; 966 goto fail_elvpriv;
966 } 967 }
@@ -1000,7 +1001,7 @@ fail_elvpriv:
1000 rq->elv.icq = NULL; 1001 rq->elv.icq = NULL;
1001 1002
1002 spin_lock_irq(q->queue_lock); 1003 spin_lock_irq(q->queue_lock);
1003 rl->elvpriv--; 1004 q->nr_rqs_elvpriv--;
1004 spin_unlock_irq(q->queue_lock); 1005 spin_unlock_irq(q->queue_lock);
1005 goto out; 1006 goto out;
1006 1007
@@ -1013,7 +1014,7 @@ fail_alloc:
1013 * queue, but this is pretty rare. 1014 * queue, but this is pretty rare.
1014 */ 1015 */
1015 spin_lock_irq(q->queue_lock); 1016 spin_lock_irq(q->queue_lock);
1016 freed_request(q, rw_flags); 1017 freed_request(rl, rw_flags);
1017 1018
1018 /* 1019 /*
1019 * in the very unlikely event that allocation failed and no 1020 * in the very unlikely event that allocation failed and no
@@ -1029,56 +1030,58 @@ rq_starved:
1029} 1030}
1030 1031
1031/** 1032/**
1032 * get_request_wait - get a free request with retry 1033 * get_request - get a free request
1033 * @q: request_queue to allocate request from 1034 * @q: request_queue to allocate request from
1034 * @rw_flags: RW and SYNC flags 1035 * @rw_flags: RW and SYNC flags
1035 * @bio: bio to allocate request for (can be %NULL) 1036 * @bio: bio to allocate request for (can be %NULL)
1037 * @gfp_mask: allocation mask
1036 * 1038 *
1037 * Get a free request from @q. This function keeps retrying under memory 1039 * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this
1038 * pressure and fails iff @q is dead. 1040 * function keeps retrying under memory pressure and fails iff @q is dead.
1039 * 1041 *
1040 * Must be called with @q->queue_lock held and, 1042 * Must be called with @q->queue_lock held and,
1041 * Returns %NULL on failure, with @q->queue_lock held. 1043 * Returns %NULL on failure, with @q->queue_lock held.
1042 * Returns !%NULL on success, with @q->queue_lock *not held*. 1044 * Returns !%NULL on success, with @q->queue_lock *not held*.
1043 */ 1045 */
1044static struct request *get_request_wait(struct request_queue *q, int rw_flags, 1046static struct request *get_request(struct request_queue *q, int rw_flags,
1045 struct bio *bio) 1047 struct bio *bio, gfp_t gfp_mask)
1046{ 1048{
1047 const bool is_sync = rw_is_sync(rw_flags) != 0; 1049 const bool is_sync = rw_is_sync(rw_flags) != 0;
1050 DEFINE_WAIT(wait);
1051 struct request_list *rl;
1048 struct request *rq; 1052 struct request *rq;
1049 1053
1050 rq = get_request(q, rw_flags, bio, GFP_NOIO); 1054 rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1051 while (!rq) { 1055retry:
1052 DEFINE_WAIT(wait); 1056 rq = __get_request(rl, rw_flags, bio, gfp_mask);
1053 struct request_list *rl = &q->rq; 1057 if (rq)
1054 1058 return rq;
1055 if (unlikely(blk_queue_dead(q)))
1056 return NULL;
1057 1059
1058 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 1060 if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
1059 TASK_UNINTERRUPTIBLE); 1061 blk_put_rl(rl);
1062 return NULL;
1063 }
1060 1064
1061 trace_block_sleeprq(q, bio, rw_flags & 1); 1065 /* wait on @rl and retry */
1066 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1067 TASK_UNINTERRUPTIBLE);
1062 1068
1063 spin_unlock_irq(q->queue_lock); 1069 trace_block_sleeprq(q, bio, rw_flags & 1);
1064 io_schedule();
1065 1070
1066 /* 1071 spin_unlock_irq(q->queue_lock);
1067 * After sleeping, we become a "batching" process and 1072 io_schedule();
1068 * will be able to allocate at least one request, and
1069 * up to a big batch of them for a small period of time.
1070 * See ioc_batching, ioc_set_batching
1071 */
1072 create_io_context(GFP_NOIO, q->node);
1073 ioc_set_batching(q, current->io_context);
1074 1073
1075 spin_lock_irq(q->queue_lock); 1074 /*
1076 finish_wait(&rl->wait[is_sync], &wait); 1075 * After sleeping, we become a "batching" process and will be able
1076 * to allocate at least one request, and up to a big batch of them
1077 * for a small period of time. See ioc_batching, ioc_set_batching
1078 */
1079 ioc_set_batching(q, current->io_context);
1077 1080
1078 rq = get_request(q, rw_flags, bio, GFP_NOIO); 1081 spin_lock_irq(q->queue_lock);
1079 }; 1082 finish_wait(&rl->wait[is_sync], &wait);
1080 1083
1081 return rq; 1084 goto retry;
1082} 1085}
1083 1086
1084struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1087struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
@@ -1087,11 +1090,11 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1087 1090
1088 BUG_ON(rw != READ && rw != WRITE); 1091 BUG_ON(rw != READ && rw != WRITE);
1089 1092
1093 /* create ioc upfront */
1094 create_io_context(gfp_mask, q->node);
1095
1090 spin_lock_irq(q->queue_lock); 1096 spin_lock_irq(q->queue_lock);
1091 if (gfp_mask & __GFP_WAIT) 1097 rq = get_request(q, rw, NULL, gfp_mask);
1092 rq = get_request_wait(q, rw, NULL);
1093 else
1094 rq = get_request(q, rw, NULL, gfp_mask);
1095 if (!rq) 1098 if (!rq)
1096 spin_unlock_irq(q->queue_lock); 1099 spin_unlock_irq(q->queue_lock);
1097 /* q->queue_lock is unlocked at this point */ 1100 /* q->queue_lock is unlocked at this point */
@@ -1248,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1248 */ 1251 */
1249 if (req->cmd_flags & REQ_ALLOCED) { 1252 if (req->cmd_flags & REQ_ALLOCED) {
1250 unsigned int flags = req->cmd_flags; 1253 unsigned int flags = req->cmd_flags;
1254 struct request_list *rl = blk_rq_rl(req);
1251 1255
1252 BUG_ON(!list_empty(&req->queuelist)); 1256 BUG_ON(!list_empty(&req->queuelist));
1253 BUG_ON(!hlist_unhashed(&req->hash)); 1257 BUG_ON(!hlist_unhashed(&req->hash));
1254 1258
1255 blk_free_request(q, req); 1259 blk_free_request(rl, req);
1256 freed_request(q, flags); 1260 freed_request(rl, flags);
1261 blk_put_rl(rl);
1257 } 1262 }
1258} 1263}
1259EXPORT_SYMBOL_GPL(__blk_put_request); 1264EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1481,7 +1486,7 @@ get_rq:
1481 * Grab a free request. This might sleep but can not fail. 1486 * Grab a free request. This might sleep but can not fail.
1482 * Returns with the queue unlocked. 1487 * Returns with the queue unlocked.
1483 */ 1488 */
1484 req = get_request_wait(q, rw_flags, bio); 1489 req = get_request(q, rw_flags, bio, GFP_NOIO);
1485 if (unlikely(!req)) { 1490 if (unlikely(!req)) {
1486 bio_endio(bio, -ENODEV); /* @q is dead */ 1491 bio_endio(bio, -ENODEV); /* @q is dead */
1487 goto out_unlock; 1492 goto out_unlock;
@@ -1702,6 +1707,14 @@ generic_make_request_checks(struct bio *bio)
1702 goto end_io; 1707 goto end_io;
1703 } 1708 }
1704 1709
1710 /*
1711 * Various block parts want %current->io_context and lazy ioc
1712 * allocation ends up trading a lot of pain for a small amount of
1713 * memory. Just allocate it upfront. This may fail and block
1714 * layer knows how to live with it.
1715 */
1716 create_io_context(GFP_ATOMIC, q->node);
1717
1705 if (blk_throtl_bio(q, bio)) 1718 if (blk_throtl_bio(q, bio))
1706 return false; /* throttled, will be resubmitted later */ 1719 return false; /* throttled, will be resubmitted later */
1707 1720
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 893b8007c657..fab4cdd3f7bb 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -244,6 +244,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
244 244
245 /* initialize */ 245 /* initialize */
246 atomic_long_set(&ioc->refcount, 1); 246 atomic_long_set(&ioc->refcount, 1);
247 atomic_set(&ioc->nr_tasks, 1);
247 atomic_set(&ioc->active_ref, 1); 248 atomic_set(&ioc->active_ref, 1);
248 spin_lock_init(&ioc->lock); 249 spin_lock_init(&ioc->lock);
249 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); 250 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d3234fc494ad..565a6786032f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -143,8 +143,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
143 lim->discard_zeroes_data = 1; 143 lim->discard_zeroes_data = 1;
144 lim->max_segments = USHRT_MAX; 144 lim->max_segments = USHRT_MAX;
145 lim->max_hw_sectors = UINT_MAX; 145 lim->max_hw_sectors = UINT_MAX;
146 146 lim->max_sectors = UINT_MAX;
147 lim->max_sectors = BLK_DEF_MAX_SECTORS;
148} 147}
149EXPORT_SYMBOL(blk_set_stacking_limits); 148EXPORT_SYMBOL(blk_set_stacking_limits);
150 149
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index aa41b47c22d2..9628b291f960 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
40static ssize_t 40static ssize_t
41queue_requests_store(struct request_queue *q, const char *page, size_t count) 41queue_requests_store(struct request_queue *q, const char *page, size_t count)
42{ 42{
43 struct request_list *rl = &q->rq; 43 struct request_list *rl;
44 unsigned long nr; 44 unsigned long nr;
45 int ret; 45 int ret;
46 46
@@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
55 q->nr_requests = nr; 55 q->nr_requests = nr;
56 blk_queue_congestion_threshold(q); 56 blk_queue_congestion_threshold(q);
57 57
58 /* congestion isn't cgroup aware and follows root blkcg for now */
59 rl = &q->root_rl;
60
58 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) 61 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
59 blk_set_queue_congested(q, BLK_RW_SYNC); 62 blk_set_queue_congested(q, BLK_RW_SYNC);
60 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) 63 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
65 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) 68 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
66 blk_clear_queue_congested(q, BLK_RW_ASYNC); 69 blk_clear_queue_congested(q, BLK_RW_ASYNC);
67 70
68 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 71 blk_queue_for_each_rl(rl, q) {
69 blk_set_queue_full(q, BLK_RW_SYNC); 72 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
70 } else { 73 blk_set_rl_full(rl, BLK_RW_SYNC);
71 blk_clear_queue_full(q, BLK_RW_SYNC); 74 } else {
72 wake_up(&rl->wait[BLK_RW_SYNC]); 75 blk_clear_rl_full(rl, BLK_RW_SYNC);
76 wake_up(&rl->wait[BLK_RW_SYNC]);
77 }
78
79 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
80 blk_set_rl_full(rl, BLK_RW_ASYNC);
81 } else {
82 blk_clear_rl_full(rl, BLK_RW_ASYNC);
83 wake_up(&rl->wait[BLK_RW_ASYNC]);
84 }
73 } 85 }
74 86
75 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
76 blk_set_queue_full(q, BLK_RW_ASYNC);
77 } else {
78 blk_clear_queue_full(q, BLK_RW_ASYNC);
79 wake_up(&rl->wait[BLK_RW_ASYNC]);
80 }
81 spin_unlock_irq(q->queue_lock); 87 spin_unlock_irq(q->queue_lock);
82 return ret; 88 return ret;
83} 89}
@@ -476,7 +482,6 @@ static void blk_release_queue(struct kobject *kobj)
476{ 482{
477 struct request_queue *q = 483 struct request_queue *q =
478 container_of(kobj, struct request_queue, kobj); 484 container_of(kobj, struct request_queue, kobj);
479 struct request_list *rl = &q->rq;
480 485
481 blk_sync_queue(q); 486 blk_sync_queue(q);
482 487
@@ -489,8 +494,7 @@ static void blk_release_queue(struct kobject *kobj)
489 elevator_exit(q->elevator); 494 elevator_exit(q->elevator);
490 } 495 }
491 496
492 if (rl->rq_pool) 497 blk_exit_rl(&q->root_rl);
493 mempool_destroy(rl->rq_pool);
494 498
495 if (q->queue_tags) 499 if (q->queue_tags)
496 __blk_queue_free_tags(q); 500 __blk_queue_free_tags(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5b0659512047..e287c19908c8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1123,9 +1123,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1123 goto out; 1123 goto out;
1124 } 1124 }
1125 1125
1126 /* bio_associate_current() needs ioc, try creating */
1127 create_io_context(GFP_ATOMIC, q->node);
1128
1129 /* 1126 /*
1130 * A throtl_grp pointer retrieved under rcu can be used to access 1127 * A throtl_grp pointer retrieved under rcu can be used to access
1131 * basic fields like stats and io rates. If a group has no rules, 1128 * basic fields like stats and io rates. If a group has no rules,
diff --git a/block/blk.h b/block/blk.h
index 85f6ae42f7d3..2a0ea32d249f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,6 +18,9 @@ static inline void __blk_get_queue(struct request_queue *q)
18 kobject_get(&q->kobj); 18 kobject_get(&q->kobj);
19} 19}
20 20
21int blk_init_rl(struct request_list *rl, struct request_queue *q,
22 gfp_t gfp_mask);
23void blk_exit_rl(struct request_list *rl);
21void init_request_from_bio(struct request *req, struct bio *bio); 24void init_request_from_bio(struct request *req, struct bio *bio);
22void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 25void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
23 struct bio *bio); 26 struct bio *bio);
@@ -33,7 +36,6 @@ bool __blk_end_bidi_request(struct request *rq, int error,
33void blk_rq_timed_out_timer(unsigned long data); 36void blk_rq_timed_out_timer(unsigned long data);
34void blk_delete_timer(struct request *); 37void blk_delete_timer(struct request *);
35void blk_add_timer(struct request *); 38void blk_add_timer(struct request *);
36void __generic_unplug_device(struct request_queue *);
37 39
38/* 40/*
39 * Internal atomic flags for request handling 41 * Internal atomic flags for request handling
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 7ad49c88f6b1..deee61fbb741 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -243,56 +243,3 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
243 return 0; 243 return 0;
244} 244}
245EXPORT_SYMBOL_GPL(bsg_setup_queue); 245EXPORT_SYMBOL_GPL(bsg_setup_queue);
246
247/**
248 * bsg_remove_queue - Deletes the bsg dev from the q
249 * @q: the request_queue that is to be torn down.
250 *
251 * Notes:
252 * Before unregistering the queue empty any requests that are blocked
253 */
254void bsg_remove_queue(struct request_queue *q)
255{
256 struct request *req; /* block request */
257 int counts; /* totals for request_list count and starved */
258
259 if (!q)
260 return;
261
262 /* Stop taking in new requests */
263 spin_lock_irq(q->queue_lock);
264 blk_stop_queue(q);
265
266 /* drain all requests in the queue */
267 while (1) {
268 /* need the lock to fetch a request
270 * this may fetch the same request as the previous pass
270 */
271 req = blk_fetch_request(q);
272 /* save requests in use and starved */
273 counts = q->rq.count[0] + q->rq.count[1] +
274 q->rq.starved[0] + q->rq.starved[1];
275 spin_unlock_irq(q->queue_lock);
276 /* any requests still outstanding? */
277 if (counts == 0)
278 break;
279
280 /* This may be the same req as the previous iteration,
281 * always send the blk_end_request_all after a prefetch.
282 * It is not okay to not end the request because the
283 * prefetch started the request.
284 */
285 if (req) {
286 /* return -ENXIO to indicate that this queue is
287 * going away
288 */
289 req->errors = -ENXIO;
290 blk_end_request_all(req, -ENXIO);
291 }
292
293 msleep(200); /* allow bsg to possibly finish */
294 spin_lock_irq(q->queue_lock);
295 }
296 bsg_unregister_queue(q);
297}
298EXPORT_SYMBOL_GPL(bsg_remove_queue);
diff --git a/block/genhd.c b/block/genhd.c
index 9cf5583c90ff..cac7366957c3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
154 part = rcu_dereference(ptbl->part[piter->idx]); 154 part = rcu_dereference(ptbl->part[piter->idx]);
155 if (!part) 155 if (!part)
156 continue; 156 continue;
157 if (!part->nr_sects && 157 if (!part_nr_sects_read(part) &&
158 !(piter->flags & DISK_PITER_INCL_EMPTY) && 158 !(piter->flags & DISK_PITER_INCL_EMPTY) &&
159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && 159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
160 piter->idx == 0)) 160 piter->idx == 0))
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
191static inline int sector_in_part(struct hd_struct *part, sector_t sector) 191static inline int sector_in_part(struct hd_struct *part, sector_t sector)
192{ 192{
193 return part->start_sect <= sector && 193 return part->start_sect <= sector &&
194 sector < part->start_sect + part->nr_sects; 194 sector < part->start_sect + part_nr_sects_read(part);
195} 195}
196 196
197/** 197/**
@@ -769,8 +769,8 @@ void __init printk_all_partitions(void)
769 769
770 printk("%s%s %10llu %s %s", is_part0 ? "" : " ", 770 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
771 bdevt_str(part_devt(part), devt_buf), 771 bdevt_str(part_devt(part), devt_buf),
772 (unsigned long long)part->nr_sects >> 1, 772 (unsigned long long)part_nr_sects_read(part) >> 1
773 disk_name(disk, part->partno, name_buf), 773 , disk_name(disk, part->partno, name_buf),
774 uuid_buf); 774 uuid_buf);
775 if (is_part0) { 775 if (is_part0) {
776 if (disk->driverfs_dev != NULL && 776 if (disk->driverfs_dev != NULL &&
@@ -862,7 +862,7 @@ static int show_partition(struct seq_file *seqf, void *v)
862 while ((part = disk_part_iter_next(&piter))) 862 while ((part = disk_part_iter_next(&piter)))
863 seq_printf(seqf, "%4d %7d %10llu %s\n", 863 seq_printf(seqf, "%4d %7d %10llu %s\n",
864 MAJOR(part_devt(part)), MINOR(part_devt(part)), 864 MAJOR(part_devt(part)), MINOR(part_devt(part)),
865 (unsigned long long)part->nr_sects >> 1, 865 (unsigned long long)part_nr_sects_read(part) >> 1,
866 disk_name(sgp, part->partno, buf)); 866 disk_name(sgp, part->partno, buf));
867 disk_part_iter_exit(&piter); 867 disk_part_iter_exit(&piter);
868 868
@@ -1268,6 +1268,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
1268 } 1268 }
1269 disk->part_tbl->part[0] = &disk->part0; 1269 disk->part_tbl->part[0] = &disk->part0;
1270 1270
1271 /*
1272 * set_capacity() and get_capacity() currently don't use
1273 * seqcounter to read/update the part0->nr_sects. Still init
1274 * the counter as we can read the sectors in IO submission
1275 * path using sequence counters.
1276 *
1277 * TODO: Ideally set_capacity() and get_capacity() should be
1278 * converted to make use of bd_mutex and sequence counters.
1279 */
1280 seqcount_init(&disk->part0.nr_sects_seq);
1271 hd_ref_init(&disk->part0); 1281 hd_ref_init(&disk->part0);
1272 1282
1273 disk->minors = minors; 1283 disk->minors = minors;
diff --git a/block/ioctl.c b/block/ioctl.c
index ba15b2dbfb98..4476e0e85d16 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
13{ 13{
14 struct block_device *bdevp; 14 struct block_device *bdevp;
15 struct gendisk *disk; 15 struct gendisk *disk;
16 struct hd_struct *part; 16 struct hd_struct *part, *lpart;
17 struct blkpg_ioctl_arg a; 17 struct blkpg_ioctl_arg a;
18 struct blkpg_partition p; 18 struct blkpg_partition p;
19 struct disk_part_iter piter; 19 struct disk_part_iter piter;
@@ -36,8 +36,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
36 case BLKPG_ADD_PARTITION: 36 case BLKPG_ADD_PARTITION:
37 start = p.start >> 9; 37 start = p.start >> 9;
38 length = p.length >> 9; 38 length = p.length >> 9;
39 /* check for fit in a hd_struct */ 39 /* check for fit in a hd_struct */
40 if (sizeof(sector_t) == sizeof(long) && 40 if (sizeof(sector_t) == sizeof(long) &&
41 sizeof(long long) > sizeof(long)) { 41 sizeof(long long) > sizeof(long)) {
42 long pstart = start, plength = length; 42 long pstart = start, plength = length;
43 if (pstart != start || plength != length 43 if (pstart != start || plength != length
@@ -92,6 +92,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
92 bdput(bdevp); 92 bdput(bdevp);
93 93
94 return 0; 94 return 0;
95 case BLKPG_RESIZE_PARTITION:
96 start = p.start >> 9;
97 /* new length of partition, converted from bytes to 512-byte sectors */
98 length = p.length >> 9;
99 /* check for fit in a hd_struct */
100 if (sizeof(sector_t) == sizeof(long) &&
101 sizeof(long long) > sizeof(long)) {
102 long pstart = start, plength = length;
103 if (pstart != start || plength != length
104 || pstart < 0 || plength < 0)
105 return -EINVAL;
106 }
107 part = disk_get_part(disk, partno);
108 if (!part)
109 return -ENXIO;
110 bdevp = bdget(part_devt(part));
111 if (!bdevp) {
112 disk_put_part(part);
113 return -ENOMEM;
114 }
115 mutex_lock(&bdevp->bd_mutex);
116 mutex_lock_nested(&bdev->bd_mutex, 1);
117 if (start != part->start_sect) {
118 mutex_unlock(&bdevp->bd_mutex);
119 mutex_unlock(&bdev->bd_mutex);
120 bdput(bdevp);
121 disk_put_part(part);
122 return -EINVAL;
123 }
124 /* overlap? */
125 disk_part_iter_init(&piter, disk,
126 DISK_PITER_INCL_EMPTY);
127 while ((lpart = disk_part_iter_next(&piter))) {
128 if (lpart->partno != partno &&
129 !(start + length <= lpart->start_sect ||
130 start >= lpart->start_sect + lpart->nr_sects)
131 ) {
132 disk_part_iter_exit(&piter);
133 mutex_unlock(&bdevp->bd_mutex);
134 mutex_unlock(&bdev->bd_mutex);
135 bdput(bdevp);
136 disk_put_part(part);
137 return -EBUSY;
138 }
139 }
140 disk_part_iter_exit(&piter);
141 part_nr_sects_write(part, (sector_t)length);
142 i_size_write(bdevp->bd_inode, p.length);
143 mutex_unlock(&bdevp->bd_mutex);
144 mutex_unlock(&bdev->bd_mutex);
145 bdput(bdevp);
146 disk_put_part(part);
147 return 0;
95 default: 148 default:
96 return -EINVAL; 149 return -EINVAL;
97 } 150 }
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 6df5d6928a44..f1d14519cc04 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -84,7 +84,7 @@ ssize_t part_size_show(struct device *dev,
84 struct device_attribute *attr, char *buf) 84 struct device_attribute *attr, char *buf)
85{ 85{
86 struct hd_struct *p = dev_to_part(dev); 86 struct hd_struct *p = dev_to_part(dev);
87 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 87 return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
88} 88}
89 89
90static ssize_t part_ro_show(struct device *dev, 90static ssize_t part_ro_show(struct device *dev,
@@ -294,6 +294,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
294 err = -ENOMEM; 294 err = -ENOMEM;
295 goto out_free; 295 goto out_free;
296 } 296 }
297
298 seqcount_init(&p->nr_sects_seq);
297 pdev = part_to_dev(p); 299 pdev = part_to_dev(p);
298 300
299 p->start_sect = start; 301 p->start_sect = start;