author		Linus Torvalds <torvalds@linux-foundation.org>	2012-08-01 12:02:41 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-08-01 12:02:41 -0400
commit		8cf1a3fce0b95050b63d451c9d561da0da2aa4d6 (patch)
tree		0dc7f93474c3be601a5893900db1418dfd60ba5d
parent		fcff06c438b60f415af5983efe92811d6aa02ad1 (diff)
parent		80799fbb7d10c30df78015b3fa21f7ffcfc0eb2c (diff)
Merge branch 'for-3.6/core' of git://git.kernel.dk/linux-block

Pull core block IO bits from Jens Axboe:
 "The most complicated part of this is the request allocation rework by
  Tejun, which has been queued up for a long time and has been in
  for-next ditto as well.

  There are a few commits from yesterday and today, mostly trivial and
  obvious fixes. So I'm pretty confident that it is sound. It's also
  smaller than usual."

* 'for-3.6/core' of git://git.kernel.dk/linux-block:
  block: remove dead func declaration
  block: add partition resize function to blkpg ioctl
  block: uninitialized ioc->nr_tasks triggers WARN_ON
  block: do not artificially constrain max_sectors for stacking drivers
  blkcg: implement per-blkg request allocation
  block: prepare for multiple request_lists
  block: add q->nr_rqs[] and move q->rq.elvpriv to q->nr_rqs_elvpriv
  blkcg: inline bio_blkcg() and friends
  block: allocate io_context upfront
  block: refactor get_request[_wait]()
  block: drop custom queue draining used by scsi_transport_{iscsi|fc}
  mempool: add @gfp_mask to mempool_create_node()
  blkcg: make root blkcg allocation use %GFP_KERNEL
  blkcg: __blkg_lookup_create() doesn't need radix preload
-rw-r--r--  Documentation/block/queue-sysfs.txt  7
-rw-r--r--  block/blk-cgroup.c  139
-rw-r--r--  block/blk-cgroup.h  128
-rw-r--r--  block/blk-core.c  209
-rw-r--r--  block/blk-ioc.c  1
-rw-r--r--  block/blk-settings.c  3
-rw-r--r--  block/blk-sysfs.c  34
-rw-r--r--  block/blk-throttle.c  3
-rw-r--r--  block/blk.h  4
-rw-r--r--  block/bsg-lib.c  53
-rw-r--r--  block/genhd.c  20
-rw-r--r--  block/ioctl.c  59
-rw-r--r--  block/partition-generic.c  4
-rw-r--r--  drivers/scsi/scsi_transport_fc.c  38
-rw-r--r--  drivers/scsi/scsi_transport_iscsi.c  2
-rw-r--r--  include/linux/blkdev.h  53
-rw-r--r--  include/linux/blkpg.h  1
-rw-r--r--  include/linux/bsg-lib.h  1
-rw-r--r--  include/linux/genhd.h  57
-rw-r--r--  include/linux/mempool.h  3
-rw-r--r--  mm/mempool.c  12
21 files changed, 530 insertions, 301 deletions
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index d8147b336c35..6518a55273e7 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice
38this amount, since it applies only to reads or writes (not the accumulated 38this amount, since it applies only to reads or writes (not the accumulated
39sum). 39sum).
40 40
41To avoid priority inversion through request starvation, a request
42queue maintains a separate request pool per each cgroup when
43CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
44per-block-cgroup request pool. IOW, if there are N block cgroups,
45each request queue may have upto N request pools, each independently
46regulated by nr_requests.
47
41read_ahead_kb (RW) 48read_ahead_kb (RW)
42------------------ 49------------------
43Maximum number of kilobytes to read-ahead for filesystems on this block 50Maximum number of kilobytes to read-ahead for filesystems on this block
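
As context for the documentation change above: each block cgroup now gets its own request pool, so a single queue can hold several request_lists, each capped at nr_requests. The sketch below is a hypothetical debug helper (not part of this pull) that walks those pools via the blk_queue_for_each_rl() iterator introduced in the block/blk-cgroup.h hunk further down; it assumes it is built inside block/ so the private header is visible, and it takes queue_lock as the iterator requires.

#include <linux/blkdev.h>
#include "blk-cgroup.h"

/* hypothetical helper: report how full each per-blkcg request pool is */
static void dump_request_pools(struct request_queue *q)
{
	struct request_list *rl;
	int i = 0;

	/* blk_queue_for_each_rl() must be used under queue_lock */
	spin_lock_irq(q->queue_lock);
	blk_queue_for_each_rl(rl, q)
		pr_info("pool %d: sync=%d async=%d, limit=%lu\n", i++,
			rl->count[BLK_RW_SYNC], rl->count[BLK_RW_ASYNC],
			q->nr_requests);
	spin_unlock_irq(q->queue_lock);
}
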
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7dee617358e..f3b44a65fc7a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -31,27 +31,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
31 31
32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
33 33
34struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
35{
36 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
37 struct blkcg, css);
38}
39EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
40
41static struct blkcg *task_blkcg(struct task_struct *tsk)
42{
43 return container_of(task_subsys_state(tsk, blkio_subsys_id),
44 struct blkcg, css);
45}
46
47struct blkcg *bio_blkcg(struct bio *bio)
48{
49 if (bio && bio->bi_css)
50 return container_of(bio->bi_css, struct blkcg, css);
51 return task_blkcg(current);
52}
53EXPORT_SYMBOL_GPL(bio_blkcg);
54
55static bool blkcg_policy_enabled(struct request_queue *q, 34static bool blkcg_policy_enabled(struct request_queue *q,
56 const struct blkcg_policy *pol) 35 const struct blkcg_policy *pol)
57{ 36{
@@ -84,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg)
84 kfree(pd); 63 kfree(pd);
85 } 64 }
86 65
66 blk_exit_rl(&blkg->rl);
87 kfree(blkg); 67 kfree(blkg);
88} 68}
89 69
@@ -91,16 +71,18 @@ static void blkg_free(struct blkcg_gq *blkg)
91 * blkg_alloc - allocate a blkg 71 * blkg_alloc - allocate a blkg
92 * @blkcg: block cgroup the new blkg is associated with 72 * @blkcg: block cgroup the new blkg is associated with
93 * @q: request_queue the new blkg is associated with 73 * @q: request_queue the new blkg is associated with
74 * @gfp_mask: allocation mask to use
94 * 75 *
95 * Allocate a new blkg assocating @blkcg and @q. 76 * Allocate a new blkg assocating @blkcg and @q.
96 */ 77 */
97static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) 78static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
79 gfp_t gfp_mask)
98{ 80{
99 struct blkcg_gq *blkg; 81 struct blkcg_gq *blkg;
100 int i; 82 int i;
101 83
102 /* alloc and init base part */ 84 /* alloc and init base part */
103 blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); 85 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
104 if (!blkg) 86 if (!blkg)
105 return NULL; 87 return NULL;
106 88
@@ -109,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
109 blkg->blkcg = blkcg; 91 blkg->blkcg = blkcg;
110 blkg->refcnt = 1; 92 blkg->refcnt = 1;
111 93
94 /* root blkg uses @q->root_rl, init rl only for !root blkgs */
95 if (blkcg != &blkcg_root) {
96 if (blk_init_rl(&blkg->rl, q, gfp_mask))
97 goto err_free;
98 blkg->rl.blkg = blkg;
99 }
100
112 for (i = 0; i < BLKCG_MAX_POLS; i++) { 101 for (i = 0; i < BLKCG_MAX_POLS; i++) {
113 struct blkcg_policy *pol = blkcg_policy[i]; 102 struct blkcg_policy *pol = blkcg_policy[i];
114 struct blkg_policy_data *pd; 103 struct blkg_policy_data *pd;
@@ -117,11 +106,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
117 continue; 106 continue;
118 107
119 /* alloc per-policy data and attach it to blkg */ 108 /* alloc per-policy data and attach it to blkg */
120 pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); 109 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
121 if (!pd) { 110 if (!pd)
122 blkg_free(blkg); 111 goto err_free;
123 return NULL;
124 }
125 112
126 blkg->pd[i] = pd; 113 blkg->pd[i] = pd;
127 pd->blkg = blkg; 114 pd->blkg = blkg;
@@ -132,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
132 } 119 }
133 120
134 return blkg; 121 return blkg;
122
123err_free:
124 blkg_free(blkg);
125 return NULL;
135} 126}
136 127
137static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 128static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
@@ -175,9 +166,13 @@ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
175} 166}
176EXPORT_SYMBOL_GPL(blkg_lookup); 167EXPORT_SYMBOL_GPL(blkg_lookup);
177 168
169/*
170 * If @new_blkg is %NULL, this function tries to allocate a new one as
171 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return.
172 */
178static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, 173static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
179 struct request_queue *q) 174 struct request_queue *q,
180 __releases(q->queue_lock) __acquires(q->queue_lock) 175 struct blkcg_gq *new_blkg)
181{ 176{
182 struct blkcg_gq *blkg; 177 struct blkcg_gq *blkg;
183 int ret; 178 int ret;
@@ -189,24 +184,26 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
189 blkg = __blkg_lookup(blkcg, q); 184 blkg = __blkg_lookup(blkcg, q);
190 if (blkg) { 185 if (blkg) {
191 rcu_assign_pointer(blkcg->blkg_hint, blkg); 186 rcu_assign_pointer(blkcg->blkg_hint, blkg);
192 return blkg; 187 goto out_free;
193 } 188 }
194 189
195 /* blkg holds a reference to blkcg */ 190 /* blkg holds a reference to blkcg */
196 if (!css_tryget(&blkcg->css)) 191 if (!css_tryget(&blkcg->css)) {
197 return ERR_PTR(-EINVAL); 192 blkg = ERR_PTR(-EINVAL);
193 goto out_free;
194 }
198 195
199 /* allocate */ 196 /* allocate */
200 ret = -ENOMEM; 197 if (!new_blkg) {
201 blkg = blkg_alloc(blkcg, q); 198 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
202 if (unlikely(!blkg)) 199 if (unlikely(!new_blkg)) {
203 goto err_put; 200 blkg = ERR_PTR(-ENOMEM);
201 goto out_put;
202 }
203 }
204 blkg = new_blkg;
204 205
205 /* insert */ 206 /* insert */
206 ret = radix_tree_preload(GFP_ATOMIC);
207 if (ret)
208 goto err_free;
209
210 spin_lock(&blkcg->lock); 207 spin_lock(&blkcg->lock);
211 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 208 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
212 if (likely(!ret)) { 209 if (likely(!ret)) {
@@ -215,15 +212,15 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
215 } 212 }
216 spin_unlock(&blkcg->lock); 213 spin_unlock(&blkcg->lock);
217 214
218 radix_tree_preload_end();
219
220 if (!ret) 215 if (!ret)
221 return blkg; 216 return blkg;
222err_free: 217
223 blkg_free(blkg); 218 blkg = ERR_PTR(ret);
224err_put: 219out_put:
225 css_put(&blkcg->css); 220 css_put(&blkcg->css);
226 return ERR_PTR(ret); 221out_free:
222 blkg_free(new_blkg);
223 return blkg;
227} 224}
228 225
229struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 226struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
@@ -235,7 +232,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
235 */ 232 */
236 if (unlikely(blk_queue_bypass(q))) 233 if (unlikely(blk_queue_bypass(q)))
237 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); 234 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
238 return __blkg_lookup_create(blkcg, q); 235 return __blkg_lookup_create(blkcg, q, NULL);
239} 236}
240EXPORT_SYMBOL_GPL(blkg_lookup_create); 237EXPORT_SYMBOL_GPL(blkg_lookup_create);
241 238
@@ -313,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg)
313} 310}
314EXPORT_SYMBOL_GPL(__blkg_release); 311EXPORT_SYMBOL_GPL(__blkg_release);
315 312
313/*
314 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
315 * because the root blkg uses @q->root_rl instead of its own rl.
316 */
317struct request_list *__blk_queue_next_rl(struct request_list *rl,
318 struct request_queue *q)
319{
320 struct list_head *ent;
321 struct blkcg_gq *blkg;
322
323 /*
324 * Determine the current blkg list_head. The first entry is
325 * root_rl which is off @q->blkg_list and mapped to the head.
326 */
327 if (rl == &q->root_rl) {
328 ent = &q->blkg_list;
329 } else {
330 blkg = container_of(rl, struct blkcg_gq, rl);
331 ent = &blkg->q_node;
332 }
333
334 /* walk to the next list_head, skip root blkcg */
335 ent = ent->next;
336 if (ent == &q->root_blkg->q_node)
337 ent = ent->next;
338 if (ent == &q->blkg_list)
339 return NULL;
340
341 blkg = container_of(ent, struct blkcg_gq, q_node);
342 return &blkg->rl;
343}
344
316static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, 345static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
317 u64 val) 346 u64 val)
318{ 347{
@@ -734,24 +763,36 @@ int blkcg_activate_policy(struct request_queue *q,
734 struct blkcg_gq *blkg; 763 struct blkcg_gq *blkg;
735 struct blkg_policy_data *pd, *n; 764 struct blkg_policy_data *pd, *n;
736 int cnt = 0, ret; 765 int cnt = 0, ret;
766 bool preloaded;
737 767
738 if (blkcg_policy_enabled(q, pol)) 768 if (blkcg_policy_enabled(q, pol))
739 return 0; 769 return 0;
740 770
771 /* preallocations for root blkg */
772 blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
773 if (!blkg)
774 return -ENOMEM;
775
776 preloaded = !radix_tree_preload(GFP_KERNEL);
777
741 blk_queue_bypass_start(q); 778 blk_queue_bypass_start(q);
742 779
743 /* make sure the root blkg exists and count the existing blkgs */ 780 /* make sure the root blkg exists and count the existing blkgs */
744 spin_lock_irq(q->queue_lock); 781 spin_lock_irq(q->queue_lock);
745 782
746 rcu_read_lock(); 783 rcu_read_lock();
747 blkg = __blkg_lookup_create(&blkcg_root, q); 784 blkg = __blkg_lookup_create(&blkcg_root, q, blkg);
748 rcu_read_unlock(); 785 rcu_read_unlock();
749 786
787 if (preloaded)
788 radix_tree_preload_end();
789
750 if (IS_ERR(blkg)) { 790 if (IS_ERR(blkg)) {
751 ret = PTR_ERR(blkg); 791 ret = PTR_ERR(blkg);
752 goto out_unlock; 792 goto out_unlock;
753 } 793 }
754 q->root_blkg = blkg; 794 q->root_blkg = blkg;
795 q->root_rl.blkg = blkg;
755 796
756 list_for_each_entry(blkg, &q->blkg_list, q_node) 797 list_for_each_entry(blkg, &q->blkg_list, q_node)
757 cnt++; 798 cnt++;
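
The blkcg_activate_policy() change above moves the sleepable work, allocating the root blkg with GFP_KERNEL and calling radix_tree_preload(), out from under queue_lock, which is why __blkg_lookup_create() no longer needs its own GFP_ATOMIC preload. A condensed, illustrative sketch of that preload pattern (the function name and the stripped-down error handling are mine, not from the series):

#include <linux/radix-tree.h>
#include "blk-cgroup.h"

static int example_insert_blkg(struct request_queue *q, struct blkcg *blkcg,
			       struct blkcg_gq *blkg)
{
	bool preloaded;
	int ret;

	/* sleepable node preallocation, done before taking the lock */
	preloaded = !radix_tree_preload(GFP_KERNEL);

	spin_lock_irq(q->queue_lock);
	/* the insert itself can now run in atomic context without allocating */
	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
	spin_unlock_irq(q->queue_lock);

	/* a successful preload leaves preemption disabled; re-enable it */
	if (preloaded)
		radix_tree_preload_end();

	return ret;
}
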
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ac457ce7783..24597309e23d 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -17,6 +17,7 @@
17#include <linux/u64_stats_sync.h> 17#include <linux/u64_stats_sync.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <linux/radix-tree.h> 19#include <linux/radix-tree.h>
20#include <linux/blkdev.h>
20 21
21/* Max limits for throttle policy */ 22/* Max limits for throttle policy */
22#define THROTL_IOPS_MAX UINT_MAX 23#define THROTL_IOPS_MAX UINT_MAX
@@ -93,6 +94,8 @@ struct blkcg_gq {
93 struct list_head q_node; 94 struct list_head q_node;
94 struct hlist_node blkcg_node; 95 struct hlist_node blkcg_node;
95 struct blkcg *blkcg; 96 struct blkcg *blkcg;
97 /* request allocation list for this blkcg-q pair */
98 struct request_list rl;
96 /* reference count */ 99 /* reference count */
97 int refcnt; 100 int refcnt;
98 101
@@ -120,8 +123,6 @@ struct blkcg_policy {
120 123
121extern struct blkcg blkcg_root; 124extern struct blkcg blkcg_root;
122 125
123struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
124struct blkcg *bio_blkcg(struct bio *bio);
125struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); 126struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
126struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 127struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
127 struct request_queue *q); 128 struct request_queue *q);
@@ -160,6 +161,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
160void blkg_conf_finish(struct blkg_conf_ctx *ctx); 161void blkg_conf_finish(struct blkg_conf_ctx *ctx);
161 162
162 163
164static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
165{
166 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
167 struct blkcg, css);
168}
169
170static inline struct blkcg *task_blkcg(struct task_struct *tsk)
171{
172 return container_of(task_subsys_state(tsk, blkio_subsys_id),
173 struct blkcg, css);
174}
175
176static inline struct blkcg *bio_blkcg(struct bio *bio)
177{
178 if (bio && bio->bi_css)
179 return container_of(bio->bi_css, struct blkcg, css);
180 return task_blkcg(current);
181}
182
163/** 183/**
164 * blkg_to_pdata - get policy private data 184 * blkg_to_pdata - get policy private data
165 * @blkg: blkg of interest 185 * @blkg: blkg of interest
@@ -234,6 +254,95 @@ static inline void blkg_put(struct blkcg_gq *blkg)
234} 254}
235 255
236/** 256/**
257 * blk_get_rl - get request_list to use
258 * @q: request_queue of interest
259 * @bio: bio which will be attached to the allocated request (may be %NULL)
260 *
261 * The caller wants to allocate a request from @q to use for @bio. Find
262 * the request_list to use and obtain a reference on it. Should be called
263 * under queue_lock. This function is guaranteed to return non-%NULL
264 * request_list.
265 */
266static inline struct request_list *blk_get_rl(struct request_queue *q,
267 struct bio *bio)
268{
269 struct blkcg *blkcg;
270 struct blkcg_gq *blkg;
271
272 rcu_read_lock();
273
274 blkcg = bio_blkcg(bio);
275
276 /* bypass blkg lookup and use @q->root_rl directly for root */
277 if (blkcg == &blkcg_root)
278 goto root_rl;
279
280 /*
281 * Try to use blkg->rl. blkg lookup may fail under memory pressure
282 * or if either the blkcg or queue is going away. Fall back to
283 * root_rl in such cases.
284 */
285 blkg = blkg_lookup_create(blkcg, q);
286 if (unlikely(IS_ERR(blkg)))
287 goto root_rl;
288
289 blkg_get(blkg);
290 rcu_read_unlock();
291 return &blkg->rl;
292root_rl:
293 rcu_read_unlock();
294 return &q->root_rl;
295}
296
297/**
298 * blk_put_rl - put request_list
299 * @rl: request_list to put
300 *
301 * Put the reference acquired by blk_get_rl(). Should be called under
302 * queue_lock.
303 */
304static inline void blk_put_rl(struct request_list *rl)
305{
306 /* root_rl may not have blkg set */
307 if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
308 blkg_put(rl->blkg);
309}
310
311/**
312 * blk_rq_set_rl - associate a request with a request_list
313 * @rq: request of interest
314 * @rl: target request_list
315 *
316 * Associate @rq with @rl so that accounting and freeing can know the
317 * request_list @rq came from.
318 */
319static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
320{
321 rq->rl = rl;
322}
323
324/**
325 * blk_rq_rl - return the request_list a request came from
326 * @rq: request of interest
327 *
328 * Return the request_list @rq is allocated from.
329 */
330static inline struct request_list *blk_rq_rl(struct request *rq)
331{
332 return rq->rl;
333}
334
335struct request_list *__blk_queue_next_rl(struct request_list *rl,
336 struct request_queue *q);
337/**
338 * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
339 *
340 * Should be used under queue_lock.
341 */
342#define blk_queue_for_each_rl(rl, q) \
343 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
344
345/**
237 * blkg_stat_add - add a value to a blkg_stat 346 * blkg_stat_add - add a value to a blkg_stat
238 * @stat: target blkg_stat 347 * @stat: target blkg_stat
239 * @val: value to add 348 * @val: value to add
@@ -351,6 +460,7 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
351#else /* CONFIG_BLK_CGROUP */ 460#else /* CONFIG_BLK_CGROUP */
352 461
353struct cgroup; 462struct cgroup;
463struct blkcg;
354 464
355struct blkg_policy_data { 465struct blkg_policy_data {
356}; 466};
@@ -361,8 +471,6 @@ struct blkcg_gq {
361struct blkcg_policy { 471struct blkcg_policy {
362}; 472};
363 473
364static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
365static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
366static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 474static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
367static inline int blkcg_init_queue(struct request_queue *q) { return 0; } 475static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
368static inline void blkcg_drain_queue(struct request_queue *q) { } 476static inline void blkcg_drain_queue(struct request_queue *q) { }
@@ -374,6 +482,9 @@ static inline int blkcg_activate_policy(struct request_queue *q,
374static inline void blkcg_deactivate_policy(struct request_queue *q, 482static inline void blkcg_deactivate_policy(struct request_queue *q,
375 const struct blkcg_policy *pol) { } 483 const struct blkcg_policy *pol) { }
376 484
485static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
486static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
487
377static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 488static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
378 struct blkcg_policy *pol) { return NULL; } 489 struct blkcg_policy *pol) { return NULL; }
379static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } 490static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
@@ -381,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
381static inline void blkg_get(struct blkcg_gq *blkg) { } 492static inline void blkg_get(struct blkcg_gq *blkg) { }
382static inline void blkg_put(struct blkcg_gq *blkg) { } 493static inline void blkg_put(struct blkcg_gq *blkg) { }
383 494
495static inline struct request_list *blk_get_rl(struct request_queue *q,
496 struct bio *bio) { return &q->root_rl; }
497static inline void blk_put_rl(struct request_list *rl) { }
498static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
499static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
500
501#define blk_queue_for_each_rl(rl, q) \
502 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
503
384#endif /* CONFIG_BLK_CGROUP */ 504#endif /* CONFIG_BLK_CGROUP */
385#endif /* _BLK_CGROUP_H */ 505#endif /* _BLK_CGROUP_H */
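
The request_list helpers added above form a small lifecycle: blk_get_rl() picks the right pool (root_rl or blkg->rl) and takes a reference, blk_rq_set_rl() records the origin in the request, and blk_rq_rl()/blk_put_rl() let the free path return the request to the same pool and drop the reference. The following is a minimal sketch of how they pair up, loosely mirroring __get_request() and __blk_put_request() in the blk-core.c hunk that follows; the function names are hypothetical, and congestion, batching, nr_rqs accounting and the queue_lock drop/reacquire the real code performs around the mempool allocation are all elided.

#include <linux/blkdev.h>
#include "blk-cgroup.h"

static struct request *sketch_alloc_request(struct request_queue *q,
					    struct bio *bio, gfp_t gfp_mask)
{
	struct request_list *rl = blk_get_rl(q, bio);	/* ref on the pool */
	struct request *rq = mempool_alloc(rl->rq_pool, gfp_mask);

	if (!rq) {
		blk_put_rl(rl);		/* allocation failed, drop the ref */
		return NULL;
	}

	blk_rq_init(q, rq);
	blk_rq_set_rl(rq, rl);		/* remember the pool for the free path */
	return rq;
}

static void sketch_free_request(struct request *rq)
{
	struct request_list *rl = blk_rq_rl(rq);	/* look up origin first */

	mempool_free(rq, rl->rq_pool);
	blk_put_rl(rl);			/* pairs with blk_get_rl() */
}
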
diff --git a/block/blk-core.c b/block/blk-core.c
index 93eb3e4f88ce..dd134d834d58 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -387,7 +387,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
387 if (!list_empty(&q->queue_head) && q->request_fn) 387 if (!list_empty(&q->queue_head) && q->request_fn)
388 __blk_run_queue(q); 388 __blk_run_queue(q);
389 389
390 drain |= q->rq.elvpriv; 390 drain |= q->nr_rqs_elvpriv;
391 391
392 /* 392 /*
393 * Unfortunately, requests are queued at and tracked from 393 * Unfortunately, requests are queued at and tracked from
@@ -397,7 +397,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
397 if (drain_all) { 397 if (drain_all) {
398 drain |= !list_empty(&q->queue_head); 398 drain |= !list_empty(&q->queue_head);
399 for (i = 0; i < 2; i++) { 399 for (i = 0; i < 2; i++) {
400 drain |= q->rq.count[i]; 400 drain |= q->nr_rqs[i];
401 drain |= q->in_flight[i]; 401 drain |= q->in_flight[i];
402 drain |= !list_empty(&q->flush_queue[i]); 402 drain |= !list_empty(&q->flush_queue[i]);
403 } 403 }
@@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
416 * left with hung waiters. We need to wake up those waiters. 416 * left with hung waiters. We need to wake up those waiters.
417 */ 417 */
418 if (q->request_fn) { 418 if (q->request_fn) {
419 struct request_list *rl;
420
419 spin_lock_irq(q->queue_lock); 421 spin_lock_irq(q->queue_lock);
420 for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) 422
421 wake_up_all(&q->rq.wait[i]); 423 blk_queue_for_each_rl(rl, q)
424 for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
425 wake_up_all(&rl->wait[i]);
426
422 spin_unlock_irq(q->queue_lock); 427 spin_unlock_irq(q->queue_lock);
423 } 428 }
424} 429}
@@ -517,28 +522,33 @@ void blk_cleanup_queue(struct request_queue *q)
517} 522}
518EXPORT_SYMBOL(blk_cleanup_queue); 523EXPORT_SYMBOL(blk_cleanup_queue);
519 524
520static int blk_init_free_list(struct request_queue *q) 525int blk_init_rl(struct request_list *rl, struct request_queue *q,
526 gfp_t gfp_mask)
521{ 527{
522 struct request_list *rl = &q->rq;
523
524 if (unlikely(rl->rq_pool)) 528 if (unlikely(rl->rq_pool))
525 return 0; 529 return 0;
526 530
531 rl->q = q;
527 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 532 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
528 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 533 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
529 rl->elvpriv = 0;
530 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); 534 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
531 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); 535 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
532 536
533 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 537 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
534 mempool_free_slab, request_cachep, q->node); 538 mempool_free_slab, request_cachep,
535 539 gfp_mask, q->node);
536 if (!rl->rq_pool) 540 if (!rl->rq_pool)
537 return -ENOMEM; 541 return -ENOMEM;
538 542
539 return 0; 543 return 0;
540} 544}
541 545
546void blk_exit_rl(struct request_list *rl)
547{
548 if (rl->rq_pool)
549 mempool_destroy(rl->rq_pool);
550}
551
542struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 552struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
543{ 553{
544 return blk_alloc_queue_node(gfp_mask, -1); 554 return blk_alloc_queue_node(gfp_mask, -1);
@@ -680,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
680 if (!q) 690 if (!q)
681 return NULL; 691 return NULL;
682 692
683 if (blk_init_free_list(q)) 693 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
684 return NULL; 694 return NULL;
685 695
686 q->request_fn = rfn; 696 q->request_fn = rfn;
@@ -722,15 +732,15 @@ bool blk_get_queue(struct request_queue *q)
722} 732}
723EXPORT_SYMBOL(blk_get_queue); 733EXPORT_SYMBOL(blk_get_queue);
724 734
725static inline void blk_free_request(struct request_queue *q, struct request *rq) 735static inline void blk_free_request(struct request_list *rl, struct request *rq)
726{ 736{
727 if (rq->cmd_flags & REQ_ELVPRIV) { 737 if (rq->cmd_flags & REQ_ELVPRIV) {
728 elv_put_request(q, rq); 738 elv_put_request(rl->q, rq);
729 if (rq->elv.icq) 739 if (rq->elv.icq)
730 put_io_context(rq->elv.icq->ioc); 740 put_io_context(rq->elv.icq->ioc);
731 } 741 }
732 742
733 mempool_free(rq, q->rq.rq_pool); 743 mempool_free(rq, rl->rq_pool);
734} 744}
735 745
736/* 746/*
@@ -767,18 +777,23 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
767 ioc->last_waited = jiffies; 777 ioc->last_waited = jiffies;
768} 778}
769 779
770static void __freed_request(struct request_queue *q, int sync) 780static void __freed_request(struct request_list *rl, int sync)
771{ 781{
772 struct request_list *rl = &q->rq; 782 struct request_queue *q = rl->q;
773 783
774 if (rl->count[sync] < queue_congestion_off_threshold(q)) 784 /*
785 * bdi isn't aware of blkcg yet. As all async IOs end up root
786 * blkcg anyway, just use root blkcg state.
787 */
788 if (rl == &q->root_rl &&
789 rl->count[sync] < queue_congestion_off_threshold(q))
775 blk_clear_queue_congested(q, sync); 790 blk_clear_queue_congested(q, sync);
776 791
777 if (rl->count[sync] + 1 <= q->nr_requests) { 792 if (rl->count[sync] + 1 <= q->nr_requests) {
778 if (waitqueue_active(&rl->wait[sync])) 793 if (waitqueue_active(&rl->wait[sync]))
779 wake_up(&rl->wait[sync]); 794 wake_up(&rl->wait[sync]);
780 795
781 blk_clear_queue_full(q, sync); 796 blk_clear_rl_full(rl, sync);
782 } 797 }
783} 798}
784 799
@@ -786,19 +801,20 @@ static void __freed_request(struct request_queue *q, int sync)
786 * A request has just been released. Account for it, update the full and 801 * A request has just been released. Account for it, update the full and
787 * congestion status, wake up any waiters. Called under q->queue_lock. 802 * congestion status, wake up any waiters. Called under q->queue_lock.
788 */ 803 */
789static void freed_request(struct request_queue *q, unsigned int flags) 804static void freed_request(struct request_list *rl, unsigned int flags)
790{ 805{
791 struct request_list *rl = &q->rq; 806 struct request_queue *q = rl->q;
792 int sync = rw_is_sync(flags); 807 int sync = rw_is_sync(flags);
793 808
809 q->nr_rqs[sync]--;
794 rl->count[sync]--; 810 rl->count[sync]--;
795 if (flags & REQ_ELVPRIV) 811 if (flags & REQ_ELVPRIV)
796 rl->elvpriv--; 812 q->nr_rqs_elvpriv--;
797 813
798 __freed_request(q, sync); 814 __freed_request(rl, sync);
799 815
800 if (unlikely(rl->starved[sync ^ 1])) 816 if (unlikely(rl->starved[sync ^ 1]))
801 __freed_request(q, sync ^ 1); 817 __freed_request(rl, sync ^ 1);
802} 818}
803 819
804/* 820/*
@@ -837,8 +853,8 @@ static struct io_context *rq_ioc(struct bio *bio)
837} 853}
838 854
839/** 855/**
840 * get_request - get a free request 856 * __get_request - get a free request
841 * @q: request_queue to allocate request from 857 * @rl: request list to allocate from
842 * @rw_flags: RW and SYNC flags 858 * @rw_flags: RW and SYNC flags
843 * @bio: bio to allocate request for (can be %NULL) 859 * @bio: bio to allocate request for (can be %NULL)
844 * @gfp_mask: allocation mask 860 * @gfp_mask: allocation mask
@@ -850,20 +866,16 @@ static struct io_context *rq_ioc(struct bio *bio)
850 * Returns %NULL on failure, with @q->queue_lock held. 866 * Returns %NULL on failure, with @q->queue_lock held.
851 * Returns !%NULL on success, with @q->queue_lock *not held*. 867 * Returns !%NULL on success, with @q->queue_lock *not held*.
852 */ 868 */
853static struct request *get_request(struct request_queue *q, int rw_flags, 869static struct request *__get_request(struct request_list *rl, int rw_flags,
854 struct bio *bio, gfp_t gfp_mask) 870 struct bio *bio, gfp_t gfp_mask)
855{ 871{
872 struct request_queue *q = rl->q;
856 struct request *rq; 873 struct request *rq;
857 struct request_list *rl = &q->rq; 874 struct elevator_type *et = q->elevator->type;
858 struct elevator_type *et; 875 struct io_context *ioc = rq_ioc(bio);
859 struct io_context *ioc;
860 struct io_cq *icq = NULL; 876 struct io_cq *icq = NULL;
861 const bool is_sync = rw_is_sync(rw_flags) != 0; 877 const bool is_sync = rw_is_sync(rw_flags) != 0;
862 bool retried = false;
863 int may_queue; 878 int may_queue;
864retry:
865 et = q->elevator->type;
866 ioc = rq_ioc(bio);
867 879
868 if (unlikely(blk_queue_dead(q))) 880 if (unlikely(blk_queue_dead(q)))
869 return NULL; 881 return NULL;
@@ -875,28 +887,14 @@ retry:
875 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 887 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
876 if (rl->count[is_sync]+1 >= q->nr_requests) { 888 if (rl->count[is_sync]+1 >= q->nr_requests) {
877 /* 889 /*
878 * We want ioc to record batching state. If it's
879 * not already there, creating a new one requires
880 * dropping queue_lock, which in turn requires
881 * retesting conditions to avoid queue hang.
882 */
883 if (!ioc && !retried) {
884 spin_unlock_irq(q->queue_lock);
885 create_io_context(gfp_mask, q->node);
886 spin_lock_irq(q->queue_lock);
887 retried = true;
888 goto retry;
889 }
890
891 /*
892 * The queue will fill after this allocation, so set 890 * The queue will fill after this allocation, so set
893 * it as full, and mark this process as "batching". 891 * it as full, and mark this process as "batching".
894 * This process will be allowed to complete a batch of 892 * This process will be allowed to complete a batch of
895 * requests, others will be blocked. 893 * requests, others will be blocked.
896 */ 894 */
897 if (!blk_queue_full(q, is_sync)) { 895 if (!blk_rl_full(rl, is_sync)) {
898 ioc_set_batching(q, ioc); 896 ioc_set_batching(q, ioc);
899 blk_set_queue_full(q, is_sync); 897 blk_set_rl_full(rl, is_sync);
900 } else { 898 } else {
901 if (may_queue != ELV_MQUEUE_MUST 899 if (may_queue != ELV_MQUEUE_MUST
902 && !ioc_batching(q, ioc)) { 900 && !ioc_batching(q, ioc)) {
@@ -909,7 +907,12 @@ retry:
909 } 907 }
910 } 908 }
911 } 909 }
912 blk_set_queue_congested(q, is_sync); 910 /*
911 * bdi isn't aware of blkcg yet. As all async IOs end up
912 * root blkcg anyway, just use root blkcg state.
913 */
914 if (rl == &q->root_rl)
915 blk_set_queue_congested(q, is_sync);
913 } 916 }
914 917
915 /* 918 /*
@@ -920,6 +923,7 @@ retry:
920 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 923 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
921 return NULL; 924 return NULL;
922 925
926 q->nr_rqs[is_sync]++;
923 rl->count[is_sync]++; 927 rl->count[is_sync]++;
924 rl->starved[is_sync] = 0; 928 rl->starved[is_sync] = 0;
925 929
@@ -935,7 +939,7 @@ retry:
935 */ 939 */
936 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { 940 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
937 rw_flags |= REQ_ELVPRIV; 941 rw_flags |= REQ_ELVPRIV;
938 rl->elvpriv++; 942 q->nr_rqs_elvpriv++;
939 if (et->icq_cache && ioc) 943 if (et->icq_cache && ioc)
940 icq = ioc_lookup_icq(ioc, q); 944 icq = ioc_lookup_icq(ioc, q);
941 } 945 }
@@ -945,22 +949,19 @@ retry:
945 spin_unlock_irq(q->queue_lock); 949 spin_unlock_irq(q->queue_lock);
946 950
947 /* allocate and init request */ 951 /* allocate and init request */
948 rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 952 rq = mempool_alloc(rl->rq_pool, gfp_mask);
949 if (!rq) 953 if (!rq)
950 goto fail_alloc; 954 goto fail_alloc;
951 955
952 blk_rq_init(q, rq); 956 blk_rq_init(q, rq);
957 blk_rq_set_rl(rq, rl);
953 rq->cmd_flags = rw_flags | REQ_ALLOCED; 958 rq->cmd_flags = rw_flags | REQ_ALLOCED;
954 959
955 /* init elvpriv */ 960 /* init elvpriv */
956 if (rw_flags & REQ_ELVPRIV) { 961 if (rw_flags & REQ_ELVPRIV) {
957 if (unlikely(et->icq_cache && !icq)) { 962 if (unlikely(et->icq_cache && !icq)) {
958 create_io_context(gfp_mask, q->node); 963 if (ioc)
959 ioc = rq_ioc(bio); 964 icq = ioc_create_icq(ioc, q, gfp_mask);
960 if (!ioc)
961 goto fail_elvpriv;
962
963 icq = ioc_create_icq(ioc, q, gfp_mask);
964 if (!icq) 965 if (!icq)
965 goto fail_elvpriv; 966 goto fail_elvpriv;
966 } 967 }
@@ -1000,7 +1001,7 @@ fail_elvpriv:
1000 rq->elv.icq = NULL; 1001 rq->elv.icq = NULL;
1001 1002
1002 spin_lock_irq(q->queue_lock); 1003 spin_lock_irq(q->queue_lock);
1003 rl->elvpriv--; 1004 q->nr_rqs_elvpriv--;
1004 spin_unlock_irq(q->queue_lock); 1005 spin_unlock_irq(q->queue_lock);
1005 goto out; 1006 goto out;
1006 1007
@@ -1013,7 +1014,7 @@ fail_alloc:
1013 * queue, but this is pretty rare. 1014 * queue, but this is pretty rare.
1014 */ 1015 */
1015 spin_lock_irq(q->queue_lock); 1016 spin_lock_irq(q->queue_lock);
1016 freed_request(q, rw_flags); 1017 freed_request(rl, rw_flags);
1017 1018
1018 /* 1019 /*
1019 * in the very unlikely event that allocation failed and no 1020 * in the very unlikely event that allocation failed and no
@@ -1029,56 +1030,58 @@ rq_starved:
1029} 1030}
1030 1031
1031/** 1032/**
1032 * get_request_wait - get a free request with retry 1033 * get_request - get a free request
1033 * @q: request_queue to allocate request from 1034 * @q: request_queue to allocate request from
1034 * @rw_flags: RW and SYNC flags 1035 * @rw_flags: RW and SYNC flags
1035 * @bio: bio to allocate request for (can be %NULL) 1036 * @bio: bio to allocate request for (can be %NULL)
1037 * @gfp_mask: allocation mask
1036 * 1038 *
1037 * Get a free request from @q. This function keeps retrying under memory 1039 * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this
1038 * pressure and fails iff @q is dead. 1040 * function keeps retrying under memory pressure and fails iff @q is dead.
1039 * 1041 *
1040 * Must be callled with @q->queue_lock held and, 1042 * Must be callled with @q->queue_lock held and,
1041 * Returns %NULL on failure, with @q->queue_lock held. 1043 * Returns %NULL on failure, with @q->queue_lock held.
1042 * Returns !%NULL on success, with @q->queue_lock *not held*. 1044 * Returns !%NULL on success, with @q->queue_lock *not held*.
1043 */ 1045 */
1044static struct request *get_request_wait(struct request_queue *q, int rw_flags, 1046static struct request *get_request(struct request_queue *q, int rw_flags,
1045 struct bio *bio) 1047 struct bio *bio, gfp_t gfp_mask)
1046{ 1048{
1047 const bool is_sync = rw_is_sync(rw_flags) != 0; 1049 const bool is_sync = rw_is_sync(rw_flags) != 0;
1050 DEFINE_WAIT(wait);
1051 struct request_list *rl;
1048 struct request *rq; 1052 struct request *rq;
1049 1053
1050 rq = get_request(q, rw_flags, bio, GFP_NOIO); 1054 rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1051 while (!rq) { 1055retry:
1052 DEFINE_WAIT(wait); 1056 rq = __get_request(rl, rw_flags, bio, gfp_mask);
1053 struct request_list *rl = &q->rq; 1057 if (rq)
1054 1058 return rq;
1055 if (unlikely(blk_queue_dead(q)))
1056 return NULL;
1057 1059
1058 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 1060 if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
1059 TASK_UNINTERRUPTIBLE); 1061 blk_put_rl(rl);
1062 return NULL;
1063 }
1060 1064
1061 trace_block_sleeprq(q, bio, rw_flags & 1); 1065 /* wait on @rl and retry */
1066 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1067 TASK_UNINTERRUPTIBLE);
1062 1068
1063 spin_unlock_irq(q->queue_lock); 1069 trace_block_sleeprq(q, bio, rw_flags & 1);
1064 io_schedule();
1065 1070
1066 /* 1071 spin_unlock_irq(q->queue_lock);
1067 * After sleeping, we become a "batching" process and 1072 io_schedule();
1068 * will be able to allocate at least one request, and
1069 * up to a big batch of them for a small period time.
1070 * See ioc_batching, ioc_set_batching
1071 */
1072 create_io_context(GFP_NOIO, q->node);
1073 ioc_set_batching(q, current->io_context);
1074 1073
1075 spin_lock_irq(q->queue_lock); 1074 /*
1076 finish_wait(&rl->wait[is_sync], &wait); 1075 * After sleeping, we become a "batching" process and will be able
1076 * to allocate at least one request, and up to a big batch of them
1077 * for a small period time. See ioc_batching, ioc_set_batching
1078 */
1079 ioc_set_batching(q, current->io_context);
1077 1080
1078 rq = get_request(q, rw_flags, bio, GFP_NOIO); 1081 spin_lock_irq(q->queue_lock);
1079 }; 1082 finish_wait(&rl->wait[is_sync], &wait);
1080 1083
1081 return rq; 1084 goto retry;
1082} 1085}
1083 1086
1084struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1087struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
@@ -1087,11 +1090,11 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1087 1090
1088 BUG_ON(rw != READ && rw != WRITE); 1091 BUG_ON(rw != READ && rw != WRITE);
1089 1092
1093 /* create ioc upfront */
1094 create_io_context(gfp_mask, q->node);
1095
1090 spin_lock_irq(q->queue_lock); 1096 spin_lock_irq(q->queue_lock);
1091 if (gfp_mask & __GFP_WAIT) 1097 rq = get_request(q, rw, NULL, gfp_mask);
1092 rq = get_request_wait(q, rw, NULL);
1093 else
1094 rq = get_request(q, rw, NULL, gfp_mask);
1095 if (!rq) 1098 if (!rq)
1096 spin_unlock_irq(q->queue_lock); 1099 spin_unlock_irq(q->queue_lock);
1097 /* q->queue_lock is unlocked at this point */ 1100 /* q->queue_lock is unlocked at this point */
@@ -1248,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1248 */ 1251 */
1249 if (req->cmd_flags & REQ_ALLOCED) { 1252 if (req->cmd_flags & REQ_ALLOCED) {
1250 unsigned int flags = req->cmd_flags; 1253 unsigned int flags = req->cmd_flags;
1254 struct request_list *rl = blk_rq_rl(req);
1251 1255
1252 BUG_ON(!list_empty(&req->queuelist)); 1256 BUG_ON(!list_empty(&req->queuelist));
1253 BUG_ON(!hlist_unhashed(&req->hash)); 1257 BUG_ON(!hlist_unhashed(&req->hash));
1254 1258
1255 blk_free_request(q, req); 1259 blk_free_request(rl, req);
1256 freed_request(q, flags); 1260 freed_request(rl, flags);
1261 blk_put_rl(rl);
1257 } 1262 }
1258} 1263}
1259EXPORT_SYMBOL_GPL(__blk_put_request); 1264EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1481,7 +1486,7 @@ get_rq:
1481 * Grab a free request. This is might sleep but can not fail. 1486 * Grab a free request. This is might sleep but can not fail.
1482 * Returns with the queue unlocked. 1487 * Returns with the queue unlocked.
1483 */ 1488 */
1484 req = get_request_wait(q, rw_flags, bio); 1489 req = get_request(q, rw_flags, bio, GFP_NOIO);
1485 if (unlikely(!req)) { 1490 if (unlikely(!req)) {
1486 bio_endio(bio, -ENODEV); /* @q is dead */ 1491 bio_endio(bio, -ENODEV); /* @q is dead */
1487 goto out_unlock; 1492 goto out_unlock;
@@ -1702,6 +1707,14 @@ generic_make_request_checks(struct bio *bio)
1702 goto end_io; 1707 goto end_io;
1703 } 1708 }
1704 1709
1710 /*
1711 * Various block parts want %current->io_context and lazy ioc
1712 * allocation ends up trading a lot of pain for a small amount of
1713 * memory. Just allocate it upfront. This may fail and block
1714 * layer knows how to live with it.
1715 */
1716 create_io_context(GFP_ATOMIC, q->node);
1717
1705 if (blk_throtl_bio(q, bio)) 1718 if (blk_throtl_bio(q, bio))
1706 return false; /* throttled, will be resubmitted later */ 1719 return false; /* throttled, will be resubmitted later */
1707 1720
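
With get_request_wait() folded into get_request() above, behaviour is selected purely by the gfp mask: __GFP_WAIT makes the caller sleep on its request_list and retry, anything else fails fast with NULL. A hypothetical caller (not from this series) showing both modes through the existing blk_get_request() interface:

#include <linux/fs.h>
#include <linux/blkdev.h>

static struct request *grab_request(struct request_queue *q, bool can_sleep)
{
	/* GFP_NOIO contains __GFP_WAIT, GFP_ATOMIC does not */
	gfp_t gfp = can_sleep ? GFP_NOIO : GFP_ATOMIC;
	struct request *rq = blk_get_request(q, READ, gfp);

	if (!rq)		/* queue dead, or out of memory in atomic mode */
		return NULL;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;	/* e.g. a passthrough command */
	return rq;
}

On success the request is returned with queue_lock not held and must eventually be released with blk_put_request().
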
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 893b8007c657..fab4cdd3f7bb 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -244,6 +244,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
244 244
245 /* initialize */ 245 /* initialize */
246 atomic_long_set(&ioc->refcount, 1); 246 atomic_long_set(&ioc->refcount, 1);
247 atomic_set(&ioc->nr_tasks, 1);
247 atomic_set(&ioc->active_ref, 1); 248 atomic_set(&ioc->active_ref, 1);
248 spin_lock_init(&ioc->lock); 249 spin_lock_init(&ioc->lock);
249 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); 250 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d3234fc494ad..565a6786032f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -143,8 +143,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
143 lim->discard_zeroes_data = 1; 143 lim->discard_zeroes_data = 1;
144 lim->max_segments = USHRT_MAX; 144 lim->max_segments = USHRT_MAX;
145 lim->max_hw_sectors = UINT_MAX; 145 lim->max_hw_sectors = UINT_MAX;
146 146 lim->max_sectors = UINT_MAX;
147 lim->max_sectors = BLK_DEF_MAX_SECTORS;
148} 147}
149EXPORT_SYMBOL(blk_set_stacking_limits); 148EXPORT_SYMBOL(blk_set_stacking_limits);
150 149
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index aa41b47c22d2..9628b291f960 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
40static ssize_t 40static ssize_t
41queue_requests_store(struct request_queue *q, const char *page, size_t count) 41queue_requests_store(struct request_queue *q, const char *page, size_t count)
42{ 42{
43 struct request_list *rl = &q->rq; 43 struct request_list *rl;
44 unsigned long nr; 44 unsigned long nr;
45 int ret; 45 int ret;
46 46
@@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
55 q->nr_requests = nr; 55 q->nr_requests = nr;
56 blk_queue_congestion_threshold(q); 56 blk_queue_congestion_threshold(q);
57 57
58 /* congestion isn't cgroup aware and follows root blkcg for now */
59 rl = &q->root_rl;
60
58 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) 61 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
59 blk_set_queue_congested(q, BLK_RW_SYNC); 62 blk_set_queue_congested(q, BLK_RW_SYNC);
60 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) 63 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
65 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) 68 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
66 blk_clear_queue_congested(q, BLK_RW_ASYNC); 69 blk_clear_queue_congested(q, BLK_RW_ASYNC);
67 70
68 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 71 blk_queue_for_each_rl(rl, q) {
69 blk_set_queue_full(q, BLK_RW_SYNC); 72 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
70 } else { 73 blk_set_rl_full(rl, BLK_RW_SYNC);
71 blk_clear_queue_full(q, BLK_RW_SYNC); 74 } else {
72 wake_up(&rl->wait[BLK_RW_SYNC]); 75 blk_clear_rl_full(rl, BLK_RW_SYNC);
76 wake_up(&rl->wait[BLK_RW_SYNC]);
77 }
78
79 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
80 blk_set_rl_full(rl, BLK_RW_ASYNC);
81 } else {
82 blk_clear_rl_full(rl, BLK_RW_ASYNC);
83 wake_up(&rl->wait[BLK_RW_ASYNC]);
84 }
73 } 85 }
74 86
75 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
76 blk_set_queue_full(q, BLK_RW_ASYNC);
77 } else {
78 blk_clear_queue_full(q, BLK_RW_ASYNC);
79 wake_up(&rl->wait[BLK_RW_ASYNC]);
80 }
81 spin_unlock_irq(q->queue_lock); 87 spin_unlock_irq(q->queue_lock);
82 return ret; 88 return ret;
83} 89}
@@ -476,7 +482,6 @@ static void blk_release_queue(struct kobject *kobj)
476{ 482{
477 struct request_queue *q = 483 struct request_queue *q =
478 container_of(kobj, struct request_queue, kobj); 484 container_of(kobj, struct request_queue, kobj);
479 struct request_list *rl = &q->rq;
480 485
481 blk_sync_queue(q); 486 blk_sync_queue(q);
482 487
@@ -489,8 +494,7 @@ static void blk_release_queue(struct kobject *kobj)
489 elevator_exit(q->elevator); 494 elevator_exit(q->elevator);
490 } 495 }
491 496
492 if (rl->rq_pool) 497 blk_exit_rl(&q->root_rl);
493 mempool_destroy(rl->rq_pool);
494 498
495 if (q->queue_tags) 499 if (q->queue_tags)
496 __blk_queue_free_tags(q); 500 __blk_queue_free_tags(q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5b0659512047..e287c19908c8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1123,9 +1123,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1123 goto out; 1123 goto out;
1124 } 1124 }
1125 1125
1126 /* bio_associate_current() needs ioc, try creating */
1127 create_io_context(GFP_ATOMIC, q->node);
1128
1129 /* 1126 /*
1130 * A throtl_grp pointer retrieved under rcu can be used to access 1127 * A throtl_grp pointer retrieved under rcu can be used to access
1131 * basic fields like stats and io rates. If a group has no rules, 1128 * basic fields like stats and io rates. If a group has no rules,
diff --git a/block/blk.h b/block/blk.h
index 85f6ae42f7d3..2a0ea32d249f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,6 +18,9 @@ static inline void __blk_get_queue(struct request_queue *q)
18 kobject_get(&q->kobj); 18 kobject_get(&q->kobj);
19} 19}
20 20
21int blk_init_rl(struct request_list *rl, struct request_queue *q,
22 gfp_t gfp_mask);
23void blk_exit_rl(struct request_list *rl);
21void init_request_from_bio(struct request *req, struct bio *bio); 24void init_request_from_bio(struct request *req, struct bio *bio);
22void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 25void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
23 struct bio *bio); 26 struct bio *bio);
@@ -33,7 +36,6 @@ bool __blk_end_bidi_request(struct request *rq, int error,
33void blk_rq_timed_out_timer(unsigned long data); 36void blk_rq_timed_out_timer(unsigned long data);
34void blk_delete_timer(struct request *); 37void blk_delete_timer(struct request *);
35void blk_add_timer(struct request *); 38void blk_add_timer(struct request *);
36void __generic_unplug_device(struct request_queue *);
37 39
38/* 40/*
39 * Internal atomic flags for request handling 41 * Internal atomic flags for request handling
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 7ad49c88f6b1..deee61fbb741 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -243,56 +243,3 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
243 return 0; 243 return 0;
244} 244}
245EXPORT_SYMBOL_GPL(bsg_setup_queue); 245EXPORT_SYMBOL_GPL(bsg_setup_queue);
246
247/**
248 * bsg_remove_queue - Deletes the bsg dev from the q
249 * @q: the request_queue that is to be torn down.
250 *
251 * Notes:
252 * Before unregistering the queue empty any requests that are blocked
253 */
254void bsg_remove_queue(struct request_queue *q)
255{
256 struct request *req; /* block request */
257 int counts; /* totals for request_list count and starved */
258
259 if (!q)
260 return;
261
262 /* Stop taking in new requests */
263 spin_lock_irq(q->queue_lock);
264 blk_stop_queue(q);
265
266 /* drain all requests in the queue */
267 while (1) {
268 /* need the lock to fetch a request
269 * this may fetch the same reqeust as the previous pass
270 */
271 req = blk_fetch_request(q);
272 /* save requests in use and starved */
273 counts = q->rq.count[0] + q->rq.count[1] +
274 q->rq.starved[0] + q->rq.starved[1];
275 spin_unlock_irq(q->queue_lock);
276 /* any requests still outstanding? */
277 if (counts == 0)
278 break;
279
280 /* This may be the same req as the previous iteration,
281 * always send the blk_end_request_all after a prefetch.
282 * It is not okay to not end the request because the
283 * prefetch started the request.
284 */
285 if (req) {
286 /* return -ENXIO to indicate that this queue is
287 * going away
288 */
289 req->errors = -ENXIO;
290 blk_end_request_all(req, -ENXIO);
291 }
292
293 msleep(200); /* allow bsg to possibly finish */
294 spin_lock_irq(q->queue_lock);
295 }
296 bsg_unregister_queue(q);
297}
298EXPORT_SYMBOL_GPL(bsg_remove_queue);
diff --git a/block/genhd.c b/block/genhd.c
index 9cf5583c90ff..cac7366957c3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
154 part = rcu_dereference(ptbl->part[piter->idx]); 154 part = rcu_dereference(ptbl->part[piter->idx]);
155 if (!part) 155 if (!part)
156 continue; 156 continue;
157 if (!part->nr_sects && 157 if (!part_nr_sects_read(part) &&
158 !(piter->flags & DISK_PITER_INCL_EMPTY) && 158 !(piter->flags & DISK_PITER_INCL_EMPTY) &&
159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && 159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
160 piter->idx == 0)) 160 piter->idx == 0))
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
191static inline int sector_in_part(struct hd_struct *part, sector_t sector) 191static inline int sector_in_part(struct hd_struct *part, sector_t sector)
192{ 192{
193 return part->start_sect <= sector && 193 return part->start_sect <= sector &&
194 sector < part->start_sect + part->nr_sects; 194 sector < part->start_sect + part_nr_sects_read(part);
195} 195}
196 196
197/** 197/**
@@ -769,8 +769,8 @@ void __init printk_all_partitions(void)
769 769
770 printk("%s%s %10llu %s %s", is_part0 ? "" : " ", 770 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
771 bdevt_str(part_devt(part), devt_buf), 771 bdevt_str(part_devt(part), devt_buf),
772 (unsigned long long)part->nr_sects >> 1, 772 (unsigned long long)part_nr_sects_read(part) >> 1
773 disk_name(disk, part->partno, name_buf), 773 , disk_name(disk, part->partno, name_buf),
774 uuid_buf); 774 uuid_buf);
775 if (is_part0) { 775 if (is_part0) {
776 if (disk->driverfs_dev != NULL && 776 if (disk->driverfs_dev != NULL &&
@@ -862,7 +862,7 @@ static int show_partition(struct seq_file *seqf, void *v)
862 while ((part = disk_part_iter_next(&piter))) 862 while ((part = disk_part_iter_next(&piter)))
863 seq_printf(seqf, "%4d %7d %10llu %s\n", 863 seq_printf(seqf, "%4d %7d %10llu %s\n",
864 MAJOR(part_devt(part)), MINOR(part_devt(part)), 864 MAJOR(part_devt(part)), MINOR(part_devt(part)),
865 (unsigned long long)part->nr_sects >> 1, 865 (unsigned long long)part_nr_sects_read(part) >> 1,
866 disk_name(sgp, part->partno, buf)); 866 disk_name(sgp, part->partno, buf));
867 disk_part_iter_exit(&piter); 867 disk_part_iter_exit(&piter);
868 868
@@ -1268,6 +1268,16 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
1268 } 1268 }
1269 disk->part_tbl->part[0] = &disk->part0; 1269 disk->part_tbl->part[0] = &disk->part0;
1270 1270
1271 /*
1272 * set_capacity() and get_capacity() currently don't use
1273 * seqcounter to read/update the part0->nr_sects. Still init
1274 * the counter as we can read the sectors in IO submission
1275 * patch using seqence counters.
1276 *
1277 * TODO: Ideally set_capacity() and get_capacity() should be
1278 * converted to make use of bd_mutex and sequence counters.
1279 */
1280 seqcount_init(&disk->part0.nr_sects_seq);
1271 hd_ref_init(&disk->part0); 1281 hd_ref_init(&disk->part0);
1272 1282
1273 disk->minors = minors; 1283 disk->minors = minors;
diff --git a/block/ioctl.c b/block/ioctl.c
index ba15b2dbfb98..4476e0e85d16 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,7 +13,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
13{ 13{
14 struct block_device *bdevp; 14 struct block_device *bdevp;
15 struct gendisk *disk; 15 struct gendisk *disk;
16 struct hd_struct *part; 16 struct hd_struct *part, *lpart;
17 struct blkpg_ioctl_arg a; 17 struct blkpg_ioctl_arg a;
18 struct blkpg_partition p; 18 struct blkpg_partition p;
19 struct disk_part_iter piter; 19 struct disk_part_iter piter;
@@ -36,8 +36,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
36 case BLKPG_ADD_PARTITION: 36 case BLKPG_ADD_PARTITION:
37 start = p.start >> 9; 37 start = p.start >> 9;
38 length = p.length >> 9; 38 length = p.length >> 9;
39 /* check for fit in a hd_struct */ 39 /* check for fit in a hd_struct */
40 if (sizeof(sector_t) == sizeof(long) && 40 if (sizeof(sector_t) == sizeof(long) &&
41 sizeof(long long) > sizeof(long)) { 41 sizeof(long long) > sizeof(long)) {
42 long pstart = start, plength = length; 42 long pstart = start, plength = length;
43 if (pstart != start || plength != length 43 if (pstart != start || plength != length
@@ -92,6 +92,59 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
92 bdput(bdevp); 92 bdput(bdevp);
93 93
94 return 0; 94 return 0;
95 case BLKPG_RESIZE_PARTITION:
96 start = p.start >> 9;
97 /* new length of partition in bytes */
98 length = p.length >> 9;
99 /* check for fit in a hd_struct */
100 if (sizeof(sector_t) == sizeof(long) &&
101 sizeof(long long) > sizeof(long)) {
102 long pstart = start, plength = length;
103 if (pstart != start || plength != length
104 || pstart < 0 || plength < 0)
105 return -EINVAL;
106 }
107 part = disk_get_part(disk, partno);
108 if (!part)
109 return -ENXIO;
110 bdevp = bdget(part_devt(part));
111 if (!bdevp) {
112 disk_put_part(part);
113 return -ENOMEM;
114 }
115 mutex_lock(&bdevp->bd_mutex);
116 mutex_lock_nested(&bdev->bd_mutex, 1);
117 if (start != part->start_sect) {
118 mutex_unlock(&bdevp->bd_mutex);
119 mutex_unlock(&bdev->bd_mutex);
120 bdput(bdevp);
121 disk_put_part(part);
122 return -EINVAL;
123 }
124 /* overlap? */
125 disk_part_iter_init(&piter, disk,
126 DISK_PITER_INCL_EMPTY);
127 while ((lpart = disk_part_iter_next(&piter))) {
128 if (lpart->partno != partno &&
129 !(start + length <= lpart->start_sect ||
130 start >= lpart->start_sect + lpart->nr_sects)
131 ) {
132 disk_part_iter_exit(&piter);
133 mutex_unlock(&bdevp->bd_mutex);
134 mutex_unlock(&bdev->bd_mutex);
135 bdput(bdevp);
136 disk_put_part(part);
137 return -EBUSY;
138 }
139 }
140 disk_part_iter_exit(&piter);
141 part_nr_sects_write(part, (sector_t)length);
142 i_size_write(bdevp->bd_inode, p.length);
143 mutex_unlock(&bdevp->bd_mutex);
144 mutex_unlock(&bdev->bd_mutex);
145 bdput(bdevp);
146 disk_put_part(part);
147 return 0;
95 default: 148 default:
96 return -EINVAL; 149 return -EINVAL;
97 } 150 }
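
The new BLKPG_RESIZE_PARTITION case is driven through the same struct blkpg_ioctl_arg as add/delete: start and length are in bytes, start must equal the partition's current offset, and only the length changes. A hedged userspace sketch follows; resize_partition() is a hypothetical helper, and the BLKPG_RESIZE_PARTITION constant itself comes from the include/linux/blkpg.h hunk, which this excerpt does not show.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkpg.h>

/* resize partition @partno of whole-disk device @disk to @new_len bytes */
static int resize_partition(const char *disk, int partno,
			    long long start, long long new_len)
{
	struct blkpg_partition part;
	struct blkpg_ioctl_arg arg;
	int fd, ret;

	fd = open(disk, O_RDWR);
	if (fd < 0)
		return -1;

	memset(&part, 0, sizeof(part));
	part.pno = partno;
	part.start = start;		/* bytes; must match the current offset */
	part.length = new_len;		/* bytes; the new partition size */

	memset(&arg, 0, sizeof(arg));
	arg.op = BLKPG_RESIZE_PARTITION;
	arg.datalen = sizeof(part);
	arg.data = &part;

	/* errno is EBUSY on overlap, EINVAL if the start offset was changed */
	ret = ioctl(fd, BLKPG, &arg);
	if (ret < 0)
		perror("BLKPG_RESIZE_PARTITION");

	close(fd);
	return ret;
}
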
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 6df5d6928a44..f1d14519cc04 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -84,7 +84,7 @@ ssize_t part_size_show(struct device *dev,
84 struct device_attribute *attr, char *buf) 84 struct device_attribute *attr, char *buf)
85{ 85{
86 struct hd_struct *p = dev_to_part(dev); 86 struct hd_struct *p = dev_to_part(dev);
87 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 87 return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
88} 88}
89 89
90static ssize_t part_ro_show(struct device *dev, 90static ssize_t part_ro_show(struct device *dev,
@@ -294,6 +294,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
294 err = -ENOMEM; 294 err = -ENOMEM;
295 goto out_free; 295 goto out_free;
296 } 296 }
297
298 seqcount_init(&p->nr_sects_seq);
297 pdev = part_to_dev(p); 299 pdev = part_to_dev(p);
298 300
299 p->start_sect = start; 301 p->start_sect = start;
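
The seqcount_init() calls above, together with the part_nr_sects_read() users in block/genhd.c and block/ioctl.c, exist because nr_sects is 64-bit and can now change at runtime via the resize ioctl, so 32-bit readers could otherwise observe a torn value. The part_nr_sects_read()/part_nr_sects_write() helpers themselves live in the include/linux/genhd.h hunk, which this excerpt does not include; the sketch below only illustrates the general seqcount read/retry pattern they rely on and is an assumption rather than the exact in-tree code.

#include <linux/genhd.h>
#include <linux/seqlock.h>

/* lockless read: retry if a concurrent resize raced with us */
static sector_t example_nr_sects_read(struct hd_struct *part)
{
	sector_t nr_sects;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&part->nr_sects_seq);
		nr_sects = part->nr_sects;
	} while (read_seqcount_retry(&part->nr_sects_seq, seq));

	return nr_sects;
}

/* write side; callers serialize against each other (bd_mutex in ioctl.c) */
static void example_nr_sects_write(struct hd_struct *part, sector_t size)
{
	write_seqcount_begin(&part->nr_sects_seq);
	part->nr_sects = size;
	write_seqcount_end(&part->nr_sects_seq);
}
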
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 2d1e68db9b3f..e894ca7b54c0 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -4146,45 +4146,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
4146static void 4146static void
4147fc_bsg_remove(struct request_queue *q) 4147fc_bsg_remove(struct request_queue *q)
4148{ 4148{
4149 struct request *req; /* block request */
4150 int counts; /* totals for request_list count and starved */
4151
4152 if (q) { 4149 if (q) {
4153 /* Stop taking in new requests */
4154 spin_lock_irq(q->queue_lock);
4155 blk_stop_queue(q);
4156
4157 /* drain all requests in the queue */
4158 while (1) {
4159 /* need the lock to fetch a request
4160 * this may fetch the same reqeust as the previous pass
4161 */
4162 req = blk_fetch_request(q);
4163 /* save requests in use and starved */
4164 counts = q->rq.count[0] + q->rq.count[1] +
4165 q->rq.starved[0] + q->rq.starved[1];
4166 spin_unlock_irq(q->queue_lock);
4167 /* any requests still outstanding? */
4168 if (counts == 0)
4169 break;
4170
4171 /* This may be the same req as the previous iteration,
4172 * always send the blk_end_request_all after a prefetch.
4173 * It is not okay to not end the request because the
4174 * prefetch started the request.
4175 */
4176 if (req) {
4177 /* return -ENXIO to indicate that this queue is
4178 * going away
4179 */
4180 req->errors = -ENXIO;
4181 blk_end_request_all(req, -ENXIO);
4182 }
4183
4184 msleep(200); /* allow bsg to possibly finish */
4185 spin_lock_irq(q->queue_lock);
4186 }
4187
4188 bsg_unregister_queue(q); 4150 bsg_unregister_queue(q);
4189 blk_cleanup_queue(q); 4151 blk_cleanup_queue(q);
4190 } 4152 }
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 09809d06eccb..fa1dfaa83e32 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -575,7 +575,7 @@ static int iscsi_remove_host(struct transport_container *tc,
575 struct iscsi_cls_host *ihost = shost->shost_data; 575 struct iscsi_cls_host *ihost = shost->shost_data;
576 576
577 if (ihost->bsg_q) { 577 if (ihost->bsg_q) {
578 bsg_remove_queue(ihost->bsg_q); 578 bsg_unregister_queue(ihost->bsg_q);
579 blk_cleanup_queue(ihost->bsg_q); 579 blk_cleanup_queue(ihost->bsg_q);
580 } 580 }
581 return 0; 581 return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 07954b05b86c..3816ce8a08fc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -46,16 +46,23 @@ struct blkcg_gq;
46struct request; 46struct request;
47typedef void (rq_end_io_fn)(struct request *, int); 47typedef void (rq_end_io_fn)(struct request *, int);
48 48
49#define BLK_RL_SYNCFULL (1U << 0)
50#define BLK_RL_ASYNCFULL (1U << 1)
51
49struct request_list { 52struct request_list {
53 struct request_queue *q; /* the queue this rl belongs to */
54#ifdef CONFIG_BLK_CGROUP
55 struct blkcg_gq *blkg; /* blkg this request pool belongs to */
56#endif
50 /* 57 /*
51 * count[], starved[], and wait[] are indexed by 58 * count[], starved[], and wait[] are indexed by
52 * BLK_RW_SYNC/BLK_RW_ASYNC 59 * BLK_RW_SYNC/BLK_RW_ASYNC
53 */ 60 */
54 int count[2]; 61 int count[2];
55 int starved[2]; 62 int starved[2];
56 int elvpriv; 63 mempool_t *rq_pool;
57 mempool_t *rq_pool; 64 wait_queue_head_t wait[2];
58 wait_queue_head_t wait[2]; 65 unsigned int flags;
59}; 66};
60 67
61/* 68/*
@@ -138,6 +145,7 @@ struct request {
138 struct hd_struct *part; 145 struct hd_struct *part;
139 unsigned long start_time; 146 unsigned long start_time;
140#ifdef CONFIG_BLK_CGROUP 147#ifdef CONFIG_BLK_CGROUP
148 struct request_list *rl; /* rl this rq is alloced from */
141 unsigned long long start_time_ns; 149 unsigned long long start_time_ns;
142 unsigned long long io_start_time_ns; /* when passed to hardware */ 150 unsigned long long io_start_time_ns; /* when passed to hardware */
143#endif 151#endif
@@ -282,11 +290,16 @@ struct request_queue {
282 struct list_head queue_head; 290 struct list_head queue_head;
283 struct request *last_merge; 291 struct request *last_merge;
284 struct elevator_queue *elevator; 292 struct elevator_queue *elevator;
293 int nr_rqs[2]; /* # allocated [a]sync rqs */
294 int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
285 295
286 /* 296 /*
287 * the queue request freelist, one for reads and one for writes 297 * If blkcg is not used, @q->root_rl serves all requests. If blkcg
298 * is used, root blkg allocates from @q->root_rl and all other
299 * blkgs from their own blkg->rl. Which one to use should be
300 * determined using bio_request_list().
288 */ 301 */
289 struct request_list rq; 302 struct request_list root_rl;
290 303
291 request_fn_proc *request_fn; 304 request_fn_proc *request_fn;
292 make_request_fn *make_request_fn; 305 make_request_fn *make_request_fn;
@@ -561,27 +574,25 @@ static inline bool rq_is_sync(struct request *rq)
561 return rw_is_sync(rq->cmd_flags); 574 return rw_is_sync(rq->cmd_flags);
562} 575}
563 576
564static inline int blk_queue_full(struct request_queue *q, int sync) 577static inline bool blk_rl_full(struct request_list *rl, bool sync)
565{ 578{
566 if (sync) 579 unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
567 return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags); 580
568 return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags); 581 return rl->flags & flag;
569} 582}
570 583
571static inline void blk_set_queue_full(struct request_queue *q, int sync) 584static inline void blk_set_rl_full(struct request_list *rl, bool sync)
572{ 585{
573 if (sync) 586 unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
574 queue_flag_set(QUEUE_FLAG_SYNCFULL, q); 587
575 else 588 rl->flags |= flag;
576 queue_flag_set(QUEUE_FLAG_ASYNCFULL, q);
577} 589}
578 590
579static inline void blk_clear_queue_full(struct request_queue *q, int sync) 591static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
580{ 592{
581 if (sync) 593 unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
582 queue_flag_clear(QUEUE_FLAG_SYNCFULL, q); 594
583 else 595 rl->flags &= ~flag;
584 queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q);
585} 596}
586 597
587 598
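With fullness now tracked per request_list instead of per queue, the old blk_queue_full()/blk_set_queue_full()/blk_clear_queue_full() helpers give way to the blk_rl_full() trio above, keyed by BLK_RL_SYNCFULL/BLK_RL_ASYNCFULL in rl->flags. A rough sketch of how a caller might use them; the function name and context are hypothetical, and the queue-lock protection that in-tree callers rely on is assumed rather than shown.

/* Illustrative only: flip the per-list "full" state for one direction. */
static void example_update_full(struct request_list *rl, bool sync, bool congested)
{
	if (congested) {
		if (!blk_rl_full(rl, sync))
			blk_set_rl_full(rl, sync);
	} else {
		if (blk_rl_full(rl, sync))
			blk_clear_rl_full(rl, sync);
	}
}

Because the flag word lives in the request_list itself, each per-blkg pool tracks sync/async fullness independently instead of sharing two queue-wide flags.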
diff --git a/include/linux/blkpg.h b/include/linux/blkpg.h
index faf8a45af210..a8519446c111 100644
--- a/include/linux/blkpg.h
+++ b/include/linux/blkpg.h
@@ -40,6 +40,7 @@ struct blkpg_ioctl_arg {
40/* The subfunctions (for the op field) */ 40/* The subfunctions (for the op field) */
41#define BLKPG_ADD_PARTITION 1 41#define BLKPG_ADD_PARTITION 1
42#define BLKPG_DEL_PARTITION 2 42#define BLKPG_DEL_PARTITION 2
43#define BLKPG_RESIZE_PARTITION 3
43 44
44/* Sizes of name fields. Unused at present. */ 45/* Sizes of name fields. Unused at present. */
45#define BLKPG_DEVNAMELTH 64 46#define BLKPG_DEVNAMELTH 64
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index f55ab8cdc106..4d0fb3df2f4a 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -67,7 +67,6 @@ void bsg_job_done(struct bsg_job *job, int result,
67int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name, 67int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name,
68 bsg_job_fn *job_fn, int dd_job_size); 68 bsg_job_fn *job_fn, int dd_job_size);
69void bsg_request_fn(struct request_queue *q); 69void bsg_request_fn(struct request_queue *q);
70void bsg_remove_queue(struct request_queue *q);
71void bsg_goose_queue(struct request_queue *q); 70void bsg_goose_queue(struct request_queue *q);
72 71
73#endif 72#endif
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index ae0aaa9d42fa..4f440b3e89fe 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -97,7 +97,13 @@ struct partition_meta_info {
97 97
98struct hd_struct { 98struct hd_struct {
99 sector_t start_sect; 99 sector_t start_sect;
100 /*
 101 * nr_sects is protected by a sequence counter. A partition may be
 102 * extended while I/O to it is in flight, and the update of nr_sects
 103 * can be non-atomic on 32-bit machines with a 64-bit sector_t.
104 */
100 sector_t nr_sects; 105 sector_t nr_sects;
106 seqcount_t nr_sects_seq;
101 sector_t alignment_offset; 107 sector_t alignment_offset;
102 unsigned int discard_alignment; 108 unsigned int discard_alignment;
103 struct device __dev; 109 struct device __dev;
@@ -647,6 +653,57 @@ static inline void hd_struct_put(struct hd_struct *part)
647 __delete_partition(part); 653 __delete_partition(part);
648} 654}
649 655
656/*
 657 * Any access of part->nr_sects that is not protected by the partition's
 658 * bd_mutex or the gendisk's bdev bd_mutex should be done through this
 659 * accessor function.
660 *
661 * Code written along the lines of i_size_read() and i_size_write().
 662 * The CONFIG_PREEMPT case optimizes for a UP kernel with preemption
 663 * enabled.
664 */
665static inline sector_t part_nr_sects_read(struct hd_struct *part)
666{
667#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
668 sector_t nr_sects;
669 unsigned seq;
670 do {
671 seq = read_seqcount_begin(&part->nr_sects_seq);
672 nr_sects = part->nr_sects;
673 } while (read_seqcount_retry(&part->nr_sects_seq, seq));
674 return nr_sects;
675#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
676 sector_t nr_sects;
677
678 preempt_disable();
679 nr_sects = part->nr_sects;
680 preempt_enable();
681 return nr_sects;
682#else
683 return part->nr_sects;
684#endif
685}
686
687/*
 688 * Should be called with the partition's mutex (typically bd_mutex) held
 689 * to provide mutual exclusion among writers; otherwise the seqcount might
 690 * be left in a wrong state, leaving readers spinning indefinitely.
691 */
692static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
693{
694#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
695 write_seqcount_begin(&part->nr_sects_seq);
696 part->nr_sects = size;
697 write_seqcount_end(&part->nr_sects_seq);
698#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
699 preempt_disable();
700 part->nr_sects = size;
701 preempt_enable();
702#else
703 part->nr_sects = size;
704#endif
705}
706
650#else /* CONFIG_BLOCK */ 707#else /* CONFIG_BLOCK */
651 708
652static inline void printk_all_partitions(void) { } 709static inline void printk_all_partitions(void) { }
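The accessor pair added above follows the i_size_read()/i_size_write() pattern: on 32-bit SMP kernels with a 64-bit sector_t the writer wraps the store in the seqcount and readers retry until they observe a stable value, while 64-bit kernels fall through to a plain load and store. A short usage sketch with hypothetical function names; the writer-side bd_mutex locking mirrors what the resize ioctl above does.

/* Illustrative only: writers serialize on bd_mutex, readers are lockless. */
static void example_set_part_size(struct block_device *bdev,
				  struct hd_struct *part, sector_t new_sects)
{
	mutex_lock(&bdev->bd_mutex);		/* keeps the seqcount balanced */
	part_nr_sects_write(part, new_sects);
	mutex_unlock(&bdev->bd_mutex);
}

static sector_t example_get_part_size(struct hd_struct *part)
{
	return part_nr_sects_read(part);	/* retry loop handled inside */
}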
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 7c08052e3321..39ed62ab5b8a 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -26,7 +26,8 @@ typedef struct mempool_s {
26extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 26extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
27 mempool_free_t *free_fn, void *pool_data); 27 mempool_free_t *free_fn, void *pool_data);
28extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 28extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
29 mempool_free_t *free_fn, void *pool_data, int nid); 29 mempool_free_t *free_fn, void *pool_data,
30 gfp_t gfp_mask, int nid);
30 31
31extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); 32extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask);
32extern void mempool_destroy(mempool_t *pool); 33extern void mempool_destroy(mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index d9049811f352..54990476c049 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
64 mempool_free_t *free_fn, void *pool_data) 64 mempool_free_t *free_fn, void *pool_data)
65{ 65{
66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); 66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
67 GFP_KERNEL, NUMA_NO_NODE);
67} 68}
68EXPORT_SYMBOL(mempool_create); 69EXPORT_SYMBOL(mempool_create);
69 70
70mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 71mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
71 mempool_free_t *free_fn, void *pool_data, int node_id) 72 mempool_free_t *free_fn, void *pool_data,
73 gfp_t gfp_mask, int node_id)
72{ 74{
73 mempool_t *pool; 75 mempool_t *pool;
74 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); 76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
75 if (!pool) 77 if (!pool)
76 return NULL; 78 return NULL;
77 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
78 GFP_KERNEL, node_id); 80 gfp_mask, node_id);
79 if (!pool->elements) { 81 if (!pool->elements) {
80 kfree(pool); 82 kfree(pool);
81 return NULL; 83 return NULL;
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
93 while (pool->curr_nr < pool->min_nr) { 95 while (pool->curr_nr < pool->min_nr) {
94 void *element; 96 void *element;
95 97
96 element = pool->alloc(GFP_KERNEL, pool->pool_data); 98 element = pool->alloc(gfp_mask, pool->pool_data);
97 if (unlikely(!element)) { 99 if (unlikely(!element)) {
98 mempool_destroy(pool); 100 mempool_destroy(pool);
99 return NULL; 101 return NULL;
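With the extra gfp_mask argument, mempool_create_node() callers now choose the allocation context for the pool itself and its pre-filled elements; existing users are converted by passing GFP_KERNEL and NUMA_NO_NODE, as the updated mempool_create() wrapper above shows. A hedged sketch of the new calling convention; the cache pointer, pool size and function name are assumptions for the example.

/* Illustrative only: a per-node pool of slab-backed objects. */
static mempool_t *example_create_pool(struct kmem_cache *my_cachep, int node)
{
	return mempool_create_node(16, mempool_alloc_slab, mempool_free_slab,
				   my_cachep, GFP_KERNEL, node);
}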