author     Linus Torvalds <torvalds@linux-foundation.org>   2014-02-14 13:45:18 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-02-14 13:45:18 -0500
commit     5e57dc81106b942786f5db8e7ab8788bb9319933 (patch)
tree       4533e01e745bba3614c77200b3fd96dd7af7e04e
parent     0d25e3691186f5ae6feb0229717a60a5169dc5b2 (diff)
parent     c8123f8c9cb517403b51aa41c3c46ff5e10b2c17 (diff)
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block IO fixes from Jens Axboe:
 "Second round of updates and fixes for 3.14-rc2. Most of this stuff
  has been queued up for a while. The notable exception is the blk-mq
  changes, which are naturally a bit more in flux still.

  The pull request contains:

   - Two bug fixes for the new immutable vecs, causing crashes with
     raid or swap. From Kent.

   - Various blk-mq tweaks and fixes from Christoph. A fix for
     integrity bio's from Nic.

   - A few bcache fixes from Kent and Darrick Wong.

   - xen-blk{front,back} fixes from David Vrabel, Matt Rushton, Nicolas
     Swenson, and Roger Pau Monne.

   - Fix for a vec miscount with integrity vectors from Martin.

   - Minor annotations or fixes from Masanari Iida and Rashika Kheria.

   - Tweak to null_blk to do more normal FIFO processing of requests
     from Shlomo Pongratz.

   - Elevator switching bypass fix from Tejun.

   - Softlockup in blkdev_issue_discard() fix when !CONFIG_PREEMPT from
     me"

* 'for-linus' of git://git.kernel.dk/linux-block: (31 commits)
  block: add cond_resched() to potentially long running ioctl discard loop
  xen-blkback: init persistent_purge_work work_struct
  blk-mq: pair blk_mq_start_request / blk_mq_requeue_request
  blk-mq: dont assume rq->errors is set when returning an error from ->queue_rq
  block: Fix cloning of discard/write same bios
  block: Fix type mismatch in ssize_t_blk_mq_tag_sysfs_show
  blk-mq: rework flush sequencing logic
  null_blk: use blk_complete_request and blk_mq_complete_request
  virtio_blk: use blk_mq_complete_request
  blk-mq: rework I/O completions
  fs: Add prototype declaration to appropriate header file include/linux/bio.h
  fs: Mark function as static in fs/bio-integrity.c
  block/null_blk: Fix completion processing from LIFO to FIFO
  block: Explicitly handle discard/write same segments
  block: Fix nr_vecs for inline integrity vectors
  blk-mq: Add bio_integrity setup to blk_mq_make_request
  blk-mq: initialize sg_reserved_size
  blk-mq: handle dma_drain_size
  blk-mq: divert __blk_put_request for MQ ops
  blk-mq: support at_head inserations for blk_execute_rq
  ...
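As orientation before the diff: several of the commits above (the I/O completion rework plus the null_blk and virtio_blk conversions) move multiqueue drivers to a split completion model, where the driver signals completion with blk_mq_complete_request() and the actual end-of-I/O work runs in a ->complete (softirq done) callback. The fragment below is only a minimal sketch of that usage, not code from this pull; the mydrv_* names are made up, and it assumes the 3.14-era blk-mq API as it stands after this series.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Softirq-done callback: runs on (or via IPI near) the submitting CPU. */
static void mydrv_softirq_done(struct request *rq)
{
	blk_mq_end_io(rq, rq->errors);
}

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	/*
	 * Hypothetical driver: pretend the hardware finished the request
	 * right away.  A real driver would call blk_mq_complete_request()
	 * from its IRQ handler once the device reports completion.
	 */
	rq->errors = 0;
	blk_mq_complete_request(rq);
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.complete	= mydrv_softirq_done,
};

A real driver would hand an ops structure like this to blk_mq_init_queue() through its struct blk_mq_reg, which is what the virtio_blk and null_blk hunks below do when they set .complete.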
-rw-r--r--  block/blk-core.c                      20
-rw-r--r--  block/blk-exec.c                       2
-rw-r--r--  block/blk-flush.c                    101
-rw-r--r--  block/blk-lib.c                        8
-rw-r--r--  block/blk-merge.c                     91
-rw-r--r--  block/blk-mq-tag.c                     2
-rw-r--r--  block/blk-mq.c                       143
-rw-r--r--  block/blk-mq.h                         4
-rw-r--r--  block/blk-sysfs.c                      2
-rw-r--r--  block/blk-timeout.c                    2
-rw-r--r--  block/blk.h                            2
-rw-r--r--  drivers/block/null_blk.c              97
-rw-r--r--  drivers/block/virtio_blk.c             7
-rw-r--r--  drivers/block/xen-blkback/blkback.c   66
-rw-r--r--  drivers/block/xen-blkback/common.h     5
-rw-r--r--  drivers/block/xen-blkback/xenbus.c    14
-rw-r--r--  drivers/block/xen-blkfront.c          11
-rw-r--r--  drivers/md/bcache/bcache.h             4
-rw-r--r--  drivers/md/bcache/bset.c               7
-rw-r--r--  drivers/md/bcache/btree.c              4
-rw-r--r--  drivers/md/bcache/request.c            6
-rw-r--r--  drivers/md/bcache/sysfs.c              2
-rw-r--r--  fs/bio-integrity.c                    13
-rw-r--r--  fs/bio.c                              15
-rw-r--r--  include/linux/bio.h                   12
-rw-r--r--  include/linux/blk-mq.h                 9
-rw-r--r--  include/linux/blkdev.h                11
-rw-r--r--  include/xen/interface/io/blkif.h      34
-rw-r--r--  lib/percpu_ida.c                       7
29 files changed, 397 insertions, 304 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index c00e0bdeab4a..853f92749202 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
693 if (!uninit_q) 693 if (!uninit_q)
694 return NULL; 694 return NULL;
695 695
696 uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
697 if (!uninit_q->flush_rq)
698 goto out_cleanup_queue;
699
696 q = blk_init_allocated_queue(uninit_q, rfn, lock); 700 q = blk_init_allocated_queue(uninit_q, rfn, lock);
697 if (!q) 701 if (!q)
698 blk_cleanup_queue(uninit_q); 702 goto out_free_flush_rq;
699
700 return q; 703 return q;
704
705out_free_flush_rq:
706 kfree(uninit_q->flush_rq);
707out_cleanup_queue:
708 blk_cleanup_queue(uninit_q);
709 return NULL;
701} 710}
702EXPORT_SYMBOL(blk_init_queue_node); 711EXPORT_SYMBOL(blk_init_queue_node);
703 712
@@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
1127struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1136struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1128{ 1137{
1129 if (q->mq_ops) 1138 if (q->mq_ops)
1130 return blk_mq_alloc_request(q, rw, gfp_mask, false); 1139 return blk_mq_alloc_request(q, rw, gfp_mask);
1131 else 1140 else
1132 return blk_old_get_request(q, rw, gfp_mask); 1141 return blk_old_get_request(q, rw, gfp_mask);
1133} 1142}
@@ -1278,6 +1287,11 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1278 if (unlikely(!q)) 1287 if (unlikely(!q))
1279 return; 1288 return;
1280 1289
1290 if (q->mq_ops) {
1291 blk_mq_free_request(req);
1292 return;
1293 }
1294
1281 blk_pm_put_request(req); 1295 blk_pm_put_request(req);
1282 1296
1283 elv_completed_request(q, req); 1297 elv_completed_request(q, req);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index bbfc072a79c2..c68613bb4c79 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
65 * be resued after dying flag is set 65 * be resued after dying flag is set
66 */ 66 */
67 if (q->mq_ops) { 67 if (q->mq_ops) {
68 blk_mq_insert_request(q, rq, true); 68 blk_mq_insert_request(q, rq, at_head, true);
69 return; 69 return;
70 } 70 }
71 71
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9288aaf35c21..66e2b697f5db 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,20 +130,26 @@ static void blk_flush_restore_request(struct request *rq)
130 blk_clear_rq_complete(rq); 130 blk_clear_rq_complete(rq);
131} 131}
132 132
133static void mq_flush_data_run(struct work_struct *work) 133static void mq_flush_run(struct work_struct *work)
134{ 134{
135 struct request *rq; 135 struct request *rq;
136 136
137 rq = container_of(work, struct request, mq_flush_data); 137 rq = container_of(work, struct request, mq_flush_work);
138 138
139 memset(&rq->csd, 0, sizeof(rq->csd)); 139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_run_request(rq, true, false); 140 blk_mq_run_request(rq, true, false);
141} 141}
142 142
143static void blk_mq_flush_data_insert(struct request *rq) 143static bool blk_flush_queue_rq(struct request *rq)
144{ 144{
145 INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); 145 if (rq->q->mq_ops) {
146 kblockd_schedule_work(rq->q, &rq->mq_flush_data); 146 INIT_WORK(&rq->mq_flush_work, mq_flush_run);
147 kblockd_schedule_work(rq->q, &rq->mq_flush_work);
148 return false;
149 } else {
150 list_add_tail(&rq->queuelist, &rq->q->queue_head);
151 return true;
152 }
147} 153}
148 154
149/** 155/**
@@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
187 193
188 case REQ_FSEQ_DATA: 194 case REQ_FSEQ_DATA:
189 list_move_tail(&rq->flush.list, &q->flush_data_in_flight); 195 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
190 if (q->mq_ops) 196 queued = blk_flush_queue_rq(rq);
191 blk_mq_flush_data_insert(rq);
192 else {
193 list_add(&rq->queuelist, &q->queue_head);
194 queued = true;
195 }
196 break; 197 break;
197 198
198 case REQ_FSEQ_DONE: 199 case REQ_FSEQ_DONE:
@@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
216 } 217 }
217 218
218 kicked = blk_kick_flush(q); 219 kicked = blk_kick_flush(q);
219 /* blk_mq_run_flush will run queue */
220 if (q->mq_ops)
221 return queued;
222 return kicked | queued; 220 return kicked | queued;
223} 221}
224 222
@@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error)
230 struct request *rq, *n; 228 struct request *rq, *n;
231 unsigned long flags = 0; 229 unsigned long flags = 0;
232 230
233 if (q->mq_ops) { 231 if (q->mq_ops)
234 blk_mq_free_request(flush_rq);
235 spin_lock_irqsave(&q->mq_flush_lock, flags); 232 spin_lock_irqsave(&q->mq_flush_lock, flags);
236 } 233
237 running = &q->flush_queue[q->flush_running_idx]; 234 running = &q->flush_queue[q->flush_running_idx];
238 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 235 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
239 236
@@ -263,49 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error)
263 * kblockd. 260 * kblockd.
264 */ 261 */
265 if (queued || q->flush_queue_delayed) { 262 if (queued || q->flush_queue_delayed) {
266 if (!q->mq_ops) 263 WARN_ON(q->mq_ops);
267 blk_run_queue_async(q); 264 blk_run_queue_async(q);
268 else
269 /*
270 * This can be optimized to only run queues with requests
271 * queued if necessary.
272 */
273 blk_mq_run_queues(q, true);
274 } 265 }
275 q->flush_queue_delayed = 0; 266 q->flush_queue_delayed = 0;
276 if (q->mq_ops) 267 if (q->mq_ops)
277 spin_unlock_irqrestore(&q->mq_flush_lock, flags); 268 spin_unlock_irqrestore(&q->mq_flush_lock, flags);
278} 269}
279 270
280static void mq_flush_work(struct work_struct *work)
281{
282 struct request_queue *q;
283 struct request *rq;
284
285 q = container_of(work, struct request_queue, mq_flush_work);
286
287 /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
288 rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
289 __GFP_WAIT|GFP_ATOMIC, true);
290 rq->cmd_type = REQ_TYPE_FS;
291 rq->end_io = flush_end_io;
292
293 blk_mq_run_request(rq, true, false);
294}
295
296/*
297 * We can't directly use q->flush_rq, because it doesn't have tag and is not in
298 * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
299 * so offload the work to workqueue.
300 *
301 * Note: we assume a flush request finished in any hardware queue will flush
302 * the whole disk cache.
303 */
304static void mq_run_flush(struct request_queue *q)
305{
306 kblockd_schedule_work(q, &q->mq_flush_work);
307}
308
309/** 271/**
310 * blk_kick_flush - consider issuing flush request 272 * blk_kick_flush - consider issuing flush request
311 * @q: request_queue being kicked 273 * @q: request_queue being kicked
@@ -340,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q)
340 * different from running_idx, which means flush is in flight. 302 * different from running_idx, which means flush is in flight.
341 */ 303 */
342 q->flush_pending_idx ^= 1; 304 q->flush_pending_idx ^= 1;
305
343 if (q->mq_ops) { 306 if (q->mq_ops) {
344 mq_run_flush(q); 307 struct blk_mq_ctx *ctx = first_rq->mq_ctx;
345 return true; 308 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
309
310 blk_mq_rq_init(hctx, q->flush_rq);
311 q->flush_rq->mq_ctx = ctx;
312
313 /*
314 * Reuse the tag value from the fist waiting request,
315 * with blk-mq the tag is generated during request
316 * allocation and drivers can rely on it being inside
317 * the range they asked for.
318 */
319 q->flush_rq->tag = first_rq->tag;
320 } else {
321 blk_rq_init(q, q->flush_rq);
346 } 322 }
347 323
348 blk_rq_init(q, &q->flush_rq); 324 q->flush_rq->cmd_type = REQ_TYPE_FS;
349 q->flush_rq.cmd_type = REQ_TYPE_FS; 325 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
350 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 326 q->flush_rq->rq_disk = first_rq->rq_disk;
351 q->flush_rq.rq_disk = first_rq->rq_disk; 327 q->flush_rq->end_io = flush_end_io;
352 q->flush_rq.end_io = flush_end_io;
353 328
354 list_add_tail(&q->flush_rq.queuelist, &q->queue_head); 329 return blk_flush_queue_rq(q->flush_rq);
355 return true;
356} 330}
357 331
358static void flush_data_end_io(struct request *rq, int error) 332static void flush_data_end_io(struct request *rq, int error)
@@ -558,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush);
558void blk_mq_init_flush(struct request_queue *q) 532void blk_mq_init_flush(struct request_queue *q)
559{ 533{
560 spin_lock_init(&q->mq_flush_lock); 534 spin_lock_init(&q->mq_flush_lock);
561 INIT_WORK(&q->mq_flush_work, mq_flush_work);
562} 535}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2da76c999ef3..97a733cf3d5f 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -119,6 +119,14 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
119 119
120 atomic_inc(&bb.done); 120 atomic_inc(&bb.done);
121 submit_bio(type, bio); 121 submit_bio(type, bio);
122
123 /*
124 * We can loop for a long time in here, if someone does
125 * full device discards (like mkfs). Be nice and allow
126 * us to schedule out to avoid softlocking if preempt
127 * is disabled.
128 */
129 cond_resched();
122 } 130 }
123 blk_finish_plug(&plug); 131 blk_finish_plug(&plug);
124 132
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 8f8adaa95466..6c583f9c5b65 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -21,6 +21,16 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
21 if (!bio) 21 if (!bio)
22 return 0; 22 return 0;
23 23
24 /*
25 * This should probably be returning 0, but blk_add_request_payload()
26 * (Christoph!!!!)
27 */
28 if (bio->bi_rw & REQ_DISCARD)
29 return 1;
30
31 if (bio->bi_rw & REQ_WRITE_SAME)
32 return 1;
33
24 fbio = bio; 34 fbio = bio;
25 cluster = blk_queue_cluster(q); 35 cluster = blk_queue_cluster(q);
26 seg_size = 0; 36 seg_size = 0;
@@ -161,30 +171,60 @@ new_segment:
161 *bvprv = *bvec; 171 *bvprv = *bvec;
162} 172}
163 173
164/* 174static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
165 * map a request to scatterlist, return number of sg entries setup. Caller 175 struct scatterlist *sglist,
166 * must make sure sg can hold rq->nr_phys_segments entries 176 struct scatterlist **sg)
167 */
168int blk_rq_map_sg(struct request_queue *q, struct request *rq,
169 struct scatterlist *sglist)
170{ 177{
171 struct bio_vec bvec, bvprv = { NULL }; 178 struct bio_vec bvec, bvprv = { NULL };
172 struct req_iterator iter; 179 struct bvec_iter iter;
173 struct scatterlist *sg;
174 int nsegs, cluster; 180 int nsegs, cluster;
175 181
176 nsegs = 0; 182 nsegs = 0;
177 cluster = blk_queue_cluster(q); 183 cluster = blk_queue_cluster(q);
178 184
179 /* 185 if (bio->bi_rw & REQ_DISCARD) {
180 * for each bio in rq 186 /*
181 */ 187 * This is a hack - drivers should be neither modifying the
182 sg = NULL; 188 * biovec, nor relying on bi_vcnt - but because of
183 rq_for_each_segment(bvec, rq, iter) { 189 * blk_add_request_payload(), a discard bio may or may not have
184 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg, 190 * a payload we need to set up here (thank you Christoph) and
185 &nsegs, &cluster); 191 * bi_vcnt is really the only way of telling if we need to.
186 } /* segments in rq */ 192 */
193
194 if (bio->bi_vcnt)
195 goto single_segment;
196
197 return 0;
198 }
199
200 if (bio->bi_rw & REQ_WRITE_SAME) {
201single_segment:
202 *sg = sglist;
203 bvec = bio_iovec(bio);
204 sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
205 return 1;
206 }
207
208 for_each_bio(bio)
209 bio_for_each_segment(bvec, bio, iter)
210 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
211 &nsegs, &cluster);
187 212
213 return nsegs;
214}
215
216/*
217 * map a request to scatterlist, return number of sg entries setup. Caller
218 * must make sure sg can hold rq->nr_phys_segments entries
219 */
220int blk_rq_map_sg(struct request_queue *q, struct request *rq,
221 struct scatterlist *sglist)
222{
223 struct scatterlist *sg = NULL;
224 int nsegs = 0;
225
226 if (rq->bio)
227 nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
188 228
189 if (unlikely(rq->cmd_flags & REQ_COPY_USER) && 229 if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
190 (blk_rq_bytes(rq) & q->dma_pad_mask)) { 230 (blk_rq_bytes(rq) & q->dma_pad_mask)) {
@@ -230,20 +270,13 @@ EXPORT_SYMBOL(blk_rq_map_sg);
230int blk_bio_map_sg(struct request_queue *q, struct bio *bio, 270int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
231 struct scatterlist *sglist) 271 struct scatterlist *sglist)
232{ 272{
233 struct bio_vec bvec, bvprv = { NULL }; 273 struct scatterlist *sg = NULL;
234 struct scatterlist *sg; 274 int nsegs;
235 int nsegs, cluster; 275 struct bio *next = bio->bi_next;
236 struct bvec_iter iter; 276 bio->bi_next = NULL;
237
238 nsegs = 0;
239 cluster = blk_queue_cluster(q);
240
241 sg = NULL;
242 bio_for_each_segment(bvec, bio, iter) {
243 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, &sg,
244 &nsegs, &cluster);
245 } /* segments in bio */
246 277
278 nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
279 bio->bi_next = next;
247 if (sg) 280 if (sg)
248 sg_mark_end(sg); 281 sg_mark_end(sg);
249 282
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 5d70edc9855f..83ae96c51a27 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -184,7 +184,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
184ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 184ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
185{ 185{
186 char *orig_page = page; 186 char *orig_page = page;
187 int cpu; 187 unsigned int cpu;
188 188
189 if (!tags) 189 if (!tags)
190 return 0; 190 return 0;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 57039fcd9c93..1fa9dd153fde 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -226,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
226 return rq; 226 return rq;
227} 227}
228 228
229struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 229struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
230 gfp_t gfp, bool reserved)
231{ 230{
232 struct request *rq; 231 struct request *rq;
233 232
234 if (blk_mq_queue_enter(q)) 233 if (blk_mq_queue_enter(q))
235 return NULL; 234 return NULL;
236 235
237 rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); 236 rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
238 if (rq) 237 if (rq)
239 blk_mq_put_ctx(rq->mq_ctx); 238 blk_mq_put_ctx(rq->mq_ctx);
240 return rq; 239 return rq;
@@ -258,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
258/* 257/*
259 * Re-init and set pdu, if we have it 258 * Re-init and set pdu, if we have it
260 */ 259 */
261static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) 260void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
262{ 261{
263 blk_rq_init(hctx->queue, rq); 262 blk_rq_init(hctx->queue, rq);
264 263
@@ -305,7 +304,7 @@ static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
305 bio_endio(bio, error); 304 bio_endio(bio, error);
306} 305}
307 306
308void blk_mq_complete_request(struct request *rq, int error) 307void blk_mq_end_io(struct request *rq, int error)
309{ 308{
310 struct bio *bio = rq->bio; 309 struct bio *bio = rq->bio;
311 unsigned int bytes = 0; 310 unsigned int bytes = 0;
@@ -330,48 +329,55 @@ void blk_mq_complete_request(struct request *rq, int error)
330 else 329 else
331 blk_mq_free_request(rq); 330 blk_mq_free_request(rq);
332} 331}
332EXPORT_SYMBOL(blk_mq_end_io);
333 333
334void __blk_mq_end_io(struct request *rq, int error) 334static void __blk_mq_complete_request_remote(void *data)
335{
336 if (!blk_mark_rq_complete(rq))
337 blk_mq_complete_request(rq, error);
338}
339
340static void blk_mq_end_io_remote(void *data)
341{ 335{
342 struct request *rq = data; 336 struct request *rq = data;
343 337
344 __blk_mq_end_io(rq, rq->errors); 338 rq->q->softirq_done_fn(rq);
345} 339}
346 340
347/* 341void __blk_mq_complete_request(struct request *rq)
348 * End IO on this request on a multiqueue enabled driver. We'll either do
349 * it directly inline, or punt to a local IPI handler on the matching
350 * remote CPU.
351 */
352void blk_mq_end_io(struct request *rq, int error)
353{ 342{
354 struct blk_mq_ctx *ctx = rq->mq_ctx; 343 struct blk_mq_ctx *ctx = rq->mq_ctx;
355 int cpu; 344 int cpu;
356 345
357 if (!ctx->ipi_redirect) 346 if (!ctx->ipi_redirect) {
358 return __blk_mq_end_io(rq, error); 347 rq->q->softirq_done_fn(rq);
348 return;
349 }
359 350
360 cpu = get_cpu(); 351 cpu = get_cpu();
361 if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { 352 if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
362 rq->errors = error; 353 rq->csd.func = __blk_mq_complete_request_remote;
363 rq->csd.func = blk_mq_end_io_remote;
364 rq->csd.info = rq; 354 rq->csd.info = rq;
365 rq->csd.flags = 0; 355 rq->csd.flags = 0;
366 __smp_call_function_single(ctx->cpu, &rq->csd, 0); 356 __smp_call_function_single(ctx->cpu, &rq->csd, 0);
367 } else { 357 } else {
368 __blk_mq_end_io(rq, error); 358 rq->q->softirq_done_fn(rq);
369 } 359 }
370 put_cpu(); 360 put_cpu();
371} 361}
372EXPORT_SYMBOL(blk_mq_end_io);
373 362
374static void blk_mq_start_request(struct request *rq) 363/**
364 * blk_mq_complete_request - end I/O on a request
365 * @rq: the request being processed
366 *
367 * Description:
368 * Ends all I/O on a request. It does not handle partial completions.
369 * The actual completion happens out-of-order, through a IPI handler.
370 **/
371void blk_mq_complete_request(struct request *rq)
372{
373 if (unlikely(blk_should_fake_timeout(rq->q)))
374 return;
375 if (!blk_mark_rq_complete(rq))
376 __blk_mq_complete_request(rq);
377}
378EXPORT_SYMBOL(blk_mq_complete_request);
379
380static void blk_mq_start_request(struct request *rq, bool last)
375{ 381{
376 struct request_queue *q = rq->q; 382 struct request_queue *q = rq->q;
377 383
@@ -384,6 +390,25 @@ static void blk_mq_start_request(struct request *rq)
384 */ 390 */
385 rq->deadline = jiffies + q->rq_timeout; 391 rq->deadline = jiffies + q->rq_timeout;
386 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 392 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
393
394 if (q->dma_drain_size && blk_rq_bytes(rq)) {
395 /*
396 * Make sure space for the drain appears. We know we can do
397 * this because max_hw_segments has been adjusted to be one
398 * fewer than the device can handle.
399 */
400 rq->nr_phys_segments++;
401 }
402
403 /*
404 * Flag the last request in the series so that drivers know when IO
405 * should be kicked off, if they don't do it on a per-request basis.
406 *
407 * Note: the flag isn't the only condition drivers should do kick off.
408 * If drive is busy, the last request might not have the bit set.
409 */
410 if (last)
411 rq->cmd_flags |= REQ_END;
387} 412}
388 413
389static void blk_mq_requeue_request(struct request *rq) 414static void blk_mq_requeue_request(struct request *rq)
@@ -392,6 +417,11 @@ static void blk_mq_requeue_request(struct request *rq)
392 417
393 trace_block_rq_requeue(q, rq); 418 trace_block_rq_requeue(q, rq);
394 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 419 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
420
421 rq->cmd_flags &= ~REQ_END;
422
423 if (q->dma_drain_size && blk_rq_bytes(rq))
424 rq->nr_phys_segments--;
395} 425}
396 426
397struct blk_mq_timeout_data { 427struct blk_mq_timeout_data {
@@ -559,19 +589,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
559 589
560 rq = list_first_entry(&rq_list, struct request, queuelist); 590 rq = list_first_entry(&rq_list, struct request, queuelist);
561 list_del_init(&rq->queuelist); 591 list_del_init(&rq->queuelist);
562 blk_mq_start_request(rq);
563 592
564 /* 593 blk_mq_start_request(rq, list_empty(&rq_list));
565 * Last request in the series. Flag it as such, this
566 * enables drivers to know when IO should be kicked off,
567 * if they don't do it on a per-request basis.
568 *
569 * Note: the flag isn't the only condition drivers
570 * should do kick off. If drive is busy, the last
571 * request might not have the bit set.
572 */
573 if (list_empty(&rq_list))
574 rq->cmd_flags |= REQ_END;
575 594
576 ret = q->mq_ops->queue_rq(hctx, rq); 595 ret = q->mq_ops->queue_rq(hctx, rq);
577 switch (ret) { 596 switch (ret) {
@@ -589,8 +608,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
589 break; 608 break;
590 default: 609 default:
591 pr_err("blk-mq: bad return on queue: %d\n", ret); 610 pr_err("blk-mq: bad return on queue: %d\n", ret);
592 rq->errors = -EIO;
593 case BLK_MQ_RQ_QUEUE_ERROR: 611 case BLK_MQ_RQ_QUEUE_ERROR:
612 rq->errors = -EIO;
594 blk_mq_end_io(rq, rq->errors); 613 blk_mq_end_io(rq, rq->errors);
595 break; 614 break;
596 } 615 }
@@ -693,13 +712,16 @@ static void blk_mq_work_fn(struct work_struct *work)
693} 712}
694 713
695static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 714static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
696 struct request *rq) 715 struct request *rq, bool at_head)
697{ 716{
698 struct blk_mq_ctx *ctx = rq->mq_ctx; 717 struct blk_mq_ctx *ctx = rq->mq_ctx;
699 718
700 trace_block_rq_insert(hctx->queue, rq); 719 trace_block_rq_insert(hctx->queue, rq);
701 720
702 list_add_tail(&rq->queuelist, &ctx->rq_list); 721 if (at_head)
722 list_add(&rq->queuelist, &ctx->rq_list);
723 else
724 list_add_tail(&rq->queuelist, &ctx->rq_list);
703 blk_mq_hctx_mark_pending(hctx, ctx); 725 blk_mq_hctx_mark_pending(hctx, ctx);
704 726
705 /* 727 /*
@@ -709,7 +731,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
709} 731}
710 732
711void blk_mq_insert_request(struct request_queue *q, struct request *rq, 733void blk_mq_insert_request(struct request_queue *q, struct request *rq,
712 bool run_queue) 734 bool at_head, bool run_queue)
713{ 735{
714 struct blk_mq_hw_ctx *hctx; 736 struct blk_mq_hw_ctx *hctx;
715 struct blk_mq_ctx *ctx, *current_ctx; 737 struct blk_mq_ctx *ctx, *current_ctx;
@@ -728,7 +750,7 @@ void blk_mq_insert_request(struct request_queue *q, struct request *rq,
728 rq->mq_ctx = ctx; 750 rq->mq_ctx = ctx;
729 } 751 }
730 spin_lock(&ctx->lock); 752 spin_lock(&ctx->lock);
731 __blk_mq_insert_request(hctx, rq); 753 __blk_mq_insert_request(hctx, rq, at_head);
732 spin_unlock(&ctx->lock); 754 spin_unlock(&ctx->lock);
733 755
734 blk_mq_put_ctx(current_ctx); 756 blk_mq_put_ctx(current_ctx);
@@ -760,7 +782,7 @@ void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
760 782
761 /* ctx->cpu might be offline */ 783 /* ctx->cpu might be offline */
762 spin_lock(&ctx->lock); 784 spin_lock(&ctx->lock);
763 __blk_mq_insert_request(hctx, rq); 785 __blk_mq_insert_request(hctx, rq, false);
764 spin_unlock(&ctx->lock); 786 spin_unlock(&ctx->lock);
765 787
766 blk_mq_put_ctx(current_ctx); 788 blk_mq_put_ctx(current_ctx);
@@ -798,7 +820,7 @@ static void blk_mq_insert_requests(struct request_queue *q,
798 rq = list_first_entry(list, struct request, queuelist); 820 rq = list_first_entry(list, struct request, queuelist);
799 list_del_init(&rq->queuelist); 821 list_del_init(&rq->queuelist);
800 rq->mq_ctx = ctx; 822 rq->mq_ctx = ctx;
801 __blk_mq_insert_request(hctx, rq); 823 __blk_mq_insert_request(hctx, rq, false);
802 } 824 }
803 spin_unlock(&ctx->lock); 825 spin_unlock(&ctx->lock);
804 826
@@ -888,6 +910,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
888 910
889 blk_queue_bounce(q, &bio); 911 blk_queue_bounce(q, &bio);
890 912
913 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
914 bio_endio(bio, -EIO);
915 return;
916 }
917
891 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 918 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
892 return; 919 return;
893 920
@@ -950,7 +977,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
950 __blk_mq_free_request(hctx, ctx, rq); 977 __blk_mq_free_request(hctx, ctx, rq);
951 else { 978 else {
952 blk_mq_bio_to_request(rq, bio); 979 blk_mq_bio_to_request(rq, bio);
953 __blk_mq_insert_request(hctx, rq); 980 __blk_mq_insert_request(hctx, rq, false);
954 } 981 }
955 982
956 spin_unlock(&ctx->lock); 983 spin_unlock(&ctx->lock);
@@ -1309,15 +1336,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
1309 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1336 reg->queue_depth = BLK_MQ_MAX_DEPTH;
1310 } 1337 }
1311 1338
1312 /*
1313 * Set aside a tag for flush requests. It will only be used while
1314 * another flush request is in progress but outside the driver.
1315 *
1316 * TODO: only allocate if flushes are supported
1317 */
1318 reg->queue_depth++;
1319 reg->reserved_tags++;
1320
1321 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1339 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
1322 return ERR_PTR(-EINVAL); 1340 return ERR_PTR(-EINVAL);
1323 1341
@@ -1360,17 +1378,27 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
1360 q->mq_ops = reg->ops; 1378 q->mq_ops = reg->ops;
1361 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1379 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1362 1380
1381 q->sg_reserved_size = INT_MAX;
1382
1363 blk_queue_make_request(q, blk_mq_make_request); 1383 blk_queue_make_request(q, blk_mq_make_request);
1364 blk_queue_rq_timed_out(q, reg->ops->timeout); 1384 blk_queue_rq_timed_out(q, reg->ops->timeout);
1365 if (reg->timeout) 1385 if (reg->timeout)
1366 blk_queue_rq_timeout(q, reg->timeout); 1386 blk_queue_rq_timeout(q, reg->timeout);
1367 1387
1388 if (reg->ops->complete)
1389 blk_queue_softirq_done(q, reg->ops->complete);
1390
1368 blk_mq_init_flush(q); 1391 blk_mq_init_flush(q);
1369 blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1392 blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
1370 1393
1371 if (blk_mq_init_hw_queues(q, reg, driver_data)) 1394 q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
1395 cache_line_size()), GFP_KERNEL);
1396 if (!q->flush_rq)
1372 goto err_hw; 1397 goto err_hw;
1373 1398
1399 if (blk_mq_init_hw_queues(q, reg, driver_data))
1400 goto err_flush_rq;
1401
1374 blk_mq_map_swqueue(q); 1402 blk_mq_map_swqueue(q);
1375 1403
1376 mutex_lock(&all_q_mutex); 1404 mutex_lock(&all_q_mutex);
@@ -1378,6 +1406,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
1378 mutex_unlock(&all_q_mutex); 1406 mutex_unlock(&all_q_mutex);
1379 1407
1380 return q; 1408 return q;
1409
1410err_flush_rq:
1411 kfree(q->flush_rq);
1381err_hw: 1412err_hw:
1382 kfree(q->mq_map); 1413 kfree(q->mq_map);
1383err_map: 1414err_map:
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 5c3917984b00..ed0035cd458e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -22,13 +22,13 @@ struct blk_mq_ctx {
22 struct kobject kobj; 22 struct kobject kobj;
23}; 23};
24 24
25void __blk_mq_end_io(struct request *rq, int error); 25void __blk_mq_complete_request(struct request *rq);
26void blk_mq_complete_request(struct request *rq, int error);
27void blk_mq_run_request(struct request *rq, bool run_queue, bool async); 26void blk_mq_run_request(struct request *rq, bool run_queue, bool async);
28void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 27void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
29void blk_mq_init_flush(struct request_queue *q); 28void blk_mq_init_flush(struct request_queue *q);
30void blk_mq_drain_queue(struct request_queue *q); 29void blk_mq_drain_queue(struct request_queue *q);
31void blk_mq_free_queue(struct request_queue *q); 30void blk_mq_free_queue(struct request_queue *q);
31void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
32 32
33/* 33/*
34 * CPU hotplug helpers 34 * CPU hotplug helpers
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8095c4a21fc0..7500f876dae4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -549,6 +549,8 @@ static void blk_release_queue(struct kobject *kobj)
549 if (q->mq_ops) 549 if (q->mq_ops)
550 blk_mq_free_queue(q); 550 blk_mq_free_queue(q);
551 551
552 kfree(q->flush_rq);
553
552 blk_trace_shutdown(q); 554 blk_trace_shutdown(q);
553 555
554 bdi_destroy(&q->backing_dev_info); 556 bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index bba81c9348e1..d96f7061c6fd 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -91,7 +91,7 @@ static void blk_rq_timed_out(struct request *req)
91 case BLK_EH_HANDLED: 91 case BLK_EH_HANDLED:
92 /* Can we use req->errors here? */ 92 /* Can we use req->errors here? */
93 if (q->mq_ops) 93 if (q->mq_ops)
94 blk_mq_complete_request(req, req->errors); 94 __blk_mq_complete_request(req);
95 else 95 else
96 __blk_complete_request(req); 96 __blk_complete_request(req);
97 break; 97 break;
diff --git a/block/blk.h b/block/blk.h
index c90e1d8f7a2b..d23b415b8a28 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -113,7 +113,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
113 q->flush_queue_delayed = 1; 113 q->flush_queue_delayed = 1;
114 return NULL; 114 return NULL;
115 } 115 }
116 if (unlikely(blk_queue_dying(q)) || 116 if (unlikely(blk_queue_bypass(q)) ||
117 !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) 117 !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
118 return NULL; 118 return NULL;
119 } 119 }
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 3107282a9741..091b9ea14feb 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -60,7 +60,9 @@ enum {
60 NULL_IRQ_NONE = 0, 60 NULL_IRQ_NONE = 0,
61 NULL_IRQ_SOFTIRQ = 1, 61 NULL_IRQ_SOFTIRQ = 1,
62 NULL_IRQ_TIMER = 2, 62 NULL_IRQ_TIMER = 2,
63};
63 64
65enum {
64 NULL_Q_BIO = 0, 66 NULL_Q_BIO = 0,
65 NULL_Q_RQ = 1, 67 NULL_Q_RQ = 1,
66 NULL_Q_MQ = 2, 68 NULL_Q_MQ = 2,
@@ -172,18 +174,20 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
172 174
173static void end_cmd(struct nullb_cmd *cmd) 175static void end_cmd(struct nullb_cmd *cmd)
174{ 176{
175 if (cmd->rq) { 177 switch (queue_mode) {
176 if (queue_mode == NULL_Q_MQ) 178 case NULL_Q_MQ:
177 blk_mq_end_io(cmd->rq, 0); 179 blk_mq_end_io(cmd->rq, 0);
178 else { 180 return;
179 INIT_LIST_HEAD(&cmd->rq->queuelist); 181 case NULL_Q_RQ:
180 blk_end_request_all(cmd->rq, 0); 182 INIT_LIST_HEAD(&cmd->rq->queuelist);
181 } 183 blk_end_request_all(cmd->rq, 0);
182 } else if (cmd->bio) 184 break;
185 case NULL_Q_BIO:
183 bio_endio(cmd->bio, 0); 186 bio_endio(cmd->bio, 0);
187 break;
188 }
184 189
185 if (queue_mode != NULL_Q_MQ) 190 free_cmd(cmd);
186 free_cmd(cmd);
187} 191}
188 192
189static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) 193static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
@@ -195,6 +199,7 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
195 cq = &per_cpu(completion_queues, smp_processor_id()); 199 cq = &per_cpu(completion_queues, smp_processor_id());
196 200
197 while ((entry = llist_del_all(&cq->list)) != NULL) { 201 while ((entry = llist_del_all(&cq->list)) != NULL) {
202 entry = llist_reverse_order(entry);
198 do { 203 do {
199 cmd = container_of(entry, struct nullb_cmd, ll_list); 204 cmd = container_of(entry, struct nullb_cmd, ll_list);
200 end_cmd(cmd); 205 end_cmd(cmd);
@@ -221,61 +226,31 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
221 226
222static void null_softirq_done_fn(struct request *rq) 227static void null_softirq_done_fn(struct request *rq)
223{ 228{
224 blk_end_request_all(rq, 0); 229 end_cmd(rq->special);
225}
226
227#ifdef CONFIG_SMP
228
229static void null_ipi_cmd_end_io(void *data)
230{
231 struct completion_queue *cq;
232 struct llist_node *entry, *next;
233 struct nullb_cmd *cmd;
234
235 cq = &per_cpu(completion_queues, smp_processor_id());
236
237 entry = llist_del_all(&cq->list);
238
239 while (entry) {
240 next = entry->next;
241 cmd = llist_entry(entry, struct nullb_cmd, ll_list);
242 end_cmd(cmd);
243 entry = next;
244 }
245}
246
247static void null_cmd_end_ipi(struct nullb_cmd *cmd)
248{
249 struct call_single_data *data = &cmd->csd;
250 int cpu = get_cpu();
251 struct completion_queue *cq = &per_cpu(completion_queues, cpu);
252
253 cmd->ll_list.next = NULL;
254
255 if (llist_add(&cmd->ll_list, &cq->list)) {
256 data->func = null_ipi_cmd_end_io;
257 data->flags = 0;
258 __smp_call_function_single(cpu, data, 0);
259 }
260
261 put_cpu();
262} 230}
263 231
264#endif /* CONFIG_SMP */
265
266static inline void null_handle_cmd(struct nullb_cmd *cmd) 232static inline void null_handle_cmd(struct nullb_cmd *cmd)
267{ 233{
268 /* Complete IO by inline, softirq or timer */ 234 /* Complete IO by inline, softirq or timer */
269 switch (irqmode) { 235 switch (irqmode) {
270 case NULL_IRQ_NONE:
271 end_cmd(cmd);
272 break;
273 case NULL_IRQ_SOFTIRQ: 236 case NULL_IRQ_SOFTIRQ:
274#ifdef CONFIG_SMP 237 switch (queue_mode) {
275 null_cmd_end_ipi(cmd); 238 case NULL_Q_MQ:
276#else 239 blk_mq_complete_request(cmd->rq);
240 break;
241 case NULL_Q_RQ:
242 blk_complete_request(cmd->rq);
243 break;
244 case NULL_Q_BIO:
245 /*
246 * XXX: no proper submitting cpu information available.
247 */
248 end_cmd(cmd);
249 break;
250 }
251 break;
252 case NULL_IRQ_NONE:
277 end_cmd(cmd); 253 end_cmd(cmd);
278#endif
279 break; 254 break;
280 case NULL_IRQ_TIMER: 255 case NULL_IRQ_TIMER:
281 null_cmd_end_timer(cmd); 256 null_cmd_end_timer(cmd);
@@ -411,6 +386,7 @@ static struct blk_mq_ops null_mq_ops = {
411 .queue_rq = null_queue_rq, 386 .queue_rq = null_queue_rq,
412 .map_queue = blk_mq_map_queue, 387 .map_queue = blk_mq_map_queue,
413 .init_hctx = null_init_hctx, 388 .init_hctx = null_init_hctx,
389 .complete = null_softirq_done_fn,
414}; 390};
415 391
416static struct blk_mq_reg null_mq_reg = { 392static struct blk_mq_reg null_mq_reg = {
@@ -609,13 +585,6 @@ static int __init null_init(void)
609{ 585{
610 unsigned int i; 586 unsigned int i;
611 587
612#if !defined(CONFIG_SMP)
613 if (irqmode == NULL_IRQ_SOFTIRQ) {
614 pr_warn("null_blk: softirq completions not available.\n");
615 pr_warn("null_blk: using direct completions.\n");
616 irqmode = NULL_IRQ_NONE;
617 }
618#endif
619 if (bs > PAGE_SIZE) { 588 if (bs > PAGE_SIZE) {
620 pr_warn("null_blk: invalid block size\n"); 589 pr_warn("null_blk: invalid block size\n");
621 pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); 590 pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6a680d4de7f1..b1cb3f4c4db4 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -110,9 +110,9 @@ static int __virtblk_add_req(struct virtqueue *vq,
110 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); 110 return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
111} 111}
112 112
113static inline void virtblk_request_done(struct virtblk_req *vbr) 113static inline void virtblk_request_done(struct request *req)
114{ 114{
115 struct request *req = vbr->req; 115 struct virtblk_req *vbr = req->special;
116 int error = virtblk_result(vbr); 116 int error = virtblk_result(vbr);
117 117
118 if (req->cmd_type == REQ_TYPE_BLOCK_PC) { 118 if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -138,7 +138,7 @@ static void virtblk_done(struct virtqueue *vq)
138 do { 138 do {
139 virtqueue_disable_cb(vq); 139 virtqueue_disable_cb(vq);
140 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { 140 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
141 virtblk_request_done(vbr); 141 blk_mq_complete_request(vbr->req);
142 req_done = true; 142 req_done = true;
143 } 143 }
144 if (unlikely(virtqueue_is_broken(vq))) 144 if (unlikely(virtqueue_is_broken(vq)))
@@ -479,6 +479,7 @@ static struct blk_mq_ops virtio_mq_ops = {
479 .map_queue = blk_mq_map_queue, 479 .map_queue = blk_mq_map_queue,
480 .alloc_hctx = blk_mq_alloc_single_hw_queue, 480 .alloc_hctx = blk_mq_alloc_single_hw_queue,
481 .free_hctx = blk_mq_free_single_hw_queue, 481 .free_hctx = blk_mq_free_single_hw_queue,
482 .complete = virtblk_request_done,
482}; 483};
483 484
484static struct blk_mq_reg virtio_mq_reg = { 485static struct blk_mq_reg virtio_mq_reg = {
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 4b97b86da926..64c60edcdfbc 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -299,7 +299,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
299 BUG_ON(num != 0); 299 BUG_ON(num != 0);
300} 300}
301 301
302static void unmap_purged_grants(struct work_struct *work) 302void xen_blkbk_unmap_purged_grants(struct work_struct *work)
303{ 303{
304 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 304 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
305 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 305 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -375,7 +375,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
375 375
376 pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean); 376 pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
377 377
378 INIT_LIST_HEAD(&blkif->persistent_purge_list); 378 BUG_ON(!list_empty(&blkif->persistent_purge_list));
379 root = &blkif->persistent_gnts; 379 root = &blkif->persistent_gnts;
380purge_list: 380purge_list:
381 foreach_grant_safe(persistent_gnt, n, root, node) { 381 foreach_grant_safe(persistent_gnt, n, root, node) {
@@ -420,7 +420,6 @@ finished:
420 blkif->vbd.overflow_max_grants = 0; 420 blkif->vbd.overflow_max_grants = 0;
421 421
422 /* We can defer this work */ 422 /* We can defer this work */
423 INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
424 schedule_work(&blkif->persistent_purge_work); 423 schedule_work(&blkif->persistent_purge_work);
425 pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total); 424 pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
426 return; 425 return;
@@ -625,9 +624,23 @@ purge_gnt_list:
625 print_stats(blkif); 624 print_stats(blkif);
626 } 625 }
627 626
628 /* Since we are shutting down remove all pages from the buffer */ 627 /* Drain pending purge work */
629 shrink_free_pagepool(blkif, 0 /* All */); 628 flush_work(&blkif->persistent_purge_work);
630 629
630 if (log_stats)
631 print_stats(blkif);
632
633 blkif->xenblkd = NULL;
634 xen_blkif_put(blkif);
635
636 return 0;
637}
638
639/*
640 * Remove persistent grants and empty the pool of free pages
641 */
642void xen_blkbk_free_caches(struct xen_blkif *blkif)
643{
631 /* Free all persistent grant pages */ 644 /* Free all persistent grant pages */
632 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) 645 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
633 free_persistent_gnts(blkif, &blkif->persistent_gnts, 646 free_persistent_gnts(blkif, &blkif->persistent_gnts,
@@ -636,13 +649,8 @@ purge_gnt_list:
636 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); 649 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
637 blkif->persistent_gnt_c = 0; 650 blkif->persistent_gnt_c = 0;
638 651
639 if (log_stats) 652 /* Since we are shutting down remove all pages from the buffer */
640 print_stats(blkif); 653 shrink_free_pagepool(blkif, 0 /* All */);
641
642 blkif->xenblkd = NULL;
643 xen_blkif_put(blkif);
644
645 return 0;
646} 654}
647 655
648/* 656/*
@@ -838,7 +846,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
838 struct grant_page **pages = pending_req->indirect_pages; 846 struct grant_page **pages = pending_req->indirect_pages;
839 struct xen_blkif *blkif = pending_req->blkif; 847 struct xen_blkif *blkif = pending_req->blkif;
840 int indirect_grefs, rc, n, nseg, i; 848 int indirect_grefs, rc, n, nseg, i;
841 struct blkif_request_segment_aligned *segments = NULL; 849 struct blkif_request_segment *segments = NULL;
842 850
843 nseg = pending_req->nr_pages; 851 nseg = pending_req->nr_pages;
844 indirect_grefs = INDIRECT_PAGES(nseg); 852 indirect_grefs = INDIRECT_PAGES(nseg);
@@ -934,9 +942,7 @@ static void xen_blk_drain_io(struct xen_blkif *blkif)
934{ 942{
935 atomic_set(&blkif->drain, 1); 943 atomic_set(&blkif->drain, 1);
936 do { 944 do {
937 /* The initial value is one, and one refcnt taken at the 945 if (atomic_read(&blkif->inflight) == 0)
938 * start of the xen_blkif_schedule thread. */
939 if (atomic_read(&blkif->refcnt) <= 2)
940 break; 946 break;
941 wait_for_completion_interruptible_timeout( 947 wait_for_completion_interruptible_timeout(
942 &blkif->drain_complete, HZ); 948 &blkif->drain_complete, HZ);
@@ -976,17 +982,30 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
976 * the proper response on the ring. 982 * the proper response on the ring.
977 */ 983 */
978 if (atomic_dec_and_test(&pending_req->pendcnt)) { 984 if (atomic_dec_and_test(&pending_req->pendcnt)) {
979 xen_blkbk_unmap(pending_req->blkif, 985 struct xen_blkif *blkif = pending_req->blkif;
986
987 xen_blkbk_unmap(blkif,
980 pending_req->segments, 988 pending_req->segments,
981 pending_req->nr_pages); 989 pending_req->nr_pages);
982 make_response(pending_req->blkif, pending_req->id, 990 make_response(blkif, pending_req->id,
983 pending_req->operation, pending_req->status); 991 pending_req->operation, pending_req->status);
984 xen_blkif_put(pending_req->blkif); 992 free_req(blkif, pending_req);
985 if (atomic_read(&pending_req->blkif->refcnt) <= 2) { 993 /*
986 if (atomic_read(&pending_req->blkif->drain)) 994 * Make sure the request is freed before releasing blkif,
987 complete(&pending_req->blkif->drain_complete); 995 * or there could be a race between free_req and the
996 * cleanup done in xen_blkif_free during shutdown.
997 *
998 * NB: The fact that we might try to wake up pending_free_wq
999 * before drain_complete (in case there's a drain going on)
1000 * it's not a problem with our current implementation
1001 * because we can assure there's no thread waiting on
1002 * pending_free_wq if there's a drain going on, but it has
1003 * to be taken into account if the current model is changed.
1004 */
1005 if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
1006 complete(&blkif->drain_complete);
988 } 1007 }
989 free_req(pending_req->blkif, pending_req); 1008 xen_blkif_put(blkif);
990 } 1009 }
991} 1010}
992 1011
@@ -1240,6 +1259,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1240 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. 1259 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
1241 */ 1260 */
1242 xen_blkif_get(blkif); 1261 xen_blkif_get(blkif);
1262 atomic_inc(&blkif->inflight);
1243 1263
1244 for (i = 0; i < nseg; i++) { 1264 for (i = 0; i < nseg; i++) {
1245 while ((bio == NULL) || 1265 while ((bio == NULL) ||
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 8d8807563d99..be052773ad03 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -57,7 +57,7 @@
57#define MAX_INDIRECT_SEGMENTS 256 57#define MAX_INDIRECT_SEGMENTS 256
58 58
59#define SEGS_PER_INDIRECT_FRAME \ 59#define SEGS_PER_INDIRECT_FRAME \
60 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) 60 (PAGE_SIZE/sizeof(struct blkif_request_segment))
61#define MAX_INDIRECT_PAGES \ 61#define MAX_INDIRECT_PAGES \
62 ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) 62 ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
63#define INDIRECT_PAGES(_segs) \ 63#define INDIRECT_PAGES(_segs) \
@@ -278,6 +278,7 @@ struct xen_blkif {
278 /* for barrier (drain) requests */ 278 /* for barrier (drain) requests */
279 struct completion drain_complete; 279 struct completion drain_complete;
280 atomic_t drain; 280 atomic_t drain;
281 atomic_t inflight;
281 /* One thread per one blkif. */ 282 /* One thread per one blkif. */
282 struct task_struct *xenblkd; 283 struct task_struct *xenblkd;
283 unsigned int waiting_reqs; 284 unsigned int waiting_reqs;
@@ -376,6 +377,7 @@ int xen_blkif_xenbus_init(void);
376irqreturn_t xen_blkif_be_int(int irq, void *dev_id); 377irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
377int xen_blkif_schedule(void *arg); 378int xen_blkif_schedule(void *arg);
378int xen_blkif_purge_persistent(void *arg); 379int xen_blkif_purge_persistent(void *arg);
380void xen_blkbk_free_caches(struct xen_blkif *blkif);
379 381
380int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 382int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
381 struct backend_info *be, int state); 383 struct backend_info *be, int state);
@@ -383,6 +385,7 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
383int xen_blkbk_barrier(struct xenbus_transaction xbt, 385int xen_blkbk_barrier(struct xenbus_transaction xbt,
384 struct backend_info *be, int state); 386 struct backend_info *be, int state);
385struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); 387struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
388void xen_blkbk_unmap_purged_grants(struct work_struct *work);
386 389
387static inline void blkif_get_x86_32_req(struct blkif_request *dst, 390static inline void blkif_get_x86_32_req(struct blkif_request *dst,
388 struct blkif_x86_32_request *src) 391 struct blkif_x86_32_request *src)
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index c2014a0aa206..9a547e6b6ebf 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -125,8 +125,11 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
125 blkif->persistent_gnts.rb_node = NULL; 125 blkif->persistent_gnts.rb_node = NULL;
126 spin_lock_init(&blkif->free_pages_lock); 126 spin_lock_init(&blkif->free_pages_lock);
127 INIT_LIST_HEAD(&blkif->free_pages); 127 INIT_LIST_HEAD(&blkif->free_pages);
128 INIT_LIST_HEAD(&blkif->persistent_purge_list);
128 blkif->free_pages_num = 0; 129 blkif->free_pages_num = 0;
129 atomic_set(&blkif->persistent_gnt_in_use, 0); 130 atomic_set(&blkif->persistent_gnt_in_use, 0);
131 atomic_set(&blkif->inflight, 0);
132 INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
130 133
131 INIT_LIST_HEAD(&blkif->pending_free); 134 INIT_LIST_HEAD(&blkif->pending_free);
132 135
@@ -259,6 +262,17 @@ static void xen_blkif_free(struct xen_blkif *blkif)
259 if (!atomic_dec_and_test(&blkif->refcnt)) 262 if (!atomic_dec_and_test(&blkif->refcnt))
260 BUG(); 263 BUG();
261 264
265 /* Remove all persistent grants and the cache of ballooned pages. */
266 xen_blkbk_free_caches(blkif);
267
268 /* Make sure everything is drained before shutting down */
269 BUG_ON(blkif->persistent_gnt_c != 0);
270 BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
271 BUG_ON(blkif->free_pages_num != 0);
272 BUG_ON(!list_empty(&blkif->persistent_purge_list));
273 BUG_ON(!list_empty(&blkif->free_pages));
274 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
275
262 /* Check that there is no request in use */ 276 /* Check that there is no request in use */
263 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { 277 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
264 list_del(&req->free_list); 278 list_del(&req->free_list);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8dcfb54f1603..efe1b4761735 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -162,7 +162,7 @@ static DEFINE_SPINLOCK(minor_lock);
162#define DEV_NAME "xvd" /* name in /dev */ 162#define DEV_NAME "xvd" /* name in /dev */
163 163
164#define SEGS_PER_INDIRECT_FRAME \ 164#define SEGS_PER_INDIRECT_FRAME \
165 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) 165 (PAGE_SIZE/sizeof(struct blkif_request_segment))
166#define INDIRECT_GREFS(_segs) \ 166#define INDIRECT_GREFS(_segs) \
167 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) 167 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
168 168
@@ -393,7 +393,7 @@ static int blkif_queue_request(struct request *req)
393 unsigned long id; 393 unsigned long id;
394 unsigned int fsect, lsect; 394 unsigned int fsect, lsect;
395 int i, ref, n; 395 int i, ref, n;
396 struct blkif_request_segment_aligned *segments = NULL; 396 struct blkif_request_segment *segments = NULL;
397 397
398 /* 398 /*
399 * Used to store if we are able to queue the request by just using 399 * Used to store if we are able to queue the request by just using
@@ -550,7 +550,7 @@ static int blkif_queue_request(struct request *req)
550 } else { 550 } else {
551 n = i % SEGS_PER_INDIRECT_FRAME; 551 n = i % SEGS_PER_INDIRECT_FRAME;
552 segments[n] = 552 segments[n] =
553 (struct blkif_request_segment_aligned) { 553 (struct blkif_request_segment) {
554 .gref = ref, 554 .gref = ref,
555 .first_sect = fsect, 555 .first_sect = fsect,
556 .last_sect = lsect }; 556 .last_sect = lsect };
@@ -1904,13 +1904,16 @@ static void blkback_changed(struct xenbus_device *dev,
1904 case XenbusStateReconfiguring: 1904 case XenbusStateReconfiguring:
1905 case XenbusStateReconfigured: 1905 case XenbusStateReconfigured:
1906 case XenbusStateUnknown: 1906 case XenbusStateUnknown:
1907 case XenbusStateClosed:
1908 break; 1907 break;
1909 1908
1910 case XenbusStateConnected: 1909 case XenbusStateConnected:
1911 blkfront_connect(info); 1910 blkfront_connect(info);
1912 break; 1911 break;
1913 1912
1913 case XenbusStateClosed:
1914 if (dev->state == XenbusStateClosed)
1915 break;
1916 /* Missed the backend's Closing state -- fallthrough */
1914 case XenbusStateClosing: 1917 case XenbusStateClosing:
1915 blkfront_closing(info); 1918 blkfront_closing(info);
1916 break; 1919 break;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0c707e4f4eaf..a4c7306ff43d 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -210,7 +210,9 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
210#define GC_MARK_RECLAIMABLE 0 210#define GC_MARK_RECLAIMABLE 0
211#define GC_MARK_DIRTY 1 211#define GC_MARK_DIRTY 1
212#define GC_MARK_METADATA 2 212#define GC_MARK_METADATA 2
213BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); 213#define GC_SECTORS_USED_SIZE 13
214#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE))
215BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
214BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); 216BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
215 217
216#include "journal.h" 218#include "journal.h"
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 4f6b5940e609..3f74b4b0747b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,7 +23,7 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
23 for (k = i->start; k < bset_bkey_last(i); k = next) { 23 for (k = i->start; k < bset_bkey_last(i); k = next) {
24 next = bkey_next(k); 24 next = bkey_next(k);
25 25
26 printk(KERN_ERR "block %u key %zi/%u: ", set, 26 printk(KERN_ERR "block %u key %li/%u: ", set,
27 (uint64_t *) k - i->d, i->keys); 27 (uint64_t *) k - i->d, i->keys);
28 28
29 if (b->ops->key_dump) 29 if (b->ops->key_dump)
@@ -1185,9 +1185,12 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
1185 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, 1185 struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO,
1186 order); 1186 order);
1187 if (!out) { 1187 if (!out) {
1188 struct page *outp;
1189
1188 BUG_ON(order > state->page_order); 1190 BUG_ON(order > state->page_order);
1189 1191
1190 out = page_address(mempool_alloc(state->pool, GFP_NOIO)); 1192 outp = mempool_alloc(state->pool, GFP_NOIO);
1193 out = page_address(outp);
1191 used_mempool = true; 1194 used_mempool = true;
1192 order = state->page_order; 1195 order = state->page_order;
1193 } 1196 }
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 98cc0a810a36..5f9c2a665ca5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1167,7 +1167,7 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1167 /* guard against overflow */ 1167 /* guard against overflow */
1168 SET_GC_SECTORS_USED(g, min_t(unsigned, 1168 SET_GC_SECTORS_USED(g, min_t(unsigned,
1169 GC_SECTORS_USED(g) + KEY_SIZE(k), 1169 GC_SECTORS_USED(g) + KEY_SIZE(k),
1170 (1 << 14) - 1)); 1170 MAX_GC_SECTORS_USED));
1171 1171
1172 BUG_ON(!GC_SECTORS_USED(g)); 1172 BUG_ON(!GC_SECTORS_USED(g));
1173 } 1173 }
@@ -1805,7 +1805,7 @@ static bool btree_insert_key(struct btree *b, struct bkey *k,
1805 1805
1806static size_t insert_u64s_remaining(struct btree *b) 1806static size_t insert_u64s_remaining(struct btree *b)
1807{ 1807{
1808 ssize_t ret = bch_btree_keys_u64s_remaining(&b->keys); 1808 long ret = bch_btree_keys_u64s_remaining(&b->keys);
1809 1809
1810 /* 1810 /*
1811 * Might land in the middle of an existing extent and have to split it 1811 * Might land in the middle of an existing extent and have to split it
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 72cd213f213f..5d5d031cf381 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -353,14 +353,14 @@ static void bch_data_insert_start(struct closure *cl)
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
-	if (op->bypass)
-		return bch_data_invalidate(cl);
-
 	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
 		set_gc_sectors(op->c);
 		wake_up_gc(op->c);
 	}
 
+	if (op->bypass)
+		return bch_data_invalidate(cl);
+
 	/*
 	 * Journal writes are marked REQ_FLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c6ab69333a6d..d8458d477a12 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -416,7 +416,7 @@ static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
 	return MAP_CONTINUE;
 }
 
-int bch_bset_print_stats(struct cache_set *c, char *buf)
+static int bch_bset_print_stats(struct cache_set *c, char *buf)
 {
 	struct bset_stats_op op;
 	int ret;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 0bad24ddc2e7..0129b78a6908 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
+static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
+{
+	if (bip->bip_slab == BIO_POOL_NONE)
+		return BIP_INLINE_VECS;
+
+	return bvec_nr_vecs(bip->bip_slab);
+}
+
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:	bio to update
@@ -129,7 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
+	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
@@ -226,7 +234,8 @@ unsigned int bio_integrity_tag_size(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_tag_size);
 
-int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
+			     int set)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
diff --git a/fs/bio.c b/fs/bio.c
index 75c49a382239..8754e7b6eb49 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -611,7 +611,6 @@ EXPORT_SYMBOL(bio_clone_fast);
 struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 			     struct bio_set *bs)
 {
-	unsigned nr_iovecs = 0;
 	struct bvec_iter iter;
 	struct bio_vec bv;
 	struct bio *bio;
@@ -638,10 +637,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	 * __bio_clone_fast() anyways.
 	 */
 
-	bio_for_each_segment(bv, bio_src, iter)
-		nr_iovecs++;
-
-	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, bs);
+	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
 	if (!bio)
 		return NULL;
 
@@ -650,9 +646,18 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
 
+	if (bio->bi_rw & REQ_DISCARD)
+		goto integrity_clone;
+
+	if (bio->bi_rw & REQ_WRITE_SAME) {
+		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
+		goto integrity_clone;
+	}
+
 	bio_for_each_segment(bv, bio_src, iter)
 		bio->bi_io_vec[bio->bi_vcnt++] = bv;
 
+integrity_clone:
 	if (bio_integrity(bio_src)) {
 		int ret;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 70654521dab6..5a4d39b4686b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -250,6 +250,17 @@ static inline unsigned bio_segments(struct bio *bio)
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
+	/*
+	 * We special case discard/write same, because they interpret bi_size
+	 * differently:
+	 */
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return 1;
+
+	if (bio->bi_rw & REQ_WRITE_SAME)
+		return 1;
+
 	bio_for_each_segment(bv, bio, iter)
 		segs++;
 
@@ -332,6 +343,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
 extern struct bio_set *fs_bio_set;
+unsigned int bio_integrity_tag_size(struct bio *bio);
 
 static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 161b23105b1e..18ba8a627f46 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -83,6 +83,8 @@ struct blk_mq_ops {
 	 */
 	rq_timed_out_fn		*timeout;
 
+	softirq_done_fn		*complete;
+
 	/*
 	 * Override for hctx allocations (should probably go)
 	 */
@@ -119,11 +121,12 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request_queue *, struct request *, bool);
+void blk_mq_insert_request(struct request_queue *, struct request *,
+		bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
 
@@ -133,6 +136,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
 void blk_mq_end_io(struct request *rq, int error);
 
+void blk_mq_complete_request(struct request *rq);
+
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
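
The new ->complete hook and blk_mq_complete_request() let a driver hand the bulk of its completion work off to the block layer's softirq/IPI machinery instead of doing it in hard-IRQ context. A minimal, hypothetical driver-side sketch (all mydrv_* names are invented; it assumes the 3.14-era prototypes shown above):

	#include <linux/blk-mq.h>

	/* Invoked by the block layer's completion path once the driver has
	 * handed the request over with blk_mq_complete_request(). */
	static void mydrv_complete_rq(struct request *rq)
	{
		blk_mq_end_io(rq, rq->errors);
	}

	/* Called from the driver's interrupt handler: do only the cheap part
	 * here and let the block layer schedule mydrv_complete_rq(). */
	static void mydrv_isr_done(struct request *rq)
	{
		blk_mq_complete_request(rq);
	}

	static struct blk_mq_ops mydrv_mq_ops = {
		/* .queue_rq, .map_queue, ... omitted in this sketch */
		.complete	= mydrv_complete_rq,
	};
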
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8678c4322b44..4afa4f8f6090 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -98,7 +98,7 @@ struct request {
 	struct list_head queuelist;
 	union {
 		struct call_single_data csd;
-		struct work_struct mq_flush_data;
+		struct work_struct mq_flush_work;
 	};
 
 	struct request_queue *q;
@@ -448,13 +448,8 @@ struct request_queue {
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
-	union {
-		struct request	flush_rq;
-		struct {
-			spinlock_t mq_flush_lock;
-			struct work_struct mq_flush_work;
-		};
-	};
+	struct request		*flush_rq;
+	spinlock_t		mq_flush_lock;
 
 	struct mutex		sysfs_lock;
 
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ae665ac59c36..32ec05a6572f 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -113,13 +113,13 @@ typedef uint64_t blkif_sector_t;
  * it's less than the number provided by the backend. The indirect_grefs field
  * in blkif_request_indirect should be filled by the frontend with the
  * grant references of the pages that are holding the indirect segments.
- * This pages are filled with an array of blkif_request_segment_aligned
- * that hold the information about the segments. The number of indirect
- * pages to use is determined by the maximum number of segments
- * a indirect request contains. Every indirect page can contain a maximum
- * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
- * so to calculate the number of indirect pages to use we have to do
- * ceil(indirect_segments/512).
+ * These pages are filled with an array of blkif_request_segment that hold the
+ * information about the segments. The number of indirect pages to use is
+ * determined by the number of segments an indirect request contains. Every
+ * indirect page can contain a maximum of
+ * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to
+ * calculate the number of indirect pages to use we have to do
+ * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
  *
  * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
  * create the "feature-max-indirect-segments" node!
@@ -135,13 +135,12 @@ typedef uint64_t blkif_sector_t;
 
 #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
 
-struct blkif_request_segment_aligned {
+struct blkif_request_segment {
 	grant_ref_t gref;        /* reference to I/O buffer frame        */
 	/* @first_sect: first sector in frame to transfer (inclusive).   */
 	/* @last_sect: last sector in frame to transfer (inclusive).     */
 	uint8_t     first_sect, last_sect;
-	uint16_t    _pad; /* padding to make it 8 bytes, so it's cache-aligned */
-} __attribute__((__packed__));
+};
 
 struct blkif_request_rw {
 	uint8_t        nr_segments;  /* number of segments                   */
@@ -151,12 +150,7 @@ struct blkif_request_rw {
 #endif
 	uint64_t       id;           /* private guest value, echoed in resp  */
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-	struct blkif_request_segment {
-		grant_ref_t gref;        /* reference to I/O buffer frame        */
-		/* @first_sect: first sector in frame to transfer (inclusive).   */
-		/* @last_sect: last sector in frame to transfer (inclusive).     */
-		uint8_t     first_sect, last_sect;
-	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 } __attribute__((__packed__));
 
 struct blkif_request_discard {
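
For the ceil() in the comment above, a short worked example (illustration only; it assumes 4 KiB pages and an 8-byte struct blkif_request_segment, which is what the old "512 segments per page" figure corresponded to):

	#include <stdio.h>

	#define PAGE_SIZE      4096u
	#define SEG_SIZE       8u                      /* assumed sizeof(struct blkif_request_segment) */
	#define SEGS_PER_PAGE  (PAGE_SIZE / SEG_SIZE)  /* 512 */

	int main(void)
	{
		unsigned indirect_segments = 1000;
		/* ceil(indirect_segments / SEGS_PER_PAGE) */
		unsigned pages = (indirect_segments + SEGS_PER_PAGE - 1) / SEGS_PER_PAGE;

		printf("%u segments -> %u indirect page(s)\n",
		       indirect_segments, pages);      /* prints 2 */
		return 0;
	}
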
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
index 7be235f1a70b..93d145e5539c 100644
--- a/lib/percpu_ida.c
+++ b/lib/percpu_ida.c
@@ -54,9 +54,7 @@ static inline void move_tags(unsigned *dst, unsigned *dst_nr,
 /*
  * Try to steal tags from a remote cpu's percpu freelist.
  *
- * We first check how many percpu freelists have tags - we don't steal tags
- * unless enough percpu freelists have tags on them that it's possible more than
- * half the total tags could be stuck on remote percpu freelists.
+ * We first check how many percpu freelists have tags
  *
  * Then we iterate through the cpus until we find some tags - we don't attempt
  * to find the "best" cpu to steal from, to keep cacheline bouncing to a
@@ -69,8 +67,7 @@ static inline void steal_tags(struct percpu_ida *pool,
 	struct percpu_ida_cpu *remote;
 
 	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
-	     cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
-	     cpus_have_tags--) {
+	     cpus_have_tags; cpus_have_tags--) {
 		cpu = cpumask_next(cpu, &pool->cpus_have_tags);
 
 		if (cpu >= nr_cpu_ids) {
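
The dropped condition meant steal_tags() gave up unless the tags cached on remote cpus could plausibly add up to more than half the pool; after the change it keeps scanning as long as any remote freelist holds tags. A small standalone illustration with made-up pool parameters (128 tags, at most 8 cached per cpu):

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned nr_tags = 128, percpu_max_size = 8;   /* hypothetical pool */

		for (unsigned cpus_have_tags = 1; cpus_have_tags <= 9; cpus_have_tags++) {
			bool old_keeps_stealing =
				cpus_have_tags * percpu_max_size > nr_tags / 2;
			bool new_keeps_stealing = cpus_have_tags > 0;

			printf("%u cpu(s) holding tags: old=%d new=%d\n",
			       cpus_have_tags, old_keeps_stealing, new_keeps_stealing);
		}
		/* The old loop condition only holds once 9 or more cpus hold tags,
		 * so an allocator could sleep while plenty of tags sat remote. */
		return 0;
	}
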