author    Linus Torvalds <torvalds@linux-foundation.org>  2014-06-02 12:29:34 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-06-02 12:29:34 -0400
commit    681a2895486243a82547d8c9f53043eb54b53da0 (patch)
tree      464273280aed6db55a99cc0d8614d4393f94fc48 /block
parent    6c52486dedbb30a1313da64945dcd686b4579c51 (diff)
parent    ed851860b4552fc8963ecf71eab9f6f7a5c19d74 (diff)
Merge branch 'for-3.16/core' of git://git.kernel.dk/linux-block into next
Pull block core updates from Jens Axboe:
 "It's a big(ish) round this time, lots of development effort has gone
  into blk-mq in the last 3 months.  Generally we're heading to where
  3.16 will be a feature complete and performant blk-mq.  scsi-mq is
  progressing nicely and will hopefully be in 3.17.  A nvme port is in
  progress, and the Micron pci-e flash driver, mtip32xx, is converted
  and will be sent in with the driver pull request for 3.16.

  This pull request contains:

   - Lots of prep and support patches for scsi-mq have been integrated.
     All from Christoph.

   - API and code cleanups for blk-mq from Christoph.

   - Lots of good corner case and error handling cleanup fixes for
     blk-mq from Ming Lei.

   - A slew of blk-mq updates from me:

     * Provide strict mappings so that the driver can rely on the CPU
       to queue mapping.  This enables optimizations in the driver.

     * Provided a bitmap tagging instead of percpu_ida, which never
       really worked well for blk-mq.  percpu_ida relies on the fact
       that we have a lot more tags available than we really need, it
       fails miserably for cases where we exhaust (or are close to
       exhausting) the tag space.

     * Provide sane support for shared tag maps, as utilized by scsi-mq.

     * Various fixes for IO timeouts.

     * API cleanups, and lots of perf tweaks and optimizations.

   - Remove 'buffer' from struct request.  This is ancient code, from
     when requests were always virtually mapped.  Kill it, to reclaim
     some space in struct request.  From me.

   - Remove 'magic' from blk_plug.  Since we store these on the stack
     and since we've never caught any actual bugs with this, let's just
     get rid of it.  From me.

   - Only call part_in_flight() once for IO completion, as it includes
     two atomic reads.  Hopefully we'll get a better implementation
     soon, as the part IO stats are now one of the more expensive parts
     of doing IO on blk-mq.  From me.

   - File migration of block code from {mm,fs}/ to block/.  This
     includes bio.c, bio-integrity.c, bounce.c, and ioprio.c.  From me,
     from a discussion on lkml.

  That should describe the meat of the pull request.  Also has various
  little fixes and cleanups from Dave Jones, Shaohua Li, Duan Jiong,
  Fengguang Wu, Fabian Frederick, Randy Dunlap, Robert Elliott, and Sam
  Bradshaw"

* 'for-3.16/core' of git://git.kernel.dk/linux-block: (100 commits)
  blk-mq: push IPI or local end_io decision to __blk_mq_complete_request()
  blk-mq: remember to start timeout handler for direct queue
  block: ensure that the timer is always added
  blk-mq: blk_mq_unregister_hctx() can be static
  blk-mq: make the sysfs mq/ layout reflect current mappings
  blk-mq: blk_mq_tag_to_rq should handle flush request
  block: remove dead code in scsi_ioctl:blk_verify_command
  blk-mq: request initialization optimizations
  block: add queue flag for disabling SG merging
  block: remove 'magic' from struct blk_plug
  blk-mq: remove alloc_hctx and free_hctx methods
  blk-mq: add file comments and update copyright notices
  blk-mq: remove blk_mq_alloc_request_pinned
  blk-mq: do not use blk_mq_alloc_request_pinned in blk_mq_map_request
  blk-mq: remove blk_mq_wait_for_tags
  blk-mq: initialize request in __blk_mq_alloc_request
  blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request
  blk-mq: add helper to insert requests from irq context
  blk-mq: remove stale comment for blk_mq_complete_request()
  blk-mq: allow non-softirq completions
  ...
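For orientation, a minimal sketch of the driver-facing side of the shared tag-map work described above; the ops structure, command type, queue depth and pool sizes are hypothetical placeholders, and the real conversions live in drivers such as null_blk and mtip32xx. This is a sketch assuming the 3.16-era blk-mq API, not part of this merge:

	#include <linux/blk-mq.h>

	struct my_cmd { int tag; };			/* hypothetical per-request payload */
	static struct blk_mq_ops my_mq_ops;		/* .queue_rq etc. supplied by a real driver */

	static struct blk_mq_tag_set my_tag_set = {
		.ops		= &my_mq_ops,
		.nr_hw_queues	= 1,
		.queue_depth	= 64,			/* served by the new bitmap tags */
		.numa_node	= NUMA_NO_NODE,
		.cmd_size	= sizeof(struct my_cmd),
		.flags		= BLK_MQ_F_SHOULD_MERGE,
	};

	static int my_driver_init(void)
	{
		struct request_queue *q;
		int ret;

		ret = blk_mq_alloc_tag_set(&my_tag_set);
		if (ret)
			return ret;

		/* queues created from the same tag_set share its tag map */
		q = blk_mq_init_queue(&my_tag_set);
		if (IS_ERR(q)) {
			blk_mq_free_tag_set(&my_tag_set);
			return PTR_ERR(q);
		}
		return 0;
	}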
Diffstat (limited to 'block')
-rw-r--r--  block/Makefile         |    7
-rw-r--r--  block/bio-integrity.c  |  657
-rw-r--r--  block/bio.c            | 2038
-rw-r--r--  block/blk-core.c       |  113
-rw-r--r--  block/blk-flush.c      |   40
-rw-r--r--  block/blk-iopoll.c     |    4
-rw-r--r--  block/blk-lib.c        |    4
-rw-r--r--  block/blk-map.c        |    3
-rw-r--r--  block/blk-merge.c      |   28
-rw-r--r--  block/blk-mq-cpu.c     |   17
-rw-r--r--  block/blk-mq-cpumap.c  |   27
-rw-r--r--  block/blk-mq-sysfs.c   |  160
-rw-r--r--  block/blk-mq-tag.c     |  561
-rw-r--r--  block/blk-mq-tag.h     |   71
-rw-r--r--  block/blk-mq.c         | 1415
-rw-r--r--  block/blk-mq.h         |   32
-rw-r--r--  block/blk-sysfs.c      |   47
-rw-r--r--  block/blk-throttle.c   |   10
-rw-r--r--  block/blk-timeout.c    |   60
-rw-r--r--  block/blk.h            |    9
-rw-r--r--  block/bounce.c         |  287
-rw-r--r--  block/cfq-iosched.c    |    4
-rw-r--r--  block/ioprio.c         |  241
-rw-r--r--  block/scsi_ioctl.c     |    4
24 files changed, 5083 insertions, 756 deletions
diff --git a/block/Makefile b/block/Makefile
index 20645e88fb57..a2ce6ac935ec 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,13 +2,15 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
-			genhd.o scsi_ioctl.o partition-generic.o partitions/
+			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
+			partitions/
 
+obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
@@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
new file mode 100644
index 000000000000..9e241063a616
--- /dev/null
+++ b/block/bio-integrity.c
@@ -0,0 +1,657 @@
1/*
2 * bio-integrity.c - bio data integrity extensions
3 *
4 * Copyright (C) 2007, 2008, 2009 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version
9 * 2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
19 * USA.
20 *
21 */
22
23#include <linux/blkdev.h>
24#include <linux/mempool.h>
25#include <linux/export.h>
26#include <linux/bio.h>
27#include <linux/workqueue.h>
28#include <linux/slab.h>
29
30#define BIP_INLINE_VECS 4
31
32static struct kmem_cache *bip_slab;
33static struct workqueue_struct *kintegrityd_wq;
34
35/**
36 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
37 * @bio: bio to attach integrity metadata to
38 * @gfp_mask: Memory allocation mask
39 * @nr_vecs: Number of integrity metadata scatter-gather elements
40 *
41 * Description: This function prepares a bio for attaching integrity
42 * metadata. nr_vecs specifies the maximum number of pages containing
43 * integrity metadata that can be attached.
44 */
45struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
46 gfp_t gfp_mask,
47 unsigned int nr_vecs)
48{
49 struct bio_integrity_payload *bip;
50 struct bio_set *bs = bio->bi_pool;
51 unsigned long idx = BIO_POOL_NONE;
52 unsigned inline_vecs;
53
54 if (!bs) {
55 bip = kmalloc(sizeof(struct bio_integrity_payload) +
56 sizeof(struct bio_vec) * nr_vecs, gfp_mask);
57 inline_vecs = nr_vecs;
58 } else {
59 bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
60 inline_vecs = BIP_INLINE_VECS;
61 }
62
63 if (unlikely(!bip))
64 return NULL;
65
66 memset(bip, 0, sizeof(*bip));
67
68 if (nr_vecs > inline_vecs) {
69 bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
70 bs->bvec_integrity_pool);
71 if (!bip->bip_vec)
72 goto err;
73 } else {
74 bip->bip_vec = bip->bip_inline_vecs;
75 }
76
77 bip->bip_slab = idx;
78 bip->bip_bio = bio;
79 bio->bi_integrity = bip;
80
81 return bip;
82err:
83 mempool_free(bip, bs->bio_integrity_pool);
84 return NULL;
85}
86EXPORT_SYMBOL(bio_integrity_alloc);
87
88/**
89 * bio_integrity_free - Free bio integrity payload
90 * @bio: bio containing bip to be freed
91 *
92 * Description: Used to free the integrity portion of a bio. Usually
93 * called from bio_free().
94 */
95void bio_integrity_free(struct bio *bio)
96{
97 struct bio_integrity_payload *bip = bio->bi_integrity;
98 struct bio_set *bs = bio->bi_pool;
99
100 if (bip->bip_owns_buf)
101 kfree(bip->bip_buf);
102
103 if (bs) {
104 if (bip->bip_slab != BIO_POOL_NONE)
105 bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
106 bip->bip_slab);
107
108 mempool_free(bip, bs->bio_integrity_pool);
109 } else {
110 kfree(bip);
111 }
112
113 bio->bi_integrity = NULL;
114}
115EXPORT_SYMBOL(bio_integrity_free);
116
117static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
118{
119 if (bip->bip_slab == BIO_POOL_NONE)
120 return BIP_INLINE_VECS;
121
122 return bvec_nr_vecs(bip->bip_slab);
123}
124
125/**
126 * bio_integrity_add_page - Attach integrity metadata
127 * @bio: bio to update
128 * @page: page containing integrity metadata
129 * @len: number of bytes of integrity metadata in page
130 * @offset: start offset within page
131 *
132 * Description: Attach a page containing integrity metadata to bio.
133 */
134int bio_integrity_add_page(struct bio *bio, struct page *page,
135 unsigned int len, unsigned int offset)
136{
137 struct bio_integrity_payload *bip = bio->bi_integrity;
138 struct bio_vec *iv;
139
140 if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
141 printk(KERN_ERR "%s: bip_vec full\n", __func__);
142 return 0;
143 }
144
145 iv = bip->bip_vec + bip->bip_vcnt;
146
147 iv->bv_page = page;
148 iv->bv_len = len;
149 iv->bv_offset = offset;
150 bip->bip_vcnt++;
151
152 return len;
153}
154EXPORT_SYMBOL(bio_integrity_add_page);
155
156static int bdev_integrity_enabled(struct block_device *bdev, int rw)
157{
158 struct blk_integrity *bi = bdev_get_integrity(bdev);
159
160 if (bi == NULL)
161 return 0;
162
163 if (rw == READ && bi->verify_fn != NULL &&
164 (bi->flags & INTEGRITY_FLAG_READ))
165 return 1;
166
167 if (rw == WRITE && bi->generate_fn != NULL &&
168 (bi->flags & INTEGRITY_FLAG_WRITE))
169 return 1;
170
171 return 0;
172}
173
174/**
175 * bio_integrity_enabled - Check whether integrity can be passed
176 * @bio: bio to check
177 *
178 * Description: Determines whether bio_integrity_prep() can be called
179 * on this bio or not. bio data direction and target device must be
181 * set prior to calling. The function honors the write_generate and
181 * read_verify flags in sysfs.
182 */
183int bio_integrity_enabled(struct bio *bio)
184{
185 if (!bio_is_rw(bio))
186 return 0;
187
188 /* Already protected? */
189 if (bio_integrity(bio))
190 return 0;
191
192 return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
193}
194EXPORT_SYMBOL(bio_integrity_enabled);
195
196/**
197 * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
198 * @bi: blk_integrity profile for device
199 * @sectors: Number of 512 sectors to convert
200 *
201 * Description: The block layer calculates everything in 512 byte
202 * sectors but integrity metadata is done in terms of the hardware
203 * sector size of the storage device. Convert the block layer sectors
204 * to physical sectors.
205 */
206static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
207 unsigned int sectors)
208{
209 /* At this point there are only 512b or 4096b DIF/EPP devices */
210 if (bi->sector_size == 4096)
211 return sectors >>= 3;
212
213 return sectors;
214}
215
216static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
217 unsigned int sectors)
218{
219 return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
220}
221
222/**
223 * bio_integrity_tag_size - Retrieve integrity tag space
224 * @bio: bio to inspect
225 *
226 * Description: Returns the maximum number of tag bytes that can be
227 * attached to this bio. Filesystems can use this to determine how
228 * much metadata to attach to an I/O.
229 */
230unsigned int bio_integrity_tag_size(struct bio *bio)
231{
232 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
233
234 BUG_ON(bio->bi_iter.bi_size == 0);
235
236 return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size);
237}
238EXPORT_SYMBOL(bio_integrity_tag_size);
239
240static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
241 int set)
242{
243 struct bio_integrity_payload *bip = bio->bi_integrity;
244 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
245 unsigned int nr_sectors;
246
247 BUG_ON(bip->bip_buf == NULL);
248
249 if (bi->tag_size == 0)
250 return -1;
251
252 nr_sectors = bio_integrity_hw_sectors(bi,
253 DIV_ROUND_UP(len, bi->tag_size));
254
255 if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
256 printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
257 nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
258 return -1;
259 }
260
261 if (set)
262 bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
263 else
264 bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
265
266 return 0;
267}
268
269/**
270 * bio_integrity_set_tag - Attach a tag buffer to a bio
271 * @bio: bio to attach buffer to
272 * @tag_buf: Pointer to a buffer containing tag data
273 * @len: Length of the included buffer
274 *
275 * Description: Use this function to tag a bio by leveraging the extra
276 * space provided by devices formatted with integrity protection. The
277 * size of the integrity buffer must be <= to the size reported by
278 * bio_integrity_tag_size().
279 */
280int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
281{
282 BUG_ON(bio_data_dir(bio) != WRITE);
283
284 return bio_integrity_tag(bio, tag_buf, len, 1);
285}
286EXPORT_SYMBOL(bio_integrity_set_tag);
287
288/**
289 * bio_integrity_get_tag - Retrieve a tag buffer from a bio
290 * @bio: bio to retrieve buffer from
291 * @tag_buf: Pointer to a buffer for the tag data
292 * @len: Length of the target buffer
293 *
294 * Description: Use this function to retrieve the tag buffer from a
295 * completed I/O. The size of the integrity buffer must be <= to the
296 * size reported by bio_integrity_tag_size().
297 */
298int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
299{
300 BUG_ON(bio_data_dir(bio) != READ);
301
302 return bio_integrity_tag(bio, tag_buf, len, 0);
303}
304EXPORT_SYMBOL(bio_integrity_get_tag);
305
306/**
307 * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
308 * @bio: bio to generate/verify integrity metadata for
309 * @operate: operate number, 1 for generate, 0 for verify
310 */
311static int bio_integrity_generate_verify(struct bio *bio, int operate)
312{
313 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
314 struct blk_integrity_exchg bix;
315 struct bio_vec *bv;
316 sector_t sector;
317 unsigned int sectors, ret = 0, i;
318 void *prot_buf = bio->bi_integrity->bip_buf;
319
320 if (operate)
321 sector = bio->bi_iter.bi_sector;
322 else
323 sector = bio->bi_integrity->bip_iter.bi_sector;
324
325 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
326 bix.sector_size = bi->sector_size;
327
328 bio_for_each_segment_all(bv, bio, i) {
329 void *kaddr = kmap_atomic(bv->bv_page);
330 bix.data_buf = kaddr + bv->bv_offset;
331 bix.data_size = bv->bv_len;
332 bix.prot_buf = prot_buf;
333 bix.sector = sector;
334
335 if (operate)
336 bi->generate_fn(&bix);
337 else {
338 ret = bi->verify_fn(&bix);
339 if (ret) {
340 kunmap_atomic(kaddr);
341 return ret;
342 }
343 }
344
345 sectors = bv->bv_len / bi->sector_size;
346 sector += sectors;
347 prot_buf += sectors * bi->tuple_size;
348
349 kunmap_atomic(kaddr);
350 }
351 return ret;
352}
353
354/**
355 * bio_integrity_generate - Generate integrity metadata for a bio
356 * @bio: bio to generate integrity metadata for
357 *
358 * Description: Generates integrity metadata for a bio by calling the
359 * block device's generation callback function. The bio must have a
360 * bip attached with enough room to accommodate the generated
361 * integrity metadata.
362 */
363static void bio_integrity_generate(struct bio *bio)
364{
365 bio_integrity_generate_verify(bio, 1);
366}
367
368static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
369{
370 if (bi)
371 return bi->tuple_size;
372
373 return 0;
374}
375
376/**
377 * bio_integrity_prep - Prepare bio for integrity I/O
378 * @bio: bio to prepare
379 *
380 * Description: Allocates a buffer for integrity metadata, maps the
381 * pages and attaches them to a bio. The bio must have data
382 * direction, target device and start sector set prior to calling. In
383 * the WRITE case, integrity metadata will be generated using the
384 * block device's integrity function. In the READ case, the buffer
385 * will be prepared for DMA and a suitable end_io handler set up.
386 */
387int bio_integrity_prep(struct bio *bio)
388{
389 struct bio_integrity_payload *bip;
390 struct blk_integrity *bi;
391 struct request_queue *q;
392 void *buf;
393 unsigned long start, end;
394 unsigned int len, nr_pages;
395 unsigned int bytes, offset, i;
396 unsigned int sectors;
397
398 bi = bdev_get_integrity(bio->bi_bdev);
399 q = bdev_get_queue(bio->bi_bdev);
400 BUG_ON(bi == NULL);
401 BUG_ON(bio_integrity(bio));
402
403 sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
404
405 /* Allocate kernel buffer for protection data */
406 len = sectors * blk_integrity_tuple_size(bi);
407 buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
408 if (unlikely(buf == NULL)) {
409 printk(KERN_ERR "could not allocate integrity buffer\n");
410 return -ENOMEM;
411 }
412
413 end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
414 start = ((unsigned long) buf) >> PAGE_SHIFT;
415 nr_pages = end - start;
416
417 /* Allocate bio integrity payload and integrity vectors */
418 bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
419 if (unlikely(bip == NULL)) {
420 printk(KERN_ERR "could not allocate data integrity bioset\n");
421 kfree(buf);
422 return -EIO;
423 }
424
425 bip->bip_owns_buf = 1;
426 bip->bip_buf = buf;
427 bip->bip_iter.bi_size = len;
428 bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
429
430 /* Map it */
431 offset = offset_in_page(buf);
432 for (i = 0 ; i < nr_pages ; i++) {
433 int ret;
434 bytes = PAGE_SIZE - offset;
435
436 if (len <= 0)
437 break;
438
439 if (bytes > len)
440 bytes = len;
441
442 ret = bio_integrity_add_page(bio, virt_to_page(buf),
443 bytes, offset);
444
445 if (ret == 0)
446 return 0;
447
448 if (ret < bytes)
449 break;
450
451 buf += bytes;
452 len -= bytes;
453 offset = 0;
454 }
455
456 /* Install custom I/O completion handler if read verify is enabled */
457 if (bio_data_dir(bio) == READ) {
458 bip->bip_end_io = bio->bi_end_io;
459 bio->bi_end_io = bio_integrity_endio;
460 }
461
462 /* Auto-generate integrity metadata if this is a write */
463 if (bio_data_dir(bio) == WRITE)
464 bio_integrity_generate(bio);
465
466 return 0;
467}
468EXPORT_SYMBOL(bio_integrity_prep);
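A hedged sketch of the submitter's side of the two helpers above, mirroring what the generic submission path does; the surrounding function is hypothetical and error handling is trimmed:

	/* Sketch only: attach protection information before the bio reaches the driver. */
	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		/* protection buffer or bip allocation failed */
		bio_endio(bio, -EIO);
		return;
	}
	/* WRITEs now carry generated metadata; READs are verified later in bio_integrity_endio() */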
469
470/**
471 * bio_integrity_verify - Verify integrity metadata for a bio
472 * @bio: bio to verify
473 *
474 * Description: This function is called to verify the integrity of a
475 * bio. The data in the bio io_vec is compared to the integrity
476 * metadata returned by the HBA.
477 */
478static int bio_integrity_verify(struct bio *bio)
479{
480 return bio_integrity_generate_verify(bio, 0);
481}
482
483/**
484 * bio_integrity_verify_fn - Integrity I/O completion worker
485 * @work: Work struct stored in bio to be verified
486 *
487 * Description: This workqueue function is called to complete a READ
488 * request. The function verifies the transferred integrity metadata
489 * and then calls the original bio end_io function.
490 */
491static void bio_integrity_verify_fn(struct work_struct *work)
492{
493 struct bio_integrity_payload *bip =
494 container_of(work, struct bio_integrity_payload, bip_work);
495 struct bio *bio = bip->bip_bio;
496 int error;
497
498 error = bio_integrity_verify(bio);
499
500 /* Restore original bio completion handler */
501 bio->bi_end_io = bip->bip_end_io;
502 bio_endio_nodec(bio, error);
503}
504
505/**
506 * bio_integrity_endio - Integrity I/O completion function
507 * @bio: Protected bio
508 * @error: Pointer to errno
509 *
510 * Description: Completion for integrity I/O
511 *
512 * Normally I/O completion is done in interrupt context. However,
513 * verifying I/O integrity is a time-consuming task which must be run
514 * in process context. This function postpones completion
515 * accordingly.
516 */
517void bio_integrity_endio(struct bio *bio, int error)
518{
519 struct bio_integrity_payload *bip = bio->bi_integrity;
520
521 BUG_ON(bip->bip_bio != bio);
522
523 /* In case of an I/O error there is no point in verifying the
524 * integrity metadata. Restore original bio end_io handler
525 * and run it.
526 */
527 if (error) {
528 bio->bi_end_io = bip->bip_end_io;
529 bio_endio(bio, error);
530
531 return;
532 }
533
534 INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
535 queue_work(kintegrityd_wq, &bip->bip_work);
536}
537EXPORT_SYMBOL(bio_integrity_endio);
538
539/**
540 * bio_integrity_advance - Advance integrity vector
541 * @bio: bio whose integrity vector to update
542 * @bytes_done: number of data bytes that have been completed
543 *
544 * Description: This function calculates how many integrity bytes the
545 * number of completed data bytes correspond to and advances the
546 * integrity vector accordingly.
547 */
548void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
549{
550 struct bio_integrity_payload *bip = bio->bi_integrity;
551 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
552 unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
553
554 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
555}
556EXPORT_SYMBOL(bio_integrity_advance);
557
558/**
559 * bio_integrity_trim - Trim integrity vector
560 * @bio: bio whose integrity vector to update
561 * @offset: offset to first data sector
562 * @sectors: number of data sectors
563 *
564 * Description: Used to trim the integrity vector in a cloned bio.
565 * The ivec will be advanced corresponding to 'offset' data sectors
566 * and the length will be truncated corresponding to 'len' data
567 * sectors.
568 */
569void bio_integrity_trim(struct bio *bio, unsigned int offset,
570 unsigned int sectors)
571{
572 struct bio_integrity_payload *bip = bio->bi_integrity;
573 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
574
575 bio_integrity_advance(bio, offset << 9);
576 bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
577}
578EXPORT_SYMBOL(bio_integrity_trim);
579
580/**
581 * bio_integrity_clone - Callback for cloning bios with integrity metadata
582 * @bio: New bio
583 * @bio_src: Original bio
584 * @gfp_mask: Memory allocation mask
585 *
586 * Description: Called to allocate a bip when cloning a bio
587 */
588int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
589 gfp_t gfp_mask)
590{
591 struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
592 struct bio_integrity_payload *bip;
593
594 BUG_ON(bip_src == NULL);
595
596 bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
597
598 if (bip == NULL)
599 return -EIO;
600
601 memcpy(bip->bip_vec, bip_src->bip_vec,
602 bip_src->bip_vcnt * sizeof(struct bio_vec));
603
604 bip->bip_vcnt = bip_src->bip_vcnt;
605 bip->bip_iter = bip_src->bip_iter;
606
607 return 0;
608}
609EXPORT_SYMBOL(bio_integrity_clone);
610
611int bioset_integrity_create(struct bio_set *bs, int pool_size)
612{
613 if (bs->bio_integrity_pool)
614 return 0;
615
616 bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
617 if (!bs->bio_integrity_pool)
618 return -1;
619
620 bs->bvec_integrity_pool = biovec_create_pool(pool_size);
621 if (!bs->bvec_integrity_pool) {
622 mempool_destroy(bs->bio_integrity_pool);
623 return -1;
624 }
625
626 return 0;
627}
628EXPORT_SYMBOL(bioset_integrity_create);
629
630void bioset_integrity_free(struct bio_set *bs)
631{
632 if (bs->bio_integrity_pool)
633 mempool_destroy(bs->bio_integrity_pool);
634
635 if (bs->bvec_integrity_pool)
636 mempool_destroy(bs->bvec_integrity_pool);
637}
638EXPORT_SYMBOL(bioset_integrity_free);
639
640void __init bio_integrity_init(void)
641{
642 /*
643 * kintegrityd won't block much but may burn a lot of CPU cycles.
644 * Make it highpri CPU intensive wq with max concurrency of 1.
645 */
646 kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
647 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
648 if (!kintegrityd_wq)
649 panic("Failed to create kintegrityd\n");
650
651 bip_slab = kmem_cache_create("bio_integrity_payload",
652 sizeof(struct bio_integrity_payload) +
653 sizeof(struct bio_vec) * BIP_INLINE_VECS,
654 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
655 if (!bip_slab)
656 panic("Failed to create slab\n");
657}
diff --git a/block/bio.c b/block/bio.c
new file mode 100644
index 000000000000..96d28eee8a1e
--- /dev/null
+++ b/block/bio.c
@@ -0,0 +1,2038 @@
1/*
2 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
16 *
17 */
18#include <linux/mm.h>
19#include <linux/swap.h>
20#include <linux/bio.h>
21#include <linux/blkdev.h>
22#include <linux/uio.h>
23#include <linux/iocontext.h>
24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/kernel.h>
27#include <linux/export.h>
28#include <linux/mempool.h>
29#include <linux/workqueue.h>
30#include <linux/cgroup.h>
31#include <scsi/sg.h> /* for struct sg_iovec */
32
33#include <trace/events/block.h>
34
35/*
36 * Test patch to inline a certain number of bi_io_vec's inside the bio
37 * itself, to shrink a bio data allocation from two mempool calls to one
38 */
39#define BIO_INLINE_VECS 4
40
41/*
42 * if you change this list, also change bvec_alloc or things will
43 * break badly! cannot be bigger than what you can fit into an
44 * unsigned short
45 */
46#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
47static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
48 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
49};
50#undef BV
51
52/*
53 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
54 * IO code that does not need private memory pools.
55 */
56struct bio_set *fs_bio_set;
57EXPORT_SYMBOL(fs_bio_set);
58
59/*
60 * Our slab pool management
61 */
62struct bio_slab {
63 struct kmem_cache *slab;
64 unsigned int slab_ref;
65 unsigned int slab_size;
66 char name[8];
67};
68static DEFINE_MUTEX(bio_slab_lock);
69static struct bio_slab *bio_slabs;
70static unsigned int bio_slab_nr, bio_slab_max;
71
72static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
73{
74 unsigned int sz = sizeof(struct bio) + extra_size;
75 struct kmem_cache *slab = NULL;
76 struct bio_slab *bslab, *new_bio_slabs;
77 unsigned int new_bio_slab_max;
78 unsigned int i, entry = -1;
79
80 mutex_lock(&bio_slab_lock);
81
82 i = 0;
83 while (i < bio_slab_nr) {
84 bslab = &bio_slabs[i];
85
86 if (!bslab->slab && entry == -1)
87 entry = i;
88 else if (bslab->slab_size == sz) {
89 slab = bslab->slab;
90 bslab->slab_ref++;
91 break;
92 }
93 i++;
94 }
95
96 if (slab)
97 goto out_unlock;
98
99 if (bio_slab_nr == bio_slab_max && entry == -1) {
100 new_bio_slab_max = bio_slab_max << 1;
101 new_bio_slabs = krealloc(bio_slabs,
102 new_bio_slab_max * sizeof(struct bio_slab),
103 GFP_KERNEL);
104 if (!new_bio_slabs)
105 goto out_unlock;
106 bio_slab_max = new_bio_slab_max;
107 bio_slabs = new_bio_slabs;
108 }
109 if (entry == -1)
110 entry = bio_slab_nr++;
111
112 bslab = &bio_slabs[entry];
113
114 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
115 slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
116 if (!slab)
117 goto out_unlock;
118
119 bslab->slab = slab;
120 bslab->slab_ref = 1;
121 bslab->slab_size = sz;
122out_unlock:
123 mutex_unlock(&bio_slab_lock);
124 return slab;
125}
126
127static void bio_put_slab(struct bio_set *bs)
128{
129 struct bio_slab *bslab = NULL;
130 unsigned int i;
131
132 mutex_lock(&bio_slab_lock);
133
134 for (i = 0; i < bio_slab_nr; i++) {
135 if (bs->bio_slab == bio_slabs[i].slab) {
136 bslab = &bio_slabs[i];
137 break;
138 }
139 }
140
141 if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
142 goto out;
143
144 WARN_ON(!bslab->slab_ref);
145
146 if (--bslab->slab_ref)
147 goto out;
148
149 kmem_cache_destroy(bslab->slab);
150 bslab->slab = NULL;
151
152out:
153 mutex_unlock(&bio_slab_lock);
154}
155
156unsigned int bvec_nr_vecs(unsigned short idx)
157{
158 return bvec_slabs[idx].nr_vecs;
159}
160
161void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
162{
163 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
164
165 if (idx == BIOVEC_MAX_IDX)
166 mempool_free(bv, pool);
167 else {
168 struct biovec_slab *bvs = bvec_slabs + idx;
169
170 kmem_cache_free(bvs->slab, bv);
171 }
172}
173
174struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
175 mempool_t *pool)
176{
177 struct bio_vec *bvl;
178
179 /*
180 * see comment near bvec_array define!
181 */
182 switch (nr) {
183 case 1:
184 *idx = 0;
185 break;
186 case 2 ... 4:
187 *idx = 1;
188 break;
189 case 5 ... 16:
190 *idx = 2;
191 break;
192 case 17 ... 64:
193 *idx = 3;
194 break;
195 case 65 ... 128:
196 *idx = 4;
197 break;
198 case 129 ... BIO_MAX_PAGES:
199 *idx = 5;
200 break;
201 default:
202 return NULL;
203 }
204
205 /*
206 * idx now points to the pool we want to allocate from. only the
207 * 1-vec entry pool is mempool backed.
208 */
209 if (*idx == BIOVEC_MAX_IDX) {
210fallback:
211 bvl = mempool_alloc(pool, gfp_mask);
212 } else {
213 struct biovec_slab *bvs = bvec_slabs + *idx;
214 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
215
216 /*
217 * Make this allocation restricted and don't dump info on
218 * allocation failures, since we'll fallback to the mempool
219 * in case of failure.
220 */
221 __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
222
223 /*
224 * Try a slab allocation. If this fails and __GFP_WAIT
225 * is set, retry with the 1-entry mempool
226 */
227 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
228 if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
229 *idx = BIOVEC_MAX_IDX;
230 goto fallback;
231 }
232 }
233
234 return bvl;
235}
236
237static void __bio_free(struct bio *bio)
238{
239 bio_disassociate_task(bio);
240
241 if (bio_integrity(bio))
242 bio_integrity_free(bio);
243}
244
245static void bio_free(struct bio *bio)
246{
247 struct bio_set *bs = bio->bi_pool;
248 void *p;
249
250 __bio_free(bio);
251
252 if (bs) {
253 if (bio_flagged(bio, BIO_OWNS_VEC))
254 bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
255
256 /*
257 * If we have front padding, adjust the bio pointer before freeing
258 */
259 p = bio;
260 p -= bs->front_pad;
261
262 mempool_free(p, bs->bio_pool);
263 } else {
264 /* Bio was allocated by bio_kmalloc() */
265 kfree(bio);
266 }
267}
268
269void bio_init(struct bio *bio)
270{
271 memset(bio, 0, sizeof(*bio));
272 bio->bi_flags = 1 << BIO_UPTODATE;
273 atomic_set(&bio->bi_remaining, 1);
274 atomic_set(&bio->bi_cnt, 1);
275}
276EXPORT_SYMBOL(bio_init);
277
278/**
279 * bio_reset - reinitialize a bio
280 * @bio: bio to reset
281 *
282 * Description:
283 * After calling bio_reset(), @bio will be in the same state as a freshly
284 * allocated bio returned by bio_alloc_bioset() - the only fields that are
285 * preserved are the ones that are initialized by bio_alloc_bioset(). See
286 * comment in struct bio.
287 */
288void bio_reset(struct bio *bio)
289{
290 unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
291
292 __bio_free(bio);
293
294 memset(bio, 0, BIO_RESET_BYTES);
295 bio->bi_flags = flags|(1 << BIO_UPTODATE);
296 atomic_set(&bio->bi_remaining, 1);
297}
298EXPORT_SYMBOL(bio_reset);
299
300static void bio_chain_endio(struct bio *bio, int error)
301{
302 bio_endio(bio->bi_private, error);
303 bio_put(bio);
304}
305
306/**
307 * bio_chain - chain bio completions
308 * @bio: the target bio
309 * @parent: the @bio's parent bio
310 *
311 * The caller won't have a bi_end_io called when @bio completes - instead,
312 * @parent's bi_end_io won't be called until both @parent and @bio have
313 * completed; the chained bio will also be freed when it completes.
314 *
315 * The caller must not set bi_private or bi_end_io in @bio.
316 */
317void bio_chain(struct bio *bio, struct bio *parent)
318{
319 BUG_ON(bio->bi_private || bio->bi_end_io);
320
321 bio->bi_private = parent;
322 bio->bi_end_io = bio_chain_endio;
323 atomic_inc(&parent->bi_remaining);
324}
325EXPORT_SYMBOL(bio_chain);
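A hedged usage sketch, not part of this file: a driver splitting an I/O can clone a front piece, chain it to the original, and submit both, so the original bio's completion waits for both halves. front_bytes, the use of fs_bio_set and the missing NULL check are assumptions of the sketch:

	struct bio *split = bio_clone_fast(bio, GFP_NOIO, fs_bio_set);

	split->bi_iter.bi_size = front_bytes;	/* clone covers only the front piece */
	bio_chain(split, bio);			/* bio's bi_end_io now also waits for split */
	bio_advance(bio, front_bytes);		/* original describes the remainder */

	generic_make_request(split);
	generic_make_request(bio);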
326
327static void bio_alloc_rescue(struct work_struct *work)
328{
329 struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
330 struct bio *bio;
331
332 while (1) {
333 spin_lock(&bs->rescue_lock);
334 bio = bio_list_pop(&bs->rescue_list);
335 spin_unlock(&bs->rescue_lock);
336
337 if (!bio)
338 break;
339
340 generic_make_request(bio);
341 }
342}
343
344static void punt_bios_to_rescuer(struct bio_set *bs)
345{
346 struct bio_list punt, nopunt;
347 struct bio *bio;
348
349 /*
350 * In order to guarantee forward progress we must punt only bios that
351 * were allocated from this bio_set; otherwise, if there was a bio on
352 * there for a stacking driver higher up in the stack, processing it
353 * could require allocating bios from this bio_set, and doing that from
354 * our own rescuer would be bad.
355 *
356 * Since bio lists are singly linked, pop them all instead of trying to
357 * remove from the middle of the list:
358 */
359
360 bio_list_init(&punt);
361 bio_list_init(&nopunt);
362
363 while ((bio = bio_list_pop(current->bio_list)))
364 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
365
366 *current->bio_list = nopunt;
367
368 spin_lock(&bs->rescue_lock);
369 bio_list_merge(&bs->rescue_list, &punt);
370 spin_unlock(&bs->rescue_lock);
371
372 queue_work(bs->rescue_workqueue, &bs->rescue_work);
373}
374
375/**
376 * bio_alloc_bioset - allocate a bio for I/O
377 * @gfp_mask: the GFP_ mask given to the slab allocator
378 * @nr_iovecs: number of iovecs to pre-allocate
379 * @bs: the bio_set to allocate from.
380 *
381 * Description:
382 * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
383 * backed by the @bs's mempool.
384 *
385 * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
386 * able to allocate a bio. This is due to the mempool guarantees. To make this
387 * work, callers must never allocate more than 1 bio at a time from this pool.
388 * Callers that need to allocate more than 1 bio must always submit the
389 * previously allocated bio for IO before attempting to allocate a new one.
390 * Failure to do so can cause deadlocks under memory pressure.
391 *
392 * Note that when running under generic_make_request() (i.e. any block
393 * driver), bios are not submitted until after you return - see the code in
394 * generic_make_request() that converts recursion into iteration, to prevent
395 * stack overflows.
396 *
397 * This would normally mean allocating multiple bios under
398 * generic_make_request() would be susceptible to deadlocks, but we have
399 * deadlock avoidance code that resubmits any blocked bios from a rescuer
400 * thread.
401 *
402 * However, we do not guarantee forward progress for allocations from other
403 * mempools. Doing multiple allocations from the same mempool under
404 * generic_make_request() should be avoided - instead, use bio_set's front_pad
405 * for per bio allocations.
406 *
407 * RETURNS:
408 * Pointer to new bio on success, NULL on failure.
409 */
410struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
411{
412 gfp_t saved_gfp = gfp_mask;
413 unsigned front_pad;
414 unsigned inline_vecs;
415 unsigned long idx = BIO_POOL_NONE;
416 struct bio_vec *bvl = NULL;
417 struct bio *bio;
418 void *p;
419
420 if (!bs) {
421 if (nr_iovecs > UIO_MAXIOV)
422 return NULL;
423
424 p = kmalloc(sizeof(struct bio) +
425 nr_iovecs * sizeof(struct bio_vec),
426 gfp_mask);
427 front_pad = 0;
428 inline_vecs = nr_iovecs;
429 } else {
430 /*
431 * generic_make_request() converts recursion to iteration; this
432 * means if we're running beneath it, any bios we allocate and
433 * submit will not be submitted (and thus freed) until after we
434 * return.
435 *
436 * This exposes us to a potential deadlock if we allocate
437 * multiple bios from the same bio_set() while running
438 * underneath generic_make_request(). If we were to allocate
439 * multiple bios (say a stacking block driver that was splitting
440 * bios), we would deadlock if we exhausted the mempool's
441 * reserve.
442 *
443 * We solve this, and guarantee forward progress, with a rescuer
444 * workqueue per bio_set. If we go to allocate and there are
445 * bios on current->bio_list, we first try the allocation
446 * without __GFP_WAIT; if that fails, we punt those bios we
447 * would be blocking to the rescuer workqueue before we retry
448 * with the original gfp_flags.
449 */
450
451 if (current->bio_list && !bio_list_empty(current->bio_list))
452 gfp_mask &= ~__GFP_WAIT;
453
454 p = mempool_alloc(bs->bio_pool, gfp_mask);
455 if (!p && gfp_mask != saved_gfp) {
456 punt_bios_to_rescuer(bs);
457 gfp_mask = saved_gfp;
458 p = mempool_alloc(bs->bio_pool, gfp_mask);
459 }
460
461 front_pad = bs->front_pad;
462 inline_vecs = BIO_INLINE_VECS;
463 }
464
465 if (unlikely(!p))
466 return NULL;
467
468 bio = p + front_pad;
469 bio_init(bio);
470
471 if (nr_iovecs > inline_vecs) {
472 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
473 if (!bvl && gfp_mask != saved_gfp) {
474 punt_bios_to_rescuer(bs);
475 gfp_mask = saved_gfp;
476 bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
477 }
478
479 if (unlikely(!bvl))
480 goto err_free;
481
482 bio->bi_flags |= 1 << BIO_OWNS_VEC;
483 } else if (nr_iovecs) {
484 bvl = bio->bi_inline_vecs;
485 }
486
487 bio->bi_pool = bs;
488 bio->bi_flags |= idx << BIO_POOL_OFFSET;
489 bio->bi_max_vecs = nr_iovecs;
490 bio->bi_io_vec = bvl;
491 return bio;
492
493err_free:
494 mempool_free(p, bs->bio_pool);
495 return NULL;
496}
497EXPORT_SYMBOL(bio_alloc_bioset);
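A hedged sketch of the front_pad pattern recommended above: a stacking driver creates a private bio_set so its per-I/O state is allocated in front of each struct bio in a single mempool-backed allocation. struct my_io and the pool size of 4 are hypothetical:

	struct my_io {
		void		*driver_state;	/* per-I/O bookkeeping */
		struct bio	bio;		/* must be last: inline biovecs follow the bio */
	};

	/* at initialization time */
	struct bio_set *bs = bioset_create(4, offsetof(struct my_io, bio));

	/* per I/O: one allocation yields both structures */
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, 1, bs);
	struct my_io *io = container_of(bio, struct my_io, bio);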
498
499void zero_fill_bio(struct bio *bio)
500{
501 unsigned long flags;
502 struct bio_vec bv;
503 struct bvec_iter iter;
504
505 bio_for_each_segment(bv, bio, iter) {
506 char *data = bvec_kmap_irq(&bv, &flags);
507 memset(data, 0, bv.bv_len);
508 flush_dcache_page(bv.bv_page);
509 bvec_kunmap_irq(data, &flags);
510 }
511}
512EXPORT_SYMBOL(zero_fill_bio);
513
514/**
515 * bio_put - release a reference to a bio
516 * @bio: bio to release reference to
517 *
518 * Description:
519 * Put a reference to a &struct bio, either one you have gotten with
520 * bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
521 **/
522void bio_put(struct bio *bio)
523{
524 BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
525
526 /*
527 * last put frees it
528 */
529 if (atomic_dec_and_test(&bio->bi_cnt))
530 bio_free(bio);
531}
532EXPORT_SYMBOL(bio_put);
533
534inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
535{
536 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
537 blk_recount_segments(q, bio);
538
539 return bio->bi_phys_segments;
540}
541EXPORT_SYMBOL(bio_phys_segments);
542
543/**
544 * __bio_clone_fast - clone a bio that shares the original bio's biovec
545 * @bio: destination bio
546 * @bio_src: bio to clone
547 *
548 * Clone a &bio. Caller will own the returned bio, but not
549 * the actual data it points to. Reference count of returned
550 * bio will be one.
551 *
552 * Caller must ensure that @bio_src is not freed before @bio.
553 */
554void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
555{
556 BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);
557
558 /*
559 * most users will be overriding ->bi_bdev with a new target,
560 * so we don't set nor calculate new physical/hw segment counts here
561 */
562 bio->bi_bdev = bio_src->bi_bdev;
563 bio->bi_flags |= 1 << BIO_CLONED;
564 bio->bi_rw = bio_src->bi_rw;
565 bio->bi_iter = bio_src->bi_iter;
566 bio->bi_io_vec = bio_src->bi_io_vec;
567}
568EXPORT_SYMBOL(__bio_clone_fast);
569
570/**
571 * bio_clone_fast - clone a bio that shares the original bio's biovec
572 * @bio: bio to clone
573 * @gfp_mask: allocation priority
574 * @bs: bio_set to allocate from
575 *
576 * Like __bio_clone_fast, only also allocates the returned bio
577 */
578struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
579{
580 struct bio *b;
581
582 b = bio_alloc_bioset(gfp_mask, 0, bs);
583 if (!b)
584 return NULL;
585
586 __bio_clone_fast(b, bio);
587
588 if (bio_integrity(bio)) {
589 int ret;
590
591 ret = bio_integrity_clone(b, bio, gfp_mask);
592
593 if (ret < 0) {
594 bio_put(b);
595 return NULL;
596 }
597 }
598
599 return b;
600}
601EXPORT_SYMBOL(bio_clone_fast);
602
603/**
604 * bio_clone_bioset - clone a bio
605 * @bio_src: bio to clone
606 * @gfp_mask: allocation priority
607 * @bs: bio_set to allocate from
608 *
609 * Clone bio. Caller will own the returned bio, but not the actual data it
610 * points to. Reference count of returned bio will be one.
611 */
612struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
613 struct bio_set *bs)
614{
615 struct bvec_iter iter;
616 struct bio_vec bv;
617 struct bio *bio;
618
619 /*
620 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
621 * bio_src->bi_io_vec to bio->bi_io_vec.
622 *
623 * We can't do that anymore, because:
624 *
625 * - The point of cloning the biovec is to produce a bio with a biovec
626 * the caller can modify: bi_idx and bi_bvec_done should be 0.
627 *
628 * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
629 * we tried to clone the whole thing bio_alloc_bioset() would fail.
630 * But the clone should succeed as long as the number of biovecs we
631 * actually need to allocate is fewer than BIO_MAX_PAGES.
632 *
633 * - Lastly, bi_vcnt should not be looked at or relied upon by code
634 * that does not own the bio - reason being drivers don't use it for
635 * iterating over the biovec anymore, so expecting it to be kept up
636 * to date (i.e. for clones that share the parent biovec) is just
637 * asking for trouble and would force extra work on
638 * __bio_clone_fast() anyways.
639 */
640
641 bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
642 if (!bio)
643 return NULL;
644
645 bio->bi_bdev = bio_src->bi_bdev;
646 bio->bi_rw = bio_src->bi_rw;
647 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
648 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
649
650 if (bio->bi_rw & REQ_DISCARD)
651 goto integrity_clone;
652
653 if (bio->bi_rw & REQ_WRITE_SAME) {
654 bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
655 goto integrity_clone;
656 }
657
658 bio_for_each_segment(bv, bio_src, iter)
659 bio->bi_io_vec[bio->bi_vcnt++] = bv;
660
661integrity_clone:
662 if (bio_integrity(bio_src)) {
663 int ret;
664
665 ret = bio_integrity_clone(bio, bio_src, gfp_mask);
666 if (ret < 0) {
667 bio_put(bio);
668 return NULL;
669 }
670 }
671
672 return bio;
673}
674EXPORT_SYMBOL(bio_clone_bioset);
675
676/**
677 * bio_get_nr_vecs - return approx number of vecs
678 * @bdev: I/O target
679 *
680 * Return the approximate number of pages we can send to this target.
681 * There's no guarantee that you will be able to fit this number of pages
682 * into a bio, it does not account for dynamic restrictions that vary
683 * on offset.
684 */
685int bio_get_nr_vecs(struct block_device *bdev)
686{
687 struct request_queue *q = bdev_get_queue(bdev);
688 int nr_pages;
689
690 nr_pages = min_t(unsigned,
691 queue_max_segments(q),
692 queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
693
694 return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
695
696}
697EXPORT_SYMBOL(bio_get_nr_vecs);
698
699static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
700 *page, unsigned int len, unsigned int offset,
701 unsigned int max_sectors)
702{
703 int retried_segments = 0;
704 struct bio_vec *bvec;
705
706 /*
707 * cloned bio must not modify vec list
708 */
709 if (unlikely(bio_flagged(bio, BIO_CLONED)))
710 return 0;
711
712 if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
713 return 0;
714
715 /*
716 * For filesystems with a blocksize smaller than the pagesize
717 * we will often be called with the same page as last time and
718 * a consecutive offset. Optimize this special case.
719 */
720 if (bio->bi_vcnt > 0) {
721 struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
722
723 if (page == prev->bv_page &&
724 offset == prev->bv_offset + prev->bv_len) {
725 unsigned int prev_bv_len = prev->bv_len;
726 prev->bv_len += len;
727
728 if (q->merge_bvec_fn) {
729 struct bvec_merge_data bvm = {
730 /* prev_bvec is already charged in
731 bi_size, discharge it in order to
732 simulate merging updated prev_bvec
733 as new bvec. */
734 .bi_bdev = bio->bi_bdev,
735 .bi_sector = bio->bi_iter.bi_sector,
736 .bi_size = bio->bi_iter.bi_size -
737 prev_bv_len,
738 .bi_rw = bio->bi_rw,
739 };
740
741 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
742 prev->bv_len -= len;
743 return 0;
744 }
745 }
746
747 goto done;
748 }
749 }
750
751 if (bio->bi_vcnt >= bio->bi_max_vecs)
752 return 0;
753
754 /*
755 * we might lose a segment or two here, but rather that than
756 * make this too complex.
757 */
758
759 while (bio->bi_phys_segments >= queue_max_segments(q)) {
760
761 if (retried_segments)
762 return 0;
763
764 retried_segments = 1;
765 blk_recount_segments(q, bio);
766 }
767
768 /*
769 * setup the new entry, we might clear it again later if we
770 * cannot add the page
771 */
772 bvec = &bio->bi_io_vec[bio->bi_vcnt];
773 bvec->bv_page = page;
774 bvec->bv_len = len;
775 bvec->bv_offset = offset;
776
777 /*
778 * if queue has other restrictions (eg varying max sector size
779 * depending on offset), it can specify a merge_bvec_fn in the
780 * queue to get further control
781 */
782 if (q->merge_bvec_fn) {
783 struct bvec_merge_data bvm = {
784 .bi_bdev = bio->bi_bdev,
785 .bi_sector = bio->bi_iter.bi_sector,
786 .bi_size = bio->bi_iter.bi_size,
787 .bi_rw = bio->bi_rw,
788 };
789
790 /*
791 * merge_bvec_fn() returns number of bytes it can accept
792 * at this offset
793 */
794 if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
795 bvec->bv_page = NULL;
796 bvec->bv_len = 0;
797 bvec->bv_offset = 0;
798 return 0;
799 }
800 }
801
802 /* If we may be able to merge these biovecs, force a recount */
803 if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
804 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
805
806 bio->bi_vcnt++;
807 bio->bi_phys_segments++;
808 done:
809 bio->bi_iter.bi_size += len;
810 return len;
811}
812
813/**
814 * bio_add_pc_page - attempt to add page to bio
815 * @q: the target queue
816 * @bio: destination bio
817 * @page: page to add
818 * @len: vec entry length
819 * @offset: vec entry offset
820 *
821 * Attempt to add a page to the bio_vec maplist. This can fail for a
822 * number of reasons, such as the bio being full or target block device
823 * limitations. The target block device must allow bio's up to PAGE_SIZE,
824 * so it is always possible to add a single page to an empty bio.
825 *
826 * This should only be used by REQ_PC bios.
827 */
828int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
829 unsigned int len, unsigned int offset)
830{
831 return __bio_add_page(q, bio, page, len, offset,
832 queue_max_hw_sectors(q));
833}
834EXPORT_SYMBOL(bio_add_pc_page);
835
836/**
837 * bio_add_page - attempt to add page to bio
838 * @bio: destination bio
839 * @page: page to add
840 * @len: vec entry length
841 * @offset: vec entry offset
842 *
843 * Attempt to add a page to the bio_vec maplist. This can fail for a
844 * number of reasons, such as the bio being full or target block device
845 * limitations. The target block device must allow bio's up to PAGE_SIZE,
846 * so it is always possible to add a single page to an empty bio.
847 */
848int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
849 unsigned int offset)
850{
851 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
852 return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
853}
854EXPORT_SYMBOL(bio_add_page);
855
856struct submit_bio_ret {
857 struct completion event;
858 int error;
859};
860
861static void submit_bio_wait_endio(struct bio *bio, int error)
862{
863 struct submit_bio_ret *ret = bio->bi_private;
864
865 ret->error = error;
866 complete(&ret->event);
867}
868
869/**
870 * submit_bio_wait - submit a bio, and wait until it completes
871 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
872 * @bio: The &struct bio which describes the I/O
873 *
874 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
875 * bio_endio() on failure.
876 */
877int submit_bio_wait(int rw, struct bio *bio)
878{
879 struct submit_bio_ret ret;
880
881 rw |= REQ_SYNC;
882 init_completion(&ret.event);
883 bio->bi_private = &ret;
884 bio->bi_end_io = submit_bio_wait_endio;
885 submit_bio(rw, bio);
886 wait_for_completion(&ret.event);
887
888 return ret.error;
889}
890EXPORT_SYMBOL(submit_bio_wait);
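A hedged usage sketch: synchronously reading one page from a block device with the helpers in this file. bdev, sector and page are assumed to exist in the caller, and error handling is elided:

	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(READ, bio);	/* sleeps until bi_end_io fires */
	bio_put(bio);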
891
892/**
893 * bio_advance - increment/complete a bio by some number of bytes
894 * @bio: bio to advance
895 * @bytes: number of bytes to complete
896 *
897 * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
898 * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
899 * be updated on the last bvec as well.
900 *
901 * @bio will then represent the remaining, uncompleted portion of the io.
902 */
903void bio_advance(struct bio *bio, unsigned bytes)
904{
905 if (bio_integrity(bio))
906 bio_integrity_advance(bio, bytes);
907
908 bio_advance_iter(bio, &bio->bi_iter, bytes);
909}
910EXPORT_SYMBOL(bio_advance);
911
912/**
913 * bio_alloc_pages - allocates a single page for each bvec in a bio
914 * @bio: bio to allocate pages for
915 * @gfp_mask: flags for allocation
916 *
917 * Allocates pages up to @bio->bi_vcnt.
918 *
919 * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
920 * freed.
921 */
922int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
923{
924 int i;
925 struct bio_vec *bv;
926
927 bio_for_each_segment_all(bv, bio, i) {
928 bv->bv_page = alloc_page(gfp_mask);
929 if (!bv->bv_page) {
930 while (--bv >= bio->bi_io_vec)
931 __free_page(bv->bv_page);
932 return -ENOMEM;
933 }
934 }
935
936 return 0;
937}
938EXPORT_SYMBOL(bio_alloc_pages);
939
940/**
941 * bio_copy_data - copy contents of data buffers from one chain of bios to
942 * another
943 * @src: source bio list
944 * @dst: destination bio list
945 *
946 * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
947 * @src and @dst as linked lists of bios.
948 *
949 * Stops when it reaches the end of either @src or @dst - that is, copies
950 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
951 */
952void bio_copy_data(struct bio *dst, struct bio *src)
953{
954 struct bvec_iter src_iter, dst_iter;
955 struct bio_vec src_bv, dst_bv;
956 void *src_p, *dst_p;
957 unsigned bytes;
958
959 src_iter = src->bi_iter;
960 dst_iter = dst->bi_iter;
961
962 while (1) {
963 if (!src_iter.bi_size) {
964 src = src->bi_next;
965 if (!src)
966 break;
967
968 src_iter = src->bi_iter;
969 }
970
971 if (!dst_iter.bi_size) {
972 dst = dst->bi_next;
973 if (!dst)
974 break;
975
976 dst_iter = dst->bi_iter;
977 }
978
979 src_bv = bio_iter_iovec(src, src_iter);
980 dst_bv = bio_iter_iovec(dst, dst_iter);
981
982 bytes = min(src_bv.bv_len, dst_bv.bv_len);
983
984 src_p = kmap_atomic(src_bv.bv_page);
985 dst_p = kmap_atomic(dst_bv.bv_page);
986
987 memcpy(dst_p + dst_bv.bv_offset,
988 src_p + src_bv.bv_offset,
989 bytes);
990
991 kunmap_atomic(dst_p);
992 kunmap_atomic(src_p);
993
994 bio_advance_iter(src, &src_iter, bytes);
995 bio_advance_iter(dst, &dst_iter, bytes);
996 }
997}
998EXPORT_SYMBOL(bio_copy_data);
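A hedged usage sketch for bio_copy_data(): bounce the payload of a single source bio into a newly built copy backed by fresh pages. GFP_NOIO and the absence of allocation-failure handling are assumptions of the sketch:

	struct bio *copy = bio_alloc(GFP_NOIO, bio_segments(src));
	struct bvec_iter iter;
	struct bio_vec bv;

	copy->bi_bdev = src->bi_bdev;		/* bio_add_page() consults the queue limits */
	bio_for_each_segment(bv, src, iter)
		bio_add_page(copy, alloc_page(GFP_NOIO), bv.bv_len, 0);

	bio_copy_data(copy, src);		/* destination first, then source */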
999
1000struct bio_map_data {
1001 int nr_sgvecs;
1002 int is_our_pages;
1003 struct sg_iovec sgvecs[];
1004};
1005
1006static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
1007 const struct sg_iovec *iov, int iov_count,
1008 int is_our_pages)
1009{
1010 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
1011 bmd->nr_sgvecs = iov_count;
1012 bmd->is_our_pages = is_our_pages;
1013 bio->bi_private = bmd;
1014}
1015
1016static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
1017 gfp_t gfp_mask)
1018{
1019 if (iov_count > UIO_MAXIOV)
1020 return NULL;
1021
1022 return kmalloc(sizeof(struct bio_map_data) +
1023 sizeof(struct sg_iovec) * iov_count, gfp_mask);
1024}
1025
1026static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
1027 int to_user, int from_user, int do_free_page)
1028{
1029 int ret = 0, i;
1030 struct bio_vec *bvec;
1031 int iov_idx = 0;
1032 unsigned int iov_off = 0;
1033
1034 bio_for_each_segment_all(bvec, bio, i) {
1035 char *bv_addr = page_address(bvec->bv_page);
1036 unsigned int bv_len = bvec->bv_len;
1037
1038 while (bv_len && iov_idx < iov_count) {
1039 unsigned int bytes;
1040 char __user *iov_addr;
1041
1042 bytes = min_t(unsigned int,
1043 iov[iov_idx].iov_len - iov_off, bv_len);
1044 iov_addr = iov[iov_idx].iov_base + iov_off;
1045
1046 if (!ret) {
1047 if (to_user)
1048 ret = copy_to_user(iov_addr, bv_addr,
1049 bytes);
1050
1051 if (from_user)
1052 ret = copy_from_user(bv_addr, iov_addr,
1053 bytes);
1054
1055 if (ret)
1056 ret = -EFAULT;
1057 }
1058
1059 bv_len -= bytes;
1060 bv_addr += bytes;
1061 iov_addr += bytes;
1062 iov_off += bytes;
1063
1064 if (iov[iov_idx].iov_len == iov_off) {
1065 iov_idx++;
1066 iov_off = 0;
1067 }
1068 }
1069
1070 if (do_free_page)
1071 __free_page(bvec->bv_page);
1072 }
1073
1074 return ret;
1075}
1076
1077/**
1078 * bio_uncopy_user - finish previously mapped bio
1079 * @bio: bio being terminated
1080 *
1081 * Free pages allocated from bio_copy_user() and write back data
1082 * to user space in case of a read.
1083 */
1084int bio_uncopy_user(struct bio *bio)
1085{
1086 struct bio_map_data *bmd = bio->bi_private;
1087 struct bio_vec *bvec;
1088 int ret = 0, i;
1089
1090 if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
1091 /*
1092 * if we're in a workqueue, the request is orphaned, so
1093 * don't copy into a random user address space, just free.
1094 */
1095 if (current->mm)
1096 ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
1097 bio_data_dir(bio) == READ,
1098 0, bmd->is_our_pages);
1099 else if (bmd->is_our_pages)
1100 bio_for_each_segment_all(bvec, bio, i)
1101 __free_page(bvec->bv_page);
1102 }
1103 kfree(bmd);
1104 bio_put(bio);
1105 return ret;
1106}
1107EXPORT_SYMBOL(bio_uncopy_user);
1108
1109/**
1110 * bio_copy_user_iov - copy user data to bio
1111 * @q: destination block queue
1112 * @map_data: pointer to the rq_map_data holding pages (if necessary)
1113 * @iov: the iovec.
1114 * @iov_count: number of elements in the iovec
1115 * @write_to_vm: bool indicating whether data is written into the pages (a READ)
1116 * @gfp_mask: memory allocation flags
1117 *
1118 * Prepares and returns a bio for indirect user io, bouncing data
1119 * to/from kernel pages as necessary. Must be paired with a
1120 * call to bio_uncopy_user() on io completion.
1121 */
1122struct bio *bio_copy_user_iov(struct request_queue *q,
1123 struct rq_map_data *map_data,
1124 const struct sg_iovec *iov, int iov_count,
1125 int write_to_vm, gfp_t gfp_mask)
1126{
1127 struct bio_map_data *bmd;
1128 struct bio_vec *bvec;
1129 struct page *page;
1130 struct bio *bio;
1131 int i, ret;
1132 int nr_pages = 0;
1133 unsigned int len = 0;
1134 unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
1135
1136 for (i = 0; i < iov_count; i++) {
1137 unsigned long uaddr;
1138 unsigned long end;
1139 unsigned long start;
1140
1141 uaddr = (unsigned long)iov[i].iov_base;
1142 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1143 start = uaddr >> PAGE_SHIFT;
1144
1145 /*
1146 * Overflow, abort
1147 */
1148 if (end < start)
1149 return ERR_PTR(-EINVAL);
1150
1151 nr_pages += end - start;
1152 len += iov[i].iov_len;
1153 }
1154
1155 if (offset)
1156 nr_pages++;
1157
1158 bmd = bio_alloc_map_data(iov_count, gfp_mask);
1159 if (!bmd)
1160 return ERR_PTR(-ENOMEM);
1161
1162 ret = -ENOMEM;
1163 bio = bio_kmalloc(gfp_mask, nr_pages);
1164 if (!bio)
1165 goto out_bmd;
1166
1167 if (!write_to_vm)
1168 bio->bi_rw |= REQ_WRITE;
1169
1170 ret = 0;
1171
1172 if (map_data) {
1173 nr_pages = 1 << map_data->page_order;
1174 i = map_data->offset / PAGE_SIZE;
1175 }
1176 while (len) {
1177 unsigned int bytes = PAGE_SIZE;
1178
1179 bytes -= offset;
1180
1181 if (bytes > len)
1182 bytes = len;
1183
1184 if (map_data) {
1185 if (i == map_data->nr_entries * nr_pages) {
1186 ret = -ENOMEM;
1187 break;
1188 }
1189
1190 page = map_data->pages[i / nr_pages];
1191 page += (i % nr_pages);
1192
1193 i++;
1194 } else {
1195 page = alloc_page(q->bounce_gfp | gfp_mask);
1196 if (!page) {
1197 ret = -ENOMEM;
1198 break;
1199 }
1200 }
1201
1202 if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
1203 break;
1204
1205 len -= bytes;
1206 offset = 0;
1207 }
1208
1209 if (ret)
1210 goto cleanup;
1211
1212 /*
1213 * success
1214 */
1215 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
1216 (map_data && map_data->from_user)) {
1217 ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
1218 if (ret)
1219 goto cleanup;
1220 }
1221
1222 bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
1223 return bio;
1224cleanup:
1225 if (!map_data)
1226 bio_for_each_segment_all(bvec, bio, i)
1227 __free_page(bvec->bv_page);
1228
1229 bio_put(bio);
1230out_bmd:
1231 kfree(bmd);
1232 return ERR_PTR(ret);
1233}
1234
1235/**
1236 * bio_copy_user - copy user data to bio
1237 * @q: destination block queue
1238 * @map_data: pointer to the rq_map_data holding pages (if necessary)
1239 * @uaddr: start of user address
1240 * @len: length in bytes
1241 * @write_to_vm: bool indicating whether data is written into the pages (a READ)
1242 * @gfp_mask: memory allocation flags
1243 *
1244 * Prepares and returns a bio for indirect user io, bouncing data
1245 * to/from kernel pages as necessary. Must be paired with a
1246 * call to bio_uncopy_user() on io completion.
1247 */
1248struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
1249 unsigned long uaddr, unsigned int len,
1250 int write_to_vm, gfp_t gfp_mask)
1251{
1252 struct sg_iovec iov;
1253
1254 iov.iov_base = (void __user *)uaddr;
1255 iov.iov_len = len;
1256
1257 return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
1258}
1259EXPORT_SYMBOL(bio_copy_user);
1260
1261static struct bio *__bio_map_user_iov(struct request_queue *q,
1262 struct block_device *bdev,
1263 const struct sg_iovec *iov, int iov_count,
1264 int write_to_vm, gfp_t gfp_mask)
1265{
1266 int i, j;
1267 int nr_pages = 0;
1268 struct page **pages;
1269 struct bio *bio;
1270 int cur_page = 0;
1271 int ret, offset;
1272
1273 for (i = 0; i < iov_count; i++) {
1274 unsigned long uaddr = (unsigned long)iov[i].iov_base;
1275 unsigned long len = iov[i].iov_len;
1276 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1277 unsigned long start = uaddr >> PAGE_SHIFT;
1278
1279 /*
1280 * Overflow, abort
1281 */
1282 if (end < start)
1283 return ERR_PTR(-EINVAL);
1284
1285 nr_pages += end - start;
1286 /*
1287 * buffer must be aligned to at least hardsector size for now
1288 */
1289 if (uaddr & queue_dma_alignment(q))
1290 return ERR_PTR(-EINVAL);
1291 }
1292
1293 if (!nr_pages)
1294 return ERR_PTR(-EINVAL);
1295
1296 bio = bio_kmalloc(gfp_mask, nr_pages);
1297 if (!bio)
1298 return ERR_PTR(-ENOMEM);
1299
1300 ret = -ENOMEM;
1301 pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
1302 if (!pages)
1303 goto out;
1304
1305 for (i = 0; i < iov_count; i++) {
1306 unsigned long uaddr = (unsigned long)iov[i].iov_base;
1307 unsigned long len = iov[i].iov_len;
1308 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1309 unsigned long start = uaddr >> PAGE_SHIFT;
1310 const int local_nr_pages = end - start;
1311 const int page_limit = cur_page + local_nr_pages;
1312
1313 ret = get_user_pages_fast(uaddr, local_nr_pages,
1314 write_to_vm, &pages[cur_page]);
1315 if (ret < local_nr_pages) {
1316 ret = -EFAULT;
1317 goto out_unmap;
1318 }
1319
1320 offset = uaddr & ~PAGE_MASK;
1321 for (j = cur_page; j < page_limit; j++) {
1322 unsigned int bytes = PAGE_SIZE - offset;
1323
1324 if (len <= 0)
1325 break;
1326
1327 if (bytes > len)
1328 bytes = len;
1329
1330 /*
1331 * sorry...
1332 */
1333 if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
1334 bytes)
1335 break;
1336
1337 len -= bytes;
1338 offset = 0;
1339 }
1340
1341 cur_page = j;
1342 /*
1343 * release the pages we didn't map into the bio, if any
1344 */
1345 while (j < page_limit)
1346 page_cache_release(pages[j++]);
1347 }
1348
1349 kfree(pages);
1350
1351 /*
1352 * set data direction, and check if mapped pages need bouncing
1353 */
1354 if (!write_to_vm)
1355 bio->bi_rw |= REQ_WRITE;
1356
1357 bio->bi_bdev = bdev;
1358 bio->bi_flags |= (1 << BIO_USER_MAPPED);
1359 return bio;
1360
1361 out_unmap:
1362 for (i = 0; i < nr_pages; i++) {
1363 if(!pages[i])
1364 break;
1365 page_cache_release(pages[i]);
1366 }
1367 out:
1368 kfree(pages);
1369 bio_put(bio);
1370 return ERR_PTR(ret);
1371}
1372
1373/**
1374 * bio_map_user - map user address into bio
1375 * @q: the struct request_queue for the bio
1376 * @bdev: destination block device
1377 * @uaddr: start of user address
1378 * @len: length in bytes
1379 * @write_to_vm: bool indicating whether data is written into the pages (a READ)
1380 * @gfp_mask: memory allocation flags
1381 *
1382 * Map the user space address into a bio suitable for io to a block
1383 * device. Returns an error pointer in case of error.
1384 */
1385struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
1386 unsigned long uaddr, unsigned int len, int write_to_vm,
1387 gfp_t gfp_mask)
1388{
1389 struct sg_iovec iov;
1390
1391 iov.iov_base = (void __user *)uaddr;
1392 iov.iov_len = len;
1393
1394 return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
1395}
1396EXPORT_SYMBOL(bio_map_user);
1397
1398/**
1399 * bio_map_user_iov - map user sg_iovec table into bio
1400 * @q: the struct request_queue for the bio
1401 * @bdev: destination block device
1402 * @iov: the iovec.
1403 * @iov_count: number of elements in the iovec
1404 * @write_to_vm: bool indicating whether data is written into the pages (a READ)
1405 * @gfp_mask: memory allocation flags
1406 *
1407 * Map the user space address into a bio suitable for io to a block
1408 * device. Returns an error pointer in case of error.
1409 */
1410struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
1411 const struct sg_iovec *iov, int iov_count,
1412 int write_to_vm, gfp_t gfp_mask)
1413{
1414 struct bio *bio;
1415
1416 bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
1417 gfp_mask);
1418 if (IS_ERR(bio))
1419 return bio;
1420
1421 /*
1422 * subtle -- if __bio_map_user() ended up bouncing a bio,
1423 * it would normally disappear when its bi_end_io is run.
1424 * however, we need it for the unmap, so grab an extra
1425 * reference to it
1426 */
1427 bio_get(bio);
1428
1429 return bio;
1430}
1431
1432static void __bio_unmap_user(struct bio *bio)
1433{
1434 struct bio_vec *bvec;
1435 int i;
1436
1437 /*
1438 * make sure we dirty pages we wrote to
1439 */
1440 bio_for_each_segment_all(bvec, bio, i) {
1441 if (bio_data_dir(bio) == READ)
1442 set_page_dirty_lock(bvec->bv_page);
1443
1444 page_cache_release(bvec->bv_page);
1445 }
1446
1447 bio_put(bio);
1448}
1449
1450/**
1451 * bio_unmap_user - unmap a bio
1452 * @bio: the bio being unmapped
1453 *
1454 * Unmap a bio previously mapped by bio_map_user(). Must be called from
1455 * process context.
1456 *
1457 * bio_unmap_user() may sleep.
1458 */
1459void bio_unmap_user(struct bio *bio)
1460{
1461 __bio_unmap_user(bio);
1462 bio_put(bio);
1463}
1464EXPORT_SYMBOL(bio_unmap_user);
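An illustrative sketch (not part of the file): the map/copy helpers above are normally driven through the request-mapping code rather than called directly. Roughly modelled on block/blk-map.c, with the example_ names being hypothetical and alignment checks, request wiring and error handling simplified:

static struct bio *example_map_user_buf(struct request_queue *q,
					unsigned long uaddr, unsigned int len,
					int reading, gfp_t gfp)
{
	/* unaligned buffers must be bounced through kernel pages */
	if (uaddr & queue_dma_alignment(q))
		return bio_copy_user(q, NULL, uaddr, len, reading, gfp);

	/* aligned buffers can have their pages pinned and mapped directly */
	return bio_map_user(q, NULL, uaddr, len, reading, gfp);
}

static int example_unmap_user_buf(struct bio *bio)
{
	/* BIO_USER_MAPPED tells the two cases apart at teardown time */
	if (bio_flagged(bio, BIO_USER_MAPPED)) {
		bio_unmap_user(bio);
		return 0;
	}

	/* bounced: copy back to user space (for reads) and free the pages */
	return bio_uncopy_user(bio);
}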
1465
1466static void bio_map_kern_endio(struct bio *bio, int err)
1467{
1468 bio_put(bio);
1469}
1470
1471static struct bio *__bio_map_kern(struct request_queue *q, void *data,
1472 unsigned int len, gfp_t gfp_mask)
1473{
1474 unsigned long kaddr = (unsigned long)data;
1475 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1476 unsigned long start = kaddr >> PAGE_SHIFT;
1477 const int nr_pages = end - start;
1478 int offset, i;
1479 struct bio *bio;
1480
1481 bio = bio_kmalloc(gfp_mask, nr_pages);
1482 if (!bio)
1483 return ERR_PTR(-ENOMEM);
1484
1485 offset = offset_in_page(kaddr);
1486 for (i = 0; i < nr_pages; i++) {
1487 unsigned int bytes = PAGE_SIZE - offset;
1488
1489 if (len <= 0)
1490 break;
1491
1492 if (bytes > len)
1493 bytes = len;
1494
1495 if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
1496 offset) < bytes)
1497 break;
1498
1499 data += bytes;
1500 len -= bytes;
1501 offset = 0;
1502 }
1503
1504 bio->bi_end_io = bio_map_kern_endio;
1505 return bio;
1506}
1507
1508/**
1509 * bio_map_kern - map kernel address into bio
1510 * @q: the struct request_queue for the bio
1511 * @data: pointer to buffer to map
1512 * @len: length in bytes
1513 * @gfp_mask: allocation flags for bio allocation
1514 *
1515 * Map the kernel address into a bio suitable for io to a block
1516 * device. Returns an error pointer in case of error.
1517 */
1518struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1519 gfp_t gfp_mask)
1520{
1521 struct bio *bio;
1522
1523 bio = __bio_map_kern(q, data, len, gfp_mask);
1524 if (IS_ERR(bio))
1525 return bio;
1526
1527 if (bio->bi_iter.bi_size == len)
1528 return bio;
1529
1530 /*
1531 * Don't support partial mappings.
1532 */
1533 bio_put(bio);
1534 return ERR_PTR(-EINVAL);
1535}
1536EXPORT_SYMBOL(bio_map_kern);
1537
1538static void bio_copy_kern_endio(struct bio *bio, int err)
1539{
1540 struct bio_vec *bvec;
1541 const int read = bio_data_dir(bio) == READ;
1542 struct bio_map_data *bmd = bio->bi_private;
1543 int i;
1544 char *p = bmd->sgvecs[0].iov_base;
1545
1546 bio_for_each_segment_all(bvec, bio, i) {
1547 char *addr = page_address(bvec->bv_page);
1548
1549 if (read)
1550 memcpy(p, addr, bvec->bv_len);
1551
1552 __free_page(bvec->bv_page);
1553 p += bvec->bv_len;
1554 }
1555
1556 kfree(bmd);
1557 bio_put(bio);
1558}
1559
1560/**
1561 * bio_copy_kern - copy kernel address into bio
1562 * @q: the struct request_queue for the bio
1563 * @data: pointer to buffer to copy
1564 * @len: length in bytes
1565 * @gfp_mask: allocation flags for bio and page allocation
1566 * @reading: data direction is READ
1567 *
1568 * Copy the kernel address into a bio suitable for io to a block
1569 * device. Returns an error pointer in case of error.
1570 */
1571struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
1572 gfp_t gfp_mask, int reading)
1573{
1574 struct bio *bio;
1575 struct bio_vec *bvec;
1576 int i;
1577
1578 bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
1579 if (IS_ERR(bio))
1580 return bio;
1581
1582 if (!reading) {
1583 void *p = data;
1584
1585 bio_for_each_segment_all(bvec, bio, i) {
1586 char *addr = page_address(bvec->bv_page);
1587
1588 memcpy(addr, p, bvec->bv_len);
1589 p += bvec->bv_len;
1590 }
1591 }
1592
1593 bio->bi_end_io = bio_copy_kern_endio;
1594
1595 return bio;
1596}
1597EXPORT_SYMBOL(bio_copy_kern);
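For kernel buffers the choice between the two helpers above follows the same shape. A hedged sketch, loosely based on what blk_rq_map_kern() does; the example_ function is hypothetical, and direction flagging on the bio as well as length and queue-limit checks are omitted:

static struct bio *example_map_kernel_buf(struct request_queue *q, void *kbuf,
					  unsigned int len, int reading,
					  gfp_t gfp)
{
	unsigned long addr = (unsigned long) kbuf;

	/*
	 * Stack or misaligned buffers cannot be mapped in place; bounce
	 * them with bio_copy_kern(). Everything else can be mapped
	 * directly with bio_map_kern().
	 */
	if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf))
		return bio_copy_kern(q, kbuf, len, gfp, reading);

	return bio_map_kern(q, kbuf, len, gfp);
}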
1598
1599/*
1600 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
1601 * for performing direct-IO in BIOs.
1602 *
1603 * The problem is that we cannot run set_page_dirty() from interrupt context
1604 * because the required locks are not interrupt-safe. So what we can do is to
1605 * mark the pages dirty _before_ performing IO. And in interrupt context,
1606 * check that the pages are still dirty. If so, fine. If not, redirty them
1607 * in process context.
1608 *
1609 * We special-case compound pages here: normally this means reads into hugetlb
1610 * pages. The logic in here doesn't really work right for compound pages
1611 * because the VM does not uniformly chase down the head page in all cases.
1612 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
1613 * handle them at all. So we skip compound pages here at an early stage.
1614 *
1615 * Note that this code is very hard to test under normal circumstances because
1616 * direct-io pins the pages with get_user_pages(). This makes
1617 * is_page_cache_freeable return false, and the VM will not clean the pages.
1618 * But other code (e.g., flusher threads) could clean the pages if they are mapped
1619 * pagecache.
1620 *
1621 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
1622 * deferred bio dirtying paths.
1623 */
1624
1625/*
1626 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
1627 */
1628void bio_set_pages_dirty(struct bio *bio)
1629{
1630 struct bio_vec *bvec;
1631 int i;
1632
1633 bio_for_each_segment_all(bvec, bio, i) {
1634 struct page *page = bvec->bv_page;
1635
1636 if (page && !PageCompound(page))
1637 set_page_dirty_lock(page);
1638 }
1639}
1640
1641static void bio_release_pages(struct bio *bio)
1642{
1643 struct bio_vec *bvec;
1644 int i;
1645
1646 bio_for_each_segment_all(bvec, bio, i) {
1647 struct page *page = bvec->bv_page;
1648
1649 if (page)
1650 put_page(page);
1651 }
1652}
1653
1654/*
1655 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
1656 * If they are, then fine. If, however, some pages are clean then they must
1657 * have been written out during the direct-IO read. So we take another ref on
1658 * the BIO and the offending pages and re-dirty the pages in process context.
1659 *
1660 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
1661 * here on. It will run one page_cache_release() against each page and will
1662 * run one bio_put() against the BIO.
1663 */
1664
1665static void bio_dirty_fn(struct work_struct *work);
1666
1667static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
1668static DEFINE_SPINLOCK(bio_dirty_lock);
1669static struct bio *bio_dirty_list;
1670
1671/*
1672 * This runs in process context
1673 */
1674static void bio_dirty_fn(struct work_struct *work)
1675{
1676 unsigned long flags;
1677 struct bio *bio;
1678
1679 spin_lock_irqsave(&bio_dirty_lock, flags);
1680 bio = bio_dirty_list;
1681 bio_dirty_list = NULL;
1682 spin_unlock_irqrestore(&bio_dirty_lock, flags);
1683
1684 while (bio) {
1685 struct bio *next = bio->bi_private;
1686
1687 bio_set_pages_dirty(bio);
1688 bio_release_pages(bio);
1689 bio_put(bio);
1690 bio = next;
1691 }
1692}
1693
1694void bio_check_pages_dirty(struct bio *bio)
1695{
1696 struct bio_vec *bvec;
1697 int nr_clean_pages = 0;
1698 int i;
1699
1700 bio_for_each_segment_all(bvec, bio, i) {
1701 struct page *page = bvec->bv_page;
1702
1703 if (PageDirty(page) || PageCompound(page)) {
1704 page_cache_release(page);
1705 bvec->bv_page = NULL;
1706 } else {
1707 nr_clean_pages++;
1708 }
1709 }
1710
1711 if (nr_clean_pages) {
1712 unsigned long flags;
1713
1714 spin_lock_irqsave(&bio_dirty_lock, flags);
1715 bio->bi_private = bio_dirty_list;
1716 bio_dirty_list = bio;
1717 spin_unlock_irqrestore(&bio_dirty_lock, flags);
1718 schedule_work(&bio_dirty_work);
1719 } else {
1720 bio_put(bio);
1721 }
1722}
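A condensed sketch of the scheme described in the comment block above, loosely following fs/direct-io.c: pages are dirtied before submission, and the completion side either re-checks them (async read) or simply releases them. The example_ names are hypothetical and reference counting of the surrounding dio is left out:

static void example_dio_end_io(struct bio *bio, int error)
{
	if (bio_data_dir(bio) == READ) {
		/* redirties clean pages, possibly deferring to process context */
		bio_check_pages_dirty(bio);
	} else {
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i)
			page_cache_release(bvec->bv_page);
		bio_put(bio);
	}
}

static void example_dio_submit(struct bio *bio, int rw)
{
	bio->bi_end_io = example_dio_end_io;

	/* mark the pinned pages dirty up front, as explained above */
	if (rw == READ)
		bio_set_pages_dirty(bio);

	submit_bio(rw, bio);
}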
1723
1724#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1725void bio_flush_dcache_pages(struct bio *bi)
1726{
1727 struct bio_vec bvec;
1728 struct bvec_iter iter;
1729
1730 bio_for_each_segment(bvec, bi, iter)
1731 flush_dcache_page(bvec.bv_page);
1732}
1733EXPORT_SYMBOL(bio_flush_dcache_pages);
1734#endif
1735
1736/**
1737 * bio_endio - end I/O on a bio
1738 * @bio: bio
1739 * @error: error, if any
1740 *
1741 * Description:
1742 * bio_endio() will end I/O on the whole bio. bio_endio() is the
1743 * preferred way to end I/O on a bio, it takes care of clearing
1744 * BIO_UPTODATE on error. @error is 0 on success, and one of the
1745 * established -Exxxx (-EIO, for instance) error values in case
1746 * something went wrong. No one should call bi_end_io() directly on a
1747 * bio unless they own it and thus know that it has an end_io
1748 * function.
1749 **/
1750void bio_endio(struct bio *bio, int error)
1751{
1752 while (bio) {
1753 BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
1754
1755 if (error)
1756 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1757 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1758 error = -EIO;
1759
1760 if (!atomic_dec_and_test(&bio->bi_remaining))
1761 return;
1762
1763 /*
1764 * Need to have a real endio function for chained bios,
1765 * otherwise various corner cases will break (like stacking
1766 * block devices that save/restore bi_end_io) - however, we want
1767 * to avoid unbounded recursion and blowing the stack. Tail call
1768 * optimization would handle this, but compiling with frame
1769 * pointers also disables gcc's sibling call optimization.
1770 */
1771 if (bio->bi_end_io == bio_chain_endio) {
1772 struct bio *parent = bio->bi_private;
1773 bio_put(bio);
1774 bio = parent;
1775 } else {
1776 if (bio->bi_end_io)
1777 bio->bi_end_io(bio, error);
1778 bio = NULL;
1779 }
1780 }
1781}
1782EXPORT_SYMBOL(bio_endio);
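A minimal sketch of the "own the bio" case mentioned in the description: a stacking driver that clones a bio, hooks its own completion handler, and propagates the result to the original with bio_endio(). The example_ names are illustrative and error handling is trimmed:

static void example_clone_endio(struct bio *clone, int error)
{
	struct bio *orig = clone->bi_private;

	bio_put(clone);
	/* complete the original bio with the clone's status */
	bio_endio(orig, error);
}

static void example_remap_and_submit(struct bio *orig, struct block_device *bdev,
				     struct bio_set *bs)
{
	struct bio *clone = bio_clone_fast(orig, GFP_NOIO, bs);

	if (!clone) {
		bio_endio(orig, -ENOMEM);
		return;
	}

	clone->bi_bdev = bdev;
	clone->bi_private = orig;
	clone->bi_end_io = example_clone_endio;
	generic_make_request(clone);
}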
1783
1784/**
1785 * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
1786 * @bio: bio
1787 * @error: error, if any
1788 *
1789 * For code that has saved and restored bi_end_io; think hard before using this
1790 * function, probably you should've cloned the entire bio.
1791 **/
1792void bio_endio_nodec(struct bio *bio, int error)
1793{
1794 atomic_inc(&bio->bi_remaining);
1795 bio_endio(bio, error);
1796}
1797EXPORT_SYMBOL(bio_endio_nodec);
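A hedged sketch of the save/restore case the comment refers to; struct example_ctx and its fields are hypothetical, and as the comment says, cloning the bio is usually the better option. The point is that this handler already ran off a bio_endio() that consumed one bi_remaining reference, so it must not consume another:

struct example_ctx {
	bio_end_io_t	*saved_end_io;
	void		*saved_private;
};

static void example_hijacked_endio(struct bio *bio, int error)
{
	struct example_ctx *ctx = bio->bi_private;

	/* put back what was saved when the bio was hijacked */
	bio->bi_end_io = ctx->saved_end_io;
	bio->bi_private = ctx->saved_private;
	kfree(ctx);

	/* bi_remaining was already dropped once for this completion */
	bio_endio_nodec(bio, error);
}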
1798
1799/**
1800 * bio_split - split a bio
1801 * @bio: bio to split
1802 * @sectors: number of sectors to split from the front of @bio
1803 * @gfp: gfp mask
1804 * @bs: bio set to allocate from
1805 *
1806 * Allocates and returns a new bio which represents @sectors from the start of
1807 * @bio, and updates @bio to represent the remaining sectors.
1808 *
1809 * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
1810 * responsibility to ensure that @bio is not freed before the split.
1811 */
1812struct bio *bio_split(struct bio *bio, int sectors,
1813 gfp_t gfp, struct bio_set *bs)
1814{
1815 struct bio *split = NULL;
1816
1817 BUG_ON(sectors <= 0);
1818 BUG_ON(sectors >= bio_sectors(bio));
1819
1820 split = bio_clone_fast(bio, gfp, bs);
1821 if (!split)
1822 return NULL;
1823
1824 split->bi_iter.bi_size = sectors << 9;
1825
1826 if (bio_integrity(split))
1827 bio_integrity_trim(split, 0, sectors);
1828
1829 bio_advance(bio, split->bi_iter.bi_size);
1830
1831 return split;
1832}
1833EXPORT_SYMBOL(bio_split);
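A sketch of how a driver might use bio_split() together with bio_chain() to cap the size of the bios it processes. The example_ function and max_sectors are assumptions, and fs_bio_set is used purely for illustration:

static void example_make_request(struct bio *bio, unsigned int max_sectors)
{
	if (bio_sectors(bio) > max_sectors) {
		struct bio *split;

		split = bio_split(bio, max_sectors, GFP_NOIO, fs_bio_set);

		/* the remainder only completes once the split part has */
		bio_chain(split, bio);

		/* requeue the remainder and carry on with the front part */
		generic_make_request(bio);
		bio = split;
	}

	/* ... remap and submit 'bio', now at most max_sectors long ... */
}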
1834
1835/**
1836 * bio_trim - trim a bio
1837 * @bio: bio to trim
1838 * @offset: number of sectors to trim from the front of @bio
1839 * @size: size we want to trim @bio to, in sectors
1840 */
1841void bio_trim(struct bio *bio, int offset, int size)
1842{
1843 /* 'bio' is a cloned bio which we need to trim to match
1844 * the given offset and size.
1845 */
1846
1847 size <<= 9;
1848 if (offset == 0 && size == bio->bi_iter.bi_size)
1849 return;
1850
1851 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1852
1853 bio_advance(bio, offset << 9);
1854
1855 bio->bi_iter.bi_size = size;
1856}
1857EXPORT_SYMBOL_GPL(bio_trim);
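For illustration, the typical caller is a stacking driver that clones a bio and then narrows the clone to the region it cares about; the example_ helper and its sector arguments are hypothetical:

static struct bio *example_clone_region(struct bio *bio, int offset, int size,
					struct bio_set *bs)
{
	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, bs);

	if (!clone)
		return NULL;

	/* keep 'size' sectors starting 'offset' sectors into the clone */
	bio_trim(clone, offset, size);
	return clone;
}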
1858
1859/*
1860 * create memory pools for biovec's in a bio_set.
1861 * use the global biovec slabs created for general use.
1862 */
1863mempool_t *biovec_create_pool(int pool_entries)
1864{
1865 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1866
1867 return mempool_create_slab_pool(pool_entries, bp->slab);
1868}
1869
1870void bioset_free(struct bio_set *bs)
1871{
1872 if (bs->rescue_workqueue)
1873 destroy_workqueue(bs->rescue_workqueue);
1874
1875 if (bs->bio_pool)
1876 mempool_destroy(bs->bio_pool);
1877
1878 if (bs->bvec_pool)
1879 mempool_destroy(bs->bvec_pool);
1880
1881 bioset_integrity_free(bs);
1882 bio_put_slab(bs);
1883
1884 kfree(bs);
1885}
1886EXPORT_SYMBOL(bioset_free);
1887
1888/**
1889 * bioset_create - Create a bio_set
1890 * @pool_size: Number of bio and bio_vecs to cache in the mempool
1891 * @front_pad: Number of bytes to allocate in front of the returned bio
1892 *
1893 * Description:
1894 * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1895 * to ask for a number of bytes to be allocated in front of the bio.
1896 * Front pad allocation is useful for embedding the bio inside
1897 * another structure, to avoid allocating extra data to go with the bio.
1898 * Note that the bio must be embedded at the END of that structure always,
1899 * or things will break badly.
1900 */
1901struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1902{
1903 unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1904 struct bio_set *bs;
1905
1906 bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1907 if (!bs)
1908 return NULL;
1909
1910 bs->front_pad = front_pad;
1911
1912 spin_lock_init(&bs->rescue_lock);
1913 bio_list_init(&bs->rescue_list);
1914 INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
1915
1916 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1917 if (!bs->bio_slab) {
1918 kfree(bs);
1919 return NULL;
1920 }
1921
1922 bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
1923 if (!bs->bio_pool)
1924 goto bad;
1925
1926 bs->bvec_pool = biovec_create_pool(pool_size);
1927 if (!bs->bvec_pool)
1928 goto bad;
1929
1930 bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
1931 if (!bs->rescue_workqueue)
1932 goto bad;
1933
1934 return bs;
1935bad:
1936 bioset_free(bs);
1937 return NULL;
1938}
1939EXPORT_SYMBOL(bioset_create);
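A sketch of the front-pad embedding the description refers to. struct example_io and the pool size are assumptions; the point is that the bio sits at the end of the structure and front_pad covers everything before it, so container_of() recovers the wrapper from the bio:

struct example_io {
	void		*private;
	struct bio	bio;	/* must stay last, as noted above */
};

static struct bio_set *example_bs;

static int __init example_init(void)
{
	example_bs = bioset_create(64, offsetof(struct example_io, bio));
	return example_bs ? 0 : -ENOMEM;
}

static struct example_io *example_alloc_io(unsigned int nr_vecs)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, nr_vecs, example_bs);

	if (!bio)
		return NULL;

	/* front_pad guarantees the containing structure was allocated too */
	return container_of(bio, struct example_io, bio);
}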
1940
1941#ifdef CONFIG_BLK_CGROUP
1942/**
1943 * bio_associate_current - associate a bio with %current
1944 * @bio: target bio
1945 *
1946 * Associate @bio with %current if it hasn't been associated yet. Block
1947 * layer will treat @bio as if it were issued by %current no matter which
1948 * task actually issues it.
1949 *
1950 * This function takes an extra reference of @task's io_context and blkcg
1951 * which will be put when @bio is released. The caller must own @bio,
1952 * ensure %current->io_context exists, and is responsible for synchronizing
1953 * calls to this function.
1954 */
1955int bio_associate_current(struct bio *bio)
1956{
1957 struct io_context *ioc;
1958 struct cgroup_subsys_state *css;
1959
1960 if (bio->bi_ioc)
1961 return -EBUSY;
1962
1963 ioc = current->io_context;
1964 if (!ioc)
1965 return -ENOENT;
1966
1967 /* acquire active ref on @ioc and associate */
1968 get_io_context_active(ioc);
1969 bio->bi_ioc = ioc;
1970
1971 /* associate blkcg if exists */
1972 rcu_read_lock();
1973 css = task_css(current, blkio_cgrp_id);
1974 if (css && css_tryget(css))
1975 bio->bi_css = css;
1976 rcu_read_unlock();
1977
1978 return 0;
1979}
1980
1981/**
1982 * bio_disassociate_task - undo bio_associate_current()
1983 * @bio: target bio
1984 */
1985void bio_disassociate_task(struct bio *bio)
1986{
1987 if (bio->bi_ioc) {
1988 put_io_context(bio->bi_ioc);
1989 bio->bi_ioc = NULL;
1990 }
1991 if (bio->bi_css) {
1992 css_put(bio->bi_css);
1993 bio->bi_css = NULL;
1994 }
1995}
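A hedged sketch of when bio_associate_current() is useful: a driver that accepts bios in the caller's context but issues them later from a worker thread. The example_ function is hypothetical; a -ENOENT return simply means the task has no io_context to attach:

static void example_queue_bio(struct bio *bio)
{
	/*
	 * Record the submitting task's io_context and blkcg now, so that
	 * throttling and accounting still follow the originator when a
	 * worker issues the bio later. The references are dropped via
	 * bio_disassociate_task() when the bio is freed.
	 */
	bio_associate_current(bio);

	/* ... hand 'bio' off to a workqueue for actual submission ... */
}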
1996
1997#endif /* CONFIG_BLK_CGROUP */
1998
1999static void __init biovec_init_slabs(void)
2000{
2001 int i;
2002
2003 for (i = 0; i < BIOVEC_NR_POOLS; i++) {
2004 int size;
2005 struct biovec_slab *bvs = bvec_slabs + i;
2006
2007 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
2008 bvs->slab = NULL;
2009 continue;
2010 }
2011
2012 size = bvs->nr_vecs * sizeof(struct bio_vec);
2013 bvs->slab = kmem_cache_create(bvs->name, size, 0,
2014 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2015 }
2016}
2017
2018static int __init init_bio(void)
2019{
2020 bio_slab_max = 2;
2021 bio_slab_nr = 0;
2022 bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
2023 if (!bio_slabs)
2024 panic("bio: can't allocate bios\n");
2025
2026 bio_integrity_init();
2027 biovec_init_slabs();
2028
2029 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
2030 if (!fs_bio_set)
2031 panic("bio: can't allocate bios\n");
2032
2033 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
2034 panic("bio: can't create integrity pool\n");
2035
2036 return 0;
2037}
2038subsys_initcall(init_bio);
diff --git a/block/blk-core.c b/block/blk-core.c
index a0e3096c4bb5..40d654861c33 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
147 (unsigned long long)blk_rq_pos(rq), 147 (unsigned long long)blk_rq_pos(rq),
148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
149 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", 149 printk(KERN_INFO " bio %p, biotail %p, len %u\n",
150 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); 150 rq->bio, rq->biotail, blk_rq_bytes(rq));
151 151
152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
153 printk(KERN_INFO " cdb: "); 153 printk(KERN_INFO " cdb: ");
@@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q)
251 struct blk_mq_hw_ctx *hctx; 251 struct blk_mq_hw_ctx *hctx;
252 int i; 252 int i;
253 253
254 queue_for_each_hw_ctx(q, hctx, i) 254 queue_for_each_hw_ctx(q, hctx, i) {
255 cancel_delayed_work_sync(&hctx->delayed_work); 255 cancel_delayed_work_sync(&hctx->run_work);
256 cancel_delayed_work_sync(&hctx->delay_work);
257 }
256 } else { 258 } else {
257 cancel_delayed_work_sync(&q->delay_work); 259 cancel_delayed_work_sync(&q->delay_work);
258 } 260 }
@@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
574 if (!q) 576 if (!q)
575 return NULL; 577 return NULL;
576 578
577 if (percpu_counter_init(&q->mq_usage_counter, 0))
578 goto fail_q;
579
580 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 579 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
581 if (q->id < 0) 580 if (q->id < 0)
582 goto fail_c; 581 goto fail_q;
583 582
584 q->backing_dev_info.ra_pages = 583 q->backing_dev_info.ra_pages =
585 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 584 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -637,8 +636,6 @@ fail_bdi:
637 bdi_destroy(&q->backing_dev_info); 636 bdi_destroy(&q->backing_dev_info);
638fail_id: 637fail_id:
639 ida_simple_remove(&blk_queue_ida, q->id); 638 ida_simple_remove(&blk_queue_ida, q->id);
640fail_c:
641 percpu_counter_destroy(&q->mq_usage_counter);
642fail_q: 639fail_q:
643 kmem_cache_free(blk_requestq_cachep, q); 640 kmem_cache_free(blk_requestq_cachep, q);
644 return NULL; 641 return NULL;
@@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
846 __freed_request(rl, sync ^ 1); 843 __freed_request(rl, sync ^ 1);
847} 844}
848 845
846int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
847{
848 struct request_list *rl;
849
850 spin_lock_irq(q->queue_lock);
851 q->nr_requests = nr;
852 blk_queue_congestion_threshold(q);
853
854 /* congestion isn't cgroup aware and follows root blkcg for now */
855 rl = &q->root_rl;
856
857 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
858 blk_set_queue_congested(q, BLK_RW_SYNC);
859 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
860 blk_clear_queue_congested(q, BLK_RW_SYNC);
861
862 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
863 blk_set_queue_congested(q, BLK_RW_ASYNC);
864 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
865 blk_clear_queue_congested(q, BLK_RW_ASYNC);
866
867 blk_queue_for_each_rl(rl, q) {
868 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
869 blk_set_rl_full(rl, BLK_RW_SYNC);
870 } else {
871 blk_clear_rl_full(rl, BLK_RW_SYNC);
872 wake_up(&rl->wait[BLK_RW_SYNC]);
873 }
874
875 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
876 blk_set_rl_full(rl, BLK_RW_ASYNC);
877 } else {
878 blk_clear_rl_full(rl, BLK_RW_ASYNC);
879 wake_up(&rl->wait[BLK_RW_ASYNC]);
880 }
881 }
882
883 spin_unlock_irq(q->queue_lock);
884 return 0;
885}
886
849/* 887/*
850 * Determine if elevator data should be initialized when allocating the 888 * Determine if elevator data should be initialized when allocating the
851 * request associated with @bio. 889 * request associated with @bio.
@@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
1135struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1173struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1136{ 1174{
1137 if (q->mq_ops) 1175 if (q->mq_ops)
1138 return blk_mq_alloc_request(q, rw, gfp_mask); 1176 return blk_mq_alloc_request(q, rw, gfp_mask, false);
1139 else 1177 else
1140 return blk_old_get_request(q, rw, gfp_mask); 1178 return blk_old_get_request(q, rw, gfp_mask);
1141} 1179}
@@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
1231static void part_round_stats_single(int cpu, struct hd_struct *part, 1269static void part_round_stats_single(int cpu, struct hd_struct *part,
1232 unsigned long now) 1270 unsigned long now)
1233{ 1271{
1272 int inflight;
1273
1234 if (now == part->stamp) 1274 if (now == part->stamp)
1235 return; 1275 return;
1236 1276
1237 if (part_in_flight(part)) { 1277 inflight = part_in_flight(part);
1278 if (inflight) {
1238 __part_stat_add(cpu, part, time_in_queue, 1279 __part_stat_add(cpu, part, time_in_queue,
1239 part_in_flight(part) * (now - part->stamp)); 1280 inflight * (now - part->stamp));
1240 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1281 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1241 } 1282 }
1242 part->stamp = now; 1283 part->stamp = now;
@@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1360 1401
1361 rq->__data_len = rq->resid_len = len; 1402 rq->__data_len = rq->resid_len = len;
1362 rq->nr_phys_segments = 1; 1403 rq->nr_phys_segments = 1;
1363 rq->buffer = bio_data(bio);
1364} 1404}
1365EXPORT_SYMBOL_GPL(blk_add_request_payload); 1405EXPORT_SYMBOL_GPL(blk_add_request_payload);
1366 1406
@@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1402 bio->bi_next = req->bio; 1442 bio->bi_next = req->bio;
1403 req->bio = bio; 1443 req->bio = bio;
1404 1444
1405 /*
1406 * may not be valid. if the low level driver said
1407 * it didn't need a bounce buffer then it better
1408 * not touch req->buffer either...
1409 */
1410 req->buffer = bio_data(bio);
1411 req->__sector = bio->bi_iter.bi_sector; 1445 req->__sector = bio->bi_iter.bi_sector;
1412 req->__data_len += bio->bi_iter.bi_size; 1446 req->__data_len += bio->bi_iter.bi_size;
1413 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1447 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
@@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1432 * added on the elevator at this point. In addition, we don't have 1466 * added on the elevator at this point. In addition, we don't have
1433 * reliable access to the elevator outside queue lock. Only check basic 1467 * reliable access to the elevator outside queue lock. Only check basic
1434 * merging parameters without querying the elevator. 1468 * merging parameters without querying the elevator.
1469 *
1470 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1435 */ 1471 */
1436bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1472bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1437 unsigned int *request_count) 1473 unsigned int *request_count)
@@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1441 bool ret = false; 1477 bool ret = false;
1442 struct list_head *plug_list; 1478 struct list_head *plug_list;
1443 1479
1444 if (blk_queue_nomerges(q))
1445 goto out;
1446
1447 plug = current->plug; 1480 plug = current->plug;
1448 if (!plug) 1481 if (!plug)
1449 goto out; 1482 goto out;
@@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1522 * Check if we can merge with the plugged list before grabbing 1555 * Check if we can merge with the plugged list before grabbing
1523 * any locks. 1556 * any locks.
1524 */ 1557 */
1525 if (blk_attempt_plug_merge(q, bio, &request_count)) 1558 if (!blk_queue_nomerges(q) &&
1559 blk_attempt_plug_merge(q, bio, &request_count))
1526 return; 1560 return;
1527 1561
1528 spin_lock_irq(q->queue_lock); 1562 spin_lock_irq(q->queue_lock);
@@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void)
1654 struct dentry *dir = fault_create_debugfs_attr("fail_make_request", 1688 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
1655 NULL, &fail_make_request); 1689 NULL, &fail_make_request);
1656 1690
1657 return IS_ERR(dir) ? PTR_ERR(dir) : 0; 1691 return PTR_ERR_OR_ZERO(dir);
1658} 1692}
1659 1693
1660late_initcall(fail_make_request_debugfs); 1694late_initcall(fail_make_request_debugfs);
@@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2434 } 2468 }
2435 2469
2436 req->__data_len -= total_bytes; 2470 req->__data_len -= total_bytes;
2437 req->buffer = bio_data(req->bio);
2438 2471
2439 /* update sector only for requests with clear definition of sector */ 2472 /* update sector only for requests with clear definition of sector */
2440 if (req->cmd_type == REQ_TYPE_FS) 2473 if (req->cmd_type == REQ_TYPE_FS)
@@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
2503/* 2536/*
2504 * queue lock must be held 2537 * queue lock must be held
2505 */ 2538 */
2506static void blk_finish_request(struct request *req, int error) 2539void blk_finish_request(struct request *req, int error)
2507{ 2540{
2508 if (blk_rq_tagged(req)) 2541 if (blk_rq_tagged(req))
2509 blk_queue_end_tag(req->q, req); 2542 blk_queue_end_tag(req->q, req);
@@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error)
2529 __blk_put_request(req->q, req); 2562 __blk_put_request(req->q, req);
2530 } 2563 }
2531} 2564}
2565EXPORT_SYMBOL(blk_finish_request);
2532 2566
2533/** 2567/**
2534 * blk_end_bidi_request - Complete a bidi request 2568 * blk_end_bidi_request - Complete a bidi request
@@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2752 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ 2786 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2753 rq->cmd_flags |= bio->bi_rw & REQ_WRITE; 2787 rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
2754 2788
2755 if (bio_has_data(bio)) { 2789 if (bio_has_data(bio))
2756 rq->nr_phys_segments = bio_phys_segments(q, bio); 2790 rq->nr_phys_segments = bio_phys_segments(q, bio);
2757 rq->buffer = bio_data(bio); 2791
2758 }
2759 rq->__data_len = bio->bi_iter.bi_size; 2792 rq->__data_len = bio->bi_iter.bi_size;
2760 rq->bio = rq->biotail = bio; 2793 rq->bio = rq->biotail = bio;
2761 2794
@@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2831 2864
2832/* 2865/*
2833 * Copy attributes of the original request to the clone request. 2866 * Copy attributes of the original request to the clone request.
2834 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. 2867 * The actual data parts (e.g. ->cmd, ->sense) are not copied.
2835 */ 2868 */
2836static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2869static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2837{ 2870{
@@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2857 * 2890 *
2858 * Description: 2891 * Description:
2859 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 2892 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2860 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) 2893 * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
2861 * are not copied, and copying such parts is the caller's responsibility. 2894 * are not copied, and copying such parts is the caller's responsibility.
2862 * Also, pages which the original bios are pointing to are not copied 2895 * Also, pages which the original bios are pointing to are not copied
2863 * and the cloned bios just point same pages. 2896 * and the cloned bios just point same pages.
@@ -2904,20 +2937,25 @@ free_and_out:
2904} 2937}
2905EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2938EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2906 2939
2907int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2940int kblockd_schedule_work(struct work_struct *work)
2908{ 2941{
2909 return queue_work(kblockd_workqueue, work); 2942 return queue_work(kblockd_workqueue, work);
2910} 2943}
2911EXPORT_SYMBOL(kblockd_schedule_work); 2944EXPORT_SYMBOL(kblockd_schedule_work);
2912 2945
2913int kblockd_schedule_delayed_work(struct request_queue *q, 2946int kblockd_schedule_delayed_work(struct delayed_work *dwork,
2914 struct delayed_work *dwork, unsigned long delay) 2947 unsigned long delay)
2915{ 2948{
2916 return queue_delayed_work(kblockd_workqueue, dwork, delay); 2949 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2917} 2950}
2918EXPORT_SYMBOL(kblockd_schedule_delayed_work); 2951EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2919 2952
2920#define PLUG_MAGIC 0x91827364 2953int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2954 unsigned long delay)
2955{
2956 return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
2957}
2958EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
2921 2959
2922/** 2960/**
2923 * blk_start_plug - initialize blk_plug and track it inside the task_struct 2961 * blk_start_plug - initialize blk_plug and track it inside the task_struct
@@ -2937,7 +2975,6 @@ void blk_start_plug(struct blk_plug *plug)
2937{ 2975{
2938 struct task_struct *tsk = current; 2976 struct task_struct *tsk = current;
2939 2977
2940 plug->magic = PLUG_MAGIC;
2941 INIT_LIST_HEAD(&plug->list); 2978 INIT_LIST_HEAD(&plug->list);
2942 INIT_LIST_HEAD(&plug->mq_list); 2979 INIT_LIST_HEAD(&plug->mq_list);
2943 INIT_LIST_HEAD(&plug->cb_list); 2980 INIT_LIST_HEAD(&plug->cb_list);
@@ -3034,8 +3071,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3034 LIST_HEAD(list); 3071 LIST_HEAD(list);
3035 unsigned int depth; 3072 unsigned int depth;
3036 3073
3037 BUG_ON(plug->magic != PLUG_MAGIC);
3038
3039 flush_plug_callbacks(plug, from_schedule); 3074 flush_plug_callbacks(plug, from_schedule);
3040 3075
3041 if (!list_empty(&plug->mq_list)) 3076 if (!list_empty(&plug->mq_list))
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43e6b4755e9a..ff87c664b7df 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq)
130 blk_clear_rq_complete(rq); 130 blk_clear_rq_complete(rq);
131} 131}
132 132
133static void mq_flush_run(struct work_struct *work)
134{
135 struct request *rq;
136
137 rq = container_of(work, struct request, mq_flush_work);
138
139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_insert_request(rq, false, true, false);
141}
142
143static bool blk_flush_queue_rq(struct request *rq, bool add_front) 133static bool blk_flush_queue_rq(struct request *rq, bool add_front)
144{ 134{
145 if (rq->q->mq_ops) { 135 if (rq->q->mq_ops) {
146 INIT_WORK(&rq->mq_flush_work, mq_flush_run); 136 struct request_queue *q = rq->q;
147 kblockd_schedule_work(rq->q, &rq->mq_flush_work); 137
138 blk_mq_add_to_requeue_list(rq, add_front);
139 blk_mq_kick_requeue_list(q);
148 return false; 140 return false;
149 } else { 141 } else {
150 if (add_front) 142 if (add_front)
@@ -231,8 +223,10 @@ static void flush_end_io(struct request *flush_rq, int error)
231 struct request *rq, *n; 223 struct request *rq, *n;
232 unsigned long flags = 0; 224 unsigned long flags = 0;
233 225
234 if (q->mq_ops) 226 if (q->mq_ops) {
235 spin_lock_irqsave(&q->mq_flush_lock, flags); 227 spin_lock_irqsave(&q->mq_flush_lock, flags);
228 q->flush_rq->cmd_flags = 0;
229 }
236 230
237 running = &q->flush_queue[q->flush_running_idx]; 231 running = &q->flush_queue[q->flush_running_idx];
238 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 232 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
@@ -306,23 +300,9 @@ static bool blk_kick_flush(struct request_queue *q)
306 */ 300 */
307 q->flush_pending_idx ^= 1; 301 q->flush_pending_idx ^= 1;
308 302
309 if (q->mq_ops) { 303 blk_rq_init(q, q->flush_rq);
310 struct blk_mq_ctx *ctx = first_rq->mq_ctx; 304 if (q->mq_ops)
311 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 305 blk_mq_clone_flush_request(q->flush_rq, first_rq);
312
313 blk_mq_rq_init(hctx, q->flush_rq);
314 q->flush_rq->mq_ctx = ctx;
315
316 /*
317 * Reuse the tag value from the fist waiting request,
318 * with blk-mq the tag is generated during request
319 * allocation and drivers can rely on it being inside
320 * the range they asked for.
321 */
322 q->flush_rq->tag = first_rq->tag;
323 } else {
324 blk_rq_init(q, q->flush_rq);
325 }
326 306
327 q->flush_rq->cmd_type = REQ_TYPE_FS; 307 q->flush_rq->cmd_type = REQ_TYPE_FS;
328 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 308 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index c11d24e379e2..d828b44a404b 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete);
64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep() 64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
65 * is called. 65 * is called.
66 **/ 66 **/
67void blk_iopoll_complete(struct blk_iopoll *iopoll) 67void blk_iopoll_complete(struct blk_iopoll *iop)
68{ 68{
69 unsigned long flags; 69 unsigned long flags;
70 70
71 local_irq_save(flags); 71 local_irq_save(flags);
72 __blk_iopoll_complete(iopoll); 72 __blk_iopoll_complete(iop);
73 local_irq_restore(flags); 73 local_irq_restore(flags);
74} 74}
75EXPORT_SYMBOL(blk_iopoll_complete); 75EXPORT_SYMBOL(blk_iopoll_complete);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 97a733cf3d5f..8411be3c19d3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
226 * Generate and issue number of bios with zerofiled pages. 226 * Generate and issue number of bios with zerofiled pages.
227 */ 227 */
228 228
229int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 229static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
230 sector_t nr_sects, gfp_t gfp_mask) 230 sector_t nr_sects, gfp_t gfp_mask)
231{ 231{
232 int ret; 232 int ret;
233 struct bio *bio; 233 struct bio *bio;
diff --git a/block/blk-map.c b/block/blk-map.c
index f7b22bc21518..f890d4345b0c 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
155 if (!bio_flagged(bio, BIO_USER_MAPPED)) 155 if (!bio_flagged(bio, BIO_USER_MAPPED))
156 rq->cmd_flags |= REQ_COPY_USER; 156 rq->cmd_flags |= REQ_COPY_USER;
157 157
158 rq->buffer = NULL;
159 return 0; 158 return 0;
160unmap_rq: 159unmap_rq:
161 blk_rq_unmap_user(bio); 160 blk_rq_unmap_user(bio);
@@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
238 blk_queue_bounce(q, &bio); 237 blk_queue_bounce(q, &bio);
239 bio_get(bio); 238 bio_get(bio);
240 blk_rq_bio_prep(q, rq, bio); 239 blk_rq_bio_prep(q, rq, bio);
241 rq->buffer = NULL;
242 return 0; 240 return 0;
243} 241}
244EXPORT_SYMBOL(blk_rq_map_user_iov); 242EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
325 } 323 }
326 324
327 blk_queue_bounce(q, &rq->bio); 325 blk_queue_bounce(q, &rq->bio);
328 rq->buffer = NULL;
329 return 0; 326 return 0;
330} 327}
331EXPORT_SYMBOL(blk_rq_map_kern); 328EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6c583f9c5b65..b3bf0df0f4c2 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
13 struct bio *bio) 13 struct bio *bio)
14{ 14{
15 struct bio_vec bv, bvprv = { NULL }; 15 struct bio_vec bv, bvprv = { NULL };
16 int cluster, high, highprv = 1; 16 int cluster, high, highprv = 1, no_sg_merge;
17 unsigned int seg_size, nr_phys_segs; 17 unsigned int seg_size, nr_phys_segs;
18 struct bio *fbio, *bbio; 18 struct bio *fbio, *bbio;
19 struct bvec_iter iter; 19 struct bvec_iter iter;
@@ -35,12 +35,21 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
35 cluster = blk_queue_cluster(q); 35 cluster = blk_queue_cluster(q);
36 seg_size = 0; 36 seg_size = 0;
37 nr_phys_segs = 0; 37 nr_phys_segs = 0;
38 no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
39 high = 0;
38 for_each_bio(bio) { 40 for_each_bio(bio) {
39 bio_for_each_segment(bv, bio, iter) { 41 bio_for_each_segment(bv, bio, iter) {
40 /* 42 /*
43 * If SG merging is disabled, each bio vector is
44 * a segment
45 */
46 if (no_sg_merge)
47 goto new_segment;
48
49 /*
41 * the trick here is making sure that a high page is 50 * the trick here is making sure that a high page is
42 * never considered part of another segment, since that 51 * never considered part of another segment, since
43 * might change with the bounce page. 52 * that might change with the bounce page.
44 */ 53 */
45 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); 54 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
46 if (!high && !highprv && cluster) { 55 if (!high && !highprv && cluster) {
@@ -84,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq)
84 93
85void blk_recount_segments(struct request_queue *q, struct bio *bio) 94void blk_recount_segments(struct request_queue *q, struct bio *bio)
86{ 95{
87 struct bio *nxt = bio->bi_next; 96 if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags))
97 bio->bi_phys_segments = bio->bi_vcnt;
98 else {
99 struct bio *nxt = bio->bi_next;
100
101 bio->bi_next = NULL;
102 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
103 bio->bi_next = nxt;
104 }
88 105
89 bio->bi_next = NULL;
90 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
91 bio->bi_next = nxt;
92 bio->bi_flags |= (1 << BIO_SEG_VALID); 106 bio->bi_flags |= (1 << BIO_SEG_VALID);
93} 107}
94EXPORT_SYMBOL(blk_recount_segments); 108EXPORT_SYMBOL(blk_recount_segments);
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef8643bba..bb3ed488f7b5 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -1,3 +1,8 @@
1/*
2 * CPU notifier helper code for blk-mq
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/module.h> 7#include <linux/module.h>
3#include <linux/init.h> 8#include <linux/init.h>
@@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
18{ 23{
19 unsigned int cpu = (unsigned long) hcpu; 24 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify; 25 struct blk_mq_cpu_notifier *notify;
26 int ret = NOTIFY_OK;
21 27
22 raw_spin_lock(&blk_mq_cpu_notify_lock); 28 raw_spin_lock(&blk_mq_cpu_notify_lock);
23 29
24 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) 30 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
25 notify->notify(notify->data, action, cpu); 31 ret = notify->notify(notify->data, action, cpu);
32 if (ret != NOTIFY_OK)
33 break;
34 }
26 35
27 raw_spin_unlock(&blk_mq_cpu_notify_lock); 36 raw_spin_unlock(&blk_mq_cpu_notify_lock);
28 return NOTIFY_OK; 37 return ret;
29} 38}
30 39
31void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 40void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
@@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
45} 54}
46 55
47void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 56void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
48 void (*fn)(void *, unsigned long, unsigned int), 57 int (*fn)(void *, unsigned long, unsigned int),
49 void *data) 58 void *data)
50{ 59{
51 notifier->notify = fn; 60 notifier->notify = fn;
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 097921329619..1065d7c65fa1 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -1,3 +1,8 @@
1/*
2 * CPU <-> hardware queue mapping helpers
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/threads.h> 7#include <linux/threads.h>
3#include <linux/module.h> 8#include <linux/module.h>
@@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
80 return 0; 85 return 0;
81} 86}
82 87
83unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) 88unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
84{ 89{
85 unsigned int *map; 90 unsigned int *map;
86 91
87 /* If cpus are offline, map them to first hctx */ 92 /* If cpus are offline, map them to first hctx */
88 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, 93 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
89 reg->numa_node); 94 set->numa_node);
90 if (!map) 95 if (!map)
91 return NULL; 96 return NULL;
92 97
93 if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) 98 if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
94 return map; 99 return map;
95 100
96 kfree(map); 101 kfree(map);
97 return NULL; 102 return NULL;
98} 103}
104
105/*
106 * We have no quick way of doing reverse lookups. This is only used at
107 * queue init time, so runtime isn't important.
108 */
109int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
110{
111 int i;
112
113 for_each_possible_cpu(i) {
114 if (index == mq_map[i])
115 return cpu_to_node(i);
116 }
117
118 return NUMA_NO_NODE;
119}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index b0ba264b0522..ed5217867555 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
203 return ret; 203 return ret;
204} 204}
205 205
206static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) 206static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
207{
208 ssize_t ret;
209
210 spin_lock(&hctx->lock);
211 ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
212 spin_unlock(&hctx->lock);
213
214 return ret;
215}
216
217static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
218 const char *page, size_t len)
219{ 207{
220 struct blk_mq_ctx *ctx; 208 return blk_mq_tag_sysfs_show(hctx->tags, page);
221 unsigned long ret;
222 unsigned int i;
223
224 if (kstrtoul(page, 10, &ret)) {
225 pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
226 return -EINVAL;
227 }
228
229 spin_lock(&hctx->lock);
230 if (ret)
231 hctx->flags |= BLK_MQ_F_SHOULD_IPI;
232 else
233 hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
234 spin_unlock(&hctx->lock);
235
236 hctx_for_each_ctx(hctx, ctx, i)
237 ctx->ipi_redirect = !!ret;
238
239 return len;
240} 209}
241 210
242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 211static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
243{ 212{
244 return blk_mq_tag_sysfs_show(hctx->tags, page); 213 return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
245} 214}
246 215
247static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) 216static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
248{ 217{
249 unsigned int i, queue_num, first = 1; 218 unsigned int i, first = 1;
250 ssize_t ret = 0; 219 ssize_t ret = 0;
251 220
252 blk_mq_disable_hotplug(); 221 blk_mq_disable_hotplug();
253 222
254 for_each_online_cpu(i) { 223 for_each_cpu(i, hctx->cpumask) {
255 queue_num = hctx->queue->mq_map[i];
256 if (queue_num != hctx->queue_num)
257 continue;
258
259 if (first) 224 if (first)
260 ret += sprintf(ret + page, "%u", i); 225 ret += sprintf(ret + page, "%u", i);
261 else 226 else
@@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
307 .attr = {.name = "dispatched", .mode = S_IRUGO }, 272 .attr = {.name = "dispatched", .mode = S_IRUGO },
308 .show = blk_mq_hw_sysfs_dispatched_show, 273 .show = blk_mq_hw_sysfs_dispatched_show,
309}; 274};
275static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
276 .attr = {.name = "active", .mode = S_IRUGO },
277 .show = blk_mq_hw_sysfs_active_show,
278};
310static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 279static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
311 .attr = {.name = "pending", .mode = S_IRUGO }, 280 .attr = {.name = "pending", .mode = S_IRUGO },
312 .show = blk_mq_hw_sysfs_rq_list_show, 281 .show = blk_mq_hw_sysfs_rq_list_show,
313}; 282};
314static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
315 .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
316 .show = blk_mq_hw_sysfs_ipi_show,
317 .store = blk_mq_hw_sysfs_ipi_store,
318};
319static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 283static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
320 .attr = {.name = "tags", .mode = S_IRUGO }, 284 .attr = {.name = "tags", .mode = S_IRUGO },
321 .show = blk_mq_hw_sysfs_tags_show, 285 .show = blk_mq_hw_sysfs_tags_show,
@@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = {
330 &blk_mq_hw_sysfs_run.attr, 294 &blk_mq_hw_sysfs_run.attr,
331 &blk_mq_hw_sysfs_dispatched.attr, 295 &blk_mq_hw_sysfs_dispatched.attr,
332 &blk_mq_hw_sysfs_pending.attr, 296 &blk_mq_hw_sysfs_pending.attr,
333 &blk_mq_hw_sysfs_ipi.attr,
334 &blk_mq_hw_sysfs_tags.attr, 297 &blk_mq_hw_sysfs_tags.attr,
335 &blk_mq_hw_sysfs_cpus.attr, 298 &blk_mq_hw_sysfs_cpus.attr,
299 &blk_mq_hw_sysfs_active.attr,
336 NULL, 300 NULL,
337}; 301};
338 302
@@ -363,6 +327,42 @@ static struct kobj_type blk_mq_hw_ktype = {
363 .release = blk_mq_sysfs_release, 327 .release = blk_mq_sysfs_release,
364}; 328};
365 329
330static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
331{
332 struct blk_mq_ctx *ctx;
333 int i;
334
335 if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
336 return;
337
338 hctx_for_each_ctx(hctx, ctx, i)
339 kobject_del(&ctx->kobj);
340
341 kobject_del(&hctx->kobj);
342}
343
344static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
345{
346 struct request_queue *q = hctx->queue;
347 struct blk_mq_ctx *ctx;
348 int i, ret;
349
350 if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
351 return 0;
352
353 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
354 if (ret)
355 return ret;
356
357 hctx_for_each_ctx(hctx, ctx, i) {
358 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
359 if (ret)
360 break;
361 }
362
363 return ret;
364}
365
366void blk_mq_unregister_disk(struct gendisk *disk) 366void blk_mq_unregister_disk(struct gendisk *disk)
367{ 367{
368 struct request_queue *q = disk->queue; 368 struct request_queue *q = disk->queue;
@@ -371,11 +371,11 @@ void blk_mq_unregister_disk(struct gendisk *disk)
371 int i, j; 371 int i, j;
372 372
373 queue_for_each_hw_ctx(q, hctx, i) { 373 queue_for_each_hw_ctx(q, hctx, i) {
374 hctx_for_each_ctx(hctx, ctx, j) { 374 blk_mq_unregister_hctx(hctx);
375 kobject_del(&ctx->kobj); 375
376 hctx_for_each_ctx(hctx, ctx, j)
376 kobject_put(&ctx->kobj); 377 kobject_put(&ctx->kobj);
377 } 378
378 kobject_del(&hctx->kobj);
379 kobject_put(&hctx->kobj); 379 kobject_put(&hctx->kobj);
380 } 380 }
381 381
@@ -386,15 +386,30 @@ void blk_mq_unregister_disk(struct gendisk *disk)
386 kobject_put(&disk_to_dev(disk)->kobj); 386 kobject_put(&disk_to_dev(disk)->kobj);
387} 387}
388 388
389static void blk_mq_sysfs_init(struct request_queue *q)
390{
391 struct blk_mq_hw_ctx *hctx;
392 struct blk_mq_ctx *ctx;
393 int i, j;
394
395 kobject_init(&q->mq_kobj, &blk_mq_ktype);
396
397 queue_for_each_hw_ctx(q, hctx, i) {
398 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
399
400 hctx_for_each_ctx(hctx, ctx, j)
401 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
402 }
403}
404
389int blk_mq_register_disk(struct gendisk *disk) 405int blk_mq_register_disk(struct gendisk *disk)
390{ 406{
391 struct device *dev = disk_to_dev(disk); 407 struct device *dev = disk_to_dev(disk);
392 struct request_queue *q = disk->queue; 408 struct request_queue *q = disk->queue;
393 struct blk_mq_hw_ctx *hctx; 409 struct blk_mq_hw_ctx *hctx;
394 struct blk_mq_ctx *ctx; 410 int ret, i;
395 int ret, i, j;
396 411
397 kobject_init(&q->mq_kobj, &blk_mq_ktype); 412 blk_mq_sysfs_init(q);
398 413
399 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 414 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
400 if (ret < 0) 415 if (ret < 0)
@@ -403,20 +418,10 @@ int blk_mq_register_disk(struct gendisk *disk)
403 kobject_uevent(&q->mq_kobj, KOBJ_ADD); 418 kobject_uevent(&q->mq_kobj, KOBJ_ADD);
404 419
405 queue_for_each_hw_ctx(q, hctx, i) { 420 queue_for_each_hw_ctx(q, hctx, i) {
406 kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 421 hctx->flags |= BLK_MQ_F_SYSFS_UP;
407 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); 422 ret = blk_mq_register_hctx(hctx);
408 if (ret) 423 if (ret)
409 break; 424 break;
410
411 if (!hctx->nr_ctx)
412 continue;
413
414 hctx_for_each_ctx(hctx, ctx, j) {
415 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
416 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
417 if (ret)
418 break;
419 }
420 } 425 }
421 426
422 if (ret) { 427 if (ret) {
@@ -426,3 +431,26 @@ int blk_mq_register_disk(struct gendisk *disk)
426 431
427 return 0; 432 return 0;
428} 433}
434
435void blk_mq_sysfs_unregister(struct request_queue *q)
436{
437 struct blk_mq_hw_ctx *hctx;
438 int i;
439
440 queue_for_each_hw_ctx(q, hctx, i)
441 blk_mq_unregister_hctx(hctx);
442}
443
444int blk_mq_sysfs_register(struct request_queue *q)
445{
446 struct blk_mq_hw_ctx *hctx;
447 int i, ret = 0;
448
449 queue_for_each_hw_ctx(q, hctx, i) {
450 ret = blk_mq_register_hctx(hctx);
451 if (ret)
452 break;
453 }
454
455 return ret;
456}
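
The sysfs changes above boil down to an init-once/register-many pattern: kobject_init() moves into a one-time blk_mq_sysfs_init(), while the per-hctx register/unregister helpers are guarded by the new BLK_MQ_F_SYSFS_UP flag so blk_mq_sysfs_register()/blk_mq_sysfs_unregister() can safely re-run when queue mappings change. As a rough stand-alone illustration of that guard-flag pattern (not kernel code; every name here is invented):

#include <stdbool.h>
#include <stdio.h>

#define FAKE_SYSFS_UP	(1 << 0)	/* stands in for BLK_MQ_F_SYSFS_UP */

struct fake_hctx {
	unsigned int flags;
	unsigned int nr_ctx;		/* mapped software contexts */
	bool registered;		/* stands in for the added kobject */
};

static int fake_register_hctx(struct fake_hctx *hctx)
{
	/* same guard as the kernel helper: skip unmapped or not-up hctxs */
	if (!hctx->nr_ctx || !(hctx->flags & FAKE_SYSFS_UP))
		return 0;
	hctx->registered = true;
	return 0;
}

static void fake_unregister_hctx(struct fake_hctx *hctx)
{
	if (!hctx->nr_ctx || !(hctx->flags & FAKE_SYSFS_UP))
		return;
	hctx->registered = false;
}

int main(void)
{
	struct fake_hctx hctx = { .flags = FAKE_SYSFS_UP, .nr_ctx = 4 };

	fake_register_hctx(&hctx);
	printf("registered: %d\n", hctx.registered);

	/* a queue remap unregisters, updates the mapping, re-registers */
	fake_unregister_hctx(&hctx);
	hctx.nr_ctx = 2;
	fake_register_hctx(&hctx);
	printf("re-registered with %u ctxs: %d\n", hctx.nr_ctx, hctx.registered);
	return 0;
}
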
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c51a27..d90c4aeb7dd3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,78 +1,345 @@
1/*
2 * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
3 * over multiple cachelines to avoid ping-pong between multiple submitters
 4 * or submitter and completer. Uses rolling wakeups to avoid falling off
5 * the scaling cliff when we run out of tags and have to start putting
6 * submitters to sleep.
7 *
8 * Uses active queue tracking to support fairer distribution of tags
9 * between multiple submitters when a shared tag map is used.
10 *
11 * Copyright (C) 2013-2014 Jens Axboe
12 */
1#include <linux/kernel.h> 13#include <linux/kernel.h>
2#include <linux/module.h> 14#include <linux/module.h>
3#include <linux/percpu_ida.h> 15#include <linux/random.h>
4 16
5#include <linux/blk-mq.h> 17#include <linux/blk-mq.h>
6#include "blk.h" 18#include "blk.h"
7#include "blk-mq.h" 19#include "blk-mq.h"
8#include "blk-mq-tag.h" 20#include "blk-mq-tag.h"
9 21
22static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
23{
24 int i;
25
26 for (i = 0; i < bt->map_nr; i++) {
27 struct blk_align_bitmap *bm = &bt->map[i];
28 int ret;
29
30 ret = find_first_zero_bit(&bm->word, bm->depth);
31 if (ret < bm->depth)
32 return true;
33 }
34
35 return false;
36}
37
38bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
39{
40 if (!tags)
41 return true;
42
43 return bt_has_free_tags(&tags->bitmap_tags);
44}
45
46static inline void bt_index_inc(unsigned int *index)
47{
48 *index = (*index + 1) & (BT_WAIT_QUEUES - 1);
49}
50
10/* 51/*
11 * Per tagged queue (tag address space) map 52 * If a previously inactive queue goes active, bump the active user count.
12 */ 53 */
13struct blk_mq_tags { 54bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
14 unsigned int nr_tags; 55{
15 unsigned int nr_reserved_tags; 56 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
16 unsigned int nr_batch_move; 57 !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
17 unsigned int nr_max_cache; 58 atomic_inc(&hctx->tags->active_queues);
18 59
19 struct percpu_ida free_tags; 60 return true;
20 struct percpu_ida reserved_tags; 61}
21};
22 62
23void blk_mq_wait_for_tags(struct blk_mq_tags *tags) 63/*
 64 * Wake up all waiters potentially sleeping on normal (non-reserved) tags
65 */
66static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
24{ 67{
25 int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); 68 struct blk_mq_bitmap_tags *bt;
26 blk_mq_put_tag(tags, tag); 69 int i, wake_index;
70
71 bt = &tags->bitmap_tags;
72 wake_index = bt->wake_index;
73 for (i = 0; i < BT_WAIT_QUEUES; i++) {
74 struct bt_wait_state *bs = &bt->bs[wake_index];
75
76 if (waitqueue_active(&bs->wait))
77 wake_up(&bs->wait);
78
79 bt_index_inc(&wake_index);
80 }
27} 81}
28 82
29bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 83/*
84 * If a previously busy queue goes inactive, potential waiters could now
85 * be allowed to queue. Wake them up and check.
86 */
87void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
88{
89 struct blk_mq_tags *tags = hctx->tags;
90
91 if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
92 return;
93
94 atomic_dec(&tags->active_queues);
95
96 blk_mq_tag_wakeup_all(tags);
97}
98
99/*
100 * For shared tag users, we track the number of currently active users
101 * and attempt to provide a fair share of the tag depth for each of them.
102 */
103static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
104 struct blk_mq_bitmap_tags *bt)
105{
106 unsigned int depth, users;
107
108 if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
109 return true;
110 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
111 return true;
112
113 /*
114 * Don't try dividing an ant
115 */
116 if (bt->depth == 1)
117 return true;
118
119 users = atomic_read(&hctx->tags->active_queues);
120 if (!users)
121 return true;
122
123 /*
124 * Allow at least some tags
125 */
126 depth = max((bt->depth + users - 1) / users, 4U);
127 return atomic_read(&hctx->nr_active) < depth;
128}
129
130static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
30{ 131{
31 return !tags || 132 int tag, org_last_tag, end;
32 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; 133
134 org_last_tag = last_tag;
135 end = bm->depth;
136 do {
137restart:
138 tag = find_next_zero_bit(&bm->word, end, last_tag);
139 if (unlikely(tag >= end)) {
140 /*
 141 * We started with an offset; restart from 0 to
142 * exhaust the map.
143 */
144 if (org_last_tag && last_tag) {
145 end = last_tag;
146 last_tag = 0;
147 goto restart;
148 }
149 return -1;
150 }
151 last_tag = tag + 1;
152 } while (test_and_set_bit_lock(tag, &bm->word));
153
154 return tag;
33} 155}
34 156
35static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) 157/*
158 * Straight forward bitmap tag implementation, where each bit is a tag
159 * (cleared == free, and set == busy). The small twist is using per-cpu
160 * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
161 * contexts. This enables us to drastically limit the space searched,
162 * without dirtying an extra shared cacheline like we would if we stored
163 * the cache value inside the shared blk_mq_bitmap_tags structure. On top
164 * of that, each word of tags is in a separate cacheline. This means that
165 * multiple users will tend to stick to different cachelines, at least
166 * until the map is exhausted.
167 */
168static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
169 unsigned int *tag_cache)
36{ 170{
171 unsigned int last_tag, org_last_tag;
172 int index, i, tag;
173
174 if (!hctx_may_queue(hctx, bt))
175 return -1;
176
177 last_tag = org_last_tag = *tag_cache;
178 index = TAG_TO_INDEX(bt, last_tag);
179
180 for (i = 0; i < bt->map_nr; i++) {
181 tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
182 if (tag != -1) {
183 tag += (index << bt->bits_per_word);
184 goto done;
185 }
186
187 last_tag = 0;
188 if (++index >= bt->map_nr)
189 index = 0;
190 }
191
192 *tag_cache = 0;
193 return -1;
194
195 /*
196 * Only update the cache from the allocation path, if we ended
197 * up using the specific cached tag.
198 */
199done:
200 if (tag == org_last_tag) {
201 last_tag = tag + 1;
202 if (last_tag >= bt->depth - 1)
203 last_tag = 0;
204
205 *tag_cache = last_tag;
206 }
207
208 return tag;
209}
210
211static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
212 struct blk_mq_hw_ctx *hctx)
213{
214 struct bt_wait_state *bs;
215
216 if (!hctx)
217 return &bt->bs[0];
218
219 bs = &bt->bs[hctx->wait_index];
220 bt_index_inc(&hctx->wait_index);
221 return bs;
222}
223
224static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
225 unsigned int *last_tag, gfp_t gfp)
226{
227 struct bt_wait_state *bs;
228 DEFINE_WAIT(wait);
37 int tag; 229 int tag;
38 230
39 tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? 231 tag = __bt_get(hctx, bt, last_tag);
40 TASK_UNINTERRUPTIBLE : TASK_RUNNING); 232 if (tag != -1)
41 if (tag < 0) 233 return tag;
42 return BLK_MQ_TAG_FAIL; 234
43 return tag + tags->nr_reserved_tags; 235 if (!(gfp & __GFP_WAIT))
236 return -1;
237
238 bs = bt_wait_ptr(bt, hctx);
239 do {
240 bool was_empty;
241
242 was_empty = list_empty(&wait.task_list);
243 prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
244
245 tag = __bt_get(hctx, bt, last_tag);
246 if (tag != -1)
247 break;
248
249 if (was_empty)
250 atomic_set(&bs->wait_cnt, bt->wake_cnt);
251
252 io_schedule();
253 } while (1);
254
255 finish_wait(&bs->wait, &wait);
256 return tag;
257}
258
259static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
260 struct blk_mq_hw_ctx *hctx,
261 unsigned int *last_tag, gfp_t gfp)
262{
263 int tag;
264
265 tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
266 if (tag >= 0)
267 return tag + tags->nr_reserved_tags;
268
269 return BLK_MQ_TAG_FAIL;
44} 270}
45 271
46static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, 272static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
47 gfp_t gfp) 273 gfp_t gfp)
48{ 274{
49 int tag; 275 int tag, zero = 0;
50 276
51 if (unlikely(!tags->nr_reserved_tags)) { 277 if (unlikely(!tags->nr_reserved_tags)) {
52 WARN_ON_ONCE(1); 278 WARN_ON_ONCE(1);
53 return BLK_MQ_TAG_FAIL; 279 return BLK_MQ_TAG_FAIL;
54 } 280 }
55 281
56 tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? 282 tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
57 TASK_UNINTERRUPTIBLE : TASK_RUNNING);
58 if (tag < 0) 283 if (tag < 0)
59 return BLK_MQ_TAG_FAIL; 284 return BLK_MQ_TAG_FAIL;
285
60 return tag; 286 return tag;
61} 287}
62 288
63unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) 289unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
290 gfp_t gfp, bool reserved)
64{ 291{
65 if (!reserved) 292 if (!reserved)
66 return __blk_mq_get_tag(tags, gfp); 293 return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
67 294
68 return __blk_mq_get_reserved_tag(tags, gfp); 295 return __blk_mq_get_reserved_tag(hctx->tags, gfp);
296}
297
298static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
299{
300 int i, wake_index;
301
302 wake_index = bt->wake_index;
303 for (i = 0; i < BT_WAIT_QUEUES; i++) {
304 struct bt_wait_state *bs = &bt->bs[wake_index];
305
306 if (waitqueue_active(&bs->wait)) {
307 if (wake_index != bt->wake_index)
308 bt->wake_index = wake_index;
309
310 return bs;
311 }
312
313 bt_index_inc(&wake_index);
314 }
315
316 return NULL;
317}
318
319static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
320{
321 const int index = TAG_TO_INDEX(bt, tag);
322 struct bt_wait_state *bs;
323
324 /*
 325 * The unlock memory barrier needs to order access to the request in the
 326 * free path against the clearing of the tag bit
327 */
328 clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
329
330 bs = bt_wake_ptr(bt);
331 if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
332 atomic_set(&bs->wait_cnt, bt->wake_cnt);
333 bt_index_inc(&bt->wake_index);
334 wake_up(&bs->wait);
335 }
69} 336}
70 337
71static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 338static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
72{ 339{
73 BUG_ON(tag >= tags->nr_tags); 340 BUG_ON(tag >= tags->nr_tags);
74 341
75 percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); 342 bt_clear_tag(&tags->bitmap_tags, tag);
76} 343}
77 344
78static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, 345static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
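
The hctx_may_queue() helper added in the hunk above is the heart of the shared-tag fairness: each active queue gets roughly its proportional share of the tag depth, rounded up and floored at 4 so small maps still make progress. A stand-alone sketch of just that arithmetic (invented numbers, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

static bool may_queue(unsigned int total_depth, unsigned int users,
		      unsigned int nr_active)
{
	unsigned int depth;

	if (total_depth == 1 || users == 0)
		return true;

	depth = (total_depth + users - 1) / users;	/* round up */
	if (depth < 4)
		depth = 4;				/* allow at least some tags */
	return nr_active < depth;
}

int main(void)
{
	/* 128 tags shared by 5 active queues -> ~26 tags each */
	printf("%d\n", may_queue(128, 5, 20));	/* 1: under its share */
	printf("%d\n", may_queue(128, 5, 26));	/* 0: at its share */
	/* tiny map: the 4-tag floor still lets a queue make progress */
	printf("%d\n", may_queue(8, 8, 3));	/* 1 */
	return 0;
}
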
@@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
80{ 347{
81 BUG_ON(tag >= tags->nr_reserved_tags); 348 BUG_ON(tag >= tags->nr_reserved_tags);
82 349
83 percpu_ida_free(&tags->reserved_tags, tag); 350 bt_clear_tag(&tags->breserved_tags, tag);
84} 351}
85 352
86void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 353void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
354 unsigned int *last_tag)
87{ 355{
88 if (tag >= tags->nr_reserved_tags) 356 struct blk_mq_tags *tags = hctx->tags;
89 __blk_mq_put_tag(tags, tag); 357
90 else 358 if (tag >= tags->nr_reserved_tags) {
359 const int real_tag = tag - tags->nr_reserved_tags;
360
361 __blk_mq_put_tag(tags, real_tag);
362 *last_tag = real_tag;
363 } else
91 __blk_mq_put_reserved_tag(tags, tag); 364 __blk_mq_put_reserved_tag(tags, tag);
92} 365}
93 366
94static int __blk_mq_tag_iter(unsigned id, void *data) 367static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
368 unsigned long *free_map, unsigned int off)
95{ 369{
96 unsigned long *tag_map = data; 370 int i;
97 __set_bit(id, tag_map); 371
98 return 0; 372 for (i = 0; i < bt->map_nr; i++) {
373 struct blk_align_bitmap *bm = &bt->map[i];
374 int bit = 0;
375
376 do {
377 bit = find_next_zero_bit(&bm->word, bm->depth, bit);
378 if (bit >= bm->depth)
379 break;
380
381 __set_bit(bit + off, free_map);
382 bit++;
383 } while (1);
384
385 off += (1 << bt->bits_per_word);
386 }
99} 387}
100 388
101void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, 389void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
@@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
109 if (!tag_map) 397 if (!tag_map)
110 return; 398 return;
111 399
112 percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); 400 bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
113 if (tags->nr_reserved_tags) 401 if (tags->nr_reserved_tags)
114 percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, 402 bt_for_each_free(&tags->breserved_tags, tag_map, 0);
115 tag_map);
116 403
117 fn(data, tag_map); 404 fn(data, tag_map);
118 kfree(tag_map); 405 kfree(tag_map);
119} 406}
407EXPORT_SYMBOL(blk_mq_tag_busy_iter);
408
409static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
410{
411 unsigned int i, used;
412
413 for (i = 0, used = 0; i < bt->map_nr; i++) {
414 struct blk_align_bitmap *bm = &bt->map[i];
415
416 used += bitmap_weight(&bm->word, bm->depth);
417 }
418
419 return bt->depth - used;
420}
421
422static void bt_update_count(struct blk_mq_bitmap_tags *bt,
423 unsigned int depth)
424{
425 unsigned int tags_per_word = 1U << bt->bits_per_word;
426 unsigned int map_depth = depth;
427
428 if (depth) {
429 int i;
430
431 for (i = 0; i < bt->map_nr; i++) {
432 bt->map[i].depth = min(map_depth, tags_per_word);
433 map_depth -= bt->map[i].depth;
434 }
435 }
436
437 bt->wake_cnt = BT_WAIT_BATCH;
438 if (bt->wake_cnt > depth / 4)
439 bt->wake_cnt = max(1U, depth / 4);
440
441 bt->depth = depth;
442}
443
444static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
445 int node, bool reserved)
446{
447 int i;
448
449 bt->bits_per_word = ilog2(BITS_PER_LONG);
450
451 /*
 452 * Depth can be zero for reserved tags; that's not a failure
453 * condition.
454 */
455 if (depth) {
456 unsigned int nr, tags_per_word;
457
458 tags_per_word = (1 << bt->bits_per_word);
459
460 /*
461 * If the tag space is small, shrink the number of tags
462 * per word so we spread over a few cachelines, at least.
 463 * If there are fewer than 4 tags, just forget about it; it's not
464 * going to work optimally anyway.
465 */
466 if (depth >= 4) {
467 while (tags_per_word * 4 > depth) {
468 bt->bits_per_word--;
469 tags_per_word = (1 << bt->bits_per_word);
470 }
471 }
472
473 nr = ALIGN(depth, tags_per_word) / tags_per_word;
474 bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
475 GFP_KERNEL, node);
476 if (!bt->map)
477 return -ENOMEM;
478
479 bt->map_nr = nr;
480 }
481
482 bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
483 if (!bt->bs) {
484 kfree(bt->map);
485 return -ENOMEM;
486 }
487
488 for (i = 0; i < BT_WAIT_QUEUES; i++)
489 init_waitqueue_head(&bt->bs[i].wait);
490
491 bt_update_count(bt, depth);
492 return 0;
493}
494
495static void bt_free(struct blk_mq_bitmap_tags *bt)
496{
497 kfree(bt->map);
498 kfree(bt->bs);
499}
500
501static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
502 int node)
503{
504 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
505
506 if (bt_alloc(&tags->bitmap_tags, depth, node, false))
507 goto enomem;
508 if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
509 goto enomem;
510
511 return tags;
512enomem:
513 bt_free(&tags->bitmap_tags);
514 kfree(tags);
515 return NULL;
516}
120 517
121struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 518struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
122 unsigned int reserved_tags, int node) 519 unsigned int reserved_tags, int node)
123{ 520{
124 unsigned int nr_tags, nr_cache;
125 struct blk_mq_tags *tags; 521 struct blk_mq_tags *tags;
126 int ret;
127 522
128 if (total_tags > BLK_MQ_TAG_MAX) { 523 if (total_tags > BLK_MQ_TAG_MAX) {
129 pr_err("blk-mq: tag depth too large\n"); 524 pr_err("blk-mq: tag depth too large\n");
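
bt_alloc() above sizes the sparse map so that even small tag spaces spread across at least four words (and therefore cachelines), by shrinking bits_per_word until four words' worth of tags no longer exceeds the depth. A rough stand-alone reproduction of that sizing logic, assuming 64-bit longs (illustrative only, not the kernel code):

#include <stdio.h>

static void size_map(unsigned int depth)
{
	unsigned int bits_per_word = 6;			/* ilog2(64) */
	unsigned int tags_per_word = 1u << bits_per_word;
	unsigned int nr;

	/* shrink the word until at least 4 words are needed, as in bt_alloc() */
	if (depth >= 4) {
		while (tags_per_word * 4 > depth) {
			bits_per_word--;
			tags_per_word = 1u << bits_per_word;
		}
	}
	nr = (depth + tags_per_word - 1) / tags_per_word;	/* ALIGN()/div */
	printf("depth=%3u -> %2u tags/word, %2u words\n",
	       depth, tags_per_word, nr);
}

int main(void)
{
	size_map(256);	/* keeps full 64-tag words: 4 words */
	size_map(64);	/* shrinks to 16 tags/word: 4 words */
	size_map(32);	/* shrinks to 8 tags/word: 4 words */
	return 0;
}
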
@@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
134 if (!tags) 529 if (!tags)
135 return NULL; 530 return NULL;
136 531
137 nr_tags = total_tags - reserved_tags;
138 nr_cache = nr_tags / num_possible_cpus();
139
140 if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
141 nr_cache = BLK_MQ_TAG_CACHE_MIN;
142 else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
143 nr_cache = BLK_MQ_TAG_CACHE_MAX;
144
145 tags->nr_tags = total_tags; 532 tags->nr_tags = total_tags;
146 tags->nr_reserved_tags = reserved_tags; 533 tags->nr_reserved_tags = reserved_tags;
147 tags->nr_max_cache = nr_cache;
148 tags->nr_batch_move = max(1u, nr_cache / 2);
149 534
150 ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - 535 return blk_mq_init_bitmap_tags(tags, node);
151 tags->nr_reserved_tags, 536}
152 tags->nr_max_cache,
153 tags->nr_batch_move);
154 if (ret)
155 goto err_free_tags;
156 537
157 if (reserved_tags) { 538void blk_mq_free_tags(struct blk_mq_tags *tags)
158 /* 539{
159 * With max_cahe and batch set to 1, the allocator fallbacks to 540 bt_free(&tags->bitmap_tags);
160 * no cached. It's fine reserved tags allocation is slow. 541 bt_free(&tags->breserved_tags);
161 */ 542 kfree(tags);
162 ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, 543}
163 1, 1);
164 if (ret)
165 goto err_reserved_tags;
166 }
167 544
168 return tags; 545void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
546{
547 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
169 548
170err_reserved_tags: 549 *tag = prandom_u32() % depth;
171 percpu_ida_destroy(&tags->free_tags);
172err_free_tags:
173 kfree(tags);
174 return NULL;
175} 550}
176 551
177void blk_mq_free_tags(struct blk_mq_tags *tags) 552int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
178{ 553{
179 percpu_ida_destroy(&tags->free_tags); 554 tdepth -= tags->nr_reserved_tags;
180 percpu_ida_destroy(&tags->reserved_tags); 555 if (tdepth > tags->nr_tags)
181 kfree(tags); 556 return -EINVAL;
557
558 /*
 559 * We don't need to (and can't) update reserved tags here; they remain
560 * static and should never need resizing.
561 */
562 bt_update_count(&tags->bitmap_tags, tdepth);
563 blk_mq_tag_wakeup_all(tags);
564 return 0;
182} 565}
183 566
184ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 567ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
185{ 568{
186 char *orig_page = page; 569 char *orig_page = page;
187 unsigned int cpu; 570 unsigned int free, res;
188 571
189 if (!tags) 572 if (!tags)
190 return 0; 573 return 0;
191 574
192 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," 575 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
193 " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, 576 "bits_per_word=%u\n",
194 tags->nr_batch_move, tags->nr_max_cache); 577 tags->nr_tags, tags->nr_reserved_tags,
578 tags->bitmap_tags.bits_per_word);
195 579
196 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", 580 free = bt_unused_tags(&tags->bitmap_tags);
197 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), 581 res = bt_unused_tags(&tags->breserved_tags);
198 percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
199 582
200 for_each_possible_cpu(cpu) { 583 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
201 page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, 584 page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
202 percpu_ida_free_tags(&tags->free_tags, cpu));
203 }
204 585
205 return page - orig_page; 586 return page - orig_page;
206} 587}
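
The core allocation path in this file is the wrap-around scan in __bt_get_word(): search from the cached offset to the end of the word, then, if the search started at a non-zero offset, rescan from bit 0 up to the original offset before declaring the word full. The following single-word, single-threaded toy model (plain C, no kernel helpers, no atomicity) mirrors that control flow:

#include <stdio.h>

/* toy find_next_zero_bit() over one word: first clear bit in [off, size) */
static int next_zero_bit(unsigned long word, int size, int off)
{
	int bit;

	for (bit = off; bit < size; bit++)
		if (!(word & (1UL << bit)))
			return bit;
	return size;			/* "not found", like the kernel helper */
}

static int get_tag(unsigned long *word, int depth, int last_tag)
{
	int org_last_tag = last_tag, end = depth, tag;

restart:
	tag = next_zero_bit(*word, end, last_tag);
	if (tag >= end) {
		/* we started at an offset: wrap and scan [0, offset) */
		if (org_last_tag && last_tag) {
			end = last_tag;
			last_tag = 0;
			goto restart;
		}
		return -1;		/* word is exhausted */
	}
	*word |= 1UL << tag;		/* test_and_set_bit_lock() in the kernel */
	return tag;
}

int main(void)
{
	unsigned long word = 0;
	int i;

	/* cached offset 5 on an 8-deep word: expect 5 6 7 0 1 2 3 4, then -1 */
	for (i = 0; i < 9; i++)
		printf("%d ", get_tag(&word, 8, 5));
	printf("\n");
	return 0;
}
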
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 947ba2c6148e..c959de58d2a5 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,17 +1,59 @@
1#ifndef INT_BLK_MQ_TAG_H 1#ifndef INT_BLK_MQ_TAG_H
2#define INT_BLK_MQ_TAG_H 2#define INT_BLK_MQ_TAG_H
3 3
4struct blk_mq_tags; 4#include "blk-mq.h"
5
6enum {
7 BT_WAIT_QUEUES = 8,
8 BT_WAIT_BATCH = 8,
9};
10
11struct bt_wait_state {
12 atomic_t wait_cnt;
13 wait_queue_head_t wait;
14} ____cacheline_aligned_in_smp;
15
16#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
17#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
18
19struct blk_mq_bitmap_tags {
20 unsigned int depth;
21 unsigned int wake_cnt;
22 unsigned int bits_per_word;
23
24 unsigned int map_nr;
25 struct blk_align_bitmap *map;
26
27 unsigned int wake_index;
28 struct bt_wait_state *bs;
29};
30
31/*
32 * Tag address space map.
33 */
34struct blk_mq_tags {
35 unsigned int nr_tags;
36 unsigned int nr_reserved_tags;
37
38 atomic_t active_queues;
39
40 struct blk_mq_bitmap_tags bitmap_tags;
41 struct blk_mq_bitmap_tags breserved_tags;
42
43 struct request **rqs;
44 struct list_head page_list;
45};
46
5 47
6extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 48extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
7extern void blk_mq_free_tags(struct blk_mq_tags *tags); 49extern void blk_mq_free_tags(struct blk_mq_tags *tags);
8 50
9extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); 51extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
10extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); 52extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
11extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
12extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
13extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 53extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
14extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 54extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
55extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
56extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
15 57
16enum { 58enum {
17 BLK_MQ_TAG_CACHE_MIN = 1, 59 BLK_MQ_TAG_CACHE_MIN = 1,
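
The TAG_TO_INDEX()/TAG_TO_BIT() macros above split a tag into a word index and a bit offset using bits_per_word. A quick stand-alone check of that arithmetic (the macros here take bits_per_word directly instead of a blk_mq_bitmap_tags pointer, purely for illustration):

#include <stdio.h>

#define TAG_TO_INDEX(bits, tag)	((tag) >> (bits))
#define TAG_TO_BIT(bits, tag)	((tag) & ((1u << (bits)) - 1))

int main(void)
{
	unsigned int bits_per_word = 5;		/* 32 tags per word */
	unsigned int tags[] = { 0, 31, 32, 71 };
	unsigned int i;

	for (i = 0; i < sizeof(tags) / sizeof(tags[0]); i++)
		printf("tag %2u -> word %u, bit %2u\n", tags[i],
		       TAG_TO_INDEX(bits_per_word, tags[i]),
		       TAG_TO_BIT(bits_per_word, tags[i]));
	return 0;
}
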
@@ -24,4 +66,23 @@ enum {
24 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 66 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
25}; 67};
26 68
69extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
70extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
71
72static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
73{
74 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
75 return false;
76
77 return __blk_mq_tag_busy(hctx);
78}
79
80static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
81{
82 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
83 return;
84
85 __blk_mq_tag_idle(hctx);
86}
87
27#endif 88#endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1d2a9bdbee57..0f5879c42dcd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1,3 +1,9 @@
1/*
2 * Block multiqueue core code
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */
1#include <linux/kernel.h> 7#include <linux/kernel.h>
2#include <linux/module.h> 8#include <linux/module.h>
3#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
@@ -56,38 +62,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
56{ 62{
57 unsigned int i; 63 unsigned int i;
58 64
59 for (i = 0; i < hctx->nr_ctx_map; i++) 65 for (i = 0; i < hctx->ctx_map.map_size; i++)
60 if (hctx->ctx_map[i]) 66 if (hctx->ctx_map.map[i].word)
61 return true; 67 return true;
62 68
63 return false; 69 return false;
64} 70}
65 71
72static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
73 struct blk_mq_ctx *ctx)
74{
75 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
76}
77
78#define CTX_TO_BIT(hctx, ctx) \
79 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
80
66/* 81/*
67 * Mark this ctx as having pending work in this hardware queue 82 * Mark this ctx as having pending work in this hardware queue
68 */ 83 */
69static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 84static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
70 struct blk_mq_ctx *ctx) 85 struct blk_mq_ctx *ctx)
71{ 86{
72 if (!test_bit(ctx->index_hw, hctx->ctx_map)) 87 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
73 set_bit(ctx->index_hw, hctx->ctx_map); 88
89 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
90 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
74} 91}
75 92
76static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 93static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
77 gfp_t gfp, bool reserved) 94 struct blk_mq_ctx *ctx)
78{ 95{
79 struct request *rq; 96 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
80 unsigned int tag;
81 97
82 tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 98 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
83 if (tag != BLK_MQ_TAG_FAIL) {
84 rq = hctx->rqs[tag];
85 rq->tag = tag;
86
87 return rq;
88 }
89
90 return NULL;
91} 99}
92 100
93static int blk_mq_queue_enter(struct request_queue *q) 101static int blk_mq_queue_enter(struct request_queue *q)
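
The hunk above replaces the flat per-hctx ctx_map bitmap with an array of blk_align_bitmap words, so each word can live in its own cacheline; a software queue's index_hw selects word index_hw / bits_per_word and bit index_hw & (bits_per_word - 1). A simplified stand-alone model of mark/clear/has-pending over such a sparse map (sizes invented, no per-cacheline alignment or atomics):

#include <stdbool.h>
#include <stdio.h>

#define WORDS		4
#define BITS_PER_WORD	8	/* stands in for ctx_map.bits_per_word */

static unsigned long ctx_map[WORDS];	/* one word per (pretend) cacheline */

static void mark_pending(unsigned int index_hw)
{
	ctx_map[index_hw / BITS_PER_WORD] |=
			1UL << (index_hw & (BITS_PER_WORD - 1));
}

static void clear_pending(unsigned int index_hw)
{
	ctx_map[index_hw / BITS_PER_WORD] &=
			~(1UL << (index_hw & (BITS_PER_WORD - 1)));
}

static bool has_pending(void)
{
	unsigned int i;

	for (i = 0; i < WORDS; i++)
		if (ctx_map[i])
			return true;
	return false;
}

int main(void)
{
	mark_pending(3);		/* ctx 3  -> word 0, bit 3 */
	mark_pending(19);		/* ctx 19 -> word 2, bit 3 */
	printf("pending: %d\n", has_pending());
	clear_pending(3);
	clear_pending(19);
	printf("pending: %d\n", has_pending());
	return 0;
}
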
@@ -186,78 +194,95 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
186 if (blk_queue_io_stat(q)) 194 if (blk_queue_io_stat(q))
187 rw_flags |= REQ_IO_STAT; 195 rw_flags |= REQ_IO_STAT;
188 196
197 INIT_LIST_HEAD(&rq->queuelist);
198 /* csd/requeue_work/fifo_time is initialized before use */
199 rq->q = q;
189 rq->mq_ctx = ctx; 200 rq->mq_ctx = ctx;
190 rq->cmd_flags = rw_flags; 201 rq->cmd_flags |= rw_flags;
191 rq->start_time = jiffies; 202 /* do not touch atomic flags, it needs atomic ops against the timer */
203 rq->cpu = -1;
204 INIT_HLIST_NODE(&rq->hash);
205 RB_CLEAR_NODE(&rq->rb_node);
206 rq->rq_disk = NULL;
207 rq->part = NULL;
208#ifdef CONFIG_BLK_CGROUP
209 rq->rl = NULL;
192 set_start_time_ns(rq); 210 set_start_time_ns(rq);
211 rq->io_start_time_ns = 0;
212#endif
213 rq->nr_phys_segments = 0;
214#if defined(CONFIG_BLK_DEV_INTEGRITY)
215 rq->nr_integrity_segments = 0;
216#endif
217 rq->special = NULL;
218 /* tag was already set */
219 rq->errors = 0;
220
221 rq->extra_len = 0;
222 rq->sense_len = 0;
223 rq->resid_len = 0;
224 rq->sense = NULL;
225
226 INIT_LIST_HEAD(&rq->timeout_list);
227 rq->end_io = NULL;
228 rq->end_io_data = NULL;
229 rq->next_rq = NULL;
230
193 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 231 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
194} 232}
195 233
196static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 234static struct request *
197 int rw, gfp_t gfp, 235__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
198 bool reserved) 236 struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
199{ 237{
200 struct request *rq; 238 struct request *rq;
239 unsigned int tag;
201 240
202 do { 241 tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
203 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 242 if (tag != BLK_MQ_TAG_FAIL) {
204 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 243 rq = hctx->tags->rqs[tag];
205 244
206 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 245 rq->cmd_flags = 0;
207 if (rq) { 246 if (blk_mq_tag_busy(hctx)) {
208 blk_mq_rq_ctx_init(q, ctx, rq, rw); 247 rq->cmd_flags = REQ_MQ_INFLIGHT;
209 break; 248 atomic_inc(&hctx->nr_active);
210 } 249 }
211 250
212 blk_mq_put_ctx(ctx); 251 rq->tag = tag;
213 if (!(gfp & __GFP_WAIT)) 252 blk_mq_rq_ctx_init(q, ctx, rq, rw);
214 break; 253 return rq;
215 254 }
216 __blk_mq_run_hw_queue(hctx);
217 blk_mq_wait_for_tags(hctx->tags);
218 } while (1);
219 255
220 return rq; 256 return NULL;
221} 257}
222 258
223struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) 259struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
260 bool reserved)
224{ 261{
262 struct blk_mq_ctx *ctx;
263 struct blk_mq_hw_ctx *hctx;
225 struct request *rq; 264 struct request *rq;
226 265
227 if (blk_mq_queue_enter(q)) 266 if (blk_mq_queue_enter(q))
228 return NULL; 267 return NULL;
229 268
230 rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); 269 ctx = blk_mq_get_ctx(q);
231 if (rq) 270 hctx = q->mq_ops->map_queue(q, ctx->cpu);
232 blk_mq_put_ctx(rq->mq_ctx);
233 return rq;
234}
235
236struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
237 gfp_t gfp)
238{
239 struct request *rq;
240 271
241 if (blk_mq_queue_enter(q)) 272 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT,
242 return NULL; 273 reserved);
274 if (!rq && (gfp & __GFP_WAIT)) {
275 __blk_mq_run_hw_queue(hctx);
276 blk_mq_put_ctx(ctx);
243 277
244 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 278 ctx = blk_mq_get_ctx(q);
245 if (rq) 279 hctx = q->mq_ops->map_queue(q, ctx->cpu);
246 blk_mq_put_ctx(rq->mq_ctx); 280 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved);
281 }
282 blk_mq_put_ctx(ctx);
247 return rq; 283 return rq;
248} 284}
249EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 285EXPORT_SYMBOL(blk_mq_alloc_request);
250
251/*
252 * Re-init and set pdu, if we have it
253 */
254void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
255{
256 blk_rq_init(hctx->queue, rq);
257
258 if (hctx->cmd_size)
259 rq->special = blk_mq_rq_to_pdu(rq);
260}
261 286
262static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 287static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
263 struct blk_mq_ctx *ctx, struct request *rq) 288 struct blk_mq_ctx *ctx, struct request *rq)
@@ -265,9 +290,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
265 const int tag = rq->tag; 290 const int tag = rq->tag;
266 struct request_queue *q = rq->q; 291 struct request_queue *q = rq->q;
267 292
268 blk_mq_rq_init(hctx, rq); 293 if (rq->cmd_flags & REQ_MQ_INFLIGHT)
269 blk_mq_put_tag(hctx->tags, tag); 294 atomic_dec(&hctx->nr_active);
270 295
296 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
297 blk_mq_put_tag(hctx, tag, &ctx->last_tag);
271 blk_mq_queue_exit(q); 298 blk_mq_queue_exit(q);
272} 299}
273 300
@@ -283,20 +310,47 @@ void blk_mq_free_request(struct request *rq)
283 __blk_mq_free_request(hctx, ctx, rq); 310 __blk_mq_free_request(hctx, ctx, rq);
284} 311}
285 312
286bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) 313/*
314 * Clone all relevant state from a request that has been put on hold in
315 * the flush state machine into the preallocated flush request that hangs
316 * off the request queue.
317 *
 318 * To the driver the flush request should be invisible; that's why we
 319 * impersonate the original request here.
320 */
321void blk_mq_clone_flush_request(struct request *flush_rq,
322 struct request *orig_rq)
287{ 323{
288 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 324 struct blk_mq_hw_ctx *hctx =
289 return true; 325 orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
326
327 flush_rq->mq_ctx = orig_rq->mq_ctx;
328 flush_rq->tag = orig_rq->tag;
329 memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
330 hctx->cmd_size);
331}
290 332
333inline void __blk_mq_end_io(struct request *rq, int error)
334{
291 blk_account_io_done(rq); 335 blk_account_io_done(rq);
292 336
293 if (rq->end_io) 337 if (rq->end_io) {
294 rq->end_io(rq, error); 338 rq->end_io(rq, error);
295 else 339 } else {
340 if (unlikely(blk_bidi_rq(rq)))
341 blk_mq_free_request(rq->next_rq);
296 blk_mq_free_request(rq); 342 blk_mq_free_request(rq);
297 return false; 343 }
344}
345EXPORT_SYMBOL(__blk_mq_end_io);
346
347void blk_mq_end_io(struct request *rq, int error)
348{
349 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
350 BUG();
351 __blk_mq_end_io(rq, error);
298} 352}
299EXPORT_SYMBOL(blk_mq_end_io_partial); 353EXPORT_SYMBOL(blk_mq_end_io);
300 354
301static void __blk_mq_complete_request_remote(void *data) 355static void __blk_mq_complete_request_remote(void *data)
302{ 356{
@@ -305,18 +359,22 @@ static void __blk_mq_complete_request_remote(void *data)
305 rq->q->softirq_done_fn(rq); 359 rq->q->softirq_done_fn(rq);
306} 360}
307 361
308void __blk_mq_complete_request(struct request *rq) 362static void blk_mq_ipi_complete_request(struct request *rq)
309{ 363{
310 struct blk_mq_ctx *ctx = rq->mq_ctx; 364 struct blk_mq_ctx *ctx = rq->mq_ctx;
365 bool shared = false;
311 int cpu; 366 int cpu;
312 367
313 if (!ctx->ipi_redirect) { 368 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
314 rq->q->softirq_done_fn(rq); 369 rq->q->softirq_done_fn(rq);
315 return; 370 return;
316 } 371 }
317 372
318 cpu = get_cpu(); 373 cpu = get_cpu();
319 if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { 374 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
375 shared = cpus_share_cache(cpu, ctx->cpu);
376
377 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
320 rq->csd.func = __blk_mq_complete_request_remote; 378 rq->csd.func = __blk_mq_complete_request_remote;
321 rq->csd.info = rq; 379 rq->csd.info = rq;
322 rq->csd.flags = 0; 380 rq->csd.flags = 0;
@@ -327,6 +385,16 @@ void __blk_mq_complete_request(struct request *rq)
327 put_cpu(); 385 put_cpu();
328} 386}
329 387
388void __blk_mq_complete_request(struct request *rq)
389{
390 struct request_queue *q = rq->q;
391
392 if (!q->softirq_done_fn)
393 blk_mq_end_io(rq, rq->errors);
394 else
395 blk_mq_ipi_complete_request(rq);
396}
397
330/** 398/**
331 * blk_mq_complete_request - end I/O on a request 399 * blk_mq_complete_request - end I/O on a request
332 * @rq: the request being processed 400 * @rq: the request being processed
@@ -337,7 +405,9 @@ void __blk_mq_complete_request(struct request *rq)
337 **/ 405 **/
338void blk_mq_complete_request(struct request *rq) 406void blk_mq_complete_request(struct request *rq)
339{ 407{
340 if (unlikely(blk_should_fake_timeout(rq->q))) 408 struct request_queue *q = rq->q;
409
410 if (unlikely(blk_should_fake_timeout(q)))
341 return; 411 return;
342 if (!blk_mark_rq_complete(rq)) 412 if (!blk_mark_rq_complete(rq))
343 __blk_mq_complete_request(rq); 413 __blk_mq_complete_request(rq);
@@ -350,13 +420,31 @@ static void blk_mq_start_request(struct request *rq, bool last)
350 420
351 trace_block_rq_issue(q, rq); 421 trace_block_rq_issue(q, rq);
352 422
423 rq->resid_len = blk_rq_bytes(rq);
424 if (unlikely(blk_bidi_rq(rq)))
425 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
426
353 /* 427 /*
354 * Just mark start time and set the started bit. Due to memory 428 * Just mark start time and set the started bit. Due to memory
355 * ordering, we know we'll see the correct deadline as long as 429 * ordering, we know we'll see the correct deadline as long as
356 * REQ_ATOMIC_STARTED is seen. 430 * REQ_ATOMIC_STARTED is seen. Use the default queue timeout,
431 * unless one has been set in the request.
432 */
433 if (!rq->timeout)
434 rq->deadline = jiffies + q->rq_timeout;
435 else
436 rq->deadline = jiffies + rq->timeout;
437
438 /*
439 * Mark us as started and clear complete. Complete might have been
440 * set if requeue raced with timeout, which then marked it as
441 * complete. So be sure to clear complete again when we start
442 * the request, otherwise we'll ignore the completion event.
357 */ 443 */
358 rq->deadline = jiffies + q->rq_timeout; 444 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
359 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 445 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
446 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
447 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
360 448
361 if (q->dma_drain_size && blk_rq_bytes(rq)) { 449 if (q->dma_drain_size && blk_rq_bytes(rq)) {
362 /* 450 /*
@@ -378,7 +466,7 @@ static void blk_mq_start_request(struct request *rq, bool last)
378 rq->cmd_flags |= REQ_END; 466 rq->cmd_flags |= REQ_END;
379} 467}
380 468
381static void blk_mq_requeue_request(struct request *rq) 469static void __blk_mq_requeue_request(struct request *rq)
382{ 470{
383 struct request_queue *q = rq->q; 471 struct request_queue *q = rq->q;
384 472
@@ -391,6 +479,86 @@ static void blk_mq_requeue_request(struct request *rq)
391 rq->nr_phys_segments--; 479 rq->nr_phys_segments--;
392} 480}
393 481
482void blk_mq_requeue_request(struct request *rq)
483{
484 __blk_mq_requeue_request(rq);
485 blk_clear_rq_complete(rq);
486
487 BUG_ON(blk_queued_rq(rq));
488 blk_mq_add_to_requeue_list(rq, true);
489}
490EXPORT_SYMBOL(blk_mq_requeue_request);
491
492static void blk_mq_requeue_work(struct work_struct *work)
493{
494 struct request_queue *q =
495 container_of(work, struct request_queue, requeue_work);
496 LIST_HEAD(rq_list);
497 struct request *rq, *next;
498 unsigned long flags;
499
500 spin_lock_irqsave(&q->requeue_lock, flags);
501 list_splice_init(&q->requeue_list, &rq_list);
502 spin_unlock_irqrestore(&q->requeue_lock, flags);
503
504 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
505 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
506 continue;
507
508 rq->cmd_flags &= ~REQ_SOFTBARRIER;
509 list_del_init(&rq->queuelist);
510 blk_mq_insert_request(rq, true, false, false);
511 }
512
513 while (!list_empty(&rq_list)) {
514 rq = list_entry(rq_list.next, struct request, queuelist);
515 list_del_init(&rq->queuelist);
516 blk_mq_insert_request(rq, false, false, false);
517 }
518
519 blk_mq_run_queues(q, false);
520}
521
522void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
523{
524 struct request_queue *q = rq->q;
525 unsigned long flags;
526
527 /*
528 * We abuse this flag that is otherwise used by the I/O scheduler to
 529 * request head insertion from the workqueue.
530 */
531 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
532
533 spin_lock_irqsave(&q->requeue_lock, flags);
534 if (at_head) {
535 rq->cmd_flags |= REQ_SOFTBARRIER;
536 list_add(&rq->queuelist, &q->requeue_list);
537 } else {
538 list_add_tail(&rq->queuelist, &q->requeue_list);
539 }
540 spin_unlock_irqrestore(&q->requeue_lock, flags);
541}
542EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
543
544void blk_mq_kick_requeue_list(struct request_queue *q)
545{
546 kblockd_schedule_work(&q->requeue_work);
547}
548EXPORT_SYMBOL(blk_mq_kick_requeue_list);
549
550struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
551{
552 struct request_queue *q = hctx->queue;
553
554 if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
555 q->flush_rq->tag == tag)
556 return q->flush_rq;
557
558 return hctx->tags->rqs[tag];
559}
560EXPORT_SYMBOL(blk_mq_tag_to_rq);
561
394struct blk_mq_timeout_data { 562struct blk_mq_timeout_data {
395 struct blk_mq_hw_ctx *hctx; 563 struct blk_mq_hw_ctx *hctx;
396 unsigned long *next; 564 unsigned long *next;
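
blk_mq_requeue_work() above splices the whole requeue list onto a private list under the lock, then makes two passes: REQ_SOFTBARRIER-marked requests are inserted at the head first, everything else at the tail in order. A much-simplified stand-alone sketch of that two-pass ordering (arrays instead of lists, no locking; all names invented):

#include <stdbool.h>
#include <stdio.h>

struct fake_rq {
	int id;
	bool at_head;	/* stands in for REQ_SOFTBARRIER */
	bool done;
};

static void requeue_work(struct fake_rq *rqs, int nr)
{
	int i;

	/* pass 1: head insertions (REQ_SOFTBARRIER in the kernel) */
	for (i = 0; i < nr; i++)
		if (rqs[i].at_head && !rqs[i].done) {
			printf("insert rq %d at head\n", rqs[i].id);
			rqs[i].done = true;
		}

	/* pass 2: everything else, preserving order */
	for (i = 0; i < nr; i++)
		if (!rqs[i].done) {
			printf("insert rq %d at tail\n", rqs[i].id);
			rqs[i].done = true;
		}
}

int main(void)
{
	struct fake_rq rqs[] = {
		{ .id = 1, .at_head = false },
		{ .id = 2, .at_head = true },
		{ .id = 3, .at_head = false },
	};

	requeue_work(rqs, 3);
	return 0;
}
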
@@ -412,12 +580,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
412 do { 580 do {
413 struct request *rq; 581 struct request *rq;
414 582
415 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 583 tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
416 if (tag >= hctx->queue_depth) 584 if (tag >= hctx->tags->nr_tags)
417 break; 585 break;
418 586
419 rq = hctx->rqs[tag++]; 587 rq = blk_mq_tag_to_rq(hctx, tag++);
420 588 if (rq->q != hctx->queue)
589 continue;
421 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 590 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
422 continue; 591 continue;
423 592
@@ -442,6 +611,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
442 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 611 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
443} 612}
444 613
614static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
615{
616 struct request_queue *q = rq->q;
617
618 /*
619 * We know that complete is set at this point. If STARTED isn't set
620 * anymore, then the request isn't active and the "timeout" should
621 * just be ignored. This can happen due to the bitflag ordering.
622 * Timeout first checks if STARTED is set, and if it is, assumes
623 * the request is active. But if we race with completion, then
 624 * both flags will get cleared. So check here again, and ignore
625 * a timeout event with a request that isn't active.
626 */
627 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
628 return BLK_EH_NOT_HANDLED;
629
630 if (!q->mq_ops->timeout)
631 return BLK_EH_RESET_TIMER;
632
633 return q->mq_ops->timeout(rq);
634}
635
445static void blk_mq_rq_timer(unsigned long data) 636static void blk_mq_rq_timer(unsigned long data)
446{ 637{
447 struct request_queue *q = (struct request_queue *) data; 638 struct request_queue *q = (struct request_queue *) data;
@@ -449,11 +640,24 @@ static void blk_mq_rq_timer(unsigned long data)
449 unsigned long next = 0; 640 unsigned long next = 0;
450 int i, next_set = 0; 641 int i, next_set = 0;
451 642
452 queue_for_each_hw_ctx(q, hctx, i) 643 queue_for_each_hw_ctx(q, hctx, i) {
644 /*
 645 * If no software queues are currently mapped to this
646 * hardware queue, there's nothing to check
647 */
648 if (!hctx->nr_ctx || !hctx->tags)
649 continue;
650
453 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 651 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
652 }
454 653
455 if (next_set) 654 if (next_set) {
456 mod_timer(&q->timeout, round_jiffies_up(next)); 655 next = blk_rq_timeout(round_jiffies_up(next));
656 mod_timer(&q->timeout, next);
657 } else {
658 queue_for_each_hw_ctx(q, hctx, i)
659 blk_mq_tag_idle(hctx);
660 }
457} 661}
458 662
459/* 663/*
@@ -495,9 +699,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
495 return false; 699 return false;
496} 700}
497 701
498void blk_mq_add_timer(struct request *rq) 702/*
703 * Process software queues that have been marked busy, splicing them
704 * to the for-dispatch
705 */
706static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
499{ 707{
500 __blk_add_timer(rq, NULL); 708 struct blk_mq_ctx *ctx;
709 int i;
710
711 for (i = 0; i < hctx->ctx_map.map_size; i++) {
712 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
713 unsigned int off, bit;
714
715 if (!bm->word)
716 continue;
717
718 bit = 0;
719 off = i * hctx->ctx_map.bits_per_word;
720 do {
721 bit = find_next_bit(&bm->word, bm->depth, bit);
722 if (bit >= bm->depth)
723 break;
724
725 ctx = hctx->ctxs[bit + off];
726 clear_bit(bit, &bm->word);
727 spin_lock(&ctx->lock);
728 list_splice_tail_init(&ctx->rq_list, list);
729 spin_unlock(&ctx->lock);
730
731 bit++;
732 } while (1);
733 }
501} 734}
502 735
503/* 736/*
@@ -509,10 +742,11 @@ void blk_mq_add_timer(struct request *rq)
509static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 742static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
510{ 743{
511 struct request_queue *q = hctx->queue; 744 struct request_queue *q = hctx->queue;
512 struct blk_mq_ctx *ctx;
513 struct request *rq; 745 struct request *rq;
514 LIST_HEAD(rq_list); 746 LIST_HEAD(rq_list);
515 int bit, queued; 747 int queued;
748
749 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
516 750
517 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 751 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
518 return; 752 return;
@@ -522,15 +756,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
522 /* 756 /*
523 * Touch any software queue that has pending entries. 757 * Touch any software queue that has pending entries.
524 */ 758 */
525 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 759 flush_busy_ctxs(hctx, &rq_list);
526 clear_bit(bit, hctx->ctx_map);
527 ctx = hctx->ctxs[bit];
528 BUG_ON(bit != ctx->index_hw);
529
530 spin_lock(&ctx->lock);
531 list_splice_tail_init(&ctx->rq_list, &rq_list);
532 spin_unlock(&ctx->lock);
533 }
534 760
535 /* 761 /*
536 * If we have previous entries on our dispatch list, grab them 762 * If we have previous entries on our dispatch list, grab them
@@ -544,13 +770,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
544 } 770 }
545 771
546 /* 772 /*
547 * Delete and return all entries from our dispatch list
548 */
549 queued = 0;
550
551 /*
552 * Now process all the entries, sending them to the driver. 773 * Now process all the entries, sending them to the driver.
553 */ 774 */
775 queued = 0;
554 while (!list_empty(&rq_list)) { 776 while (!list_empty(&rq_list)) {
555 int ret; 777 int ret;
556 778
@@ -565,13 +787,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
565 queued++; 787 queued++;
566 continue; 788 continue;
567 case BLK_MQ_RQ_QUEUE_BUSY: 789 case BLK_MQ_RQ_QUEUE_BUSY:
568 /*
569 * FIXME: we should have a mechanism to stop the queue
570 * like blk_stop_queue, otherwise we will waste cpu
571 * time
572 */
573 list_add(&rq->queuelist, &rq_list); 790 list_add(&rq->queuelist, &rq_list);
574 blk_mq_requeue_request(rq); 791 __blk_mq_requeue_request(rq);
575 break; 792 break;
576 default: 793 default:
577 pr_err("blk-mq: bad return on queue: %d\n", ret); 794 pr_err("blk-mq: bad return on queue: %d\n", ret);
@@ -601,17 +818,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
601 } 818 }
602} 819}
603 820
821/*
822 * It'd be great if the workqueue API had a way to pass
823 * in a mask and had some smarts for more clever placement.
824 * For now we just round-robin here, switching for every
825 * BLK_MQ_CPU_WORK_BATCH queued items.
826 */
827static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
828{
829 int cpu = hctx->next_cpu;
830
831 if (--hctx->next_cpu_batch <= 0) {
832 int next_cpu;
833
834 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
835 if (next_cpu >= nr_cpu_ids)
836 next_cpu = cpumask_first(hctx->cpumask);
837
838 hctx->next_cpu = next_cpu;
839 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
840 }
841
842 return cpu;
843}
844
604void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 845void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
605{ 846{
606 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 847 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
607 return; 848 return;
608 849
609 if (!async) 850 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
610 __blk_mq_run_hw_queue(hctx); 851 __blk_mq_run_hw_queue(hctx);
852 else if (hctx->queue->nr_hw_queues == 1)
853 kblockd_schedule_delayed_work(&hctx->run_work, 0);
611 else { 854 else {
612 struct request_queue *q = hctx->queue; 855 unsigned int cpu;
613 856
614 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 857 cpu = blk_mq_hctx_next_cpu(hctx);
858 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
615 } 859 }
616} 860}
617 861
@@ -626,14 +870,17 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
626 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 870 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
627 continue; 871 continue;
628 872
873 preempt_disable();
629 blk_mq_run_hw_queue(hctx, async); 874 blk_mq_run_hw_queue(hctx, async);
875 preempt_enable();
630 } 876 }
631} 877}
632EXPORT_SYMBOL(blk_mq_run_queues); 878EXPORT_SYMBOL(blk_mq_run_queues);
633 879
634void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 880void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
635{ 881{
636 cancel_delayed_work(&hctx->delayed_work); 882 cancel_delayed_work(&hctx->run_work);
883 cancel_delayed_work(&hctx->delay_work);
637 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 884 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
638} 885}
639EXPORT_SYMBOL(blk_mq_stop_hw_queue); 886EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -651,11 +898,25 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues);
651void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 898void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
652{ 899{
653 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 900 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
901
902 preempt_disable();
654 __blk_mq_run_hw_queue(hctx); 903 __blk_mq_run_hw_queue(hctx);
904 preempt_enable();
655} 905}
656EXPORT_SYMBOL(blk_mq_start_hw_queue); 906EXPORT_SYMBOL(blk_mq_start_hw_queue);
657 907
658void blk_mq_start_stopped_hw_queues(struct request_queue *q) 908void blk_mq_start_hw_queues(struct request_queue *q)
909{
910 struct blk_mq_hw_ctx *hctx;
911 int i;
912
913 queue_for_each_hw_ctx(q, hctx, i)
914 blk_mq_start_hw_queue(hctx);
915}
916EXPORT_SYMBOL(blk_mq_start_hw_queues);
917
918
919void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
659{ 920{
660 struct blk_mq_hw_ctx *hctx; 921 struct blk_mq_hw_ctx *hctx;
661 int i; 922 int i;
@@ -665,19 +926,47 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
665 continue; 926 continue;
666 927
667 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 928 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
668 blk_mq_run_hw_queue(hctx, true); 929 preempt_disable();
930 blk_mq_run_hw_queue(hctx, async);
931 preempt_enable();
669 } 932 }
670} 933}
671EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 934EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
672 935
673static void blk_mq_work_fn(struct work_struct *work) 936static void blk_mq_run_work_fn(struct work_struct *work)
674{ 937{
675 struct blk_mq_hw_ctx *hctx; 938 struct blk_mq_hw_ctx *hctx;
676 939
677 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 940 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
941
678 __blk_mq_run_hw_queue(hctx); 942 __blk_mq_run_hw_queue(hctx);
679} 943}
680 944
945static void blk_mq_delay_work_fn(struct work_struct *work)
946{
947 struct blk_mq_hw_ctx *hctx;
948
949 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
950
951 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
952 __blk_mq_run_hw_queue(hctx);
953}
954
955void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
956{
957 unsigned long tmo = msecs_to_jiffies(msecs);
958
959 if (hctx->queue->nr_hw_queues == 1)
960 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
961 else {
962 unsigned int cpu;
963
964 cpu = blk_mq_hctx_next_cpu(hctx);
965 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
966 }
967}
968EXPORT_SYMBOL(blk_mq_delay_queue);
969
681static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 970static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
682 struct request *rq, bool at_head) 971 struct request *rq, bool at_head)
683{ 972{
@@ -689,12 +978,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
689 list_add(&rq->queuelist, &ctx->rq_list); 978 list_add(&rq->queuelist, &ctx->rq_list);
690 else 979 else
691 list_add_tail(&rq->queuelist, &ctx->rq_list); 980 list_add_tail(&rq->queuelist, &ctx->rq_list);
981
692 blk_mq_hctx_mark_pending(hctx, ctx); 982 blk_mq_hctx_mark_pending(hctx, ctx);
693 983
694 /* 984 /*
695 * We do this early, to ensure we are on the right CPU. 985 * We do this early, to ensure we are on the right CPU.
696 */ 986 */
697 blk_mq_add_timer(rq); 987 blk_add_timer(rq);
698} 988}
699 989
700void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 990void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
@@ -719,10 +1009,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
719 spin_unlock(&ctx->lock); 1009 spin_unlock(&ctx->lock);
720 } 1010 }
721 1011
722 blk_mq_put_ctx(current_ctx);
723
724 if (run_queue) 1012 if (run_queue)
725 blk_mq_run_hw_queue(hctx, async); 1013 blk_mq_run_hw_queue(hctx, async);
1014
1015 blk_mq_put_ctx(current_ctx);
726} 1016}
727 1017
728static void blk_mq_insert_requests(struct request_queue *q, 1018static void blk_mq_insert_requests(struct request_queue *q,
@@ -758,9 +1048,8 @@ static void blk_mq_insert_requests(struct request_queue *q,
758 } 1048 }
759 spin_unlock(&ctx->lock); 1049 spin_unlock(&ctx->lock);
760 1050
761 blk_mq_put_ctx(current_ctx);
762
763 blk_mq_run_hw_queue(hctx, from_schedule); 1051 blk_mq_run_hw_queue(hctx, from_schedule);
1052 blk_mq_put_ctx(current_ctx);
764} 1053}
765 1054
766static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1055static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -823,24 +1112,169 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
823static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1112static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
824{ 1113{
825 init_request_from_bio(rq, bio); 1114 init_request_from_bio(rq, bio);
826 blk_account_io_start(rq, 1); 1115
1116 if (blk_do_io_stat(rq)) {
1117 rq->start_time = jiffies;
1118 blk_account_io_start(rq, 1);
1119 }
827} 1120}
828 1121
829static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 1122static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1123 struct blk_mq_ctx *ctx,
1124 struct request *rq, struct bio *bio)
1125{
1126 struct request_queue *q = hctx->queue;
1127
1128 if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
1129 blk_mq_bio_to_request(rq, bio);
1130 spin_lock(&ctx->lock);
1131insert_rq:
1132 __blk_mq_insert_request(hctx, rq, false);
1133 spin_unlock(&ctx->lock);
1134 return false;
1135 } else {
1136 spin_lock(&ctx->lock);
1137 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1138 blk_mq_bio_to_request(rq, bio);
1139 goto insert_rq;
1140 }
1141
1142 spin_unlock(&ctx->lock);
1143 __blk_mq_free_request(hctx, ctx, rq);
1144 return true;
1145 }
1146}
1147
1148struct blk_map_ctx {
1149 struct blk_mq_hw_ctx *hctx;
1150 struct blk_mq_ctx *ctx;
1151};
1152
1153static struct request *blk_mq_map_request(struct request_queue *q,
1154 struct bio *bio,
1155 struct blk_map_ctx *data)
830{ 1156{
831 struct blk_mq_hw_ctx *hctx; 1157 struct blk_mq_hw_ctx *hctx;
832 struct blk_mq_ctx *ctx; 1158 struct blk_mq_ctx *ctx;
1159 struct request *rq;
1160 int rw = bio_data_dir(bio);
1161
1162 if (unlikely(blk_mq_queue_enter(q))) {
1163 bio_endio(bio, -EIO);
1164 return NULL;
1165 }
1166
1167 ctx = blk_mq_get_ctx(q);
1168 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1169
1170 if (rw_is_sync(bio->bi_rw))
1171 rw |= REQ_SYNC;
1172
1173 trace_block_getrq(q, bio, rw);
1174 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
1175 if (unlikely(!rq)) {
1176 __blk_mq_run_hw_queue(hctx);
1177 blk_mq_put_ctx(ctx);
1178 trace_block_sleeprq(q, bio, rw);
1179
1180 ctx = blk_mq_get_ctx(q);
1181 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1182 rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
1183 __GFP_WAIT|GFP_ATOMIC, false);
1184 }
1185
1186 hctx->queued++;
1187 data->hctx = hctx;
1188 data->ctx = ctx;
1189 return rq;
1190}
1191
1192/*
1193 * Multiple hardware queue variant. This will not use per-process plugs,
1194 * but will attempt to bypass the hctx queueing if we can go straight to
1195 * hardware for SYNC IO.
1196 */
1197static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1198{
833 const int is_sync = rw_is_sync(bio->bi_rw); 1199 const int is_sync = rw_is_sync(bio->bi_rw);
834 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1200 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
835 int rw = bio_data_dir(bio); 1201 struct blk_map_ctx data;
836 struct request *rq; 1202 struct request *rq;
1203
1204 blk_queue_bounce(q, &bio);
1205
1206 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1207 bio_endio(bio, -EIO);
1208 return;
1209 }
1210
1211 rq = blk_mq_map_request(q, bio, &data);
1212 if (unlikely(!rq))
1213 return;
1214
1215 if (unlikely(is_flush_fua)) {
1216 blk_mq_bio_to_request(rq, bio);
1217 blk_insert_flush(rq);
1218 goto run_queue;
1219 }
1220
1221 if (is_sync) {
1222 int ret;
1223
1224 blk_mq_bio_to_request(rq, bio);
1225 blk_mq_start_request(rq, true);
1226 blk_add_timer(rq);
1227
1228 /*
1229 * For OK queue, we are done. For error, kill it. Any other
1230 * error (busy), just add it to our list as we previously
1231 * would have done
1232 */
1233 ret = q->mq_ops->queue_rq(data.hctx, rq);
1234 if (ret == BLK_MQ_RQ_QUEUE_OK)
1235 goto done;
1236 else {
1237 __blk_mq_requeue_request(rq);
1238
1239 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1240 rq->errors = -EIO;
1241 blk_mq_end_io(rq, rq->errors);
1242 goto done;
1243 }
1244 }
1245 }
1246
1247 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1248 /*
1249 * For a SYNC request, send it to the hardware immediately. For
1250 * an ASYNC request, just ensure that we run it later on. The
1251 * latter allows for merging opportunities and more efficient
1252 * dispatching.
1253 */
1254run_queue:
1255 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1256 }
1257done:
1258 blk_mq_put_ctx(data.ctx);
1259}
1260
1261/*
1262 * Single hardware queue variant. This will attempt to use any per-process
1263 * plug for merging and IO deferral.
1264 */
1265static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1266{
1267 const int is_sync = rw_is_sync(bio->bi_rw);
1268 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
837 unsigned int use_plug, request_count = 0; 1269 unsigned int use_plug, request_count = 0;
1270 struct blk_map_ctx data;
1271 struct request *rq;
838 1272
839 /* 1273 /*
840 * If we have multiple hardware queues, just go directly to 1274 * If we have multiple hardware queues, just go directly to
841 * one of those for sync IO. 1275 * one of those for sync IO.
842 */ 1276 */
843 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 1277 use_plug = !is_flush_fua && !is_sync;
844 1278
845 blk_queue_bounce(q, &bio); 1279 blk_queue_bounce(q, &bio);
846 1280
@@ -849,37 +1283,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
849 return; 1283 return;
850 } 1284 }
851 1285
852 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 1286 if (use_plug && !blk_queue_nomerges(q) &&
1287 blk_attempt_plug_merge(q, bio, &request_count))
853 return; 1288 return;
854 1289
855 if (blk_mq_queue_enter(q)) { 1290 rq = blk_mq_map_request(q, bio, &data);
856 bio_endio(bio, -EIO);
857 return;
858 }
859
860 ctx = blk_mq_get_ctx(q);
861 hctx = q->mq_ops->map_queue(q, ctx->cpu);
862
863 if (is_sync)
864 rw |= REQ_SYNC;
865 trace_block_getrq(q, bio, rw);
866 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
867 if (likely(rq))
868 blk_mq_rq_ctx_init(q, ctx, rq, rw);
869 else {
870 blk_mq_put_ctx(ctx);
871 trace_block_sleeprq(q, bio, rw);
872 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
873 false);
874 ctx = rq->mq_ctx;
875 hctx = q->mq_ops->map_queue(q, ctx->cpu);
876 }
877
878 hctx->queued++;
879 1291
880 if (unlikely(is_flush_fua)) { 1292 if (unlikely(is_flush_fua)) {
881 blk_mq_bio_to_request(rq, bio); 1293 blk_mq_bio_to_request(rq, bio);
882 blk_mq_put_ctx(ctx);
883 blk_insert_flush(rq); 1294 blk_insert_flush(rq);
884 goto run_queue; 1295 goto run_queue;
885 } 1296 }
@@ -901,31 +1312,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
901 trace_block_plug(q); 1312 trace_block_plug(q);
902 } 1313 }
903 list_add_tail(&rq->queuelist, &plug->mq_list); 1314 list_add_tail(&rq->queuelist, &plug->mq_list);
904 blk_mq_put_ctx(ctx); 1315 blk_mq_put_ctx(data.ctx);
905 return; 1316 return;
906 } 1317 }
907 } 1318 }
908 1319
909 spin_lock(&ctx->lock); 1320 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
910 1321 /*
911 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1322 * For a SYNC request, send it to the hardware immediately. For
912 blk_mq_attempt_merge(q, ctx, bio)) 1323 * an ASYNC request, just ensure that we run it later on. The
913 __blk_mq_free_request(hctx, ctx, rq); 1324 * latter allows for merging opportunities and more efficient
914 else { 1325 * dispatching.
915 blk_mq_bio_to_request(rq, bio); 1326 */
916 __blk_mq_insert_request(hctx, rq, false); 1327run_queue:
1328 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
917 } 1329 }
918 1330
919 spin_unlock(&ctx->lock); 1331 blk_mq_put_ctx(data.ctx);
920 blk_mq_put_ctx(ctx);
921
922 /*
923 * For a SYNC request, send it to the hardware immediately. For an
924 * ASYNC request, just ensure that we run it later on. The latter
925 * allows for merging opportunities and more efficient dispatching.
926 */
927run_queue:
928 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
929} 1332}
930 1333
931/* 1334/*
@@ -937,32 +1340,153 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
937} 1340}
938EXPORT_SYMBOL(blk_mq_map_queue); 1341EXPORT_SYMBOL(blk_mq_map_queue);
939 1342
940struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 1343static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
941 unsigned int hctx_index) 1344 struct blk_mq_tags *tags, unsigned int hctx_idx)
942{ 1345{
943 return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 1346 struct page *page;
944 GFP_KERNEL | __GFP_ZERO, reg->numa_node); 1347
1348 if (tags->rqs && set->ops->exit_request) {
1349 int i;
1350
1351 for (i = 0; i < tags->nr_tags; i++) {
1352 if (!tags->rqs[i])
1353 continue;
1354 set->ops->exit_request(set->driver_data, tags->rqs[i],
1355 hctx_idx, i);
1356 }
1357 }
1358
1359 while (!list_empty(&tags->page_list)) {
1360 page = list_first_entry(&tags->page_list, struct page, lru);
1361 list_del_init(&page->lru);
1362 __free_pages(page, page->private);
1363 }
1364
1365 kfree(tags->rqs);
1366
1367 blk_mq_free_tags(tags);
945} 1368}
946EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
947 1369
948void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 1370static size_t order_to_size(unsigned int order)
949 unsigned int hctx_index)
950{ 1371{
951 kfree(hctx); 1372 return (size_t)PAGE_SIZE << order;
952} 1373}
953EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
954 1374
955static void blk_mq_hctx_notify(void *data, unsigned long action, 1375static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
956 unsigned int cpu) 1376 unsigned int hctx_idx)
1377{
1378 struct blk_mq_tags *tags;
1379 unsigned int i, j, entries_per_page, max_order = 4;
1380 size_t rq_size, left;
1381
1382 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1383 set->numa_node);
1384 if (!tags)
1385 return NULL;
1386
1387 INIT_LIST_HEAD(&tags->page_list);
1388
1389 tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
1390 GFP_KERNEL, set->numa_node);
1391 if (!tags->rqs) {
1392 blk_mq_free_tags(tags);
1393 return NULL;
1394 }
1395
1396 /*
1397 * rq_size is the size of the request plus driver payload, rounded
1398 * to the cacheline size
1399 */
1400 rq_size = round_up(sizeof(struct request) + set->cmd_size,
1401 cache_line_size());
1402 left = rq_size * set->queue_depth;
1403
1404 for (i = 0; i < set->queue_depth; ) {
1405 int this_order = max_order;
1406 struct page *page;
1407 int to_do;
1408 void *p;
1409
1410 while (left < order_to_size(this_order - 1) && this_order)
1411 this_order--;
1412
1413 do {
1414 page = alloc_pages_node(set->numa_node, GFP_KERNEL,
1415 this_order);
1416 if (page)
1417 break;
1418 if (!this_order--)
1419 break;
1420 if (order_to_size(this_order) < rq_size)
1421 break;
1422 } while (1);
1423
1424 if (!page)
1425 goto fail;
1426
1427 page->private = this_order;
1428 list_add_tail(&page->lru, &tags->page_list);
1429
1430 p = page_address(page);
1431 entries_per_page = order_to_size(this_order) / rq_size;
1432 to_do = min(entries_per_page, set->queue_depth - i);
1433 left -= to_do * rq_size;
1434 for (j = 0; j < to_do; j++) {
1435 tags->rqs[i] = p;
1436 if (set->ops->init_request) {
1437 if (set->ops->init_request(set->driver_data,
1438 tags->rqs[i], hctx_idx, i,
1439 set->numa_node))
1440 goto fail;
1441 }
1442
1443 p += rq_size;
1444 i++;
1445 }
1446 }
1447
1448 return tags;
1449
1450fail:
1451 pr_warn("%s: failed to allocate requests\n", __func__);
1452 blk_mq_free_rq_map(set, tags, hctx_idx);
1453 return NULL;
1454}
1455
1456static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1457{
1458 kfree(bitmap->map);
1459}
1460
1461static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1462{
1463 unsigned int bpw = 8, total, num_maps, i;
1464
1465 bitmap->bits_per_word = bpw;
1466
1467 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1468 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1469 GFP_KERNEL, node);
1470 if (!bitmap->map)
1471 return -ENOMEM;
1472
1473 bitmap->map_size = num_maps;
1474
1475 total = nr_cpu_ids;
1476 for (i = 0; i < num_maps; i++) {
1477 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1478 total -= bitmap->map[i].depth;
1479 }
1480
1481 return 0;
1482}
1483
1484static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
957{ 1485{
958 struct blk_mq_hw_ctx *hctx = data;
959 struct request_queue *q = hctx->queue; 1486 struct request_queue *q = hctx->queue;
960 struct blk_mq_ctx *ctx; 1487 struct blk_mq_ctx *ctx;
961 LIST_HEAD(tmp); 1488 LIST_HEAD(tmp);
962 1489
963 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
964 return;
965
966 /* 1490 /*
967 * Move ctx entries to new CPU, if this one is going away. 1491 * Move ctx entries to new CPU, if this one is going away.
968 */ 1492 */
@@ -971,12 +1495,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
971 spin_lock(&ctx->lock); 1495 spin_lock(&ctx->lock);
972 if (!list_empty(&ctx->rq_list)) { 1496 if (!list_empty(&ctx->rq_list)) {
973 list_splice_init(&ctx->rq_list, &tmp); 1497 list_splice_init(&ctx->rq_list, &tmp);
974 clear_bit(ctx->index_hw, hctx->ctx_map); 1498 blk_mq_hctx_clear_pending(hctx, ctx);
975 } 1499 }
976 spin_unlock(&ctx->lock); 1500 spin_unlock(&ctx->lock);
977 1501
978 if (list_empty(&tmp)) 1502 if (list_empty(&tmp))
979 return; 1503 return NOTIFY_OK;
980 1504
981 ctx = blk_mq_get_ctx(q); 1505 ctx = blk_mq_get_ctx(q);
982 spin_lock(&ctx->lock); 1506 spin_lock(&ctx->lock);
@@ -993,210 +1517,103 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
993 blk_mq_hctx_mark_pending(hctx, ctx); 1517 blk_mq_hctx_mark_pending(hctx, ctx);
994 1518
995 spin_unlock(&ctx->lock); 1519 spin_unlock(&ctx->lock);
996 blk_mq_put_ctx(ctx);
997 1520
998 blk_mq_run_hw_queue(hctx, true); 1521 blk_mq_run_hw_queue(hctx, true);
1522 blk_mq_put_ctx(ctx);
1523 return NOTIFY_OK;
999} 1524}
1000 1525
1001static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1526static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1002 int (*init)(void *, struct blk_mq_hw_ctx *,
1003 struct request *, unsigned int),
1004 void *data)
1005{ 1527{
1006 unsigned int i; 1528 struct request_queue *q = hctx->queue;
1007 int ret = 0; 1529 struct blk_mq_tag_set *set = q->tag_set;
1008
1009 for (i = 0; i < hctx->queue_depth; i++) {
1010 struct request *rq = hctx->rqs[i];
1011
1012 ret = init(data, hctx, rq, i);
1013 if (ret)
1014 break;
1015 }
1016
1017 return ret;
1018}
1019 1530
1020int blk_mq_init_commands(struct request_queue *q, 1531 if (set->tags[hctx->queue_num])
1021 int (*init)(void *, struct blk_mq_hw_ctx *, 1532 return NOTIFY_OK;
1022 struct request *, unsigned int),
1023 void *data)
1024{
1025 struct blk_mq_hw_ctx *hctx;
1026 unsigned int i;
1027 int ret = 0;
1028 1533
1029 queue_for_each_hw_ctx(q, hctx, i) { 1534 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1030 ret = blk_mq_init_hw_commands(hctx, init, data); 1535 if (!set->tags[hctx->queue_num])
1031 if (ret) 1536 return NOTIFY_STOP;
1032 break;
1033 }
1034 1537
1035 return ret; 1538 hctx->tags = set->tags[hctx->queue_num];
1539 return NOTIFY_OK;
1036} 1540}
1037EXPORT_SYMBOL(blk_mq_init_commands);
1038 1541
1039static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx, 1542static int blk_mq_hctx_notify(void *data, unsigned long action,
1040 void (*free)(void *, struct blk_mq_hw_ctx *, 1543 unsigned int cpu)
1041 struct request *, unsigned int),
1042 void *data)
1043{ 1544{
1044 unsigned int i; 1545 struct blk_mq_hw_ctx *hctx = data;
1045 1546
1046 for (i = 0; i < hctx->queue_depth; i++) { 1547 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1047 struct request *rq = hctx->rqs[i]; 1548 return blk_mq_hctx_cpu_offline(hctx, cpu);
1549 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1550 return blk_mq_hctx_cpu_online(hctx, cpu);
1048 1551
1049 free(data, hctx, rq, i); 1552 return NOTIFY_OK;
1050 }
1051} 1553}
1052 1554
1053void blk_mq_free_commands(struct request_queue *q, 1555static void blk_mq_exit_hw_queues(struct request_queue *q,
1054 void (*free)(void *, struct blk_mq_hw_ctx *, 1556 struct blk_mq_tag_set *set, int nr_queue)
1055 struct request *, unsigned int),
1056 void *data)
1057{ 1557{
1058 struct blk_mq_hw_ctx *hctx; 1558 struct blk_mq_hw_ctx *hctx;
1059 unsigned int i; 1559 unsigned int i;
1060 1560
1061 queue_for_each_hw_ctx(q, hctx, i) 1561 queue_for_each_hw_ctx(q, hctx, i) {
1062 blk_mq_free_hw_commands(hctx, free, data); 1562 if (i == nr_queue)
1063} 1563 break;
1064EXPORT_SYMBOL(blk_mq_free_commands);
1065 1564
1066static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1565 if (set->ops->exit_hctx)
1067{ 1566 set->ops->exit_hctx(hctx, i);
1068 struct page *page;
1069 1567
1070 while (!list_empty(&hctx->page_list)) { 1568 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1071 page = list_first_entry(&hctx->page_list, struct page, lru); 1569 kfree(hctx->ctxs);
1072 list_del_init(&page->lru); 1570 blk_mq_free_bitmap(&hctx->ctx_map);
1073 __free_pages(page, page->private);
1074 } 1571 }
1075 1572
1076 kfree(hctx->rqs);
1077
1078 if (hctx->tags)
1079 blk_mq_free_tags(hctx->tags);
1080}
1081
1082static size_t order_to_size(unsigned int order)
1083{
1084 size_t ret = PAGE_SIZE;
1085
1086 while (order--)
1087 ret *= 2;
1088
1089 return ret;
1090} 1573}
1091 1574
1092static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1575static void blk_mq_free_hw_queues(struct request_queue *q,
1093 unsigned int reserved_tags, int node) 1576 struct blk_mq_tag_set *set)
1094{ 1577{
1095 unsigned int i, j, entries_per_page, max_order = 4; 1578 struct blk_mq_hw_ctx *hctx;
1096 size_t rq_size, left; 1579 unsigned int i;
1097
1098 INIT_LIST_HEAD(&hctx->page_list);
1099
1100 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1101 GFP_KERNEL, node);
1102 if (!hctx->rqs)
1103 return -ENOMEM;
1104
1105 /*
1106 * rq_size is the size of the request plus driver payload, rounded
1107 * to the cacheline size
1108 */
1109 rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1110 cache_line_size());
1111 left = rq_size * hctx->queue_depth;
1112
1113 for (i = 0; i < hctx->queue_depth;) {
1114 int this_order = max_order;
1115 struct page *page;
1116 int to_do;
1117 void *p;
1118
1119 while (left < order_to_size(this_order - 1) && this_order)
1120 this_order--;
1121
1122 do {
1123 page = alloc_pages_node(node, GFP_KERNEL, this_order);
1124 if (page)
1125 break;
1126 if (!this_order--)
1127 break;
1128 if (order_to_size(this_order) < rq_size)
1129 break;
1130 } while (1);
1131
1132 if (!page)
1133 break;
1134
1135 page->private = this_order;
1136 list_add_tail(&page->lru, &hctx->page_list);
1137
1138 p = page_address(page);
1139 entries_per_page = order_to_size(this_order) / rq_size;
1140 to_do = min(entries_per_page, hctx->queue_depth - i);
1141 left -= to_do * rq_size;
1142 for (j = 0; j < to_do; j++) {
1143 hctx->rqs[i] = p;
1144 blk_mq_rq_init(hctx, hctx->rqs[i]);
1145 p += rq_size;
1146 i++;
1147 }
1148 }
1149
1150 if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1151 goto err_rq_map;
1152 else if (i != hctx->queue_depth) {
1153 hctx->queue_depth = i;
1154 pr_warn("%s: queue depth set to %u because of low memory\n",
1155 __func__, i);
1156 }
1157 1580
1158 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1581 queue_for_each_hw_ctx(q, hctx, i) {
1159 if (!hctx->tags) { 1582 free_cpumask_var(hctx->cpumask);
1160err_rq_map: 1583 kfree(hctx);
1161 blk_mq_free_rq_map(hctx);
1162 return -ENOMEM;
1163 } 1584 }
1164
1165 return 0;
1166} 1585}
1167 1586
1168static int blk_mq_init_hw_queues(struct request_queue *q, 1587static int blk_mq_init_hw_queues(struct request_queue *q,
1169 struct blk_mq_reg *reg, void *driver_data) 1588 struct blk_mq_tag_set *set)
1170{ 1589{
1171 struct blk_mq_hw_ctx *hctx; 1590 struct blk_mq_hw_ctx *hctx;
1172 unsigned int i, j; 1591 unsigned int i;
1173 1592
1174 /* 1593 /*
1175 * Initialize hardware queues 1594 * Initialize hardware queues
1176 */ 1595 */
1177 queue_for_each_hw_ctx(q, hctx, i) { 1596 queue_for_each_hw_ctx(q, hctx, i) {
1178 unsigned int num_maps;
1179 int node; 1597 int node;
1180 1598
1181 node = hctx->numa_node; 1599 node = hctx->numa_node;
1182 if (node == NUMA_NO_NODE) 1600 if (node == NUMA_NO_NODE)
1183 node = hctx->numa_node = reg->numa_node; 1601 node = hctx->numa_node = set->numa_node;
1184 1602
1185 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1603 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1604 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1186 spin_lock_init(&hctx->lock); 1605 spin_lock_init(&hctx->lock);
1187 INIT_LIST_HEAD(&hctx->dispatch); 1606 INIT_LIST_HEAD(&hctx->dispatch);
1188 hctx->queue = q; 1607 hctx->queue = q;
1189 hctx->queue_num = i; 1608 hctx->queue_num = i;
1190 hctx->flags = reg->flags; 1609 hctx->flags = set->flags;
1191 hctx->queue_depth = reg->queue_depth; 1610 hctx->cmd_size = set->cmd_size;
1192 hctx->cmd_size = reg->cmd_size;
1193 1611
1194 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1612 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1195 blk_mq_hctx_notify, hctx); 1613 blk_mq_hctx_notify, hctx);
1196 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1614 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1197 1615
1198 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1616 hctx->tags = set->tags[i];
1199 break;
1200 1617
1201 /* 1618 /*
1202 * Allocate space for all possible cpus to avoid allocation in 1619 * Allocate space for all possible cpus to avoid allocation in
@@ -1207,17 +1624,13 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1207 if (!hctx->ctxs) 1624 if (!hctx->ctxs)
1208 break; 1625 break;
1209 1626
1210 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1627 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1211 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1212 GFP_KERNEL, node);
1213 if (!hctx->ctx_map)
1214 break; 1628 break;
1215 1629
1216 hctx->nr_ctx_map = num_maps;
1217 hctx->nr_ctx = 0; 1630 hctx->nr_ctx = 0;
1218 1631
1219 if (reg->ops->init_hctx && 1632 if (set->ops->init_hctx &&
1220 reg->ops->init_hctx(hctx, driver_data, i)) 1633 set->ops->init_hctx(hctx, set->driver_data, i))
1221 break; 1634 break;
1222 } 1635 }
1223 1636
@@ -1227,17 +1640,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1227 /* 1640 /*
1228 * Init failed 1641 * Init failed
1229 */ 1642 */
1230 queue_for_each_hw_ctx(q, hctx, j) { 1643 blk_mq_exit_hw_queues(q, set, i);
1231 if (i == j)
1232 break;
1233
1234 if (reg->ops->exit_hctx)
1235 reg->ops->exit_hctx(hctx, j);
1236
1237 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1238 blk_mq_free_rq_map(hctx);
1239 kfree(hctx->ctxs);
1240 }
1241 1644
1242 return 1; 1645 return 1;
1243} 1646}
@@ -1258,12 +1661,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
1258 __ctx->queue = q; 1661 __ctx->queue = q;
1259 1662
1260 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1663 /* If the cpu isn't online, the cpu is mapped to first hctx */
1261 hctx = q->mq_ops->map_queue(q, i);
1262 hctx->nr_ctx++;
1263
1264 if (!cpu_online(i)) 1664 if (!cpu_online(i))
1265 continue; 1665 continue;
1266 1666
1667 hctx = q->mq_ops->map_queue(q, i);
1668 cpumask_set_cpu(i, hctx->cpumask);
1669 hctx->nr_ctx++;
1670
1267 /* 1671 /*
1268 * Set local node, IFF we have more than one hw queue. If 1672 * Set local node, IFF we have more than one hw queue. If
1269 * not, we remain on the home node of the device 1673 * not, we remain on the home node of the device
@@ -1280,6 +1684,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1280 struct blk_mq_ctx *ctx; 1684 struct blk_mq_ctx *ctx;
1281 1685
1282 queue_for_each_hw_ctx(q, hctx, i) { 1686 queue_for_each_hw_ctx(q, hctx, i) {
1687 cpumask_clear(hctx->cpumask);
1283 hctx->nr_ctx = 0; 1688 hctx->nr_ctx = 0;
1284 } 1689 }
1285 1690
@@ -1288,115 +1693,208 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1288 */ 1693 */
1289 queue_for_each_ctx(q, ctx, i) { 1694 queue_for_each_ctx(q, ctx, i) {
1290 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1695 /* If the cpu isn't online, the cpu is mapped to first hctx */
1696 if (!cpu_online(i))
1697 continue;
1698
1291 hctx = q->mq_ops->map_queue(q, i); 1699 hctx = q->mq_ops->map_queue(q, i);
1700 cpumask_set_cpu(i, hctx->cpumask);
1292 ctx->index_hw = hctx->nr_ctx; 1701 ctx->index_hw = hctx->nr_ctx;
1293 hctx->ctxs[hctx->nr_ctx++] = ctx; 1702 hctx->ctxs[hctx->nr_ctx++] = ctx;
1294 } 1703 }
1704
1705 queue_for_each_hw_ctx(q, hctx, i) {
1706 /*
 1707		 * If no software queues are mapped to this hardware queue,
1708 * disable it and free the request entries
1709 */
1710 if (!hctx->nr_ctx) {
1711 struct blk_mq_tag_set *set = q->tag_set;
1712
1713 if (set->tags[i]) {
1714 blk_mq_free_rq_map(set, set->tags[i], i);
1715 set->tags[i] = NULL;
1716 hctx->tags = NULL;
1717 }
1718 continue;
1719 }
1720
1721 /*
1722 * Initialize batch roundrobin counts
1723 */
1724 hctx->next_cpu = cpumask_first(hctx->cpumask);
1725 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1726 }
1295} 1727}
1296 1728
1297struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1729static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1298 void *driver_data)
1299{ 1730{
1300 struct blk_mq_hw_ctx **hctxs; 1731 struct blk_mq_hw_ctx *hctx;
1301 struct blk_mq_ctx *ctx;
1302 struct request_queue *q; 1732 struct request_queue *q;
1733 bool shared;
1303 int i; 1734 int i;
1304 1735
1305 if (!reg->nr_hw_queues || 1736 if (set->tag_list.next == set->tag_list.prev)
1306 !reg->ops->queue_rq || !reg->ops->map_queue || 1737 shared = false;
1307 !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1738 else
1308 return ERR_PTR(-EINVAL); 1739 shared = true;
1740
1741 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1742 blk_mq_freeze_queue(q);
1309 1743
1310 if (!reg->queue_depth) 1744 queue_for_each_hw_ctx(q, hctx, i) {
1311 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1745 if (shared)
1312 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1746 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1313 pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1747 else
1314 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1748 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1749 }
1750 blk_mq_unfreeze_queue(q);
1315 } 1751 }
1752}
1316 1753
1317 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1754static void blk_mq_del_queue_tag_set(struct request_queue *q)
1318 return ERR_PTR(-EINVAL); 1755{
1756 struct blk_mq_tag_set *set = q->tag_set;
1757
1758 blk_mq_freeze_queue(q);
1759
1760 mutex_lock(&set->tag_list_lock);
1761 list_del_init(&q->tag_set_list);
1762 blk_mq_update_tag_set_depth(set);
1763 mutex_unlock(&set->tag_list_lock);
1764
1765 blk_mq_unfreeze_queue(q);
1766}
1767
1768static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1769 struct request_queue *q)
1770{
1771 q->tag_set = set;
1772
1773 mutex_lock(&set->tag_list_lock);
1774 list_add_tail(&q->tag_set_list, &set->tag_list);
1775 blk_mq_update_tag_set_depth(set);
1776 mutex_unlock(&set->tag_list_lock);
1777}
1778
1779struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1780{
1781 struct blk_mq_hw_ctx **hctxs;
1782 struct blk_mq_ctx *ctx;
1783 struct request_queue *q;
1784 unsigned int *map;
1785 int i;
1319 1786
1320 ctx = alloc_percpu(struct blk_mq_ctx); 1787 ctx = alloc_percpu(struct blk_mq_ctx);
1321 if (!ctx) 1788 if (!ctx)
1322 return ERR_PTR(-ENOMEM); 1789 return ERR_PTR(-ENOMEM);
1323 1790
1324 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1791 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1325 reg->numa_node); 1792 set->numa_node);
1326 1793
1327 if (!hctxs) 1794 if (!hctxs)
1328 goto err_percpu; 1795 goto err_percpu;
1329 1796
1330 for (i = 0; i < reg->nr_hw_queues; i++) { 1797 map = blk_mq_make_queue_map(set);
1331 hctxs[i] = reg->ops->alloc_hctx(reg, i); 1798 if (!map)
1799 goto err_map;
1800
1801 for (i = 0; i < set->nr_hw_queues; i++) {
1802 int node = blk_mq_hw_queue_to_node(map, i);
1803
1804 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1805 GFP_KERNEL, node);
1332 if (!hctxs[i]) 1806 if (!hctxs[i])
1333 goto err_hctxs; 1807 goto err_hctxs;
1334 1808
1335 hctxs[i]->numa_node = NUMA_NO_NODE; 1809 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
1810 goto err_hctxs;
1811
1812 atomic_set(&hctxs[i]->nr_active, 0);
1813 hctxs[i]->numa_node = node;
1336 hctxs[i]->queue_num = i; 1814 hctxs[i]->queue_num = i;
1337 } 1815 }
1338 1816
1339 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1817 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1340 if (!q) 1818 if (!q)
1341 goto err_hctxs; 1819 goto err_hctxs;
1342 1820
1343 q->mq_map = blk_mq_make_queue_map(reg); 1821 if (percpu_counter_init(&q->mq_usage_counter, 0))
1344 if (!q->mq_map)
1345 goto err_map; 1822 goto err_map;
1346 1823
1347 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1824 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1348 blk_queue_rq_timeout(q, 30000); 1825 blk_queue_rq_timeout(q, 30000);
1349 1826
1350 q->nr_queues = nr_cpu_ids; 1827 q->nr_queues = nr_cpu_ids;
1351 q->nr_hw_queues = reg->nr_hw_queues; 1828 q->nr_hw_queues = set->nr_hw_queues;
1829 q->mq_map = map;
1352 1830
1353 q->queue_ctx = ctx; 1831 q->queue_ctx = ctx;
1354 q->queue_hw_ctx = hctxs; 1832 q->queue_hw_ctx = hctxs;
1355 1833
1356 q->mq_ops = reg->ops; 1834 q->mq_ops = set->ops;
1357 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1835 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1358 1836
1837 if (!(set->flags & BLK_MQ_F_SG_MERGE))
1838 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
1839
1359 q->sg_reserved_size = INT_MAX; 1840 q->sg_reserved_size = INT_MAX;
1360 1841
1361 blk_queue_make_request(q, blk_mq_make_request); 1842 INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1362 blk_queue_rq_timed_out(q, reg->ops->timeout); 1843 INIT_LIST_HEAD(&q->requeue_list);
1363 if (reg->timeout) 1844 spin_lock_init(&q->requeue_lock);
1364 blk_queue_rq_timeout(q, reg->timeout); 1845
1846 if (q->nr_hw_queues > 1)
1847 blk_queue_make_request(q, blk_mq_make_request);
1848 else
1849 blk_queue_make_request(q, blk_sq_make_request);
1850
1851 blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
1852 if (set->timeout)
1853 blk_queue_rq_timeout(q, set->timeout);
1854
1855 /*
1856 * Do this after blk_queue_make_request() overrides it...
1857 */
1858 q->nr_requests = set->queue_depth;
1365 1859
1366 if (reg->ops->complete) 1860 if (set->ops->complete)
1367 blk_queue_softirq_done(q, reg->ops->complete); 1861 blk_queue_softirq_done(q, set->ops->complete);
1368 1862
1369 blk_mq_init_flush(q); 1863 blk_mq_init_flush(q);
1370 blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1864 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1371 1865
1372 q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, 1866 q->flush_rq = kzalloc(round_up(sizeof(struct request) +
1373 cache_line_size()), GFP_KERNEL); 1867 set->cmd_size, cache_line_size()),
1868 GFP_KERNEL);
1374 if (!q->flush_rq) 1869 if (!q->flush_rq)
1375 goto err_hw; 1870 goto err_hw;
1376 1871
1377 if (blk_mq_init_hw_queues(q, reg, driver_data)) 1872 if (blk_mq_init_hw_queues(q, set))
1378 goto err_flush_rq; 1873 goto err_flush_rq;
1379 1874
1380 blk_mq_map_swqueue(q);
1381
1382 mutex_lock(&all_q_mutex); 1875 mutex_lock(&all_q_mutex);
1383 list_add_tail(&q->all_q_node, &all_q_list); 1876 list_add_tail(&q->all_q_node, &all_q_list);
1384 mutex_unlock(&all_q_mutex); 1877 mutex_unlock(&all_q_mutex);
1385 1878
1879 blk_mq_add_queue_tag_set(set, q);
1880
1881 blk_mq_map_swqueue(q);
1882
1386 return q; 1883 return q;
1387 1884
1388err_flush_rq: 1885err_flush_rq:
1389 kfree(q->flush_rq); 1886 kfree(q->flush_rq);
1390err_hw: 1887err_hw:
1391 kfree(q->mq_map);
1392err_map:
1393 blk_cleanup_queue(q); 1888 blk_cleanup_queue(q);
1394err_hctxs: 1889err_hctxs:
1395 for (i = 0; i < reg->nr_hw_queues; i++) { 1890 kfree(map);
1891 for (i = 0; i < set->nr_hw_queues; i++) {
1396 if (!hctxs[i]) 1892 if (!hctxs[i])
1397 break; 1893 break;
1398 reg->ops->free_hctx(hctxs[i], i); 1894 free_cpumask_var(hctxs[i]->cpumask);
1895 kfree(hctxs[i]);
1399 } 1896 }
1897err_map:
1400 kfree(hctxs); 1898 kfree(hctxs);
1401err_percpu: 1899err_percpu:
1402 free_percpu(ctx); 1900 free_percpu(ctx);
@@ -1406,18 +1904,14 @@ EXPORT_SYMBOL(blk_mq_init_queue);
1406 1904
1407void blk_mq_free_queue(struct request_queue *q) 1905void blk_mq_free_queue(struct request_queue *q)
1408{ 1906{
1409 struct blk_mq_hw_ctx *hctx; 1907 struct blk_mq_tag_set *set = q->tag_set;
1410 int i;
1411 1908
1412 queue_for_each_hw_ctx(q, hctx, i) { 1909 blk_mq_del_queue_tag_set(q);
1413 kfree(hctx->ctx_map); 1910
1414 kfree(hctx->ctxs); 1911 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1415 blk_mq_free_rq_map(hctx); 1912 blk_mq_free_hw_queues(q, set);
1416 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1913
1417 if (q->mq_ops->exit_hctx) 1914 percpu_counter_destroy(&q->mq_usage_counter);
1418 q->mq_ops->exit_hctx(hctx, i);
1419 q->mq_ops->free_hctx(hctx, i);
1420 }
1421 1915
1422 free_percpu(q->queue_ctx); 1916 free_percpu(q->queue_ctx);
1423 kfree(q->queue_hw_ctx); 1917 kfree(q->queue_hw_ctx);
@@ -1437,6 +1931,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
1437{ 1931{
1438 blk_mq_freeze_queue(q); 1932 blk_mq_freeze_queue(q);
1439 1933
1934 blk_mq_sysfs_unregister(q);
1935
1440 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1936 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1441 1937
1442 /* 1938 /*
@@ -1447,6 +1943,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
1447 1943
1448 blk_mq_map_swqueue(q); 1944 blk_mq_map_swqueue(q);
1449 1945
1946 blk_mq_sysfs_register(q);
1947
1450 blk_mq_unfreeze_queue(q); 1948 blk_mq_unfreeze_queue(q);
1451} 1949}
1452 1950
@@ -1456,10 +1954,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1456 struct request_queue *q; 1954 struct request_queue *q;
1457 1955
1458 /* 1956 /*
1459 * Before new mapping is established, hotadded cpu might already start 1957 * Before new mappings are established, hotadded cpu might already
1460 * handling requests. This doesn't break anything as we map offline 1958 * start handling requests. This doesn't break anything as we map
1461 * CPUs to first hardware queue. We will re-init queue below to get 1959 * offline CPUs to first hardware queue. We will re-init the queue
1462 * optimal settings. 1960 * below to get optimal settings.
1463 */ 1961 */
1464 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1962 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1465 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1963 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
@@ -1472,6 +1970,81 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1472 return NOTIFY_OK; 1970 return NOTIFY_OK;
1473} 1971}
1474 1972
1973int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
1974{
1975 int i;
1976
1977 if (!set->nr_hw_queues)
1978 return -EINVAL;
1979 if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
1980 return -EINVAL;
1981 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
1982 return -EINVAL;
1983
1984 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
1985 return -EINVAL;
1986
1987
1988 set->tags = kmalloc_node(set->nr_hw_queues *
1989 sizeof(struct blk_mq_tags *),
1990 GFP_KERNEL, set->numa_node);
1991 if (!set->tags)
1992 goto out;
1993
1994 for (i = 0; i < set->nr_hw_queues; i++) {
1995 set->tags[i] = blk_mq_init_rq_map(set, i);
1996 if (!set->tags[i])
1997 goto out_unwind;
1998 }
1999
2000 mutex_init(&set->tag_list_lock);
2001 INIT_LIST_HEAD(&set->tag_list);
2002
2003 return 0;
2004
2005out_unwind:
2006 while (--i >= 0)
2007 blk_mq_free_rq_map(set, set->tags[i], i);
2008out:
2009 return -ENOMEM;
2010}
2011EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2012
2013void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2014{
2015 int i;
2016
2017 for (i = 0; i < set->nr_hw_queues; i++) {
2018 if (set->tags[i])
2019 blk_mq_free_rq_map(set, set->tags[i], i);
2020 }
2021
2022 kfree(set->tags);
2023}
2024EXPORT_SYMBOL(blk_mq_free_tag_set);
2025
2026int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2027{
2028 struct blk_mq_tag_set *set = q->tag_set;
2029 struct blk_mq_hw_ctx *hctx;
2030 int i, ret;
2031
2032 if (!set || nr > set->queue_depth)
2033 return -EINVAL;
2034
2035 ret = 0;
2036 queue_for_each_hw_ctx(q, hctx, i) {
2037 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2038 if (ret)
2039 break;
2040 }
2041
2042 if (!ret)
2043 q->nr_requests = nr;
2044
2045 return ret;
2046}
2047
1475void blk_mq_disable_hotplug(void) 2048void blk_mq_disable_hotplug(void)
1476{ 2049{
1477 mutex_lock(&all_q_mutex); 2050 mutex_lock(&all_q_mutex);
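
The blk-mq.c changes above replace the old blk_mq_reg registration with a driver-owned struct blk_mq_tag_set that is allocated once and shared by every queue built on it. The following is a minimal, hypothetical driver-side sketch of the resulting flow; my_queue_rq, my_mq_ops and the chosen depth/cmd_size values are illustrative only, while the tag-set fields and the blk_mq_alloc_tag_set()/blk_mq_init_queue()/blk_mq_free_tag_set() calls are the ones added in this diff.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical dispatch hook: a real driver would start the hardware command here. */
static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	blk_mq_end_io(rq, 0);			/* complete immediately for this sketch */
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
	.map_queue	= blk_mq_map_queue,	/* default CPU -> hctx mapping */
};

static struct blk_mq_tag_set my_set;		/* must outlive the queue(s) built on it */
static struct request_queue *my_q;

static int my_driver_init(void)
{
	int ret;

	my_set.ops		= &my_mq_ops;
	my_set.nr_hw_queues	= 1;		/* single queue: blk_sq_make_request() path */
	my_set.queue_depth	= 64;
	my_set.reserved_tags	= 0;
	my_set.cmd_size		= 64;		/* per-request driver payload, packed after struct request */
	my_set.numa_node	= NUMA_NO_NODE;
	my_set.flags		= BLK_MQ_F_SHOULD_MERGE;

	ret = blk_mq_alloc_tag_set(&my_set);	/* allocates set->tags[] and the request maps */
	if (ret)
		return ret;

	my_q = blk_mq_init_queue(&my_set);	/* builds hctxs and links the queue into set->tag_list */
	if (IS_ERR(my_q)) {
		blk_mq_free_tag_set(&my_set);
		return PTR_ERR(my_q);
	}
	return 0;
}

/* Teardown would be blk_cleanup_queue(my_q) followed by blk_mq_free_tag_set(&my_set). */
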
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ebbe6bac9d61..de7b3bbd5bd6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
1#ifndef INT_BLK_MQ_H 1#ifndef INT_BLK_MQ_H
2#define INT_BLK_MQ_H 2#define INT_BLK_MQ_H
3 3
4struct blk_mq_tag_set;
5
4struct blk_mq_ctx { 6struct blk_mq_ctx {
5 struct { 7 struct {
6 spinlock_t lock; 8 spinlock_t lock;
@@ -9,7 +11,8 @@ struct blk_mq_ctx {
9 11
10 unsigned int cpu; 12 unsigned int cpu;
11 unsigned int index_hw; 13 unsigned int index_hw;
12 unsigned int ipi_redirect; 14
15 unsigned int last_tag ____cacheline_aligned_in_smp;
13 16
14 /* incremented at dispatch time */ 17 /* incremented at dispatch time */
15 unsigned long rq_dispatched[2]; 18 unsigned long rq_dispatched[2];
@@ -20,21 +23,23 @@ struct blk_mq_ctx {
20 23
21 struct request_queue *queue; 24 struct request_queue *queue;
22 struct kobject kobj; 25 struct kobject kobj;
23}; 26} ____cacheline_aligned_in_smp;
24 27
25void __blk_mq_complete_request(struct request *rq); 28void __blk_mq_complete_request(struct request *rq);
26void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 29void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
27void blk_mq_init_flush(struct request_queue *q); 30void blk_mq_init_flush(struct request_queue *q);
28void blk_mq_drain_queue(struct request_queue *q); 31void blk_mq_drain_queue(struct request_queue *q);
29void blk_mq_free_queue(struct request_queue *q); 32void blk_mq_free_queue(struct request_queue *q);
30void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); 33void blk_mq_clone_flush_request(struct request *flush_rq,
34 struct request *orig_rq);
35int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
31 36
32/* 37/*
33 * CPU hotplug helpers 38 * CPU hotplug helpers
34 */ 39 */
35struct blk_mq_cpu_notifier; 40struct blk_mq_cpu_notifier;
36void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 41void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
37 void (*fn)(void *, unsigned long, unsigned int), 42 int (*fn)(void *, unsigned long, unsigned int),
38 void *data); 43 void *data);
39void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 44void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
40void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 45void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
@@ -45,10 +50,23 @@ void blk_mq_disable_hotplug(void);
45/* 50/*
46 * CPU -> queue mappings 51 * CPU -> queue mappings
47 */ 52 */
48struct blk_mq_reg; 53extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
49extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
50extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); 54extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
55extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
51 56
52void blk_mq_add_timer(struct request *rq); 57/*
58 * sysfs helpers
59 */
60extern int blk_mq_sysfs_register(struct request_queue *q);
61extern void blk_mq_sysfs_unregister(struct request_queue *q);
62
63/*
64 * Basic implementation of sparser bitmap, allowing the user to spread
65 * the bits over more cachelines.
66 */
67struct blk_align_bitmap {
68 unsigned long word;
69 unsigned long depth;
70} ____cacheline_aligned_in_smp;
53 71
54#endif 72#endif
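
The blk_align_bitmap added here backs the reworked hctx->ctx_map, allocated by blk_mq_alloc_bitmap() earlier in this diff. A rough worked example of the layout, not taken from the patch and using assumed numbers (nr_cpu_ids == 20, default bits_per_word == 8):

/*
 * Illustration only (values assumed, not from the patch):
 *
 *	num_maps = ALIGN(20, 8) / 8 = 3 words
 *	map[0].depth = 8, map[1].depth = 8, map[2].depth = 4
 *
 * Each blk_align_bitmap word sits in its own cacheline
 * (____cacheline_aligned_in_smp), so software queues setting their pending
 * bits in different words no longer contend on the same cacheline the way
 * the old densely packed unsigned long ctx_map[] used by
 * clear_bit(ctx->index_hw, hctx->ctx_map) did.
 */
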
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7500f876dae4..23321fbab293 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
48static ssize_t 48static ssize_t
49queue_requests_store(struct request_queue *q, const char *page, size_t count) 49queue_requests_store(struct request_queue *q, const char *page, size_t count)
50{ 50{
51 struct request_list *rl;
52 unsigned long nr; 51 unsigned long nr;
53 int ret; 52 int ret, err;
54 53
55 if (!q->request_fn) 54 if (!q->request_fn && !q->mq_ops)
56 return -EINVAL; 55 return -EINVAL;
57 56
58 ret = queue_var_store(&nr, page, count); 57 ret = queue_var_store(&nr, page, count);
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
62 if (nr < BLKDEV_MIN_RQ) 61 if (nr < BLKDEV_MIN_RQ)
63 nr = BLKDEV_MIN_RQ; 62 nr = BLKDEV_MIN_RQ;
64 63
65 spin_lock_irq(q->queue_lock); 64 if (q->request_fn)
66 q->nr_requests = nr; 65 err = blk_update_nr_requests(q, nr);
67 blk_queue_congestion_threshold(q); 66 else
68 67 err = blk_mq_update_nr_requests(q, nr);
69 /* congestion isn't cgroup aware and follows root blkcg for now */ 68
70 rl = &q->root_rl; 69 if (err)
71 70 return err;
72 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
73 blk_set_queue_congested(q, BLK_RW_SYNC);
74 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
75 blk_clear_queue_congested(q, BLK_RW_SYNC);
76
77 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
78 blk_set_queue_congested(q, BLK_RW_ASYNC);
79 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
80 blk_clear_queue_congested(q, BLK_RW_ASYNC);
81
82 blk_queue_for_each_rl(rl, q) {
83 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
84 blk_set_rl_full(rl, BLK_RW_SYNC);
85 } else {
86 blk_clear_rl_full(rl, BLK_RW_SYNC);
87 wake_up(&rl->wait[BLK_RW_SYNC]);
88 }
89
90 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
91 blk_set_rl_full(rl, BLK_RW_ASYNC);
92 } else {
93 blk_clear_rl_full(rl, BLK_RW_ASYNC);
94 wake_up(&rl->wait[BLK_RW_ASYNC]);
95 }
96 }
97 71
98 spin_unlock_irq(q->queue_lock);
99 return ret; 72 return ret;
100} 73}
101 74
@@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj)
544 if (q->queue_tags) 517 if (q->queue_tags)
545 __blk_queue_free_tags(q); 518 __blk_queue_free_tags(q);
546 519
547 percpu_counter_destroy(&q->mq_usage_counter);
548
549 if (q->mq_ops) 520 if (q->mq_ops)
550 blk_mq_free_queue(q); 521 blk_mq_free_queue(q);
551 522
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 033745cd7fba..9353b4683359 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
744static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 744static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
745{ 745{
746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
747 return 0; 747 return false;
748 748
749 return 1; 749 return 1;
750} 750}
@@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
842 if (tg->io_disp[rw] + 1 <= io_allowed) { 842 if (tg->io_disp[rw] + 1 <= io_allowed) {
843 if (wait) 843 if (wait)
844 *wait = 0; 844 *wait = 0;
845 return 1; 845 return true;
846 } 846 }
847 847
848 /* Calc approx time to dispatch */ 848 /* Calc approx time to dispatch */
@@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { 880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
881 if (wait) 881 if (wait)
882 *wait = 0; 882 *wait = 0;
883 return 1; 883 return true;
884 } 884 }
885 885
886 /* Calc approx time to dispatch */ 886 /* Calc approx time to dispatch */
@@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
924 if (wait) 924 if (wait)
925 *wait = 0; 925 *wait = 0;
926 return 1; 926 return true;
927 } 927 }
928 928
929 /* 929 /*
@@ -1258,7 +1258,7 @@ out_unlock:
1258 * of throtl_data->service_queue. Those bio's are ready and issued by this 1258 * of throtl_data->service_queue. Those bio's are ready and issued by this
1259 * function. 1259 * function.
1260 */ 1260 */
1261void blk_throtl_dispatch_work_fn(struct work_struct *work) 1261static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1262{ 1262{
1263 struct throtl_data *td = container_of(work, struct throtl_data, 1263 struct throtl_data *td = container_of(work, struct throtl_data,
1264 dispatch_work); 1264 dispatch_work);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index d96f7061c6fd..95a09590ccfd 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req)
96 __blk_complete_request(req); 96 __blk_complete_request(req);
97 break; 97 break;
98 case BLK_EH_RESET_TIMER: 98 case BLK_EH_RESET_TIMER:
99 if (q->mq_ops) 99 blk_add_timer(req);
100 blk_mq_add_timer(req);
101 else
102 blk_add_timer(req);
103
104 blk_clear_rq_complete(req); 100 blk_clear_rq_complete(req);
105 break; 101 break;
106 case BLK_EH_NOT_HANDLED: 102 case BLK_EH_NOT_HANDLED:
@@ -170,7 +166,26 @@ void blk_abort_request(struct request *req)
170} 166}
171EXPORT_SYMBOL_GPL(blk_abort_request); 167EXPORT_SYMBOL_GPL(blk_abort_request);
172 168
173void __blk_add_timer(struct request *req, struct list_head *timeout_list) 169unsigned long blk_rq_timeout(unsigned long timeout)
170{
171 unsigned long maxt;
172
173 maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
174 if (time_after(timeout, maxt))
175 timeout = maxt;
176
177 return timeout;
178}
179
180/**
181 * blk_add_timer - Start timeout timer for a single request
182 * @req: request that is about to start running.
183 *
184 * Notes:
185 * Each request has its own timer, and as it is added to the queue, we
186 * set up the timer. When the request completes, we cancel the timer.
187 */
188void blk_add_timer(struct request *req)
174{ 189{
175 struct request_queue *q = req->q; 190 struct request_queue *q = req->q;
176 unsigned long expiry; 191 unsigned long expiry;
@@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list)
188 req->timeout = q->rq_timeout; 203 req->timeout = q->rq_timeout;
189 204
190 req->deadline = jiffies + req->timeout; 205 req->deadline = jiffies + req->timeout;
191 if (timeout_list) 206 if (!q->mq_ops)
192 list_add_tail(&req->timeout_list, timeout_list); 207 list_add_tail(&req->timeout_list, &req->q->timeout_list);
193 208
194 /* 209 /*
195 * If the timer isn't already pending or this timeout is earlier 210 * If the timer isn't already pending or this timeout is earlier
196 * than an existing one, modify the timer. Round up to next nearest 211 * than an existing one, modify the timer. Round up to next nearest
197 * second. 212 * second.
198 */ 213 */
199 expiry = round_jiffies_up(req->deadline); 214 expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
200 215
201 if (!timer_pending(&q->timeout) || 216 if (!timer_pending(&q->timeout) ||
202 time_before(expiry, q->timeout.expires)) 217 time_before(expiry, q->timeout.expires)) {
203 mod_timer(&q->timeout, expiry); 218 unsigned long diff = q->timeout.expires - expiry;
204 219
205} 220 /*
221 * Due to added timer slack to group timers, the timer
222 * will often be a little in front of what we asked for.
223 * So apply some tolerance here too, otherwise we keep
224 * modifying the timer because expires for value X
225 * will be X + something.
226 */
227 if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
228 mod_timer(&q->timeout, expiry);
229 }
206 230
207/**
208 * blk_add_timer - Start timeout timer for a single request
209 * @req: request that is about to start running.
210 *
211 * Notes:
212 * Each request has its own timer, and as it is added to the queue, we
213 * set up the timer. When the request completes, we cancel the timer.
214 */
215void blk_add_timer(struct request *req)
216{
217 __blk_add_timer(req, &req->q->timeout_list);
218} 231}
219
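
blk_add_timer() now also drives blk-mq timeouts, and the new blk_rq_timeout() clamp (BLK_MAX_TIMEOUT, defined as 5 * HZ in the blk.h hunk below) bounds how far out the queue timer is armed. A rough worked example with assumed values (HZ == 1000, the 30-second default set in blk_mq_init_queue above):

/*
 * Illustration only (assumed values, not from the patch):
 *
 *	req->deadline = jiffies + 30 * HZ;
 *	expiry = blk_rq_timeout(round_jiffies_up(req->deadline))
 *	       = min(rounded deadline, round_jiffies_up(jiffies + 5 * HZ))
 *	       ~ jiffies + 5 * HZ
 *
 * so the queue timer fires within roughly five seconds even though the
 * request deadline is 30 seconds out, and an already pending q->timeout is
 * only re-armed when it lies at least HZ / 2 beyond the new expiry, which
 * keeps ordinary timer slack from forcing a mod_timer() on every submission.
 */
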
diff --git a/block/blk.h b/block/blk.h
index 1d880f1f957f..45385e9abf6f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -9,6 +9,9 @@
9/* Number of requests a "batching" process may submit */ 9/* Number of requests a "batching" process may submit */
10#define BLK_BATCH_REQ 32 10#define BLK_BATCH_REQ 32
11 11
12/* Max future timer expiry for timeouts */
13#define BLK_MAX_TIMEOUT (5 * HZ)
14
12extern struct kmem_cache *blk_requestq_cachep; 15extern struct kmem_cache *blk_requestq_cachep;
13extern struct kmem_cache *request_cachep; 16extern struct kmem_cache *request_cachep;
14extern struct kobj_type blk_queue_ktype; 17extern struct kobj_type blk_queue_ktype;
@@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error,
37void blk_rq_timed_out_timer(unsigned long data); 40void blk_rq_timed_out_timer(unsigned long data);
38void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 41void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
39 unsigned int *next_set); 42 unsigned int *next_set);
40void __blk_add_timer(struct request *req, struct list_head *timeout_list); 43unsigned long blk_rq_timeout(unsigned long timeout);
44void blk_add_timer(struct request *req);
41void blk_delete_timer(struct request *); 45void blk_delete_timer(struct request *);
42void blk_add_timer(struct request *);
43 46
44 47
45bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 48bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
@@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
185 return q->nr_congestion_off; 188 return q->nr_congestion_off;
186} 189}
187 190
191extern int blk_update_nr_requests(struct request_queue *, unsigned int);
192
188/* 193/*
 189 * Contribute to IO statistics IFF: 194 * Contribute to IO statistics IFF:
190 * 195 *
diff --git a/block/bounce.c b/block/bounce.c
new file mode 100644
index 000000000000..523918b8c6dc
--- /dev/null
+++ b/block/bounce.c
@@ -0,0 +1,287 @@
1/* bounce buffer handling for block devices
2 *
3 * - Split from highmem.c
4 */
5
6#include <linux/mm.h>
7#include <linux/export.h>
8#include <linux/swap.h>
9#include <linux/gfp.h>
10#include <linux/bio.h>
11#include <linux/pagemap.h>
12#include <linux/mempool.h>
13#include <linux/blkdev.h>
14#include <linux/init.h>
15#include <linux/hash.h>
16#include <linux/highmem.h>
17#include <linux/bootmem.h>
18#include <asm/tlbflush.h>
19
20#include <trace/events/block.h>
21
22#define POOL_SIZE 64
23#define ISA_POOL_SIZE 16
24
25static mempool_t *page_pool, *isa_page_pool;
26
27#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
28static __init int init_emergency_pool(void)
29{
30#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
31 if (max_pfn <= max_low_pfn)
32 return 0;
33#endif
34
35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
36 BUG_ON(!page_pool);
37 printk("bounce pool size: %d pages\n", POOL_SIZE);
38
39 return 0;
40}
41
42__initcall(init_emergency_pool);
43#endif
44
45#ifdef CONFIG_HIGHMEM
46/*
47 * highmem version, map in to vec
48 */
49static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
50{
51 unsigned long flags;
52 unsigned char *vto;
53
54 local_irq_save(flags);
55 vto = kmap_atomic(to->bv_page);
56 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
57 kunmap_atomic(vto);
58 local_irq_restore(flags);
59}
60
61#else /* CONFIG_HIGHMEM */
62
63#define bounce_copy_vec(to, vfrom) \
64 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
65
66#endif /* CONFIG_HIGHMEM */
67
68/*
69 * allocate pages in the DMA region for the ISA pool
70 */
71static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
72{
73 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
74}
75
76/*
77 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
78 * as the max address, so check if the pool has already been created.
79 */
80int init_emergency_isa_pool(void)
81{
82 if (isa_page_pool)
83 return 0;
84
85 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
86 mempool_free_pages, (void *) 0);
87 BUG_ON(!isa_page_pool);
88
89 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
90 return 0;
91}
92
93/*
94 * Simple bounce buffer support for highmem pages. Depending on the
95 * queue gfp mask set, *to may or may not be a highmem page. kmap it
96 * always, it will do the Right Thing
97 */
98static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
99{
100 unsigned char *vfrom;
101 struct bio_vec tovec, *fromvec = from->bi_io_vec;
102 struct bvec_iter iter;
103
104 bio_for_each_segment(tovec, to, iter) {
105 if (tovec.bv_page != fromvec->bv_page) {
106 /*
107 * fromvec->bv_offset and fromvec->bv_len might have
108 * been modified by the block layer, so use the original
109 * copy, bounce_copy_vec already uses tovec->bv_len
110 */
111 vfrom = page_address(fromvec->bv_page) +
112 tovec.bv_offset;
113
114 bounce_copy_vec(&tovec, vfrom);
115 flush_dcache_page(tovec.bv_page);
116 }
117
118 fromvec++;
119 }
120}
121
122static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
123{
124 struct bio *bio_orig = bio->bi_private;
125 struct bio_vec *bvec, *org_vec;
126 int i;
127
128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
130
131 /*
132 * free up bounce indirect pages used
133 */
134 bio_for_each_segment_all(bvec, bio, i) {
135 org_vec = bio_orig->bi_io_vec + i;
136 if (bvec->bv_page == org_vec->bv_page)
137 continue;
138
139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
140 mempool_free(bvec->bv_page, pool);
141 }
142
143 bio_endio(bio_orig, err);
144 bio_put(bio);
145}
146
147static void bounce_end_io_write(struct bio *bio, int err)
148{
149 bounce_end_io(bio, page_pool, err);
150}
151
152static void bounce_end_io_write_isa(struct bio *bio, int err)
153{
154
155 bounce_end_io(bio, isa_page_pool, err);
156}
157
158static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
159{
160 struct bio *bio_orig = bio->bi_private;
161
162 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
163 copy_to_high_bio_irq(bio_orig, bio);
164
165 bounce_end_io(bio, pool, err);
166}
167
168static void bounce_end_io_read(struct bio *bio, int err)
169{
170 __bounce_end_io_read(bio, page_pool, err);
171}
172
173static void bounce_end_io_read_isa(struct bio *bio, int err)
174{
175 __bounce_end_io_read(bio, isa_page_pool, err);
176}
177
178#ifdef CONFIG_NEED_BOUNCE_POOL
179static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
180{
181 if (bio_data_dir(bio) != WRITE)
182 return 0;
183
184 if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
185 return 0;
186
187 return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
188}
189#else
190static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
191{
192 return 0;
193}
194#endif /* CONFIG_NEED_BOUNCE_POOL */
195
196static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
197 mempool_t *pool, int force)
198{
199 struct bio *bio;
200 int rw = bio_data_dir(*bio_orig);
201 struct bio_vec *to, from;
202 struct bvec_iter iter;
203 unsigned i;
204
205 if (force)
206 goto bounce;
207 bio_for_each_segment(from, *bio_orig, iter)
208 if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
209 goto bounce;
210
211 return;
212bounce:
213 bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
214
215 bio_for_each_segment_all(to, bio, i) {
216 struct page *page = to->bv_page;
217
218 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
219 continue;
220
221 inc_zone_page_state(to->bv_page, NR_BOUNCE);
222 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
223
224 if (rw == WRITE) {
225 char *vto, *vfrom;
226
227 flush_dcache_page(page);
228
229 vto = page_address(to->bv_page) + to->bv_offset;
230 vfrom = kmap_atomic(page) + to->bv_offset;
231 memcpy(vto, vfrom, to->bv_len);
232 kunmap_atomic(vfrom);
233 }
234 }
235
236 trace_block_bio_bounce(q, *bio_orig);
237
238 bio->bi_flags |= (1 << BIO_BOUNCED);
239
240 if (pool == page_pool) {
241 bio->bi_end_io = bounce_end_io_write;
242 if (rw == READ)
243 bio->bi_end_io = bounce_end_io_read;
244 } else {
245 bio->bi_end_io = bounce_end_io_write_isa;
246 if (rw == READ)
247 bio->bi_end_io = bounce_end_io_read_isa;
248 }
249
250 bio->bi_private = *bio_orig;
251 *bio_orig = bio;
252}
253
254void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
255{
256 int must_bounce;
257 mempool_t *pool;
258
259 /*
260 * Data-less bio, nothing to bounce
261 */
262 if (!bio_has_data(*bio_orig))
263 return;
264
265 must_bounce = must_snapshot_stable_pages(q, *bio_orig);
266
267 /*
268 * for non-isa bounce case, just check if the bounce pfn is equal
269 * to or bigger than the highest pfn in the system -- in that case,
270 * don't waste time iterating over bio segments
271 */
272 if (!(q->bounce_gfp & GFP_DMA)) {
273 if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
274 return;
275 pool = page_pool;
276 } else {
277 BUG_ON(!isa_page_pool);
278 pool = isa_page_pool;
279 }
280
281 /*
282 * slow path
283 */
284 __blk_queue_bounce(q, bio_orig, pool, must_bounce);
285}
286
287EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e0985f1955e7..22dffebc7c73 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
908{ 908{
909 if (cfqd->busy_queues) { 909 if (cfqd->busy_queues) {
910 cfq_log(cfqd, "schedule dispatch"); 910 cfq_log(cfqd, "schedule dispatch");
911 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 911 kblockd_schedule_work(&cfqd->unplug_work);
912 } 912 }
913} 913}
914 914
@@ -4460,7 +4460,7 @@ out_free:
4460static ssize_t 4460static ssize_t
4461cfq_var_show(unsigned int var, char *page) 4461cfq_var_show(unsigned int var, char *page)
4462{ 4462{
4463 return sprintf(page, "%d\n", var); 4463 return sprintf(page, "%u\n", var);
4464} 4464}
4465 4465
4466static ssize_t 4466static ssize_t
diff --git a/block/ioprio.c b/block/ioprio.c
new file mode 100644
index 000000000000..e50170ca7c33
--- /dev/null
+++ b/block/ioprio.c
@@ -0,0 +1,241 @@
1/*
2 * fs/ioprio.c
3 *
4 * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
5 *
6 * Helper functions for setting/querying io priorities of processes. The
 7 * system calls closely mimic getpriority/setpriority, see the man page for
8 * those. The prio argument is a composite of prio class and prio data, where
9 * the data argument has meaning within that class. The standard scheduling
10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7
11 * being the lowest.
12 *
13 * IOW, setting BE scheduling class with prio 2 is done ala:
14 *
15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
16 *
17 * ioprio_set(PRIO_PROCESS, pid, prio);
18 *
19 * See also Documentation/block/ioprio.txt
20 *
21 */
22#include <linux/gfp.h>
23#include <linux/kernel.h>
24#include <linux/export.h>
25#include <linux/ioprio.h>
26#include <linux/blkdev.h>
27#include <linux/capability.h>
28#include <linux/syscalls.h>
29#include <linux/security.h>
30#include <linux/pid_namespace.h>
31
32int set_task_ioprio(struct task_struct *task, int ioprio)
33{
34 int err;
35 struct io_context *ioc;
36 const struct cred *cred = current_cred(), *tcred;
37
38 rcu_read_lock();
39 tcred = __task_cred(task);
40 if (!uid_eq(tcred->uid, cred->euid) &&
41 !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
42 rcu_read_unlock();
43 return -EPERM;
44 }
45 rcu_read_unlock();
46
47 err = security_task_setioprio(task, ioprio);
48 if (err)
49 return err;
50
51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
52 if (ioc) {
53 ioc->ioprio = ioprio;
54 put_io_context(ioc);
55 }
56
57 return err;
58}
59EXPORT_SYMBOL_GPL(set_task_ioprio);
60
61SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
62{
63 int class = IOPRIO_PRIO_CLASS(ioprio);
64 int data = IOPRIO_PRIO_DATA(ioprio);
65 struct task_struct *p, *g;
66 struct user_struct *user;
67 struct pid *pgrp;
68 kuid_t uid;
69 int ret;
70
71 switch (class) {
72 case IOPRIO_CLASS_RT:
73 if (!capable(CAP_SYS_ADMIN))
74 return -EPERM;
75 /* fall through, rt has prio field too */
76 case IOPRIO_CLASS_BE:
77 if (data >= IOPRIO_BE_NR || data < 0)
78 return -EINVAL;
79
80 break;
81 case IOPRIO_CLASS_IDLE:
82 break;
83 case IOPRIO_CLASS_NONE:
84 if (data)
85 return -EINVAL;
86 break;
87 default:
88 return -EINVAL;
89 }
90
91 ret = -ESRCH;
92 rcu_read_lock();
93 switch (which) {
94 case IOPRIO_WHO_PROCESS:
95 if (!who)
96 p = current;
97 else
98 p = find_task_by_vpid(who);
99 if (p)
100 ret = set_task_ioprio(p, ioprio);
101 break;
102 case IOPRIO_WHO_PGRP:
103 if (!who)
104 pgrp = task_pgrp(current);
105 else
106 pgrp = find_vpid(who);
107 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
108 ret = set_task_ioprio(p, ioprio);
109 if (ret)
110 break;
111 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
112 break;
113 case IOPRIO_WHO_USER:
114 uid = make_kuid(current_user_ns(), who);
115 if (!uid_valid(uid))
116 break;
117 if (!who)
118 user = current_user();
119 else
120 user = find_user(uid);
121
122 if (!user)
123 break;
124
125 do_each_thread(g, p) {
126 if (!uid_eq(task_uid(p), uid))
127 continue;
128 ret = set_task_ioprio(p, ioprio);
129 if (ret)
130 goto free_uid;
131 } while_each_thread(g, p);
132free_uid:
133 if (who)
134 free_uid(user);
135 break;
136 default:
137 ret = -EINVAL;
138 }
139
140 rcu_read_unlock();
141 return ret;
142}
143
144static int get_task_ioprio(struct task_struct *p)
145{
146 int ret;
147
148 ret = security_task_getioprio(p);
149 if (ret)
150 goto out;
151 ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
152 if (p->io_context)
153 ret = p->io_context->ioprio;
154out:
155 return ret;
156}
157
158int ioprio_best(unsigned short aprio, unsigned short bprio)
159{
160 unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
161 unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
162
163 if (aclass == IOPRIO_CLASS_NONE)
164 aclass = IOPRIO_CLASS_BE;
165 if (bclass == IOPRIO_CLASS_NONE)
166 bclass = IOPRIO_CLASS_BE;
167
168 if (aclass == bclass)
169 return min(aprio, bprio);
170 if (aclass > bclass)
171 return bprio;
172 else
173 return aprio;
174}
175
176SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
177{
178 struct task_struct *g, *p;
179 struct user_struct *user;
180 struct pid *pgrp;
181 kuid_t uid;
182 int ret = -ESRCH;
183 int tmpio;
184
185 rcu_read_lock();
186 switch (which) {
187 case IOPRIO_WHO_PROCESS:
188 if (!who)
189 p = current;
190 else
191 p = find_task_by_vpid(who);
192 if (p)
193 ret = get_task_ioprio(p);
194 break;
195 case IOPRIO_WHO_PGRP:
196 if (!who)
197 pgrp = task_pgrp(current);
198 else
199 pgrp = find_vpid(who);
200 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
201 tmpio = get_task_ioprio(p);
202 if (tmpio < 0)
203 continue;
204 if (ret == -ESRCH)
205 ret = tmpio;
206 else
207 ret = ioprio_best(ret, tmpio);
208 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
209 break;
210 case IOPRIO_WHO_USER:
211 uid = make_kuid(current_user_ns(), who);
212 if (!who)
213 user = current_user();
214 else
215 user = find_user(uid);
216
217 if (!user)
218 break;
219
220 do_each_thread(g, p) {
221 if (!uid_eq(task_uid(p), user->uid))
222 continue;
223 tmpio = get_task_ioprio(p);
224 if (tmpio < 0)
225 continue;
226 if (ret == -ESRCH)
227 ret = tmpio;
228 else
229 ret = ioprio_best(ret, tmpio);
230 } while_each_thread(g, p);
231
232 if (who)
233 free_uid(user);
234 break;
235 default:
236 ret = -EINVAL;
237 }
238
239 rcu_read_unlock();
240 return ret;
241}
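
The header comment of this new file describes how userspace composes a priority value from a class and a per-class level. Below is a userspace sketch of that usage, assuming the constants are copied by hand to mirror include/linux/ioprio.h (no uapi header exports them at this point) and using raw syscall(2), since glibc ships no ioprio_set()/ioprio_get() wrappers; this follows the approach of the sample in Documentation/block/ioprio.txt.

/* set the calling process to best-effort class, level 2, then read it back */
#define _GNU_SOURCE             /* for syscall() */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/* mirrored from the kernel's ioprio.h; not provided by libc headers */
#define IOPRIO_CLASS_SHIFT      13
#define IOPRIO_PRIO_VALUE(class, data)  (((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_PRIO_CLASS(mask)         ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask)          ((mask) & ((1 << IOPRIO_CLASS_SHIFT) - 1))

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };
enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER };

int main(void)
{
        int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 2);

        if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, ioprio) < 0) {
                perror("ioprio_set");
                return 1;
        }

        ioprio = syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0);
        if (ioprio < 0) {
                perror("ioprio_get");
                return 1;
        }
        printf("class %d, data %d\n",
               IOPRIO_PRIO_CLASS(ioprio), IOPRIO_PRIO_DATA(ioprio));
        return 0;
}

For IOPRIO_WHO_PGRP and IOPRIO_WHO_USER, ioprio_get() folds the per-task values through ioprio_best(), so the syscall reports the highest effective priority among all matching threads.
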
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 26487972ac54..9c28a5b38042 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -205,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
         if (capable(CAP_SYS_RAWIO))
                 return 0;
 
-        /* if there's no filter set, assume we're filtering everything out */
-        if (!filter)
-                return -EPERM;
-
         /* Anybody who can open the device can do a read-safe command */
         if (test_bit(cmd[0], filter->read_ok))
                 return 0;