author	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 17:30:53 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 17:30:53 -0500
commit	8494bcf5b7c4b2416687e233dd34d4c6b6fe5653 (patch)
tree	cebb468e170e639ecfd61ddc5ebcba86c21105fa /drivers/block
parent	3e12cefbe143b4947171ff92dd50024c4841e291 (diff)
parent	b042a3ca949053231950a1b15f31cccca9e305f3 (diff)
Merge branch 'for-3.20/drivers' of git://git.kernel.dk/linux-block
Pull block driver changes from Jens Axboe:
 "This contains:

   - The 4k/partition fixes for brd from Boaz/Matthew.

   - A few xen front/back block fixes from David Vrabel and Roger Pau
     Monne.

   - Floppy changes from Takashi, cleaning the device file creation.

   - Switching libata to use the new blk-mq tagging policy, removing
     code (and a suboptimal implementation) from libata.  This will
     throw you a merge conflict, since a bug in the original libata
     tagging code was fixed since this code was branched.  Trivial.
     From Shaohua.

   - Conversion of loop to blk-mq, from Ming Lei.

   - Cleanup of the io_schedule() handling in bsg from Peter Zijlstra.
     He claims it improves on unreadable code, which will cost him a
     beer.

   - Maintainer update for NBD, now handled by Markus Pargmann.

   - NVMe:
        - Optimization from me that avoids a kmalloc/kfree per IO for
          smaller (<= 8KB) IO.  This cuts about 1% of high IOPS CPU
          overhead.
        - Removal of (now) dead RCU code, a relic from before NVMe was
          converted to blk-mq"

* 'for-3.20/drivers' of git://git.kernel.dk/linux-block:
  xen-blkback: default to X86_32 ABI on x86
  xen-blkfront: fix accounting of reqs when migrating
  xen-blkback,xen-blkfront: add myself as maintainer
  block: Simplify bsg complete all
  floppy: Avoid manual call of device_create_file()
  NVMe: avoid kmalloc/kfree for smaller IO
  MAINTAINERS: Update NBD maintainer
  libata: make sata_sil24 use fifo tag allocator
  libata: move sas ata tag allocation to libata-scsi.c
  libata: use blk taging
  NVMe: within nvme_free_queues(), delete RCU sychro/deferred free
  null_blk: suppress invalid partition info
  brd: Request from fdisk 4k alignment
  brd: Fix all partitions BUGs
  axonram: Fix bug in direct_access
  loop: add blk-mq.h include
  block: loop: don't handle REQ_FUA explicitly
  block: loop: introduce lo_discard() and lo_req_flush()
  block: loop: say goodby to bio
  block: loop: improve performance via blk-mq
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/brd.c                  | 109
-rw-r--r--  drivers/block/floppy.c               |  17
-rw-r--r--  drivers/block/loop.c                 | 416
-rw-r--r--  drivers/block/loop.h                 |  18
-rw-r--r--  drivers/block/null_blk.c             |   2
-rw-r--r--  drivers/block/nvme-core.c            | 128
-rw-r--r--  drivers/block/xen-blkback/common.h   |   9
-rw-r--r--  drivers/block/xen-blkback/xenbus.c   |   4
-rw-r--r--  drivers/block/xen-blkfront.c         |   4
9 files changed, 374 insertions(+), 333 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 89e90ec52f28..c01b921b1b4a 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = {
 /*
  * And now the modules code and kernel interface.
  */
-static int rd_nr;
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-static int max_part;
-static int part_shift;
-static int part_show = 0;
+static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
 module_param(rd_nr, int, S_IRUGO);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
 module_param(rd_size, int, S_IRUGO);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+
+static int max_part = 1;
 module_param(max_part, int, S_IRUGO);
-MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
-module_param(part_show, int, S_IRUGO);
-MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions");
+MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
+
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
 MODULE_ALIAS("rd");
@@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i)
 	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
 	if (!brd->brd_queue)
 		goto out_free_dev;
+
 	blk_queue_make_request(brd->brd_queue, brd_make_request);
 	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
 	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
 
+	/* This is so fdisk will align partitions on 4k, because of
+	 * direct_access API needing 4k alignment, returning a PFN
+	 * (This is only a problem on very small devices <= 4M,
+	 *  otherwise fdisk will align on 1M. Regardless this call
+	 *  is harmless)
+	 */
+	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
+
 	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
 	brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
 	brd->brd_queue->limits.discard_zeroes_data = 1;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
 
-	disk = brd->brd_disk = alloc_disk(1 << part_shift);
+	disk = brd->brd_disk = alloc_disk(max_part);
 	if (!disk)
 		goto out_free_queue;
 	disk->major = RAMDISK_MAJOR;
-	disk->first_minor = i << part_shift;
+	disk->first_minor = i * max_part;
 	disk->fops = &brd_fops;
 	disk->private_data = brd;
 	disk->queue = brd->brd_queue;
-	if (!part_show)
-		disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
+	disk->flags = GENHD_FL_EXT_DEVT;
 	sprintf(disk->disk_name, "ram%d", i);
 	set_capacity(disk, rd_size * 2);
 
@@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd)
 	kfree(brd);
 }
 
-static struct brd_device *brd_init_one(int i)
+static struct brd_device *brd_init_one(int i, bool *new)
 {
 	struct brd_device *brd;
 
+	*new = false;
 	list_for_each_entry(brd, &brd_devices, brd_list) {
 		if (brd->brd_number == i)
 			goto out;
@@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i)
 		add_disk(brd->brd_disk);
 		list_add_tail(&brd->brd_list, &brd_devices);
 	}
+	*new = true;
 out:
 	return brd;
 }
@@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
 {
 	struct brd_device *brd;
 	struct kobject *kobj;
+	bool new;
 
 	mutex_lock(&brd_devices_mutex);
-	brd = brd_init_one(MINOR(dev) >> part_shift);
+	brd = brd_init_one(MINOR(dev) / max_part, &new);
 	kobj = brd ? get_disk(brd->brd_disk) : NULL;
 	mutex_unlock(&brd_devices_mutex);
 
-	*part = 0;
+	if (new)
+		*part = 0;
+
 	return kobj;
 }
 
 static int __init brd_init(void)
 {
-	int i, nr;
-	unsigned long range;
 	struct brd_device *brd, *next;
+	int i;
 
 	/*
 	 * brd module now has a feature to instantiate underlying device
 	 * structure on-demand, provided that there is an access dev node.
-	 * However, this will not work well with user space tool that doesn't
-	 * know about such "feature". In order to not break any existing
-	 * tool, we do the following:
 	 *
-	 * (1) if rd_nr is specified, create that many upfront, and this
-	 *     also becomes a hard limit.
-	 * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
-	 *     (default 16) rd device on module load, user can further
-	 *     extend brd device by create dev node themselves and have
-	 *     kernel automatically instantiate actual device on-demand.
+	 * (1) if rd_nr is specified, create that many upfront. else
+	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT
+	 * (2) User can further extend brd devices by create dev node themselves
+	 *     and have kernel automatically instantiate actual device
+	 *     on-demand. Example:
+	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
+	 *		fdisk -l /path/devnod_name
+	 *	If (X / max_part) was not already created it will be created
+	 *	dynamically.
 	 */
 
-	part_shift = 0;
-	if (max_part > 0) {
-		part_shift = fls(max_part);
-
-		/*
-		 * Adjust max_part according to part_shift as it is exported
-		 * to user space so that user can decide correct minor number
-		 * if [s]he want to create more devices.
-		 *
-		 * Note that -1 is required because partition 0 is reserved
-		 * for the whole disk.
-		 */
-		max_part = (1UL << part_shift) - 1;
-	}
-
-	if ((1UL << part_shift) > DISK_MAX_PARTS)
-		return -EINVAL;
-
-	if (rd_nr > 1UL << (MINORBITS - part_shift))
-		return -EINVAL;
-
-	if (rd_nr) {
-		nr = rd_nr;
-		range = rd_nr << part_shift;
-	} else {
-		nr = CONFIG_BLK_DEV_RAM_COUNT;
-		range = 1UL << MINORBITS;
-	}
-
 	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
 		return -EIO;
 
-	for (i = 0; i < nr; i++) {
+	if (unlikely(!max_part))
+		max_part = 1;
+
+	for (i = 0; i < rd_nr; i++) {
 		brd = brd_alloc(i);
 		if (!brd)
 			goto out_free;
@@ -631,10 +616,10 @@ static int __init brd_init(void)
 	list_for_each_entry(brd, &brd_devices, brd_list)
 		add_disk(brd->brd_disk);
 
-	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
 			    THIS_MODULE, brd_probe, NULL, NULL);
 
-	printk(KERN_INFO "brd: module loaded\n");
+	pr_info("brd: module loaded\n");
 	return 0;
 
 out_free:
@@ -644,21 +629,21 @@ out_free:
 	}
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
 
+	pr_info("brd: module NOT loaded !!!\n");
 	return -ENOMEM;
 }
 
 static void __exit brd_exit(void)
 {
-	unsigned long range;
 	struct brd_device *brd, *next;
 
-	range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
-
 	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
 		brd_del_one(brd);
 
-	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
+	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
 	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+	pr_info("brd: module unloaded\n");
 }
 
 module_init(brd_init);
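
With part_shift gone, the dev_t mapping above is plain arithmetic: every brd device reserves max_part minors, so brd_probe() derives the device index as MINOR(dev) / max_part, exactly as the comment's mknod example describes. A minimal userspace sketch of that mapping, with assumed values (RAMDISK_MAJOR aside, nothing here is kernel code):

/*
 * Sketch of brd's minor-to-device-index math. MAX_PART is an assumed
 * value of the max_part module parameter; the device node is hypothetical.
 */
#include <stdio.h>
#include <sys/sysmacros.h>	/* makedev(), major(), minor() */

#define RAMDISK_MAJOR	1
#define MAX_PART	16	/* assumed module parameter value */

int main(void)
{
	/* mknod /dev/myram b 1 35 would probe device index 35 / 16 = 2 */
	dev_t dev = makedev(RAMDISK_MAJOR, 35);

	printf("minor %u -> brd device index %u\n",
	       minor(dev), minor(dev) / MAX_PART);
	return 0;
}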
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 56d46ffb08e1..a08cda955285 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
4112 4112
4113static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL); 4113static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
4114 4114
4115static struct attribute *floppy_dev_attrs[] = {
4116 &dev_attr_cmos.attr,
4117 NULL
4118};
4119
4120ATTRIBUTE_GROUPS(floppy_dev);
4121
4115static void floppy_device_release(struct device *dev) 4122static void floppy_device_release(struct device *dev)
4116{ 4123{
4117} 4124}
@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
4324 floppy_device[drive].name = floppy_device_name; 4331 floppy_device[drive].name = floppy_device_name;
4325 floppy_device[drive].id = drive; 4332 floppy_device[drive].id = drive;
4326 floppy_device[drive].dev.release = floppy_device_release; 4333 floppy_device[drive].dev.release = floppy_device_release;
4334 floppy_device[drive].dev.groups = floppy_dev_groups;
4327 4335
4328 err = platform_device_register(&floppy_device[drive]); 4336 err = platform_device_register(&floppy_device[drive]);
4329 if (err) 4337 if (err)
4330 goto out_remove_drives; 4338 goto out_remove_drives;
4331 4339
4332 err = device_create_file(&floppy_device[drive].dev,
4333 &dev_attr_cmos);
4334 if (err)
4335 goto out_unreg_platform_dev;
4336
4337 /* to be cleaned up... */ 4340 /* to be cleaned up... */
4338 disks[drive]->private_data = (void *)(long)drive; 4341 disks[drive]->private_data = (void *)(long)drive;
4339 disks[drive]->flags |= GENHD_FL_REMOVABLE; 4342 disks[drive]->flags |= GENHD_FL_REMOVABLE;
@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
4343 4346
4344 return 0; 4347 return 0;
4345 4348
4346out_unreg_platform_dev:
4347 platform_device_unregister(&floppy_device[drive]);
4348out_remove_drives: 4349out_remove_drives:
4349 while (drive--) { 4350 while (drive--) {
4350 if (floppy_available(drive)) { 4351 if (floppy_available(drive)) {
4351 del_gendisk(disks[drive]); 4352 del_gendisk(disks[drive]);
4352 device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
4353 platform_device_unregister(&floppy_device[drive]); 4353 platform_device_unregister(&floppy_device[drive]);
4354 } 4354 }
4355 } 4355 }
@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
4594 4594
4595 if (floppy_available(drive)) { 4595 if (floppy_available(drive)) {
4596 del_gendisk(disks[drive]); 4596 del_gendisk(disks[drive]);
4597 device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
4598 platform_device_unregister(&floppy_device[drive]); 4597 platform_device_unregister(&floppy_device[drive]);
4599 } 4598 }
4600 blk_cleanup_queue(disks[drive]->queue); 4599 blk_cleanup_queue(disks[drive]->queue);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6cb1beb47c25..d1f168b73634 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -85,6 +85,8 @@ static DEFINE_MUTEX(loop_index_mutex);
85static int max_part; 85static int max_part;
86static int part_shift; 86static int part_shift;
87 87
88static struct workqueue_struct *loop_wq;
89
88/* 90/*
89 * Transfer functions 91 * Transfer functions
90 */ 92 */
@@ -284,12 +286,12 @@ static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
284 return ret; 286 return ret;
285} 287}
286 288
287static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) 289static int lo_send(struct loop_device *lo, struct request *rq, loff_t pos)
288{ 290{
289 int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, 291 int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
290 struct page *page); 292 struct page *page);
291 struct bio_vec bvec; 293 struct bio_vec bvec;
292 struct bvec_iter iter; 294 struct req_iterator iter;
293 struct page *page = NULL; 295 struct page *page = NULL;
294 int ret = 0; 296 int ret = 0;
295 297
@@ -303,7 +305,7 @@ static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
303 do_lo_send = do_lo_send_direct_write; 305 do_lo_send = do_lo_send_direct_write;
304 } 306 }
305 307
306 bio_for_each_segment(bvec, bio, iter) { 308 rq_for_each_segment(bvec, rq, iter) {
307 ret = do_lo_send(lo, &bvec, pos, page); 309 ret = do_lo_send(lo, &bvec, pos, page);
308 if (ret < 0) 310 if (ret < 0)
309 break; 311 break;
@@ -391,19 +393,22 @@ do_lo_receive(struct loop_device *lo,
391} 393}
392 394
393static int 395static int
394lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) 396lo_receive(struct loop_device *lo, struct request *rq, int bsize, loff_t pos)
395{ 397{
396 struct bio_vec bvec; 398 struct bio_vec bvec;
397 struct bvec_iter iter; 399 struct req_iterator iter;
398 ssize_t s; 400 ssize_t s;
399 401
400 bio_for_each_segment(bvec, bio, iter) { 402 rq_for_each_segment(bvec, rq, iter) {
401 s = do_lo_receive(lo, &bvec, bsize, pos); 403 s = do_lo_receive(lo, &bvec, bsize, pos);
402 if (s < 0) 404 if (s < 0)
403 return s; 405 return s;
404 406
405 if (s != bvec.bv_len) { 407 if (s != bvec.bv_len) {
406 zero_fill_bio(bio); 408 struct bio *bio;
409
410 __rq_for_each_bio(bio, rq)
411 zero_fill_bio(bio);
407 break; 412 break;
408 } 413 }
409 pos += bvec.bv_len; 414 pos += bvec.bv_len;
@@ -411,106 +416,58 @@ lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
411 return 0; 416 return 0;
412} 417}
413 418
414static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) 419static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
415{ 420{
416 loff_t pos; 421 /*
422 * We use punch hole to reclaim the free space used by the
423 * image a.k.a. discard. However we do not support discard if
424 * encryption is enabled, because it may give an attacker
425 * useful information.
426 */
427 struct file *file = lo->lo_backing_file;
428 int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
417 int ret; 429 int ret;
418 430
419 pos = ((loff_t) bio->bi_iter.bi_sector << 9) + lo->lo_offset; 431 if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
420 432 ret = -EOPNOTSUPP;
421 if (bio_rw(bio) == WRITE) { 433 goto out;
422 struct file *file = lo->lo_backing_file; 434 }
423
424 if (bio->bi_rw & REQ_FLUSH) {
425 ret = vfs_fsync(file, 0);
426 if (unlikely(ret && ret != -EINVAL)) {
427 ret = -EIO;
428 goto out;
429 }
430 }
431
432 /*
433 * We use punch hole to reclaim the free space used by the
434 * image a.k.a. discard. However we do not support discard if
435 * encryption is enabled, because it may give an attacker
436 * useful information.
437 */
438 if (bio->bi_rw & REQ_DISCARD) {
439 struct file *file = lo->lo_backing_file;
440 int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
441
442 if ((!file->f_op->fallocate) ||
443 lo->lo_encrypt_key_size) {
444 ret = -EOPNOTSUPP;
445 goto out;
446 }
447 ret = file->f_op->fallocate(file, mode, pos,
448 bio->bi_iter.bi_size);
449 if (unlikely(ret && ret != -EINVAL &&
450 ret != -EOPNOTSUPP))
451 ret = -EIO;
452 goto out;
453 }
454
455 ret = lo_send(lo, bio, pos);
456
457 if ((bio->bi_rw & REQ_FUA) && !ret) {
458 ret = vfs_fsync(file, 0);
459 if (unlikely(ret && ret != -EINVAL))
460 ret = -EIO;
461 }
462 } else
463 ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
464 435
465out: 436 ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
437 if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
438 ret = -EIO;
439 out:
466 return ret; 440 return ret;
467} 441}
468 442
469/* 443static int lo_req_flush(struct loop_device *lo, struct request *rq)
470 * Add bio to back of pending list
471 */
472static void loop_add_bio(struct loop_device *lo, struct bio *bio)
473{ 444{
474 lo->lo_bio_count++; 445 struct file *file = lo->lo_backing_file;
475 bio_list_add(&lo->lo_bio_list, bio); 446 int ret = vfs_fsync(file, 0);
476} 447 if (unlikely(ret && ret != -EINVAL))
448 ret = -EIO;
477 449
478/* 450 return ret;
479 * Grab first pending buffer
480 */
481static struct bio *loop_get_bio(struct loop_device *lo)
482{
483 lo->lo_bio_count--;
484 return bio_list_pop(&lo->lo_bio_list);
485} 451}
486 452
487static void loop_make_request(struct request_queue *q, struct bio *old_bio) 453static int do_req_filebacked(struct loop_device *lo, struct request *rq)
488{ 454{
489 struct loop_device *lo = q->queuedata; 455 loff_t pos;
490 int rw = bio_rw(old_bio); 456 int ret;
491
492 if (rw == READA)
493 rw = READ;
494 457
495 BUG_ON(!lo || (rw != READ && rw != WRITE)); 458 pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
496 459
497 spin_lock_irq(&lo->lo_lock); 460 if (rq->cmd_flags & REQ_WRITE) {
498 if (lo->lo_state != Lo_bound) 461 if (rq->cmd_flags & REQ_FLUSH)
499 goto out; 462 ret = lo_req_flush(lo, rq);
500 if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) 463 else if (rq->cmd_flags & REQ_DISCARD)
501 goto out; 464 ret = lo_discard(lo, rq, pos);
502 if (lo->lo_bio_count >= q->nr_congestion_on) 465 else
503 wait_event_lock_irq(lo->lo_req_wait, 466 ret = lo_send(lo, rq, pos);
504 lo->lo_bio_count < q->nr_congestion_off, 467 } else
505 lo->lo_lock); 468 ret = lo_receive(lo, rq, lo->lo_blocksize, pos);
506 loop_add_bio(lo, old_bio);
507 wake_up(&lo->lo_event);
508 spin_unlock_irq(&lo->lo_lock);
509 return;
510 469
511out: 470 return ret;
512 spin_unlock_irq(&lo->lo_lock);
513 bio_io_error(old_bio);
514} 471}
515 472
516struct switch_request { 473struct switch_request {
@@ -518,57 +475,26 @@ struct switch_request {
518 struct completion wait; 475 struct completion wait;
519}; 476};
520 477
521static void do_loop_switch(struct loop_device *, struct switch_request *);
522
523static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
524{
525 if (unlikely(!bio->bi_bdev)) {
526 do_loop_switch(lo, bio->bi_private);
527 bio_put(bio);
528 } else {
529 int ret = do_bio_filebacked(lo, bio);
530 bio_endio(bio, ret);
531 }
532}
533
534/* 478/*
535 * worker thread that handles reads/writes to file backed loop devices, 479 * Do the actual switch; called from the BIO completion routine
536 * to avoid blocking in our make_request_fn. it also does loop decrypting
537 * on reads for block backed loop, as that is too heavy to do from
538 * b_end_io context where irqs may be disabled.
539 *
540 * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
541 * calling kthread_stop(). Therefore once kthread_should_stop() is
542 * true, make_request will not place any more requests. Therefore
543 * once kthread_should_stop() is true and lo_bio is NULL, we are
544 * done with the loop.
545 */ 480 */
546static int loop_thread(void *data) 481static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
547{ 482{
548 struct loop_device *lo = data; 483 struct file *file = p->file;
549 struct bio *bio; 484 struct file *old_file = lo->lo_backing_file;
550 485 struct address_space *mapping;
551 set_user_nice(current, MIN_NICE);
552
553 while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
554
555 wait_event_interruptible(lo->lo_event,
556 !bio_list_empty(&lo->lo_bio_list) ||
557 kthread_should_stop());
558
559 if (bio_list_empty(&lo->lo_bio_list))
560 continue;
561 spin_lock_irq(&lo->lo_lock);
562 bio = loop_get_bio(lo);
563 if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off)
564 wake_up(&lo->lo_req_wait);
565 spin_unlock_irq(&lo->lo_lock);
566 486
567 BUG_ON(!bio); 487 /* if no new file, only flush of queued bios requested */
568 loop_handle_bio(lo, bio); 488 if (!file)
569 } 489 return;
570 490
571 return 0; 491 mapping = file->f_mapping;
492 mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
493 lo->lo_backing_file = file;
494 lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
495 mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
496 lo->old_gfp_mask = mapping_gfp_mask(mapping);
497 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
572} 498}
573 499
574/* 500/*
@@ -579,15 +505,18 @@ static int loop_thread(void *data)
579static int loop_switch(struct loop_device *lo, struct file *file) 505static int loop_switch(struct loop_device *lo, struct file *file)
580{ 506{
581 struct switch_request w; 507 struct switch_request w;
582 struct bio *bio = bio_alloc(GFP_KERNEL, 0); 508
583 if (!bio)
584 return -ENOMEM;
585 init_completion(&w.wait);
586 w.file = file; 509 w.file = file;
587 bio->bi_private = &w; 510
588 bio->bi_bdev = NULL; 511 /* freeze queue and wait for completion of scheduled requests */
589 loop_make_request(lo->lo_queue, bio); 512 blk_mq_freeze_queue(lo->lo_queue);
590 wait_for_completion(&w.wait); 513
514 /* do the switch action */
515 do_loop_switch(lo, &w);
516
517 /* unfreeze */
518 blk_mq_unfreeze_queue(lo->lo_queue);
519
591 return 0; 520 return 0;
592} 521}
593 522
@@ -596,39 +525,10 @@ static int loop_switch(struct loop_device *lo, struct file *file)
596 */ 525 */
597static int loop_flush(struct loop_device *lo) 526static int loop_flush(struct loop_device *lo)
598{ 527{
599 /* loop not yet configured, no running thread, nothing to flush */
600 if (!lo->lo_thread)
601 return 0;
602
603 return loop_switch(lo, NULL); 528 return loop_switch(lo, NULL);
604} 529}
605 530
606/* 531/*
607 * Do the actual switch; called from the BIO completion routine
608 */
609static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
610{
611 struct file *file = p->file;
612 struct file *old_file = lo->lo_backing_file;
613 struct address_space *mapping;
614
615 /* if no new file, only flush of queued bios requested */
616 if (!file)
617 goto out;
618
619 mapping = file->f_mapping;
620 mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
621 lo->lo_backing_file = file;
622 lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
623 mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
624 lo->old_gfp_mask = mapping_gfp_mask(mapping);
625 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
626out:
627 complete(&p->wait);
628}
629
630
631/*
632 * loop_change_fd switched the backing store of a loopback device to 532 * loop_change_fd switched the backing store of a loopback device to
633 * a new file. This is useful for operating system installers to free up 533 * a new file. This is useful for operating system installers to free up
634 * the original file and in High Availability environments to switch to 534 * the original file and in High Availability environments to switch to
@@ -889,12 +789,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
889 lo->transfer = transfer_none; 789 lo->transfer = transfer_none;
890 lo->ioctl = NULL; 790 lo->ioctl = NULL;
891 lo->lo_sizelimit = 0; 791 lo->lo_sizelimit = 0;
892 lo->lo_bio_count = 0;
893 lo->old_gfp_mask = mapping_gfp_mask(mapping); 792 lo->old_gfp_mask = mapping_gfp_mask(mapping);
894 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); 793 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
895 794
896 bio_list_init(&lo->lo_bio_list);
897
898 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) 795 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
899 blk_queue_flush(lo->lo_queue, REQ_FLUSH); 796 blk_queue_flush(lo->lo_queue, REQ_FLUSH);
900 797
@@ -906,14 +803,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
906 803
907 set_blocksize(bdev, lo_blocksize); 804 set_blocksize(bdev, lo_blocksize);
908 805
909 lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
910 lo->lo_number);
911 if (IS_ERR(lo->lo_thread)) {
912 error = PTR_ERR(lo->lo_thread);
913 goto out_clr;
914 }
915 lo->lo_state = Lo_bound; 806 lo->lo_state = Lo_bound;
916 wake_up_process(lo->lo_thread);
917 if (part_shift) 807 if (part_shift)
918 lo->lo_flags |= LO_FLAGS_PARTSCAN; 808 lo->lo_flags |= LO_FLAGS_PARTSCAN;
919 if (lo->lo_flags & LO_FLAGS_PARTSCAN) 809 if (lo->lo_flags & LO_FLAGS_PARTSCAN)
@@ -925,18 +815,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
925 bdgrab(bdev); 815 bdgrab(bdev);
926 return 0; 816 return 0;
927 817
928out_clr:
929 loop_sysfs_exit(lo);
930 lo->lo_thread = NULL;
931 lo->lo_device = NULL;
932 lo->lo_backing_file = NULL;
933 lo->lo_flags = 0;
934 set_capacity(lo->lo_disk, 0);
935 invalidate_bdev(bdev);
936 bd_set_size(bdev, 0);
937 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
938 mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
939 lo->lo_state = Lo_unbound;
940 out_putf: 818 out_putf:
941 fput(file); 819 fput(file);
942 out: 820 out:
@@ -1012,11 +890,6 @@ static int loop_clr_fd(struct loop_device *lo)
1012 890
1013 spin_lock_irq(&lo->lo_lock); 891 spin_lock_irq(&lo->lo_lock);
1014 lo->lo_state = Lo_rundown; 892 lo->lo_state = Lo_rundown;
1015 spin_unlock_irq(&lo->lo_lock);
1016
1017 kthread_stop(lo->lo_thread);
1018
1019 spin_lock_irq(&lo->lo_lock);
1020 lo->lo_backing_file = NULL; 893 lo->lo_backing_file = NULL;
1021 spin_unlock_irq(&lo->lo_lock); 894 spin_unlock_irq(&lo->lo_lock);
1022 895
@@ -1028,7 +901,6 @@ static int loop_clr_fd(struct loop_device *lo)
1028 lo->lo_offset = 0; 901 lo->lo_offset = 0;
1029 lo->lo_sizelimit = 0; 902 lo->lo_sizelimit = 0;
1030 lo->lo_encrypt_key_size = 0; 903 lo->lo_encrypt_key_size = 0;
1031 lo->lo_thread = NULL;
1032 memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); 904 memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
1033 memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); 905 memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
1034 memset(lo->lo_file_name, 0, LO_NAME_SIZE); 906 memset(lo->lo_file_name, 0, LO_NAME_SIZE);
@@ -1601,6 +1473,105 @@ int loop_unregister_transfer(int number)
1601EXPORT_SYMBOL(loop_register_transfer); 1473EXPORT_SYMBOL(loop_register_transfer);
1602EXPORT_SYMBOL(loop_unregister_transfer); 1474EXPORT_SYMBOL(loop_unregister_transfer);
1603 1475
1476static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
1477 const struct blk_mq_queue_data *bd)
1478{
1479 struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
1480
1481 blk_mq_start_request(bd->rq);
1482
1483 if (cmd->rq->cmd_flags & REQ_WRITE) {
1484 struct loop_device *lo = cmd->rq->q->queuedata;
1485 bool need_sched = true;
1486
1487 spin_lock_irq(&lo->lo_lock);
1488 if (lo->write_started)
1489 need_sched = false;
1490 else
1491 lo->write_started = true;
1492 list_add_tail(&cmd->list, &lo->write_cmd_head);
1493 spin_unlock_irq(&lo->lo_lock);
1494
1495 if (need_sched)
1496 queue_work(loop_wq, &lo->write_work);
1497 } else {
1498 queue_work(loop_wq, &cmd->read_work);
1499 }
1500
1501 return BLK_MQ_RQ_QUEUE_OK;
1502}
1503
1504static void loop_handle_cmd(struct loop_cmd *cmd)
1505{
1506 const bool write = cmd->rq->cmd_flags & REQ_WRITE;
1507 struct loop_device *lo = cmd->rq->q->queuedata;
1508 int ret = -EIO;
1509
1510 if (lo->lo_state != Lo_bound)
1511 goto failed;
1512
1513 if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
1514 goto failed;
1515
1516 ret = do_req_filebacked(lo, cmd->rq);
1517
1518 failed:
1519 if (ret)
1520 cmd->rq->errors = -EIO;
1521 blk_mq_complete_request(cmd->rq);
1522}
1523
1524static void loop_queue_write_work(struct work_struct *work)
1525{
1526 struct loop_device *lo =
1527 container_of(work, struct loop_device, write_work);
1528 LIST_HEAD(cmd_list);
1529
1530 spin_lock_irq(&lo->lo_lock);
1531 repeat:
1532 list_splice_init(&lo->write_cmd_head, &cmd_list);
1533 spin_unlock_irq(&lo->lo_lock);
1534
1535 while (!list_empty(&cmd_list)) {
1536 struct loop_cmd *cmd = list_first_entry(&cmd_list,
1537 struct loop_cmd, list);
1538 list_del_init(&cmd->list);
1539 loop_handle_cmd(cmd);
1540 }
1541
1542 spin_lock_irq(&lo->lo_lock);
1543 if (!list_empty(&lo->write_cmd_head))
1544 goto repeat;
1545 lo->write_started = false;
1546 spin_unlock_irq(&lo->lo_lock);
1547}
1548
1549static void loop_queue_read_work(struct work_struct *work)
1550{
1551 struct loop_cmd *cmd =
1552 container_of(work, struct loop_cmd, read_work);
1553
1554 loop_handle_cmd(cmd);
1555}
1556
1557static int loop_init_request(void *data, struct request *rq,
1558 unsigned int hctx_idx, unsigned int request_idx,
1559 unsigned int numa_node)
1560{
1561 struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
1562
1563 cmd->rq = rq;
1564 INIT_WORK(&cmd->read_work, loop_queue_read_work);
1565
1566 return 0;
1567}
1568
1569static struct blk_mq_ops loop_mq_ops = {
1570 .queue_rq = loop_queue_rq,
1571 .map_queue = blk_mq_map_queue,
1572 .init_request = loop_init_request,
1573};
1574
1604static int loop_add(struct loop_device **l, int i) 1575static int loop_add(struct loop_device **l, int i)
1605{ 1576{
1606 struct loop_device *lo; 1577 struct loop_device *lo;
@@ -1627,16 +1598,28 @@ static int loop_add(struct loop_device **l, int i)
1627 i = err; 1598 i = err;
1628 1599
1629 err = -ENOMEM; 1600 err = -ENOMEM;
1630 lo->lo_queue = blk_alloc_queue(GFP_KERNEL); 1601 lo->tag_set.ops = &loop_mq_ops;
1631 if (!lo->lo_queue) 1602 lo->tag_set.nr_hw_queues = 1;
1603 lo->tag_set.queue_depth = 128;
1604 lo->tag_set.numa_node = NUMA_NO_NODE;
1605 lo->tag_set.cmd_size = sizeof(struct loop_cmd);
1606 lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
1607 lo->tag_set.driver_data = lo;
1608
1609 err = blk_mq_alloc_tag_set(&lo->tag_set);
1610 if (err)
1632 goto out_free_idr; 1611 goto out_free_idr;
1633 1612
1634 /* 1613 lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
1635 * set queue make_request_fn 1614 if (IS_ERR_OR_NULL(lo->lo_queue)) {
1636 */ 1615 err = PTR_ERR(lo->lo_queue);
1637 blk_queue_make_request(lo->lo_queue, loop_make_request); 1616 goto out_cleanup_tags;
1617 }
1638 lo->lo_queue->queuedata = lo; 1618 lo->lo_queue->queuedata = lo;
1639 1619
1620 INIT_LIST_HEAD(&lo->write_cmd_head);
1621 INIT_WORK(&lo->write_work, loop_queue_write_work);
1622
1640 disk = lo->lo_disk = alloc_disk(1 << part_shift); 1623 disk = lo->lo_disk = alloc_disk(1 << part_shift);
1641 if (!disk) 1624 if (!disk)
1642 goto out_free_queue; 1625 goto out_free_queue;
@@ -1664,9 +1647,6 @@ static int loop_add(struct loop_device **l, int i)
1664 disk->flags |= GENHD_FL_EXT_DEVT; 1647 disk->flags |= GENHD_FL_EXT_DEVT;
1665 mutex_init(&lo->lo_ctl_mutex); 1648 mutex_init(&lo->lo_ctl_mutex);
1666 lo->lo_number = i; 1649 lo->lo_number = i;
1667 lo->lo_thread = NULL;
1668 init_waitqueue_head(&lo->lo_event);
1669 init_waitqueue_head(&lo->lo_req_wait);
1670 spin_lock_init(&lo->lo_lock); 1650 spin_lock_init(&lo->lo_lock);
1671 disk->major = LOOP_MAJOR; 1651 disk->major = LOOP_MAJOR;
1672 disk->first_minor = i << part_shift; 1652 disk->first_minor = i << part_shift;
@@ -1680,6 +1660,8 @@ static int loop_add(struct loop_device **l, int i)
1680 1660
1681out_free_queue: 1661out_free_queue:
1682 blk_cleanup_queue(lo->lo_queue); 1662 blk_cleanup_queue(lo->lo_queue);
1663out_cleanup_tags:
1664 blk_mq_free_tag_set(&lo->tag_set);
1683out_free_idr: 1665out_free_idr:
1684 idr_remove(&loop_index_idr, i); 1666 idr_remove(&loop_index_idr, i);
1685out_free_dev: 1667out_free_dev:
@@ -1692,6 +1674,7 @@ static void loop_remove(struct loop_device *lo)
1692{ 1674{
1693 del_gendisk(lo->lo_disk); 1675 del_gendisk(lo->lo_disk);
1694 blk_cleanup_queue(lo->lo_queue); 1676 blk_cleanup_queue(lo->lo_queue);
1677 blk_mq_free_tag_set(&lo->tag_set);
1695 put_disk(lo->lo_disk); 1678 put_disk(lo->lo_disk);
1696 kfree(lo); 1679 kfree(lo);
1697} 1680}
@@ -1875,6 +1858,13 @@ static int __init loop_init(void)
1875 goto misc_out; 1858 goto misc_out;
1876 } 1859 }
1877 1860
1861 loop_wq = alloc_workqueue("kloopd",
1862 WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0);
1863 if (!loop_wq) {
1864 err = -ENOMEM;
1865 goto misc_out;
1866 }
1867
1878 blk_register_region(MKDEV(LOOP_MAJOR, 0), range, 1868 blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
1879 THIS_MODULE, loop_probe, NULL, NULL); 1869 THIS_MODULE, loop_probe, NULL, NULL);
1880 1870
@@ -1912,6 +1902,8 @@ static void __exit loop_exit(void)
1912 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); 1902 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
1913 unregister_blkdev(LOOP_MAJOR, "loop"); 1903 unregister_blkdev(LOOP_MAJOR, "loop");
1914 1904
1905 destroy_workqueue(loop_wq);
1906
1915 misc_deregister(&loop_misc); 1907 misc_deregister(&loop_misc);
1916} 1908}
1917 1909
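lo_discard() above punches a hole in the backing file rather than zeroing it, so a discarded range stops consuming disk space while the file's size is unchanged. The userspace equivalent of the f_op->fallocate() call it makes is fallocate(2) with the same mode flags; a minimal Linux-only sketch, with a hypothetical backing file and range:

/*
 * Punch a hole the way lo_discard() does: reclaim space for a range
 * without shrinking the file. File name, offset and length are
 * assumptions for illustration.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("backing.img", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* punch a 1 MiB hole at offset 4 MiB, keeping i_size intact */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) < 0)
		perror("fallocate");

	close(fd);
	return 0;
}

Reading the punched range afterwards returns zeroes, which matches the discard_zeroes_data semantics the driver advertises only when the filesystem actually supports hole punching.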
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 90df5d6485b6..301c27f8323f 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -11,8 +11,10 @@
11 11
12#include <linux/bio.h> 12#include <linux/bio.h>
13#include <linux/blkdev.h> 13#include <linux/blkdev.h>
14#include <linux/blk-mq.h>
14#include <linux/spinlock.h> 15#include <linux/spinlock.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/workqueue.h>
16#include <uapi/linux/loop.h> 18#include <uapi/linux/loop.h>
17 19
18/* Possible states of device */ 20/* Possible states of device */
@@ -52,19 +54,23 @@ struct loop_device {
52 gfp_t old_gfp_mask; 54 gfp_t old_gfp_mask;
53 55
54 spinlock_t lo_lock; 56 spinlock_t lo_lock;
55 struct bio_list lo_bio_list; 57 struct list_head write_cmd_head;
56 unsigned int lo_bio_count; 58 struct work_struct write_work;
59 bool write_started;
57 int lo_state; 60 int lo_state;
58 struct mutex lo_ctl_mutex; 61 struct mutex lo_ctl_mutex;
59 struct task_struct *lo_thread;
60 wait_queue_head_t lo_event;
61 /* wait queue for incoming requests */
62 wait_queue_head_t lo_req_wait;
63 62
64 struct request_queue *lo_queue; 63 struct request_queue *lo_queue;
64 struct blk_mq_tag_set tag_set;
65 struct gendisk *lo_disk; 65 struct gendisk *lo_disk;
66}; 66};
67 67
68struct loop_cmd {
69 struct work_struct read_work;
70 struct request *rq;
71 struct list_head list;
72};
73
68/* Support for loadable transfer modules */ 74/* Support for loadable transfer modules */
69struct loop_func_table { 75struct loop_func_table {
70 int number; /* filter type */ 76 int number; /* filter type */
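The write_cmd_head/write_work/write_started trio above carries the write-batching scheme of loop_queue_rq() and loop_queue_write_work(): submitters push commands onto a list under lo_lock and schedule the single writeback worker only when it is idle, and the worker drains the list in batches until it stays empty. A standalone userspace sketch of the same pattern, with POSIX threads standing in for the kernel workqueue (all names hypothetical; the driver keeps FIFO order via list_add_tail(), this sketch pushes LIFO for brevity):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct cmd {
	struct cmd *next;
	int id;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct cmd *head;		/* pending writes */
static bool write_started;		/* is a drain worker scheduled? */

static void handle_cmd(struct cmd *c)
{
	printf("handling write cmd %d\n", c->id);
	free(c);
}

/* drains the shared list, mirroring loop_queue_write_work() */
static void *write_worker(void *arg)
{
	struct cmd *batch;

	pthread_mutex_lock(&lock);
repeat:
	batch = head;
	head = NULL;
	pthread_mutex_unlock(&lock);

	while (batch) {
		struct cmd *c = batch;

		batch = c->next;
		handle_cmd(c);
	}

	pthread_mutex_lock(&lock);
	if (head)			/* more arrived while draining */
		goto repeat;
	write_started = false;
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* producer side, mirroring the REQ_WRITE branch of loop_queue_rq() */
static void queue_cmd(struct cmd *c)
{
	bool need_sched;
	pthread_t t;

	pthread_mutex_lock(&lock);
	need_sched = !write_started;
	write_started = true;
	c->next = head;
	head = c;
	pthread_mutex_unlock(&lock);

	if (need_sched) {		/* stands in for queue_work() */
		pthread_create(&t, NULL, write_worker, NULL);
		pthread_detach(t);
	}
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct cmd *c = malloc(sizeof(*c));

		c->id = i;
		queue_cmd(c);
	}
	sleep(1);			/* crude: let the worker finish */
	return 0;
}

The point of the design is that only one worker ever runs per device, so backing-file writes stay serialized (and ordered in the kernel's FIFO version) while reads each get their own work item and can run concurrently.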
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index aa2224aa7caa..65cd61a4145e 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -579,7 +579,7 @@ static int null_add_dev(void)
579 sector_div(size, bs); 579 sector_div(size, bs);
580 set_capacity(disk, size); 580 set_capacity(disk, size);
581 581
582 disk->flags |= GENHD_FL_EXT_DEVT; 582 disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
583 disk->major = null_major; 583 disk->major = null_major;
584 disk->first_minor = nullb->index; 584 disk->first_minor = nullb->index;
585 disk->fops = &null_fops; 585 disk->fops = &null_fops;
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d826bf3e62c8..cbdfbbf98392 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
144 void *ctx; 144 void *ctx;
145 int aborted; 145 int aborted;
146 struct nvme_queue *nvmeq; 146 struct nvme_queue *nvmeq;
147 struct nvme_iod iod[0];
147}; 148};
148 149
150/*
151 * Max size of iod being embedded in the request payload
152 */
153#define NVME_INT_PAGES 2
154#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
155
156/*
157 * Will slightly overestimate the number of pages needed. This is OK
158 * as it only leads to a small amount of wasted memory for the lifetime of
159 * the I/O.
160 */
161static int nvme_npages(unsigned size, struct nvme_dev *dev)
162{
163 unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
164 return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
165}
166
167static unsigned int nvme_cmd_size(struct nvme_dev *dev)
168{
169 unsigned int ret = sizeof(struct nvme_cmd_info);
170
171 ret += sizeof(struct nvme_iod);
172 ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
173 ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
174
175 return ret;
176}
177
149static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 178static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
150 unsigned int hctx_idx) 179 unsigned int hctx_idx)
151{ 180{
@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
218 blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); 247 blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
219} 248}
220 249
250static void *iod_get_private(struct nvme_iod *iod)
251{
252 return (void *) (iod->private & ~0x1UL);
253}
254
255/*
256 * If bit 0 is set, the iod is embedded in the request payload.
257 */
258static bool iod_should_kfree(struct nvme_iod *iod)
259{
260 return (iod->private & 0x01) == 0;
261}
262
221/* Special values must be less than 0x1000 */ 263/* Special values must be less than 0x1000 */
222#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) 264#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
223#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) 265#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
@@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
361 return ((void *)iod) + iod->offset; 403 return ((void *)iod) + iod->offset;
362} 404}
363 405
364/* 406static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
365 * Will slightly overestimate the number of pages needed. This is OK 407 unsigned nseg, unsigned long private)
366 * as it only leads to a small amount of wasted memory for the lifetime of
367 * the I/O.
368 */
369static int nvme_npages(unsigned size, struct nvme_dev *dev)
370{ 408{
371 unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); 409 iod->private = private;
372 return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); 410 iod->offset = offsetof(struct nvme_iod, sg[nseg]);
411 iod->npages = -1;
412 iod->length = nbytes;
413 iod->nents = 0;
373} 414}
374 415
375static struct nvme_iod * 416static struct nvme_iod *
376nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) 417__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
418 unsigned long priv, gfp_t gfp)
377{ 419{
378 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + 420 struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
379 sizeof(__le64 *) * nvme_npages(nbytes, dev) + 421 sizeof(__le64 *) * nvme_npages(bytes, dev) +
380 sizeof(struct scatterlist) * nseg, gfp); 422 sizeof(struct scatterlist) * nseg, gfp);
381 423
382 if (iod) { 424 if (iod)
383 iod->offset = offsetof(struct nvme_iod, sg[nseg]); 425 iod_init(iod, bytes, nseg, priv);
384 iod->npages = -1;
385 iod->length = nbytes;
386 iod->nents = 0;
387 iod->first_dma = 0ULL;
388 }
389 426
390 return iod; 427 return iod;
391} 428}
392 429
430static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
431 gfp_t gfp)
432{
433 unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
434 sizeof(struct nvme_dsm_range);
435 unsigned long mask = 0;
436 struct nvme_iod *iod;
437
438 if (rq->nr_phys_segments <= NVME_INT_PAGES &&
439 size <= NVME_INT_BYTES(dev)) {
440 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
441
442 iod = cmd->iod;
443 mask = 0x01;
444 iod_init(iod, size, rq->nr_phys_segments,
445 (unsigned long) rq | 0x01);
446 return iod;
447 }
448
449 return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
450 (unsigned long) rq, gfp);
451}
452
393void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) 453void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
394{ 454{
395 const int last_prp = dev->page_size / 8 - 1; 455 const int last_prp = dev->page_size / 8 - 1;
@@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
405 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); 465 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
406 prp_dma = next_prp_dma; 466 prp_dma = next_prp_dma;
407 } 467 }
408 kfree(iod); 468
469 if (iod_should_kfree(iod))
470 kfree(iod);
409} 471}
410 472
411static int nvme_error_status(u16 status) 473static int nvme_error_status(u16 status)
@@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
424 struct nvme_completion *cqe) 486 struct nvme_completion *cqe)
425{ 487{
426 struct nvme_iod *iod = ctx; 488 struct nvme_iod *iod = ctx;
427 struct request *req = iod->private; 489 struct request *req = iod_get_private(iod);
428 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); 490 struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
429 491
430 u16 status = le16_to_cpup(&cqe->status) >> 1; 492 u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
585static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, 647static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
586 struct nvme_ns *ns) 648 struct nvme_ns *ns)
587{ 649{
588 struct request *req = iod->private; 650 struct request *req = iod_get_private(iod);
589 struct nvme_command *cmnd; 651 struct nvme_command *cmnd;
590 u16 control = 0; 652 u16 control = 0;
591 u32 dsmgmt = 0; 653 u32 dsmgmt = 0;
@@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
626 struct request *req = bd->rq; 688 struct request *req = bd->rq;
627 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); 689 struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
628 struct nvme_iod *iod; 690 struct nvme_iod *iod;
629 int psegs = req->nr_phys_segments;
630 enum dma_data_direction dma_dir; 691 enum dma_data_direction dma_dir;
631 unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
632 sizeof(struct nvme_dsm_range);
633 692
634 iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); 693 iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
635 if (!iod) 694 if (!iod)
636 return BLK_MQ_RQ_QUEUE_BUSY; 695 return BLK_MQ_RQ_QUEUE_BUSY;
637 696
638 iod->private = req;
639
640 if (req->cmd_flags & REQ_DISCARD) { 697 if (req->cmd_flags & REQ_DISCARD) {
641 void *range; 698 void *range;
642 /* 699 /*
@@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
651 goto retry_cmd; 708 goto retry_cmd;
652 iod_list(iod)[0] = (__le64 *)range; 709 iod_list(iod)[0] = (__le64 *)range;
653 iod->npages = 0; 710 iod->npages = 0;
654 } else if (psegs) { 711 } else if (req->nr_phys_segments) {
655 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; 712 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
656 713
657 sg_init_table(iod->sg, psegs); 714 sg_init_table(iod->sg, req->nr_phys_segments);
658 iod->nents = blk_rq_map_sg(req->q, req, iod->sg); 715 iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
659 if (!iod->nents) 716 if (!iod->nents)
660 goto error_cmd; 717 goto error_cmd;
@@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
1137 1194
1138static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1195static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1139{ 1196{
1140 LLIST_HEAD(q_list);
1141 struct nvme_queue *nvmeq, *next;
1142 struct llist_node *entry;
1143 int i; 1197 int i;
1144 1198
1145 for (i = dev->queue_count - 1; i >= lowest; i--) { 1199 for (i = dev->queue_count - 1; i >= lowest; i--) {
1146 struct nvme_queue *nvmeq = dev->queues[i]; 1200 struct nvme_queue *nvmeq = dev->queues[i];
1147 llist_add(&nvmeq->node, &q_list);
1148 dev->queue_count--; 1201 dev->queue_count--;
1149 dev->queues[i] = NULL; 1202 dev->queues[i] = NULL;
1150 }
1151 synchronize_rcu();
1152 entry = llist_del_all(&q_list);
1153 llist_for_each_entry_safe(nvmeq, next, entry, node)
1154 nvme_free_queue(nvmeq); 1203 nvme_free_queue(nvmeq);
1204 }
1155} 1205}
1156 1206
1157/** 1207/**
@@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1408 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; 1458 dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
1409 dev->admin_tagset.timeout = ADMIN_TIMEOUT; 1459 dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1410 dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); 1460 dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
1411 dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); 1461 dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
1412 dev->admin_tagset.driver_data = dev; 1462 dev->admin_tagset.driver_data = dev;
1413 1463
1414 if (blk_mq_alloc_tag_set(&dev->admin_tagset)) 1464 if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
1522 } 1572 }
1523 1573
1524 err = -ENOMEM; 1574 err = -ENOMEM;
1525 iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); 1575 iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
1526 if (!iod) 1576 if (!iod)
1527 goto put_pages; 1577 goto put_pages;
1528 1578
@@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
2148 dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); 2198 dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
2149 dev->tagset.queue_depth = 2199 dev->tagset.queue_depth =
2150 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; 2200 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
2151 dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); 2201 dev->tagset.cmd_size = nvme_cmd_size(dev);
2152 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2202 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2153 dev->tagset.driver_data = dev; 2203 dev->tagset.driver_data = dev;
2154 2204
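The sizing in nvme_cmd_size() above is worth working through once. With a 4 KiB device page, NVME_INT_BYTES(dev) is 8 KiB; a worst-case (unaligned) 8 KiB transfer needs DIV_ROUND_UP(8192 + 4096, 4096) = 3 PRP entries, and at 8 bytes per entry those fit in a single PRP list page, so the embedded iod reserves one list pointer plus NVME_INT_PAGES scatterlist entries per request. A small sketch that evaluates the same two expressions, assuming the host PAGE_SIZE equals the 4 KiB device page for simplicity:

/*
 * Worked example of the nvme_npages() arithmetic above. Structure
 * sizes are kernel-internal, so only the DIV_ROUND_UP() terms that
 * size the embedded iod are evaluated here.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned page_size = 4096;		/* assumed dev->page_size */
	unsigned size = 2 * page_size;		/* NVME_INT_BYTES(dev) */

	/* worst-case PRP entries for an 8 KiB, possibly unaligned, I/O */
	unsigned nprps = DIV_ROUND_UP(size + page_size, page_size);
	/* PRP list pages, 8 bytes per entry */
	unsigned npages = DIV_ROUND_UP(8 * nprps, page_size - 8);

	printf("nprps = %u, prp list pages = %u\n", nprps, npages);	/* 3, 1 */
	return 0;
}

This is why requests with at most NVME_INT_PAGES physical segments and at most 8 KiB of data can use the iod preallocated in the blk-mq command payload and skip the per-IO kmalloc/kfree entirely.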
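Relatedly, iod_get_private() and iod_should_kfree() above exploit the fact that a struct request pointer is at least word aligned, leaving bit 0 of iod->private free to record whether the iod was embedded in the command payload or kmalloc'ed. A self-contained sketch of that tagged-pointer trick (stand-in types, not the driver's):

/*
 * Tagged-pointer sketch: stash a one-bit flag in the low bit of an
 * aligned pointer, as the iod->private encoding above does.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct request { int tag; };		/* stand-in for the real struct */

static void *get_private(uintptr_t private)
{
	return (void *)(private & ~(uintptr_t)1);
}

static bool should_kfree(uintptr_t private)
{
	return (private & 1) == 0;
}

int main(void)
{
	struct request rq = { .tag = 42 };

	/* heap and embedded encodings differ only in bit 0 */
	uintptr_t heap_iod = (uintptr_t)&rq;
	uintptr_t embedded_iod = (uintptr_t)&rq | 1;

	assert(get_private(heap_iod) == &rq);
	assert(get_private(embedded_iod) == &rq);
	printf("heap: kfree=%d, embedded: kfree=%d\n",
	       should_kfree(heap_iod), should_kfree(embedded_iod));
	return 0;
}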
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index cc90a840e616..375d28851860 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -214,6 +214,15 @@ enum blkif_protocol {
214 BLKIF_PROTOCOL_X86_64 = 3, 214 BLKIF_PROTOCOL_X86_64 = 3,
215}; 215};
216 216
217/*
218 * Default protocol if the frontend doesn't specify one.
219 */
220#ifdef CONFIG_X86
221# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
222#else
223# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
224#endif
225
217struct xen_vbd { 226struct xen_vbd {
218 /* What the domain refers to this vbd as. */ 227 /* What the domain refers to this vbd as. */
219 blkif_vdev_t handle; 228 blkif_vdev_t handle;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 630a489e757d..e3afe97280b1 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
868 return err; 868 return err;
869 } 869 }
870 870
871 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; 871 be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
872 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", 872 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
873 "%63s", protocol, NULL); 873 "%63s", protocol, NULL);
874 if (err) 874 if (err)
875 strcpy(protocol, "unspecified, assuming native"); 875 strcpy(protocol, "unspecified, assuming default");
876 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) 876 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
877 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; 877 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
878 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) 878 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2236c6f31608..7f66d2e08f19 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
1511 merge_bio.tail = copy[i].request->biotail; 1511 merge_bio.tail = copy[i].request->biotail;
1512 bio_list_merge(&bio_list, &merge_bio); 1512 bio_list_merge(&bio_list, &merge_bio);
1513 copy[i].request->bio = NULL; 1513 copy[i].request->bio = NULL;
1514 blk_put_request(copy[i].request); 1514 blk_end_request_all(copy[i].request, 0);
1515 } 1515 }
1516 1516
1517 kfree(copy); 1517 kfree(copy);
@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
1534 req->bio = NULL; 1534 req->bio = NULL;
1535 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) 1535 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
1536 pr_alert("diskcache flush request found!\n"); 1536 pr_alert("diskcache flush request found!\n");
1537 __blk_put_request(info->rq, req); 1537 __blk_end_request_all(req, 0);
1538 } 1538 }
1539 spin_unlock_irq(&info->io_lock); 1539 spin_unlock_irq(&info->io_lock);
1540 1540