author    Linus Torvalds <torvalds@linux-foundation.org>    2018-12-28 16:19:59 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2018-12-28 16:19:59 -0500
commit    0e9da3fbf7d81f0f913b491c8de1ba7883d4f217 (patch)
tree      2b3d25e3be60bf4ee40b4690c7bb9d6fa499ae69
parent    b12a9124eeb71d766a3e3eb594ebbb3fefc66902 (diff)
parent    00203ba40d40d7f33857416adfb18adaf0e40123 (diff)
Merge tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
 "This is the main pull request for block/storage for 4.21.

  Larger than usual, it was a busy round with lots of goodies queued up.
  Most notable is the removal of the old IO stack, which has been a long
  time coming. No new features for a while, everything coming in this
  week has all been fixes for things that were previously merged.

  This contains:

   - Use atomic counters instead of semaphores for mtip32xx (Arnd)

   - Cleanup of the mtip32xx request setup (Christoph)

   - Fix for circular locking dependency in loop (Jan, Tetsuo)

   - bcache (Coly, Guoju, Shenghui)
      * Optimizations for writeback caching
      * Various fixes and improvements

   - nvme (Chaitanya, Christoph, Sagi, Jay, me, Keith)
      * host and target support for NVMe over TCP
      * Error log page support
      * Support for separate read/write/poll queues
      * Much improved polling
      * discard OOM fallback
      * Tracepoint improvements

   - lightnvm (Hans, Hua, Igor, Matias, Javier)
      * Igor added packed metadata to pblk. Now drives without metadata
        per LBA can be used as well.
      * Fix from Geert on uninitialized value on chunk metadata reads.
      * Fixes from Hans and Javier to pblk recovery and write path.
      * Fix from Hua Su to fix a race condition in the pblk recovery code.
      * Scan optimization added to pblk recovery from Zhoujie.
      * Small geometry cleanup from me.

   - Conversion of the last few drivers that used the legacy path to
     blk-mq (me)

   - Removal of legacy IO path in SCSI (me, Christoph)

   - Removal of legacy IO stack and schedulers (me)

   - Support for much better polling, now without interrupts at all.
     blk-mq adds support for multiple queue maps, which enables us to
     have a map per type. This in turn enables nvme to have separate
     completion queues for polling, which can then be interrupt-less.
     Also means we're ready for async polled IO, which is hopefully
     coming in the next release.

   - Killing of (now) unused block exports (Christoph)

   - Unification of the blk-rq-qos and blk-wbt wait handling (Josef)

   - Support for zoned testing with null_blk (Masato)

   - sx8 conversion to per-host tag sets (Christoph)

   - IO priority improvements (Damien)

   - mq-deadline zoned fix (Damien)

   - Ref count blkcg series (Dennis)

   - Lots of blk-mq improvements and speedups (me)

   - sbitmap scalability improvements (me)

   - Make core inflight IO accounting per-cpu (Mikulas)

   - Export timeout setting in sysfs (Weiping)

   - Cleanup the direct issue path (Jianchao)

   - Export blk-wbt internals in block debugfs for easier debugging (Ming)

   - Lots of other fixes and improvements"

* tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block: (364 commits)
  kyber: use sbitmap add_wait_queue/list_del wait helpers
  sbitmap: add helpers for add/del wait queue handling
  block: save irq state in blkg_lookup_create()
  dm: don't reuse bio for flushes
  nvme-pci: trace SQ status on completions
  nvme-rdma: implement polling queue map
  nvme-fabrics: allow user to pass in nr_poll_queues
  nvme-fabrics: allow nvmf_connect_io_queue to poll
  nvme-core: optionally poll sync commands
  block: make request_to_qc_t public
  nvme-tcp: fix spelling mistake "attepmpt" -> "attempt"
  nvme-tcp: fix endianess annotations
  nvmet-tcp: fix endianess annotations
  nvme-pci: refactor nvme_poll_irqdisable to make sparse happy
  nvme-pci: only set nr_maps to 2 if poll queues are supported
  nvmet: use a macro for default error location
  nvmet: fix comparison of a u16 with -1
  blk-mq: enable IO poll if .nr_queues of type poll > 0
  blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight()
  blk-mq: skip zero-queue maps in blk_mq_map_swqueue
  ...
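The "multiple queue maps" item above is what makes interrupt-less polling possible: a driver can now register separate default/read/poll maps from its ->map_queues() callback. The sketch below is an illustration only, not code from this series; "struct my_ctrl" and its queue counts are invented, while blk_mq_tag_set.map[], the HCTX_TYPE_* constants and blk_mq_map_queues() are the interfaces this cycle introduces.

    /*
     * Hedged sketch: split hardware queues into default, read and poll maps.
     * The poll map is the one blk-mq polls without taking interrupts.
     */
    #include <linux/blk-mq.h>

    struct my_ctrl {                    /* invented for the example */
        unsigned int nr_default_queues;
        unsigned int nr_read_queues;
        unsigned int nr_poll_queues;
    };

    static int my_map_queues(struct blk_mq_tag_set *set)
    {
        struct my_ctrl *ctrl = set->driver_data;
        struct blk_mq_queue_map *map;

        map = &set->map[HCTX_TYPE_DEFAULT];        /* writes + fallback */
        map->nr_queues = ctrl->nr_default_queues;
        map->queue_offset = 0;
        blk_mq_map_queues(map);

        map = &set->map[HCTX_TYPE_READ];           /* dedicated read queues */
        map->nr_queues = ctrl->nr_read_queues;
        map->queue_offset = ctrl->nr_default_queues;
        blk_mq_map_queues(map);

        map = &set->map[HCTX_TYPE_POLL];           /* polled, no interrupts */
        map->nr_queues = ctrl->nr_poll_queues;
        map->queue_offset = ctrl->nr_default_queues + ctrl->nr_read_queues;
        blk_mq_map_queues(map);

        return 0;
    }

A driver would presumably wire this up by setting .map_queues = my_map_queues in its blk_mq_ops and a matching nr_maps in the tag set, the way the nvme polling patches in this pull do.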
-rw-r--r-- Documentation/ABI/testing/sysfs-block | 12
-rw-r--r-- Documentation/admin-guide/cgroup-v2.rst | 8
-rw-r--r-- Documentation/block/biodoc.txt | 88
-rw-r--r-- Documentation/block/cfq-iosched.txt | 291
-rw-r--r-- Documentation/block/queue-sysfs.txt | 29
-rw-r--r-- Documentation/scsi/scsi-parameters.txt | 5
-rw-r--r-- block/Kconfig | 6
-rw-r--r-- block/Kconfig.iosched | 61
-rw-r--r-- block/Makefile | 5
-rw-r--r-- block/bfq-cgroup.c | 6
-rw-r--r-- block/bfq-iosched.c | 21
-rw-r--r-- block/bio-integrity.c | 2
-rw-r--r-- block/bio.c | 202
-rw-r--r-- block/blk-cgroup.c | 272
-rw-r--r-- block/blk-core.c | 2066
-rw-r--r-- block/blk-exec.c | 20
-rw-r--r-- block/blk-flush.c | 188
-rw-r--r-- block/blk-ioc.c | 54
-rw-r--r-- block/blk-iolatency.c | 75
-rw-r--r-- block/blk-merge.c | 53
-rw-r--r-- block/blk-mq-cpumap.c | 19
-rw-r--r-- block/blk-mq-debugfs.c | 147
-rw-r--r-- block/blk-mq-debugfs.h | 17
-rw-r--r-- block/blk-mq-pci.c | 10
-rw-r--r-- block/blk-mq-rdma.c | 8
-rw-r--r-- block/blk-mq-sched.c | 82
-rw-r--r-- block/blk-mq-sched.h | 25
-rw-r--r-- block/blk-mq-sysfs.c | 35
-rw-r--r-- block/blk-mq-tag.c | 41
-rw-r--r-- block/blk-mq-virtio.c | 8
-rw-r--r-- block/blk-mq.c | 757
-rw-r--r-- block/blk-mq.h | 70
-rw-r--r-- block/blk-pm.c | 20
-rw-r--r-- block/blk-pm.h | 6
-rw-r--r-- block/blk-rq-qos.c | 154
-rw-r--r-- block/blk-rq-qos.h | 96
-rw-r--r-- block/blk-settings.c | 65
-rw-r--r-- block/blk-softirq.c | 27
-rw-r--r-- block/blk-stat.c | 4
-rw-r--r-- block/blk-stat.h | 5
-rw-r--r-- block/blk-sysfs.c | 107
-rw-r--r-- block/blk-tag.c | 378
-rw-r--r-- block/blk-throttle.c | 39
-rw-r--r-- block/blk-timeout.c | 117
-rw-r--r-- block/blk-wbt.c | 176
-rw-r--r-- block/blk-zoned.c | 2
-rw-r--r-- block/blk.h | 188
-rw-r--r-- block/bounce.c | 3
-rw-r--r-- block/bsg-lib.c | 146
-rw-r--r-- block/bsg.c | 2
-rw-r--r-- block/cfq-iosched.c | 4916
-rw-r--r-- block/deadline-iosched.c | 560
-rw-r--r-- block/elevator.c | 477
-rw-r--r-- block/genhd.c | 63
-rw-r--r-- block/kyber-iosched.c | 37
-rw-r--r-- block/mq-deadline.c | 15
-rw-r--r-- block/noop-iosched.c | 124
-rw-r--r-- block/partition-generic.c | 18
-rw-r--r-- drivers/ata/libata-eh.c | 4
-rw-r--r-- drivers/block/aoe/aoe.h | 4
-rw-r--r-- drivers/block/aoe/aoeblk.c | 1
-rw-r--r-- drivers/block/aoe/aoecmd.c | 27
-rw-r--r-- drivers/block/aoe/aoedev.c | 11
-rw-r--r-- drivers/block/aoe/aoemain.c | 2
-rw-r--r-- drivers/block/ataflop.c | 26
-rw-r--r-- drivers/block/drbd/drbd_main.c | 2
-rw-r--r-- drivers/block/floppy.c | 6
-rw-r--r-- drivers/block/loop.c | 415
-rw-r--r-- drivers/block/loop.h | 1
-rw-r--r-- drivers/block/mtip32xx/mtip32xx.c | 226
-rw-r--r-- drivers/block/mtip32xx/mtip32xx.h | 48
-rw-r--r-- drivers/block/nbd.c | 3
-rw-r--r-- drivers/block/null_blk.h | 1
-rw-r--r-- drivers/block/null_blk_main.c | 21
-rw-r--r-- drivers/block/null_blk_zoned.c | 27
-rw-r--r-- drivers/block/paride/pd.c | 30
-rw-r--r-- drivers/block/pktcdvd.c | 2
-rw-r--r-- drivers/block/skd_main.c | 16
-rw-r--r-- drivers/block/sunvdc.c | 153
-rw-r--r-- drivers/block/sx8.c | 434
-rw-r--r-- drivers/block/umem.c | 3
-rw-r--r-- drivers/block/virtio_blk.c | 17
-rw-r--r-- drivers/ide/ide-atapi.c | 27
-rw-r--r-- drivers/ide/ide-cd.c | 179
-rw-r--r-- drivers/ide/ide-devsets.c | 4
-rw-r--r-- drivers/ide/ide-disk.c | 15
-rw-r--r-- drivers/ide/ide-eh.c | 2
-rw-r--r-- drivers/ide/ide-floppy.c | 2
-rw-r--r-- drivers/ide/ide-io.c | 112
-rw-r--r-- drivers/ide/ide-park.c | 8
-rw-r--r-- drivers/ide/ide-pm.c | 46
-rw-r--r-- drivers/ide/ide-probe.c | 69
-rw-r--r-- drivers/ide/ide-tape.c | 2
-rw-r--r-- drivers/ide/ide-taskfile.c | 2
-rw-r--r-- drivers/lightnvm/core.c | 25
-rw-r--r-- drivers/lightnvm/pblk-core.c | 77
-rw-r--r-- drivers/lightnvm/pblk-init.c | 103
-rw-r--r-- drivers/lightnvm/pblk-map.c | 63
-rw-r--r-- drivers/lightnvm/pblk-rb.c | 5
-rw-r--r-- drivers/lightnvm/pblk-read.c | 66
-rw-r--r-- drivers/lightnvm/pblk-recovery.c | 46
-rw-r--r-- drivers/lightnvm/pblk-rl.c | 5
-rw-r--r-- drivers/lightnvm/pblk-sysfs.c | 7
-rw-r--r-- drivers/lightnvm/pblk-write.c | 64
-rw-r--r-- drivers/lightnvm/pblk.h | 43
-rw-r--r-- drivers/md/bcache/bcache.h | 20
-rw-r--r-- drivers/md/bcache/btree.c | 5
-rw-r--r-- drivers/md/bcache/btree.h | 18
-rw-r--r-- drivers/md/bcache/debug.c | 3
-rw-r--r-- drivers/md/bcache/journal.c | 2
-rw-r--r-- drivers/md/bcache/request.c | 6
-rw-r--r-- drivers/md/bcache/super.c | 48
-rw-r--r-- drivers/md/bcache/sysfs.c | 61
-rw-r--r-- drivers/md/bcache/writeback.c | 30
-rw-r--r-- drivers/md/bcache/writeback.h | 12
-rw-r--r-- drivers/md/dm-core.h | 5
-rw-r--r-- drivers/md/dm-rq.c | 7
-rw-r--r-- drivers/md/dm-table.c | 4
-rw-r--r-- drivers/md/dm.c | 79
-rw-r--r-- drivers/md/md.c | 7
-rw-r--r-- drivers/md/raid0.c | 2
-rw-r--r-- drivers/memstick/core/ms_block.c | 109
-rw-r--r-- drivers/memstick/core/ms_block.h | 1
-rw-r--r-- drivers/memstick/core/mspro_block.c | 121
-rw-r--r-- drivers/mmc/core/block.c | 26
-rw-r--r-- drivers/mmc/core/queue.c | 110
-rw-r--r-- drivers/mmc/core/queue.h | 4
-rw-r--r-- drivers/net/wireless/ath/ath6kl/cfg80211.c | 2
-rw-r--r-- drivers/net/wireless/ath/ath6kl/common.h | 2
-rw-r--r-- drivers/net/wireless/ath/ath6kl/wmi.c | 6
-rw-r--r-- drivers/net/wireless/ath/ath6kl/wmi.h | 6
-rw-r--r-- drivers/nvdimm/pmem.c | 2
-rw-r--r-- drivers/nvme/host/Kconfig | 15
-rw-r--r-- drivers/nvme/host/Makefile | 3
-rw-r--r-- drivers/nvme/host/core.c | 191
-rw-r--r-- drivers/nvme/host/fabrics.c | 61
-rw-r--r-- drivers/nvme/host/fabrics.h | 17
-rw-r--r-- drivers/nvme/host/fc.c | 43
-rw-r--r-- drivers/nvme/host/lightnvm.c | 33
-rw-r--r-- drivers/nvme/host/multipath.c | 20
-rw-r--r-- drivers/nvme/host/nvme.h | 24
-rw-r--r-- drivers/nvme/host/pci.c | 518
-rw-r--r-- drivers/nvme/host/rdma.c | 119
-rw-r--r-- drivers/nvme/host/tcp.c | 2278
-rw-r--r-- drivers/nvme/host/trace.c | 3
-rw-r--r-- drivers/nvme/host/trace.h | 27
-rw-r--r-- drivers/nvme/target/Kconfig | 10
-rw-r--r-- drivers/nvme/target/Makefile | 2
-rw-r--r-- drivers/nvme/target/admin-cmd.c | 146
-rw-r--r-- drivers/nvme/target/configfs.c | 43
-rw-r--r-- drivers/nvme/target/core.c | 220
-rw-r--r-- drivers/nvme/target/discovery.c | 139
-rw-r--r-- drivers/nvme/target/fabrics-cmd.c | 64
-rw-r--r-- drivers/nvme/target/fc.c | 66
-rw-r--r-- drivers/nvme/target/io-cmd-bdev.c | 89
-rw-r--r-- drivers/nvme/target/io-cmd-file.c | 165
-rw-r--r-- drivers/nvme/target/loop.c | 2
-rw-r--r-- drivers/nvme/target/nvmet.h | 68
-rw-r--r-- drivers/nvme/target/rdma.c | 12
-rw-r--r-- drivers/nvme/target/tcp.c | 1737
-rw-r--r-- drivers/s390/block/dasd_ioctl.c | 22
-rw-r--r-- drivers/scsi/Kconfig | 12
-rw-r--r-- drivers/scsi/bnx2i/bnx2i_hwi.c | 8
-rw-r--r-- drivers/scsi/csiostor/csio_scsi.c | 8
-rw-r--r-- drivers/scsi/cxlflash/main.c | 6
-rw-r--r-- drivers/scsi/device_handler/scsi_dh_alua.c | 21
-rw-r--r-- drivers/scsi/device_handler/scsi_dh_emc.c | 8
-rw-r--r-- drivers/scsi/device_handler/scsi_dh_hp_sw.c | 7
-rw-r--r-- drivers/scsi/device_handler/scsi_dh_rdac.c | 7
-rw-r--r-- drivers/scsi/fnic/fnic_scsi.c | 4
-rw-r--r-- drivers/scsi/hosts.c | 29
-rw-r--r-- drivers/scsi/libsas/sas_ata.c | 5
-rw-r--r-- drivers/scsi/libsas/sas_scsi_host.c | 10
-rw-r--r-- drivers/scsi/lpfc/lpfc_scsi.c | 2
-rw-r--r-- drivers/scsi/osd/osd_initiator.c | 4
-rw-r--r-- drivers/scsi/osst.c | 2
-rw-r--r-- drivers/scsi/qedi/qedi_main.c | 3
-rw-r--r-- drivers/scsi/qla2xxx/qla_nvme.c | 12
-rw-r--r-- drivers/scsi/qla2xxx/qla_os.c | 37
-rw-r--r-- drivers/scsi/scsi.c | 5
-rw-r--r-- drivers/scsi/scsi_debug.c | 3
-rw-r--r-- drivers/scsi/scsi_error.c | 24
-rw-r--r-- drivers/scsi/scsi_lib.c | 806
-rw-r--r-- drivers/scsi/scsi_priv.h | 1
-rw-r--r-- drivers/scsi/scsi_scan.c | 10
-rw-r--r-- drivers/scsi/scsi_sysfs.c | 8
-rw-r--r-- drivers/scsi/scsi_transport_fc.c | 71
-rw-r--r-- drivers/scsi/scsi_transport_iscsi.c | 7
-rw-r--r-- drivers/scsi/scsi_transport_sas.c | 10
-rw-r--r-- drivers/scsi/sd.c | 85
-rw-r--r-- drivers/scsi/sd.h | 6
-rw-r--r-- drivers/scsi/sd_zbc.c | 10
-rw-r--r-- drivers/scsi/sg.c | 2
-rw-r--r-- drivers/scsi/smartpqi/smartpqi_init.c | 3
-rw-r--r-- drivers/scsi/sr.c | 12
-rw-r--r-- drivers/scsi/st.c | 2
-rw-r--r-- drivers/scsi/ufs/ufs_bsg.c | 4
-rw-r--r-- drivers/scsi/virtio_scsi.c | 3
-rw-r--r-- drivers/target/iscsi/iscsi_target_util.c | 12
-rw-r--r-- drivers/target/target_core_pscsi.c | 2
-rw-r--r-- fs/aio.c | 13
-rw-r--r-- fs/block_dev.c | 50
-rw-r--r-- fs/buffer.c | 10
-rw-r--r-- fs/direct-io.c | 4
-rw-r--r-- fs/ext4/page-io.c | 2
-rw-r--r-- fs/iomap.c | 16
-rw-r--r-- include/linux/bio.h | 29
-rw-r--r-- include/linux/blk-cgroup.h | 227
-rw-r--r-- include/linux/blk-mq-pci.h | 4
-rw-r--r-- include/linux/blk-mq-rdma.h | 2
-rw-r--r-- include/linux/blk-mq-virtio.h | 4
-rw-r--r-- include/linux/blk-mq.h | 83
-rw-r--r-- include/linux/blk_types.h | 24
-rw-r--r-- include/linux/blkdev.h | 250
-rw-r--r-- include/linux/bsg-lib.h | 6
-rw-r--r-- include/linux/cgroup.h | 2
-rw-r--r-- include/linux/elevator.h | 94
-rw-r--r-- include/linux/fs.h | 2
-rw-r--r-- include/linux/genhd.h | 57
-rw-r--r-- include/linux/ide.h | 14
-rw-r--r-- include/linux/init.h | 1
-rw-r--r-- include/linux/ioprio.h | 13
-rw-r--r-- include/linux/lightnvm.h | 3
-rw-r--r-- include/linux/nvme-fc-driver.h | 17
-rw-r--r-- include/linux/nvme-tcp.h | 189
-rw-r--r-- include/linux/nvme.h | 73
-rw-r--r-- include/linux/sbitmap.h | 89
-rw-r--r-- include/linux/skbuff.h | 3
-rw-r--r-- include/linux/uio.h | 5
-rw-r--r-- include/linux/writeback.h | 5
-rw-r--r-- include/scsi/scsi_cmnd.h | 6
-rw-r--r-- include/scsi/scsi_dh.h | 2
-rw-r--r-- include/scsi/scsi_driver.h | 3
-rw-r--r-- include/scsi/scsi_host.h | 18
-rw-r--r-- include/scsi/scsi_tcq.h | 14
-rw-r--r-- include/trace/events/bcache.h | 27
-rw-r--r-- include/uapi/linux/aio_abi.h | 2
-rw-r--r-- init/do_mounts_initrd.c | 3
-rw-r--r-- init/initramfs.c | 6
-rw-r--r-- init/main.c | 12
-rw-r--r-- kernel/cgroup/cgroup.c | 48
-rw-r--r-- kernel/trace/blktrace.c | 4
-rw-r--r-- lib/iov_iter.c | 19
-rw-r--r-- lib/sbitmap.c | 170
-rw-r--r-- mm/page_io.c | 9
-rw-r--r-- net/core/datagram.c | 159
246 files changed, 10439 insertions, 14340 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index dea212db9df3..7710d4022b19 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -244,7 +244,7 @@ Description:
244 244
245What: /sys/block/<disk>/queue/zoned 245What: /sys/block/<disk>/queue/zoned
246Date: September 2016 246Date: September 2016
247Contact: Damien Le Moal <damien.lemoal@hgst.com> 247Contact: Damien Le Moal <damien.lemoal@wdc.com>
248Description: 248Description:
249 zoned indicates if the device is a zoned block device 249 zoned indicates if the device is a zoned block device
250 and the zone model of the device if it is indeed zoned. 250 and the zone model of the device if it is indeed zoned.
@@ -259,6 +259,14 @@ Description:
259 zone commands, they will be treated as regular block 259 zone commands, they will be treated as regular block
260 devices and zoned will report "none". 260 devices and zoned will report "none".
261 261
262What: /sys/block/<disk>/queue/nr_zones
263Date: November 2018
264Contact: Damien Le Moal <damien.lemoal@wdc.com>
265Description:
266 nr_zones indicates the total number of zones of a zoned block
267 device ("host-aware" or "host-managed" zone model). For regular
268 block devices, the value is always 0.
269
262What: /sys/block/<disk>/queue/chunk_sectors 270What: /sys/block/<disk>/queue/chunk_sectors
263Date: September 2016 271Date: September 2016
264Contact: Hannes Reinecke <hare@suse.com> 272Contact: Hannes Reinecke <hare@suse.com>
@@ -268,6 +276,6 @@ Description:
268 indicates the size in 512B sectors of the RAID volume 276 indicates the size in 512B sectors of the RAID volume
269 stripe segment. For a zoned block device, either 277 stripe segment. For a zoned block device, either
270 host-aware or host-managed, chunk_sectors indicates the 278 host-aware or host-managed, chunk_sectors indicates the
271 size of 512B sectors of the zones of the device, with 279 size in 512B sectors of the zones of the device, with
272 the eventual exception of the last zone of the device 280 the eventual exception of the last zone of the device
273 which may be smaller. 281 which may be smaller.
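As a quick user-space illustration of the ABI documented in this hunk (the device name "sda" is an assumption, not something the ABI file prescribes):

    /* Read the zone model of a disk via the sysfs attribute described above;
     * expect "none", "host-aware" or "host-managed". */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char model[32] = "";
        FILE *f = fopen("/sys/block/sda/queue/zoned", "r");

        if (!f)
            return 1;
        if (fscanf(f, "%31s", model) == 1)
            printf("zone model: %s (%s)\n", model,
                   strcmp(model, "none") ? "zoned" : "not zoned");
        fclose(f);
        return 0;
    }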
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 476722b7b636..baf19bf28385 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1879,8 +1879,10 @@ following two functions.
1879 1879
1880 wbc_init_bio(@wbc, @bio) 1880 wbc_init_bio(@wbc, @bio)
1881 Should be called for each bio carrying writeback data and 1881 Should be called for each bio carrying writeback data and
1882 associates the bio with the inode's owner cgroup. Can be 1882 associates the bio with the inode's owner cgroup and the
1883 called anytime between bio allocation and submission. 1883 corresponding request queue. This must be called after
1884 a queue (device) has been associated with the bio and
1885 before submission.
1884 1886
1885 wbc_account_io(@wbc, @page, @bytes) 1887 wbc_account_io(@wbc, @page, @bytes)
1886 Should be called for each data segment being written out. 1888 Should be called for each data segment being written out.
@@ -1899,7 +1901,7 @@ the configuration, the bio may be executed at a lower priority and if
1899the writeback session is holding shared resources, e.g. a journal 1901the writeback session is holding shared resources, e.g. a journal
1900entry, may lead to priority inversion. There is no one easy solution 1902entry, may lead to priority inversion. There is no one easy solution
1901for the problem. Filesystems can try to work around specific problem 1903for the problem. Filesystems can try to work around specific problem
1902cases by skipping wbc_init_bio() or using bio_associate_blkcg() 1904cases by skipping wbc_init_bio() and using bio_associate_blkg()
1903directly. 1905directly.
1904 1906
1905 1907
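A minimal sketch of the ordering the updated wording requires, assuming a filesystem writeback path; the function name and parameters below are invented for illustration, and only the call order (device association, then wbc_init_bio(), then submission) is the point:

    /* Illustration only: associate the device (and thus the request queue)
     * before wbc_init_bio(), and do both before submit_bio(). */
    #include <linux/bio.h>
    #include <linux/blkdev.h>
    #include <linux/writeback.h>

    static void example_submit_wb_page(struct writeback_control *wbc,
                                       struct block_device *bdev,
                                       struct page *page, sector_t sector)
    {
        struct bio *bio = bio_alloc(GFP_NOFS, 1);

        bio_set_dev(bio, bdev);            /* queue association comes first */
        bio->bi_iter.bi_sector = sector;
        bio->bi_opf = REQ_OP_WRITE;
        wbc_init_bio(wbc, bio);            /* now legal: bio has a queue */
        bio_add_page(bio, page, PAGE_SIZE, 0);
        wbc_account_io(wbc, page, PAGE_SIZE);
        submit_bio(bio);
    }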
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 207eca58efaa..ac18b488cb5e 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -65,7 +65,6 @@ Description of Contents:
65 3.2.3 I/O completion 65 3.2.3 I/O completion
66 3.2.4 Implications for drivers that do not interpret bios (don't handle 66 3.2.4 Implications for drivers that do not interpret bios (don't handle
67 multiple segments) 67 multiple segments)
68 3.2.5 Request command tagging
69 3.3 I/O submission 68 3.3 I/O submission
704. The I/O scheduler 694. The I/O scheduler
715. Scalability related changes 705. Scalability related changes
@@ -708,93 +707,6 @@ is crossed on completion of a transfer. (The end*request* functions should
708be used if only if the request has come down from block/bio path, not for 707be used if only if the request has come down from block/bio path, not for
709direct access requests which only specify rq->buffer without a valid rq->bio) 708direct access requests which only specify rq->buffer without a valid rq->bio)
710 709
7113.2.5 Generic request command tagging
712
7133.2.5.1 Tag helpers
714
715Block now offers some simple generic functionality to help support command
716queueing (typically known as tagged command queueing), ie manage more than
717one outstanding command on a queue at any given time.
718
719 blk_queue_init_tags(struct request_queue *q, int depth)
720
721 Initialize internal command tagging structures for a maximum
722 depth of 'depth'.
723
724 blk_queue_free_tags((struct request_queue *q)
725
726 Teardown tag info associated with the queue. This will be done
727 automatically by block if blk_queue_cleanup() is called on a queue
728 that is using tagging.
729
730The above are initialization and exit management, the main helpers during
731normal operations are:
732
733 blk_queue_start_tag(struct request_queue *q, struct request *rq)
734
735 Start tagged operation for this request. A free tag number between
736 0 and 'depth' is assigned to the request (rq->tag holds this number),
737 and 'rq' is added to the internal tag management. If the maximum depth
738 for this queue is already achieved (or if the tag wasn't started for
739 some other reason), 1 is returned. Otherwise 0 is returned.
740
741 blk_queue_end_tag(struct request_queue *q, struct request *rq)
742
743 End tagged operation on this request. 'rq' is removed from the internal
744 book keeping structures.
745
746To minimize struct request and queue overhead, the tag helpers utilize some
747of the same request members that are used for normal request queue management.
748This means that a request cannot both be an active tag and be on the queue
749list at the same time. blk_queue_start_tag() will remove the request, but
750the driver must remember to call blk_queue_end_tag() before signalling
751completion of the request to the block layer. This means ending tag
752operations before calling end_that_request_last()! For an example of a user
753of these helpers, see the IDE tagged command queueing support.
754
7553.2.5.2 Tag info
756
757Some block functions exist to query current tag status or to go from a
758tag number to the associated request. These are, in no particular order:
759
760 blk_queue_tagged(q)
761
762 Returns 1 if the queue 'q' is using tagging, 0 if not.
763
764 blk_queue_tag_request(q, tag)
765
766 Returns a pointer to the request associated with tag 'tag'.
767
768 blk_queue_tag_depth(q)
769
770 Return current queue depth.
771
772 blk_queue_tag_queue(q)
773
774 Returns 1 if the queue can accept a new queued command, 0 if we are
775 at the maximum depth already.
776
777 blk_queue_rq_tagged(rq)
778
779 Returns 1 if the request 'rq' is tagged.
780
7813.2.5.2 Internal structure
782
783Internally, block manages tags in the blk_queue_tag structure:
784
785 struct blk_queue_tag {
786 struct request **tag_index; /* array or pointers to rq */
787 unsigned long *tag_map; /* bitmap of free tags */
788 struct list_head busy_list; /* fifo list of busy tags */
789 int busy; /* queue depth */
790 int max_depth; /* max queue depth */
791 };
792
793Most of the above is simple and straight forward, however busy_list may need
794a bit of explaining. Normally we don't care too much about request ordering,
795but in the event of any barrier requests in the tag queue we need to ensure
796that requests are restarted in the order they were queue.
797
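Since the whole sub-section above is deleted along with the legacy I/O path, here is a compressed sketch of the driver pattern it used to describe, purely as historical illustration: the helper names come from the deleted text, the surrounding driver functions are invented, and none of this compiles against a kernel with this series applied.

    /* Historical illustration only: the legacy tagging pattern removed by
     * this series.  blk-mq drivers now get rq->tag assigned automatically. */
    static void legacy_issue(struct request_queue *q, struct request *rq)
    {
        if (blk_queue_start_tag(q, rq))    /* 1: no tag free, retry later */
            return;
        /* ... program the hardware using rq->tag ... */
    }

    static void legacy_complete(struct request_queue *q, struct request *rq)
    {
        blk_queue_end_tag(q, rq);          /* must precede completion */
        /* ... then end_that_request_last() / signal completion ... */
    }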
7983.3 I/O Submission 7103.3 I/O Submission
799 711
800The routine submit_bio() is used to submit a single io. Higher level i/o 712The routine submit_bio() is used to submit a single io. Higher level i/o
diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
deleted file mode 100644
index 895bd3813115..000000000000
--- a/Documentation/block/cfq-iosched.txt
+++ /dev/null
@@ -1,291 +0,0 @@
1CFQ (Complete Fairness Queueing)
2===============================
3
4The main aim of CFQ scheduler is to provide a fair allocation of the disk
5I/O bandwidth for all the processes which requests an I/O operation.
6
7CFQ maintains the per process queue for the processes which request I/O
8operation(synchronous requests). In case of asynchronous requests, all the
9requests from all the processes are batched together according to their
10process's I/O priority.
11
12CFQ ioscheduler tunables
13========================
14
15slice_idle
16----------
17This specifies how long CFQ should idle for next request on certain cfq queues
18(for sequential workloads) and service trees (for random workloads) before
19queue is expired and CFQ selects next queue to dispatch from.
20
21By default slice_idle is a non-zero value. That means by default we idle on
22queues/service trees. This can be very helpful on highly seeky media like
23single spindle SATA/SAS disks where we can cut down on overall number of
24seeks and see improved throughput.
25
26Setting slice_idle to 0 will remove all the idling on queues/service tree
27level and one should see an overall improved throughput on faster storage
28devices like multiple SATA/SAS disks in hardware RAID configuration. The down
29side is that isolation provided from WRITES also goes down and notion of
30IO priority becomes weaker.
31
32So depending on storage and workload, it might be useful to set slice_idle=0.
33In general I think for SATA/SAS disks and software RAID of SATA/SAS disks
34keeping slice_idle enabled should be useful. For any configurations where
35there are multiple spindles behind single LUN (Host based hardware RAID
36controller or for storage arrays), setting slice_idle=0 might end up in better
37throughput and acceptable latencies.
38
39back_seek_max
40-------------
41This specifies, given in Kbytes, the maximum "distance" for backward seeking.
42The distance is the amount of space from the current head location to the
43sectors that are backward in terms of distance.
44
45This parameter allows the scheduler to anticipate requests in the "backward"
46direction and consider them as being the "next" if they are within this
47distance from the current head location.
48
49back_seek_penalty
50-----------------
51This parameter is used to compute the cost of backward seeking. If the
52backward distance of request is just 1/back_seek_penalty from a "front"
53request, then the seeking cost of two requests is considered equivalent.
54
55So scheduler will not bias toward one or the other request (otherwise scheduler
56will bias toward front request). Default value of back_seek_penalty is 2.
57
58fifo_expire_async
59-----------------
60This parameter is used to set the timeout of asynchronous requests. Default
61value of this is 248ms.
62
63fifo_expire_sync
64----------------
65This parameter is used to set the timeout of synchronous requests. Default
66value of this is 124ms. In case to favor synchronous requests over asynchronous
67one, this value should be decreased relative to fifo_expire_async.
68
69group_idle
70-----------
71This parameter forces idling at the CFQ group level instead of CFQ
72queue level. This was introduced after a bottleneck was observed
73in higher end storage due to idle on sequential queue and allow dispatch
74from a single queue. The idea with this parameter is that it can be run with
75slice_idle=0 and group_idle=8, so that idling does not happen on individual
76queues in the group but happens overall on the group and thus still keeps the
77IO controller working.
78Not idling on individual queues in the group will dispatch requests from
79multiple queues in the group at the same time and achieve higher throughput
80on higher end storage.
81
82Default value for this parameter is 8ms.
83
84low_latency
85-----------
86This parameter is used to enable/disable the low latency mode of the CFQ
87scheduler. If enabled, CFQ tries to recompute the slice time for each process
88based on the target_latency set for the system. This favors fairness over
89throughput. Disabling low latency (setting it to 0) ignores target latency,
90allowing each process in the system to get a full time slice.
91
92By default low latency mode is enabled.
93
94target_latency
95--------------
96This parameter is used to calculate the time slice for a process if cfq's
97latency mode is enabled. It will ensure that sync requests have an estimated
98latency. But if sequential workload is higher(e.g. sequential read),
99then to meet the latency constraints, throughput may decrease because of less
100time for each process to issue I/O request before the cfq queue is switched.
101
102Though this can be overcome by disabling the latency_mode, it may increase
103the read latency for some applications. This parameter allows for changing
104target_latency through the sysfs interface which can provide the balanced
105throughput and read latency.
106
107Default value for target_latency is 300ms.
108
109slice_async
110-----------
111This parameter is same as of slice_sync but for asynchronous queue. The
112default value is 40ms.
113
114slice_async_rq
115--------------
116This parameter is used to limit the dispatching of asynchronous request to
117device request queue in queue's slice time. The maximum number of request that
118are allowed to be dispatched also depends upon the io priority. Default value
119for this is 2.
120
121slice_sync
122----------
123When a queue is selected for execution, the queues IO requests are only
124executed for a certain amount of time(time_slice) before switching to another
125queue. This parameter is used to calculate the time slice of synchronous
126queue.
127
128time_slice is computed using the below equation:-
129time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
130time_slice of synchronous queue, increase the value of slice_sync. Default
131value is 100ms.
132
133quantum
134-------
135This specifies the number of request dispatched to the device queue. In a
136queue's time slice, a request will not be dispatched if the number of request
137in the device exceeds this parameter. This parameter is used for synchronous
138request.
139
140In case of storage with several disk, this setting can limit the parallel
141processing of request. Therefore, increasing the value can improve the
142performance although this can cause the latency of some I/O to increase due
143to more number of requests.
144
145CFQ Group scheduling
146====================
147
148CFQ supports blkio cgroup and has "blkio." prefixed files in each
149blkio cgroup directory. It is weight-based and there are four knobs
150for configuration - weight[_device] and leaf_weight[_device].
151Internal cgroup nodes (the ones with children) can also have tasks in
152them, so the former two configure how much proportion the cgroup as a
153whole is entitled to at its parent's level while the latter two
154configure how much proportion the tasks in the cgroup have compared to
155its direct children.
156
157Another way to think about it is assuming that each internal node has
158an implicit leaf child node which hosts all the tasks whose weight is
159configured by leaf_weight[_device]. Let's assume a blkio hierarchy
160composed of five cgroups - root, A, B, AA and AB - with the following
161weights where the names represent the hierarchy.
162
163 weight leaf_weight
164 root : 125 125
165 A : 500 750
166 B : 250 500
167 AA : 500 500
168 AB : 1000 500
169
170root never has a parent making its weight is meaningless. For backward
171compatibility, weight is always kept in sync with leaf_weight. B, AA
172and AB have no child and thus its tasks have no children cgroup to
173compete with. They always get 100% of what the cgroup won at the
174parent level. Considering only the weights which matter, the hierarchy
175looks like the following.
176
177 root
178 / | \
179 A B leaf
180 500 250 125
181 / | \
182 AA AB leaf
183 500 1000 750
184
185If all cgroups have active IOs and competing with each other, disk
186time will be distributed like the following.
187
188Distribution below root. The total active weight at this level is
189A:500 + B:250 + C:125 = 875.
190
191 root-leaf : 125 / 875 =~ 14%
192 A : 500 / 875 =~ 57%
193 B(-leaf) : 250 / 875 =~ 28%
194
195A has children and further distributes its 57% among the children and
196the implicit leaf node. The total active weight at this level is
197AA:500 + AB:1000 + A-leaf:750 = 2250.
198
199 A-leaf : ( 750 / 2250) * A =~ 19%
200 AA(-leaf) : ( 500 / 2250) * A =~ 12%
201 AB(-leaf) : (1000 / 2250) * A =~ 25%
202
203CFQ IOPS Mode for group scheduling
204===================================
205Basic CFQ design is to provide priority based time slices. Higher priority
206process gets bigger time slice and lower priority process gets smaller time
207slice. Measuring time becomes harder if storage is fast and supports NCQ and
208it would be better to dispatch multiple requests from multiple cfq queues in
209request queue at a time. In such scenario, it is not possible to measure time
210consumed by single queue accurately.
211
212What is possible though is to measure number of requests dispatched from a
213single queue and also allow dispatch from multiple cfq queue at the same time.
214This effectively becomes the fairness in terms of IOPS (IO operations per
215second).
216
217If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
218to IOPS mode and starts providing fairness in terms of number of requests
219dispatched. Note that this mode switching takes effect only for group
220scheduling. For non-cgroup users nothing should change.
221
222CFQ IO scheduler Idling Theory
223===============================
224Idling on a queue is primarily about waiting for the next request to come
225on same queue after completion of a request. In this process CFQ will not
226dispatch requests from other cfq queues even if requests are pending there.
227
228The rationale behind idling is that it can cut down on number of seeks
229on rotational media. For example, if a process is doing dependent
230sequential reads (next read will come on only after completion of previous
231one), then not dispatching request from other queue should help as we
232did not move the disk head and kept on dispatching sequential IO from
233one queue.
234
235CFQ has following service trees and various queues are put on these trees.
236
237 sync-idle sync-noidle async
238
239All cfq queues doing synchronous sequential IO go on to sync-idle tree.
240On this tree we idle on each queue individually.
241
242All synchronous non-sequential queues go on sync-noidle tree. Also any
243synchronous write request which is not marked with REQ_IDLE goes on this
244service tree. On this tree we do not idle on individual queues instead idle
245on the whole group of queues or the tree. So if there are 4 queues waiting
246for IO to dispatch we will idle only once last queue has dispatched the IO
247and there is no more IO on this service tree.
248
249All async writes go on async service tree. There is no idling on async
250queues.
251
252CFQ has some optimizations for SSDs and if it detects a non-rotational
253media which can support higher queue depth (multiple requests at in
254flight at a time), then it cuts down on idling of individual queues and
255all the queues move to sync-noidle tree and only tree idle remains. This
256tree idling provides isolation with buffered write queues on async tree.
257
258FAQ
259===
260Q1. Why to idle at all on queues not marked with REQ_IDLE.
261
262A1. We only do tree idle (all queues on sync-noidle tree) on queues not marked
263 with REQ_IDLE. This helps in providing isolation with all the sync-idle
264 queues. Otherwise in presence of many sequential readers, other
265 synchronous IO might not get fair share of disk.
266
267 For example, if there are 10 sequential readers doing IO and they get
268 100ms each. If a !REQ_IDLE request comes in, it will be scheduled
269 roughly after 1 second. If after completion of !REQ_IDLE request we
270 do not idle, and after a couple of milli seconds a another !REQ_IDLE
271 request comes in, again it will be scheduled after 1second. Repeat it
272 and notice how a workload can lose its disk share and suffer due to
273 multiple sequential readers.
274
275 fsync can generate dependent IO where bunch of data is written in the
276 context of fsync, and later some journaling data is written. Journaling
277 data comes in only after fsync has finished its IO (atleast for ext4
278 that seemed to be the case). Now if one decides not to idle on fsync
279 thread due to !REQ_IDLE, then next journaling write will not get
280 scheduled for another second. A process doing small fsync, will suffer
281 badly in presence of multiple sequential readers.
282
283 Hence doing tree idling on threads using !REQ_IDLE flag on requests
284 provides isolation from multiple sequential readers and at the same
285 time we do not idle on individual threads.
286
287Q2. When to specify REQ_IDLE
288A2. I would think whenever one is doing synchronous write and expecting
289 more writes to be dispatched from same context soon, should be able
290 to specify REQ_IDLE on writes and that probably should work well for
291 most of the cases.
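For reference, the slice formula in the text deleted above works out as follows with the documented default of slice_sync=100ms, i.e. roughly 180 ms for the highest priority down to 40 ms for the lowest (a throwaway illustration, not part of the patch):

    /* time_slice = slice_sync + slice_sync/5 * (4 - prio), per the deleted doc */
    #include <stdio.h>

    int main(void)
    {
        const int slice_sync = 100;         /* ms, documented CFQ default */
        int prio;

        for (prio = 0; prio < 8; prio++)    /* ioprio levels 0 (highest) .. 7 */
            printf("prio %d -> %d ms\n",
                   prio, slice_sync + slice_sync / 5 * (4 - prio));
        return 0;
    }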
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 2c1e67058fd3..39e286d7afc9 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -64,7 +64,7 @@ guess, the kernel will put the process issuing IO to sleep for an amount
64of time, before entering a classic poll loop. This mode might be a 64of time, before entering a classic poll loop. This mode might be a
65little slower than pure classic polling, but it will be more efficient. 65little slower than pure classic polling, but it will be more efficient.
66If set to a value larger than 0, the kernel will put the process issuing 66If set to a value larger than 0, the kernel will put the process issuing
67IO to sleep for this amont of microseconds before entering classic 67IO to sleep for this amount of microseconds before entering classic
68polling. 68polling.
69 69
70iostats (RW) 70iostats (RW)
@@ -194,4 +194,31 @@ blk-throttle makes decision based on the samplings. Lower time means cgroups
194have more smooth throughput, but higher CPU overhead. This exists only when 194have more smooth throughput, but higher CPU overhead. This exists only when
195CONFIG_BLK_DEV_THROTTLING_LOW is enabled. 195CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
196 196
197zoned (RO)
198----------
199This indicates if the device is a zoned block device and the zone model of the
200device if it is indeed zoned. The possible values indicated by zoned are
201"none" for regular block devices and "host-aware" or "host-managed" for zoned
202block devices. The characteristics of host-aware and host-managed zoned block
203devices are described in the ZBC (Zoned Block Commands) and ZAC
204(Zoned Device ATA Command Set) standards. These standards also define the
205"drive-managed" zone model. However, since drive-managed zoned block devices
206do not support zone commands, they will be treated as regular block devices
207and zoned will report "none".
208
209nr_zones (RO)
210-------------
211For zoned block devices (zoned attribute indicating "host-managed" or
212"host-aware"), this indicates the total number of zones of the device.
213This is always 0 for regular block devices.
214
215chunk_sectors (RO)
216------------------
217This has different meaning depending on the type of the block device.
218For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
219of the RAID volume stripe segment. For a zoned block device, either host-aware
220or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
221of the device, with the eventual exception of the last zone of the device which
222may be smaller.
223
197Jens Axboe <jens.axboe@oracle.com>, February 2009 224Jens Axboe <jens.axboe@oracle.com>, February 2009
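Tying the two new attributes together, a small user-space sketch (the device name "sda" and the capacity estimate are illustrative assumptions; as the text notes, the last zone may be smaller, so the product is only an upper bound):

    /* Estimate zoned-device capacity from nr_zones and chunk_sectors. */
    #include <stdio.h>

    static unsigned long long read_ull(const char *path)
    {
        unsigned long long v = 0;
        FILE *f = fopen(path, "r");

        if (f) {
            if (fscanf(f, "%llu", &v) != 1)
                v = 0;
            fclose(f);
        }
        return v;
    }

    int main(void)
    {
        unsigned long long zones = read_ull("/sys/block/sda/queue/nr_zones");
        unsigned long long zone_sectors =
            read_ull("/sys/block/sda/queue/chunk_sectors");

        printf("%llu zones of %llu sectors (<= %llu bytes total)\n",
               zones, zone_sectors, zones * zone_sectors * 512ULL);
        return 0;
    }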
diff --git a/Documentation/scsi/scsi-parameters.txt b/Documentation/scsi/scsi-parameters.txt
index 92999d4e0cb8..25a4b4cf04a6 100644
--- a/Documentation/scsi/scsi-parameters.txt
+++ b/Documentation/scsi/scsi-parameters.txt
@@ -97,11 +97,6 @@ parameters may be changed at runtime by the command
97 allowing boot to proceed. none ignores them, expecting 97 allowing boot to proceed. none ignores them, expecting
98 user space to do the scan. 98 user space to do the scan.
99 99
100 scsi_mod.use_blk_mq=
101 [SCSI] use blk-mq I/O path by default
102 See SCSI_MQ_DEFAULT in drivers/scsi/Kconfig.
103 Format: <y/n>
104
105 sim710= [SCSI,HW] 100 sim710= [SCSI,HW]
106 See header of drivers/scsi/sim710.c. 101 See header of drivers/scsi/sim710.c.
107 102
diff --git a/block/Kconfig b/block/Kconfig
index f7045aa47edb..8044452a4fd3 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -155,12 +155,6 @@ config BLK_CGROUP_IOLATENCY
155 155
156 Note, this is an experimental interface and could be changed someday. 156 Note, this is an experimental interface and could be changed someday.
157 157
158config BLK_WBT_SQ
159 bool "Single queue writeback throttling"
160 depends on BLK_WBT
161 ---help---
162 Enable writeback throttling by default on legacy single queue devices
163
164config BLK_WBT_MQ 158config BLK_WBT_MQ
165 bool "Multiqueue writeback throttling" 159 bool "Multiqueue writeback throttling"
166 default y 160 default y
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f95a48b0d7b2..4626b88b2d5a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -3,67 +3,6 @@ if BLOCK
3 3
4menu "IO Schedulers" 4menu "IO Schedulers"
5 5
6config IOSCHED_NOOP
7 bool
8 default y
9 ---help---
10 The no-op I/O scheduler is a minimal scheduler that does basic merging
11 and sorting. Its main uses include non-disk based block devices like
12 memory devices, and specialised software or hardware environments
13 that do their own scheduling and require only minimal assistance from
14 the kernel.
15
16config IOSCHED_DEADLINE
17 tristate "Deadline I/O scheduler"
18 default y
19 ---help---
20 The deadline I/O scheduler is simple and compact. It will provide
21 CSCAN service with FIFO expiration of requests, switching to
22 a new point in the service tree and doing a batch of IO from there
23 in case of expiry.
24
25config IOSCHED_CFQ
26 tristate "CFQ I/O scheduler"
27 default y
28 ---help---
29 The CFQ I/O scheduler tries to distribute bandwidth equally
30 among all processes in the system. It should provide a fair
31 and low latency working environment, suitable for both desktop
32 and server systems.
33
34 This is the default I/O scheduler.
35
36config CFQ_GROUP_IOSCHED
37 bool "CFQ Group Scheduling support"
38 depends on IOSCHED_CFQ && BLK_CGROUP
39 ---help---
40 Enable group IO scheduling in CFQ.
41
42choice
43
44 prompt "Default I/O scheduler"
45 default DEFAULT_CFQ
46 help
47 Select the I/O scheduler which will be used by default for all
48 block devices.
49
50 config DEFAULT_DEADLINE
51 bool "Deadline" if IOSCHED_DEADLINE=y
52
53 config DEFAULT_CFQ
54 bool "CFQ" if IOSCHED_CFQ=y
55
56 config DEFAULT_NOOP
57 bool "No-op"
58
59endchoice
60
61config DEFAULT_IOSCHED
62 string
63 default "deadline" if DEFAULT_DEADLINE
64 default "cfq" if DEFAULT_CFQ
65 default "noop" if DEFAULT_NOOP
66
67config MQ_IOSCHED_DEADLINE 6config MQ_IOSCHED_DEADLINE
68 tristate "MQ deadline I/O scheduler" 7 tristate "MQ deadline I/O scheduler"
69 default y 8 default y
diff --git a/block/Makefile b/block/Makefile
index 27eac600474f..eee1b4ceecf9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,7 +3,7 @@
3# Makefile for the kernel block layer 3# Makefile for the kernel block layer
4# 4#
5 5
6obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 6obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
7 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 7 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
8 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 8 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
9 blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ 9 blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
18obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 18obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
19obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o 19obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
20obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o 20obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
21obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
22obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
23obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
24obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o 21obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
25obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o 22obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
26bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o 23bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fe5952d117d..c6113af31960 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -334,7 +334,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
334 334
335 parent = bfqg_parent(bfqg); 335 parent = bfqg_parent(bfqg);
336 336
337 lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); 337 lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock);
338 338
339 if (unlikely(!parent)) 339 if (unlikely(!parent))
340 return; 340 return;
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
642 uint64_t serial_nr; 642 uint64_t serial_nr;
643 643
644 rcu_read_lock(); 644 rcu_read_lock();
645 serial_nr = bio_blkcg(bio)->css.serial_nr; 645 serial_nr = __bio_blkcg(bio)->css.serial_nr;
646 646
647 /* 647 /*
648 * Check whether blkcg has changed. The condition may trigger 648 * Check whether blkcg has changed. The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
651 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) 651 if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
652 goto out; 652 goto out;
653 653
654 bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); 654 bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
655 /* 655 /*
656 * Update blkg_path for bfq_log_* functions. We cache this 656 * Update blkg_path for bfq_log_* functions. We cache this
657 * path, and update it here, for the following 657 * path, and update it here, for the following
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 97337214bec4..cd307767a134 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -399,9 +399,9 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
399 unsigned long flags; 399 unsigned long flags;
400 struct bfq_io_cq *icq; 400 struct bfq_io_cq *icq;
401 401
402 spin_lock_irqsave(q->queue_lock, flags); 402 spin_lock_irqsave(&q->queue_lock, flags);
403 icq = icq_to_bic(ioc_lookup_icq(ioc, q)); 403 icq = icq_to_bic(ioc_lookup_icq(ioc, q));
404 spin_unlock_irqrestore(q->queue_lock, flags); 404 spin_unlock_irqrestore(&q->queue_lock, flags);
405 405
406 return icq; 406 return icq;
407 } 407 }
@@ -4066,7 +4066,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
4066 * In addition, the following queue lock guarantees that 4066 * In addition, the following queue lock guarantees that
4067 * bfqq_group(bfqq) exists as well. 4067 * bfqq_group(bfqq) exists as well.
4068 */ 4068 */
4069 spin_lock_irq(q->queue_lock); 4069 spin_lock_irq(&q->queue_lock);
4070 if (idle_timer_disabled) 4070 if (idle_timer_disabled)
4071 /* 4071 /*
4072 * Since the idle timer has been disabled, 4072 * Since the idle timer has been disabled,
@@ -4085,7 +4085,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
4085 bfqg_stats_set_start_empty_time(bfqg); 4085 bfqg_stats_set_start_empty_time(bfqg);
4086 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); 4086 bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
4087 } 4087 }
4088 spin_unlock_irq(q->queue_lock); 4088 spin_unlock_irq(&q->queue_lock);
4089} 4089}
4090#else 4090#else
4091static inline void bfq_update_dispatch_stats(struct request_queue *q, 4091static inline void bfq_update_dispatch_stats(struct request_queue *q,
@@ -4416,7 +4416,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
4416 4416
4417 rcu_read_lock(); 4417 rcu_read_lock();
4418 4418
4419 bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); 4419 bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
4420 if (!bfqg) { 4420 if (!bfqg) {
4421 bfqq = &bfqd->oom_bfqq; 4421 bfqq = &bfqd->oom_bfqq;
4422 goto out; 4422 goto out;
@@ -4669,11 +4669,11 @@ static void bfq_update_insert_stats(struct request_queue *q,
4669 * In addition, the following queue lock guarantees that 4669 * In addition, the following queue lock guarantees that
4670 * bfqq_group(bfqq) exists as well. 4670 * bfqq_group(bfqq) exists as well.
4671 */ 4671 */
4672 spin_lock_irq(q->queue_lock); 4672 spin_lock_irq(&q->queue_lock);
4673 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); 4673 bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
4674 if (idle_timer_disabled) 4674 if (idle_timer_disabled)
4675 bfqg_stats_update_idle_time(bfqq_group(bfqq)); 4675 bfqg_stats_update_idle_time(bfqq_group(bfqq));
4676 spin_unlock_irq(q->queue_lock); 4676 spin_unlock_irq(&q->queue_lock);
4677} 4677}
4678#else 4678#else
4679static inline void bfq_update_insert_stats(struct request_queue *q, 4679static inline void bfq_update_insert_stats(struct request_queue *q,
@@ -5414,9 +5414,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
5414 } 5414 }
5415 eq->elevator_data = bfqd; 5415 eq->elevator_data = bfqd;
5416 5416
5417 spin_lock_irq(q->queue_lock); 5417 spin_lock_irq(&q->queue_lock);
5418 q->elevator = eq; 5418 q->elevator = eq;
5419 spin_unlock_irq(q->queue_lock); 5419 spin_unlock_irq(&q->queue_lock);
5420 5420
5421 /* 5421 /*
5422 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 5422 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
@@ -5756,7 +5756,7 @@ static struct elv_fs_entry bfq_attrs[] = {
5756}; 5756};
5757 5757
5758static struct elevator_type iosched_bfq_mq = { 5758static struct elevator_type iosched_bfq_mq = {
5759 .ops.mq = { 5759 .ops = {
5760 .limit_depth = bfq_limit_depth, 5760 .limit_depth = bfq_limit_depth,
5761 .prepare_request = bfq_prepare_request, 5761 .prepare_request = bfq_prepare_request,
5762 .requeue_request = bfq_finish_requeue_request, 5762 .requeue_request = bfq_finish_requeue_request,
@@ -5777,7 +5777,6 @@ static struct elevator_type iosched_bfq_mq = {
5777 .exit_sched = bfq_exit_queue, 5777 .exit_sched = bfq_exit_queue,
5778 }, 5778 },
5779 5779
5780 .uses_mq = true,
5781 .icq_size = sizeof(struct bfq_io_cq), 5780 .icq_size = sizeof(struct bfq_io_cq),
5782 .icq_align = __alignof__(struct bfq_io_cq), 5781 .icq_align = __alignof__(struct bfq_io_cq),
5783 .elevator_attrs = bfq_attrs, 5782 .elevator_attrs = bfq_attrs,
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 290af497997b..1b633a3526d4 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -390,7 +390,6 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
390 bip->bip_iter.bi_sector += bytes_done >> 9; 390 bip->bip_iter.bi_sector += bytes_done >> 9;
391 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); 391 bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
392} 392}
393EXPORT_SYMBOL(bio_integrity_advance);
394 393
395/** 394/**
396 * bio_integrity_trim - Trim integrity vector 395 * bio_integrity_trim - Trim integrity vector
@@ -460,7 +459,6 @@ void bioset_integrity_free(struct bio_set *bs)
460 mempool_exit(&bs->bio_integrity_pool); 459 mempool_exit(&bs->bio_integrity_pool);
461 mempool_exit(&bs->bvec_integrity_pool); 460 mempool_exit(&bs->bvec_integrity_pool);
462} 461}
463EXPORT_SYMBOL(bioset_integrity_free);
464 462
465void __init bio_integrity_init(void) 463void __init bio_integrity_init(void)
466{ 464{
diff --git a/block/bio.c b/block/bio.c
index 4d86e90654b2..8281bfcbc265 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -244,7 +244,7 @@ fallback:
244 244
245void bio_uninit(struct bio *bio) 245void bio_uninit(struct bio *bio)
246{ 246{
247 bio_disassociate_task(bio); 247 bio_disassociate_blkg(bio);
248} 248}
249EXPORT_SYMBOL(bio_uninit); 249EXPORT_SYMBOL(bio_uninit);
250 250
@@ -571,14 +571,13 @@ void bio_put(struct bio *bio)
571} 571}
572EXPORT_SYMBOL(bio_put); 572EXPORT_SYMBOL(bio_put);
573 573
574inline int bio_phys_segments(struct request_queue *q, struct bio *bio) 574int bio_phys_segments(struct request_queue *q, struct bio *bio)
575{ 575{
576 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 576 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
577 blk_recount_segments(q, bio); 577 blk_recount_segments(q, bio);
578 578
579 return bio->bi_phys_segments; 579 return bio->bi_phys_segments;
580} 580}
581EXPORT_SYMBOL(bio_phys_segments);
582 581
583/** 582/**
584 * __bio_clone_fast - clone a bio that shares the original bio's biovec 583 * __bio_clone_fast - clone a bio that shares the original bio's biovec
@@ -610,7 +609,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
610 bio->bi_iter = bio_src->bi_iter; 609 bio->bi_iter = bio_src->bi_iter;
611 bio->bi_io_vec = bio_src->bi_io_vec; 610 bio->bi_io_vec = bio_src->bi_io_vec;
612 611
613 bio_clone_blkcg_association(bio, bio_src); 612 bio_clone_blkg_association(bio, bio_src);
613 blkcg_bio_issue_init(bio);
614} 614}
615EXPORT_SYMBOL(__bio_clone_fast); 615EXPORT_SYMBOL(__bio_clone_fast);
616 616
@@ -901,7 +901,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
901 901
902 return 0; 902 return 0;
903} 903}
904EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
905 904
906static void submit_bio_wait_endio(struct bio *bio) 905static void submit_bio_wait_endio(struct bio *bio)
907{ 906{
@@ -1592,7 +1591,6 @@ void bio_set_pages_dirty(struct bio *bio)
1592 set_page_dirty_lock(bvec->bv_page); 1591 set_page_dirty_lock(bvec->bv_page);
1593 } 1592 }
1594} 1593}
1595EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
1596 1594
1597static void bio_release_pages(struct bio *bio) 1595static void bio_release_pages(struct bio *bio)
1598{ 1596{
@@ -1662,17 +1660,33 @@ defer:
1662 spin_unlock_irqrestore(&bio_dirty_lock, flags); 1660 spin_unlock_irqrestore(&bio_dirty_lock, flags);
1663 schedule_work(&bio_dirty_work); 1661 schedule_work(&bio_dirty_work);
1664} 1662}
1665EXPORT_SYMBOL_GPL(bio_check_pages_dirty); 1663
1664void update_io_ticks(struct hd_struct *part, unsigned long now)
1665{
1666 unsigned long stamp;
1667again:
1668 stamp = READ_ONCE(part->stamp);
1669 if (unlikely(stamp != now)) {
1670 if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
1671 __part_stat_add(part, io_ticks, 1);
1672 }
1673 }
1674 if (part->partno) {
1675 part = &part_to_disk(part)->part0;
1676 goto again;
1677 }
1678}
1666 1679
1667void generic_start_io_acct(struct request_queue *q, int op, 1680void generic_start_io_acct(struct request_queue *q, int op,
1668 unsigned long sectors, struct hd_struct *part) 1681 unsigned long sectors, struct hd_struct *part)
1669{ 1682{
1670 const int sgrp = op_stat_group(op); 1683 const int sgrp = op_stat_group(op);
1671 int cpu = part_stat_lock();
1672 1684
1673 part_round_stats(q, cpu, part); 1685 part_stat_lock();
1674 part_stat_inc(cpu, part, ios[sgrp]); 1686
1675 part_stat_add(cpu, part, sectors[sgrp], sectors); 1687 update_io_ticks(part, jiffies);
1688 part_stat_inc(part, ios[sgrp]);
1689 part_stat_add(part, sectors[sgrp], sectors);
1676 part_inc_in_flight(q, part, op_is_write(op)); 1690 part_inc_in_flight(q, part, op_is_write(op));
1677 1691
1678 part_stat_unlock(); 1692 part_stat_unlock();
@@ -1682,12 +1696,15 @@ EXPORT_SYMBOL(generic_start_io_acct);
1682void generic_end_io_acct(struct request_queue *q, int req_op, 1696void generic_end_io_acct(struct request_queue *q, int req_op,
1683 struct hd_struct *part, unsigned long start_time) 1697 struct hd_struct *part, unsigned long start_time)
1684{ 1698{
1685 unsigned long duration = jiffies - start_time; 1699 unsigned long now = jiffies;
1700 unsigned long duration = now - start_time;
1686 const int sgrp = op_stat_group(req_op); 1701 const int sgrp = op_stat_group(req_op);
1687 int cpu = part_stat_lock();
1688 1702
1689 part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); 1703 part_stat_lock();
1690 part_round_stats(q, cpu, part); 1704
1705 update_io_ticks(part, now);
1706 part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
1707 part_stat_add(part, time_in_queue, duration);
1691 part_dec_in_flight(q, part, op_is_write(req_op)); 1708 part_dec_in_flight(q, part, op_is_write(req_op));
1692 1709
1693 part_stat_unlock(); 1710 part_stat_unlock();
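The driver-facing signatures of generic_start_io_acct()/generic_end_io_acct() are unchanged by the hunk above; only the internals move to the lockless per-cpu counters and the jiffies-stamp based update_io_ticks(). A hedged sketch of a bio-based driver using them, where "struct my_dev" and its members are assumptions:

    #include <linux/bio.h>
    #include <linux/genhd.h>
    #include <linux/jiffies.h>

    struct my_dev {                        /* invented for the example */
        struct request_queue *queue;
        struct gendisk *disk;
    };

    static void my_handle_bio(struct my_dev *dev, struct bio *bio)
    {
        unsigned long start = jiffies;

        generic_start_io_acct(dev->queue, bio_op(bio), bio_sectors(bio),
                              &dev->disk->part0);
        /* ... actually service the bio ... */
        generic_end_io_acct(dev->queue, bio_op(bio), &dev->disk->part0, start);
        bio_endio(bio);
    }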
@@ -1957,102 +1974,133 @@ EXPORT_SYMBOL(bioset_init_from_src);
1957 1974
1958#ifdef CONFIG_BLK_CGROUP 1975#ifdef CONFIG_BLK_CGROUP
1959 1976
1960#ifdef CONFIG_MEMCG
1961/** 1977/**
1962 * bio_associate_blkcg_from_page - associate a bio with the page's blkcg 1978 * bio_disassociate_blkg - puts back the blkg reference if associated
1963 * @bio: target bio 1979 * @bio: target bio
1964 * @page: the page to lookup the blkcg from
1965 * 1980 *
1966 * Associate @bio with the blkcg from @page's owning memcg. This works like 1981 * Helper to disassociate the blkg from @bio if a blkg is associated.
1967 * every other associate function wrt references.
1968 */ 1982 */
1969int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) 1983void bio_disassociate_blkg(struct bio *bio)
1970{ 1984{
1971 struct cgroup_subsys_state *blkcg_css; 1985 if (bio->bi_blkg) {
1972 1986 blkg_put(bio->bi_blkg);
1973 if (unlikely(bio->bi_css)) 1987 bio->bi_blkg = NULL;
1974 return -EBUSY; 1988 }
1975 if (!page->mem_cgroup)
1976 return 0;
1977 blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
1978 &io_cgrp_subsys);
1979 bio->bi_css = blkcg_css;
1980 return 0;
1981} 1989}
1982#endif /* CONFIG_MEMCG */ 1990EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
1983 1991
1984/** 1992/**
1985 * bio_associate_blkcg - associate a bio with the specified blkcg 1993 * __bio_associate_blkg - associate a bio with the a blkg
1986 * @bio: target bio 1994 * @bio: target bio
1987 * @blkcg_css: css of the blkcg to associate 1995 * @blkg: the blkg to associate
1988 * 1996 *
1989 * Associate @bio with the blkcg specified by @blkcg_css. Block layer will 1997 * This tries to associate @bio with the specified @blkg. Association failure
1990 * treat @bio as if it were issued by a task which belongs to the blkcg. 1998 * is handled by walking up the blkg tree. Therefore, the blkg associated can
1999 * be anything between @blkg and the root_blkg. This situation only happens
2000 * when a cgroup is dying and then the remaining bios will spill to the closest
2001 * alive blkg.
1991 * 2002 *
1992 * This function takes an extra reference of @blkcg_css which will be put 2003 * A reference will be taken on the @blkg and will be released when @bio is
1993 * when @bio is released. The caller must own @bio and is responsible for 2004 * freed.
1994 * synchronizing calls to this function.
1995 */ 2005 */
1996int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) 2006static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
1997{ 2007{
1998 if (unlikely(bio->bi_css)) 2008 bio_disassociate_blkg(bio);
1999 return -EBUSY; 2009
2000 css_get(blkcg_css); 2010 bio->bi_blkg = blkg_tryget_closest(blkg);
2001 bio->bi_css = blkcg_css;
2002 return 0;
2003} 2011}
2004EXPORT_SYMBOL_GPL(bio_associate_blkcg);
2005 2012
2006/** 2013/**
2007 * bio_associate_blkg - associate a bio with the specified blkg 2014 * bio_associate_blkg_from_css - associate a bio with a specified css
2008 * @bio: target bio 2015 * @bio: target bio
2009 * @blkg: the blkg to associate 2016 * @css: target css
2010 * 2017 *
2011 * Associate @bio with the blkg specified by @blkg. This is the queue specific 2018 * Associate @bio with the blkg found by combining the css's blkg and the
2012 * blkcg information associated with the @bio, a reference will be taken on the 2019 * request_queue of the @bio. This falls back to the queue's root_blkg if
2013 * @blkg and will be freed when the bio is freed. 2020 * the association fails with the css.
2014 */ 2021 */
2015int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) 2022void bio_associate_blkg_from_css(struct bio *bio,
2023 struct cgroup_subsys_state *css)
2016{ 2024{
2017 if (unlikely(bio->bi_blkg)) 2025 struct request_queue *q = bio->bi_disk->queue;
2018 return -EBUSY; 2026 struct blkcg_gq *blkg;
2019 if (!blkg_try_get(blkg)) 2027
2020 return -ENODEV; 2028 rcu_read_lock();
2021 bio->bi_blkg = blkg; 2029
2022 return 0; 2030 if (!css || !css->parent)
2031 blkg = q->root_blkg;
2032 else
2033 blkg = blkg_lookup_create(css_to_blkcg(css), q);
2034
2035 __bio_associate_blkg(bio, blkg);
2036
2037 rcu_read_unlock();
2023} 2038}
2039EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
2024 2040
2041#ifdef CONFIG_MEMCG
2025/** 2042/**
2026 * bio_disassociate_task - undo bio_associate_current() 2043 * bio_associate_blkg_from_page - associate a bio with the page's blkg
2027 * @bio: target bio 2044 * @bio: target bio
2045 * @page: the page to lookup the blkcg from
2046 *
2047 * Associate @bio with the blkg from @page's owning memcg and the respective
2048 * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's
2049 * root_blkg.
2028 */ 2050 */
2029void bio_disassociate_task(struct bio *bio) 2051void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
2030{ 2052{
2031 if (bio->bi_ioc) { 2053 struct cgroup_subsys_state *css;
2032 put_io_context(bio->bi_ioc); 2054
2033 bio->bi_ioc = NULL; 2055 if (!page->mem_cgroup)
2034 } 2056 return;
2035 if (bio->bi_css) { 2057
2036 css_put(bio->bi_css); 2058 rcu_read_lock();
2037 bio->bi_css = NULL; 2059
2038 } 2060 css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
2039 if (bio->bi_blkg) { 2061 bio_associate_blkg_from_css(bio, css);
2040 blkg_put(bio->bi_blkg); 2062
2041 bio->bi_blkg = NULL; 2063 rcu_read_unlock();
2042 } 2064}
2065#endif /* CONFIG_MEMCG */
2066
2067/**
2068 * bio_associate_blkg - associate a bio with a blkg
2069 * @bio: target bio
2070 *
2071 * Associate @bio with the blkg found from the bio's css and request_queue.
2072 * If one is not found, blkg_lookup_create() creates the blkg. If a blkg is
2073 * already associated, the css is reused and association redone as the
2074 * request_queue may have changed.
2075 */
2076void bio_associate_blkg(struct bio *bio)
2077{
2078 struct cgroup_subsys_state *css;
2079
2080 rcu_read_lock();
2081
2082 if (bio->bi_blkg)
2083 css = &bio_blkcg(bio)->css;
2084 else
2085 css = blkcg_css();
2086
2087 bio_associate_blkg_from_css(bio, css);
2088
2089 rcu_read_unlock();
2043} 2090}
2091EXPORT_SYMBOL_GPL(bio_associate_blkg);
2044 2092
2045/** 2093/**
2046 * bio_clone_blkcg_association - clone blkcg association from src to dst bio 2094 * bio_clone_blkg_association - clone blkg association from src to dst bio
2047 * @dst: destination bio 2095 * @dst: destination bio
2048 * @src: source bio 2096 * @src: source bio
2049 */ 2097 */
2050void bio_clone_blkcg_association(struct bio *dst, struct bio *src) 2098void bio_clone_blkg_association(struct bio *dst, struct bio *src)
2051{ 2099{
2052 if (src->bi_css) 2100 if (src->bi_blkg)
2053 WARN_ON(bio_associate_blkcg(dst, src->bi_css)); 2101 __bio_associate_blkg(dst, src->bi_blkg);
2054} 2102}
2055EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); 2103EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
2056#endif /* CONFIG_BLK_CGROUP */ 2104#endif /* CONFIG_BLK_CGROUP */
2057 2105
2058static void __init biovec_init_slabs(void) 2106static void __init biovec_init_slabs(void)
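
Taken together, the bio.c hunk above replaces the old bi_css/bi_ioc bookkeeping with a single bi_blkg reference. A minimal sketch of the resulting lifecycle from a caller's point of view follows; the my_submit()/my_clone() call sites are assumptions for illustration, only the bio_*_blkg() helpers come from this hunk:

	/* Illustrative association lifecycle, assuming a plain submission path. */
	static void my_submit(struct bio *bio)
	{
		bio_associate_blkg(bio);	/* takes a ref on the closest live blkg */
		generic_make_request(bio);
	}

	static void my_clone(struct bio *dst, struct bio *src)
	{
		/* dst ends up holding its own reference on src's blkg */
		bio_clone_blkg_association(dst, src);
	}

bio_disassociate_blkg() runs when the bio is freed and drops the reference again.
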
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c630e02836a8..c8cc1cbb6370 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -76,14 +76,42 @@ static void blkg_free(struct blkcg_gq *blkg)
76 if (blkg->pd[i]) 76 if (blkg->pd[i])
77 blkcg_policy[i]->pd_free_fn(blkg->pd[i]); 77 blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
78 78
79 if (blkg->blkcg != &blkcg_root)
80 blk_exit_rl(blkg->q, &blkg->rl);
81
82 blkg_rwstat_exit(&blkg->stat_ios); 79 blkg_rwstat_exit(&blkg->stat_ios);
83 blkg_rwstat_exit(&blkg->stat_bytes); 80 blkg_rwstat_exit(&blkg->stat_bytes);
84 kfree(blkg); 81 kfree(blkg);
85} 82}
86 83
84static void __blkg_release(struct rcu_head *rcu)
85{
86 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
87
88 percpu_ref_exit(&blkg->refcnt);
89
90 /* release the blkcg and parent blkg refs this blkg has been holding */
91 css_put(&blkg->blkcg->css);
92 if (blkg->parent)
93 blkg_put(blkg->parent);
94
95 wb_congested_put(blkg->wb_congested);
96
97 blkg_free(blkg);
98}
99
100/*
101 * A group is RCU protected, but having an rcu lock does not mean that one
102 * can access all the fields of blkg and assume these are valid. For
103 * example, don't try to follow throtl_data and request queue links.
104 *
105 * Having a reference to blkg under an rcu allows accesses to only values
106 * local to groups like group stats and group rate limits.
107 */
108static void blkg_release(struct percpu_ref *ref)
109{
110 struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
111
112 call_rcu(&blkg->rcu_head, __blkg_release);
113}
114
87/** 115/**
88 * blkg_alloc - allocate a blkg 116 * blkg_alloc - allocate a blkg
89 * @blkcg: block cgroup the new blkg is associated with 117 * @blkcg: block cgroup the new blkg is associated with
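
The blkg_free()/blkg_release() hunk above converts blkg reference counting from an atomic_t to a percpu_ref whose release path goes through RCU before blkg_free(). The get/put side lives in blk-cgroup.h and is presumably reduced to thin wrappers along these lines (shown only to make the lifecycle readable, not copied from this patch):

	static inline void blkg_get(struct blkcg_gq *blkg)
	{
		percpu_ref_get(&blkg->refcnt);
	}

	static inline bool blkg_tryget(struct blkcg_gq *blkg)
	{
		return percpu_ref_tryget(&blkg->refcnt);
	}

	static inline void blkg_put(struct blkcg_gq *blkg)
	{
		percpu_ref_put(&blkg->refcnt);
	}

blkg_create() below initializes the ref with percpu_ref_init(..., blkg_release, ...), blkg_destroy() kills it with percpu_ref_kill(), and the final put then schedules __blkg_release() via call_rcu().
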
@@ -110,14 +138,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
110 blkg->q = q; 138 blkg->q = q;
111 INIT_LIST_HEAD(&blkg->q_node); 139 INIT_LIST_HEAD(&blkg->q_node);
112 blkg->blkcg = blkcg; 140 blkg->blkcg = blkcg;
113 atomic_set(&blkg->refcnt, 1);
114
115 /* root blkg uses @q->root_rl, init rl only for !root blkgs */
116 if (blkcg != &blkcg_root) {
117 if (blk_init_rl(&blkg->rl, q, gfp_mask))
118 goto err_free;
119 blkg->rl.blkg = blkg;
120 }
121 141
122 for (i = 0; i < BLKCG_MAX_POLS; i++) { 142 for (i = 0; i < BLKCG_MAX_POLS; i++) {
123 struct blkcg_policy *pol = blkcg_policy[i]; 143 struct blkcg_policy *pol = blkcg_policy[i];
@@ -157,7 +177,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
157 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); 177 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
158 if (blkg && blkg->q == q) { 178 if (blkg && blkg->q == q) {
159 if (update_hint) { 179 if (update_hint) {
160 lockdep_assert_held(q->queue_lock); 180 lockdep_assert_held(&q->queue_lock);
161 rcu_assign_pointer(blkcg->blkg_hint, blkg); 181 rcu_assign_pointer(blkcg->blkg_hint, blkg);
162 } 182 }
163 return blkg; 183 return blkg;
@@ -180,7 +200,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
180 int i, ret; 200 int i, ret;
181 201
182 WARN_ON_ONCE(!rcu_read_lock_held()); 202 WARN_ON_ONCE(!rcu_read_lock_held());
183 lockdep_assert_held(q->queue_lock); 203 lockdep_assert_held(&q->queue_lock);
204
205 /* request_queue is dying, do not create/recreate a blkg */
206 if (blk_queue_dying(q)) {
207 ret = -ENODEV;
208 goto err_free_blkg;
209 }
184 210
185 /* blkg holds a reference to blkcg */ 211 /* blkg holds a reference to blkcg */
186 if (!css_tryget_online(&blkcg->css)) { 212 if (!css_tryget_online(&blkcg->css)) {
@@ -217,6 +243,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
217 blkg_get(blkg->parent); 243 blkg_get(blkg->parent);
218 } 244 }
219 245
246 ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
247 GFP_NOWAIT | __GFP_NOWARN);
248 if (ret)
249 goto err_cancel_ref;
250
220 /* invoke per-policy init */ 251 /* invoke per-policy init */
221 for (i = 0; i < BLKCG_MAX_POLS; i++) { 252 for (i = 0; i < BLKCG_MAX_POLS; i++) {
222 struct blkcg_policy *pol = blkcg_policy[i]; 253 struct blkcg_policy *pol = blkcg_policy[i];
@@ -249,6 +280,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
249 blkg_put(blkg); 280 blkg_put(blkg);
250 return ERR_PTR(ret); 281 return ERR_PTR(ret);
251 282
283err_cancel_ref:
284 percpu_ref_exit(&blkg->refcnt);
252err_put_congested: 285err_put_congested:
253 wb_congested_put(wb_congested); 286 wb_congested_put(wb_congested);
254err_put_css: 287err_put_css:
@@ -259,7 +292,7 @@ err_free_blkg:
259} 292}
260 293
261/** 294/**
262 * blkg_lookup_create - lookup blkg, try to create one if not there 295 * __blkg_lookup_create - lookup blkg, try to create one if not there
263 * @blkcg: blkcg of interest 296 * @blkcg: blkcg of interest
264 * @q: request_queue of interest 297 * @q: request_queue of interest
265 * 298 *
@@ -268,24 +301,16 @@ err_free_blkg:
268 * that all non-root blkg's have access to the parent blkg. This function 301 * that all non-root blkg's have access to the parent blkg. This function
269 * should be called under RCU read lock and @q->queue_lock. 302 * should be called under RCU read lock and @q->queue_lock.
270 * 303 *
271 * Returns pointer to the looked up or created blkg on success, ERR_PTR() 304 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
272 * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not 305 * down from root.
273 * dead and bypassing, returns ERR_PTR(-EBUSY).
274 */ 306 */
275struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 307struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
276 struct request_queue *q) 308 struct request_queue *q)
277{ 309{
278 struct blkcg_gq *blkg; 310 struct blkcg_gq *blkg;
279 311
280 WARN_ON_ONCE(!rcu_read_lock_held()); 312 WARN_ON_ONCE(!rcu_read_lock_held());
281 lockdep_assert_held(q->queue_lock); 313 lockdep_assert_held(&q->queue_lock);
282
283 /*
284 * This could be the first entry point of blkcg implementation and
285 * we shouldn't allow anything to go through for a bypassing queue.
286 */
287 if (unlikely(blk_queue_bypass(q)))
288 return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
289 314
290 blkg = __blkg_lookup(blkcg, q, true); 315 blkg = __blkg_lookup(blkcg, q, true);
291 if (blkg) 316 if (blkg)
@@ -293,30 +318,64 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
293 318
294 /* 319 /*
295 * Create blkgs walking down from blkcg_root to @blkcg, so that all 320 * Create blkgs walking down from blkcg_root to @blkcg, so that all
296 * non-root blkgs have access to their parents. 321 * non-root blkgs have access to their parents. Returns the closest
322 * blkg to the intended blkg should blkg_create() fail.
297 */ 323 */
298 while (true) { 324 while (true) {
299 struct blkcg *pos = blkcg; 325 struct blkcg *pos = blkcg;
300 struct blkcg *parent = blkcg_parent(blkcg); 326 struct blkcg *parent = blkcg_parent(blkcg);
301 327 struct blkcg_gq *ret_blkg = q->root_blkg;
302 while (parent && !__blkg_lookup(parent, q, false)) { 328
329 while (parent) {
330 blkg = __blkg_lookup(parent, q, false);
331 if (blkg) {
332 /* remember closest blkg */
333 ret_blkg = blkg;
334 break;
335 }
303 pos = parent; 336 pos = parent;
304 parent = blkcg_parent(parent); 337 parent = blkcg_parent(parent);
305 } 338 }
306 339
307 blkg = blkg_create(pos, q, NULL); 340 blkg = blkg_create(pos, q, NULL);
308 if (pos == blkcg || IS_ERR(blkg)) 341 if (IS_ERR(blkg))
342 return ret_blkg;
343 if (pos == blkcg)
309 return blkg; 344 return blkg;
310 } 345 }
311} 346}
312 347
348/**
349 * blkg_lookup_create - find or create a blkg
350 * @blkcg: target block cgroup
351 * @q: target request_queue
352 *
353 * This looks up or creates the blkg representing the unique pair
354 * of the blkcg and the request_queue.
355 */
356struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
357 struct request_queue *q)
358{
359 struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
360
361 if (unlikely(!blkg)) {
362 unsigned long flags;
363
364 spin_lock_irqsave(&q->queue_lock, flags);
365 blkg = __blkg_lookup_create(blkcg, q);
366 spin_unlock_irqrestore(&q->queue_lock, flags);
367 }
368
369 return blkg;
370}
371
313static void blkg_destroy(struct blkcg_gq *blkg) 372static void blkg_destroy(struct blkcg_gq *blkg)
314{ 373{
315 struct blkcg *blkcg = blkg->blkcg; 374 struct blkcg *blkcg = blkg->blkcg;
316 struct blkcg_gq *parent = blkg->parent; 375 struct blkcg_gq *parent = blkg->parent;
317 int i; 376 int i;
318 377
319 lockdep_assert_held(blkg->q->queue_lock); 378 lockdep_assert_held(&blkg->q->queue_lock);
320 lockdep_assert_held(&blkcg->lock); 379 lockdep_assert_held(&blkcg->lock);
321 380
322 /* Something wrong if we are trying to remove same group twice */ 381 /* Something wrong if we are trying to remove same group twice */
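
The new blkg_lookup_create() wrapper takes q->queue_lock itself, with IRQs saved since it can now be reached from bio submission context, so callers only need the RCU read lock. A minimal illustrative call site, assuming the css has already been resolved (this mirrors bio_associate_blkg_from_css() in the bio.c hunk earlier):

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css), q);
	/* on failure this falls back to the closest live ancestor,
	 * ultimately q->root_blkg, rather than an error pointer */
	rcu_read_unlock();
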
@@ -353,7 +412,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
353 * Put the reference taken at the time of creation so that when all 412 * Put the reference taken at the time of creation so that when all
354 * queues are gone, group can be destroyed. 413 * queues are gone, group can be destroyed.
355 */ 414 */
356 blkg_put(blkg); 415 percpu_ref_kill(&blkg->refcnt);
357} 416}
358 417
359/** 418/**
@@ -366,8 +425,7 @@ static void blkg_destroy_all(struct request_queue *q)
366{ 425{
367 struct blkcg_gq *blkg, *n; 426 struct blkcg_gq *blkg, *n;
368 427
369 lockdep_assert_held(q->queue_lock); 428 spin_lock_irq(&q->queue_lock);
370
371 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { 429 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
372 struct blkcg *blkcg = blkg->blkcg; 430 struct blkcg *blkcg = blkg->blkcg;
373 431
@@ -377,7 +435,7 @@ static void blkg_destroy_all(struct request_queue *q)
377 } 435 }
378 436
379 q->root_blkg = NULL; 437 q->root_blkg = NULL;
380 q->root_rl.blkg = NULL; 438 spin_unlock_irq(&q->queue_lock);
381} 439}
382 440
383/* 441/*
@@ -403,41 +461,6 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
403} 461}
404EXPORT_SYMBOL_GPL(__blkg_release_rcu); 462EXPORT_SYMBOL_GPL(__blkg_release_rcu);
405 463
406/*
407 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
408 * because the root blkg uses @q->root_rl instead of its own rl.
409 */
410struct request_list *__blk_queue_next_rl(struct request_list *rl,
411 struct request_queue *q)
412{
413 struct list_head *ent;
414 struct blkcg_gq *blkg;
415
416 /*
417 * Determine the current blkg list_head. The first entry is
418 * root_rl which is off @q->blkg_list and mapped to the head.
419 */
420 if (rl == &q->root_rl) {
421 ent = &q->blkg_list;
422 /* There are no more block groups, hence no request lists */
423 if (list_empty(ent))
424 return NULL;
425 } else {
426 blkg = container_of(rl, struct blkcg_gq, rl);
427 ent = &blkg->q_node;
428 }
429
430 /* walk to the next list_head, skip root blkcg */
431 ent = ent->next;
432 if (ent == &q->root_blkg->q_node)
433 ent = ent->next;
434 if (ent == &q->blkg_list)
435 return NULL;
436
437 blkg = container_of(ent, struct blkcg_gq, q_node);
438 return &blkg->rl;
439}
440
441static int blkcg_reset_stats(struct cgroup_subsys_state *css, 464static int blkcg_reset_stats(struct cgroup_subsys_state *css,
442 struct cftype *cftype, u64 val) 465 struct cftype *cftype, u64 val)
443{ 466{
@@ -477,7 +500,6 @@ const char *blkg_dev_name(struct blkcg_gq *blkg)
477 return dev_name(blkg->q->backing_dev_info->dev); 500 return dev_name(blkg->q->backing_dev_info->dev);
478 return NULL; 501 return NULL;
479} 502}
480EXPORT_SYMBOL_GPL(blkg_dev_name);
481 503
482/** 504/**
483 * blkcg_print_blkgs - helper for printing per-blkg data 505 * blkcg_print_blkgs - helper for printing per-blkg data
@@ -508,10 +530,10 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
508 530
509 rcu_read_lock(); 531 rcu_read_lock();
510 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { 532 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
511 spin_lock_irq(blkg->q->queue_lock); 533 spin_lock_irq(&blkg->q->queue_lock);
512 if (blkcg_policy_enabled(blkg->q, pol)) 534 if (blkcg_policy_enabled(blkg->q, pol))
513 total += prfill(sf, blkg->pd[pol->plid], data); 535 total += prfill(sf, blkg->pd[pol->plid], data);
514 spin_unlock_irq(blkg->q->queue_lock); 536 spin_unlock_irq(&blkg->q->queue_lock);
515 } 537 }
516 rcu_read_unlock(); 538 rcu_read_unlock();
517 539
@@ -709,7 +731,7 @@ u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
709 struct cgroup_subsys_state *pos_css; 731 struct cgroup_subsys_state *pos_css;
710 u64 sum = 0; 732 u64 sum = 0;
711 733
712 lockdep_assert_held(blkg->q->queue_lock); 734 lockdep_assert_held(&blkg->q->queue_lock);
713 735
714 rcu_read_lock(); 736 rcu_read_lock();
715 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { 737 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
@@ -752,7 +774,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
752 struct blkg_rwstat sum = { }; 774 struct blkg_rwstat sum = { };
753 int i; 775 int i;
754 776
755 lockdep_assert_held(blkg->q->queue_lock); 777 lockdep_assert_held(&blkg->q->queue_lock);
756 778
757 rcu_read_lock(); 779 rcu_read_lock();
758 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { 780 blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
@@ -783,18 +805,10 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
783 struct request_queue *q) 805 struct request_queue *q)
784{ 806{
785 WARN_ON_ONCE(!rcu_read_lock_held()); 807 WARN_ON_ONCE(!rcu_read_lock_held());
786 lockdep_assert_held(q->queue_lock); 808 lockdep_assert_held(&q->queue_lock);
787 809
788 if (!blkcg_policy_enabled(q, pol)) 810 if (!blkcg_policy_enabled(q, pol))
789 return ERR_PTR(-EOPNOTSUPP); 811 return ERR_PTR(-EOPNOTSUPP);
790
791 /*
792 * This could be the first entry point of blkcg implementation and
793 * we shouldn't allow anything to go through for a bypassing queue.
794 */
795 if (unlikely(blk_queue_bypass(q)))
796 return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
797
798 return __blkg_lookup(blkcg, q, true /* update_hint */); 812 return __blkg_lookup(blkcg, q, true /* update_hint */);
799} 813}
800 814
@@ -812,7 +826,7 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
812 */ 826 */
813int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, 827int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
814 char *input, struct blkg_conf_ctx *ctx) 828 char *input, struct blkg_conf_ctx *ctx)
815 __acquires(rcu) __acquires(disk->queue->queue_lock) 829 __acquires(rcu) __acquires(&disk->queue->queue_lock)
816{ 830{
817 struct gendisk *disk; 831 struct gendisk *disk;
818 struct request_queue *q; 832 struct request_queue *q;
@@ -840,7 +854,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
840 q = disk->queue; 854 q = disk->queue;
841 855
842 rcu_read_lock(); 856 rcu_read_lock();
843 spin_lock_irq(q->queue_lock); 857 spin_lock_irq(&q->queue_lock);
844 858
845 blkg = blkg_lookup_check(blkcg, pol, q); 859 blkg = blkg_lookup_check(blkcg, pol, q);
846 if (IS_ERR(blkg)) { 860 if (IS_ERR(blkg)) {
@@ -867,7 +881,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
867 } 881 }
868 882
869 /* Drop locks to do new blkg allocation with GFP_KERNEL. */ 883 /* Drop locks to do new blkg allocation with GFP_KERNEL. */
870 spin_unlock_irq(q->queue_lock); 884 spin_unlock_irq(&q->queue_lock);
871 rcu_read_unlock(); 885 rcu_read_unlock();
872 886
873 new_blkg = blkg_alloc(pos, q, GFP_KERNEL); 887 new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
@@ -877,7 +891,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
877 } 891 }
878 892
879 rcu_read_lock(); 893 rcu_read_lock();
880 spin_lock_irq(q->queue_lock); 894 spin_lock_irq(&q->queue_lock);
881 895
882 blkg = blkg_lookup_check(pos, pol, q); 896 blkg = blkg_lookup_check(pos, pol, q);
883 if (IS_ERR(blkg)) { 897 if (IS_ERR(blkg)) {
@@ -905,7 +919,7 @@ success:
905 return 0; 919 return 0;
906 920
907fail_unlock: 921fail_unlock:
908 spin_unlock_irq(q->queue_lock); 922 spin_unlock_irq(&q->queue_lock);
909 rcu_read_unlock(); 923 rcu_read_unlock();
910fail: 924fail:
911 put_disk_and_module(disk); 925 put_disk_and_module(disk);
@@ -921,7 +935,6 @@ fail:
921 } 935 }
922 return ret; 936 return ret;
923} 937}
924EXPORT_SYMBOL_GPL(blkg_conf_prep);
925 938
926/** 939/**
927 * blkg_conf_finish - finish up per-blkg config update 940 * blkg_conf_finish - finish up per-blkg config update
@@ -931,13 +944,12 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
931 * with blkg_conf_prep(). 944 * with blkg_conf_prep().
932 */ 945 */
933void blkg_conf_finish(struct blkg_conf_ctx *ctx) 946void blkg_conf_finish(struct blkg_conf_ctx *ctx)
934 __releases(ctx->disk->queue->queue_lock) __releases(rcu) 947 __releases(&ctx->disk->queue->queue_lock) __releases(rcu)
935{ 948{
936 spin_unlock_irq(ctx->disk->queue->queue_lock); 949 spin_unlock_irq(&ctx->disk->queue->queue_lock);
937 rcu_read_unlock(); 950 rcu_read_unlock();
938 put_disk_and_module(ctx->disk); 951 put_disk_and_module(ctx->disk);
939} 952}
940EXPORT_SYMBOL_GPL(blkg_conf_finish);
941 953
942static int blkcg_print_stat(struct seq_file *sf, void *v) 954static int blkcg_print_stat(struct seq_file *sf, void *v)
943{ 955{
@@ -967,7 +979,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
967 */ 979 */
968 off += scnprintf(buf+off, size-off, "%s ", dname); 980 off += scnprintf(buf+off, size-off, "%s ", dname);
969 981
970 spin_lock_irq(blkg->q->queue_lock); 982 spin_lock_irq(&blkg->q->queue_lock);
971 983
972 rwstat = blkg_rwstat_recursive_sum(blkg, NULL, 984 rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
973 offsetof(struct blkcg_gq, stat_bytes)); 985 offsetof(struct blkcg_gq, stat_bytes));
@@ -981,7 +993,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
981 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); 993 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
982 dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); 994 dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
983 995
984 spin_unlock_irq(blkg->q->queue_lock); 996 spin_unlock_irq(&blkg->q->queue_lock);
985 997
986 if (rbytes || wbytes || rios || wios) { 998 if (rbytes || wbytes || rios || wios) {
987 has_stats = true; 999 has_stats = true;
@@ -1102,9 +1114,9 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg)
1102 struct blkcg_gq, blkcg_node); 1114 struct blkcg_gq, blkcg_node);
1103 struct request_queue *q = blkg->q; 1115 struct request_queue *q = blkg->q;
1104 1116
1105 if (spin_trylock(q->queue_lock)) { 1117 if (spin_trylock(&q->queue_lock)) {
1106 blkg_destroy(blkg); 1118 blkg_destroy(blkg);
1107 spin_unlock(q->queue_lock); 1119 spin_unlock(&q->queue_lock);
1108 } else { 1120 } else {
1109 spin_unlock_irq(&blkcg->lock); 1121 spin_unlock_irq(&blkcg->lock);
1110 cpu_relax(); 1122 cpu_relax();
@@ -1225,36 +1237,31 @@ int blkcg_init_queue(struct request_queue *q)
1225 1237
1226 /* Make sure the root blkg exists. */ 1238 /* Make sure the root blkg exists. */
1227 rcu_read_lock(); 1239 rcu_read_lock();
1228 spin_lock_irq(q->queue_lock); 1240 spin_lock_irq(&q->queue_lock);
1229 blkg = blkg_create(&blkcg_root, q, new_blkg); 1241 blkg = blkg_create(&blkcg_root, q, new_blkg);
1230 if (IS_ERR(blkg)) 1242 if (IS_ERR(blkg))
1231 goto err_unlock; 1243 goto err_unlock;
1232 q->root_blkg = blkg; 1244 q->root_blkg = blkg;
1233 q->root_rl.blkg = blkg; 1245 spin_unlock_irq(&q->queue_lock);
1234 spin_unlock_irq(q->queue_lock);
1235 rcu_read_unlock(); 1246 rcu_read_unlock();
1236 1247
1237 if (preloaded) 1248 if (preloaded)
1238 radix_tree_preload_end(); 1249 radix_tree_preload_end();
1239 1250
1240 ret = blk_iolatency_init(q); 1251 ret = blk_iolatency_init(q);
1241 if (ret) { 1252 if (ret)
1242 spin_lock_irq(q->queue_lock); 1253 goto err_destroy_all;
1243 blkg_destroy_all(q);
1244 spin_unlock_irq(q->queue_lock);
1245 return ret;
1246 }
1247 1254
1248 ret = blk_throtl_init(q); 1255 ret = blk_throtl_init(q);
1249 if (ret) { 1256 if (ret)
1250 spin_lock_irq(q->queue_lock); 1257 goto err_destroy_all;
1251 blkg_destroy_all(q); 1258 return 0;
1252 spin_unlock_irq(q->queue_lock);
1253 }
1254 return ret;
1255 1259
1260err_destroy_all:
1261 blkg_destroy_all(q);
1262 return ret;
1256err_unlock: 1263err_unlock:
1257 spin_unlock_irq(q->queue_lock); 1264 spin_unlock_irq(&q->queue_lock);
1258 rcu_read_unlock(); 1265 rcu_read_unlock();
1259 if (preloaded) 1266 if (preloaded)
1260 radix_tree_preload_end(); 1267 radix_tree_preload_end();
@@ -1269,7 +1276,7 @@ err_unlock:
1269 */ 1276 */
1270void blkcg_drain_queue(struct request_queue *q) 1277void blkcg_drain_queue(struct request_queue *q)
1271{ 1278{
1272 lockdep_assert_held(q->queue_lock); 1279 lockdep_assert_held(&q->queue_lock);
1273 1280
1274 /* 1281 /*
1275 * @q could be exiting and already have destroyed all blkgs as 1282 * @q could be exiting and already have destroyed all blkgs as
@@ -1289,10 +1296,7 @@ void blkcg_drain_queue(struct request_queue *q)
1289 */ 1296 */
1290void blkcg_exit_queue(struct request_queue *q) 1297void blkcg_exit_queue(struct request_queue *q)
1291{ 1298{
1292 spin_lock_irq(q->queue_lock);
1293 blkg_destroy_all(q); 1299 blkg_destroy_all(q);
1294 spin_unlock_irq(q->queue_lock);
1295
1296 blk_throtl_exit(q); 1300 blk_throtl_exit(q);
1297} 1301}
1298 1302
@@ -1396,10 +1400,8 @@ int blkcg_activate_policy(struct request_queue *q,
1396 if (blkcg_policy_enabled(q, pol)) 1400 if (blkcg_policy_enabled(q, pol))
1397 return 0; 1401 return 0;
1398 1402
1399 if (q->mq_ops) 1403 if (queue_is_mq(q))
1400 blk_mq_freeze_queue(q); 1404 blk_mq_freeze_queue(q);
1401 else
1402 blk_queue_bypass_start(q);
1403pd_prealloc: 1405pd_prealloc:
1404 if (!pd_prealloc) { 1406 if (!pd_prealloc) {
1405 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); 1407 pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1409,7 +1411,7 @@ pd_prealloc:
1409 } 1411 }
1410 } 1412 }
1411 1413
1412 spin_lock_irq(q->queue_lock); 1414 spin_lock_irq(&q->queue_lock);
1413 1415
1414 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1416 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1415 struct blkg_policy_data *pd; 1417 struct blkg_policy_data *pd;
@@ -1421,7 +1423,7 @@ pd_prealloc:
1421 if (!pd) 1423 if (!pd)
1422 swap(pd, pd_prealloc); 1424 swap(pd, pd_prealloc);
1423 if (!pd) { 1425 if (!pd) {
1424 spin_unlock_irq(q->queue_lock); 1426 spin_unlock_irq(&q->queue_lock);
1425 goto pd_prealloc; 1427 goto pd_prealloc;
1426 } 1428 }
1427 1429
@@ -1435,12 +1437,10 @@ pd_prealloc:
1435 __set_bit(pol->plid, q->blkcg_pols); 1437 __set_bit(pol->plid, q->blkcg_pols);
1436 ret = 0; 1438 ret = 0;
1437 1439
1438 spin_unlock_irq(q->queue_lock); 1440 spin_unlock_irq(&q->queue_lock);
1439out_bypass_end: 1441out_bypass_end:
1440 if (q->mq_ops) 1442 if (queue_is_mq(q))
1441 blk_mq_unfreeze_queue(q); 1443 blk_mq_unfreeze_queue(q);
1442 else
1443 blk_queue_bypass_end(q);
1444 if (pd_prealloc) 1444 if (pd_prealloc)
1445 pol->pd_free_fn(pd_prealloc); 1445 pol->pd_free_fn(pd_prealloc);
1446 return ret; 1446 return ret;
@@ -1463,12 +1463,10 @@ void blkcg_deactivate_policy(struct request_queue *q,
1463 if (!blkcg_policy_enabled(q, pol)) 1463 if (!blkcg_policy_enabled(q, pol))
1464 return; 1464 return;
1465 1465
1466 if (q->mq_ops) 1466 if (queue_is_mq(q))
1467 blk_mq_freeze_queue(q); 1467 blk_mq_freeze_queue(q);
1468 else
1469 blk_queue_bypass_start(q);
1470 1468
1471 spin_lock_irq(q->queue_lock); 1469 spin_lock_irq(&q->queue_lock);
1472 1470
1473 __clear_bit(pol->plid, q->blkcg_pols); 1471 __clear_bit(pol->plid, q->blkcg_pols);
1474 1472
@@ -1481,12 +1479,10 @@ void blkcg_deactivate_policy(struct request_queue *q,
1481 } 1479 }
1482 } 1480 }
1483 1481
1484 spin_unlock_irq(q->queue_lock); 1482 spin_unlock_irq(&q->queue_lock);
1485 1483
1486 if (q->mq_ops) 1484 if (queue_is_mq(q))
1487 blk_mq_unfreeze_queue(q); 1485 blk_mq_unfreeze_queue(q);
1488 else
1489 blk_queue_bypass_end(q);
1490} 1486}
1491EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); 1487EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1492 1488
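
blkcg_activate_policy() and blkcg_deactivate_policy() above replace the q->mq_ops checks plus the legacy bypass dance with queue_is_mq() and queue freezing only. queue_is_mq() is presumably just a readability wrapper along these lines:

	static inline bool queue_is_mq(struct request_queue *q)
	{
		return q->mq_ops;
	}
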
@@ -1748,8 +1744,7 @@ void blkcg_maybe_throttle_current(void)
1748 blkg = blkg_lookup(blkcg, q); 1744 blkg = blkg_lookup(blkcg, q);
1749 if (!blkg) 1745 if (!blkg)
1750 goto out; 1746 goto out;
1751 blkg = blkg_try_get(blkg); 1747 if (!blkg_tryget(blkg))
1752 if (!blkg)
1753 goto out; 1748 goto out;
1754 rcu_read_unlock(); 1749 rcu_read_unlock();
1755 1750
@@ -1761,7 +1756,6 @@ out:
1761 rcu_read_unlock(); 1756 rcu_read_unlock();
1762 blk_put_queue(q); 1757 blk_put_queue(q);
1763} 1758}
1764EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
1765 1759
1766/** 1760/**
1767 * blkcg_schedule_throttle - this task needs to check for throttling 1761 * blkcg_schedule_throttle - this task needs to check for throttling
@@ -1795,7 +1789,6 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1795 current->use_memdelay = use_memdelay; 1789 current->use_memdelay = use_memdelay;
1796 set_notify_resume(current); 1790 set_notify_resume(current);
1797} 1791}
1798EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
1799 1792
1800/** 1793/**
1801 * blkcg_add_delay - add delay to this blkg 1794 * blkcg_add_delay - add delay to this blkg
@@ -1810,7 +1803,6 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1810 blkcg_scale_delay(blkg, now); 1803 blkcg_scale_delay(blkg, now);
1811 atomic64_add(delta, &blkg->delay_nsec); 1804 atomic64_add(delta, &blkg->delay_nsec);
1812} 1805}
1813EXPORT_SYMBOL_GPL(blkcg_add_delay);
1814 1806
1815module_param(blkcg_debug_stats, bool, 0644); 1807module_param(blkcg_debug_stats, bool, 0644);
1816MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); 1808MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index deb56932f8c4..c78042975737 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -58,11 +58,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
58DEFINE_IDA(blk_queue_ida); 58DEFINE_IDA(blk_queue_ida);
59 59
60/* 60/*
61 * For the allocated request tables
62 */
63struct kmem_cache *request_cachep;
64
65/*
66 * For queue allocation 61 * For queue allocation
67 */ 62 */
68struct kmem_cache *blk_requestq_cachep; 63struct kmem_cache *blk_requestq_cachep;
@@ -79,11 +74,7 @@ static struct workqueue_struct *kblockd_workqueue;
79 */ 74 */
80void blk_queue_flag_set(unsigned int flag, struct request_queue *q) 75void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
81{ 76{
82 unsigned long flags; 77 set_bit(flag, &q->queue_flags);
83
84 spin_lock_irqsave(q->queue_lock, flags);
85 queue_flag_set(flag, q);
86 spin_unlock_irqrestore(q->queue_lock, flags);
87} 78}
88EXPORT_SYMBOL(blk_queue_flag_set); 79EXPORT_SYMBOL(blk_queue_flag_set);
89 80
@@ -94,11 +85,7 @@ EXPORT_SYMBOL(blk_queue_flag_set);
94 */ 85 */
95void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) 86void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
96{ 87{
97 unsigned long flags; 88 clear_bit(flag, &q->queue_flags);
98
99 spin_lock_irqsave(q->queue_lock, flags);
100 queue_flag_clear(flag, q);
101 spin_unlock_irqrestore(q->queue_lock, flags);
102} 89}
103EXPORT_SYMBOL(blk_queue_flag_clear); 90EXPORT_SYMBOL(blk_queue_flag_clear);
104 91
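
With the queue_lock no longer protecting queue_flags, the helpers above (and blk_queue_flag_test_and_set() in the next hunk) reduce to plain atomic bitops, so callers need no locking at all. Purely as an illustration:

	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
	blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
	if (blk_queue_flag_test_and_set(QUEUE_FLAG_REGISTERED, q))
		return;		/* someone else won the race */
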
@@ -112,85 +99,15 @@ EXPORT_SYMBOL(blk_queue_flag_clear);
112 */ 99 */
113bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) 100bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
114{ 101{
115 unsigned long flags; 102 return test_and_set_bit(flag, &q->queue_flags);
116 bool res;
117
118 spin_lock_irqsave(q->queue_lock, flags);
119 res = queue_flag_test_and_set(flag, q);
120 spin_unlock_irqrestore(q->queue_lock, flags);
121
122 return res;
123} 103}
124EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); 104EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
125 105
126/**
127 * blk_queue_flag_test_and_clear - atomically test and clear a queue flag
128 * @flag: flag to be cleared
129 * @q: request queue
130 *
131 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
132 * the flag was set.
133 */
134bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
135{
136 unsigned long flags;
137 bool res;
138
139 spin_lock_irqsave(q->queue_lock, flags);
140 res = queue_flag_test_and_clear(flag, q);
141 spin_unlock_irqrestore(q->queue_lock, flags);
142
143 return res;
144}
145EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
146
147static void blk_clear_congested(struct request_list *rl, int sync)
148{
149#ifdef CONFIG_CGROUP_WRITEBACK
150 clear_wb_congested(rl->blkg->wb_congested, sync);
151#else
152 /*
153 * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
154 * flip its congestion state for events on other blkcgs.
155 */
156 if (rl == &rl->q->root_rl)
157 clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
158#endif
159}
160
161static void blk_set_congested(struct request_list *rl, int sync)
162{
163#ifdef CONFIG_CGROUP_WRITEBACK
164 set_wb_congested(rl->blkg->wb_congested, sync);
165#else
166 /* see blk_clear_congested() */
167 if (rl == &rl->q->root_rl)
168 set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
169#endif
170}
171
172void blk_queue_congestion_threshold(struct request_queue *q)
173{
174 int nr;
175
176 nr = q->nr_requests - (q->nr_requests / 8) + 1;
177 if (nr > q->nr_requests)
178 nr = q->nr_requests;
179 q->nr_congestion_on = nr;
180
181 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
182 if (nr < 1)
183 nr = 1;
184 q->nr_congestion_off = nr;
185}
186
187void blk_rq_init(struct request_queue *q, struct request *rq) 106void blk_rq_init(struct request_queue *q, struct request *rq)
188{ 107{
189 memset(rq, 0, sizeof(*rq)); 108 memset(rq, 0, sizeof(*rq));
190 109
191 INIT_LIST_HEAD(&rq->queuelist); 110 INIT_LIST_HEAD(&rq->queuelist);
192 INIT_LIST_HEAD(&rq->timeout_list);
193 rq->cpu = -1;
194 rq->q = q; 111 rq->q = q;
195 rq->__sector = (sector_t) -1; 112 rq->__sector = (sector_t) -1;
196 INIT_HLIST_NODE(&rq->hash); 113 INIT_HLIST_NODE(&rq->hash);
@@ -256,10 +173,11 @@ static void print_req_error(struct request *req, blk_status_t status)
256 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) 173 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
257 return; 174 return;
258 175
259 printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", 176 printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n",
260 __func__, blk_errors[idx].name, req->rq_disk ? 177 __func__, blk_errors[idx].name,
261 req->rq_disk->disk_name : "?", 178 req->rq_disk ? req->rq_disk->disk_name : "?",
262 (unsigned long long)blk_rq_pos(req)); 179 (unsigned long long)blk_rq_pos(req),
180 req->cmd_flags);
263} 181}
264 182
265static void req_bio_endio(struct request *rq, struct bio *bio, 183static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -292,99 +210,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
292} 210}
293EXPORT_SYMBOL(blk_dump_rq_flags); 211EXPORT_SYMBOL(blk_dump_rq_flags);
294 212
295static void blk_delay_work(struct work_struct *work)
296{
297 struct request_queue *q;
298
299 q = container_of(work, struct request_queue, delay_work.work);
300 spin_lock_irq(q->queue_lock);
301 __blk_run_queue(q);
302 spin_unlock_irq(q->queue_lock);
303}
304
305/**
306 * blk_delay_queue - restart queueing after defined interval
307 * @q: The &struct request_queue in question
308 * @msecs: Delay in msecs
309 *
310 * Description:
311 * Sometimes queueing needs to be postponed for a little while, to allow
312 * resources to come back. This function will make sure that queueing is
313 * restarted around the specified time.
314 */
315void blk_delay_queue(struct request_queue *q, unsigned long msecs)
316{
317 lockdep_assert_held(q->queue_lock);
318 WARN_ON_ONCE(q->mq_ops);
319
320 if (likely(!blk_queue_dead(q)))
321 queue_delayed_work(kblockd_workqueue, &q->delay_work,
322 msecs_to_jiffies(msecs));
323}
324EXPORT_SYMBOL(blk_delay_queue);
325
326/**
327 * blk_start_queue_async - asynchronously restart a previously stopped queue
328 * @q: The &struct request_queue in question
329 *
330 * Description:
331 * blk_start_queue_async() will clear the stop flag on the queue, and
332 * ensure that the request_fn for the queue is run from an async
333 * context.
334 **/
335void blk_start_queue_async(struct request_queue *q)
336{
337 lockdep_assert_held(q->queue_lock);
338 WARN_ON_ONCE(q->mq_ops);
339
340 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
341 blk_run_queue_async(q);
342}
343EXPORT_SYMBOL(blk_start_queue_async);
344
345/**
346 * blk_start_queue - restart a previously stopped queue
347 * @q: The &struct request_queue in question
348 *
349 * Description:
350 * blk_start_queue() will clear the stop flag on the queue, and call
351 * the request_fn for the queue if it was in a stopped state when
352 * entered. Also see blk_stop_queue().
353 **/
354void blk_start_queue(struct request_queue *q)
355{
356 lockdep_assert_held(q->queue_lock);
357 WARN_ON_ONCE(q->mq_ops);
358
359 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
360 __blk_run_queue(q);
361}
362EXPORT_SYMBOL(blk_start_queue);
363
364/**
365 * blk_stop_queue - stop a queue
366 * @q: The &struct request_queue in question
367 *
368 * Description:
369 * The Linux block layer assumes that a block driver will consume all
370 * entries on the request queue when the request_fn strategy is called.
371 * Often this will not happen, because of hardware limitations (queue
372 * depth settings). If a device driver gets a 'queue full' response,
373 * or if it simply chooses not to queue more I/O at one point, it can
374 * call this function to prevent the request_fn from being called until
375 * the driver has signalled it's ready to go again. This happens by calling
376 * blk_start_queue() to restart queue operations.
377 **/
378void blk_stop_queue(struct request_queue *q)
379{
380 lockdep_assert_held(q->queue_lock);
381 WARN_ON_ONCE(q->mq_ops);
382
383 cancel_delayed_work(&q->delay_work);
384 queue_flag_set(QUEUE_FLAG_STOPPED, q);
385}
386EXPORT_SYMBOL(blk_stop_queue);
387
388/** 213/**
389 * blk_sync_queue - cancel any pending callbacks on a queue 214 * blk_sync_queue - cancel any pending callbacks on a queue
390 * @q: the queue 215 * @q: the queue
@@ -408,15 +233,13 @@ void blk_sync_queue(struct request_queue *q)
408 del_timer_sync(&q->timeout); 233 del_timer_sync(&q->timeout);
409 cancel_work_sync(&q->timeout_work); 234 cancel_work_sync(&q->timeout_work);
410 235
411 if (q->mq_ops) { 236 if (queue_is_mq(q)) {
412 struct blk_mq_hw_ctx *hctx; 237 struct blk_mq_hw_ctx *hctx;
413 int i; 238 int i;
414 239
415 cancel_delayed_work_sync(&q->requeue_work); 240 cancel_delayed_work_sync(&q->requeue_work);
416 queue_for_each_hw_ctx(q, hctx, i) 241 queue_for_each_hw_ctx(q, hctx, i)
417 cancel_delayed_work_sync(&hctx->run_work); 242 cancel_delayed_work_sync(&hctx->run_work);
418 } else {
419 cancel_delayed_work_sync(&q->delay_work);
420 } 243 }
421} 244}
422EXPORT_SYMBOL(blk_sync_queue); 245EXPORT_SYMBOL(blk_sync_queue);
@@ -442,250 +265,12 @@ void blk_clear_pm_only(struct request_queue *q)
442} 265}
443EXPORT_SYMBOL_GPL(blk_clear_pm_only); 266EXPORT_SYMBOL_GPL(blk_clear_pm_only);
444 267
445/**
446 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
447 * @q: The queue to run
448 *
449 * Description:
450 * Invoke request handling on a queue if there are any pending requests.
451 * May be used to restart request handling after a request has completed.
452 * This variant runs the queue whether or not the queue has been
453 * stopped. Must be called with the queue lock held and interrupts
454 * disabled. See also @blk_run_queue.
455 */
456inline void __blk_run_queue_uncond(struct request_queue *q)
457{
458 lockdep_assert_held(q->queue_lock);
459 WARN_ON_ONCE(q->mq_ops);
460
461 if (unlikely(blk_queue_dead(q)))
462 return;
463
464 /*
465 * Some request_fn implementations, e.g. scsi_request_fn(), unlock
466 * the queue lock internally. As a result multiple threads may be
467 * running such a request function concurrently. Keep track of the
468 * number of active request_fn invocations such that blk_drain_queue()
469 * can wait until all these request_fn calls have finished.
470 */
471 q->request_fn_active++;
472 q->request_fn(q);
473 q->request_fn_active--;
474}
475EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
476
477/**
478 * __blk_run_queue - run a single device queue
479 * @q: The queue to run
480 *
481 * Description:
482 * See @blk_run_queue.
483 */
484void __blk_run_queue(struct request_queue *q)
485{
486 lockdep_assert_held(q->queue_lock);
487 WARN_ON_ONCE(q->mq_ops);
488
489 if (unlikely(blk_queue_stopped(q)))
490 return;
491
492 __blk_run_queue_uncond(q);
493}
494EXPORT_SYMBOL(__blk_run_queue);
495
496/**
497 * blk_run_queue_async - run a single device queue in workqueue context
498 * @q: The queue to run
499 *
500 * Description:
501 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
502 * of us.
503 *
504 * Note:
505 * Since it is not allowed to run q->delay_work after blk_cleanup_queue()
506 * has canceled q->delay_work, callers must hold the queue lock to avoid
507 * race conditions between blk_cleanup_queue() and blk_run_queue_async().
508 */
509void blk_run_queue_async(struct request_queue *q)
510{
511 lockdep_assert_held(q->queue_lock);
512 WARN_ON_ONCE(q->mq_ops);
513
514 if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
515 mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
516}
517EXPORT_SYMBOL(blk_run_queue_async);
518
519/**
520 * blk_run_queue - run a single device queue
521 * @q: The queue to run
522 *
523 * Description:
524 * Invoke request handling on this queue, if it has pending work to do.
525 * May be used to restart queueing when a request has completed.
526 */
527void blk_run_queue(struct request_queue *q)
528{
529 unsigned long flags;
530
531 WARN_ON_ONCE(q->mq_ops);
532
533 spin_lock_irqsave(q->queue_lock, flags);
534 __blk_run_queue(q);
535 spin_unlock_irqrestore(q->queue_lock, flags);
536}
537EXPORT_SYMBOL(blk_run_queue);
538
539void blk_put_queue(struct request_queue *q) 268void blk_put_queue(struct request_queue *q)
540{ 269{
541 kobject_put(&q->kobj); 270 kobject_put(&q->kobj);
542} 271}
543EXPORT_SYMBOL(blk_put_queue); 272EXPORT_SYMBOL(blk_put_queue);
544 273
545/**
546 * __blk_drain_queue - drain requests from request_queue
547 * @q: queue to drain
548 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
549 *
550 * Drain requests from @q. If @drain_all is set, all requests are drained.
551 * If not, only ELVPRIV requests are drained. The caller is responsible
552 * for ensuring that no new requests which need to be drained are queued.
553 */
554static void __blk_drain_queue(struct request_queue *q, bool drain_all)
555 __releases(q->queue_lock)
556 __acquires(q->queue_lock)
557{
558 int i;
559
560 lockdep_assert_held(q->queue_lock);
561 WARN_ON_ONCE(q->mq_ops);
562
563 while (true) {
564 bool drain = false;
565
566 /*
567 * The caller might be trying to drain @q before its
568 * elevator is initialized.
569 */
570 if (q->elevator)
571 elv_drain_elevator(q);
572
573 blkcg_drain_queue(q);
574
575 /*
576 * This function might be called on a queue which failed
577 * driver init after queue creation or is not yet fully
578 * active yet. Some drivers (e.g. fd and loop) get unhappy
579 * in such cases. Kick queue iff dispatch queue has
580 * something on it and @q has request_fn set.
581 */
582 if (!list_empty(&q->queue_head) && q->request_fn)
583 __blk_run_queue(q);
584
585 drain |= q->nr_rqs_elvpriv;
586 drain |= q->request_fn_active;
587
588 /*
589 * Unfortunately, requests are queued at and tracked from
590 * multiple places and there's no single counter which can
591 * be drained. Check all the queues and counters.
592 */
593 if (drain_all) {
594 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
595 drain |= !list_empty(&q->queue_head);
596 for (i = 0; i < 2; i++) {
597 drain |= q->nr_rqs[i];
598 drain |= q->in_flight[i];
599 if (fq)
600 drain |= !list_empty(&fq->flush_queue[i]);
601 }
602 }
603
604 if (!drain)
605 break;
606
607 spin_unlock_irq(q->queue_lock);
608
609 msleep(10);
610
611 spin_lock_irq(q->queue_lock);
612 }
613
614 /*
615 * With queue marked dead, any woken up waiter will fail the
616 * allocation path, so the wakeup chaining is lost and we're
617 * left with hung waiters. We need to wake up those waiters.
618 */
619 if (q->request_fn) {
620 struct request_list *rl;
621
622 blk_queue_for_each_rl(rl, q)
623 for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
624 wake_up_all(&rl->wait[i]);
625 }
626}
627
628void blk_drain_queue(struct request_queue *q)
629{
630 spin_lock_irq(q->queue_lock);
631 __blk_drain_queue(q, true);
632 spin_unlock_irq(q->queue_lock);
633}
634
635/**
636 * blk_queue_bypass_start - enter queue bypass mode
637 * @q: queue of interest
638 *
639 * In bypass mode, only the dispatch FIFO queue of @q is used. This
640 * function makes @q enter bypass mode and drains all requests which were
641 * throttled or issued before. On return, it's guaranteed that no request
642 * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
643 * inside queue or RCU read lock.
644 */
645void blk_queue_bypass_start(struct request_queue *q)
646{
647 WARN_ON_ONCE(q->mq_ops);
648
649 spin_lock_irq(q->queue_lock);
650 q->bypass_depth++;
651 queue_flag_set(QUEUE_FLAG_BYPASS, q);
652 spin_unlock_irq(q->queue_lock);
653
654 /*
655 * Queues start drained. Skip actual draining till init is
656 * complete. This avoids lengthy delays during queue init which
657 * can happen many times during boot.
658 */
659 if (blk_queue_init_done(q)) {
660 spin_lock_irq(q->queue_lock);
661 __blk_drain_queue(q, false);
662 spin_unlock_irq(q->queue_lock);
663
664 /* ensure blk_queue_bypass() is %true inside RCU read lock */
665 synchronize_rcu();
666 }
667}
668EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
669
670/**
671 * blk_queue_bypass_end - leave queue bypass mode
672 * @q: queue of interest
673 *
674 * Leave bypass mode and restore the normal queueing behavior.
675 *
676 * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
677 * this function is called for both blk-sq and blk-mq queues.
678 */
679void blk_queue_bypass_end(struct request_queue *q)
680{
681 spin_lock_irq(q->queue_lock);
682 if (!--q->bypass_depth)
683 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
684 WARN_ON_ONCE(q->bypass_depth < 0);
685 spin_unlock_irq(q->queue_lock);
686}
687EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
688
689void blk_set_queue_dying(struct request_queue *q) 274void blk_set_queue_dying(struct request_queue *q)
690{ 275{
691 blk_queue_flag_set(QUEUE_FLAG_DYING, q); 276 blk_queue_flag_set(QUEUE_FLAG_DYING, q);
@@ -697,20 +282,8 @@ void blk_set_queue_dying(struct request_queue *q)
697 */ 282 */
698 blk_freeze_queue_start(q); 283 blk_freeze_queue_start(q);
699 284
700 if (q->mq_ops) 285 if (queue_is_mq(q))
701 blk_mq_wake_waiters(q); 286 blk_mq_wake_waiters(q);
702 else {
703 struct request_list *rl;
704
705 spin_lock_irq(q->queue_lock);
706 blk_queue_for_each_rl(rl, q) {
707 if (rl->rq_pool) {
708 wake_up_all(&rl->wait[BLK_RW_SYNC]);
709 wake_up_all(&rl->wait[BLK_RW_ASYNC]);
710 }
711 }
712 spin_unlock_irq(q->queue_lock);
713 }
714 287
715 /* Make blk_queue_enter() reexamine the DYING flag. */ 288 /* Make blk_queue_enter() reexamine the DYING flag. */
716 wake_up_all(&q->mq_freeze_wq); 289 wake_up_all(&q->mq_freeze_wq);
@@ -755,29 +328,13 @@ void blk_exit_queue(struct request_queue *q)
755 */ 328 */
756void blk_cleanup_queue(struct request_queue *q) 329void blk_cleanup_queue(struct request_queue *q)
757{ 330{
758 spinlock_t *lock = q->queue_lock;
759
760 /* mark @q DYING, no new request or merges will be allowed afterwards */ 331 /* mark @q DYING, no new request or merges will be allowed afterwards */
761 mutex_lock(&q->sysfs_lock); 332 mutex_lock(&q->sysfs_lock);
762 blk_set_queue_dying(q); 333 blk_set_queue_dying(q);
763 spin_lock_irq(lock);
764
765 /*
766 * A dying queue is permanently in bypass mode till released. Note
767 * that, unlike blk_queue_bypass_start(), we aren't performing
768 * synchronize_rcu() after entering bypass mode to avoid the delay
769 * as some drivers create and destroy a lot of queues while
770 * probing. This is still safe because blk_release_queue() will be
771 * called only after the queue refcnt drops to zero and nothing,
772 * RCU or not, would be traversing the queue by then.
773 */
774 q->bypass_depth++;
775 queue_flag_set(QUEUE_FLAG_BYPASS, q);
776 334
777 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 335 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
778 queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 336 blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
779 queue_flag_set(QUEUE_FLAG_DYING, q); 337 blk_queue_flag_set(QUEUE_FLAG_DYING, q);
780 spin_unlock_irq(lock);
781 mutex_unlock(&q->sysfs_lock); 338 mutex_unlock(&q->sysfs_lock);
782 339
783 /* 340 /*
@@ -788,9 +345,7 @@ void blk_cleanup_queue(struct request_queue *q)
788 345
789 rq_qos_exit(q); 346 rq_qos_exit(q);
790 347
791 spin_lock_irq(lock); 348 blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
792 queue_flag_set(QUEUE_FLAG_DEAD, q);
793 spin_unlock_irq(lock);
794 349
795 /* 350 /*
796 * make sure all in-progress dispatch are completed because 351 * make sure all in-progress dispatch are completed because
@@ -801,7 +356,7 @@ void blk_cleanup_queue(struct request_queue *q)
801 * We rely on driver to deal with the race in case that queue 356 * We rely on driver to deal with the race in case that queue
802 * initialization isn't done. 357 * initialization isn't done.
803 */ 358 */
804 if (q->mq_ops && blk_queue_init_done(q)) 359 if (queue_is_mq(q) && blk_queue_init_done(q))
805 blk_mq_quiesce_queue(q); 360 blk_mq_quiesce_queue(q);
806 361
807 /* for synchronous bio-based driver finish in-flight integrity i/o */ 362 /* for synchronous bio-based driver finish in-flight integrity i/o */
@@ -819,98 +374,19 @@ void blk_cleanup_queue(struct request_queue *q)
819 374
820 blk_exit_queue(q); 375 blk_exit_queue(q);
821 376
822 if (q->mq_ops) 377 if (queue_is_mq(q))
823 blk_mq_free_queue(q); 378 blk_mq_free_queue(q);
824 percpu_ref_exit(&q->q_usage_counter);
825 379
826 spin_lock_irq(lock); 380 percpu_ref_exit(&q->q_usage_counter);
827 if (q->queue_lock != &q->__queue_lock)
828 q->queue_lock = &q->__queue_lock;
829 spin_unlock_irq(lock);
830 381
831 /* @q is and will stay empty, shutdown and put */ 382 /* @q is and will stay empty, shutdown and put */
832 blk_put_queue(q); 383 blk_put_queue(q);
833} 384}
834EXPORT_SYMBOL(blk_cleanup_queue); 385EXPORT_SYMBOL(blk_cleanup_queue);
835 386
836/* Allocate memory local to the request queue */
837static void *alloc_request_simple(gfp_t gfp_mask, void *data)
838{
839 struct request_queue *q = data;
840
841 return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
842}
843
844static void free_request_simple(void *element, void *data)
845{
846 kmem_cache_free(request_cachep, element);
847}
848
849static void *alloc_request_size(gfp_t gfp_mask, void *data)
850{
851 struct request_queue *q = data;
852 struct request *rq;
853
854 rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
855 q->node);
856 if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
857 kfree(rq);
858 rq = NULL;
859 }
860 return rq;
861}
862
863static void free_request_size(void *element, void *data)
864{
865 struct request_queue *q = data;
866
867 if (q->exit_rq_fn)
868 q->exit_rq_fn(q, element);
869 kfree(element);
870}
871
872int blk_init_rl(struct request_list *rl, struct request_queue *q,
873 gfp_t gfp_mask)
874{
875 if (unlikely(rl->rq_pool) || q->mq_ops)
876 return 0;
877
878 rl->q = q;
879 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
880 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
881 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
882 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
883
884 if (q->cmd_size) {
885 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
886 alloc_request_size, free_request_size,
887 q, gfp_mask, q->node);
888 } else {
889 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
890 alloc_request_simple, free_request_simple,
891 q, gfp_mask, q->node);
892 }
893 if (!rl->rq_pool)
894 return -ENOMEM;
895
896 if (rl != &q->root_rl)
897 WARN_ON_ONCE(!blk_get_queue(q));
898
899 return 0;
900}
901
902void blk_exit_rl(struct request_queue *q, struct request_list *rl)
903{
904 if (rl->rq_pool) {
905 mempool_destroy(rl->rq_pool);
906 if (rl != &q->root_rl)
907 blk_put_queue(q);
908 }
909}
910
911struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 387struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
912{ 388{
913 return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL); 389 return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
914} 390}
915EXPORT_SYMBOL(blk_alloc_queue); 391EXPORT_SYMBOL(blk_alloc_queue);
916 392
@@ -990,17 +466,8 @@ static void blk_rq_timed_out_timer(struct timer_list *t)
990 * blk_alloc_queue_node - allocate a request queue 466 * blk_alloc_queue_node - allocate a request queue
991 * @gfp_mask: memory allocation flags 467 * @gfp_mask: memory allocation flags
992 * @node_id: NUMA node to allocate memory from 468 * @node_id: NUMA node to allocate memory from
993 * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
994 * serialize calls to the legacy .request_fn() callback. Ignored for
995 * blk-mq request queues.
996 *
997 * Note: pass the queue lock as the third argument to this function instead of
998 * setting the queue lock pointer explicitly to avoid triggering a sporadic
999 * crash in the blkcg code. This function namely calls blkcg_init_queue() and
1000 * the queue lock pointer must be set before blkcg_init_queue() is called.
1001 */ 469 */
1002struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, 470struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1003 spinlock_t *lock)
1004{ 471{
1005 struct request_queue *q; 472 struct request_queue *q;
1006 int ret; 473 int ret;
@@ -1012,8 +479,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1012 479
1013 INIT_LIST_HEAD(&q->queue_head); 480 INIT_LIST_HEAD(&q->queue_head);
1014 q->last_merge = NULL; 481 q->last_merge = NULL;
1015 q->end_sector = 0;
1016 q->boundary_rq = NULL;
1017 482
1018 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 483 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
1019 if (q->id < 0) 484 if (q->id < 0)
@@ -1041,12 +506,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1041 laptop_mode_timer_fn, 0); 506 laptop_mode_timer_fn, 0);
1042 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); 507 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
1043 INIT_WORK(&q->timeout_work, NULL); 508 INIT_WORK(&q->timeout_work, NULL);
1044 INIT_LIST_HEAD(&q->timeout_list);
1045 INIT_LIST_HEAD(&q->icq_list); 509 INIT_LIST_HEAD(&q->icq_list);
1046#ifdef CONFIG_BLK_CGROUP 510#ifdef CONFIG_BLK_CGROUP
1047 INIT_LIST_HEAD(&q->blkg_list); 511 INIT_LIST_HEAD(&q->blkg_list);
1048#endif 512#endif
1049 INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
1050 513
1051 kobject_init(&q->kobj, &blk_queue_ktype); 514 kobject_init(&q->kobj, &blk_queue_ktype);
1052 515
@@ -1054,18 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
1054 mutex_init(&q->blk_trace_mutex); 517 mutex_init(&q->blk_trace_mutex);
1055#endif 518#endif
1056 mutex_init(&q->sysfs_lock); 519 mutex_init(&q->sysfs_lock);
1057 spin_lock_init(&q->__queue_lock); 520 spin_lock_init(&q->queue_lock);
1058
1059 q->queue_lock = lock ? : &q->__queue_lock;
1060
1061 /*
1062 * A queue starts its life with bypass turned on to avoid
1063 * unnecessary bypass on/off overhead and nasty surprises during
1064 * init. The initial bypass will be finished when the queue is
1065 * registered by blk_register_queue().
1066 */
1067 q->bypass_depth = 1;
1068 queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
1069 521
1070 init_waitqueue_head(&q->mq_freeze_wq); 522 init_waitqueue_head(&q->mq_freeze_wq);
1071 523
@@ -1099,105 +551,6 @@ fail_q:
1099} 551}
1100EXPORT_SYMBOL(blk_alloc_queue_node); 552EXPORT_SYMBOL(blk_alloc_queue_node);
1101 553
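With the @lock argument gone, a bio-based driver that allocates its own queue now passes only a gfp mask and a NUMA node. A minimal sketch of a caller on the new interface follows; the driver name and make_request handler are hypothetical and not part of this patch.

#include <linux/blkdev.h>

/* Sketch: bio-based queue setup on the two-argument blk_alloc_queue_node(). */
static blk_qc_t example_make_request(struct request_queue *q, struct bio *bio)
{
        /* ... service the bio directly ... */
        bio_endio(bio);
        return BLK_QC_T_NONE;
}

static struct request_queue *example_alloc_queue(int node)
{
        struct request_queue *q;

        q = blk_alloc_queue_node(GFP_KERNEL, node);     /* no spinlock argument any more */
        if (!q)
                return NULL;

        blk_queue_make_request(q, example_make_request);
        return q;
}
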
1102/**
1103 * blk_init_queue - prepare a request queue for use with a block device
1104 * @rfn: The function to be called to process requests that have been
1105 * placed on the queue.
1106 * @lock: Request queue spin lock
1107 *
1108 * Description:
1109 * If a block device wishes to use the standard request handling procedures,
1110 * which sorts requests and coalesces adjacent requests, then it must
1111 * call blk_init_queue(). The function @rfn will be called when there
1112 * are requests on the queue that need to be processed. If the device
1113 * supports plugging, then @rfn may not be called immediately when requests
1114 * are available on the queue, but may be called at some time later instead.
1115 * Plugged queues are generally unplugged when a buffer belonging to one
1116 * of the requests on the queue is needed, or due to memory pressure.
1117 *
1118 * @rfn is not required, or even expected, to remove all requests off the
1119 * queue, but only as many as it can handle at a time. If it does leave
1120 * requests on the queue, it is responsible for arranging that the requests
1121 * get dealt with eventually.
1122 *
1123 * The queue spin lock must be held while manipulating the requests on the
1124 * request queue; this lock will be taken also from interrupt context, so irq
1125 * disabling is needed for it.
1126 *
1127 * Function returns a pointer to the initialized request queue, or %NULL if
1128 * it didn't succeed.
1129 *
1130 * Note:
1131 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1132 * when the block device is deactivated (such as at module unload).
1133 **/
1134
1135struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1136{
1137 return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
1138}
1139EXPORT_SYMBOL(blk_init_queue);
1140
1141struct request_queue *
1142blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1143{
1144 struct request_queue *q;
1145
1146 q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
1147 if (!q)
1148 return NULL;
1149
1150 q->request_fn = rfn;
1151 if (blk_init_allocated_queue(q) < 0) {
1152 blk_cleanup_queue(q);
1153 return NULL;
1154 }
1155
1156 return q;
1157}
1158EXPORT_SYMBOL(blk_init_queue_node);
1159
1160static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
1161
1162
1163int blk_init_allocated_queue(struct request_queue *q)
1164{
1165 WARN_ON_ONCE(q->mq_ops);
1166
1167 q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL);
1168 if (!q->fq)
1169 return -ENOMEM;
1170
1171 if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
1172 goto out_free_flush_queue;
1173
1174 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
1175 goto out_exit_flush_rq;
1176
1177 INIT_WORK(&q->timeout_work, blk_timeout_work);
1178 q->queue_flags |= QUEUE_FLAG_DEFAULT;
1179
1180 /*
1181 * This also sets hw/phys segments, boundary and size
1182 */
1183 blk_queue_make_request(q, blk_queue_bio);
1184
1185 q->sg_reserved_size = INT_MAX;
1186
1187 if (elevator_init(q))
1188 goto out_exit_flush_rq;
1189 return 0;
1190
1191out_exit_flush_rq:
1192 if (q->exit_rq_fn)
1193 q->exit_rq_fn(q, q->fq->flush_rq);
1194out_free_flush_queue:
1195 blk_free_flush_queue(q->fq);
1196 q->fq = NULL;
1197 return -ENOMEM;
1198}
1199EXPORT_SYMBOL(blk_init_allocated_queue);
1200
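blk_init_queue(), blk_init_queue_node() and blk_init_allocated_queue() are gone, so a request-based driver now registers a blk_mq_tag_set and lets blk-mq build the queue. The sketch below shows the rough shape of that conversion under assumed values (single hardware queue, depth 64); the names are made up and error unwinding is abbreviated.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
                                     const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;

        blk_mq_start_request(rq);
        /* ... hand the request to the hardware ... */
        blk_mq_end_request(rq, BLK_STS_OK);
        return BLK_STS_OK;
}

static const struct blk_mq_ops example_mq_ops = {
        .queue_rq       = example_queue_rq,
};

static struct blk_mq_tag_set example_tag_set;

static struct request_queue *example_init_queue(void)
{
        struct request_queue *q;

        example_tag_set.ops             = &example_mq_ops;
        example_tag_set.nr_hw_queues    = 1;
        example_tag_set.queue_depth     = 64;
        example_tag_set.numa_node       = NUMA_NO_NODE;
        example_tag_set.flags           = BLK_MQ_F_SHOULD_MERGE;

        if (blk_mq_alloc_tag_set(&example_tag_set))
                return NULL;

        q = blk_mq_init_queue(&example_tag_set);
        if (IS_ERR(q)) {
                blk_mq_free_tag_set(&example_tag_set);
                return NULL;
        }
        return q;
}
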
1201bool blk_get_queue(struct request_queue *q) 554bool blk_get_queue(struct request_queue *q)
1202{ 555{
1203 if (likely(!blk_queue_dying(q))) { 556 if (likely(!blk_queue_dying(q))) {
@@ -1209,406 +562,6 @@ bool blk_get_queue(struct request_queue *q)
1209} 562}
1210EXPORT_SYMBOL(blk_get_queue); 563EXPORT_SYMBOL(blk_get_queue);
1211 564
1212static inline void blk_free_request(struct request_list *rl, struct request *rq)
1213{
1214 if (rq->rq_flags & RQF_ELVPRIV) {
1215 elv_put_request(rl->q, rq);
1216 if (rq->elv.icq)
1217 put_io_context(rq->elv.icq->ioc);
1218 }
1219
1220 mempool_free(rq, rl->rq_pool);
1221}
1222
1223/*
1224 * ioc_batching returns true if the ioc is a valid batching request and
1225 * should be given priority access to a request.
1226 */
1227static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
1228{
1229 if (!ioc)
1230 return 0;
1231
1232 /*
1233 * Make sure the process is able to allocate at least 1 request
1234 * even if the batch times out, otherwise we could theoretically
1235 * lose wakeups.
1236 */
1237 return ioc->nr_batch_requests == q->nr_batching ||
1238 (ioc->nr_batch_requests > 0
1239 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
1240}
1241
1242/*
1243 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
1244 * will cause the process to be a "batcher" on all queues in the system. This
1245 * is the behaviour we want though - once it gets a wakeup it should be given
1246 * a nice run.
1247 */
1248static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
1249{
1250 if (!ioc || ioc_batching(q, ioc))
1251 return;
1252
1253 ioc->nr_batch_requests = q->nr_batching;
1254 ioc->last_waited = jiffies;
1255}
1256
1257static void __freed_request(struct request_list *rl, int sync)
1258{
1259 struct request_queue *q = rl->q;
1260
1261 if (rl->count[sync] < queue_congestion_off_threshold(q))
1262 blk_clear_congested(rl, sync);
1263
1264 if (rl->count[sync] + 1 <= q->nr_requests) {
1265 if (waitqueue_active(&rl->wait[sync]))
1266 wake_up(&rl->wait[sync]);
1267
1268 blk_clear_rl_full(rl, sync);
1269 }
1270}
1271
1272/*
1273 * A request has just been released. Account for it, update the full and
1274 * congestion status, wake up any waiters. Called under q->queue_lock.
1275 */
1276static void freed_request(struct request_list *rl, bool sync,
1277 req_flags_t rq_flags)
1278{
1279 struct request_queue *q = rl->q;
1280
1281 q->nr_rqs[sync]--;
1282 rl->count[sync]--;
1283 if (rq_flags & RQF_ELVPRIV)
1284 q->nr_rqs_elvpriv--;
1285
1286 __freed_request(rl, sync);
1287
1288 if (unlikely(rl->starved[sync ^ 1]))
1289 __freed_request(rl, sync ^ 1);
1290}
1291
1292int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
1293{
1294 struct request_list *rl;
1295 int on_thresh, off_thresh;
1296
1297 WARN_ON_ONCE(q->mq_ops);
1298
1299 spin_lock_irq(q->queue_lock);
1300 q->nr_requests = nr;
1301 blk_queue_congestion_threshold(q);
1302 on_thresh = queue_congestion_on_threshold(q);
1303 off_thresh = queue_congestion_off_threshold(q);
1304
1305 blk_queue_for_each_rl(rl, q) {
1306 if (rl->count[BLK_RW_SYNC] >= on_thresh)
1307 blk_set_congested(rl, BLK_RW_SYNC);
1308 else if (rl->count[BLK_RW_SYNC] < off_thresh)
1309 blk_clear_congested(rl, BLK_RW_SYNC);
1310
1311 if (rl->count[BLK_RW_ASYNC] >= on_thresh)
1312 blk_set_congested(rl, BLK_RW_ASYNC);
1313 else if (rl->count[BLK_RW_ASYNC] < off_thresh)
1314 blk_clear_congested(rl, BLK_RW_ASYNC);
1315
1316 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
1317 blk_set_rl_full(rl, BLK_RW_SYNC);
1318 } else {
1319 blk_clear_rl_full(rl, BLK_RW_SYNC);
1320 wake_up(&rl->wait[BLK_RW_SYNC]);
1321 }
1322
1323 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
1324 blk_set_rl_full(rl, BLK_RW_ASYNC);
1325 } else {
1326 blk_clear_rl_full(rl, BLK_RW_ASYNC);
1327 wake_up(&rl->wait[BLK_RW_ASYNC]);
1328 }
1329 }
1330
1331 spin_unlock_irq(q->queue_lock);
1332 return 0;
1333}
1334
1335/**
1336 * __get_request - get a free request
1337 * @rl: request list to allocate from
1338 * @op: operation and flags
1339 * @bio: bio to allocate request for (can be %NULL)
1340 * @flags: BLK_MQ_REQ_* flags
1341 * @gfp_mask: allocator flags
1342 *
1343 * Get a free request from @q. This function may fail under memory
1344 * pressure or if @q is dead.
1345 *
1346 * Must be called with @q->queue_lock held and,
1347 * Returns ERR_PTR on failure, with @q->queue_lock held.
1348 * Returns request pointer on success, with @q->queue_lock *not held*.
1349 */
1350static struct request *__get_request(struct request_list *rl, unsigned int op,
1351 struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
1352{
1353 struct request_queue *q = rl->q;
1354 struct request *rq;
1355 struct elevator_type *et = q->elevator->type;
1356 struct io_context *ioc = rq_ioc(bio);
1357 struct io_cq *icq = NULL;
1358 const bool is_sync = op_is_sync(op);
1359 int may_queue;
1360 req_flags_t rq_flags = RQF_ALLOCED;
1361
1362 lockdep_assert_held(q->queue_lock);
1363
1364 if (unlikely(blk_queue_dying(q)))
1365 return ERR_PTR(-ENODEV);
1366
1367 may_queue = elv_may_queue(q, op);
1368 if (may_queue == ELV_MQUEUE_NO)
1369 goto rq_starved;
1370
1371 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
1372 if (rl->count[is_sync]+1 >= q->nr_requests) {
1373 /*
1374 * The queue will fill after this allocation, so set
1375 * it as full, and mark this process as "batching".
1376 * This process will be allowed to complete a batch of
1377 * requests, others will be blocked.
1378 */
1379 if (!blk_rl_full(rl, is_sync)) {
1380 ioc_set_batching(q, ioc);
1381 blk_set_rl_full(rl, is_sync);
1382 } else {
1383 if (may_queue != ELV_MQUEUE_MUST
1384 && !ioc_batching(q, ioc)) {
1385 /*
1386 * The queue is full and the allocating
1387 * process is not a "batcher", and not
1388 * exempted by the IO scheduler
1389 */
1390 return ERR_PTR(-ENOMEM);
1391 }
1392 }
1393 }
1394 blk_set_congested(rl, is_sync);
1395 }
1396
1397 /*
1398 * Only allow batching queuers to allocate up to 50% over the defined
1399 * limit of requests, otherwise we could have thousands of requests
1400 * allocated with any setting of ->nr_requests
1401 */
1402 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
1403 return ERR_PTR(-ENOMEM);
1404
1405 q->nr_rqs[is_sync]++;
1406 rl->count[is_sync]++;
1407 rl->starved[is_sync] = 0;
1408
1409 /*
1410 * Decide whether the new request will be managed by elevator. If
1411 * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will
1412 * prevent the current elevator from being destroyed until the new
1413 * request is freed. This guarantees icq's won't be destroyed and
1414 * makes creating new ones safe.
1415 *
1416 * Flush requests do not use the elevator so skip initialization.
1417 * This allows a request to share the flush and elevator data.
1418 *
1419 * Also, lookup icq while holding queue_lock. If it doesn't exist,
1420 * it will be created after releasing queue_lock.
1421 */
1422 if (!op_is_flush(op) && !blk_queue_bypass(q)) {
1423 rq_flags |= RQF_ELVPRIV;
1424 q->nr_rqs_elvpriv++;
1425 if (et->icq_cache && ioc)
1426 icq = ioc_lookup_icq(ioc, q);
1427 }
1428
1429 if (blk_queue_io_stat(q))
1430 rq_flags |= RQF_IO_STAT;
1431 spin_unlock_irq(q->queue_lock);
1432
1433 /* allocate and init request */
1434 rq = mempool_alloc(rl->rq_pool, gfp_mask);
1435 if (!rq)
1436 goto fail_alloc;
1437
1438 blk_rq_init(q, rq);
1439 blk_rq_set_rl(rq, rl);
1440 rq->cmd_flags = op;
1441 rq->rq_flags = rq_flags;
1442 if (flags & BLK_MQ_REQ_PREEMPT)
1443 rq->rq_flags |= RQF_PREEMPT;
1444
1445 /* init elvpriv */
1446 if (rq_flags & RQF_ELVPRIV) {
1447 if (unlikely(et->icq_cache && !icq)) {
1448 if (ioc)
1449 icq = ioc_create_icq(ioc, q, gfp_mask);
1450 if (!icq)
1451 goto fail_elvpriv;
1452 }
1453
1454 rq->elv.icq = icq;
1455 if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
1456 goto fail_elvpriv;
1457
1458 /* @rq->elv.icq holds io_context until @rq is freed */
1459 if (icq)
1460 get_io_context(icq->ioc);
1461 }
1462out:
1463 /*
1464 * ioc may be NULL here, and ioc_batching will be false. That's
1465 * OK, if the queue is under the request limit then requests need
1466 * not count toward the nr_batch_requests limit. There will always
1467 * be some limit enforced by BLK_BATCH_TIME.
1468 */
1469 if (ioc_batching(q, ioc))
1470 ioc->nr_batch_requests--;
1471
1472 trace_block_getrq(q, bio, op);
1473 return rq;
1474
1475fail_elvpriv:
1476 /*
1477 * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
1478 * and may fail indefinitely under memory pressure and thus
1479 * shouldn't stall IO. Treat this request as !elvpriv. This will
1480 * disturb iosched and blkcg but weird is better than dead.
1481 */
1482 printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
1483 __func__, dev_name(q->backing_dev_info->dev));
1484
1485 rq->rq_flags &= ~RQF_ELVPRIV;
1486 rq->elv.icq = NULL;
1487
1488 spin_lock_irq(q->queue_lock);
1489 q->nr_rqs_elvpriv--;
1490 spin_unlock_irq(q->queue_lock);
1491 goto out;
1492
1493fail_alloc:
1494 /*
1495 * Allocation failed presumably due to memory. Undo anything we
1496 * might have messed up.
1497 *
1498 * Allocating task should really be put onto the front of the wait
1499 * queue, but this is pretty rare.
1500 */
1501 spin_lock_irq(q->queue_lock);
1502 freed_request(rl, is_sync, rq_flags);
1503
1504 /*
1505 * in the very unlikely event that allocation failed and no
1506 * requests for this direction were pending, mark us starved so that
1507 * freeing of a request in the other direction will notice
1508 * us. Another possible fix would be to split the rq mempool into
1509 * READ and WRITE
1510 */
1511rq_starved:
1512 if (unlikely(rl->count[is_sync] == 0))
1513 rl->starved[is_sync] = 1;
1514 return ERR_PTR(-ENOMEM);
1515}
1516
1517/**
1518 * get_request - get a free request
1519 * @q: request_queue to allocate request from
1520 * @op: operation and flags
1521 * @bio: bio to allocate request for (can be %NULL)
1522 * @flags: BLK_MQ_REQ_* flags.
1523 * @gfp: allocator flags
1524 *
1525 * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags,
1526 * this function keeps retrying under memory pressure and fails iff @q is dead.
1527 *
1528 * Must be called with @q->queue_lock held and,
1529 * Returns ERR_PTR on failure, with @q->queue_lock held.
1530 * Returns request pointer on success, with @q->queue_lock *not held*.
1531 */
1532static struct request *get_request(struct request_queue *q, unsigned int op,
1533 struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
1534{
1535 const bool is_sync = op_is_sync(op);
1536 DEFINE_WAIT(wait);
1537 struct request_list *rl;
1538 struct request *rq;
1539
1540 lockdep_assert_held(q->queue_lock);
1541 WARN_ON_ONCE(q->mq_ops);
1542
1543 rl = blk_get_rl(q, bio); /* transferred to @rq on success */
1544retry:
1545 rq = __get_request(rl, op, bio, flags, gfp);
1546 if (!IS_ERR(rq))
1547 return rq;
1548
1549 if (op & REQ_NOWAIT) {
1550 blk_put_rl(rl);
1551 return ERR_PTR(-EAGAIN);
1552 }
1553
1554 if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
1555 blk_put_rl(rl);
1556 return rq;
1557 }
1558
1559 /* wait on @rl and retry */
1560 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1561 TASK_UNINTERRUPTIBLE);
1562
1563 trace_block_sleeprq(q, bio, op);
1564
1565 spin_unlock_irq(q->queue_lock);
1566 io_schedule();
1567
1568 /*
1569 * After sleeping, we become a "batching" process and will be able
1570 * to allocate at least one request, and up to a big batch of them
1571 * for a small period of time. See ioc_batching, ioc_set_batching
1572 */
1573 ioc_set_batching(q, current->io_context);
1574
1575 spin_lock_irq(q->queue_lock);
1576 finish_wait(&rl->wait[is_sync], &wait);
1577
1578 goto retry;
1579}
1580
1581/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
1582static struct request *blk_old_get_request(struct request_queue *q,
1583 unsigned int op, blk_mq_req_flags_t flags)
1584{
1585 struct request *rq;
1586 gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
1587 int ret = 0;
1588
1589 WARN_ON_ONCE(q->mq_ops);
1590
1591 /* create ioc upfront */
1592 create_io_context(gfp_mask, q->node);
1593
1594 ret = blk_queue_enter(q, flags);
1595 if (ret)
1596 return ERR_PTR(ret);
1597 spin_lock_irq(q->queue_lock);
1598 rq = get_request(q, op, NULL, flags, gfp_mask);
1599 if (IS_ERR(rq)) {
1600 spin_unlock_irq(q->queue_lock);
1601 blk_queue_exit(q);
1602 return rq;
1603 }
1604
1605 /* q->queue_lock is unlocked at this point */
1606 rq->__data_len = 0;
1607 rq->__sector = (sector_t) -1;
1608 rq->bio = rq->biotail = NULL;
1609 return rq;
1610}
1611
1612/** 565/**
1613 * blk_get_request - allocate a request 566 * blk_get_request - allocate a request
1614 * @q: request queue to allocate a request for 567 * @q: request queue to allocate a request for
@@ -1623,170 +576,17 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op,
1623 WARN_ON_ONCE(op & REQ_NOWAIT); 576 WARN_ON_ONCE(op & REQ_NOWAIT);
1624 WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); 577 WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
1625 578
1626 if (q->mq_ops) { 579 req = blk_mq_alloc_request(q, op, flags);
1627 req = blk_mq_alloc_request(q, op, flags); 580 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
1628 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) 581 q->mq_ops->initialize_rq_fn(req);
1629 q->mq_ops->initialize_rq_fn(req);
1630 } else {
1631 req = blk_old_get_request(q, op, flags);
1632 if (!IS_ERR(req) && q->initialize_rq_fn)
1633 q->initialize_rq_fn(req);
1634 }
1635 582
1636 return req; 583 return req;
1637} 584}
1638EXPORT_SYMBOL(blk_get_request); 585EXPORT_SYMBOL(blk_get_request);
1639 586
1640/**
1641 * blk_requeue_request - put a request back on queue
1642 * @q: request queue where request should be inserted
1643 * @rq: request to be inserted
1644 *
1645 * Description:
1646 * Drivers often keep queueing requests until the hardware cannot accept
1647 * more; when that condition happens we need to put the request back
1648 * on the queue. Must be called with queue lock held.
1649 */
1650void blk_requeue_request(struct request_queue *q, struct request *rq)
1651{
1652 lockdep_assert_held(q->queue_lock);
1653 WARN_ON_ONCE(q->mq_ops);
1654
1655 blk_delete_timer(rq);
1656 blk_clear_rq_complete(rq);
1657 trace_block_rq_requeue(q, rq);
1658 rq_qos_requeue(q, rq);
1659
1660 if (rq->rq_flags & RQF_QUEUED)
1661 blk_queue_end_tag(q, rq);
1662
1663 BUG_ON(blk_queued_rq(rq));
1664
1665 elv_requeue_request(q, rq);
1666}
1667EXPORT_SYMBOL(blk_requeue_request);
1668
1669static void add_acct_request(struct request_queue *q, struct request *rq,
1670 int where)
1671{
1672 blk_account_io_start(rq, true);
1673 __elv_add_request(q, rq, where);
1674}
1675
1676static void part_round_stats_single(struct request_queue *q, int cpu,
1677 struct hd_struct *part, unsigned long now,
1678 unsigned int inflight)
1679{
1680 if (inflight) {
1681 __part_stat_add(cpu, part, time_in_queue,
1682 inflight * (now - part->stamp));
1683 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1684 }
1685 part->stamp = now;
1686}
1687
1688/**
1689 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1690 * @q: target block queue
1691 * @cpu: cpu number for stats access
1692 * @part: target partition
1693 *
1694 * The average IO queue length and utilisation statistics are maintained
1695 * by observing the current state of the queue length and the amount of
1696 * time it has been in this state for.
1697 *
1698 * Normally, that accounting is done on IO completion, but that can result
1699 * in more than a second's worth of IO being accounted for within any one
1700 * second, leading to >100% utilisation. To deal with that, we call this
1701 * function to do a round-off before returning the results when reading
1702 * /proc/diskstats. This accounts immediately for all queue usage up to
1703 * the current jiffies and restarts the counters again.
1704 */
1705void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
1706{
1707 struct hd_struct *part2 = NULL;
1708 unsigned long now = jiffies;
1709 unsigned int inflight[2];
1710 int stats = 0;
1711
1712 if (part->stamp != now)
1713 stats |= 1;
1714
1715 if (part->partno) {
1716 part2 = &part_to_disk(part)->part0;
1717 if (part2->stamp != now)
1718 stats |= 2;
1719 }
1720
1721 if (!stats)
1722 return;
1723
1724 part_in_flight(q, part, inflight);
1725
1726 if (stats & 2)
1727 part_round_stats_single(q, cpu, part2, now, inflight[1]);
1728 if (stats & 1)
1729 part_round_stats_single(q, cpu, part, now, inflight[0]);
1730}
1731EXPORT_SYMBOL_GPL(part_round_stats);
1732
1733void __blk_put_request(struct request_queue *q, struct request *req)
1734{
1735 req_flags_t rq_flags = req->rq_flags;
1736
1737 if (unlikely(!q))
1738 return;
1739
1740 if (q->mq_ops) {
1741 blk_mq_free_request(req);
1742 return;
1743 }
1744
1745 lockdep_assert_held(q->queue_lock);
1746
1747 blk_req_zone_write_unlock(req);
1748 blk_pm_put_request(req);
1749 blk_pm_mark_last_busy(req);
1750
1751 elv_completed_request(q, req);
1752
1753 /* this is a bio leak */
1754 WARN_ON(req->bio != NULL);
1755
1756 rq_qos_done(q, req);
1757
1758 /*
1759 * Request may not have originated from ll_rw_blk. if not,
1760 * it didn't come out of our reserved rq pools
1761 */
1762 if (rq_flags & RQF_ALLOCED) {
1763 struct request_list *rl = blk_rq_rl(req);
1764 bool sync = op_is_sync(req->cmd_flags);
1765
1766 BUG_ON(!list_empty(&req->queuelist));
1767 BUG_ON(ELV_ON_HASH(req));
1768
1769 blk_free_request(rl, req);
1770 freed_request(rl, sync, rq_flags);
1771 blk_put_rl(rl);
1772 blk_queue_exit(q);
1773 }
1774}
1775EXPORT_SYMBOL_GPL(__blk_put_request);
1776
1777void blk_put_request(struct request *req) 587void blk_put_request(struct request *req)
1778{ 588{
1779 struct request_queue *q = req->q; 589 blk_mq_free_request(req);
1780
1781 if (q->mq_ops)
1782 blk_mq_free_request(req);
1783 else {
1784 unsigned long flags;
1785
1786 spin_lock_irqsave(q->queue_lock, flags);
1787 __blk_put_request(q, req);
1788 spin_unlock_irqrestore(q->queue_lock, flags);
1789 }
1790} 590}
1791EXPORT_SYMBOL(blk_put_request); 591EXPORT_SYMBOL(blk_put_request);
1792 592
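blk_get_request() and blk_put_request() now go straight to blk-mq. A hedged sketch of the usual synchronous passthrough pattern built on them; the timeout value is an arbitrary example.

#include <linux/blkdev.h>

/* Sketch: allocate a passthrough request, execute it synchronously, free it. */
static int example_send_passthrough(struct request_queue *q)
{
        struct request *rq;

        rq = blk_get_request(q, REQ_OP_DRV_IN, 0);      /* always blk-mq now */
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        rq->timeout = 30 * HZ;                          /* arbitrary example timeout */
        blk_execute_rq(q, NULL, rq, 0);                 /* insert at tail and wait */

        blk_put_request(rq);                            /* ends up in blk_mq_free_request() */
        return 0;
}
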
@@ -1806,7 +606,6 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1806 req->biotail->bi_next = bio; 606 req->biotail->bi_next = bio;
1807 req->biotail = bio; 607 req->biotail = bio;
1808 req->__data_len += bio->bi_iter.bi_size; 608 req->__data_len += bio->bi_iter.bi_size;
1809 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1810 609
1811 blk_account_io_start(req, false); 610 blk_account_io_start(req, false);
1812 return true; 611 return true;
@@ -1830,7 +629,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1830 629
1831 req->__sector = bio->bi_iter.bi_sector; 630 req->__sector = bio->bi_iter.bi_sector;
1832 req->__data_len += bio->bi_iter.bi_size; 631 req->__data_len += bio->bi_iter.bi_size;
1833 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1834 632
1835 blk_account_io_start(req, false); 633 blk_account_io_start(req, false);
1836 return true; 634 return true;
@@ -1850,7 +648,6 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
1850 req->biotail->bi_next = bio; 648 req->biotail->bi_next = bio;
1851 req->biotail = bio; 649 req->biotail = bio;
1852 req->__data_len += bio->bi_iter.bi_size; 650 req->__data_len += bio->bi_iter.bi_size;
1853 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1854 req->nr_phys_segments = segments + 1; 651 req->nr_phys_segments = segments + 1;
1855 652
1856 blk_account_io_start(req, false); 653 blk_account_io_start(req, false);
@@ -1883,7 +680,6 @@ no_merge:
1883 * Caller must ensure !blk_queue_nomerges(q) beforehand. 680 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1884 */ 681 */
1885bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 682bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1886 unsigned int *request_count,
1887 struct request **same_queue_rq) 683 struct request **same_queue_rq)
1888{ 684{
1889 struct blk_plug *plug; 685 struct blk_plug *plug;
@@ -1893,25 +689,19 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1893 plug = current->plug; 689 plug = current->plug;
1894 if (!plug) 690 if (!plug)
1895 return false; 691 return false;
1896 *request_count = 0;
1897 692
1898 if (q->mq_ops) 693 plug_list = &plug->mq_list;
1899 plug_list = &plug->mq_list;
1900 else
1901 plug_list = &plug->list;
1902 694
1903 list_for_each_entry_reverse(rq, plug_list, queuelist) { 695 list_for_each_entry_reverse(rq, plug_list, queuelist) {
1904 bool merged = false; 696 bool merged = false;
1905 697
1906 if (rq->q == q) { 698 if (rq->q == q && same_queue_rq) {
1907 (*request_count)++;
1908 /* 699 /*
1909 * Only the blk-mq multiple hardware queues case checks for an 700
1910 * rq in the same queue; there should be only one such 701
1911 * rq in a queue 702 * rq in a queue
1912 **/ 703 **/
1913 if (same_queue_rq) 704 *same_queue_rq = rq;
1914 *same_queue_rq = rq;
1915 } 705 }
1916 706
1917 if (rq->q != q || !blk_rq_merge_ok(rq, bio)) 707 if (rq->q != q || !blk_rq_merge_ok(rq, bio))
@@ -1938,176 +728,18 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1938 return false; 728 return false;
1939} 729}
1940 730
1941unsigned int blk_plug_queued_count(struct request_queue *q)
1942{
1943 struct blk_plug *plug;
1944 struct request *rq;
1945 struct list_head *plug_list;
1946 unsigned int ret = 0;
1947
1948 plug = current->plug;
1949 if (!plug)
1950 goto out;
1951
1952 if (q->mq_ops)
1953 plug_list = &plug->mq_list;
1954 else
1955 plug_list = &plug->list;
1956
1957 list_for_each_entry(rq, plug_list, queuelist) {
1958 if (rq->q == q)
1959 ret++;
1960 }
1961out:
1962 return ret;
1963}
1964
1965void blk_init_request_from_bio(struct request *req, struct bio *bio) 731void blk_init_request_from_bio(struct request *req, struct bio *bio)
1966{ 732{
1967 struct io_context *ioc = rq_ioc(bio);
1968
1969 if (bio->bi_opf & REQ_RAHEAD) 733 if (bio->bi_opf & REQ_RAHEAD)
1970 req->cmd_flags |= REQ_FAILFAST_MASK; 734 req->cmd_flags |= REQ_FAILFAST_MASK;
1971 735
1972 req->__sector = bio->bi_iter.bi_sector; 736 req->__sector = bio->bi_iter.bi_sector;
1973 if (ioprio_valid(bio_prio(bio))) 737 req->ioprio = bio_prio(bio);
1974 req->ioprio = bio_prio(bio);
1975 else if (ioc)
1976 req->ioprio = ioc->ioprio;
1977 else
1978 req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
1979 req->write_hint = bio->bi_write_hint; 738 req->write_hint = bio->bi_write_hint;
1980 blk_rq_bio_prep(req->q, req, bio); 739 blk_rq_bio_prep(req->q, req, bio);
1981} 740}
1982EXPORT_SYMBOL_GPL(blk_init_request_from_bio); 741EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
1983 742
1984static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
1985{
1986 struct blk_plug *plug;
1987 int where = ELEVATOR_INSERT_SORT;
1988 struct request *req, *free;
1989 unsigned int request_count = 0;
1990
1991 /*
1992 * low level driver can indicate that it wants pages above a
1993 * certain limit bounced to low memory (ie for highmem, or even
1994 * ISA dma in theory)
1995 */
1996 blk_queue_bounce(q, &bio);
1997
1998 blk_queue_split(q, &bio);
1999
2000 if (!bio_integrity_prep(bio))
2001 return BLK_QC_T_NONE;
2002
2003 if (op_is_flush(bio->bi_opf)) {
2004 spin_lock_irq(q->queue_lock);
2005 where = ELEVATOR_INSERT_FLUSH;
2006 goto get_rq;
2007 }
2008
2009 /*
2010 * Check if we can merge with the plugged list before grabbing
2011 * any locks.
2012 */
2013 if (!blk_queue_nomerges(q)) {
2014 if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
2015 return BLK_QC_T_NONE;
2016 } else
2017 request_count = blk_plug_queued_count(q);
2018
2019 spin_lock_irq(q->queue_lock);
2020
2021 switch (elv_merge(q, &req, bio)) {
2022 case ELEVATOR_BACK_MERGE:
2023 if (!bio_attempt_back_merge(q, req, bio))
2024 break;
2025 elv_bio_merged(q, req, bio);
2026 free = attempt_back_merge(q, req);
2027 if (free)
2028 __blk_put_request(q, free);
2029 else
2030 elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
2031 goto out_unlock;
2032 case ELEVATOR_FRONT_MERGE:
2033 if (!bio_attempt_front_merge(q, req, bio))
2034 break;
2035 elv_bio_merged(q, req, bio);
2036 free = attempt_front_merge(q, req);
2037 if (free)
2038 __blk_put_request(q, free);
2039 else
2040 elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
2041 goto out_unlock;
2042 default:
2043 break;
2044 }
2045
2046get_rq:
2047 rq_qos_throttle(q, bio, q->queue_lock);
2048
2049 /*
2050 * Grab a free request. This might sleep but cannot fail.
2051 * Returns with the queue unlocked.
2052 */
2053 blk_queue_enter_live(q);
2054 req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
2055 if (IS_ERR(req)) {
2056 blk_queue_exit(q);
2057 rq_qos_cleanup(q, bio);
2058 if (PTR_ERR(req) == -ENOMEM)
2059 bio->bi_status = BLK_STS_RESOURCE;
2060 else
2061 bio->bi_status = BLK_STS_IOERR;
2062 bio_endio(bio);
2063 goto out_unlock;
2064 }
2065
2066 rq_qos_track(q, req, bio);
2067
2068 /*
2069 * After dropping the lock and possibly sleeping here, our request
2070 * may now be mergeable after it had proven unmergeable (above).
2071 * We don't worry about that case for efficiency. It won't happen
2072 * often, and the elevators are able to handle it.
2073 */
2074 blk_init_request_from_bio(req, bio);
2075
2076 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
2077 req->cpu = raw_smp_processor_id();
2078
2079 plug = current->plug;
2080 if (plug) {
2081 /*
2082 * If this is the first request added after a plug, fire
2083 * of a plug trace.
2084 *
2085 * @request_count may become stale because of schedule
2086 * out, so check plug list again.
2087 */
2088 if (!request_count || list_empty(&plug->list))
2089 trace_block_plug(q);
2090 else {
2091 struct request *last = list_entry_rq(plug->list.prev);
2092 if (request_count >= BLK_MAX_REQUEST_COUNT ||
2093 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
2094 blk_flush_plug_list(plug, false);
2095 trace_block_plug(q);
2096 }
2097 }
2098 list_add_tail(&req->queuelist, &plug->list);
2099 blk_account_io_start(req, true);
2100 } else {
2101 spin_lock_irq(q->queue_lock);
2102 add_acct_request(q, req, where);
2103 __blk_run_queue(q);
2104out_unlock:
2105 spin_unlock_irq(q->queue_lock);
2106 }
2107
2108 return BLK_QC_T_NONE;
2109}
2110
2111static void handle_bad_sector(struct bio *bio, sector_t maxsector) 743static void handle_bad_sector(struct bio *bio, sector_t maxsector)
2112{ 744{
2113 char b[BDEVNAME_SIZE]; 745 char b[BDEVNAME_SIZE];
@@ -2259,7 +891,7 @@ generic_make_request_checks(struct bio *bio)
2259 * For a REQ_NOWAIT based request, return -EOPNOTSUPP 891 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
2260 * if queue is not a request based queue. 892 * if queue is not a request based queue.
2261 */ 893 */
2262 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) 894 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
2263 goto not_supported; 895 goto not_supported;
2264 896
2265 if (should_fail_bio(bio)) 897 if (should_fail_bio(bio))
@@ -2289,6 +921,9 @@ generic_make_request_checks(struct bio *bio)
2289 } 921 }
2290 } 922 }
2291 923
924 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
925 bio->bi_opf &= ~REQ_HIPRI;
926
2292 switch (bio_op(bio)) { 927 switch (bio_op(bio)) {
2293 case REQ_OP_DISCARD: 928 case REQ_OP_DISCARD:
2294 if (!blk_queue_discard(q)) 929 if (!blk_queue_discard(q))
@@ -2561,17 +1196,6 @@ blk_qc_t submit_bio(struct bio *bio)
2561} 1196}
2562EXPORT_SYMBOL(submit_bio); 1197EXPORT_SYMBOL(submit_bio);
2563 1198
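The new check in generic_make_request_checks() quietly clears REQ_HIPRI when the queue lacks QUEUE_FLAG_POLL, so submitters can ask for polled completion opportunistically. A rough sketch, assuming @bdev, @page and @sector are prepared by the caller (the function names here are illustrative only):

#include <linux/bio.h>
#include <linux/blkdev.h>

static void example_read_done(struct bio *bio)
{
        /* ... check bio->bi_status ... */
        bio_put(bio);
}

/*
 * Sketch: request polled completion with REQ_HIPRI. If the queue has no
 * QUEUE_FLAG_POLL, the hint is now silently dropped instead of failing.
 */
static blk_qc_t example_submit_polled_read(struct block_device *bdev,
                                           struct page *page, sector_t sector)
{
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);

        if (!bio)
                return BLK_QC_T_NONE;

        bio_set_dev(bio, bdev);
        bio->bi_iter.bi_sector = sector;
        bio->bi_opf = REQ_OP_READ | REQ_HIPRI;
        bio->bi_end_io = example_read_done;
        bio_add_page(bio, page, PAGE_SIZE, 0);

        return submit_bio(bio);         /* returned cookie can be fed to blk_poll() */
}
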
2564bool blk_poll(struct request_queue *q, blk_qc_t cookie)
2565{
2566 if (!q->poll_fn || !blk_qc_t_valid(cookie))
2567 return false;
2568
2569 if (current->plug)
2570 blk_flush_plug_list(current->plug, false);
2571 return q->poll_fn(q, cookie);
2572}
2573EXPORT_SYMBOL_GPL(blk_poll);
2574
2575/** 1199/**
2576 * blk_cloned_rq_check_limits - Helper function to check a cloned request 1200 * blk_cloned_rq_check_limits - Helper function to check a cloned request
2577 * for the new queue limits 1201
@@ -2619,8 +1243,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
2619 */ 1243 */
2620blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1244blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
2621{ 1245{
2622 unsigned long flags; 1246 blk_qc_t unused;
2623 int where = ELEVATOR_INSERT_BACK;
2624 1247
2625 if (blk_cloned_rq_check_limits(q, rq)) 1248 if (blk_cloned_rq_check_limits(q, rq))
2626 return BLK_STS_IOERR; 1249 return BLK_STS_IOERR;
@@ -2629,38 +1252,15 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
2629 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) 1252 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
2630 return BLK_STS_IOERR; 1253 return BLK_STS_IOERR;
2631 1254
2632 if (q->mq_ops) { 1255 if (blk_queue_io_stat(q))
2633 if (blk_queue_io_stat(q)) 1256 blk_account_io_start(rq, true);
2634 blk_account_io_start(rq, true);
2635 /*
2636 * Since we have a scheduler attached on the top device,
2637 * bypass a potential scheduler on the bottom device for
2638 * insert.
2639 */
2640 return blk_mq_request_issue_directly(rq);
2641 }
2642
2643 spin_lock_irqsave(q->queue_lock, flags);
2644 if (unlikely(blk_queue_dying(q))) {
2645 spin_unlock_irqrestore(q->queue_lock, flags);
2646 return BLK_STS_IOERR;
2647 }
2648 1257
2649 /* 1258 /*
2650 * Submitting request must be dequeued before calling this function 1259 * Since we have a scheduler attached on the top device,
2651 * because it will be linked to another request_queue 1260 * bypass a potential scheduler on the bottom device for
1261 * insert.
2652 */ 1262 */
2653 BUG_ON(blk_queued_rq(rq)); 1263 return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true);
2654
2655 if (op_is_flush(rq->cmd_flags))
2656 where = ELEVATOR_INSERT_FLUSH;
2657
2658 add_acct_request(q, rq, where);
2659 if (where == ELEVATOR_INSERT_FLUSH)
2660 __blk_run_queue(q);
2661 spin_unlock_irqrestore(q->queue_lock, flags);
2662
2663 return BLK_STS_OK;
2664} 1264}
2665EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 1265EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
2666 1266
@@ -2710,11 +1310,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes)
2710 if (blk_do_io_stat(req)) { 1310 if (blk_do_io_stat(req)) {
2711 const int sgrp = op_stat_group(req_op(req)); 1311 const int sgrp = op_stat_group(req_op(req));
2712 struct hd_struct *part; 1312 struct hd_struct *part;
2713 int cpu;
2714 1313
2715 cpu = part_stat_lock(); 1314 part_stat_lock();
2716 part = req->part; 1315 part = req->part;
2717 part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); 1316 part_stat_add(part, sectors[sgrp], bytes >> 9);
2718 part_stat_unlock(); 1317 part_stat_unlock();
2719 } 1318 }
2720} 1319}
@@ -2729,14 +1328,14 @@ void blk_account_io_done(struct request *req, u64 now)
2729 if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { 1328 if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
2730 const int sgrp = op_stat_group(req_op(req)); 1329 const int sgrp = op_stat_group(req_op(req));
2731 struct hd_struct *part; 1330 struct hd_struct *part;
2732 int cpu;
2733 1331
2734 cpu = part_stat_lock(); 1332 part_stat_lock();
2735 part = req->part; 1333 part = req->part;
2736 1334
2737 part_stat_inc(cpu, part, ios[sgrp]); 1335 update_io_ticks(part, jiffies);
2738 part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); 1336 part_stat_inc(part, ios[sgrp]);
2739 part_round_stats(req->q, cpu, part); 1337 part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
1338 part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
2740 part_dec_in_flight(req->q, part, rq_data_dir(req)); 1339 part_dec_in_flight(req->q, part, rq_data_dir(req));
2741 1340
2742 hd_struct_put(part); 1341 hd_struct_put(part);
@@ -2748,16 +1347,15 @@ void blk_account_io_start(struct request *rq, bool new_io)
2748{ 1347{
2749 struct hd_struct *part; 1348 struct hd_struct *part;
2750 int rw = rq_data_dir(rq); 1349 int rw = rq_data_dir(rq);
2751 int cpu;
2752 1350
2753 if (!blk_do_io_stat(rq)) 1351 if (!blk_do_io_stat(rq))
2754 return; 1352 return;
2755 1353
2756 cpu = part_stat_lock(); 1354 part_stat_lock();
2757 1355
2758 if (!new_io) { 1356 if (!new_io) {
2759 part = rq->part; 1357 part = rq->part;
2760 part_stat_inc(cpu, part, merges[rw]); 1358 part_stat_inc(part, merges[rw]);
2761 } else { 1359 } else {
2762 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 1360 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
2763 if (!hd_struct_try_get(part)) { 1361 if (!hd_struct_try_get(part)) {
@@ -2772,232 +1370,14 @@ void blk_account_io_start(struct request *rq, bool new_io)
2772 part = &rq->rq_disk->part0; 1370 part = &rq->rq_disk->part0;
2773 hd_struct_get(part); 1371 hd_struct_get(part);
2774 } 1372 }
2775 part_round_stats(rq->q, cpu, part);
2776 part_inc_in_flight(rq->q, part, rw); 1373 part_inc_in_flight(rq->q, part, rw);
2777 rq->part = part; 1374 rq->part = part;
2778 } 1375 }
2779 1376
2780 part_stat_unlock(); 1377 update_io_ticks(part, jiffies);
2781}
2782
2783static struct request *elv_next_request(struct request_queue *q)
2784{
2785 struct request *rq;
2786 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
2787
2788 WARN_ON_ONCE(q->mq_ops);
2789
2790 while (1) {
2791 list_for_each_entry(rq, &q->queue_head, queuelist) {
2792#ifdef CONFIG_PM
2793 /*
2794 * If a request gets queued in state RPM_SUSPENDED
2795 * then that's a kernel bug.
2796 */
2797 WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED);
2798#endif
2799 return rq;
2800 }
2801
2802 /*
2803 * A flush request is running and flush requests aren't queueable
2804 * in the drive, so we can hold the queue till the flush request is
2805 * finished. Even if we don't do this, the driver can't dispatch the
2806 * next requests and will requeue them, and holding the queue can
2807 * improve throughput too. For example, we have requests flush1, write1,
2808 * flush2. flush1 is dispatched, then the queue is held and write1
2809 * isn't inserted into the queue. After flush1 is finished, flush2
2810 * will be dispatched. Since the disk cache is already clean,
2811 * flush2 will be finished very soon, so it looks like flush2 is
2812 * folded into flush1.
2813 * Since the queue is held, a flag is set to indicate that the queue
2814 * should be restarted later. Please see flush_end_io() for
2815 * details.
2816 */
2817 if (fq->flush_pending_idx != fq->flush_running_idx &&
2818 !queue_flush_queueable(q)) {
2819 fq->flush_queue_delayed = 1;
2820 return NULL;
2821 }
2822 if (unlikely(blk_queue_bypass(q)) ||
2823 !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
2824 return NULL;
2825 }
2826}
2827
2828/**
2829 * blk_peek_request - peek at the top of a request queue
2830 * @q: request queue to peek at
2831 *
2832 * Description:
2833 * Return the request at the top of @q. The returned request
2834 * should be started using blk_start_request() before LLD starts
2835 * processing it.
2836 *
2837 * Return:
2838 * Pointer to the request at the top of @q if available. Null
2839 * otherwise.
2840 */
2841struct request *blk_peek_request(struct request_queue *q)
2842{
2843 struct request *rq;
2844 int ret;
2845
2846 lockdep_assert_held(q->queue_lock);
2847 WARN_ON_ONCE(q->mq_ops);
2848
2849 while ((rq = elv_next_request(q)) != NULL) {
2850 if (!(rq->rq_flags & RQF_STARTED)) {
2851 /*
2852 * This is the first time the device driver
2853 * sees this request (possibly after
2854 * requeueing). Notify IO scheduler.
2855 */
2856 if (rq->rq_flags & RQF_SORTED)
2857 elv_activate_rq(q, rq);
2858
2859 /*
2860 * just mark as started even if we don't start
2861 * it, a request that has been delayed should
2862 * not be passed by new incoming requests
2863 */
2864 rq->rq_flags |= RQF_STARTED;
2865 trace_block_rq_issue(q, rq);
2866 }
2867
2868 if (!q->boundary_rq || q->boundary_rq == rq) {
2869 q->end_sector = rq_end_sector(rq);
2870 q->boundary_rq = NULL;
2871 }
2872
2873 if (rq->rq_flags & RQF_DONTPREP)
2874 break;
2875
2876 if (q->dma_drain_size && blk_rq_bytes(rq)) {
2877 /*
2878 * make sure space for the drain appears. We
2879 * know we can do this because max_hw_segments
2880 * has been adjusted to be one fewer than the
2881 * device can handle
2882 */
2883 rq->nr_phys_segments++;
2884 }
2885
2886 if (!q->prep_rq_fn)
2887 break;
2888
2889 ret = q->prep_rq_fn(q, rq);
2890 if (ret == BLKPREP_OK) {
2891 break;
2892 } else if (ret == BLKPREP_DEFER) {
2893 /*
2894 * the request may have been (partially) prepped.
2895 * we need to keep this request in the front to
2896 * avoid resource deadlock. RQF_STARTED will
2897 * prevent other fs requests from passing this one.
2898 */
2899 if (q->dma_drain_size && blk_rq_bytes(rq) &&
2900 !(rq->rq_flags & RQF_DONTPREP)) {
2901 /*
2902 * remove the space for the drain we added
2903 * so that we don't add it again
2904 */
2905 --rq->nr_phys_segments;
2906 }
2907
2908 rq = NULL;
2909 break;
2910 } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
2911 rq->rq_flags |= RQF_QUIET;
2912 /*
2913 * Mark this request as started so we don't trigger
2914 * any debug logic in the end I/O path.
2915 */
2916 blk_start_request(rq);
2917 __blk_end_request_all(rq, ret == BLKPREP_INVALID ?
2918 BLK_STS_TARGET : BLK_STS_IOERR);
2919 } else {
2920 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
2921 break;
2922 }
2923 }
2924
2925 return rq;
2926}
2927EXPORT_SYMBOL(blk_peek_request);
2928
2929static void blk_dequeue_request(struct request *rq)
2930{
2931 struct request_queue *q = rq->q;
2932 1378
2933 BUG_ON(list_empty(&rq->queuelist)); 1379 part_stat_unlock();
2934 BUG_ON(ELV_ON_HASH(rq));
2935
2936 list_del_init(&rq->queuelist);
2937
2938 /*
2939 * the time frame between a request being removed from the lists
2940 * and when it is freed is accounted as IO that is in progress on
2941 * the driver side.
2942 */
2943 if (blk_account_rq(rq))
2944 q->in_flight[rq_is_sync(rq)]++;
2945}
2946
2947/**
2948 * blk_start_request - start request processing on the driver
2949 * @req: request to dequeue
2950 *
2951 * Description:
2952 * Dequeue @req and start timeout timer on it. This hands off the
2953 * request to the driver.
2954 */
2955void blk_start_request(struct request *req)
2956{
2957 lockdep_assert_held(req->q->queue_lock);
2958 WARN_ON_ONCE(req->q->mq_ops);
2959
2960 blk_dequeue_request(req);
2961
2962 if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
2963 req->io_start_time_ns = ktime_get_ns();
2964#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2965 req->throtl_size = blk_rq_sectors(req);
2966#endif
2967 req->rq_flags |= RQF_STATS;
2968 rq_qos_issue(req->q, req);
2969 }
2970
2971 BUG_ON(blk_rq_is_complete(req));
2972 blk_add_timer(req);
2973}
2974EXPORT_SYMBOL(blk_start_request);
2975
2976/**
2977 * blk_fetch_request - fetch a request from a request queue
2978 * @q: request queue to fetch a request from
2979 *
2980 * Description:
2981 * Return the request at the top of @q. The request is started on
2982 * return and LLD can start processing it immediately.
2983 *
2984 * Return:
2985 * Pointer to the request at the top of @q if available. Null
2986 * otherwise.
2987 */
2988struct request *blk_fetch_request(struct request_queue *q)
2989{
2990 struct request *rq;
2991
2992 lockdep_assert_held(q->queue_lock);
2993 WARN_ON_ONCE(q->mq_ops);
2994
2995 rq = blk_peek_request(q);
2996 if (rq)
2997 blk_start_request(rq);
2998 return rq;
2999} 1380}
3000EXPORT_SYMBOL(blk_fetch_request);
3001 1381
3002/* 1382/*
3003 * Steal bios from a request and add them to a bio list. 1383 * Steal bios from a request and add them to a bio list.
@@ -3124,255 +1504,6 @@ bool blk_update_request(struct request *req, blk_status_t error,
3124} 1504}
3125EXPORT_SYMBOL_GPL(blk_update_request); 1505EXPORT_SYMBOL_GPL(blk_update_request);
3126 1506
3127static bool blk_update_bidi_request(struct request *rq, blk_status_t error,
3128 unsigned int nr_bytes,
3129 unsigned int bidi_bytes)
3130{
3131 if (blk_update_request(rq, error, nr_bytes))
3132 return true;
3133
3134 /* Bidi request must be completed as a whole */
3135 if (unlikely(blk_bidi_rq(rq)) &&
3136 blk_update_request(rq->next_rq, error, bidi_bytes))
3137 return true;
3138
3139 if (blk_queue_add_random(rq->q))
3140 add_disk_randomness(rq->rq_disk);
3141
3142 return false;
3143}
3144
3145/**
3146 * blk_unprep_request - unprepare a request
3147 * @req: the request
3148 *
3149 * This function makes a request ready for complete resubmission (or
3150 * completion). It happens only after all error handling is complete,
3151 * so represents the appropriate moment to deallocate any resources
3152 * that were allocated to the request in the prep_rq_fn. The queue
3153 * lock is held when calling this.
3154 */
3155void blk_unprep_request(struct request *req)
3156{
3157 struct request_queue *q = req->q;
3158
3159 req->rq_flags &= ~RQF_DONTPREP;
3160 if (q->unprep_rq_fn)
3161 q->unprep_rq_fn(q, req);
3162}
3163EXPORT_SYMBOL_GPL(blk_unprep_request);
3164
3165void blk_finish_request(struct request *req, blk_status_t error)
3166{
3167 struct request_queue *q = req->q;
3168 u64 now = ktime_get_ns();
3169
3170 lockdep_assert_held(req->q->queue_lock);
3171 WARN_ON_ONCE(q->mq_ops);
3172
3173 if (req->rq_flags & RQF_STATS)
3174 blk_stat_add(req, now);
3175
3176 if (req->rq_flags & RQF_QUEUED)
3177 blk_queue_end_tag(q, req);
3178
3179 BUG_ON(blk_queued_rq(req));
3180
3181 if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
3182 laptop_io_completion(req->q->backing_dev_info);
3183
3184 blk_delete_timer(req);
3185
3186 if (req->rq_flags & RQF_DONTPREP)
3187 blk_unprep_request(req);
3188
3189 blk_account_io_done(req, now);
3190
3191 if (req->end_io) {
3192 rq_qos_done(q, req);
3193 req->end_io(req, error);
3194 } else {
3195 if (blk_bidi_rq(req))
3196 __blk_put_request(req->next_rq->q, req->next_rq);
3197
3198 __blk_put_request(q, req);
3199 }
3200}
3201EXPORT_SYMBOL(blk_finish_request);
3202
3203/**
3204 * blk_end_bidi_request - Complete a bidi request
3205 * @rq: the request to complete
3206 * @error: block status code
3207 * @nr_bytes: number of bytes to complete @rq
3208 * @bidi_bytes: number of bytes to complete @rq->next_rq
3209 *
3210 * Description:
3211 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
3212 * Drivers that support bidi can safely call this member for any
3213 * type of request, bidi or uni. In the latter case @bidi_bytes is
3214 * just ignored.
3215 *
3216 * Return:
3217 * %false - we are done with this request
3218 * %true - still buffers pending for this request
3219 **/
3220static bool blk_end_bidi_request(struct request *rq, blk_status_t error,
3221 unsigned int nr_bytes, unsigned int bidi_bytes)
3222{
3223 struct request_queue *q = rq->q;
3224 unsigned long flags;
3225
3226 WARN_ON_ONCE(q->mq_ops);
3227
3228 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3229 return true;
3230
3231 spin_lock_irqsave(q->queue_lock, flags);
3232 blk_finish_request(rq, error);
3233 spin_unlock_irqrestore(q->queue_lock, flags);
3234
3235 return false;
3236}
3237
3238/**
3239 * __blk_end_bidi_request - Complete a bidi request with queue lock held
3240 * @rq: the request to complete
3241 * @error: block status code
3242 * @nr_bytes: number of bytes to complete @rq
3243 * @bidi_bytes: number of bytes to complete @rq->next_rq
3244 *
3245 * Description:
3246 * Identical to blk_end_bidi_request() except that queue lock is
3247 * assumed to be locked on entry and remains so on return.
3248 *
3249 * Return:
3250 * %false - we are done with this request
3251 * %true - still buffers pending for this request
3252 **/
3253static bool __blk_end_bidi_request(struct request *rq, blk_status_t error,
3254 unsigned int nr_bytes, unsigned int bidi_bytes)
3255{
3256 lockdep_assert_held(rq->q->queue_lock);
3257 WARN_ON_ONCE(rq->q->mq_ops);
3258
3259 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
3260 return true;
3261
3262 blk_finish_request(rq, error);
3263
3264 return false;
3265}
3266
3267/**
3268 * blk_end_request - Helper function for drivers to complete the request.
3269 * @rq: the request being processed
3270 * @error: block status code
3271 * @nr_bytes: number of bytes to complete
3272 *
3273 * Description:
3274 * Ends I/O on a number of bytes attached to @rq.
3275 * If @rq has leftover, sets it up for the next range of segments.
3276 *
3277 * Return:
3278 * %false - we are done with this request
3279 * %true - still buffers pending for this request
3280 **/
3281bool blk_end_request(struct request *rq, blk_status_t error,
3282 unsigned int nr_bytes)
3283{
3284 WARN_ON_ONCE(rq->q->mq_ops);
3285 return blk_end_bidi_request(rq, error, nr_bytes, 0);
3286}
3287EXPORT_SYMBOL(blk_end_request);
3288
3289/**
3290 * blk_end_request_all - Helper function for drivers to finish the request.
3291 * @rq: the request to finish
3292 * @error: block status code
3293 *
3294 * Description:
3295 * Completely finish @rq.
3296 */
3297void blk_end_request_all(struct request *rq, blk_status_t error)
3298{
3299 bool pending;
3300 unsigned int bidi_bytes = 0;
3301
3302 if (unlikely(blk_bidi_rq(rq)))
3303 bidi_bytes = blk_rq_bytes(rq->next_rq);
3304
3305 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3306 BUG_ON(pending);
3307}
3308EXPORT_SYMBOL(blk_end_request_all);
3309
3310/**
3311 * __blk_end_request - Helper function for drivers to complete the request.
3312 * @rq: the request being processed
3313 * @error: block status code
3314 * @nr_bytes: number of bytes to complete
3315 *
3316 * Description:
3317 * Must be called with queue lock held unlike blk_end_request().
3318 *
3319 * Return:
3320 * %false - we are done with this request
3321 * %true - still buffers pending for this request
3322 **/
3323bool __blk_end_request(struct request *rq, blk_status_t error,
3324 unsigned int nr_bytes)
3325{
3326 lockdep_assert_held(rq->q->queue_lock);
3327 WARN_ON_ONCE(rq->q->mq_ops);
3328
3329 return __blk_end_bidi_request(rq, error, nr_bytes, 0);
3330}
3331EXPORT_SYMBOL(__blk_end_request);
3332
3333/**
3334 * __blk_end_request_all - Helper function for drivers to finish the request.
3335 * @rq: the request to finish
3336 * @error: block status code
3337 *
3338 * Description:
3339 * Completely finish @rq. Must be called with queue lock held.
3340 */
3341void __blk_end_request_all(struct request *rq, blk_status_t error)
3342{
3343 bool pending;
3344 unsigned int bidi_bytes = 0;
3345
3346 lockdep_assert_held(rq->q->queue_lock);
3347 WARN_ON_ONCE(rq->q->mq_ops);
3348
3349 if (unlikely(blk_bidi_rq(rq)))
3350 bidi_bytes = blk_rq_bytes(rq->next_rq);
3351
3352 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
3353 BUG_ON(pending);
3354}
3355EXPORT_SYMBOL(__blk_end_request_all);
3356
3357/**
3358 * __blk_end_request_cur - Helper function to finish the current request chunk.
3359 * @rq: the request to finish the current chunk for
3360 * @error: block status code
3361 *
3362 * Description:
3363 * Complete the current consecutively mapped chunk from @rq. Must
3364 * be called with queue lock held.
3365 *
3366 * Return:
3367 * %false - we are done with this request
3368 * %true - still buffers pending for this request
3369 */
3370bool __blk_end_request_cur(struct request *rq, blk_status_t error)
3371{
3372 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
3373}
3374EXPORT_SYMBOL(__blk_end_request_cur);
3375
3376void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 1507void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3377 struct bio *bio) 1508 struct bio *bio)
3378{ 1509{
@@ -3428,8 +1559,8 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
3428 */ 1559 */
3429int blk_lld_busy(struct request_queue *q) 1560int blk_lld_busy(struct request_queue *q)
3430{ 1561{
3431 if (q->lld_busy_fn) 1562 if (queue_is_mq(q) && q->mq_ops->busy)
3432 return q->lld_busy_fn(q); 1563 return q->mq_ops->busy(q);
3433 1564
3434 return 0; 1565 return 0;
3435} 1566}
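blk_lld_busy() now consults the ->busy hook in blk_mq_ops instead of a per-queue lld_busy_fn. A hypothetical driver would wire it up roughly as below; the example_dev state is invented, and a real driver would also fill in .queue_rq and friends.

#include <linux/blk-mq.h>

struct example_dev {                            /* invented driver state */
        unsigned int outstanding_cmds;
        unsigned int cmd_limit;
};

/* Sketch: report "device busy" from driver-private state via mq_ops->busy. */
static bool example_mq_busy(struct request_queue *q)
{
        struct example_dev *dev = q->queuedata;

        return dev->outstanding_cmds >= dev->cmd_limit;
}

static const struct blk_mq_ops example_busy_ops = {
        /* .queue_rq etc. omitted; only the new hook is shown */
        .busy           = example_mq_busy,
};
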
@@ -3460,7 +1591,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
3460 */ 1591 */
3461static void __blk_rq_prep_clone(struct request *dst, struct request *src) 1592static void __blk_rq_prep_clone(struct request *dst, struct request *src)
3462{ 1593{
3463 dst->cpu = src->cpu;
3464 dst->__sector = blk_rq_pos(src); 1594 dst->__sector = blk_rq_pos(src);
3465 dst->__data_len = blk_rq_bytes(src); 1595 dst->__data_len = blk_rq_bytes(src);
3466 if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { 1596 if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
@@ -3572,9 +1702,11 @@ void blk_start_plug(struct blk_plug *plug)
3572 if (tsk->plug) 1702 if (tsk->plug)
3573 return; 1703 return;
3574 1704
3575 INIT_LIST_HEAD(&plug->list);
3576 INIT_LIST_HEAD(&plug->mq_list); 1705 INIT_LIST_HEAD(&plug->mq_list);
3577 INIT_LIST_HEAD(&plug->cb_list); 1706 INIT_LIST_HEAD(&plug->cb_list);
1707 plug->rq_count = 0;
1708 plug->multiple_queues = false;
1709
3578 /* 1710 /*
3579 * Store ordering should not be needed here, since a potential 1711 * Store ordering should not be needed here, since a potential
3580 * preempt will imply a full memory barrier 1712 * preempt will imply a full memory barrier
@@ -3583,36 +1715,6 @@ void blk_start_plug(struct blk_plug *plug)
3583} 1715}
3584EXPORT_SYMBOL(blk_start_plug); 1716EXPORT_SYMBOL(blk_start_plug);
3585 1717
3586static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
3587{
3588 struct request *rqa = container_of(a, struct request, queuelist);
3589 struct request *rqb = container_of(b, struct request, queuelist);
3590
3591 return !(rqa->q < rqb->q ||
3592 (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
3593}
3594
3595/*
3596 * If 'from_schedule' is true, then postpone the dispatch of requests
3597 * until a safe kblockd context. We do this to avoid accidentally adding
3598 * a lot of extra stack usage in driver dispatch, in places where the original
3599 * plugger did not intend it.
3600 */
3601static void queue_unplugged(struct request_queue *q, unsigned int depth,
3602 bool from_schedule)
3603 __releases(q->queue_lock)
3604{
3605 lockdep_assert_held(q->queue_lock);
3606
3607 trace_block_unplug(q, depth, !from_schedule);
3608
3609 if (from_schedule)
3610 blk_run_queue_async(q);
3611 else
3612 __blk_run_queue(q);
3613 spin_unlock_irq(q->queue_lock);
3614}
3615
3616static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) 1718static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3617{ 1719{
3618 LIST_HEAD(callbacks); 1720 LIST_HEAD(callbacks);
@@ -3657,65 +1759,10 @@ EXPORT_SYMBOL(blk_check_plugged);
3657 1759
3658void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1760void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3659{ 1761{
3660 struct request_queue *q;
3661 struct request *rq;
3662 LIST_HEAD(list);
3663 unsigned int depth;
3664
3665 flush_plug_callbacks(plug, from_schedule); 1762 flush_plug_callbacks(plug, from_schedule);
3666 1763
3667 if (!list_empty(&plug->mq_list)) 1764 if (!list_empty(&plug->mq_list))
3668 blk_mq_flush_plug_list(plug, from_schedule); 1765 blk_mq_flush_plug_list(plug, from_schedule);
3669
3670 if (list_empty(&plug->list))
3671 return;
3672
3673 list_splice_init(&plug->list, &list);
3674
3675 list_sort(NULL, &list, plug_rq_cmp);
3676
3677 q = NULL;
3678 depth = 0;
3679
3680 while (!list_empty(&list)) {
3681 rq = list_entry_rq(list.next);
3682 list_del_init(&rq->queuelist);
3683 BUG_ON(!rq->q);
3684 if (rq->q != q) {
3685 /*
3686 * This drops the queue lock
3687 */
3688 if (q)
3689 queue_unplugged(q, depth, from_schedule);
3690 q = rq->q;
3691 depth = 0;
3692 spin_lock_irq(q->queue_lock);
3693 }
3694
3695 /*
3696 * Short-circuit if @q is dead
3697 */
3698 if (unlikely(blk_queue_dying(q))) {
3699 __blk_end_request_all(rq, BLK_STS_IOERR);
3700 continue;
3701 }
3702
3703 /*
3704 * rq is already accounted, so use raw insert
3705 */
3706 if (op_is_flush(rq->cmd_flags))
3707 __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
3708 else
3709 __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
3710
3711 depth++;
3712 }
3713
3714 /*
3715 * This drops the queue lock
3716 */
3717 if (q)
3718 queue_unplugged(q, depth, from_schedule);
3719} 1766}
3720 1767
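The plug now carries only mq_list (plus rq_count and multiple_queues), and blk_flush_plug_list() reduces to flushing callbacks and the blk-mq list. Submitter-side usage is unchanged; a minimal sketch with an invented helper name:

#include <linux/blkdev.h>

/* Sketch: batch bios under one plug so blk-mq can merge and dispatch them together. */
static void example_submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
                submit_bio(bios[i]);
        blk_finish_plug(&plug);         /* drains plug->mq_list via blk_mq_flush_plug_list() */
}
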
3721void blk_finish_plug(struct blk_plug *plug) 1768void blk_finish_plug(struct blk_plug *plug)
@@ -3742,9 +1789,6 @@ int __init blk_dev_init(void)
3742 if (!kblockd_workqueue) 1789 if (!kblockd_workqueue)
3743 panic("Failed to create kblockd\n"); 1790 panic("Failed to create kblockd\n");
3744 1791
3745 request_cachep = kmem_cache_create("blkdev_requests",
3746 sizeof(struct request), 0, SLAB_PANIC, NULL);
3747
3748 blk_requestq_cachep = kmem_cache_create("request_queue", 1792 blk_requestq_cachep = kmem_cache_create("request_queue",
3749 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 1793 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3750 1794
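
With the legacy request_fn path gone, blk_flush_plug_list() above only hands plug->mq_list to blk_mq_flush_plug_list(); the list_sort()/__elv_add_request() dispatch loop is deleted. As a rough caller-side sketch (not part of this patch; bios[] and nr_bios are hypothetical inputs), plugging itself is unchanged:

#include <linux/blkdev.h>
#include <linux/bio.h>

/*
 * Illustrative only: batch several bios under one plug so that
 * blk_finish_plug() -> blk_flush_plug_list() passes them to
 * blk_mq_flush_plug_list() in a single call.
 */
static void submit_batch(struct bio **bios, int nr_bios)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr_bios; i++)
		submit_bio(bios[i]);	/* queued on the per-task plug */
	blk_finish_plug(&plug);		/* drains plug->mq_list via blk-mq */
}
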
diff --git a/block/blk-exec.c b/block/blk-exec.c
index f7b292f12449..a34b7d918742 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -48,8 +48,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
48 struct request *rq, int at_head, 48 struct request *rq, int at_head,
49 rq_end_io_fn *done) 49 rq_end_io_fn *done)
50{ 50{
51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
52
53 WARN_ON(irqs_disabled()); 51 WARN_ON(irqs_disabled());
54 WARN_ON(!blk_rq_is_passthrough(rq)); 52 WARN_ON(!blk_rq_is_passthrough(rq));
55 53
@@ -60,23 +58,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
60 * don't check dying flag for MQ because the request won't 58 * don't check dying flag for MQ because the request won't
61 * be reused after dying flag is set 59 * be reused after dying flag is set
62 */ 60 */
63 if (q->mq_ops) { 61 blk_mq_sched_insert_request(rq, at_head, true, false);
64 blk_mq_sched_insert_request(rq, at_head, true, false);
65 return;
66 }
67
68 spin_lock_irq(q->queue_lock);
69
70 if (unlikely(blk_queue_dying(q))) {
71 rq->rq_flags |= RQF_QUIET;
72 __blk_end_request_all(rq, BLK_STS_IOERR);
73 spin_unlock_irq(q->queue_lock);
74 return;
75 }
76
77 __elv_add_request(q, rq, where);
78 __blk_run_queue(q);
79 spin_unlock_irq(q->queue_lock);
80} 62}
81EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 63EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
82 64
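
blk_execute_rq_nowait() now funnels every passthrough request into blk_mq_sched_insert_request() instead of branching on q->mq_ops. A minimal caller sketch, assuming the request is allocated with blk_get_request() and that my_pt_end_io() is a hypothetical completion handler:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static void my_pt_end_io(struct request *rq, blk_status_t error)
{
	/* illustrative: just drop the reference taken at allocation */
	blk_put_request(rq);
}

static int issue_passthrough(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* always inserted through the blk-mq scheduler now */
	blk_execute_rq_nowait(q, NULL, rq, false, my_pt_end_io);
	return 0;
}
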
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8b44b86779da..a3fc7191c694 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -93,7 +93,7 @@ enum {
93 FLUSH_PENDING_TIMEOUT = 5 * HZ, 93 FLUSH_PENDING_TIMEOUT = 5 * HZ,
94}; 94};
95 95
96static bool blk_kick_flush(struct request_queue *q, 96static void blk_kick_flush(struct request_queue *q,
97 struct blk_flush_queue *fq, unsigned int flags); 97 struct blk_flush_queue *fq, unsigned int flags);
98 98
99static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) 99static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
@@ -132,18 +132,9 @@ static void blk_flush_restore_request(struct request *rq)
132 rq->end_io = rq->flush.saved_end_io; 132 rq->end_io = rq->flush.saved_end_io;
133} 133}
134 134
135static bool blk_flush_queue_rq(struct request *rq, bool add_front) 135static void blk_flush_queue_rq(struct request *rq, bool add_front)
136{ 136{
137 if (rq->q->mq_ops) { 137 blk_mq_add_to_requeue_list(rq, add_front, true);
138 blk_mq_add_to_requeue_list(rq, add_front, true);
139 return false;
140 } else {
141 if (add_front)
142 list_add(&rq->queuelist, &rq->q->queue_head);
143 else
144 list_add_tail(&rq->queuelist, &rq->q->queue_head);
145 return true;
146 }
147} 138}
148 139
149/** 140/**
@@ -157,18 +148,17 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
157 * completion and trigger the next step. 148 * completion and trigger the next step.
158 * 149 *
159 * CONTEXT: 150 * CONTEXT:
160 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) 151 * spin_lock_irq(fq->mq_flush_lock)
161 * 152 *
162 * RETURNS: 153 * RETURNS:
163 * %true if requests were added to the dispatch queue, %false otherwise. 154 * %true if requests were added to the dispatch queue, %false otherwise.
164 */ 155 */
165static bool blk_flush_complete_seq(struct request *rq, 156static void blk_flush_complete_seq(struct request *rq,
166 struct blk_flush_queue *fq, 157 struct blk_flush_queue *fq,
167 unsigned int seq, blk_status_t error) 158 unsigned int seq, blk_status_t error)
168{ 159{
169 struct request_queue *q = rq->q; 160 struct request_queue *q = rq->q;
170 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; 161 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
171 bool queued = false, kicked;
172 unsigned int cmd_flags; 162 unsigned int cmd_flags;
173 163
174 BUG_ON(rq->flush.seq & seq); 164 BUG_ON(rq->flush.seq & seq);
@@ -191,7 +181,7 @@ static bool blk_flush_complete_seq(struct request *rq,
191 181
192 case REQ_FSEQ_DATA: 182 case REQ_FSEQ_DATA:
193 list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); 183 list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
194 queued = blk_flush_queue_rq(rq, true); 184 blk_flush_queue_rq(rq, true);
195 break; 185 break;
196 186
197 case REQ_FSEQ_DONE: 187 case REQ_FSEQ_DONE:
@@ -204,42 +194,34 @@ static bool blk_flush_complete_seq(struct request *rq,
204 BUG_ON(!list_empty(&rq->queuelist)); 194 BUG_ON(!list_empty(&rq->queuelist));
205 list_del_init(&rq->flush.list); 195 list_del_init(&rq->flush.list);
206 blk_flush_restore_request(rq); 196 blk_flush_restore_request(rq);
207 if (q->mq_ops) 197 blk_mq_end_request(rq, error);
208 blk_mq_end_request(rq, error);
209 else
210 __blk_end_request_all(rq, error);
211 break; 198 break;
212 199
213 default: 200 default:
214 BUG(); 201 BUG();
215 } 202 }
216 203
217 kicked = blk_kick_flush(q, fq, cmd_flags); 204 blk_kick_flush(q, fq, cmd_flags);
218 return kicked | queued;
219} 205}
220 206
221static void flush_end_io(struct request *flush_rq, blk_status_t error) 207static void flush_end_io(struct request *flush_rq, blk_status_t error)
222{ 208{
223 struct request_queue *q = flush_rq->q; 209 struct request_queue *q = flush_rq->q;
224 struct list_head *running; 210 struct list_head *running;
225 bool queued = false;
226 struct request *rq, *n; 211 struct request *rq, *n;
227 unsigned long flags = 0; 212 unsigned long flags = 0;
228 struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); 213 struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
214 struct blk_mq_hw_ctx *hctx;
229 215
230 if (q->mq_ops) { 216 /* release the tag's ownership to the req cloned from */
231 struct blk_mq_hw_ctx *hctx; 217 spin_lock_irqsave(&fq->mq_flush_lock, flags);
232 218 hctx = flush_rq->mq_hctx;
233 /* release the tag's ownership to the req cloned from */ 219 if (!q->elevator) {
234 spin_lock_irqsave(&fq->mq_flush_lock, flags); 220 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
235 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); 221 flush_rq->tag = -1;
236 if (!q->elevator) { 222 } else {
237 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); 223 blk_mq_put_driver_tag_hctx(hctx, flush_rq);
238 flush_rq->tag = -1; 224 flush_rq->internal_tag = -1;
239 } else {
240 blk_mq_put_driver_tag_hctx(hctx, flush_rq);
241 flush_rq->internal_tag = -1;
242 }
243 } 225 }
244 226
245 running = &fq->flush_queue[fq->flush_running_idx]; 227 running = &fq->flush_queue[fq->flush_running_idx];
@@ -248,35 +230,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
248 /* account completion of the flush request */ 230 /* account completion of the flush request */
249 fq->flush_running_idx ^= 1; 231 fq->flush_running_idx ^= 1;
250 232
251 if (!q->mq_ops)
252 elv_completed_request(q, flush_rq);
253
254 /* and push the waiting requests to the next stage */ 233 /* and push the waiting requests to the next stage */
255 list_for_each_entry_safe(rq, n, running, flush.list) { 234 list_for_each_entry_safe(rq, n, running, flush.list) {
256 unsigned int seq = blk_flush_cur_seq(rq); 235 unsigned int seq = blk_flush_cur_seq(rq);
257 236
258 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); 237 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
259 queued |= blk_flush_complete_seq(rq, fq, seq, error); 238 blk_flush_complete_seq(rq, fq, seq, error);
260 } 239 }
261 240
262 /*
263 * Kick the queue to avoid stall for two cases:
264 * 1. Moving a request silently to empty queue_head may stall the
265 * queue.
266 * 2. When flush request is running in non-queueable queue, the
267 * queue is held. Restart the queue after the flush request is finished
268 * to avoid stall.
269 * This function is called from request completion path and calling
270 * directly into request_fn may confuse the driver. Always use
271 * kblockd.
272 */
273 if (queued || fq->flush_queue_delayed) {
274 WARN_ON(q->mq_ops);
275 blk_run_queue_async(q);
276 }
277 fq->flush_queue_delayed = 0; 241 fq->flush_queue_delayed = 0;
278 if (q->mq_ops) 242 spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
279 spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
280} 243}
281 244
282/** 245/**
@@ -289,12 +252,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
289 * Please read the comment at the top of this file for more info. 252 * Please read the comment at the top of this file for more info.
290 * 253 *
291 * CONTEXT: 254 * CONTEXT:
292 * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) 255 * spin_lock_irq(fq->mq_flush_lock)
293 * 256 *
294 * RETURNS:
295 * %true if flush was issued, %false otherwise.
296 */ 257 */
297static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, 258static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
298 unsigned int flags) 259 unsigned int flags)
299{ 260{
300 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; 261 struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
@@ -304,7 +265,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
304 265
305 /* C1 described at the top of this file */ 266 /* C1 described at the top of this file */
306 if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) 267 if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
307 return false; 268 return;
308 269
309 /* C2 and C3 270 /* C2 and C3
310 * 271 *
@@ -312,11 +273,10 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
312 * assigned to empty flushes, and we deadlock if we are expecting 273 * assigned to empty flushes, and we deadlock if we are expecting
313 * other requests to make progress. Don't defer for that case. 274 * other requests to make progress. Don't defer for that case.
314 */ 275 */
315 if (!list_empty(&fq->flush_data_in_flight) && 276 if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
316 !(q->mq_ops && q->elevator) &&
317 time_before(jiffies, 277 time_before(jiffies,
318 fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) 278 fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
319 return false; 279 return;
320 280
321 /* 281 /*
322 * Issue flush and toggle pending_idx. This makes pending_idx 282 * Issue flush and toggle pending_idx. This makes pending_idx
@@ -334,19 +294,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
334 * In case of an IO scheduler, the flush rq needs to borrow a scheduler tag 294 * In case of an IO scheduler, the flush rq needs to borrow a scheduler tag
335 * just for cheating put/get driver tag. 295 * just for cheating put/get driver tag.
336 */ 296 */
337 if (q->mq_ops) { 297 flush_rq->mq_ctx = first_rq->mq_ctx;
338 struct blk_mq_hw_ctx *hctx; 298 flush_rq->mq_hctx = first_rq->mq_hctx;
339 299
340 flush_rq->mq_ctx = first_rq->mq_ctx; 300 if (!q->elevator) {
341 301 fq->orig_rq = first_rq;
342 if (!q->elevator) { 302 flush_rq->tag = first_rq->tag;
343 fq->orig_rq = first_rq; 303 blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
344 flush_rq->tag = first_rq->tag; 304 } else {
345 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); 305 flush_rq->internal_tag = first_rq->internal_tag;
346 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
347 } else {
348 flush_rq->internal_tag = first_rq->internal_tag;
349 }
350 } 306 }
351 307
352 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; 308 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -355,62 +311,17 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
355 flush_rq->rq_disk = first_rq->rq_disk; 311 flush_rq->rq_disk = first_rq->rq_disk;
356 flush_rq->end_io = flush_end_io; 312 flush_rq->end_io = flush_end_io;
357 313
358 return blk_flush_queue_rq(flush_rq, false); 314 blk_flush_queue_rq(flush_rq, false);
359}
360
361static void flush_data_end_io(struct request *rq, blk_status_t error)
362{
363 struct request_queue *q = rq->q;
364 struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
365
366 lockdep_assert_held(q->queue_lock);
367
368 /*
369 * Updating q->in_flight[] here for making this tag usable
370 * early. Because in blk_queue_start_tag(),
371 * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and
372 * reserve tags for sync I/O.
373 *
374 * More importantly this way can avoid the following I/O
375 * deadlock:
376 *
377 * - suppose there are 40 fua requests coming to the flush queue
378 * and queue depth is 31
379 * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc
380 * tag for async I/O any more
381 * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT
382 * and flush_data_end_io() is called
383 * - the other rqs still can't go ahead without updating
384 * q->in_flight[BLK_RW_ASYNC] here; meanwhile these rqs
385 * are held in the flush data queue and make no progress
386 * handling the post-flush rq
387 * - only after the post flush rq is handled, all these rqs
388 * can be completed
389 */
390
391 elv_completed_request(q, rq);
392
393 /* for avoiding double accounting */
394 rq->rq_flags &= ~RQF_STARTED;
395
396 /*
397 * After populating an empty queue, kick it to avoid stall. Read
398 * the comment in flush_end_io().
399 */
400 if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
401 blk_run_queue_async(q);
402} 315}
403 316
404static void mq_flush_data_end_io(struct request *rq, blk_status_t error) 317static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
405{ 318{
406 struct request_queue *q = rq->q; 319 struct request_queue *q = rq->q;
407 struct blk_mq_hw_ctx *hctx; 320 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
408 struct blk_mq_ctx *ctx = rq->mq_ctx; 321 struct blk_mq_ctx *ctx = rq->mq_ctx;
409 unsigned long flags; 322 unsigned long flags;
410 struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); 323 struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
411 324
412 hctx = blk_mq_map_queue(q, ctx->cpu);
413
414 if (q->elevator) { 325 if (q->elevator) {
415 WARN_ON(rq->tag < 0); 326 WARN_ON(rq->tag < 0);
416 blk_mq_put_driver_tag_hctx(hctx, rq); 327 blk_mq_put_driver_tag_hctx(hctx, rq);
@@ -443,9 +354,6 @@ void blk_insert_flush(struct request *rq)
443 unsigned int policy = blk_flush_policy(fflags, rq); 354 unsigned int policy = blk_flush_policy(fflags, rq);
444 struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); 355 struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
445 356
446 if (!q->mq_ops)
447 lockdep_assert_held(q->queue_lock);
448
449 /* 357 /*
450 * @policy now records what operations need to be done. Adjust 358 * @policy now records what operations need to be done. Adjust
451 * REQ_PREFLUSH and FUA for the driver. 359 * REQ_PREFLUSH and FUA for the driver.
@@ -468,10 +376,7 @@ void blk_insert_flush(struct request *rq)
468 * complete the request. 376 * complete the request.
469 */ 377 */
470 if (!policy) { 378 if (!policy) {
471 if (q->mq_ops) 379 blk_mq_end_request(rq, 0);
472 blk_mq_end_request(rq, 0);
473 else
474 __blk_end_request(rq, 0, 0);
475 return; 380 return;
476 } 381 }
477 382
@@ -484,10 +389,7 @@ void blk_insert_flush(struct request *rq)
484 */ 389 */
485 if ((policy & REQ_FSEQ_DATA) && 390 if ((policy & REQ_FSEQ_DATA) &&
486 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 391 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
487 if (q->mq_ops) 392 blk_mq_request_bypass_insert(rq, false);
488 blk_mq_request_bypass_insert(rq, false);
489 else
490 list_add_tail(&rq->queuelist, &q->queue_head);
491 return; 393 return;
492 } 394 }
493 395
@@ -499,17 +401,12 @@ void blk_insert_flush(struct request *rq)
499 INIT_LIST_HEAD(&rq->flush.list); 401 INIT_LIST_HEAD(&rq->flush.list);
500 rq->rq_flags |= RQF_FLUSH_SEQ; 402 rq->rq_flags |= RQF_FLUSH_SEQ;
501 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 403 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
502 if (q->mq_ops) {
503 rq->end_io = mq_flush_data_end_io;
504 404
505 spin_lock_irq(&fq->mq_flush_lock); 405 rq->end_io = mq_flush_data_end_io;
506 blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
507 spin_unlock_irq(&fq->mq_flush_lock);
508 return;
509 }
510 rq->end_io = flush_data_end_io;
511 406
407 spin_lock_irq(&fq->mq_flush_lock);
512 blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); 408 blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
409 spin_unlock_irq(&fq->mq_flush_lock);
513} 410}
514 411
515/** 412/**
@@ -575,8 +472,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
575 if (!fq) 472 if (!fq)
576 goto fail; 473 goto fail;
577 474
578 if (q->mq_ops) 475 spin_lock_init(&fq->mq_flush_lock);
579 spin_lock_init(&fq->mq_flush_lock);
580 476
581 rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); 477 rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
582 fq->flush_rq = kzalloc_node(rq_sz, flags, node); 478 fq->flush_rq = kzalloc_node(rq_sz, flags, node);
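
The flush state machine above (PREFLUSH -> DATA -> POSTFLUSH, driven by blk_flush_complete_seq()) is now blk-mq only, so the queued/kicked bookkeeping needed to restart the legacy queue disappears. It is still engaged the same way: a driver with a volatile write cache advertises it, and blk_insert_flush() expands REQ_PREFLUSH/REQ_FUA requests into that sequence. A sketch, with my_queue_setup() being a hypothetical helper:

#include <linux/blkdev.h>

static void my_queue_setup(struct request_queue *q, bool has_fua)
{
	/* write-back cache present; FUA passthrough optional */
	blk_queue_write_cache(q, true, has_fua);
}
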
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 01580f88fcb3..5ed59ac6ae58 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -28,7 +28,6 @@ void get_io_context(struct io_context *ioc)
28 BUG_ON(atomic_long_read(&ioc->refcount) <= 0); 28 BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
29 atomic_long_inc(&ioc->refcount); 29 atomic_long_inc(&ioc->refcount);
30} 30}
31EXPORT_SYMBOL(get_io_context);
32 31
33static void icq_free_icq_rcu(struct rcu_head *head) 32static void icq_free_icq_rcu(struct rcu_head *head)
34{ 33{
@@ -48,10 +47,8 @@ static void ioc_exit_icq(struct io_cq *icq)
48 if (icq->flags & ICQ_EXITED) 47 if (icq->flags & ICQ_EXITED)
49 return; 48 return;
50 49
51 if (et->uses_mq && et->ops.mq.exit_icq) 50 if (et->ops.exit_icq)
52 et->ops.mq.exit_icq(icq); 51 et->ops.exit_icq(icq);
53 else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
54 et->ops.sq.elevator_exit_icq_fn(icq);
55 52
56 icq->flags |= ICQ_EXITED; 53 icq->flags |= ICQ_EXITED;
57} 54}
@@ -113,9 +110,9 @@ static void ioc_release_fn(struct work_struct *work)
113 struct io_cq, ioc_node); 110 struct io_cq, ioc_node);
114 struct request_queue *q = icq->q; 111 struct request_queue *q = icq->q;
115 112
116 if (spin_trylock(q->queue_lock)) { 113 if (spin_trylock(&q->queue_lock)) {
117 ioc_destroy_icq(icq); 114 ioc_destroy_icq(icq);
118 spin_unlock(q->queue_lock); 115 spin_unlock(&q->queue_lock);
119 } else { 116 } else {
120 spin_unlock_irqrestore(&ioc->lock, flags); 117 spin_unlock_irqrestore(&ioc->lock, flags);
121 cpu_relax(); 118 cpu_relax();
@@ -162,7 +159,6 @@ void put_io_context(struct io_context *ioc)
162 if (free_ioc) 159 if (free_ioc)
163 kmem_cache_free(iocontext_cachep, ioc); 160 kmem_cache_free(iocontext_cachep, ioc);
164} 161}
165EXPORT_SYMBOL(put_io_context);
166 162
167/** 163/**
168 * put_io_context_active - put active reference on ioc 164 * put_io_context_active - put active reference on ioc
@@ -173,7 +169,6 @@ EXPORT_SYMBOL(put_io_context);
173 */ 169 */
174void put_io_context_active(struct io_context *ioc) 170void put_io_context_active(struct io_context *ioc)
175{ 171{
176 struct elevator_type *et;
177 unsigned long flags; 172 unsigned long flags;
178 struct io_cq *icq; 173 struct io_cq *icq;
179 174
@@ -187,25 +182,12 @@ void put_io_context_active(struct io_context *ioc)
187 * reverse double locking. Read comment in ioc_release_fn() for 182 * reverse double locking. Read comment in ioc_release_fn() for
188 * explanation on the nested locking annotation. 183 * explanation on the nested locking annotation.
189 */ 184 */
190retry:
191 spin_lock_irqsave_nested(&ioc->lock, flags, 1); 185 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
192 hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { 186 hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
193 if (icq->flags & ICQ_EXITED) 187 if (icq->flags & ICQ_EXITED)
194 continue; 188 continue;
195 189
196 et = icq->q->elevator->type; 190 ioc_exit_icq(icq);
197 if (et->uses_mq) {
198 ioc_exit_icq(icq);
199 } else {
200 if (spin_trylock(icq->q->queue_lock)) {
201 ioc_exit_icq(icq);
202 spin_unlock(icq->q->queue_lock);
203 } else {
204 spin_unlock_irqrestore(&ioc->lock, flags);
205 cpu_relax();
206 goto retry;
207 }
208 }
209 } 191 }
210 spin_unlock_irqrestore(&ioc->lock, flags); 192 spin_unlock_irqrestore(&ioc->lock, flags);
211 193
@@ -232,7 +214,7 @@ static void __ioc_clear_queue(struct list_head *icq_list)
232 214
233 while (!list_empty(icq_list)) { 215 while (!list_empty(icq_list)) {
234 struct io_cq *icq = list_entry(icq_list->next, 216 struct io_cq *icq = list_entry(icq_list->next,
235 struct io_cq, q_node); 217 struct io_cq, q_node);
236 struct io_context *ioc = icq->ioc; 218 struct io_context *ioc = icq->ioc;
237 219
238 spin_lock_irqsave(&ioc->lock, flags); 220 spin_lock_irqsave(&ioc->lock, flags);
@@ -251,16 +233,11 @@ void ioc_clear_queue(struct request_queue *q)
251{ 233{
252 LIST_HEAD(icq_list); 234 LIST_HEAD(icq_list);
253 235
254 spin_lock_irq(q->queue_lock); 236 spin_lock_irq(&q->queue_lock);
255 list_splice_init(&q->icq_list, &icq_list); 237 list_splice_init(&q->icq_list, &icq_list);
238 spin_unlock_irq(&q->queue_lock);
256 239
257 if (q->mq_ops) { 240 __ioc_clear_queue(&icq_list);
258 spin_unlock_irq(q->queue_lock);
259 __ioc_clear_queue(&icq_list);
260 } else {
261 __ioc_clear_queue(&icq_list);
262 spin_unlock_irq(q->queue_lock);
263 }
264} 241}
265 242
266int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) 243int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
@@ -336,7 +313,6 @@ struct io_context *get_task_io_context(struct task_struct *task,
336 313
337 return NULL; 314 return NULL;
338} 315}
339EXPORT_SYMBOL(get_task_io_context);
340 316
341/** 317/**
342 * ioc_lookup_icq - lookup io_cq from ioc 318 * ioc_lookup_icq - lookup io_cq from ioc
@@ -350,7 +326,7 @@ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
350{ 326{
351 struct io_cq *icq; 327 struct io_cq *icq;
352 328
353 lockdep_assert_held(q->queue_lock); 329 lockdep_assert_held(&q->queue_lock);
354 330
355 /* 331 /*
356 * icq's are indexed from @ioc using radix tree and hint pointer, 332 * icq's are indexed from @ioc using radix tree and hint pointer,
@@ -409,16 +385,14 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
409 INIT_HLIST_NODE(&icq->ioc_node); 385 INIT_HLIST_NODE(&icq->ioc_node);
410 386
411 /* lock both q and ioc and try to link @icq */ 387 /* lock both q and ioc and try to link @icq */
412 spin_lock_irq(q->queue_lock); 388 spin_lock_irq(&q->queue_lock);
413 spin_lock(&ioc->lock); 389 spin_lock(&ioc->lock);
414 390
415 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { 391 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
416 hlist_add_head(&icq->ioc_node, &ioc->icq_list); 392 hlist_add_head(&icq->ioc_node, &ioc->icq_list);
417 list_add(&icq->q_node, &q->icq_list); 393 list_add(&icq->q_node, &q->icq_list);
418 if (et->uses_mq && et->ops.mq.init_icq) 394 if (et->ops.init_icq)
419 et->ops.mq.init_icq(icq); 395 et->ops.init_icq(icq);
420 else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
421 et->ops.sq.elevator_init_icq_fn(icq);
422 } else { 396 } else {
423 kmem_cache_free(et->icq_cache, icq); 397 kmem_cache_free(et->icq_cache, icq);
424 icq = ioc_lookup_icq(ioc, q); 398 icq = ioc_lookup_icq(ioc, q);
@@ -427,7 +401,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
427 } 401 }
428 402
429 spin_unlock(&ioc->lock); 403 spin_unlock(&ioc->lock);
430 spin_unlock_irq(q->queue_lock); 404 spin_unlock_irq(&q->queue_lock);
431 radix_tree_preload_end(); 405 radix_tree_preload_end();
432 return icq; 406 return icq;
433} 407}
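
With et->uses_mq and the ops.sq/ops.mq union gone, icq setup and teardown always go through the single ops table. A sketch of what a scheduler registration looks like against that table; the my_* callbacks and the "my-sched" name are hypothetical stubs, and only the field names come from the call sites above:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>

static void my_init_icq(struct io_cq *icq) { }
static void my_exit_icq(struct io_cq *icq) { }
static bool my_has_work(struct blk_mq_hw_ctx *hctx) { return false; }
static struct request *my_dispatch_request(struct blk_mq_hw_ctx *hctx) { return NULL; }
static void my_insert_requests(struct blk_mq_hw_ctx *hctx,
			       struct list_head *list, bool at_head) { }

static struct elevator_type my_sched = {
	.ops = {
		.init_icq		= my_init_icq,
		.exit_icq		= my_exit_icq,
		.insert_requests	= my_insert_requests,
		.dispatch_request	= my_dispatch_request,
		.has_work		= my_has_work,
	},
	.icq_size	= sizeof(struct io_cq),
	.icq_align	= __alignof__(struct io_cq),
	.elevator_name	= "my-sched",
	.elevator_owner	= THIS_MODULE,
};

static int __init my_sched_init(void)
{
	return elv_register(&my_sched);
}
module_init(my_sched_init);
MODULE_LICENSE("GPL");
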
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 38c35c32aff2..fc714ef402a6 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -262,29 +262,25 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
262 stat->rqs.mean); 262 stat->rqs.mean);
263} 263}
264 264
265static inline bool iolatency_may_queue(struct iolatency_grp *iolat, 265static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
266 wait_queue_entry_t *wait,
267 bool first_block)
268{ 266{
269 struct rq_wait *rqw = &iolat->rq_wait; 267 atomic_dec(&rqw->inflight);
268 wake_up(&rqw->wait);
269}
270 270
271 if (first_block && waitqueue_active(&rqw->wait) && 271static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
272 rqw->wait.head.next != &wait->entry) 272{
273 return false; 273 struct iolatency_grp *iolat = private_data;
274 return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); 274 return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
275} 275}
276 276
277static void __blkcg_iolatency_throttle(struct rq_qos *rqos, 277static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
278 struct iolatency_grp *iolat, 278 struct iolatency_grp *iolat,
279 spinlock_t *lock, bool issue_as_root, 279 bool issue_as_root,
280 bool use_memdelay) 280 bool use_memdelay)
281 __releases(lock)
282 __acquires(lock)
283{ 281{
284 struct rq_wait *rqw = &iolat->rq_wait; 282 struct rq_wait *rqw = &iolat->rq_wait;
285 unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); 283 unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
286 DEFINE_WAIT(wait);
287 bool first_block = true;
288 284
289 if (use_delay) 285 if (use_delay)
290 blkcg_schedule_throttle(rqos->q, use_memdelay); 286 blkcg_schedule_throttle(rqos->q, use_memdelay);
@@ -301,27 +297,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
301 return; 297 return;
302 } 298 }
303 299
304 if (iolatency_may_queue(iolat, &wait, first_block)) 300 rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
305 return;
306
307 do {
308 prepare_to_wait_exclusive(&rqw->wait, &wait,
309 TASK_UNINTERRUPTIBLE);
310
311 if (iolatency_may_queue(iolat, &wait, first_block))
312 break;
313 first_block = false;
314
315 if (lock) {
316 spin_unlock_irq(lock);
317 io_schedule();
318 spin_lock_irq(lock);
319 } else {
320 io_schedule();
321 }
322 } while (1);
323
324 finish_wait(&rqw->wait, &wait);
325} 301}
326 302
327#define SCALE_DOWN_FACTOR 2 303#define SCALE_DOWN_FACTOR 2
@@ -478,38 +454,15 @@ static void check_scale_change(struct iolatency_grp *iolat)
478 scale_change(iolat, direction > 0); 454 scale_change(iolat, direction > 0);
479} 455}
480 456
481static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, 457static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
482 spinlock_t *lock)
483{ 458{
484 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); 459 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
485 struct blkcg *blkcg; 460 struct blkcg_gq *blkg = bio->bi_blkg;
486 struct blkcg_gq *blkg;
487 struct request_queue *q = rqos->q;
488 bool issue_as_root = bio_issue_as_root_blkg(bio); 461 bool issue_as_root = bio_issue_as_root_blkg(bio);
489 462
490 if (!blk_iolatency_enabled(blkiolat)) 463 if (!blk_iolatency_enabled(blkiolat))
491 return; 464 return;
492 465
493 rcu_read_lock();
494 blkcg = bio_blkcg(bio);
495 bio_associate_blkcg(bio, &blkcg->css);
496 blkg = blkg_lookup(blkcg, q);
497 if (unlikely(!blkg)) {
498 if (!lock)
499 spin_lock_irq(q->queue_lock);
500 blkg = blkg_lookup_create(blkcg, q);
501 if (IS_ERR(blkg))
502 blkg = NULL;
503 if (!lock)
504 spin_unlock_irq(q->queue_lock);
505 }
506 if (!blkg)
507 goto out;
508
509 bio_issue_init(&bio->bi_issue, bio_sectors(bio));
510 bio_associate_blkg(bio, blkg);
511out:
512 rcu_read_unlock();
513 while (blkg && blkg->parent) { 466 while (blkg && blkg->parent) {
514 struct iolatency_grp *iolat = blkg_to_lat(blkg); 467 struct iolatency_grp *iolat = blkg_to_lat(blkg);
515 if (!iolat) { 468 if (!iolat) {
@@ -518,7 +471,7 @@ out:
518 } 471 }
519 472
520 check_scale_change(iolat); 473 check_scale_change(iolat);
521 __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, 474 __blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
522 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); 475 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
523 blkg = blkg->parent; 476 blkg = blkg->parent;
524 } 477 }
@@ -640,7 +593,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
640 bool enabled = false; 593 bool enabled = false;
641 594
642 blkg = bio->bi_blkg; 595 blkg = bio->bi_blkg;
643 if (!blkg) 596 if (!blkg || !bio_flagged(bio, BIO_TRACKED))
644 return; 597 return;
645 598
646 iolat = blkg_to_lat(bio->bi_blkg); 599 iolat = blkg_to_lat(bio->bi_blkg);
@@ -730,7 +683,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
730 * We could be exiting, don't access the pd unless we have a 683 * We could be exiting, don't access the pd unless we have a
731 * ref on the blkg. 684 * ref on the blkg.
732 */ 685 */
733 if (!blkg_try_get(blkg)) 686 if (!blkg_tryget(blkg))
734 continue; 687 continue;
735 688
736 iolat = blkg_to_lat(blkg); 689 iolat = blkg_to_lat(blkg);
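
The open-coded prepare_to_wait_exclusive() loop, and the spinlock that had to be dropped around io_schedule(), are replaced by the shared rq_qos_wait() helper plus two small callbacks. The pattern, sketched for a hypothetical policy (struct my_throttle and its fields are made up; the callback signatures follow the call above):

#include "blk-rq-qos.h"

struct my_throttle {
	struct rq_wait rqw;
	unsigned int max_depth;
};

static bool my_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
	struct my_throttle *mt = private_data;

	/* true (and the count is bumped) only while below the limit */
	return rq_wait_inc_below(rqw, mt->max_depth);
}

static void my_cleanup(struct rq_wait *rqw, void *private_data)
{
	/* undo the increment taken on our behalf and wake the next waiter */
	atomic_dec(&rqw->inflight);
	wake_up(&rqw->wait);
}

static void my_throttle_wait(struct my_throttle *mt)
{
	/* sleeps with no queue lock held until a slot becomes available */
	rq_qos_wait(&mt->rqw, mt, my_acquire_inflight, my_cleanup);
}

Dropping the spinlock_t *lock argument is what lets blkcg_iolatency_throttle() run without q->queue_lock at all.
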
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7695034f4b87..e7f1c6cf0167 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -389,7 +389,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
389 389
390 bio_set_flag(bio, BIO_SEG_VALID); 390 bio_set_flag(bio, BIO_SEG_VALID);
391} 391}
392EXPORT_SYMBOL(blk_recount_segments);
393 392
394static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 393static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
395 struct bio *nxt) 394 struct bio *nxt)
@@ -596,17 +595,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
596 return ll_new_hw_segment(q, req, bio); 595 return ll_new_hw_segment(q, req, bio);
597} 596}
598 597
599/*
600 * blk-mq uses req->special to carry normal driver per-request payload, it
601 * does not indicate a prepared command that we cannot merge with.
602 */
603static bool req_no_special_merge(struct request *req)
604{
605 struct request_queue *q = req->q;
606
607 return !q->mq_ops && req->special;
608}
609
610static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, 598static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
611 struct request *next) 599 struct request *next)
612{ 600{
@@ -632,13 +620,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
632 unsigned int seg_size = 620 unsigned int seg_size =
633 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; 621 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
634 622
635 /*
636 * First check if the either of the requests are re-queued
637 * requests. Can't merge them if they are.
638 */
639 if (req_no_special_merge(req) || req_no_special_merge(next))
640 return 0;
641
642 if (req_gap_back_merge(req, next->bio)) 623 if (req_gap_back_merge(req, next->bio))
643 return 0; 624 return 0;
644 625
@@ -703,12 +684,10 @@ static void blk_account_io_merge(struct request *req)
703{ 684{
704 if (blk_do_io_stat(req)) { 685 if (blk_do_io_stat(req)) {
705 struct hd_struct *part; 686 struct hd_struct *part;
706 int cpu;
707 687
708 cpu = part_stat_lock(); 688 part_stat_lock();
709 part = req->part; 689 part = req->part;
710 690
711 part_round_stats(req->q, cpu, part);
712 part_dec_in_flight(req->q, part, rq_data_dir(req)); 691 part_dec_in_flight(req->q, part, rq_data_dir(req));
713 692
714 hd_struct_put(part); 693 hd_struct_put(part);
@@ -731,7 +710,8 @@ static inline bool blk_discard_mergable(struct request *req)
731 return false; 710 return false;
732} 711}
733 712
734enum elv_merge blk_try_req_merge(struct request *req, struct request *next) 713static enum elv_merge blk_try_req_merge(struct request *req,
714 struct request *next)
735{ 715{
736 if (blk_discard_mergable(req)) 716 if (blk_discard_mergable(req))
737 return ELEVATOR_DISCARD_MERGE; 717 return ELEVATOR_DISCARD_MERGE;
@@ -748,9 +728,6 @@ enum elv_merge blk_try_req_merge(struct request *req, struct request *next)
748static struct request *attempt_merge(struct request_queue *q, 728static struct request *attempt_merge(struct request_queue *q,
749 struct request *req, struct request *next) 729 struct request *req, struct request *next)
750{ 730{
751 if (!q->mq_ops)
752 lockdep_assert_held(q->queue_lock);
753
754 if (!rq_mergeable(req) || !rq_mergeable(next)) 731 if (!rq_mergeable(req) || !rq_mergeable(next))
755 return NULL; 732 return NULL;
756 733
@@ -758,8 +735,7 @@ static struct request *attempt_merge(struct request_queue *q,
758 return NULL; 735 return NULL;
759 736
760 if (rq_data_dir(req) != rq_data_dir(next) 737 if (rq_data_dir(req) != rq_data_dir(next)
761 || req->rq_disk != next->rq_disk 738 || req->rq_disk != next->rq_disk)
762 || req_no_special_merge(next))
763 return NULL; 739 return NULL;
764 740
765 if (req_op(req) == REQ_OP_WRITE_SAME && 741 if (req_op(req) == REQ_OP_WRITE_SAME &&
@@ -773,6 +749,9 @@ static struct request *attempt_merge(struct request_queue *q,
773 if (req->write_hint != next->write_hint) 749 if (req->write_hint != next->write_hint)
774 return NULL; 750 return NULL;
775 751
752 if (req->ioprio != next->ioprio)
753 return NULL;
754
776 /* 755 /*
777 * If we are allowed to merge, then append bio list 756 * If we are allowed to merge, then append bio list
778 * from next to rq and release next. merge_requests_fn 757 * from next to rq and release next. merge_requests_fn
@@ -828,10 +807,6 @@ static struct request *attempt_merge(struct request_queue *q,
828 */ 807 */
829 blk_account_io_merge(next); 808 blk_account_io_merge(next);
830 809
831 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
832 if (blk_rq_cpu_valid(next))
833 req->cpu = next->cpu;
834
835 /* 810 /*
836 * ownership of bio passed from next to req, return 'next' for 811 * ownership of bio passed from next to req, return 'next' for
837 * the caller to free 812 * the caller to free
@@ -863,16 +838,11 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq)
863int blk_attempt_req_merge(struct request_queue *q, struct request *rq, 838int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
864 struct request *next) 839 struct request *next)
865{ 840{
866 struct elevator_queue *e = q->elevator;
867 struct request *free; 841 struct request *free;
868 842
869 if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
870 if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
871 return 0;
872
873 free = attempt_merge(q, rq, next); 843 free = attempt_merge(q, rq, next);
874 if (free) { 844 if (free) {
875 __blk_put_request(q, free); 845 blk_put_request(free);
876 return 1; 846 return 1;
877 } 847 }
878 848
@@ -891,8 +861,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
891 if (bio_data_dir(bio) != rq_data_dir(rq)) 861 if (bio_data_dir(bio) != rq_data_dir(rq))
892 return false; 862 return false;
893 863
894 /* must be same device and not a special request */ 864 /* must be same device */
895 if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) 865 if (rq->rq_disk != bio->bi_disk)
896 return false; 866 return false;
897 867
898 /* only merge integrity protected bio into ditto rq */ 868 /* only merge integrity protected bio into ditto rq */
@@ -911,6 +881,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
911 if (rq->write_hint != bio->bi_write_hint) 881 if (rq->write_hint != bio->bi_write_hint)
912 return false; 882 return false;
913 883
884 if (rq->ioprio != bio_prio(bio))
885 return false;
886
914 return true; 887 return true;
915} 888}
916 889
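
attempt_merge() and blk_rq_merge_ok() now refuse to merge requests or bios whose I/O priorities differ, instead of blending them with ioprio_best() after the fact. A small sketch of how a submitter would keep two bios mergeable under that rule (bio_a/bio_b are hypothetical, already-built bios):

#include <linux/bio.h>
#include <linux/ioprio.h>

static void tag_bios_same_prio(struct bio *bio_a, struct bio *bio_b)
{
	unsigned short prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);

	bio_set_prio(bio_a, prio);
	bio_set_prio(bio_b, prio);	/* equal ioprio keeps them mergeable */
}
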
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 3eb169f15842..03a534820271 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,9 +14,10 @@
14#include "blk.h" 14#include "blk.h"
15#include "blk-mq.h" 15#include "blk-mq.h"
16 16
17static int cpu_to_queue_index(unsigned int nr_queues, const int cpu) 17static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
18 unsigned int nr_queues, const int cpu)
18{ 19{
19 return cpu % nr_queues; 20 return qmap->queue_offset + (cpu % nr_queues);
20} 21}
21 22
22static int get_first_sibling(unsigned int cpu) 23static int get_first_sibling(unsigned int cpu)
@@ -30,10 +31,10 @@ static int get_first_sibling(unsigned int cpu)
30 return cpu; 31 return cpu;
31} 32}
32 33
33int blk_mq_map_queues(struct blk_mq_tag_set *set) 34int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
34{ 35{
35 unsigned int *map = set->mq_map; 36 unsigned int *map = qmap->mq_map;
36 unsigned int nr_queues = set->nr_hw_queues; 37 unsigned int nr_queues = qmap->nr_queues;
37 unsigned int cpu, first_sibling; 38 unsigned int cpu, first_sibling;
38 39
39 for_each_possible_cpu(cpu) { 40 for_each_possible_cpu(cpu) {
@@ -44,11 +45,11 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
44 * performance optimizations. 45 * performance optimizations.
45 */ 46 */
46 if (cpu < nr_queues) { 47 if (cpu < nr_queues) {
47 map[cpu] = cpu_to_queue_index(nr_queues, cpu); 48 map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
48 } else { 49 } else {
49 first_sibling = get_first_sibling(cpu); 50 first_sibling = get_first_sibling(cpu);
50 if (first_sibling == cpu) 51 if (first_sibling == cpu)
51 map[cpu] = cpu_to_queue_index(nr_queues, cpu); 52 map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
52 else 53 else
53 map[cpu] = map[first_sibling]; 54 map[cpu] = map[first_sibling];
54 } 55 }
@@ -62,12 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_map_queues);
62 * We have no quick way of doing reverse lookups. This is only used at 63 * We have no quick way of doing reverse lookups. This is only used at
63 * queue init time, so runtime isn't important. 64 * queue init time, so runtime isn't important.
64 */ 65 */
65int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) 66int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
66{ 67{
67 int i; 68 int i;
68 69
69 for_each_possible_cpu(i) { 70 for_each_possible_cpu(i) {
70 if (index == mq_map[i]) 71 if (index == qmap->mq_map[i])
71 return local_memory_node(cpu_to_node(i)); 72 return local_memory_node(cpu_to_node(i));
72 } 73 }
73 74
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 10b284a1f18d..90d68760af08 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -23,6 +23,7 @@
23#include "blk-mq.h" 23#include "blk-mq.h"
24#include "blk-mq-debugfs.h" 24#include "blk-mq-debugfs.h"
25#include "blk-mq-tag.h" 25#include "blk-mq-tag.h"
26#include "blk-rq-qos.h"
26 27
27static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) 28static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
28{ 29{
@@ -112,10 +113,8 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
112 113
113#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name 114#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
114static const char *const blk_queue_flag_name[] = { 115static const char *const blk_queue_flag_name[] = {
115 QUEUE_FLAG_NAME(QUEUED),
116 QUEUE_FLAG_NAME(STOPPED), 116 QUEUE_FLAG_NAME(STOPPED),
117 QUEUE_FLAG_NAME(DYING), 117 QUEUE_FLAG_NAME(DYING),
118 QUEUE_FLAG_NAME(BYPASS),
119 QUEUE_FLAG_NAME(BIDI), 118 QUEUE_FLAG_NAME(BIDI),
120 QUEUE_FLAG_NAME(NOMERGES), 119 QUEUE_FLAG_NAME(NOMERGES),
121 QUEUE_FLAG_NAME(SAME_COMP), 120 QUEUE_FLAG_NAME(SAME_COMP),
@@ -318,7 +317,6 @@ static const char *const cmd_flag_name[] = {
318static const char *const rqf_name[] = { 317static const char *const rqf_name[] = {
319 RQF_NAME(SORTED), 318 RQF_NAME(SORTED),
320 RQF_NAME(STARTED), 319 RQF_NAME(STARTED),
321 RQF_NAME(QUEUED),
322 RQF_NAME(SOFTBARRIER), 320 RQF_NAME(SOFTBARRIER),
323 RQF_NAME(FLUSH_SEQ), 321 RQF_NAME(FLUSH_SEQ),
324 RQF_NAME(MIXED_MERGE), 322 RQF_NAME(MIXED_MERGE),
@@ -424,15 +422,18 @@ struct show_busy_params {
424 422
425/* 423/*
426 * Note: the state of a request may change while this function is in progress, 424 * Note: the state of a request may change while this function is in progress,
427 * e.g. due to a concurrent blk_mq_finish_request() call. 425 * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to
426 * keep iterating requests.
428 */ 427 */
429static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) 428static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
430{ 429{
431 const struct show_busy_params *params = data; 430 const struct show_busy_params *params = data;
432 431
433 if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx) 432 if (rq->mq_hctx == params->hctx)
434 __blk_mq_debugfs_rq_show(params->m, 433 __blk_mq_debugfs_rq_show(params->m,
435 list_entry_rq(&rq->queuelist)); 434 list_entry_rq(&rq->queuelist));
435
436 return true;
436} 437}
437 438
438static int hctx_busy_show(void *data, struct seq_file *m) 439static int hctx_busy_show(void *data, struct seq_file *m)
@@ -446,6 +447,21 @@ static int hctx_busy_show(void *data, struct seq_file *m)
446 return 0; 447 return 0;
447} 448}
448 449
450static const char *const hctx_types[] = {
451 [HCTX_TYPE_DEFAULT] = "default",
452 [HCTX_TYPE_READ] = "read",
453 [HCTX_TYPE_POLL] = "poll",
454};
455
456static int hctx_type_show(void *data, struct seq_file *m)
457{
458 struct blk_mq_hw_ctx *hctx = data;
459
460 BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES);
461 seq_printf(m, "%s\n", hctx_types[hctx->type]);
462 return 0;
463}
464
449static int hctx_ctx_map_show(void *data, struct seq_file *m) 465static int hctx_ctx_map_show(void *data, struct seq_file *m)
450{ 466{
451 struct blk_mq_hw_ctx *hctx = data; 467 struct blk_mq_hw_ctx *hctx = data;
@@ -636,36 +652,43 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
636 return 0; 652 return 0;
637} 653}
638 654
639static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) 655#define CTX_RQ_SEQ_OPS(name, type) \
640 __acquires(&ctx->lock) 656static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
641{ 657 __acquires(&ctx->lock) \
642 struct blk_mq_ctx *ctx = m->private; 658{ \
643 659 struct blk_mq_ctx *ctx = m->private; \
644 spin_lock(&ctx->lock); 660 \
645 return seq_list_start(&ctx->rq_list, *pos); 661 spin_lock(&ctx->lock); \
646} 662 return seq_list_start(&ctx->rq_lists[type], *pos); \
647 663} \
648static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) 664 \
649{ 665static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \
650 struct blk_mq_ctx *ctx = m->private; 666 loff_t *pos) \
651 667{ \
652 return seq_list_next(v, &ctx->rq_list, pos); 668 struct blk_mq_ctx *ctx = m->private; \
669 \
670 return seq_list_next(v, &ctx->rq_lists[type], pos); \
671} \
672 \
673static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \
674 __releases(&ctx->lock) \
675{ \
676 struct blk_mq_ctx *ctx = m->private; \
677 \
678 spin_unlock(&ctx->lock); \
679} \
680 \
681static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \
682 .start = ctx_##name##_rq_list_start, \
683 .next = ctx_##name##_rq_list_next, \
684 .stop = ctx_##name##_rq_list_stop, \
685 .show = blk_mq_debugfs_rq_show, \
653} 686}
654 687
655static void ctx_rq_list_stop(struct seq_file *m, void *v) 688CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
656 __releases(&ctx->lock) 689CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
657{ 690CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
658 struct blk_mq_ctx *ctx = m->private;
659
660 spin_unlock(&ctx->lock);
661}
662 691
663static const struct seq_operations ctx_rq_list_seq_ops = {
664 .start = ctx_rq_list_start,
665 .next = ctx_rq_list_next,
666 .stop = ctx_rq_list_stop,
667 .show = blk_mq_debugfs_rq_show,
668};
669static int ctx_dispatched_show(void *data, struct seq_file *m) 692static int ctx_dispatched_show(void *data, struct seq_file *m)
670{ 693{
671 struct blk_mq_ctx *ctx = data; 694 struct blk_mq_ctx *ctx = data;
@@ -798,11 +821,14 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
798 {"run", 0600, hctx_run_show, hctx_run_write}, 821 {"run", 0600, hctx_run_show, hctx_run_write},
799 {"active", 0400, hctx_active_show}, 822 {"active", 0400, hctx_active_show},
800 {"dispatch_busy", 0400, hctx_dispatch_busy_show}, 823 {"dispatch_busy", 0400, hctx_dispatch_busy_show},
824 {"type", 0400, hctx_type_show},
801 {}, 825 {},
802}; 826};
803 827
804static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { 828static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
805 {"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops}, 829 {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
830 {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
831 {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
806 {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, 832 {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
807 {"merged", 0600, ctx_merged_show, ctx_merged_write}, 833 {"merged", 0600, ctx_merged_show, ctx_merged_write},
808 {"completed", 0600, ctx_completed_show, ctx_completed_write}, 834 {"completed", 0600, ctx_completed_show, ctx_completed_write},
@@ -856,6 +882,15 @@ int blk_mq_debugfs_register(struct request_queue *q)
856 goto err; 882 goto err;
857 } 883 }
858 884
885 if (q->rq_qos) {
886 struct rq_qos *rqos = q->rq_qos;
887
888 while (rqos) {
889 blk_mq_debugfs_register_rqos(rqos);
890 rqos = rqos->next;
891 }
892 }
893
859 return 0; 894 return 0;
860 895
861err: 896err:
@@ -978,6 +1013,50 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
978 q->sched_debugfs_dir = NULL; 1013 q->sched_debugfs_dir = NULL;
979} 1014}
980 1015
1016void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
1017{
1018 debugfs_remove_recursive(rqos->debugfs_dir);
1019 rqos->debugfs_dir = NULL;
1020}
1021
1022int blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
1023{
1024 struct request_queue *q = rqos->q;
1025 const char *dir_name = rq_qos_id_to_name(rqos->id);
1026
1027 if (!q->debugfs_dir)
1028 return -ENOENT;
1029
1030 if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
1031 return 0;
1032
1033 if (!q->rqos_debugfs_dir) {
1034 q->rqos_debugfs_dir = debugfs_create_dir("rqos",
1035 q->debugfs_dir);
1036 if (!q->rqos_debugfs_dir)
1037 return -ENOMEM;
1038 }
1039
1040 rqos->debugfs_dir = debugfs_create_dir(dir_name,
1041 rqos->q->rqos_debugfs_dir);
1042 if (!rqos->debugfs_dir)
1043 return -ENOMEM;
1044
1045 if (!debugfs_create_files(rqos->debugfs_dir, rqos,
1046 rqos->ops->debugfs_attrs))
1047 goto err;
1048 return 0;
1049 err:
1050 blk_mq_debugfs_unregister_rqos(rqos);
1051 return -ENOMEM;
1052}
1053
1054void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
1055{
1056 debugfs_remove_recursive(q->rqos_debugfs_dir);
1057 q->rqos_debugfs_dir = NULL;
1058}
1059
981int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, 1060int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
982 struct blk_mq_hw_ctx *hctx) 1061 struct blk_mq_hw_ctx *hctx)
983{ 1062{
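
blk_mq_debugfs_register_rqos() gives each rq_qos policy its own directory under the queue's debugfs "rqos" directory, as long as the policy's ops table provides a debugfs_attrs array. A sketch of such an array; the "id" file and my_rqos_id_show() are hypothetical, while the attr layout matches the tables used in this file:

#include <linux/seq_file.h>
#include "blk-mq-debugfs.h"
#include "blk-rq-qos.h"

static int my_rqos_id_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;

	seq_printf(m, "%d\n", rqos->id);
	return 0;
}

static const struct blk_mq_debugfs_attr my_rqos_debugfs_attrs[] = {
	{"id", 0400, my_rqos_id_show},
	{},
};

/* wired up from the policy's struct rq_qos_ops:
 *	.debugfs_attrs = my_rqos_debugfs_attrs,
 */
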
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a9160be12be0..8c9012a578c1 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -31,6 +31,10 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q);
31int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, 31int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
32 struct blk_mq_hw_ctx *hctx); 32 struct blk_mq_hw_ctx *hctx);
33void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); 33void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
34
35int blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
36void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
37void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
34#else 38#else
35static inline int blk_mq_debugfs_register(struct request_queue *q) 39static inline int blk_mq_debugfs_register(struct request_queue *q)
36{ 40{
@@ -78,6 +82,19 @@ static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
78static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) 82static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
79{ 83{
80} 84}
85
86static inline int blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
87{
88 return 0;
89}
90
91static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
92{
93}
94
95static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
96{
97}
81#endif 98#endif
82 99
83#ifdef CONFIG_BLK_DEBUG_FS_ZONED 100#ifdef CONFIG_BLK_DEBUG_FS_ZONED
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index db644ec624f5..1dce18553984 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -31,26 +31,26 @@
31 * that maps a queue to the CPUs that have irq affinity for the corresponding 31 * that maps a queue to the CPUs that have irq affinity for the corresponding
32 * vector. 32 * vector.
33 */ 33 */
34int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, 34int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
35 int offset) 35 int offset)
36{ 36{
37 const struct cpumask *mask; 37 const struct cpumask *mask;
38 unsigned int queue, cpu; 38 unsigned int queue, cpu;
39 39
40 for (queue = 0; queue < set->nr_hw_queues; queue++) { 40 for (queue = 0; queue < qmap->nr_queues; queue++) {
41 mask = pci_irq_get_affinity(pdev, queue + offset); 41 mask = pci_irq_get_affinity(pdev, queue + offset);
42 if (!mask) 42 if (!mask)
43 goto fallback; 43 goto fallback;
44 44
45 for_each_cpu(cpu, mask) 45 for_each_cpu(cpu, mask)
46 set->mq_map[cpu] = queue; 46 qmap->mq_map[cpu] = qmap->queue_offset + queue;
47 } 47 }
48 48
49 return 0; 49 return 0;
50 50
51fallback: 51fallback:
52 WARN_ON_ONCE(set->nr_hw_queues > 1); 52 WARN_ON_ONCE(qmap->nr_queues > 1);
53 blk_mq_clear_mq_map(set); 53 blk_mq_clear_mq_map(qmap);
54 return 0; 54 return 0;
55} 55}
56EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); 56EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
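
blk_mq_pci_map_queues() now operates on one blk_mq_queue_map and honours its queue_offset, so a driver can lay out several maps over a shared hardware-queue space. A sketch of a .map_queues callback, assuming the tag set carries one map per hctx type in set->map[] and that struct my_dev and its fields are hypothetical:

#include <linux/pci.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>

struct my_dev {
	struct pci_dev *pdev;
	unsigned int nr_irq_queues;	/* IRQ-driven queues */
	unsigned int nr_poll_queues;	/* interrupt-free poll queues */
};

static int my_map_queues(struct blk_mq_tag_set *set)
{
	struct my_dev *dev = set->driver_data;
	struct blk_mq_queue_map *dflt = &set->map[HCTX_TYPE_DEFAULT];
	struct blk_mq_queue_map *poll = &set->map[HCTX_TYPE_POLL];

	/* default queues come first and follow PCI IRQ affinity */
	dflt->nr_queues = dev->nr_irq_queues;
	dflt->queue_offset = 0;
	blk_mq_pci_map_queues(dflt, dev->pdev, 0);

	/* poll queues sit after the default ones; no IRQ vectors to follow */
	poll->nr_queues = dev->nr_poll_queues;
	poll->queue_offset = dev->nr_irq_queues;
	return blk_mq_map_queues(poll);
}
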
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index 996167f1de18..45030a81a1ed 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -29,24 +29,24 @@
29 * @set->nr_hw_queues, or @dev does not provide an affinity mask for a 29 * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
30 * vector, we fall back to the naive mapping. 30 * vector, we fall back to the naive mapping.
31 */ 31 */
32int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, 32int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
33 struct ib_device *dev, int first_vec) 33 struct ib_device *dev, int first_vec)
34{ 34{
35 const struct cpumask *mask; 35 const struct cpumask *mask;
36 unsigned int queue, cpu; 36 unsigned int queue, cpu;
37 37
38 for (queue = 0; queue < set->nr_hw_queues; queue++) { 38 for (queue = 0; queue < map->nr_queues; queue++) {
39 mask = ib_get_vector_affinity(dev, first_vec + queue); 39 mask = ib_get_vector_affinity(dev, first_vec + queue);
40 if (!mask) 40 if (!mask)
41 goto fallback; 41 goto fallback;
42 42
43 for_each_cpu(cpu, mask) 43 for_each_cpu(cpu, mask)
44 set->mq_map[cpu] = queue; 44 map->mq_map[cpu] = map->queue_offset + queue;
45 } 45 }
46 46
47 return 0; 47 return 0;
48 48
49fallback: 49fallback:
50 return blk_mq_map_queues(set); 50 return blk_mq_map_queues(map);
51} 51}
52EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); 52EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 29bfe8017a2d..140933e4a7d1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -31,15 +31,22 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
31} 31}
32EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); 32EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
33 33
34void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) 34void blk_mq_sched_assign_ioc(struct request *rq)
35{ 35{
36 struct request_queue *q = rq->q; 36 struct request_queue *q = rq->q;
37 struct io_context *ioc = rq_ioc(bio); 37 struct io_context *ioc;
38 struct io_cq *icq; 38 struct io_cq *icq;
39 39
40 spin_lock_irq(q->queue_lock); 40 /*
41 * May not have an IO context if it's a passthrough request
42 */
43 ioc = current->io_context;
44 if (!ioc)
45 return;
46
47 spin_lock_irq(&q->queue_lock);
41 icq = ioc_lookup_icq(ioc, q); 48 icq = ioc_lookup_icq(ioc, q);
42 spin_unlock_irq(q->queue_lock); 49 spin_unlock_irq(&q->queue_lock);
43 50
44 if (!icq) { 51 if (!icq) {
45 icq = ioc_create_icq(ioc, q, GFP_ATOMIC); 52 icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
@@ -54,13 +61,14 @@ void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
54 * Mark a hardware queue as needing a restart. For shared queues, maintain 61 * Mark a hardware queue as needing a restart. For shared queues, maintain
55 * a count of how many hardware queues are marked for restart. 62 * a count of how many hardware queues are marked for restart.
56 */ 63 */
57static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) 64void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
58{ 65{
59 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 66 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
60 return; 67 return;
61 68
62 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 69 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
63} 70}
71EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
64 72
65void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) 73void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
66{ 74{
@@ -85,14 +93,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
85 do { 93 do {
86 struct request *rq; 94 struct request *rq;
87 95
88 if (e->type->ops.mq.has_work && 96 if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
89 !e->type->ops.mq.has_work(hctx))
90 break; 97 break;
91 98
92 if (!blk_mq_get_dispatch_budget(hctx)) 99 if (!blk_mq_get_dispatch_budget(hctx))
93 break; 100 break;
94 101
95 rq = e->type->ops.mq.dispatch_request(hctx); 102 rq = e->type->ops.dispatch_request(hctx);
96 if (!rq) { 103 if (!rq) {
97 blk_mq_put_dispatch_budget(hctx); 104 blk_mq_put_dispatch_budget(hctx);
98 break; 105 break;
@@ -110,7 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
110static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, 117static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
111 struct blk_mq_ctx *ctx) 118 struct blk_mq_ctx *ctx)
112{ 119{
113 unsigned idx = ctx->index_hw; 120 unsigned short idx = ctx->index_hw[hctx->type];
114 121
115 if (++idx == hctx->nr_ctx) 122 if (++idx == hctx->nr_ctx)
116 idx = 0; 123 idx = 0;
@@ -163,7 +170,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
163{ 170{
164 struct request_queue *q = hctx->queue; 171 struct request_queue *q = hctx->queue;
165 struct elevator_queue *e = q->elevator; 172 struct elevator_queue *e = q->elevator;
166 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; 173 const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
167 LIST_HEAD(rq_list); 174 LIST_HEAD(rq_list);
168 175
169 /* RCU or SRCU read lock is needed before checking quiesced flag */ 176 /* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -295,11 +302,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
295 * too much time checking for merges. 302 * too much time checking for merges.
296 */ 303 */
297static bool blk_mq_attempt_merge(struct request_queue *q, 304static bool blk_mq_attempt_merge(struct request_queue *q,
305 struct blk_mq_hw_ctx *hctx,
298 struct blk_mq_ctx *ctx, struct bio *bio) 306 struct blk_mq_ctx *ctx, struct bio *bio)
299{ 307{
308 enum hctx_type type = hctx->type;
309
300 lockdep_assert_held(&ctx->lock); 310 lockdep_assert_held(&ctx->lock);
301 311
302 if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { 312 if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
303 ctx->rq_merged++; 313 ctx->rq_merged++;
304 return true; 314 return true;
305 } 315 }
@@ -311,19 +321,21 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
311{ 321{
312 struct elevator_queue *e = q->elevator; 322 struct elevator_queue *e = q->elevator;
313 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 323 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
314 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 324 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
315 bool ret = false; 325 bool ret = false;
326 enum hctx_type type;
316 327
317 if (e && e->type->ops.mq.bio_merge) { 328 if (e && e->type->ops.bio_merge) {
318 blk_mq_put_ctx(ctx); 329 blk_mq_put_ctx(ctx);
319 return e->type->ops.mq.bio_merge(hctx, bio); 330 return e->type->ops.bio_merge(hctx, bio);
320 } 331 }
321 332
333 type = hctx->type;
322 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 334 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
323 !list_empty_careful(&ctx->rq_list)) { 335 !list_empty_careful(&ctx->rq_lists[type])) {
324 /* default per sw-queue merge */ 336 /* default per sw-queue merge */
325 spin_lock(&ctx->lock); 337 spin_lock(&ctx->lock);
326 ret = blk_mq_attempt_merge(q, ctx, bio); 338 ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
327 spin_unlock(&ctx->lock); 339 spin_unlock(&ctx->lock);
328 } 340 }
329 341
@@ -367,7 +379,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
367 struct request_queue *q = rq->q; 379 struct request_queue *q = rq->q;
368 struct elevator_queue *e = q->elevator; 380 struct elevator_queue *e = q->elevator;
369 struct blk_mq_ctx *ctx = rq->mq_ctx; 381 struct blk_mq_ctx *ctx = rq->mq_ctx;
370 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 382 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
371 383
372 /* flush rq in flush machinery need to be dispatched directly */ 384 /* flush rq in flush machinery need to be dispatched directly */
373 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { 385 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
@@ -380,11 +392,11 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
380 if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) 392 if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
381 goto run; 393 goto run;
382 394
383 if (e && e->type->ops.mq.insert_requests) { 395 if (e && e->type->ops.insert_requests) {
384 LIST_HEAD(list); 396 LIST_HEAD(list);
385 397
386 list_add(&rq->queuelist, &list); 398 list_add(&rq->queuelist, &list);
387 e->type->ops.mq.insert_requests(hctx, &list, at_head); 399 e->type->ops.insert_requests(hctx, &list, at_head);
388 } else { 400 } else {
389 spin_lock(&ctx->lock); 401 spin_lock(&ctx->lock);
390 __blk_mq_insert_request(hctx, rq, at_head); 402 __blk_mq_insert_request(hctx, rq, at_head);
@@ -396,27 +408,25 @@ run:
396 blk_mq_run_hw_queue(hctx, async); 408 blk_mq_run_hw_queue(hctx, async);
397} 409}
398 410
399void blk_mq_sched_insert_requests(struct request_queue *q, 411void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
400 struct blk_mq_ctx *ctx, 412 struct blk_mq_ctx *ctx,
401 struct list_head *list, bool run_queue_async) 413 struct list_head *list, bool run_queue_async)
402{ 414{
403 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 415 struct elevator_queue *e;
404 struct elevator_queue *e = hctx->queue->elevator;
405 416
406 if (e && e->type->ops.mq.insert_requests) 417 e = hctx->queue->elevator;
407 e->type->ops.mq.insert_requests(hctx, list, false); 418 if (e && e->type->ops.insert_requests)
419 e->type->ops.insert_requests(hctx, list, false);
408 else { 420 else {
409 /* 421 /*
410 * try to issue requests directly if the hw queue isn't 422 * try to issue requests directly if the hw queue isn't
411 * busy in case of 'none' scheduler, and this way may save 423 * busy in case of 'none' scheduler, and this way may save
412 * us one extra enqueue & dequeue to sw queue. 424 * us one extra enqueue & dequeue to sw queue.
413 */ 425 */
414 if (!hctx->dispatch_busy && !e && !run_queue_async) { 426 if (!hctx->dispatch_busy && !e && !run_queue_async)
415 blk_mq_try_issue_list_directly(hctx, list); 427 blk_mq_try_issue_list_directly(hctx, list);
416 if (list_empty(list)) 428 else
417 return; 429 blk_mq_insert_requests(hctx, ctx, list);
418 }
419 blk_mq_insert_requests(hctx, ctx, list);
420 } 430 }
421 431
422 blk_mq_run_hw_queue(hctx, run_queue_async); 432 blk_mq_run_hw_queue(hctx, run_queue_async);
@@ -489,15 +499,15 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
489 goto err; 499 goto err;
490 } 500 }
491 501
492 ret = e->ops.mq.init_sched(q, e); 502 ret = e->ops.init_sched(q, e);
493 if (ret) 503 if (ret)
494 goto err; 504 goto err;
495 505
496 blk_mq_debugfs_register_sched(q); 506 blk_mq_debugfs_register_sched(q);
497 507
498 queue_for_each_hw_ctx(q, hctx, i) { 508 queue_for_each_hw_ctx(q, hctx, i) {
499 if (e->ops.mq.init_hctx) { 509 if (e->ops.init_hctx) {
500 ret = e->ops.mq.init_hctx(hctx, i); 510 ret = e->ops.init_hctx(hctx, i);
501 if (ret) { 511 if (ret) {
502 eq = q->elevator; 512 eq = q->elevator;
503 blk_mq_exit_sched(q, eq); 513 blk_mq_exit_sched(q, eq);
@@ -523,14 +533,14 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
523 533
524 queue_for_each_hw_ctx(q, hctx, i) { 534 queue_for_each_hw_ctx(q, hctx, i) {
525 blk_mq_debugfs_unregister_sched_hctx(hctx); 535 blk_mq_debugfs_unregister_sched_hctx(hctx);
526 if (e->type->ops.mq.exit_hctx && hctx->sched_data) { 536 if (e->type->ops.exit_hctx && hctx->sched_data) {
527 e->type->ops.mq.exit_hctx(hctx, i); 537 e->type->ops.exit_hctx(hctx, i);
528 hctx->sched_data = NULL; 538 hctx->sched_data = NULL;
529 } 539 }
530 } 540 }
531 blk_mq_debugfs_unregister_sched(q); 541 blk_mq_debugfs_unregister_sched(q);
532 if (e->type->ops.mq.exit_sched) 542 if (e->type->ops.exit_sched)
533 e->type->ops.mq.exit_sched(e); 543 e->type->ops.exit_sched(e);
534 blk_mq_sched_tags_teardown(q); 544 blk_mq_sched_tags_teardown(q);
535 q->elevator = NULL; 545 q->elevator = NULL;
536} 546}
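
The blk-mq-sched.c hunks above show two recurring patterns in this diff: the elevator hooks are reached as e->type->ops.* now that the old ops.mq nesting is gone, and each software queue keeps one request list per hardware-queue type, indexed by hctx->type, with blk_mq_map_queue() taking the command flags to pick that hctx. A minimal sketch of the resulting lookup chain; the HCTX_TYPE_* names are assumed from elsewhere in this series and the demo_* name is made up:

static struct list_head *demo_sw_queue_list(struct request_queue *q,
					    struct blk_mq_ctx *ctx,
					    unsigned int cmd_flags)
{
	/* the command flags (read/write/poll) now select the hctx ... */
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, cmd_flags, ctx->cpu);

	/*
	 * ... and the hctx type selects the per-type software queue list,
	 * e.g. ctx->rq_lists[HCTX_TYPE_POLL] for a polled request.
	 */
	return &ctx->rq_lists[hctx->type];
}
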
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 8a9544203173..c7bdb52367ac 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -8,18 +8,19 @@
8void blk_mq_sched_free_hctx_data(struct request_queue *q, 8void blk_mq_sched_free_hctx_data(struct request_queue *q,
9 void (*exit)(struct blk_mq_hw_ctx *)); 9 void (*exit)(struct blk_mq_hw_ctx *));
10 10
11void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio); 11void blk_mq_sched_assign_ioc(struct request *rq);
12 12
13void blk_mq_sched_request_inserted(struct request *rq); 13void blk_mq_sched_request_inserted(struct request *rq);
14bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, 14bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
15 struct request **merged_request); 15 struct request **merged_request);
16bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); 16bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
17bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); 17bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
18void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
18void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); 19void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
19 20
20void blk_mq_sched_insert_request(struct request *rq, bool at_head, 21void blk_mq_sched_insert_request(struct request *rq, bool at_head,
21 bool run_queue, bool async); 22 bool run_queue, bool async);
22void blk_mq_sched_insert_requests(struct request_queue *q, 23void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
23 struct blk_mq_ctx *ctx, 24 struct blk_mq_ctx *ctx,
24 struct list_head *list, bool run_queue_async); 25 struct list_head *list, bool run_queue_async);
25 26
@@ -43,8 +44,8 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
43{ 44{
44 struct elevator_queue *e = q->elevator; 45 struct elevator_queue *e = q->elevator;
45 46
46 if (e && e->type->ops.mq.allow_merge) 47 if (e && e->type->ops.allow_merge)
47 return e->type->ops.mq.allow_merge(q, rq, bio); 48 return e->type->ops.allow_merge(q, rq, bio);
48 49
49 return true; 50 return true;
50} 51}
@@ -53,8 +54,8 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
53{ 54{
54 struct elevator_queue *e = rq->q->elevator; 55 struct elevator_queue *e = rq->q->elevator;
55 56
56 if (e && e->type->ops.mq.completed_request) 57 if (e && e->type->ops.completed_request)
57 e->type->ops.mq.completed_request(rq, now); 58 e->type->ops.completed_request(rq, now);
58} 59}
59 60
60static inline void blk_mq_sched_started_request(struct request *rq) 61static inline void blk_mq_sched_started_request(struct request *rq)
@@ -62,8 +63,8 @@ static inline void blk_mq_sched_started_request(struct request *rq)
62 struct request_queue *q = rq->q; 63 struct request_queue *q = rq->q;
63 struct elevator_queue *e = q->elevator; 64 struct elevator_queue *e = q->elevator;
64 65
65 if (e && e->type->ops.mq.started_request) 66 if (e && e->type->ops.started_request)
66 e->type->ops.mq.started_request(rq); 67 e->type->ops.started_request(rq);
67} 68}
68 69
69static inline void blk_mq_sched_requeue_request(struct request *rq) 70static inline void blk_mq_sched_requeue_request(struct request *rq)
@@ -71,16 +72,16 @@ static inline void blk_mq_sched_requeue_request(struct request *rq)
71 struct request_queue *q = rq->q; 72 struct request_queue *q = rq->q;
72 struct elevator_queue *e = q->elevator; 73 struct elevator_queue *e = q->elevator;
73 74
74 if (e && e->type->ops.mq.requeue_request) 75 if (e && e->type->ops.requeue_request)
75 e->type->ops.mq.requeue_request(rq); 76 e->type->ops.requeue_request(rq);
76} 77}
77 78
78static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) 79static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
79{ 80{
80 struct elevator_queue *e = hctx->queue->elevator; 81 struct elevator_queue *e = hctx->queue->elevator;
81 82
82 if (e && e->type->ops.mq.has_work) 83 if (e && e->type->ops.has_work)
83 return e->type->ops.mq.has_work(hctx); 84 return e->type->ops.has_work(hctx);
84 85
85 return false; 86 return false;
86} 87}
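
All of the inline helpers above now test a single, flattened elevator_mq_ops table. For orientation, a skeletal scheduler written against that table might look roughly like the sketch below; the demo_* names are illustrative, and registration via elv_register() plus the remaining hooks are omitted:

struct demo_sched_data {
	struct list_head fifo;		/* per-queue scheduler state */
};

static bool demo_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct demo_sched_data *dd = hctx->queue->elevator->elevator_data;

	return !list_empty_careful(&dd->fifo);
}

static struct elevator_type demo_sched = {
	.ops = {
		.has_work	= demo_has_work,
		/* .insert_requests, .dispatch_request, .bio_merge, ... */
	},
	.elevator_name	= "demo",
	.elevator_owner	= THIS_MODULE,
};
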
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index aafb44224c89..3f9c3f4ac44c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -15,6 +15,18 @@
15 15
16static void blk_mq_sysfs_release(struct kobject *kobj) 16static void blk_mq_sysfs_release(struct kobject *kobj)
17{ 17{
18 struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj);
19
20 free_percpu(ctxs->queue_ctx);
21 kfree(ctxs);
22}
23
24static void blk_mq_ctx_sysfs_release(struct kobject *kobj)
25{
26 struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
27
28 /* ctx->ctxs won't be released until all ctx are freed */
29 kobject_put(&ctx->ctxs->kobj);
18} 30}
19 31
20static void blk_mq_hw_sysfs_release(struct kobject *kobj) 32static void blk_mq_hw_sysfs_release(struct kobject *kobj)
@@ -203,7 +215,7 @@ static struct kobj_type blk_mq_ktype = {
203static struct kobj_type blk_mq_ctx_ktype = { 215static struct kobj_type blk_mq_ctx_ktype = {
204 .sysfs_ops = &blk_mq_sysfs_ops, 216 .sysfs_ops = &blk_mq_sysfs_ops,
205 .default_attrs = default_ctx_attrs, 217 .default_attrs = default_ctx_attrs,
206 .release = blk_mq_sysfs_release, 218 .release = blk_mq_ctx_sysfs_release,
207}; 219};
208 220
209static struct kobj_type blk_mq_hw_ktype = { 221static struct kobj_type blk_mq_hw_ktype = {
@@ -235,7 +247,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
235 if (!hctx->nr_ctx) 247 if (!hctx->nr_ctx)
236 return 0; 248 return 0;
237 249
238 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); 250 ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num);
239 if (ret) 251 if (ret)
240 return ret; 252 return ret;
241 253
@@ -258,8 +270,8 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
258 queue_for_each_hw_ctx(q, hctx, i) 270 queue_for_each_hw_ctx(q, hctx, i)
259 blk_mq_unregister_hctx(hctx); 271 blk_mq_unregister_hctx(hctx);
260 272
261 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 273 kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
262 kobject_del(&q->mq_kobj); 274 kobject_del(q->mq_kobj);
263 kobject_put(&dev->kobj); 275 kobject_put(&dev->kobj);
264 276
265 q->mq_sysfs_init_done = false; 277 q->mq_sysfs_init_done = false;
@@ -279,7 +291,7 @@ void blk_mq_sysfs_deinit(struct request_queue *q)
279 ctx = per_cpu_ptr(q->queue_ctx, cpu); 291 ctx = per_cpu_ptr(q->queue_ctx, cpu);
280 kobject_put(&ctx->kobj); 292 kobject_put(&ctx->kobj);
281 } 293 }
282 kobject_put(&q->mq_kobj); 294 kobject_put(q->mq_kobj);
283} 295}
284 296
285void blk_mq_sysfs_init(struct request_queue *q) 297void blk_mq_sysfs_init(struct request_queue *q)
@@ -287,10 +299,12 @@ void blk_mq_sysfs_init(struct request_queue *q)
287 struct blk_mq_ctx *ctx; 299 struct blk_mq_ctx *ctx;
288 int cpu; 300 int cpu;
289 301
290 kobject_init(&q->mq_kobj, &blk_mq_ktype); 302 kobject_init(q->mq_kobj, &blk_mq_ktype);
291 303
292 for_each_possible_cpu(cpu) { 304 for_each_possible_cpu(cpu) {
293 ctx = per_cpu_ptr(q->queue_ctx, cpu); 305 ctx = per_cpu_ptr(q->queue_ctx, cpu);
306
307 kobject_get(q->mq_kobj);
294 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); 308 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
295 } 309 }
296} 310}
@@ -303,11 +317,11 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
303 WARN_ON_ONCE(!q->kobj.parent); 317 WARN_ON_ONCE(!q->kobj.parent);
304 lockdep_assert_held(&q->sysfs_lock); 318 lockdep_assert_held(&q->sysfs_lock);
305 319
306 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 320 ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
307 if (ret < 0) 321 if (ret < 0)
308 goto out; 322 goto out;
309 323
310 kobject_uevent(&q->mq_kobj, KOBJ_ADD); 324 kobject_uevent(q->mq_kobj, KOBJ_ADD);
311 325
312 queue_for_each_hw_ctx(q, hctx, i) { 326 queue_for_each_hw_ctx(q, hctx, i) {
313 ret = blk_mq_register_hctx(hctx); 327 ret = blk_mq_register_hctx(hctx);
@@ -324,8 +338,8 @@ unreg:
324 while (--i >= 0) 338 while (--i >= 0)
325 blk_mq_unregister_hctx(q->queue_hw_ctx[i]); 339 blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
326 340
327 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); 341 kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
328 kobject_del(&q->mq_kobj); 342 kobject_del(q->mq_kobj);
329 kobject_put(&dev->kobj); 343 kobject_put(&dev->kobj);
330 return ret; 344 return ret;
331} 345}
@@ -340,7 +354,6 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
340 354
341 return ret; 355 return ret;
342} 356}
343EXPORT_SYMBOL_GPL(blk_mq_register_dev);
344 357
345void blk_mq_sysfs_unregister(struct request_queue *q) 358void blk_mq_sysfs_unregister(struct request_queue *q)
346{ 359{
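
The sysfs changes above split the release handling: the per-cpu ctx kobjects each take a reference on the shared blk_mq_ctxs kobject at init time and drop it in their own release, so the per-cpu queue_ctx storage is only freed once the last ctx kobject is gone. The same parent/child kobject pattern in isolation, with made-up demo_* names:

struct demo_parent {
	struct kobject kobj;
	void __percpu *shared;		/* freed only in the parent release */
};

struct demo_child {
	struct kobject kobj;
	struct demo_parent *parent;	/* pinned for the child's lifetime */
};

static void demo_parent_release(struct kobject *kobj)
{
	struct demo_parent *p = container_of(kobj, struct demo_parent, kobj);

	free_percpu(p->shared);
	kfree(p);
}

static void demo_child_release(struct kobject *kobj)
{
	struct demo_child *c = container_of(kobj, struct demo_child, kobj);

	kobject_put(&c->parent->kobj);	/* may trigger demo_parent_release() */
}
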
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index cfda95b85d34..2089c6c62f44 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
110 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 110 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
111 struct sbitmap_queue *bt; 111 struct sbitmap_queue *bt;
112 struct sbq_wait_state *ws; 112 struct sbq_wait_state *ws;
113 DEFINE_WAIT(wait); 113 DEFINE_SBQ_WAIT(wait);
114 unsigned int tag_offset; 114 unsigned int tag_offset;
115 bool drop_ctx; 115 bool drop_ctx;
116 int tag; 116 int tag;
@@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
154 if (tag != -1) 154 if (tag != -1)
155 break; 155 break;
156 156
157 prepare_to_wait_exclusive(&ws->wait, &wait, 157 sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
158 TASK_UNINTERRUPTIBLE);
159 158
160 tag = __blk_mq_get_tag(data, bt); 159 tag = __blk_mq_get_tag(data, bt);
161 if (tag != -1) 160 if (tag != -1)
@@ -167,16 +166,17 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
167 bt_prev = bt; 166 bt_prev = bt;
168 io_schedule(); 167 io_schedule();
169 168
169 sbitmap_finish_wait(bt, ws, &wait);
170
170 data->ctx = blk_mq_get_ctx(data->q); 171 data->ctx = blk_mq_get_ctx(data->q);
171 data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); 172 data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
173 data->ctx->cpu);
172 tags = blk_mq_tags_from_data(data); 174 tags = blk_mq_tags_from_data(data);
173 if (data->flags & BLK_MQ_REQ_RESERVED) 175 if (data->flags & BLK_MQ_REQ_RESERVED)
174 bt = &tags->breserved_tags; 176 bt = &tags->breserved_tags;
175 else 177 else
176 bt = &tags->bitmap_tags; 178 bt = &tags->bitmap_tags;
177 179
178 finish_wait(&ws->wait, &wait);
179
180 /* 180 /*
181 * If destination hw queue is changed, fake wake up on 181 * If destination hw queue is changed, fake wake up on
182 * previous queue for compensating the wake up miss, so 182 * previous queue for compensating the wake up miss, so
@@ -191,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
191 if (drop_ctx && data->ctx) 191 if (drop_ctx && data->ctx)
192 blk_mq_put_ctx(data->ctx); 192 blk_mq_put_ctx(data->ctx);
193 193
194 finish_wait(&ws->wait, &wait); 194 sbitmap_finish_wait(bt, ws, &wait);
195 195
196found_tag: 196found_tag:
197 return tag + tag_offset; 197 return tag + tag_offset;
@@ -235,7 +235,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
235 * test and set the bit before assigning ->rqs[]. 235 * test and set the bit before assigning ->rqs[].
236 */ 236 */
237 if (rq && rq->q == hctx->queue) 237 if (rq && rq->q == hctx->queue)
238 iter_data->fn(hctx, rq, iter_data->data, reserved); 238 return iter_data->fn(hctx, rq, iter_data->data, reserved);
239 return true; 239 return true;
240} 240}
241 241
@@ -247,7 +247,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
247 * @fn: Pointer to the function that will be called for each request 247 * @fn: Pointer to the function that will be called for each request
248 * associated with @hctx that has been assigned a driver tag. 248 * associated with @hctx that has been assigned a driver tag.
249 * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) 249 * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
250 * where rq is a pointer to a request. 250 * where rq is a pointer to a request. Return true to continue
251 * iterating tags, false to stop.
251 * @data: Will be passed as third argument to @fn. 252 * @data: Will be passed as third argument to @fn.
252 * @reserved: Indicates whether @bt is the breserved_tags member or the 253 * @reserved: Indicates whether @bt is the breserved_tags member or the
253 * bitmap_tags member of struct blk_mq_tags. 254 * bitmap_tags member of struct blk_mq_tags.
@@ -288,7 +289,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
288 */ 289 */
289 rq = tags->rqs[bitnr]; 290 rq = tags->rqs[bitnr];
290 if (rq && blk_mq_request_started(rq)) 291 if (rq && blk_mq_request_started(rq))
291 iter_data->fn(rq, iter_data->data, reserved); 292 return iter_data->fn(rq, iter_data->data, reserved);
292 293
293 return true; 294 return true;
294} 295}
@@ -300,7 +301,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
300 * or the bitmap_tags member of struct blk_mq_tags. 301 * or the bitmap_tags member of struct blk_mq_tags.
301 * @fn: Pointer to the function that will be called for each started 302 * @fn: Pointer to the function that will be called for each started
302 * request. @fn will be called as follows: @fn(rq, @data, 303 * request. @fn will be called as follows: @fn(rq, @data,
303 * @reserved) where rq is a pointer to a request. 304 * @reserved) where rq is a pointer to a request. Return true
305 * to continue iterating tags, false to stop.
304 * @data: Will be passed as second argument to @fn. 306 * @data: Will be passed as second argument to @fn.
305 * @reserved: Indicates whether @bt is the breserved_tags member or the 307 * @reserved: Indicates whether @bt is the breserved_tags member or the
306 * bitmap_tags member of struct blk_mq_tags. 308 * bitmap_tags member of struct blk_mq_tags.
@@ -325,7 +327,8 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
325 * @fn: Pointer to the function that will be called for each started 327 * @fn: Pointer to the function that will be called for each started
326 * request. @fn will be called as follows: @fn(rq, @priv, 328 * request. @fn will be called as follows: @fn(rq, @priv,
327 * reserved) where rq is a pointer to a request. 'reserved' 329 * reserved) where rq is a pointer to a request. 'reserved'
328 * indicates whether or not @rq is a reserved request. 330 * indicates whether or not @rq is a reserved request. Return
331 * true to continue iterating tags, false to stop.
329 * @priv: Will be passed as second argument to @fn. 332 * @priv: Will be passed as second argument to @fn.
330 */ 333 */
331static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, 334static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
@@ -342,7 +345,8 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
342 * @fn: Pointer to the function that will be called for each started 345 * @fn: Pointer to the function that will be called for each started
343 * request. @fn will be called as follows: @fn(rq, @priv, 346 * request. @fn will be called as follows: @fn(rq, @priv,
344 * reserved) where rq is a pointer to a request. 'reserved' 347 * reserved) where rq is a pointer to a request. 'reserved'
345 * indicates whether or not @rq is a reserved request. 348 * indicates whether or not @rq is a reserved request. Return
349 * true to continue iterating tags, false to stop.
346 * @priv: Will be passed as second argument to @fn. 350 * @priv: Will be passed as second argument to @fn.
347 */ 351 */
348void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, 352void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
@@ -526,16 +530,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
526 */ 530 */
527u32 blk_mq_unique_tag(struct request *rq) 531u32 blk_mq_unique_tag(struct request *rq)
528{ 532{
529 struct request_queue *q = rq->q; 533 return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
530 struct blk_mq_hw_ctx *hctx;
531 int hwq = 0;
532
533 if (q->mq_ops) {
534 hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
535 hwq = hctx->queue_num;
536 }
537
538 return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
539 (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); 534 (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
540} 535}
541EXPORT_SYMBOL(blk_mq_unique_tag); 536EXPORT_SYMBOL(blk_mq_unique_tag);
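
Two API shifts show up in this file: the busy-tag iteration callbacks now return bool (true to keep iterating, false to stop early; blk_mq_rq_inflight() further down in blk-mq.c is exactly that kind of early-exit user), and the tag-wait loop goes through the new sbitmap wait helpers instead of open-coding prepare_to_wait_exclusive()/finish_wait(). A stripped-down sketch of that wait pattern, leaving out the hctx re-mapping and reserved-tag handling of the real code; the demo_* name is illustrative and __sbitmap_queue_get() is assumed as the underlying allocator:

static int demo_wait_for_bit(struct sbitmap_queue *bt,
			     struct sbq_wait_state *ws)
{
	DEFINE_SBQ_WAIT(wait);
	int nr;

	do {
		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
		nr = __sbitmap_queue_get(bt);
		if (nr != -1)
			break;
		io_schedule();
		sbitmap_finish_wait(bt, ws, &wait);
	} while (1);

	sbitmap_finish_wait(bt, ws, &wait);
	return nr;
}
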
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index c3afbca11299..370827163835 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -29,7 +29,7 @@
29 * that maps a queue to the CPUs that have irq affinity for the corresponding 29 * that maps a queue to the CPUs that have irq affinity for the corresponding
30 * vector. 30 * vector.
31 */ 31 */
32int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, 32int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
33 struct virtio_device *vdev, int first_vec) 33 struct virtio_device *vdev, int first_vec)
34{ 34{
35 const struct cpumask *mask; 35 const struct cpumask *mask;
@@ -38,17 +38,17 @@ int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
38 if (!vdev->config->get_vq_affinity) 38 if (!vdev->config->get_vq_affinity)
39 goto fallback; 39 goto fallback;
40 40
41 for (queue = 0; queue < set->nr_hw_queues; queue++) { 41 for (queue = 0; queue < qmap->nr_queues; queue++) {
42 mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); 42 mask = vdev->config->get_vq_affinity(vdev, first_vec + queue);
43 if (!mask) 43 if (!mask)
44 goto fallback; 44 goto fallback;
45 45
46 for_each_cpu(cpu, mask) 46 for_each_cpu(cpu, mask)
47 set->mq_map[cpu] = queue; 47 qmap->mq_map[cpu] = qmap->queue_offset + queue;
48 } 48 }
49 49
50 return 0; 50 return 0;
51fallback: 51fallback:
52 return blk_mq_map_queues(set); 52 return blk_mq_map_queues(qmap);
53} 53}
54EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); 54EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
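
blk_mq_virtio_map_queues() now operates on a single struct blk_mq_queue_map, so a driver exposing several queue groups calls the mapping helper once per map and uses queue_offset to partition its hardware queues. A hypothetical .map_queues sketch: the set->map[]/HCTX_TYPE_* layout is assumed from the rest of this series, nr_queues is assumed to have been filled in beforehand, and the demo_* name is made up:

static int demo_map_queues(struct blk_mq_tag_set *set)
{
	struct blk_mq_queue_map *def = &set->map[HCTX_TYPE_DEFAULT];
	struct blk_mq_queue_map *poll = &set->map[HCTX_TYPE_POLL];

	/* interrupt-driven queues first ... */
	def->queue_offset = 0;
	blk_mq_map_queues(def);

	/* ... then the poll-only queues, offset past them */
	poll->queue_offset = def->nr_queues;
	blk_mq_map_queues(poll);

	return 0;
}
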
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6a7566244de3..3ba37b9e15e9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,6 @@
38#include "blk-mq-sched.h" 38#include "blk-mq-sched.h"
39#include "blk-rq-qos.h" 39#include "blk-rq-qos.h"
40 40
41static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
42static void blk_mq_poll_stats_start(struct request_queue *q); 41static void blk_mq_poll_stats_start(struct request_queue *q);
43static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 42static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
44 43
@@ -75,14 +74,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
75static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 74static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
76 struct blk_mq_ctx *ctx) 75 struct blk_mq_ctx *ctx)
77{ 76{
78 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) 77 const int bit = ctx->index_hw[hctx->type];
79 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); 78
79 if (!sbitmap_test_bit(&hctx->ctx_map, bit))
80 sbitmap_set_bit(&hctx->ctx_map, bit);
80} 81}
81 82
82static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 83static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
83 struct blk_mq_ctx *ctx) 84 struct blk_mq_ctx *ctx)
84{ 85{
85 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); 86 const int bit = ctx->index_hw[hctx->type];
87
88 sbitmap_clear_bit(&hctx->ctx_map, bit);
86} 89}
87 90
88struct mq_inflight { 91struct mq_inflight {
@@ -90,33 +93,33 @@ struct mq_inflight {
90 unsigned int *inflight; 93 unsigned int *inflight;
91}; 94};
92 95
93static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, 96static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
94 struct request *rq, void *priv, 97 struct request *rq, void *priv,
95 bool reserved) 98 bool reserved)
96{ 99{
97 struct mq_inflight *mi = priv; 100 struct mq_inflight *mi = priv;
98 101
99 /* 102 /*
100 * index[0] counts the specific partition that was asked for. index[1] 103 * index[0] counts the specific partition that was asked for.
101 * counts the ones that are active on the whole device, so increment
102 * that if mi->part is indeed a partition, and not a whole device.
103 */ 104 */
104 if (rq->part == mi->part) 105 if (rq->part == mi->part)
105 mi->inflight[0]++; 106 mi->inflight[0]++;
106 if (mi->part->partno) 107
107 mi->inflight[1]++; 108 return true;
108} 109}
109 110
110void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, 111unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
111 unsigned int inflight[2])
112{ 112{
113 unsigned inflight[2];
113 struct mq_inflight mi = { .part = part, .inflight = inflight, }; 114 struct mq_inflight mi = { .part = part, .inflight = inflight, };
114 115
115 inflight[0] = inflight[1] = 0; 116 inflight[0] = inflight[1] = 0;
116 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 117 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
118
119 return inflight[0];
117} 120}
118 121
119static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, 122static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
120 struct request *rq, void *priv, 123 struct request *rq, void *priv,
121 bool reserved) 124 bool reserved)
122{ 125{
@@ -124,6 +127,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
124 127
125 if (rq->part == mi->part) 128 if (rq->part == mi->part)
126 mi->inflight[rq_data_dir(rq)]++; 129 mi->inflight[rq_data_dir(rq)]++;
130
131 return true;
127} 132}
128 133
129void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, 134void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
@@ -142,7 +147,7 @@ void blk_freeze_queue_start(struct request_queue *q)
142 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 147 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
143 if (freeze_depth == 1) { 148 if (freeze_depth == 1) {
144 percpu_ref_kill(&q->q_usage_counter); 149 percpu_ref_kill(&q->q_usage_counter);
145 if (q->mq_ops) 150 if (queue_is_mq(q))
146 blk_mq_run_hw_queues(q, false); 151 blk_mq_run_hw_queues(q, false);
147 } 152 }
148} 153}
@@ -177,8 +182,6 @@ void blk_freeze_queue(struct request_queue *q)
177 * exported to drivers as the only user for unfreeze is blk_mq. 182 * exported to drivers as the only user for unfreeze is blk_mq.
178 */ 183 */
179 blk_freeze_queue_start(q); 184 blk_freeze_queue_start(q);
180 if (!q->mq_ops)
181 blk_drain_queue(q);
182 blk_mq_freeze_queue_wait(q); 185 blk_mq_freeze_queue_wait(q);
183} 186}
184 187
@@ -275,6 +278,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
275} 278}
276EXPORT_SYMBOL(blk_mq_can_queue); 279EXPORT_SYMBOL(blk_mq_can_queue);
277 280
281/*
282 * Only need start/end time stamping if we have stats enabled, or using
283 * an IO scheduler.
284 */
285static inline bool blk_mq_need_time_stamp(struct request *rq)
286{
287 return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
288}
289
278static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 290static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
279 unsigned int tag, unsigned int op) 291 unsigned int tag, unsigned int op)
280{ 292{
@@ -298,8 +310,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
298 /* csd/requeue_work/fifo_time is initialized before use */ 310 /* csd/requeue_work/fifo_time is initialized before use */
299 rq->q = data->q; 311 rq->q = data->q;
300 rq->mq_ctx = data->ctx; 312 rq->mq_ctx = data->ctx;
313 rq->mq_hctx = data->hctx;
301 rq->rq_flags = rq_flags; 314 rq->rq_flags = rq_flags;
302 rq->cpu = -1;
303 rq->cmd_flags = op; 315 rq->cmd_flags = op;
304 if (data->flags & BLK_MQ_REQ_PREEMPT) 316 if (data->flags & BLK_MQ_REQ_PREEMPT)
305 rq->rq_flags |= RQF_PREEMPT; 317 rq->rq_flags |= RQF_PREEMPT;
@@ -310,7 +322,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
310 RB_CLEAR_NODE(&rq->rb_node); 322 RB_CLEAR_NODE(&rq->rb_node);
311 rq->rq_disk = NULL; 323 rq->rq_disk = NULL;
312 rq->part = NULL; 324 rq->part = NULL;
313 rq->start_time_ns = ktime_get_ns(); 325 if (blk_mq_need_time_stamp(rq))
326 rq->start_time_ns = ktime_get_ns();
327 else
328 rq->start_time_ns = 0;
314 rq->io_start_time_ns = 0; 329 rq->io_start_time_ns = 0;
315 rq->nr_phys_segments = 0; 330 rq->nr_phys_segments = 0;
316#if defined(CONFIG_BLK_DEV_INTEGRITY) 331#if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -319,27 +334,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
319 rq->special = NULL; 334 rq->special = NULL;
320 /* tag was already set */ 335 /* tag was already set */
321 rq->extra_len = 0; 336 rq->extra_len = 0;
322 rq->__deadline = 0; 337 WRITE_ONCE(rq->deadline, 0);
323 338
324 INIT_LIST_HEAD(&rq->timeout_list);
325 rq->timeout = 0; 339 rq->timeout = 0;
326 340
327 rq->end_io = NULL; 341 rq->end_io = NULL;
328 rq->end_io_data = NULL; 342 rq->end_io_data = NULL;
329 rq->next_rq = NULL; 343 rq->next_rq = NULL;
330 344
331#ifdef CONFIG_BLK_CGROUP
332 rq->rl = NULL;
333#endif
334
335 data->ctx->rq_dispatched[op_is_sync(op)]++; 345 data->ctx->rq_dispatched[op_is_sync(op)]++;
336 refcount_set(&rq->ref, 1); 346 refcount_set(&rq->ref, 1);
337 return rq; 347 return rq;
338} 348}
339 349
340static struct request *blk_mq_get_request(struct request_queue *q, 350static struct request *blk_mq_get_request(struct request_queue *q,
341 struct bio *bio, unsigned int op, 351 struct bio *bio,
342 struct blk_mq_alloc_data *data) 352 struct blk_mq_alloc_data *data)
343{ 353{
344 struct elevator_queue *e = q->elevator; 354 struct elevator_queue *e = q->elevator;
345 struct request *rq; 355 struct request *rq;
@@ -353,8 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q,
353 put_ctx_on_error = true; 363 put_ctx_on_error = true;
354 } 364 }
355 if (likely(!data->hctx)) 365 if (likely(!data->hctx))
356 data->hctx = blk_mq_map_queue(q, data->ctx->cpu); 366 data->hctx = blk_mq_map_queue(q, data->cmd_flags,
357 if (op & REQ_NOWAIT) 367 data->ctx->cpu);
368 if (data->cmd_flags & REQ_NOWAIT)
358 data->flags |= BLK_MQ_REQ_NOWAIT; 369 data->flags |= BLK_MQ_REQ_NOWAIT;
359 370
360 if (e) { 371 if (e) {
@@ -365,9 +376,10 @@ static struct request *blk_mq_get_request(struct request_queue *q,
365 * dispatch list. Don't include reserved tags in the 376 * dispatch list. Don't include reserved tags in the
366 * limiting, as it isn't useful. 377 * limiting, as it isn't useful.
367 */ 378 */
368 if (!op_is_flush(op) && e->type->ops.mq.limit_depth && 379 if (!op_is_flush(data->cmd_flags) &&
380 e->type->ops.limit_depth &&
369 !(data->flags & BLK_MQ_REQ_RESERVED)) 381 !(data->flags & BLK_MQ_REQ_RESERVED))
370 e->type->ops.mq.limit_depth(op, data); 382 e->type->ops.limit_depth(data->cmd_flags, data);
371 } else { 383 } else {
372 blk_mq_tag_busy(data->hctx); 384 blk_mq_tag_busy(data->hctx);
373 } 385 }
@@ -382,14 +394,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
382 return NULL; 394 return NULL;
383 } 395 }
384 396
385 rq = blk_mq_rq_ctx_init(data, tag, op); 397 rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
386 if (!op_is_flush(op)) { 398 if (!op_is_flush(data->cmd_flags)) {
387 rq->elv.icq = NULL; 399 rq->elv.icq = NULL;
388 if (e && e->type->ops.mq.prepare_request) { 400 if (e && e->type->ops.prepare_request) {
389 if (e->type->icq_cache && rq_ioc(bio)) 401 if (e->type->icq_cache)
390 blk_mq_sched_assign_ioc(rq, bio); 402 blk_mq_sched_assign_ioc(rq);
391 403
392 e->type->ops.mq.prepare_request(rq, bio); 404 e->type->ops.prepare_request(rq, bio);
393 rq->rq_flags |= RQF_ELVPRIV; 405 rq->rq_flags |= RQF_ELVPRIV;
394 } 406 }
395 } 407 }
@@ -400,7 +412,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
400struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 412struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
401 blk_mq_req_flags_t flags) 413 blk_mq_req_flags_t flags)
402{ 414{
403 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 415 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
404 struct request *rq; 416 struct request *rq;
405 int ret; 417 int ret;
406 418
@@ -408,7 +420,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
408 if (ret) 420 if (ret)
409 return ERR_PTR(ret); 421 return ERR_PTR(ret);
410 422
411 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 423 rq = blk_mq_get_request(q, NULL, &alloc_data);
412 blk_queue_exit(q); 424 blk_queue_exit(q);
413 425
414 if (!rq) 426 if (!rq)
@@ -426,7 +438,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
426struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 438struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
427 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) 439 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
428{ 440{
429 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 441 struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
430 struct request *rq; 442 struct request *rq;
431 unsigned int cpu; 443 unsigned int cpu;
432 int ret; 444 int ret;
@@ -459,7 +471,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
459 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); 471 cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
460 alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 472 alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
461 473
462 rq = blk_mq_get_request(q, NULL, op, &alloc_data); 474 rq = blk_mq_get_request(q, NULL, &alloc_data);
463 blk_queue_exit(q); 475 blk_queue_exit(q);
464 476
465 if (!rq) 477 if (!rq)
@@ -473,10 +485,11 @@ static void __blk_mq_free_request(struct request *rq)
473{ 485{
474 struct request_queue *q = rq->q; 486 struct request_queue *q = rq->q;
475 struct blk_mq_ctx *ctx = rq->mq_ctx; 487 struct blk_mq_ctx *ctx = rq->mq_ctx;
476 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 488 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
477 const int sched_tag = rq->internal_tag; 489 const int sched_tag = rq->internal_tag;
478 490
479 blk_pm_mark_last_busy(rq); 491 blk_pm_mark_last_busy(rq);
492 rq->mq_hctx = NULL;
480 if (rq->tag != -1) 493 if (rq->tag != -1)
481 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 494 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
482 if (sched_tag != -1) 495 if (sched_tag != -1)
@@ -490,11 +503,11 @@ void blk_mq_free_request(struct request *rq)
490 struct request_queue *q = rq->q; 503 struct request_queue *q = rq->q;
491 struct elevator_queue *e = q->elevator; 504 struct elevator_queue *e = q->elevator;
492 struct blk_mq_ctx *ctx = rq->mq_ctx; 505 struct blk_mq_ctx *ctx = rq->mq_ctx;
493 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 506 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
494 507
495 if (rq->rq_flags & RQF_ELVPRIV) { 508 if (rq->rq_flags & RQF_ELVPRIV) {
496 if (e && e->type->ops.mq.finish_request) 509 if (e && e->type->ops.finish_request)
497 e->type->ops.mq.finish_request(rq); 510 e->type->ops.finish_request(rq);
498 if (rq->elv.icq) { 511 if (rq->elv.icq) {
499 put_io_context(rq->elv.icq->ioc); 512 put_io_context(rq->elv.icq->ioc);
500 rq->elv.icq = NULL; 513 rq->elv.icq = NULL;
@@ -510,9 +523,6 @@ void blk_mq_free_request(struct request *rq)
510 523
511 rq_qos_done(q, rq); 524 rq_qos_done(q, rq);
512 525
513 if (blk_rq_rl(rq))
514 blk_put_rl(blk_rq_rl(rq));
515
516 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 526 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
517 if (refcount_dec_and_test(&rq->ref)) 527 if (refcount_dec_and_test(&rq->ref))
518 __blk_mq_free_request(rq); 528 __blk_mq_free_request(rq);
@@ -521,7 +531,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
521 531
522inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 532inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
523{ 533{
524 u64 now = ktime_get_ns(); 534 u64 now = 0;
535
536 if (blk_mq_need_time_stamp(rq))
537 now = ktime_get_ns();
525 538
526 if (rq->rq_flags & RQF_STATS) { 539 if (rq->rq_flags & RQF_STATS) {
527 blk_mq_poll_stats_start(rq->q); 540 blk_mq_poll_stats_start(rq->q);
@@ -555,19 +568,19 @@ EXPORT_SYMBOL(blk_mq_end_request);
555static void __blk_mq_complete_request_remote(void *data) 568static void __blk_mq_complete_request_remote(void *data)
556{ 569{
557 struct request *rq = data; 570 struct request *rq = data;
571 struct request_queue *q = rq->q;
558 572
559 rq->q->softirq_done_fn(rq); 573 q->mq_ops->complete(rq);
560} 574}
561 575
562static void __blk_mq_complete_request(struct request *rq) 576static void __blk_mq_complete_request(struct request *rq)
563{ 577{
564 struct blk_mq_ctx *ctx = rq->mq_ctx; 578 struct blk_mq_ctx *ctx = rq->mq_ctx;
579 struct request_queue *q = rq->q;
565 bool shared = false; 580 bool shared = false;
566 int cpu; 581 int cpu;
567 582
568 if (!blk_mq_mark_complete(rq)) 583 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
569 return;
570
571 /* 584 /*
572 * Most of single queue controllers, there is only one irq vector 585 * Most of single queue controllers, there is only one irq vector
573 * for handling IO completion, and the only irq's affinity is set 586 * for handling IO completion, and the only irq's affinity is set
@@ -577,18 +590,23 @@ static void __blk_mq_complete_request(struct request *rq)
577 * So complete IO request in softirq context in case of single queue 590
578 * for not degrading IO performance by irqsoff latency. 591 * for not degrading IO performance by irqsoff latency.
579 */ 592 */
580 if (rq->q->nr_hw_queues == 1) { 593 if (q->nr_hw_queues == 1) {
581 __blk_complete_request(rq); 594 __blk_complete_request(rq);
582 return; 595 return;
583 } 596 }
584 597
585 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 598 /*
586 rq->q->softirq_done_fn(rq); 599 * For a polled request, always complete locally, it's pointless
600 * to redirect the completion.
601 */
602 if ((rq->cmd_flags & REQ_HIPRI) ||
603 !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
604 q->mq_ops->complete(rq);
587 return; 605 return;
588 } 606 }
589 607
590 cpu = get_cpu(); 608 cpu = get_cpu();
591 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 609 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
592 shared = cpus_share_cache(cpu, ctx->cpu); 610 shared = cpus_share_cache(cpu, ctx->cpu);
593 611
594 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 612 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
@@ -597,7 +615,7 @@ static void __blk_mq_complete_request(struct request *rq)
597 rq->csd.flags = 0; 615 rq->csd.flags = 0;
598 smp_call_function_single_async(ctx->cpu, &rq->csd); 616 smp_call_function_single_async(ctx->cpu, &rq->csd);
599 } else { 617 } else {
600 rq->q->softirq_done_fn(rq); 618 q->mq_ops->complete(rq);
601 } 619 }
602 put_cpu(); 620 put_cpu();
603} 621}
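
The hunks above make REQ_HIPRI requests complete on the submitting CPU, so the task spinning in the poll loop is also the one that runs ->complete(). From userspace this path is reached with polled direct IO; a minimal illustration (not part of this diff), assuming a file opened with O_DIRECT, a suitably aligned buffer, and a glibc new enough to expose preadv2()/RWF_HIPRI:

#define _GNU_SOURCE
#include <sys/uio.h>

static ssize_t demo_polled_read(int fd, void *buf, size_t len, off_t off)
{
	/* buf/len/off must satisfy the usual O_DIRECT alignment rules */
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	/* RWF_HIPRI: poll for completion rather than sleep on an interrupt */
	return preadv2(fd, &iov, 1, off, RWF_HIPRI);
}
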
@@ -630,11 +648,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
630 * Ends all I/O on a request. It does not handle partial completions. 648 * Ends all I/O on a request. It does not handle partial completions.
631 * The actual completion happens out-of-order, through a IPI handler. 649 * The actual completion happens out-of-order, through a IPI handler.
632 **/ 650 **/
633void blk_mq_complete_request(struct request *rq) 651bool blk_mq_complete_request(struct request *rq)
634{ 652{
635 if (unlikely(blk_should_fake_timeout(rq->q))) 653 if (unlikely(blk_should_fake_timeout(rq->q)))
636 return; 654 return false;
637 __blk_mq_complete_request(rq); 655 __blk_mq_complete_request(rq);
656 return true;
638} 657}
639EXPORT_SYMBOL(blk_mq_complete_request); 658EXPORT_SYMBOL(blk_mq_complete_request);
640 659
@@ -701,7 +720,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
701 /* this request will be re-inserted to io scheduler queue */ 720 /* this request will be re-inserted to io scheduler queue */
702 blk_mq_sched_requeue_request(rq); 721 blk_mq_sched_requeue_request(rq);
703 722
704 BUG_ON(blk_queued_rq(rq)); 723 BUG_ON(!list_empty(&rq->queuelist));
705 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); 724 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
706} 725}
707EXPORT_SYMBOL(blk_mq_requeue_request); 726EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -786,6 +805,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
786} 805}
787EXPORT_SYMBOL(blk_mq_tag_to_rq); 806EXPORT_SYMBOL(blk_mq_tag_to_rq);
788 807
808static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
809 void *priv, bool reserved)
810{
811 /*
812 * If we find a request that is inflight and the queue matches,
813 * we know the queue is busy. Return false to stop the iteration.
814 */
815 if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
816 bool *busy = priv;
817
818 *busy = true;
819 return false;
820 }
821
822 return true;
823}
824
825bool blk_mq_queue_inflight(struct request_queue *q)
826{
827 bool busy = false;
828
829 blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
830 return busy;
831}
832EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
833
789static void blk_mq_rq_timed_out(struct request *req, bool reserved) 834static void blk_mq_rq_timed_out(struct request *req, bool reserved)
790{ 835{
791 req->rq_flags |= RQF_TIMED_OUT; 836 req->rq_flags |= RQF_TIMED_OUT;
@@ -810,7 +855,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
810 if (rq->rq_flags & RQF_TIMED_OUT) 855 if (rq->rq_flags & RQF_TIMED_OUT)
811 return false; 856 return false;
812 857
813 deadline = blk_rq_deadline(rq); 858 deadline = READ_ONCE(rq->deadline);
814 if (time_after_eq(jiffies, deadline)) 859 if (time_after_eq(jiffies, deadline))
815 return true; 860 return true;
816 861
@@ -821,7 +866,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
821 return false; 866 return false;
822} 867}
823 868
824static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 869static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
825 struct request *rq, void *priv, bool reserved) 870 struct request *rq, void *priv, bool reserved)
826{ 871{
827 unsigned long *next = priv; 872 unsigned long *next = priv;
@@ -831,7 +876,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
831 * so we're not unnecessarily synchronizing across CPUs. 876
832 */ 877 */
833 if (!blk_mq_req_expired(rq, next)) 878 if (!blk_mq_req_expired(rq, next))
834 return; 879 return true;
835 880
836 /* 881 /*
837 * We have reason to believe the request may be expired. Take a 882 * We have reason to believe the request may be expired. Take a
@@ -843,7 +888,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
843 * timeout handler to posting a natural completion. 888 * timeout handler to posting a natural completion.
844 */ 889 */
845 if (!refcount_inc_not_zero(&rq->ref)) 890 if (!refcount_inc_not_zero(&rq->ref))
846 return; 891 return true;
847 892
848 /* 893 /*
849 * The request is now locked and cannot be reallocated underneath the 894 * The request is now locked and cannot be reallocated underneath the
@@ -855,6 +900,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
855 blk_mq_rq_timed_out(rq, reserved); 900 blk_mq_rq_timed_out(rq, reserved);
856 if (refcount_dec_and_test(&rq->ref)) 901 if (refcount_dec_and_test(&rq->ref))
857 __blk_mq_free_request(rq); 902 __blk_mq_free_request(rq);
903
904 return true;
858} 905}
859 906
860static void blk_mq_timeout_work(struct work_struct *work) 907static void blk_mq_timeout_work(struct work_struct *work)
@@ -911,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
911 struct flush_busy_ctx_data *flush_data = data; 958 struct flush_busy_ctx_data *flush_data = data;
912 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 959 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
913 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 960 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
961 enum hctx_type type = hctx->type;
914 962
915 spin_lock(&ctx->lock); 963 spin_lock(&ctx->lock);
916 list_splice_tail_init(&ctx->rq_list, flush_data->list); 964 list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
917 sbitmap_clear_bit(sb, bitnr); 965 sbitmap_clear_bit(sb, bitnr);
918 spin_unlock(&ctx->lock); 966 spin_unlock(&ctx->lock);
919 return true; 967 return true;
@@ -945,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
945 struct dispatch_rq_data *dispatch_data = data; 993 struct dispatch_rq_data *dispatch_data = data;
946 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; 994 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
947 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 995 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
996 enum hctx_type type = hctx->type;
948 997
949 spin_lock(&ctx->lock); 998 spin_lock(&ctx->lock);
950 if (!list_empty(&ctx->rq_list)) { 999 if (!list_empty(&ctx->rq_lists[type])) {
951 dispatch_data->rq = list_entry_rq(ctx->rq_list.next); 1000 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
952 list_del_init(&dispatch_data->rq->queuelist); 1001 list_del_init(&dispatch_data->rq->queuelist);
953 if (list_empty(&ctx->rq_list)) 1002 if (list_empty(&ctx->rq_lists[type]))
954 sbitmap_clear_bit(sb, bitnr); 1003 sbitmap_clear_bit(sb, bitnr);
955 } 1004 }
956 spin_unlock(&ctx->lock); 1005 spin_unlock(&ctx->lock);
@@ -961,7 +1010,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
961struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 1010struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
962 struct blk_mq_ctx *start) 1011 struct blk_mq_ctx *start)
963{ 1012{
964 unsigned off = start ? start->index_hw : 0; 1013 unsigned off = start ? start->index_hw[hctx->type] : 0;
965 struct dispatch_rq_data data = { 1014 struct dispatch_rq_data data = {
966 .hctx = hctx, 1015 .hctx = hctx,
967 .rq = NULL, 1016 .rq = NULL,
@@ -985,8 +1034,9 @@ bool blk_mq_get_driver_tag(struct request *rq)
985{ 1034{
986 struct blk_mq_alloc_data data = { 1035 struct blk_mq_alloc_data data = {
987 .q = rq->q, 1036 .q = rq->q,
988 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), 1037 .hctx = rq->mq_hctx,
989 .flags = BLK_MQ_REQ_NOWAIT, 1038 .flags = BLK_MQ_REQ_NOWAIT,
1039 .cmd_flags = rq->cmd_flags,
990 }; 1040 };
991 bool shared; 1041 bool shared;
992 1042
@@ -1150,7 +1200,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1150 1200
1151 rq = list_first_entry(list, struct request, queuelist); 1201 rq = list_first_entry(list, struct request, queuelist);
1152 1202
1153 hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); 1203 hctx = rq->mq_hctx;
1154 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) 1204 if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1155 break; 1205 break;
1156 1206
@@ -1223,6 +1273,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1223 if (!list_empty(list)) { 1273 if (!list_empty(list)) {
1224 bool needs_restart; 1274 bool needs_restart;
1225 1275
1276 /*
1277 * If we didn't flush the entire list, we could have told
1278 * the driver there was more coming, but that turned out to
1279 * be a lie.
1280 */
1281 if (q->mq_ops->commit_rqs)
1282 q->mq_ops->commit_rqs(hctx);
1283
1226 spin_lock(&hctx->lock); 1284 spin_lock(&hctx->lock);
1227 list_splice_init(list, &hctx->dispatch); 1285 list_splice_init(list, &hctx->dispatch);
1228 spin_unlock(&hctx->lock); 1286 spin_unlock(&hctx->lock);
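
->commit_rqs() is the new blk_mq_ops hook the comment above refers to: when blk-mq has queued requests with bd->last cleared and then stops early, the driver gets one call to kick the hardware for whatever it already accepted. Very roughly, a driver-side implementation might look like the sketch below; the demo_* queue layout and the doorbell write are made up rather than taken from any in-tree driver, and the hook would be wired up in the driver's struct blk_mq_ops alongside .queue_rq:

struct demo_queue {
	void __iomem *doorbell;
	u16 sq_tail;			/* tail advanced by the queue_rq path */
};

static void demo_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct demo_queue *dq = hctx->driver_data;

	/* publish everything queued so far, since no "last" request came */
	writel(dq->sq_tail, dq->doorbell);
}
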
@@ -1552,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1552 bool at_head) 1610 bool at_head)
1553{ 1611{
1554 struct blk_mq_ctx *ctx = rq->mq_ctx; 1612 struct blk_mq_ctx *ctx = rq->mq_ctx;
1613 enum hctx_type type = hctx->type;
1555 1614
1556 lockdep_assert_held(&ctx->lock); 1615 lockdep_assert_held(&ctx->lock);
1557 1616
1558 trace_block_rq_insert(hctx->queue, rq); 1617 trace_block_rq_insert(hctx->queue, rq);
1559 1618
1560 if (at_head) 1619 if (at_head)
1561 list_add(&rq->queuelist, &ctx->rq_list); 1620 list_add(&rq->queuelist, &ctx->rq_lists[type]);
1562 else 1621 else
1563 list_add_tail(&rq->queuelist, &ctx->rq_list); 1622 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
1564} 1623}
1565 1624
1566void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 1625void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1580,8 +1639,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1580 */ 1639 */
1581void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) 1640void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1582{ 1641{
1583 struct blk_mq_ctx *ctx = rq->mq_ctx; 1642 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1584 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1585 1643
1586 spin_lock(&hctx->lock); 1644 spin_lock(&hctx->lock);
1587 list_add_tail(&rq->queuelist, &hctx->dispatch); 1645 list_add_tail(&rq->queuelist, &hctx->dispatch);
@@ -1596,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1596 1654
1597{ 1655{
1598 struct request *rq; 1656 struct request *rq;
1657 enum hctx_type type = hctx->type;
1599 1658
1600 /* 1659 /*
1601 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1660 * preemption doesn't flush plug list, so it's possible ctx->cpu is
@@ -1607,35 +1666,46 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1607 } 1666 }
1608 1667
1609 spin_lock(&ctx->lock); 1668 spin_lock(&ctx->lock);
1610 list_splice_tail_init(list, &ctx->rq_list); 1669 list_splice_tail_init(list, &ctx->rq_lists[type]);
1611 blk_mq_hctx_mark_pending(hctx, ctx); 1670 blk_mq_hctx_mark_pending(hctx, ctx);
1612 spin_unlock(&ctx->lock); 1671 spin_unlock(&ctx->lock);
1613} 1672}
1614 1673
1615static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1674static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
1616{ 1675{
1617 struct request *rqa = container_of(a, struct request, queuelist); 1676 struct request *rqa = container_of(a, struct request, queuelist);
1618 struct request *rqb = container_of(b, struct request, queuelist); 1677 struct request *rqb = container_of(b, struct request, queuelist);
1619 1678
1620 return !(rqa->mq_ctx < rqb->mq_ctx || 1679 if (rqa->mq_ctx < rqb->mq_ctx)
1621 (rqa->mq_ctx == rqb->mq_ctx && 1680 return -1;
1622 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 1681 else if (rqa->mq_ctx > rqb->mq_ctx)
1682 return 1;
1683 else if (rqa->mq_hctx < rqb->mq_hctx)
1684 return -1;
1685 else if (rqa->mq_hctx > rqb->mq_hctx)
1686 return 1;
1687
1688 return blk_rq_pos(rqa) > blk_rq_pos(rqb);
1623} 1689}
1624 1690
1625void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1691void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1626{ 1692{
1693 struct blk_mq_hw_ctx *this_hctx;
1627 struct blk_mq_ctx *this_ctx; 1694 struct blk_mq_ctx *this_ctx;
1628 struct request_queue *this_q; 1695 struct request_queue *this_q;
1629 struct request *rq; 1696 struct request *rq;
1630 LIST_HEAD(list); 1697 LIST_HEAD(list);
1631 LIST_HEAD(ctx_list); 1698 LIST_HEAD(rq_list);
1632 unsigned int depth; 1699 unsigned int depth;
1633 1700
1634 list_splice_init(&plug->mq_list, &list); 1701 list_splice_init(&plug->mq_list, &list);
1702 plug->rq_count = 0;
1635 1703
1636 list_sort(NULL, &list, plug_ctx_cmp); 1704 if (plug->rq_count > 2 && plug->multiple_queues)
1705 list_sort(NULL, &list, plug_rq_cmp);
1637 1706
1638 this_q = NULL; 1707 this_q = NULL;
1708 this_hctx = NULL;
1639 this_ctx = NULL; 1709 this_ctx = NULL;
1640 depth = 0; 1710 depth = 0;
1641 1711
@@ -1643,30 +1713,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1643 rq = list_entry_rq(list.next); 1713 rq = list_entry_rq(list.next);
1644 list_del_init(&rq->queuelist); 1714 list_del_init(&rq->queuelist);
1645 BUG_ON(!rq->q); 1715 BUG_ON(!rq->q);
1646 if (rq->mq_ctx != this_ctx) { 1716 if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
1647 if (this_ctx) { 1717 if (this_hctx) {
1648 trace_block_unplug(this_q, depth, !from_schedule); 1718 trace_block_unplug(this_q, depth, !from_schedule);
1649 blk_mq_sched_insert_requests(this_q, this_ctx, 1719 blk_mq_sched_insert_requests(this_hctx, this_ctx,
1650 &ctx_list, 1720 &rq_list,
1651 from_schedule); 1721 from_schedule);
1652 } 1722 }
1653 1723
1654 this_ctx = rq->mq_ctx;
1655 this_q = rq->q; 1724 this_q = rq->q;
1725 this_ctx = rq->mq_ctx;
1726 this_hctx = rq->mq_hctx;
1656 depth = 0; 1727 depth = 0;
1657 } 1728 }
1658 1729
1659 depth++; 1730 depth++;
1660 list_add_tail(&rq->queuelist, &ctx_list); 1731 list_add_tail(&rq->queuelist, &rq_list);
1661 } 1732 }
1662 1733
1663 /* 1734 /*
1664 * If 'this_ctx' is set, we know we have entries to complete 1735 * If 'this_hctx' is set, we know we have entries to complete
1665 * on 'ctx_list'. Do those. 1736 * on 'rq_list'. Do those.
1666 */ 1737 */
1667 if (this_ctx) { 1738 if (this_hctx) {
1668 trace_block_unplug(this_q, depth, !from_schedule); 1739 trace_block_unplug(this_q, depth, !from_schedule);
1669 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, 1740 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
1670 from_schedule); 1741 from_schedule);
1671 } 1742 }
1672} 1743}
@@ -1675,27 +1746,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1675{ 1746{
1676 blk_init_request_from_bio(rq, bio); 1747 blk_init_request_from_bio(rq, bio);
1677 1748
1678 blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1679
1680 blk_account_io_start(rq, true); 1749 blk_account_io_start(rq, true);
1681} 1750}
1682 1751
1683static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1684{
1685 if (rq->tag != -1)
1686 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1687
1688 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1689}
1690
1691static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 1752static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1692 struct request *rq, 1753 struct request *rq,
1693 blk_qc_t *cookie) 1754 blk_qc_t *cookie, bool last)
1694{ 1755{
1695 struct request_queue *q = rq->q; 1756 struct request_queue *q = rq->q;
1696 struct blk_mq_queue_data bd = { 1757 struct blk_mq_queue_data bd = {
1697 .rq = rq, 1758 .rq = rq,
1698 .last = true, 1759 .last = last,
1699 }; 1760 };
1700 blk_qc_t new_cookie; 1761 blk_qc_t new_cookie;
1701 blk_status_t ret; 1762 blk_status_t ret;
@@ -1727,77 +1788,74 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1727 return ret; 1788 return ret;
1728} 1789}
1729 1790
1730static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1791blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1731 struct request *rq, 1792 struct request *rq,
1732 blk_qc_t *cookie, 1793 blk_qc_t *cookie,
1733 bool bypass_insert) 1794 bool bypass, bool last)
1734{ 1795{
1735 struct request_queue *q = rq->q; 1796 struct request_queue *q = rq->q;
1736 bool run_queue = true; 1797 bool run_queue = true;
1798 blk_status_t ret = BLK_STS_RESOURCE;
1799 int srcu_idx;
1800 bool force = false;
1737 1801
1802 hctx_lock(hctx, &srcu_idx);
1738 /* 1803 /*
1739 * RCU or SRCU read lock is needed before checking quiesced flag. 1804 * hctx_lock is needed before checking quiesced flag.
1740 * 1805 *
1741 * When queue is stopped or quiesced, ignore 'bypass_insert' from 1806 * When queue is stopped or quiesced, ignore 'bypass', insert
1742 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, 1807 * and return BLK_STS_OK to caller, and avoid driver to try to
1743 * and avoid driver to try to dispatch again. 1808 * dispatch again.
1744 */ 1809 */
1745 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { 1810 if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) {
1746 run_queue = false; 1811 run_queue = false;
1747 bypass_insert = false; 1812 bypass = false;
1748 goto insert; 1813 goto out_unlock;
1749 } 1814 }
1750 1815
1751 if (q->elevator && !bypass_insert) 1816 if (unlikely(q->elevator && !bypass))
1752 goto insert; 1817 goto out_unlock;
1753 1818
1754 if (!blk_mq_get_dispatch_budget(hctx)) 1819 if (!blk_mq_get_dispatch_budget(hctx))
1755 goto insert; 1820 goto out_unlock;
1756 1821
1757 if (!blk_mq_get_driver_tag(rq)) { 1822 if (!blk_mq_get_driver_tag(rq)) {
1758 blk_mq_put_dispatch_budget(hctx); 1823 blk_mq_put_dispatch_budget(hctx);
1759 goto insert; 1824 goto out_unlock;
1760 } 1825 }
1761 1826
1762 return __blk_mq_issue_directly(hctx, rq, cookie); 1827 /*
1763insert: 1828 * Always add a request that has been through
1764 if (bypass_insert) 1829 *.queue_rq() to the hardware dispatch list.
1765 return BLK_STS_RESOURCE; 1830 */
1766 1831 force = true;
1767 blk_mq_request_bypass_insert(rq, run_queue); 1832 ret = __blk_mq_issue_directly(hctx, rq, cookie, last);
1768 return BLK_STS_OK; 1833out_unlock:
1769}
1770
1771static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1772 struct request *rq, blk_qc_t *cookie)
1773{
1774 blk_status_t ret;
1775 int srcu_idx;
1776
1777 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1778
1779 hctx_lock(hctx, &srcu_idx);
1780
1781 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
1782 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
1783 blk_mq_request_bypass_insert(rq, true);
1784 else if (ret != BLK_STS_OK)
1785 blk_mq_end_request(rq, ret);
1786
1787 hctx_unlock(hctx, srcu_idx);
1788}
1789
1790blk_status_t blk_mq_request_issue_directly(struct request *rq)
1791{
1792 blk_status_t ret;
1793 int srcu_idx;
1794 blk_qc_t unused_cookie;
1795 struct blk_mq_ctx *ctx = rq->mq_ctx;
1796 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1797
1798 hctx_lock(hctx, &srcu_idx);
1799 ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
1800 hctx_unlock(hctx, srcu_idx); 1834 hctx_unlock(hctx, srcu_idx);
1835 switch (ret) {
1836 case BLK_STS_OK:
1837 break;
1838 case BLK_STS_DEV_RESOURCE:
1839 case BLK_STS_RESOURCE:
1840 if (force) {
1841 blk_mq_request_bypass_insert(rq, run_queue);
1842 /*
1843 * We have to return BLK_STS_OK for the DM
1844 * to avoid livelock. Otherwise, we return
1845 * the real result to indicate whether the
1846 * request is direct-issued successfully.
1847 */
1848 ret = bypass ? BLK_STS_OK : ret;
1849 } else if (!bypass) {
1850 blk_mq_sched_insert_request(rq, false,
1851 run_queue, false);
1852 }
1853 break;
1854 default:
1855 if (!bypass)
1856 blk_mq_end_request(rq, ret);
1857 break;
1858 }
1801 1859
1802 return ret; 1860 return ret;
1803} 1861}
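
The rework above folds the old __blk_mq_try_issue_directly()/blk_mq_try_issue_directly() pair into one helper taking 'bypass' and 'last' arguments. A minimal sketch of the two caller styles this serves; the function names here are made up, and the real non-bypass call sites are blk_mq_make_request() and blk_mq_try_issue_list_directly() further down in this diff.

#include <linux/blk-mq.h>

/*
 * Normal submission (bypass == false): on a resource shortage the helper
 * re-inserts the request itself and other errors are completed inside it,
 * so the caller can ignore the return value.
 */
static void example_issue_normal(struct blk_mq_hw_ctx *hctx,
				 struct request *rq, blk_qc_t *cookie)
{
	blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
}

/*
 * Bypass style (request-based stacking drivers): the helper reports the
 * outcome rather than completing the request, and a caller such as dm-rq
 * is expected to requeue when it sees BLK_STS_RESOURCE.
 */
static blk_status_t example_issue_bypass(struct blk_mq_hw_ctx *hctx,
					 struct request *rq)
{
	blk_qc_t unused;

	return blk_mq_try_issue_directly(hctx, rq, &unused, true, true);
}
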
@@ -1805,22 +1863,42 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
1805void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 1863void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
1806 struct list_head *list) 1864 struct list_head *list)
1807{ 1865{
1866 blk_qc_t unused;
1867 blk_status_t ret = BLK_STS_OK;
1868
1808 while (!list_empty(list)) { 1869 while (!list_empty(list)) {
1809 blk_status_t ret;
1810 struct request *rq = list_first_entry(list, struct request, 1870 struct request *rq = list_first_entry(list, struct request,
1811 queuelist); 1871 queuelist);
1812 1872
1813 list_del_init(&rq->queuelist); 1873 list_del_init(&rq->queuelist);
1814 ret = blk_mq_request_issue_directly(rq); 1874 if (ret == BLK_STS_OK)
1815 if (ret != BLK_STS_OK) { 1875 ret = blk_mq_try_issue_directly(hctx, rq, &unused,
1816 if (ret == BLK_STS_RESOURCE || 1876 false,
1817 ret == BLK_STS_DEV_RESOURCE) {
1818 blk_mq_request_bypass_insert(rq,
1819 list_empty(list)); 1877 list_empty(list));
1820 break; 1878 else
1821 } 1879 blk_mq_sched_insert_request(rq, false, true, false);
1822 blk_mq_end_request(rq, ret); 1880 }
1823 } 1881
1882 /*
1883 * If we didn't flush the entire list, we could have told
1884 * the driver there was more coming, but that turned out to
1885 * be a lie.
1886 */
1887 if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs)
1888 hctx->queue->mq_ops->commit_rqs(hctx);
1889}
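
The ->commit_rqs() hook used just above is new in this series: struct blk_mq_ops grows void (*commit_rqs)(struct blk_mq_hw_ctx *) so the core can tell a driver that a stream it had advertised with bd->last == false stopped early. A hedged sketch of the driver side; every "exdrv" name and the doorbell layout are invented.

#include <linux/blk-mq.h>
#include <linux/io.h>

/* Hypothetical per-hw-queue driver state. */
struct exdrv_queue {
	void __iomem *doorbell;
	u16 sq_tail;
};

/* Made-up helper: tell the device that new entries are in the ring. */
static void exdrv_ring_doorbell(struct exdrv_queue *eq)
{
	writel(eq->sq_tail, eq->doorbell);
}

/*
 * ->queue_rq() would only advance sq_tail while bd->last is false,
 * deferring the doorbell write; ->commit_rqs() flushes that deferred
 * write when the core ends a batch earlier than promised.
 */
static void exdrv_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	exdrv_ring_doorbell(hctx->driver_data);
}
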
1890
1891static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
1892{
1893 list_add_tail(&rq->queuelist, &plug->mq_list);
1894 plug->rq_count++;
1895 if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
1896 struct request *tmp;
1897
1898 tmp = list_first_entry(&plug->mq_list, struct request,
1899 queuelist);
1900 if (tmp->q != rq->q)
1901 plug->multiple_queues = true;
1824 } 1902 }
1825} 1903}
1826 1904
@@ -1828,9 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1828{ 1906{
1829 const int is_sync = op_is_sync(bio->bi_opf); 1907 const int is_sync = op_is_sync(bio->bi_opf);
1830 const int is_flush_fua = op_is_flush(bio->bi_opf); 1908 const int is_flush_fua = op_is_flush(bio->bi_opf);
1831 struct blk_mq_alloc_data data = { .flags = 0 }; 1909 struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf };
1832 struct request *rq; 1910 struct request *rq;
1833 unsigned int request_count = 0;
1834 struct blk_plug *plug; 1911 struct blk_plug *plug;
1835 struct request *same_queue_rq = NULL; 1912 struct request *same_queue_rq = NULL;
1836 blk_qc_t cookie; 1913 blk_qc_t cookie;
@@ -1843,15 +1920,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1843 return BLK_QC_T_NONE; 1920 return BLK_QC_T_NONE;
1844 1921
1845 if (!is_flush_fua && !blk_queue_nomerges(q) && 1922 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1846 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) 1923 blk_attempt_plug_merge(q, bio, &same_queue_rq))
1847 return BLK_QC_T_NONE; 1924 return BLK_QC_T_NONE;
1848 1925
1849 if (blk_mq_sched_bio_merge(q, bio)) 1926 if (blk_mq_sched_bio_merge(q, bio))
1850 return BLK_QC_T_NONE; 1927 return BLK_QC_T_NONE;
1851 1928
1852 rq_qos_throttle(q, bio, NULL); 1929 rq_qos_throttle(q, bio);
1853 1930
1854 rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); 1931 rq = blk_mq_get_request(q, bio, &data);
1855 if (unlikely(!rq)) { 1932 if (unlikely(!rq)) {
1856 rq_qos_cleanup(q, bio); 1933 rq_qos_cleanup(q, bio);
1857 if (bio->bi_opf & REQ_NOWAIT) 1934 if (bio->bi_opf & REQ_NOWAIT)
@@ -1873,21 +1950,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1873 /* bypass scheduler for flush rq */ 1950 /* bypass scheduler for flush rq */
1874 blk_insert_flush(rq); 1951 blk_insert_flush(rq);
1875 blk_mq_run_hw_queue(data.hctx, true); 1952 blk_mq_run_hw_queue(data.hctx, true);
1876 } else if (plug && q->nr_hw_queues == 1) { 1953 } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
1954 /*
1955 * Use plugging if we have a ->commit_rqs() hook as well, as
1956 * we know the driver uses bd->last in a smart fashion.
1957 */
1958 unsigned int request_count = plug->rq_count;
1877 struct request *last = NULL; 1959 struct request *last = NULL;
1878 1960
1879 blk_mq_put_ctx(data.ctx); 1961 blk_mq_put_ctx(data.ctx);
1880 blk_mq_bio_to_request(rq, bio); 1962 blk_mq_bio_to_request(rq, bio);
1881 1963
1882 /*
1883 * @request_count may become stale because of schedule
1884 * out, so check the list again.
1885 */
1886 if (list_empty(&plug->mq_list))
1887 request_count = 0;
1888 else if (blk_queue_nomerges(q))
1889 request_count = blk_plug_queued_count(q);
1890
1891 if (!request_count) 1964 if (!request_count)
1892 trace_block_plug(q); 1965 trace_block_plug(q);
1893 else 1966 else
@@ -1899,7 +1972,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1899 trace_block_plug(q); 1972 trace_block_plug(q);
1900 } 1973 }
1901 1974
1902 list_add_tail(&rq->queuelist, &plug->mq_list); 1975 blk_add_rq_to_plug(plug, rq);
1903 } else if (plug && !blk_queue_nomerges(q)) { 1976 } else if (plug && !blk_queue_nomerges(q)) {
1904 blk_mq_bio_to_request(rq, bio); 1977 blk_mq_bio_to_request(rq, bio);
1905 1978
@@ -1912,23 +1985,24 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1912 */ 1985 */
1913 if (list_empty(&plug->mq_list)) 1986 if (list_empty(&plug->mq_list))
1914 same_queue_rq = NULL; 1987 same_queue_rq = NULL;
1915 if (same_queue_rq) 1988 if (same_queue_rq) {
1916 list_del_init(&same_queue_rq->queuelist); 1989 list_del_init(&same_queue_rq->queuelist);
1917 list_add_tail(&rq->queuelist, &plug->mq_list); 1990 plug->rq_count--;
1991 }
1992 blk_add_rq_to_plug(plug, rq);
1918 1993
1919 blk_mq_put_ctx(data.ctx); 1994 blk_mq_put_ctx(data.ctx);
1920 1995
1921 if (same_queue_rq) { 1996 if (same_queue_rq) {
1922 data.hctx = blk_mq_map_queue(q, 1997 data.hctx = same_queue_rq->mq_hctx;
1923 same_queue_rq->mq_ctx->cpu);
1924 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 1998 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
1925 &cookie); 1999 &cookie, false, true);
1926 } 2000 }
1927 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && 2001 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1928 !data.hctx->dispatch_busy)) { 2002 !data.hctx->dispatch_busy)) {
1929 blk_mq_put_ctx(data.ctx); 2003 blk_mq_put_ctx(data.ctx);
1930 blk_mq_bio_to_request(rq, bio); 2004 blk_mq_bio_to_request(rq, bio);
1931 blk_mq_try_issue_directly(data.hctx, rq, &cookie); 2005 blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true);
1932 } else { 2006 } else {
1933 blk_mq_put_ctx(data.ctx); 2007 blk_mq_put_ctx(data.ctx);
1934 blk_mq_bio_to_request(rq, bio); 2008 blk_mq_bio_to_request(rq, bio);
@@ -1986,7 +2060,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1986 struct blk_mq_tags *tags; 2060 struct blk_mq_tags *tags;
1987 int node; 2061 int node;
1988 2062
1989 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 2063 node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
1990 if (node == NUMA_NO_NODE) 2064 if (node == NUMA_NO_NODE)
1991 node = set->numa_node; 2065 node = set->numa_node;
1992 2066
@@ -2042,7 +2116,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2042 size_t rq_size, left; 2116 size_t rq_size, left;
2043 int node; 2117 int node;
2044 2118
2045 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 2119 node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
2046 if (node == NUMA_NO_NODE) 2120 if (node == NUMA_NO_NODE)
2047 node = set->numa_node; 2121 node = set->numa_node;
2048 2122
@@ -2122,13 +2196,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
2122 struct blk_mq_hw_ctx *hctx; 2196 struct blk_mq_hw_ctx *hctx;
2123 struct blk_mq_ctx *ctx; 2197 struct blk_mq_ctx *ctx;
2124 LIST_HEAD(tmp); 2198 LIST_HEAD(tmp);
2199 enum hctx_type type;
2125 2200
2126 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 2201 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2127 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 2202 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2203 type = hctx->type;
2128 2204
2129 spin_lock(&ctx->lock); 2205 spin_lock(&ctx->lock);
2130 if (!list_empty(&ctx->rq_list)) { 2206 if (!list_empty(&ctx->rq_lists[type])) {
2131 list_splice_init(&ctx->rq_list, &tmp); 2207 list_splice_init(&ctx->rq_lists[type], &tmp);
2132 blk_mq_hctx_clear_pending(hctx, ctx); 2208 blk_mq_hctx_clear_pending(hctx, ctx);
2133 } 2209 }
2134 spin_unlock(&ctx->lock); 2210 spin_unlock(&ctx->lock);
@@ -2259,24 +2335,30 @@ static int blk_mq_init_hctx(struct request_queue *q,
2259static void blk_mq_init_cpu_queues(struct request_queue *q, 2335static void blk_mq_init_cpu_queues(struct request_queue *q,
2260 unsigned int nr_hw_queues) 2336 unsigned int nr_hw_queues)
2261{ 2337{
2262 unsigned int i; 2338 struct blk_mq_tag_set *set = q->tag_set;
2339 unsigned int i, j;
2263 2340
2264 for_each_possible_cpu(i) { 2341 for_each_possible_cpu(i) {
2265 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 2342 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2266 struct blk_mq_hw_ctx *hctx; 2343 struct blk_mq_hw_ctx *hctx;
2344 int k;
2267 2345
2268 __ctx->cpu = i; 2346 __ctx->cpu = i;
2269 spin_lock_init(&__ctx->lock); 2347 spin_lock_init(&__ctx->lock);
2270 INIT_LIST_HEAD(&__ctx->rq_list); 2348 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2349 INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2350
2271 __ctx->queue = q; 2351 __ctx->queue = q;
2272 2352
2273 /* 2353 /*
2274 * Set local node, IFF we have more than one hw queue. If 2354 * Set local node, IFF we have more than one hw queue. If
2275 * not, we remain on the home node of the device 2355 * not, we remain on the home node of the device
2276 */ 2356 */
2277 hctx = blk_mq_map_queue(q, i); 2357 for (j = 0; j < set->nr_maps; j++) {
2278 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 2358 hctx = blk_mq_map_queue_type(q, j, i);
2279 hctx->numa_node = local_memory_node(cpu_to_node(i)); 2359 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2360 hctx->numa_node = local_memory_node(cpu_to_node(i));
2361 }
2280 } 2362 }
2281} 2363}
2282 2364
@@ -2302,7 +2384,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2302static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, 2384static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2303 unsigned int hctx_idx) 2385 unsigned int hctx_idx)
2304{ 2386{
2305 if (set->tags[hctx_idx]) { 2387 if (set->tags && set->tags[hctx_idx]) {
2306 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); 2388 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2307 blk_mq_free_rq_map(set->tags[hctx_idx]); 2389 blk_mq_free_rq_map(set->tags[hctx_idx]);
2308 set->tags[hctx_idx] = NULL; 2390 set->tags[hctx_idx] = NULL;
@@ -2311,7 +2393,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2311 2393
2312static void blk_mq_map_swqueue(struct request_queue *q) 2394static void blk_mq_map_swqueue(struct request_queue *q)
2313{ 2395{
2314 unsigned int i, hctx_idx; 2396 unsigned int i, j, hctx_idx;
2315 struct blk_mq_hw_ctx *hctx; 2397 struct blk_mq_hw_ctx *hctx;
2316 struct blk_mq_ctx *ctx; 2398 struct blk_mq_ctx *ctx;
2317 struct blk_mq_tag_set *set = q->tag_set; 2399 struct blk_mq_tag_set *set = q->tag_set;
@@ -2333,7 +2415,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2333 * If the cpu isn't present, the cpu is mapped to first hctx. 2415 * If the cpu isn't present, the cpu is mapped to first hctx.
2334 */ 2416 */
2335 for_each_possible_cpu(i) { 2417 for_each_possible_cpu(i) {
2336 hctx_idx = q->mq_map[i]; 2418 hctx_idx = set->map[0].mq_map[i];
2337 /* unmapped hw queue can be remapped after CPU topo changed */ 2419 /* unmapped hw queue can be remapped after CPU topo changed */
2338 if (!set->tags[hctx_idx] && 2420 if (!set->tags[hctx_idx] &&
2339 !__blk_mq_alloc_rq_map(set, hctx_idx)) { 2421 !__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2343,15 +2425,35 @@ static void blk_mq_map_swqueue(struct request_queue *q)
2343 * case, remap the current ctx to hctx[0] which 2425 * case, remap the current ctx to hctx[0] which
2344 * is guaranteed to always have tags allocated 2426 * is guaranteed to always have tags allocated
2345 */ 2427 */
2346 q->mq_map[i] = 0; 2428 set->map[0].mq_map[i] = 0;
2347 } 2429 }
2348 2430
2349 ctx = per_cpu_ptr(q->queue_ctx, i); 2431 ctx = per_cpu_ptr(q->queue_ctx, i);
2350 hctx = blk_mq_map_queue(q, i); 2432 for (j = 0; j < set->nr_maps; j++) {
2433 if (!set->map[j].nr_queues)
2434 continue;
2435
2436 hctx = blk_mq_map_queue_type(q, j, i);
2437
2438 /*
2439 * If the CPU is already set in the mask, then we've
2440 * mapped this one already. This can happen if
2441 * devices share queues across queue maps.
2442 */
2443 if (cpumask_test_cpu(i, hctx->cpumask))
2444 continue;
2445
2446 cpumask_set_cpu(i, hctx->cpumask);
2447 hctx->type = j;
2448 ctx->index_hw[hctx->type] = hctx->nr_ctx;
2449 hctx->ctxs[hctx->nr_ctx++] = ctx;
2351 2450
2352 cpumask_set_cpu(i, hctx->cpumask); 2451 /*
2353 ctx->index_hw = hctx->nr_ctx; 2452 * If the nr_ctx type overflows, we have exceeded the
2354 hctx->ctxs[hctx->nr_ctx++] = ctx; 2453 * amount of sw queues we can support.
2454 */
2455 BUG_ON(!hctx->nr_ctx);
2456 }
2355 } 2457 }
2356 2458
2357 mutex_unlock(&q->sysfs_lock); 2459 mutex_unlock(&q->sysfs_lock);
@@ -2441,8 +2543,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
2441static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 2543static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2442 struct request_queue *q) 2544 struct request_queue *q)
2443{ 2545{
2444 q->tag_set = set;
2445
2446 mutex_lock(&set->tag_list_lock); 2546 mutex_lock(&set->tag_list_lock);
2447 2547
2448 /* 2548 /*
@@ -2461,6 +2561,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2461 mutex_unlock(&set->tag_list_lock); 2561 mutex_unlock(&set->tag_list_lock);
2462} 2562}
2463 2563
2564/* All allocations will be freed in release handler of q->mq_kobj */
2565static int blk_mq_alloc_ctxs(struct request_queue *q)
2566{
2567 struct blk_mq_ctxs *ctxs;
2568 int cpu;
2569
2570 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
2571 if (!ctxs)
2572 return -ENOMEM;
2573
2574 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2575 if (!ctxs->queue_ctx)
2576 goto fail;
2577
2578 for_each_possible_cpu(cpu) {
2579 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
2580 ctx->ctxs = ctxs;
2581 }
2582
2583 q->mq_kobj = &ctxs->kobj;
2584 q->queue_ctx = ctxs->queue_ctx;
2585
2586 return 0;
2587 fail:
2588 kfree(ctxs);
2589 return -ENOMEM;
2590}
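
blk_mq_alloc_ctxs() moves the per-CPU software queues onto their own kobject instead of embedding them in the request_queue, which is what the "freed in release handler of q->mq_kobj" comment relies on: the memory now stays around until the last sysfs reference drops. The matching release side lives in blk-mq-sysfs.c (not shown in this hunk); as a rough sketch:

#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include "blk-mq.h"

static void example_mq_kobj_release(struct kobject *kobj)
{
	struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj);

	/* Undo what blk_mq_alloc_ctxs() above allocated. */
	free_percpu(ctxs->queue_ctx);
	kfree(ctxs);
}
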
2591
2464/* 2592/*
2465 * It is the actual release handler for mq, but we do it from 2593 * It is the actual release handler for mq, but we do it from
2466 * request queue's release handler for avoiding use-after-free 2594 * request queue's release handler for avoiding use-after-free
@@ -2479,8 +2607,6 @@ void blk_mq_release(struct request_queue *q)
2479 kobject_put(&hctx->kobj); 2607 kobject_put(&hctx->kobj);
2480 } 2608 }
2481 2609
2482 q->mq_map = NULL;
2483
2484 kfree(q->queue_hw_ctx); 2610 kfree(q->queue_hw_ctx);
2485 2611
2486 /* 2612 /*
@@ -2488,15 +2614,13 @@ void blk_mq_release(struct request_queue *q)
2488 * both share lifetime with request queue. 2614 * both share lifetime with request queue.
2489 */ 2615 */
2490 blk_mq_sysfs_deinit(q); 2616 blk_mq_sysfs_deinit(q);
2491
2492 free_percpu(q->queue_ctx);
2493} 2617}
2494 2618
2495struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 2619struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2496{ 2620{
2497 struct request_queue *uninit_q, *q; 2621 struct request_queue *uninit_q, *q;
2498 2622
2499 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); 2623 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2500 if (!uninit_q) 2624 if (!uninit_q)
2501 return ERR_PTR(-ENOMEM); 2625 return ERR_PTR(-ENOMEM);
2502 2626
@@ -2523,6 +2647,7 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
2523 memset(set, 0, sizeof(*set)); 2647 memset(set, 0, sizeof(*set));
2524 set->ops = ops; 2648 set->ops = ops;
2525 set->nr_hw_queues = 1; 2649 set->nr_hw_queues = 1;
2650 set->nr_maps = 1;
2526 set->queue_depth = queue_depth; 2651 set->queue_depth = queue_depth;
2527 set->numa_node = NUMA_NO_NODE; 2652 set->numa_node = NUMA_NO_NODE;
2528 set->flags = set_flags; 2653 set->flags = set_flags;
@@ -2600,7 +2725,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2600 int node; 2725 int node;
2601 struct blk_mq_hw_ctx *hctx; 2726 struct blk_mq_hw_ctx *hctx;
2602 2727
2603 node = blk_mq_hw_queue_to_node(q->mq_map, i); 2728 node = blk_mq_hw_queue_to_node(&set->map[0], i);
2604 /* 2729 /*
2605 * If the hw queue has been mapped to another numa node, 2730 * If the hw queue has been mapped to another numa node,
2606 * we need to realloc the hctx. If allocation fails, fallback 2731 * we need to realloc the hctx. If allocation fails, fallback
@@ -2653,6 +2778,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2653 mutex_unlock(&q->sysfs_lock); 2778 mutex_unlock(&q->sysfs_lock);
2654} 2779}
2655 2780
2781/*
2782 * Maximum number of hardware queues we support. For single sets, we'll never
2783 * have more than the CPUs (software queues). For multiple sets, the tag_set
2784 * user may have set ->nr_hw_queues larger.
2785 */
2786static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
2787{
2788 if (set->nr_maps == 1)
2789 return nr_cpu_ids;
2790
2791 return max(set->nr_hw_queues, nr_cpu_ids);
2792}
2793
2656struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 2794struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2657 struct request_queue *q) 2795 struct request_queue *q)
2658{ 2796{
@@ -2665,19 +2803,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2665 if (!q->poll_cb) 2803 if (!q->poll_cb)
2666 goto err_exit; 2804 goto err_exit;
2667 2805
2668 q->queue_ctx = alloc_percpu(struct blk_mq_ctx); 2806 if (blk_mq_alloc_ctxs(q))
2669 if (!q->queue_ctx)
2670 goto err_exit; 2807 goto err_exit;
2671 2808
2672 /* init q->mq_kobj and sw queues' kobjects */ 2809 /* init q->mq_kobj and sw queues' kobjects */
2673 blk_mq_sysfs_init(q); 2810 blk_mq_sysfs_init(q);
2674 2811
2675 q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), 2812 q->nr_queues = nr_hw_queues(set);
2813 q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
2676 GFP_KERNEL, set->numa_node); 2814 GFP_KERNEL, set->numa_node);
2677 if (!q->queue_hw_ctx) 2815 if (!q->queue_hw_ctx)
2678 goto err_percpu; 2816 goto err_sys_init;
2679
2680 q->mq_map = set->mq_map;
2681 2817
2682 blk_mq_realloc_hw_ctxs(set, q); 2818 blk_mq_realloc_hw_ctxs(set, q);
2683 if (!q->nr_hw_queues) 2819 if (!q->nr_hw_queues)
@@ -2686,12 +2822,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2686 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 2822 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2687 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); 2823 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2688 2824
2689 q->nr_queues = nr_cpu_ids; 2825 q->tag_set = set;
2690 2826
2691 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 2827 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2828 if (set->nr_maps > HCTX_TYPE_POLL &&
2829 set->map[HCTX_TYPE_POLL].nr_queues)
2830 blk_queue_flag_set(QUEUE_FLAG_POLL, q);
2692 2831
2693 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 2832 if (!(set->flags & BLK_MQ_F_SG_MERGE))
2694 queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); 2833 blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
2695 2834
2696 q->sg_reserved_size = INT_MAX; 2835 q->sg_reserved_size = INT_MAX;
2697 2836
@@ -2700,8 +2839,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2700 spin_lock_init(&q->requeue_lock); 2839 spin_lock_init(&q->requeue_lock);
2701 2840
2702 blk_queue_make_request(q, blk_mq_make_request); 2841 blk_queue_make_request(q, blk_mq_make_request);
2703 if (q->mq_ops->poll)
2704 q->poll_fn = blk_mq_poll;
2705 2842
2706 /* 2843 /*
2707 * Do this after blk_queue_make_request() overrides it... 2844 * Do this after blk_queue_make_request() overrides it...
@@ -2713,9 +2850,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2713 */ 2850 */
2714 q->poll_nsec = -1; 2851 q->poll_nsec = -1;
2715 2852
2716 if (set->ops->complete)
2717 blk_queue_softirq_done(q, set->ops->complete);
2718
2719 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 2853 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2720 blk_mq_add_queue_tag_set(set, q); 2854 blk_mq_add_queue_tag_set(set, q);
2721 blk_mq_map_swqueue(q); 2855 blk_mq_map_swqueue(q);
@@ -2732,8 +2866,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2732 2866
2733err_hctxs: 2867err_hctxs:
2734 kfree(q->queue_hw_ctx); 2868 kfree(q->queue_hw_ctx);
2735err_percpu: 2869err_sys_init:
2736 free_percpu(q->queue_ctx); 2870 blk_mq_sysfs_deinit(q);
2737err_exit: 2871err_exit:
2738 q->mq_ops = NULL; 2872 q->mq_ops = NULL;
2739 return ERR_PTR(-ENOMEM); 2873 return ERR_PTR(-ENOMEM);
@@ -2802,7 +2936,9 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2802 2936
2803static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2937static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2804{ 2938{
2805 if (set->ops->map_queues) { 2939 if (set->ops->map_queues && !is_kdump_kernel()) {
2940 int i;
2941
2806 /* 2942 /*
2807 * transport .map_queues is usually done in the following 2943 * transport .map_queues is usually done in the following
2808 * way: 2944 * way:
@@ -2810,18 +2946,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2810 * for (queue = 0; queue < set->nr_hw_queues; queue++) { 2946 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
2811 * mask = get_cpu_mask(queue) 2947 * mask = get_cpu_mask(queue)
2812 * for_each_cpu(cpu, mask) 2948 * for_each_cpu(cpu, mask)
2813 * set->mq_map[cpu] = queue; 2949 * set->map[x].mq_map[cpu] = queue;
2814 * } 2950 * }
2815 * 2951 *
2816 * When we need to remap, the table has to be cleared for 2952 * When we need to remap, the table has to be cleared for
2817 * killing stale mapping since one CPU may not be mapped 2953 * killing stale mapping since one CPU may not be mapped
2818 * to any hw queue. 2954 * to any hw queue.
2819 */ 2955 */
2820 blk_mq_clear_mq_map(set); 2956 for (i = 0; i < set->nr_maps; i++)
2957 blk_mq_clear_mq_map(&set->map[i]);
2821 2958
2822 return set->ops->map_queues(set); 2959 return set->ops->map_queues(set);
2823 } else 2960 } else {
2824 return blk_mq_map_queues(set); 2961 BUG_ON(set->nr_maps > 1);
2962 return blk_mq_map_queues(&set->map[0]);
2963 }
2825} 2964}
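
The comment above keeps its pseudocode for a transport ->map_queues(), now written against a specific map. A minimal sketch under the new struct blk_mq_queue_map layout; exdrv_queue_mask() is a made-up stand-in for however the transport learns which CPUs feed a hardware queue (a PCI driver would typically derive this from IRQ affinity instead).

#include <linux/blk-mq.h>
#include <linux/cpumask.h>

/* Invented affinity: pretend hardware queue i is fed by CPU i. */
static const struct cpumask *exdrv_queue_mask(struct blk_mq_tag_set *set,
					      unsigned int queue)
{
	return cpumask_of(queue);
}

static int exdrv_map_queues(struct blk_mq_tag_set *set)
{
	struct blk_mq_queue_map *qmap = &set->map[HCTX_TYPE_DEFAULT];
	unsigned int queue, cpu;

	for (queue = 0; queue < qmap->nr_queues; queue++) {
		for_each_cpu(cpu, exdrv_queue_mask(set, queue))
			qmap->mq_map[cpu] = queue;
	}

	return 0;
}
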
2826 2965
2827/* 2966/*
@@ -2832,7 +2971,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2832 */ 2971 */
2833int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2972int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2834{ 2973{
2835 int ret; 2974 int i, ret;
2836 2975
2837 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 2976 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2838 2977
@@ -2855,6 +2994,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2855 set->queue_depth = BLK_MQ_MAX_DEPTH; 2994 set->queue_depth = BLK_MQ_MAX_DEPTH;
2856 } 2995 }
2857 2996
2997 if (!set->nr_maps)
2998 set->nr_maps = 1;
2999 else if (set->nr_maps > HCTX_MAX_TYPES)
3000 return -EINVAL;
3001
2858 /* 3002 /*
2859 * If a crashdump is active, then we are potentially in a very 3003 * If a crashdump is active, then we are potentially in a very
2860 * memory constrained environment. Limit us to 1 queue and 3004 * memory constrained environment. Limit us to 1 queue and
@@ -2862,24 +3006,30 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2862 */ 3006 */
2863 if (is_kdump_kernel()) { 3007 if (is_kdump_kernel()) {
2864 set->nr_hw_queues = 1; 3008 set->nr_hw_queues = 1;
3009 set->nr_maps = 1;
2865 set->queue_depth = min(64U, set->queue_depth); 3010 set->queue_depth = min(64U, set->queue_depth);
2866 } 3011 }
2867 /* 3012 /*
2868 * There is no use for more h/w queues than cpus. 3013 * There is no use for more h/w queues than cpus if we just have
3014 * a single map
2869 */ 3015 */
2870 if (set->nr_hw_queues > nr_cpu_ids) 3016 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
2871 set->nr_hw_queues = nr_cpu_ids; 3017 set->nr_hw_queues = nr_cpu_ids;
2872 3018
2873 set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), 3019 set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),
2874 GFP_KERNEL, set->numa_node); 3020 GFP_KERNEL, set->numa_node);
2875 if (!set->tags) 3021 if (!set->tags)
2876 return -ENOMEM; 3022 return -ENOMEM;
2877 3023
2878 ret = -ENOMEM; 3024 ret = -ENOMEM;
2879 set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), 3025 for (i = 0; i < set->nr_maps; i++) {
2880 GFP_KERNEL, set->numa_node); 3026 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
2881 if (!set->mq_map) 3027 sizeof(set->map[i].mq_map[0]),
2882 goto out_free_tags; 3028 GFP_KERNEL, set->numa_node);
3029 if (!set->map[i].mq_map)
3030 goto out_free_mq_map;
3031 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
3032 }
2883 3033
2884 ret = blk_mq_update_queue_map(set); 3034 ret = blk_mq_update_queue_map(set);
2885 if (ret) 3035 if (ret)
@@ -2895,9 +3045,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2895 return 0; 3045 return 0;
2896 3046
2897out_free_mq_map: 3047out_free_mq_map:
2898 kfree(set->mq_map); 3048 for (i = 0; i < set->nr_maps; i++) {
2899 set->mq_map = NULL; 3049 kfree(set->map[i].mq_map);
2900out_free_tags: 3050 set->map[i].mq_map = NULL;
3051 }
2901 kfree(set->tags); 3052 kfree(set->tags);
2902 set->tags = NULL; 3053 set->tags = NULL;
2903 return ret; 3054 return ret;
@@ -2906,13 +3057,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2906 3057
2907void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 3058void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2908{ 3059{
2909 int i; 3060 int i, j;
2910 3061
2911 for (i = 0; i < nr_cpu_ids; i++) 3062 for (i = 0; i < nr_hw_queues(set); i++)
2912 blk_mq_free_map_and_requests(set, i); 3063 blk_mq_free_map_and_requests(set, i);
2913 3064
2914 kfree(set->mq_map); 3065 for (j = 0; j < set->nr_maps; j++) {
2915 set->mq_map = NULL; 3066 kfree(set->map[j].mq_map);
3067 set->map[j].mq_map = NULL;
3068 }
2916 3069
2917 kfree(set->tags); 3070 kfree(set->tags);
2918 set->tags = NULL; 3071 set->tags = NULL;
@@ -3038,7 +3191,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
3038 3191
3039 lockdep_assert_held(&set->tag_list_lock); 3192 lockdep_assert_held(&set->tag_list_lock);
3040 3193
3041 if (nr_hw_queues > nr_cpu_ids) 3194 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
3042 nr_hw_queues = nr_cpu_ids; 3195 nr_hw_queues = nr_cpu_ids;
3043 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) 3196 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
3044 return; 3197 return;
@@ -3073,7 +3226,7 @@ fallback:
3073 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 3226 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3074 nr_hw_queues, prev_nr_hw_queues); 3227 nr_hw_queues, prev_nr_hw_queues);
3075 set->nr_hw_queues = prev_nr_hw_queues; 3228 set->nr_hw_queues = prev_nr_hw_queues;
3076 blk_mq_map_queues(set); 3229 blk_mq_map_queues(&set->map[0]);
3077 goto fallback; 3230 goto fallback;
3078 } 3231 }
3079 blk_mq_map_swqueue(q); 3232 blk_mq_map_swqueue(q);
@@ -3180,15 +3333,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3180 return false; 3333 return false;
3181 3334
3182 /* 3335 /*
3183 * poll_nsec can be: 3336 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
3184 * 3337 *
3185 * -1: don't ever hybrid sleep
3186 * 0: use half of prev avg 3338 * 0: use half of prev avg
3187 * >0: use this specific value 3339 * >0: use this specific value
3188 */ 3340 */
3189 if (q->poll_nsec == -1) 3341 if (q->poll_nsec > 0)
3190 return false;
3191 else if (q->poll_nsec > 0)
3192 nsecs = q->poll_nsec; 3342 nsecs = q->poll_nsec;
3193 else 3343 else
3194 nsecs = blk_mq_poll_nsecs(q, hctx, rq); 3344 nsecs = blk_mq_poll_nsecs(q, hctx, rq);
@@ -3225,11 +3375,57 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3225 return true; 3375 return true;
3226} 3376}
3227 3377
3228static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) 3378static bool blk_mq_poll_hybrid(struct request_queue *q,
3379 struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
3229{ 3380{
3230 struct request_queue *q = hctx->queue; 3381 struct request *rq;
3382
3383 if (q->poll_nsec == -1)
3384 return false;
3385
3386 if (!blk_qc_t_is_internal(cookie))
3387 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3388 else {
3389 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3390 /*
3391 * With scheduling, if the request has completed, we'll
3392 * get a NULL return here, as we clear the sched tag when
3393 * that happens. The request still remains valid, like always,
3394 * so we should be safe with just the NULL check.
3395 */
3396 if (!rq)
3397 return false;
3398 }
3399
3400 return blk_mq_poll_hybrid_sleep(q, hctx, rq);
3401}
3402
3403/**
3404 * blk_poll - poll for IO completions
3405 * @q: the queue
3406 * @cookie: cookie passed back at IO submission time
3407 * @spin: whether to spin for completions
3408 *
3409 * Description:
3410 * Poll for completions on the passed in queue. Returns number of
3411 * completed entries found. If @spin is true, then blk_poll will continue
3412 * looping until at least one completion is found, unless the task is
3413 * otherwise marked running (or we need to reschedule).
3414 */
3415int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
3416{
3417 struct blk_mq_hw_ctx *hctx;
3231 long state; 3418 long state;
3232 3419
3420 if (!blk_qc_t_valid(cookie) ||
3421 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3422 return 0;
3423
3424 if (current->plug)
3425 blk_flush_plug_list(current->plug, false);
3426
3427 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3428
3233 /* 3429 /*
3234 * If we sleep, have the caller restart the poll loop to reset 3430 * If we sleep, have the caller restart the poll loop to reset
3235 * the state. Like for the other success return cases, the 3431 * the state. Like for the other success return cases, the
@@ -3237,63 +3433,44 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3237 * the IO isn't complete, we'll get called again and will go 3433 * the IO isn't complete, we'll get called again and will go
3238 * straight to the busy poll loop. 3434 * straight to the busy poll loop.
3239 */ 3435 */
3240 if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) 3436 if (blk_mq_poll_hybrid(q, hctx, cookie))
3241 return true; 3437 return 1;
3242 3438
3243 hctx->poll_considered++; 3439 hctx->poll_considered++;
3244 3440
3245 state = current->state; 3441 state = current->state;
3246 while (!need_resched()) { 3442 do {
3247 int ret; 3443 int ret;
3248 3444
3249 hctx->poll_invoked++; 3445 hctx->poll_invoked++;
3250 3446
3251 ret = q->mq_ops->poll(hctx, rq->tag); 3447 ret = q->mq_ops->poll(hctx);
3252 if (ret > 0) { 3448 if (ret > 0) {
3253 hctx->poll_success++; 3449 hctx->poll_success++;
3254 set_current_state(TASK_RUNNING); 3450 __set_current_state(TASK_RUNNING);
3255 return true; 3451 return ret;
3256 } 3452 }
3257 3453
3258 if (signal_pending_state(state, current)) 3454 if (signal_pending_state(state, current))
3259 set_current_state(TASK_RUNNING); 3455 __set_current_state(TASK_RUNNING);
3260 3456
3261 if (current->state == TASK_RUNNING) 3457 if (current->state == TASK_RUNNING)
3262 return true; 3458 return 1;
3263 if (ret < 0) 3459 if (ret < 0 || !spin)
3264 break; 3460 break;
3265 cpu_relax(); 3461 cpu_relax();
3266 } 3462 } while (!need_resched());
3267 3463
3268 __set_current_state(TASK_RUNNING); 3464 __set_current_state(TASK_RUNNING);
3269 return false; 3465 return 0;
3270} 3466}
3467EXPORT_SYMBOL_GPL(blk_poll);
3271 3468
3272static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) 3469unsigned int blk_mq_rq_cpu(struct request *rq)
3273{ 3470{
3274 struct blk_mq_hw_ctx *hctx; 3471 return rq->mq_ctx->cpu;
3275 struct request *rq;
3276
3277 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3278 return false;
3279
3280 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3281 if (!blk_qc_t_is_internal(cookie))
3282 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3283 else {
3284 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3285 /*
3286 * With scheduling, if the request has completed, we'll
3287 * get a NULL return here, as we clear the sched tag when
3288 * that happens. The request still remains valid, like always,
3289 * so we should be safe with just the NULL check.
3290 */
3291 if (!rq)
3292 return false;
3293 }
3294
3295 return __blk_mq_poll(hctx, rq);
3296} 3472}
3473EXPORT_SYMBOL(blk_mq_rq_cpu);
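
With the old q->poll_fn indirection gone, blk_poll() is implemented here directly, returns the number of completions it reaped, and takes a 'spin' argument that asks it to keep iterating until something completes. A hedged sketch of a submitter-side wait loop, loosely modelled on the direct-IO pattern; 'done' is an invented flag assumed to be set, and the task woken, by the bio's ->bi_end_io().

#include <linux/blkdev.h>
#include <linux/sched.h>

static void example_wait_polled(struct request_queue *q, blk_qc_t cookie,
				bool *done)
{
	/* 'cookie' is assumed to be what submit_bio() returned for a
	 * REQ_HIPRI bio on this queue.
	 */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (READ_ONCE(*done))
			break;

		/* spin == true: poll until at least one completion is found
		 * or the task is marked running.
		 */
		if (blk_poll(q, cookie, true) <= 0)
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}
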
3297 3474
3298static int __init blk_mq_init(void) 3475static int __init blk_mq_init(void)
3299{ 3476{
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9497b47e2526..d943d46b0785 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -7,17 +7,22 @@
7 7
8struct blk_mq_tag_set; 8struct blk_mq_tag_set;
9 9
10struct blk_mq_ctxs {
11 struct kobject kobj;
12 struct blk_mq_ctx __percpu *queue_ctx;
13};
14
10/** 15/**
11 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs 16 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
12 */ 17 */
13struct blk_mq_ctx { 18struct blk_mq_ctx {
14 struct { 19 struct {
15 spinlock_t lock; 20 spinlock_t lock;
16 struct list_head rq_list; 21 struct list_head rq_lists[HCTX_MAX_TYPES];
17 } ____cacheline_aligned_in_smp; 22 } ____cacheline_aligned_in_smp;
18 23
19 unsigned int cpu; 24 unsigned int cpu;
20 unsigned int index_hw; 25 unsigned short index_hw[HCTX_MAX_TYPES];
21 26
22 /* incremented at dispatch time */ 27 /* incremented at dispatch time */
23 unsigned long rq_dispatched[2]; 28 unsigned long rq_dispatched[2];
@@ -27,6 +32,7 @@ struct blk_mq_ctx {
27 unsigned long ____cacheline_aligned_in_smp rq_completed[2]; 32 unsigned long ____cacheline_aligned_in_smp rq_completed[2];
28 33
29 struct request_queue *queue; 34 struct request_queue *queue;
35 struct blk_mq_ctxs *ctxs;
30 struct kobject kobj; 36 struct kobject kobj;
31} ____cacheline_aligned_in_smp; 37} ____cacheline_aligned_in_smp;
32 38
@@ -62,20 +68,55 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
62void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 68void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
63 struct list_head *list); 69 struct list_head *list);
64 70
65/* Used by blk_insert_cloned_request() to issue request directly */ 71blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
66blk_status_t blk_mq_request_issue_directly(struct request *rq); 72 struct request *rq,
73 blk_qc_t *cookie,
74 bool bypass, bool last);
67void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 75void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
68 struct list_head *list); 76 struct list_head *list);
69 77
70/* 78/*
71 * CPU -> queue mappings 79 * CPU -> queue mappings
72 */ 80 */
73extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); 81extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
82
83/*
84 * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
85 * @q: request queue
86 * @type: the hctx type index
87 * @cpu: CPU
88 */
89static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
90 enum hctx_type type,
91 unsigned int cpu)
92{
93 return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
94}
74 95
96/*
97 * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
98 * @q: request queue
99 * @flags: request command flags
100 * @cpu: CPU
101 */
75static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, 102static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
76 int cpu) 103 unsigned int flags,
104 unsigned int cpu)
77{ 105{
78 return q->queue_hw_ctx[q->mq_map[cpu]]; 106 enum hctx_type type = HCTX_TYPE_DEFAULT;
107
108 if ((flags & REQ_HIPRI) &&
109 q->tag_set->nr_maps > HCTX_TYPE_POLL &&
110 q->tag_set->map[HCTX_TYPE_POLL].nr_queues &&
111 test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
112 type = HCTX_TYPE_POLL;
113
114 else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
115 q->tag_set->nr_maps > HCTX_TYPE_READ &&
116 q->tag_set->map[HCTX_TYPE_READ].nr_queues)
117 type = HCTX_TYPE_READ;
118
119 return blk_mq_map_queue_type(q, type, cpu);
79} 120}
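
blk_mq_map_queue() now derives a queue type from the command flags before indexing the per-type map, but the READ and POLL branches only fire when the tag_set registered those maps with a non-zero nr_queues. A minimal sketch of such a registration; the queue counts are invented, and a real driver splits nr_queues per map in its ->map_queues(), much like the nvme-pci write_queues/poll_queues handling elsewhere in this series.

#include <linux/blk-mq.h>
#include <linux/numa.h>

static int exdrv_init_tag_set(struct blk_mq_tag_set *set,
			      const struct blk_mq_ops *ops)
{
	set->ops = ops;
	set->nr_maps = HCTX_MAX_TYPES;	/* default + read + poll maps */
	set->nr_hw_queues = 8 + 4 + 4;	/* invented split across the types */
	set->queue_depth = 128;
	set->numa_node = NUMA_NO_NODE;

	/*
	 * ->map_queues() is then expected to leave map[DEFAULT/READ/POLL]
	 * with 8/4/4 queues; if the POLL count ends up zero, REQ_HIPRI IO
	 * simply falls back to the read or default type above.
	 */
	return blk_mq_alloc_tag_set(set);
}
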
80 121
81/* 122/*
@@ -126,6 +167,7 @@ struct blk_mq_alloc_data {
126 struct request_queue *q; 167 struct request_queue *q;
127 blk_mq_req_flags_t flags; 168 blk_mq_req_flags_t flags;
128 unsigned int shallow_depth; 169 unsigned int shallow_depth;
170 unsigned int cmd_flags;
129 171
130 /* input & output parameter */ 172 /* input & output parameter */
131 struct blk_mq_ctx *ctx; 173 struct blk_mq_ctx *ctx;
@@ -150,8 +192,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
150 return hctx->nr_ctx && hctx->tags; 192 return hctx->nr_ctx && hctx->tags;
151} 193}
152 194
153void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, 195unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
154 unsigned int inflight[2]);
155void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, 196void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
156 unsigned int inflight[2]); 197 unsigned int inflight[2]);
157 198
@@ -195,21 +236,18 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
195 236
196static inline void blk_mq_put_driver_tag(struct request *rq) 237static inline void blk_mq_put_driver_tag(struct request *rq)
197{ 238{
198 struct blk_mq_hw_ctx *hctx;
199
200 if (rq->tag == -1 || rq->internal_tag == -1) 239 if (rq->tag == -1 || rq->internal_tag == -1)
201 return; 240 return;
202 241
203 hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); 242 __blk_mq_put_driver_tag(rq->mq_hctx, rq);
204 __blk_mq_put_driver_tag(hctx, rq);
205} 243}
206 244
207static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) 245static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
208{ 246{
209 int cpu; 247 int cpu;
210 248
211 for_each_possible_cpu(cpu) 249 for_each_possible_cpu(cpu)
212 set->mq_map[cpu] = 0; 250 qmap->mq_map[cpu] = 0;
213} 251}
214 252
215#endif 253#endif
diff --git a/block/blk-pm.c b/block/blk-pm.c
index f8fdae01bea2..0a028c189897 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -89,12 +89,12 @@ int blk_pre_runtime_suspend(struct request_queue *q)
89 /* Switch q_usage_counter back to per-cpu mode. */ 89 /* Switch q_usage_counter back to per-cpu mode. */
90 blk_mq_unfreeze_queue(q); 90 blk_mq_unfreeze_queue(q);
91 91
92 spin_lock_irq(q->queue_lock); 92 spin_lock_irq(&q->queue_lock);
93 if (ret < 0) 93 if (ret < 0)
94 pm_runtime_mark_last_busy(q->dev); 94 pm_runtime_mark_last_busy(q->dev);
95 else 95 else
96 q->rpm_status = RPM_SUSPENDING; 96 q->rpm_status = RPM_SUSPENDING;
97 spin_unlock_irq(q->queue_lock); 97 spin_unlock_irq(&q->queue_lock);
98 98
99 if (ret) 99 if (ret)
100 blk_clear_pm_only(q); 100 blk_clear_pm_only(q);
@@ -121,14 +121,14 @@ void blk_post_runtime_suspend(struct request_queue *q, int err)
121 if (!q->dev) 121 if (!q->dev)
122 return; 122 return;
123 123
124 spin_lock_irq(q->queue_lock); 124 spin_lock_irq(&q->queue_lock);
125 if (!err) { 125 if (!err) {
126 q->rpm_status = RPM_SUSPENDED; 126 q->rpm_status = RPM_SUSPENDED;
127 } else { 127 } else {
128 q->rpm_status = RPM_ACTIVE; 128 q->rpm_status = RPM_ACTIVE;
129 pm_runtime_mark_last_busy(q->dev); 129 pm_runtime_mark_last_busy(q->dev);
130 } 130 }
131 spin_unlock_irq(q->queue_lock); 131 spin_unlock_irq(&q->queue_lock);
132 132
133 if (err) 133 if (err)
134 blk_clear_pm_only(q); 134 blk_clear_pm_only(q);
@@ -151,9 +151,9 @@ void blk_pre_runtime_resume(struct request_queue *q)
151 if (!q->dev) 151 if (!q->dev)
152 return; 152 return;
153 153
154 spin_lock_irq(q->queue_lock); 154 spin_lock_irq(&q->queue_lock);
155 q->rpm_status = RPM_RESUMING; 155 q->rpm_status = RPM_RESUMING;
156 spin_unlock_irq(q->queue_lock); 156 spin_unlock_irq(&q->queue_lock);
157} 157}
158EXPORT_SYMBOL(blk_pre_runtime_resume); 158EXPORT_SYMBOL(blk_pre_runtime_resume);
159 159
@@ -176,7 +176,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err)
176 if (!q->dev) 176 if (!q->dev)
177 return; 177 return;
178 178
179 spin_lock_irq(q->queue_lock); 179 spin_lock_irq(&q->queue_lock);
180 if (!err) { 180 if (!err) {
181 q->rpm_status = RPM_ACTIVE; 181 q->rpm_status = RPM_ACTIVE;
182 pm_runtime_mark_last_busy(q->dev); 182 pm_runtime_mark_last_busy(q->dev);
@@ -184,7 +184,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err)
184 } else { 184 } else {
185 q->rpm_status = RPM_SUSPENDED; 185 q->rpm_status = RPM_SUSPENDED;
186 } 186 }
187 spin_unlock_irq(q->queue_lock); 187 spin_unlock_irq(&q->queue_lock);
188 188
189 if (!err) 189 if (!err)
190 blk_clear_pm_only(q); 190 blk_clear_pm_only(q);
@@ -207,10 +207,10 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
207 */ 207 */
208void blk_set_runtime_active(struct request_queue *q) 208void blk_set_runtime_active(struct request_queue *q)
209{ 209{
210 spin_lock_irq(q->queue_lock); 210 spin_lock_irq(&q->queue_lock);
211 q->rpm_status = RPM_ACTIVE; 211 q->rpm_status = RPM_ACTIVE;
212 pm_runtime_mark_last_busy(q->dev); 212 pm_runtime_mark_last_busy(q->dev);
213 pm_request_autosuspend(q->dev); 213 pm_request_autosuspend(q->dev);
214 spin_unlock_irq(q->queue_lock); 214 spin_unlock_irq(&q->queue_lock);
215} 215}
216EXPORT_SYMBOL(blk_set_runtime_active); 216EXPORT_SYMBOL(blk_set_runtime_active);
diff --git a/block/blk-pm.h b/block/blk-pm.h
index a8564ea72a41..ea5507d23e75 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -21,7 +21,7 @@ static inline void blk_pm_mark_last_busy(struct request *rq)
21 21
22static inline void blk_pm_requeue_request(struct request *rq) 22static inline void blk_pm_requeue_request(struct request *rq)
23{ 23{
24 lockdep_assert_held(rq->q->queue_lock); 24 lockdep_assert_held(&rq->q->queue_lock);
25 25
26 if (rq->q->dev && !(rq->rq_flags & RQF_PM)) 26 if (rq->q->dev && !(rq->rq_flags & RQF_PM))
27 rq->q->nr_pending--; 27 rq->q->nr_pending--;
@@ -30,7 +30,7 @@ static inline void blk_pm_requeue_request(struct request *rq)
30static inline void blk_pm_add_request(struct request_queue *q, 30static inline void blk_pm_add_request(struct request_queue *q,
31 struct request *rq) 31 struct request *rq)
32{ 32{
33 lockdep_assert_held(q->queue_lock); 33 lockdep_assert_held(&q->queue_lock);
34 34
35 if (q->dev && !(rq->rq_flags & RQF_PM)) 35 if (q->dev && !(rq->rq_flags & RQF_PM))
36 q->nr_pending++; 36 q->nr_pending++;
@@ -38,7 +38,7 @@ static inline void blk_pm_add_request(struct request_queue *q,
38 38
39static inline void blk_pm_put_request(struct request *rq) 39static inline void blk_pm_put_request(struct request *rq)
40{ 40{
41 lockdep_assert_held(rq->q->queue_lock); 41 lockdep_assert_held(&rq->q->queue_lock);
42 42
43 if (rq->q->dev && !(rq->rq_flags & RQF_PM)) 43 if (rq->q->dev && !(rq->rq_flags & RQF_PM))
44 --rq->q->nr_pending; 44 --rq->q->nr_pending;
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 0005dfd568dd..d169d7188fa6 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -27,75 +27,67 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
27 return atomic_inc_below(&rq_wait->inflight, limit); 27 return atomic_inc_below(&rq_wait->inflight, limit);
28} 28}
29 29
30void rq_qos_cleanup(struct request_queue *q, struct bio *bio) 30void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
31{ 31{
32 struct rq_qos *rqos; 32 do {
33
34 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
35 if (rqos->ops->cleanup) 33 if (rqos->ops->cleanup)
36 rqos->ops->cleanup(rqos, bio); 34 rqos->ops->cleanup(rqos, bio);
37 } 35 rqos = rqos->next;
36 } while (rqos);
38} 37}
39 38
40void rq_qos_done(struct request_queue *q, struct request *rq) 39void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
41{ 40{
42 struct rq_qos *rqos; 41 do {
43
44 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
45 if (rqos->ops->done) 42 if (rqos->ops->done)
46 rqos->ops->done(rqos, rq); 43 rqos->ops->done(rqos, rq);
47 } 44 rqos = rqos->next;
45 } while (rqos);
48} 46}
49 47
50void rq_qos_issue(struct request_queue *q, struct request *rq) 48void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
51{ 49{
52 struct rq_qos *rqos; 50 do {
53
54 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
55 if (rqos->ops->issue) 51 if (rqos->ops->issue)
56 rqos->ops->issue(rqos, rq); 52 rqos->ops->issue(rqos, rq);
57 } 53 rqos = rqos->next;
54 } while (rqos);
58} 55}
59 56
60void rq_qos_requeue(struct request_queue *q, struct request *rq) 57void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
61{ 58{
62 struct rq_qos *rqos; 59 do {
63
64 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
65 if (rqos->ops->requeue) 60 if (rqos->ops->requeue)
66 rqos->ops->requeue(rqos, rq); 61 rqos->ops->requeue(rqos, rq);
67 } 62 rqos = rqos->next;
63 } while (rqos);
68} 64}
69 65
70void rq_qos_throttle(struct request_queue *q, struct bio *bio, 66void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
71 spinlock_t *lock)
72{ 67{
73 struct rq_qos *rqos; 68 do {
74
75 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
76 if (rqos->ops->throttle) 69 if (rqos->ops->throttle)
77 rqos->ops->throttle(rqos, bio, lock); 70 rqos->ops->throttle(rqos, bio);
78 } 71 rqos = rqos->next;
72 } while (rqos);
79} 73}
80 74
81void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) 75void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
82{ 76{
83 struct rq_qos *rqos; 77 do {
84
85 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
86 if (rqos->ops->track) 78 if (rqos->ops->track)
87 rqos->ops->track(rqos, rq, bio); 79 rqos->ops->track(rqos, rq, bio);
88 } 80 rqos = rqos->next;
81 } while (rqos);
89} 82}
90 83
91void rq_qos_done_bio(struct request_queue *q, struct bio *bio) 84void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
92{ 85{
93 struct rq_qos *rqos; 86 do {
94
95 for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
96 if (rqos->ops->done_bio) 87 if (rqos->ops->done_bio)
97 rqos->ops->done_bio(rqos, bio); 88 rqos->ops->done_bio(rqos, bio);
98 } 89 rqos = rqos->next;
90 } while (rqos);
99} 91}
100 92
101/* 93/*
@@ -184,8 +176,96 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
184 rq_depth_calc_max_depth(rqd); 176 rq_depth_calc_max_depth(rqd);
185} 177}
186 178
179struct rq_qos_wait_data {
180 struct wait_queue_entry wq;
181 struct task_struct *task;
182 struct rq_wait *rqw;
183 acquire_inflight_cb_t *cb;
184 void *private_data;
185 bool got_token;
186};
187
188static int rq_qos_wake_function(struct wait_queue_entry *curr,
189 unsigned int mode, int wake_flags, void *key)
190{
191 struct rq_qos_wait_data *data = container_of(curr,
192 struct rq_qos_wait_data,
193 wq);
194
195 /*
196 * If we fail to get a budget, return -1 to interrupt the wake up loop
197 * in __wake_up_common.
198 */
199 if (!data->cb(data->rqw, data->private_data))
200 return -1;
201
202 data->got_token = true;
203 list_del_init(&curr->entry);
204 wake_up_process(data->task);
205 return 1;
206}
207
208/**
209 * rq_qos_wait - throttle on a rqw if we need to
210 * @private_data - caller provided specific data
211 * @acquire_inflight_cb - inc the rqw->inflight counter if we can
212 * @cleanup_cb - the callback to cleanup in case we race with a waker
213 *
214 * This provides a uniform place for the rq_qos users to do their throttling.
215 * Since you can end up with a lot of things sleeping at once, this manages the
216 * waking up based on the resources available. The acquire_inflight_cb should
217 * inc the rqw->inflight if we have the ability to do so, or return false if not
218 * and then we will sleep until the room becomes available.
219 *
220 * cleanup_cb is in case that we race with a waker and need to cleanup the
221 * inflight count accordingly.
222 */
223void rq_qos_wait(struct rq_wait *rqw, void *private_data,
224 acquire_inflight_cb_t *acquire_inflight_cb,
225 cleanup_cb_t *cleanup_cb)
226{
227 struct rq_qos_wait_data data = {
228 .wq = {
229 .func = rq_qos_wake_function,
230 .entry = LIST_HEAD_INIT(data.wq.entry),
231 },
232 .task = current,
233 .rqw = rqw,
234 .cb = acquire_inflight_cb,
235 .private_data = private_data,
236 };
237 bool has_sleeper;
238
239 has_sleeper = wq_has_sleeper(&rqw->wait);
240 if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
241 return;
242
243 prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
244 do {
245 if (data.got_token)
246 break;
247 if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
248 finish_wait(&rqw->wait, &data.wq);
249
250 /*
251 * We raced with wbt_wake_function() getting a token,
252 * which means we now have two. Put our local token
253 * and wake anyone else potentially waiting for one.
254 */
255 if (data.got_token)
256 cleanup_cb(rqw, private_data);
257 break;
258 }
259 io_schedule();
260 has_sleeper = false;
261 } while (1);
262 finish_wait(&rqw->wait, &data.wq);
263}
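
rq_qos_wait() factors the sleep/wake-one-waiter-at-a-time logic out of blk-wbt so any rq_qos policy can reuse it: the caller supplies one callback that tries to take an inflight slot and one that gives a slot back after racing with a waker. A minimal sketch of a hypothetical policy ("exqos", entirely made up) driving it from its ->throttle() hook.

#include <linux/wait.h>
#include "blk-rq-qos.h"

/* Invented policy: allow at most 'limit' throttled submitters inflight. */
struct exqos {
	struct rq_qos rqos;
	struct rq_wait rqw;
	unsigned int limit;
};

static bool exqos_inflight_cb(struct rq_wait *rqw, void *private_data)
{
	struct exqos *eq = private_data;

	/* Take a slot if below the limit; false puts the caller to sleep. */
	return rq_wait_inc_below(rqw, eq->limit);
}

static void exqos_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	/* We raced with a waker and now hold two slots: give one back. */
	atomic_dec(&rqw->inflight);
	wake_up_all(&rqw->wait);
}

static void exqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
	struct exqos *eq = container_of(rqos, struct exqos, rqos);

	rq_qos_wait(&eq->rqw, eq, exqos_inflight_cb, exqos_cleanup_cb);
}

/* The policy's ->done() side would decrement rqw.inflight and wake waiters. */
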
264
187void rq_qos_exit(struct request_queue *q) 265void rq_qos_exit(struct request_queue *q)
188{ 266{
267 blk_mq_debugfs_unregister_queue_rqos(q);
268
189 while (q->rq_qos) { 269 while (q->rq_qos) {
190 struct rq_qos *rqos = q->rq_qos; 270 struct rq_qos *rqos = q->rq_qos;
191 q->rq_qos = rqos->next; 271 q->rq_qos = rqos->next;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 32b02efbfa66..564851889550 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -7,6 +7,10 @@
7#include <linux/atomic.h> 7#include <linux/atomic.h>
8#include <linux/wait.h> 8#include <linux/wait.h>
9 9
10#include "blk-mq-debugfs.h"
11
12struct blk_mq_debugfs_attr;
13
10enum rq_qos_id { 14enum rq_qos_id {
11 RQ_QOS_WBT, 15 RQ_QOS_WBT,
12 RQ_QOS_CGROUP, 16 RQ_QOS_CGROUP,
@@ -22,10 +26,13 @@ struct rq_qos {
22 struct request_queue *q; 26 struct request_queue *q;
23 enum rq_qos_id id; 27 enum rq_qos_id id;
24 struct rq_qos *next; 28 struct rq_qos *next;
29#ifdef CONFIG_BLK_DEBUG_FS
30 struct dentry *debugfs_dir;
31#endif
25}; 32};
26 33
27struct rq_qos_ops { 34struct rq_qos_ops {
28 void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); 35 void (*throttle)(struct rq_qos *, struct bio *);
29 void (*track)(struct rq_qos *, struct request *, struct bio *); 36 void (*track)(struct rq_qos *, struct request *, struct bio *);
30 void (*issue)(struct rq_qos *, struct request *); 37 void (*issue)(struct rq_qos *, struct request *);
31 void (*requeue)(struct rq_qos *, struct request *); 38 void (*requeue)(struct rq_qos *, struct request *);
@@ -33,6 +40,7 @@ struct rq_qos_ops {
33 void (*done_bio)(struct rq_qos *, struct bio *); 40 void (*done_bio)(struct rq_qos *, struct bio *);
34 void (*cleanup)(struct rq_qos *, struct bio *); 41 void (*cleanup)(struct rq_qos *, struct bio *);
35 void (*exit)(struct rq_qos *); 42 void (*exit)(struct rq_qos *);
43 const struct blk_mq_debugfs_attr *debugfs_attrs;
36}; 44};
37 45
38struct rq_depth { 46struct rq_depth {
@@ -66,6 +74,17 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
66 return rq_qos_id(q, RQ_QOS_CGROUP); 74 return rq_qos_id(q, RQ_QOS_CGROUP);
67} 75}
68 76
77static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
78{
79 switch (id) {
80 case RQ_QOS_WBT:
81 return "wbt";
82 case RQ_QOS_CGROUP:
83 return "cgroup";
84 }
85 return "unknown";
86}
87
69static inline void rq_wait_init(struct rq_wait *rq_wait) 88static inline void rq_wait_init(struct rq_wait *rq_wait)
70{ 89{
71 atomic_set(&rq_wait->inflight, 0); 90 atomic_set(&rq_wait->inflight, 0);
@@ -76,6 +95,9 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
76{ 95{
77 rqos->next = q->rq_qos; 96 rqos->next = q->rq_qos;
78 q->rq_qos = rqos; 97 q->rq_qos = rqos;
98
99 if (rqos->ops->debugfs_attrs)
100 blk_mq_debugfs_register_rqos(rqos);
79} 101}
80 102
81static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) 103static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
@@ -91,19 +113,77 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
91 } 113 }
92 prev = cur; 114 prev = cur;
93 } 115 }
116
117 blk_mq_debugfs_unregister_rqos(rqos);
94} 118}
95 119
120typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
121typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);
122
123void rq_qos_wait(struct rq_wait *rqw, void *private_data,
124 acquire_inflight_cb_t *acquire_inflight_cb,
125 cleanup_cb_t *cleanup_cb);
96bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); 126bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
97void rq_depth_scale_up(struct rq_depth *rqd); 127void rq_depth_scale_up(struct rq_depth *rqd);
98void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); 128void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
99bool rq_depth_calc_max_depth(struct rq_depth *rqd); 129bool rq_depth_calc_max_depth(struct rq_depth *rqd);
100 130
101void rq_qos_cleanup(struct request_queue *, struct bio *); 131void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
102void rq_qos_done(struct request_queue *, struct request *); 132void __rq_qos_done(struct rq_qos *rqos, struct request *rq);
103void rq_qos_issue(struct request_queue *, struct request *); 133void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
104void rq_qos_requeue(struct request_queue *, struct request *); 134void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
105void rq_qos_done_bio(struct request_queue *q, struct bio *bio); 135void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
106void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); 136void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
107void rq_qos_track(struct request_queue *q, struct request *, struct bio *); 137void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
138
139static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
140{
141 if (q->rq_qos)
142 __rq_qos_cleanup(q->rq_qos, bio);
143}
144
145static inline void rq_qos_done(struct request_queue *q, struct request *rq)
146{
147 if (q->rq_qos)
148 __rq_qos_done(q->rq_qos, rq);
149}
150
151static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
152{
153 if (q->rq_qos)
154 __rq_qos_issue(q->rq_qos, rq);
155}
156
157static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
158{
159 if (q->rq_qos)
160 __rq_qos_requeue(q->rq_qos, rq);
161}
162
163static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
164{
165 if (q->rq_qos)
166 __rq_qos_done_bio(q->rq_qos, bio);
167}
168
169static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
170{
171 /*
172 * BIO_TRACKED lets controllers know that a bio went through the
173 * normal rq_qos path.
174 */
175 bio_set_flag(bio, BIO_TRACKED);
176 if (q->rq_qos)
177 __rq_qos_throttle(q->rq_qos, bio);
178}
179
180static inline void rq_qos_track(struct request_queue *q, struct request *rq,
181 struct bio *bio)
182{
183 if (q->rq_qos)
184 __rq_qos_track(q->rq_qos, rq, bio);
185}
186
108void rq_qos_exit(struct request_queue *); 187void rq_qos_exit(struct request_queue *);
188
109#endif 189#endif
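
The blk-rq-qos.h changes above turn the rq_qos_*() entry points into inline wrappers that test q->rq_qos before calling the out-of-line __rq_qos_*() helpers, so a queue with no QoS policy attached pays only a NULL check per hook. A minimal userspace model of that fast-path/slow-path split follows; it is plain C, not kernel code, only the cleanup hook is modelled, and the names merely echo the header.

    #include <stdio.h>
    #include <stddef.h>

    struct rq_qos;

    struct rq_qos_ops {
        void (*cleanup)(struct rq_qos *rqos, int bio);
    };

    /* Singly linked chain of policies, newest at the head (cf. rq_qos_add()). */
    struct rq_qos {
        const struct rq_qos_ops *ops;
        struct rq_qos *next;
    };

    struct request_queue {
        struct rq_qos *rq_qos;          /* NULL when no policy is attached */
    };

    /* Out-of-line walker: only reached when at least one policy exists. */
    static void __rq_qos_cleanup(struct rq_qos *rqos, int bio)
    {
        do {
            if (rqos->ops->cleanup)
                rqos->ops->cleanup(rqos, bio);
            rqos = rqos->next;
        } while (rqos);
    }

    /* Inline fast path: a single NULL test when nothing is registered. */
    static inline void rq_qos_cleanup(struct request_queue *q, int bio)
    {
        if (q->rq_qos)
            __rq_qos_cleanup(q->rq_qos, bio);
    }

    static void wbt_cleanup(struct rq_qos *rqos, int bio)
    {
        (void)rqos;
        printf("wbt cleanup for bio %d\n", bio);
    }

    int main(void)
    {
        static const struct rq_qos_ops wbt_ops = { .cleanup = wbt_cleanup };
        struct rq_qos wbt = { .ops = &wbt_ops, .next = NULL };
        struct request_queue q = { .rq_qos = &wbt };

        rq_qos_cleanup(&q, 42);     /* walks the one-entry chain */
        q.rq_qos = NULL;
        rq_qos_cleanup(&q, 43);     /* no policies: just the NULL check */
        return 0;
    }

The done/issue/requeue/throttle/track wrappers in the hunk follow the same shape; rq_qos_throttle() additionally tags the bio with BIO_TRACKED before the check.
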
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 696c04c1ab6c..3abe831e92c8 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -20,65 +20,12 @@ EXPORT_SYMBOL(blk_max_low_pfn);
20 20
21unsigned long blk_max_pfn; 21unsigned long blk_max_pfn;
22 22
23/**
24 * blk_queue_prep_rq - set a prepare_request function for queue
25 * @q: queue
26 * @pfn: prepare_request function
27 *
28 * It's possible for a queue to register a prepare_request callback which
29 * is invoked before the request is handed to the request_fn. The goal of
30 * the function is to prepare a request for I/O, it can be used to build a
31 * cdb from the request data for instance.
32 *
33 */
34void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
35{
36 q->prep_rq_fn = pfn;
37}
38EXPORT_SYMBOL(blk_queue_prep_rq);
39
40/**
41 * blk_queue_unprep_rq - set an unprepare_request function for queue
42 * @q: queue
43 * @ufn: unprepare_request function
44 *
45 * It's possible for a queue to register an unprepare_request callback
46 * which is invoked before the request is finally completed. The goal
47 * of the function is to deallocate any data that was allocated in the
48 * prepare_request callback.
49 *
50 */
51void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
52{
53 q->unprep_rq_fn = ufn;
54}
55EXPORT_SYMBOL(blk_queue_unprep_rq);
56
57void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
58{
59 q->softirq_done_fn = fn;
60}
61EXPORT_SYMBOL(blk_queue_softirq_done);
62
63void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) 23void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
64{ 24{
65 q->rq_timeout = timeout; 25 q->rq_timeout = timeout;
66} 26}
67EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); 27EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
68 28
69void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
70{
71 WARN_ON_ONCE(q->mq_ops);
72 q->rq_timed_out_fn = fn;
73}
74EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
75
76void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
77{
78 q->lld_busy_fn = fn;
79}
80EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
81
82/** 29/**
83 * blk_set_default_limits - reset limits to default values 30 * blk_set_default_limits - reset limits to default values
84 * @lim: the queue_limits structure to reset 31 * @lim: the queue_limits structure to reset
@@ -169,8 +116,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
169 116
170 q->make_request_fn = mfn; 117 q->make_request_fn = mfn;
171 blk_queue_dma_alignment(q, 511); 118 blk_queue_dma_alignment(q, 511);
172 blk_queue_congestion_threshold(q);
173 q->nr_batching = BLK_BATCH_REQ;
174 119
175 blk_set_default_limits(&q->limits); 120 blk_set_default_limits(&q->limits);
176} 121}
@@ -889,16 +834,14 @@ EXPORT_SYMBOL(blk_set_queue_depth);
889 */ 834 */
890void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) 835void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
891{ 836{
892 spin_lock_irq(q->queue_lock);
893 if (wc) 837 if (wc)
894 queue_flag_set(QUEUE_FLAG_WC, q); 838 blk_queue_flag_set(QUEUE_FLAG_WC, q);
895 else 839 else
896 queue_flag_clear(QUEUE_FLAG_WC, q); 840 blk_queue_flag_clear(QUEUE_FLAG_WC, q);
897 if (fua) 841 if (fua)
898 queue_flag_set(QUEUE_FLAG_FUA, q); 842 blk_queue_flag_set(QUEUE_FLAG_FUA, q);
899 else 843 else
900 queue_flag_clear(QUEUE_FLAG_FUA, q); 844 blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
901 spin_unlock_irq(q->queue_lock);
902 845
903 wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); 846 wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
904} 847}
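
blk_queue_write_cache() can drop its spin_lock_irq(q->queue_lock) bracket because blk_queue_flag_set()/blk_queue_flag_clear() flip bits in q->queue_flags atomically. A rough userspace analogue of that contrast, using GCC atomic builtins as a stand-in for the kernel's bit helpers (a simplified sketch, not the real implementation):

    #include <stdio.h>

    enum { QUEUE_FLAG_WC = 0, QUEUE_FLAG_FUA = 1 };

    struct request_queue {
        unsigned long queue_flags;
    };

    /* Atomic read-modify-write on one bit; no external lock required. */
    static void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
    {
        __atomic_fetch_or(&q->queue_flags, 1UL << flag, __ATOMIC_RELAXED);
    }

    static void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
    {
        __atomic_fetch_and(&q->queue_flags, ~(1UL << flag), __ATOMIC_RELAXED);
    }

    static void blk_queue_write_cache(struct request_queue *q, int wc, int fua)
    {
        /* No spin_lock_irq()/spin_unlock_irq() bracket around the bit flips. */
        if (wc)
            blk_queue_flag_set(QUEUE_FLAG_WC, q);
        else
            blk_queue_flag_clear(QUEUE_FLAG_WC, q);
        if (fua)
            blk_queue_flag_set(QUEUE_FLAG_FUA, q);
        else
            blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
    }

    int main(void)
    {
        struct request_queue q = { 0 };

        blk_queue_write_cache(&q, 1, 0);
        printf("queue_flags: %#lx\n", q.queue_flags);   /* 0x1: WC set, FUA clear */
        return 0;
    }
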
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index e47a2f751884..457d9ba3eb20 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -34,7 +34,7 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
34 34
35 rq = list_entry(local_list.next, struct request, ipi_list); 35 rq = list_entry(local_list.next, struct request, ipi_list);
36 list_del_init(&rq->ipi_list); 36 list_del_init(&rq->ipi_list);
37 rq->q->softirq_done_fn(rq); 37 rq->q->mq_ops->complete(rq);
38 } 38 }
39} 39}
40 40
@@ -98,11 +98,11 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
98void __blk_complete_request(struct request *req) 98void __blk_complete_request(struct request *req)
99{ 99{
100 struct request_queue *q = req->q; 100 struct request_queue *q = req->q;
101 int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu; 101 int cpu, ccpu = req->mq_ctx->cpu;
102 unsigned long flags; 102 unsigned long flags;
103 bool shared = false; 103 bool shared = false;
104 104
105 BUG_ON(!q->softirq_done_fn); 105 BUG_ON(!q->mq_ops->complete);
106 106
107 local_irq_save(flags); 107 local_irq_save(flags);
108 cpu = smp_processor_id(); 108 cpu = smp_processor_id();
@@ -143,27 +143,6 @@ do_local:
143 143
144 local_irq_restore(flags); 144 local_irq_restore(flags);
145} 145}
146EXPORT_SYMBOL(__blk_complete_request);
147
148/**
149 * blk_complete_request - end I/O on a request
150 * @req: the request being processed
151 *
152 * Description:
153 * Ends all I/O on a request. It does not handle partial completions,
154 * unless the driver actually implements this in its completion callback
155 * through requeueing. The actual completion happens out-of-order,
156 * through a softirq handler. The user must have registered a completion
157 * callback through blk_queue_softirq_done().
158 **/
159void blk_complete_request(struct request *req)
160{
161 if (unlikely(blk_should_fake_timeout(req->q)))
162 return;
163 if (!blk_mark_rq_complete(req))
164 __blk_complete_request(req);
165}
166EXPORT_SYMBOL(blk_complete_request);
167 146
168static __init int blk_softirq_init(void) 147static __init int blk_softirq_init(void)
169{ 148{
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 90561af85a62..696a04176e4d 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -130,7 +130,6 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
130 130
131 return cb; 131 return cb;
132} 132}
133EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
134 133
135void blk_stat_add_callback(struct request_queue *q, 134void blk_stat_add_callback(struct request_queue *q,
136 struct blk_stat_callback *cb) 135 struct blk_stat_callback *cb)
@@ -151,7 +150,6 @@ void blk_stat_add_callback(struct request_queue *q,
151 blk_queue_flag_set(QUEUE_FLAG_STATS, q); 150 blk_queue_flag_set(QUEUE_FLAG_STATS, q);
152 spin_unlock(&q->stats->lock); 151 spin_unlock(&q->stats->lock);
153} 152}
154EXPORT_SYMBOL_GPL(blk_stat_add_callback);
155 153
156void blk_stat_remove_callback(struct request_queue *q, 154void blk_stat_remove_callback(struct request_queue *q,
157 struct blk_stat_callback *cb) 155 struct blk_stat_callback *cb)
@@ -164,7 +162,6 @@ void blk_stat_remove_callback(struct request_queue *q,
164 162
165 del_timer_sync(&cb->timer); 163 del_timer_sync(&cb->timer);
166} 164}
167EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
168 165
169static void blk_stat_free_callback_rcu(struct rcu_head *head) 166static void blk_stat_free_callback_rcu(struct rcu_head *head)
170{ 167{
@@ -181,7 +178,6 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
181 if (cb) 178 if (cb)
182 call_rcu(&cb->rcu, blk_stat_free_callback_rcu); 179 call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
183} 180}
184EXPORT_SYMBOL_GPL(blk_stat_free_callback);
185 181
186void blk_stat_enable_accounting(struct request_queue *q) 182void blk_stat_enable_accounting(struct request_queue *q)
187{ 183{
diff --git a/block/blk-stat.h b/block/blk-stat.h
index f4a1568e81a4..17b47a86eefb 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -145,6 +145,11 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
145 mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); 145 mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
146} 146}
147 147
148static inline void blk_stat_deactivate(struct blk_stat_callback *cb)
149{
150 del_timer_sync(&cb->timer);
151}
152
148/** 153/**
149 * blk_stat_activate_msecs() - Gather block statistics during a time window in 154 * blk_stat_activate_msecs() - Gather block statistics during a time window in
150 * milliseconds. 155 * milliseconds.
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 844a454a7b3a..0619c8922893 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
68 unsigned long nr; 68 unsigned long nr;
69 int ret, err; 69 int ret, err;
70 70
71 if (!q->request_fn && !q->mq_ops) 71 if (!queue_is_mq(q))
72 return -EINVAL; 72 return -EINVAL;
73 73
74 ret = queue_var_store(&nr, page, count); 74 ret = queue_var_store(&nr, page, count);
@@ -78,11 +78,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
78 if (nr < BLKDEV_MIN_RQ) 78 if (nr < BLKDEV_MIN_RQ)
79 nr = BLKDEV_MIN_RQ; 79 nr = BLKDEV_MIN_RQ;
80 80
81 if (q->request_fn) 81 err = blk_mq_update_nr_requests(q, nr);
82 err = blk_update_nr_requests(q, nr);
83 else
84 err = blk_mq_update_nr_requests(q, nr);
85
86 if (err) 82 if (err)
87 return err; 83 return err;
88 84
@@ -242,10 +238,10 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
242 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 238 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
243 return -EINVAL; 239 return -EINVAL;
244 240
245 spin_lock_irq(q->queue_lock); 241 spin_lock_irq(&q->queue_lock);
246 q->limits.max_sectors = max_sectors_kb << 1; 242 q->limits.max_sectors = max_sectors_kb << 1;
247 q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); 243 q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
248 spin_unlock_irq(q->queue_lock); 244 spin_unlock_irq(&q->queue_lock);
249 245
250 return ret; 246 return ret;
251} 247}
@@ -320,14 +316,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
320 if (ret < 0) 316 if (ret < 0)
321 return ret; 317 return ret;
322 318
323 spin_lock_irq(q->queue_lock); 319 blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
324 queue_flag_clear(QUEUE_FLAG_NOMERGES, q); 320 blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
325 queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
326 if (nm == 2) 321 if (nm == 2)
327 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 322 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
328 else if (nm) 323 else if (nm)
329 queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 324 blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
330 spin_unlock_irq(q->queue_lock);
331 325
332 return ret; 326 return ret;
333} 327}
@@ -351,18 +345,16 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
351 if (ret < 0) 345 if (ret < 0)
352 return ret; 346 return ret;
353 347
354 spin_lock_irq(q->queue_lock);
355 if (val == 2) { 348 if (val == 2) {
356 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 349 blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
357 queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); 350 blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
358 } else if (val == 1) { 351 } else if (val == 1) {
359 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 352 blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
360 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); 353 blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
361 } else if (val == 0) { 354 } else if (val == 0) {
362 queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); 355 blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
363 queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); 356 blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
364 } 357 }
365 spin_unlock_irq(q->queue_lock);
366#endif 358#endif
367 return ret; 359 return ret;
368} 360}
@@ -410,7 +402,8 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
410 unsigned long poll_on; 402 unsigned long poll_on;
411 ssize_t ret; 403 ssize_t ret;
412 404
413 if (!q->mq_ops || !q->mq_ops->poll) 405 if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
406 !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
414 return -EINVAL; 407 return -EINVAL;
415 408
416 ret = queue_var_store(&poll_on, page, count); 409 ret = queue_var_store(&poll_on, page, count);
@@ -425,6 +418,26 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
425 return ret; 418 return ret;
426} 419}
427 420
421static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
422{
423 return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout));
424}
425
426static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page,
427 size_t count)
428{
429 unsigned int val;
430 int err;
431
432 err = kstrtou32(page, 10, &val);
433 if (err || val == 0)
434 return -EINVAL;
435
436 blk_queue_rq_timeout(q, msecs_to_jiffies(val));
437
438 return count;
439}
440
428static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) 441static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
429{ 442{
430 if (!wbt_rq_qos(q)) 443 if (!wbt_rq_qos(q))
@@ -463,20 +476,14 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
463 * ends up either enabling or disabling wbt completely. We can't 476 * ends up either enabling or disabling wbt completely. We can't
464 * have IO inflight if that happens. 477 * have IO inflight if that happens.
465 */ 478 */
466 if (q->mq_ops) { 479 blk_mq_freeze_queue(q);
467 blk_mq_freeze_queue(q); 480 blk_mq_quiesce_queue(q);
468 blk_mq_quiesce_queue(q);
469 } else
470 blk_queue_bypass_start(q);
471 481
472 wbt_set_min_lat(q, val); 482 wbt_set_min_lat(q, val);
473 wbt_update_limits(q); 483 wbt_update_limits(q);
474 484
475 if (q->mq_ops) { 485 blk_mq_unquiesce_queue(q);
476 blk_mq_unquiesce_queue(q); 486 blk_mq_unfreeze_queue(q);
477 blk_mq_unfreeze_queue(q);
478 } else
479 blk_queue_bypass_end(q);
480 487
481 return count; 488 return count;
482} 489}
@@ -699,6 +706,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
699 .show = queue_dax_show, 706 .show = queue_dax_show,
700}; 707};
701 708
709static struct queue_sysfs_entry queue_io_timeout_entry = {
710 .attr = {.name = "io_timeout", .mode = 0644 },
711 .show = queue_io_timeout_show,
712 .store = queue_io_timeout_store,
713};
714
702static struct queue_sysfs_entry queue_wb_lat_entry = { 715static struct queue_sysfs_entry queue_wb_lat_entry = {
703 .attr = {.name = "wbt_lat_usec", .mode = 0644 }, 716 .attr = {.name = "wbt_lat_usec", .mode = 0644 },
704 .show = queue_wb_lat_show, 717 .show = queue_wb_lat_show,
@@ -748,6 +761,7 @@ static struct attribute *default_attrs[] = {
748 &queue_dax_entry.attr, 761 &queue_dax_entry.attr,
749 &queue_wb_lat_entry.attr, 762 &queue_wb_lat_entry.attr,
750 &queue_poll_delay_entry.attr, 763 &queue_poll_delay_entry.attr,
764 &queue_io_timeout_entry.attr,
751#ifdef CONFIG_BLK_DEV_THROTTLING_LOW 765#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
752 &throtl_sample_time_entry.attr, 766 &throtl_sample_time_entry.attr,
753#endif 767#endif
@@ -847,24 +861,14 @@ static void __blk_release_queue(struct work_struct *work)
847 861
848 blk_free_queue_stats(q->stats); 862 blk_free_queue_stats(q->stats);
849 863
850 blk_exit_rl(q, &q->root_rl);
851
852 if (q->queue_tags)
853 __blk_queue_free_tags(q);
854
855 blk_queue_free_zone_bitmaps(q); 864 blk_queue_free_zone_bitmaps(q);
856 865
857 if (!q->mq_ops) { 866 if (queue_is_mq(q))
858 if (q->exit_rq_fn)
859 q->exit_rq_fn(q, q->fq->flush_rq);
860 blk_free_flush_queue(q->fq);
861 } else {
862 blk_mq_release(q); 867 blk_mq_release(q);
863 }
864 868
865 blk_trace_shutdown(q); 869 blk_trace_shutdown(q);
866 870
867 if (q->mq_ops) 871 if (queue_is_mq(q))
868 blk_mq_debugfs_unregister(q); 872 blk_mq_debugfs_unregister(q);
869 873
870 bioset_exit(&q->bio_split); 874 bioset_exit(&q->bio_split);
@@ -909,7 +913,7 @@ int blk_register_queue(struct gendisk *disk)
909 WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), 913 WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
910 "%s is registering an already registered queue\n", 914 "%s is registering an already registered queue\n",
911 kobject_name(&dev->kobj)); 915 kobject_name(&dev->kobj));
912 queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); 916 blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
913 917
914 /* 918 /*
915 * SCSI probing may synchronously create and destroy a lot of 919 * SCSI probing may synchronously create and destroy a lot of
@@ -921,9 +925,8 @@ int blk_register_queue(struct gendisk *disk)
921 * request_queues for non-existent devices never get registered. 925 * request_queues for non-existent devices never get registered.
922 */ 926 */
923 if (!blk_queue_init_done(q)) { 927 if (!blk_queue_init_done(q)) {
924 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 928 blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
925 percpu_ref_switch_to_percpu(&q->q_usage_counter); 929 percpu_ref_switch_to_percpu(&q->q_usage_counter);
926 blk_queue_bypass_end(q);
927 } 930 }
928 931
929 ret = blk_trace_init_sysfs(dev); 932 ret = blk_trace_init_sysfs(dev);
@@ -939,7 +942,7 @@ int blk_register_queue(struct gendisk *disk)
939 goto unlock; 942 goto unlock;
940 } 943 }
941 944
942 if (q->mq_ops) { 945 if (queue_is_mq(q)) {
943 __blk_mq_register_dev(dev, q); 946 __blk_mq_register_dev(dev, q);
944 blk_mq_debugfs_register(q); 947 blk_mq_debugfs_register(q);
945 } 948 }
@@ -950,7 +953,7 @@ int blk_register_queue(struct gendisk *disk)
950 953
951 blk_throtl_register_queue(q); 954 blk_throtl_register_queue(q);
952 955
953 if (q->request_fn || (q->mq_ops && q->elevator)) { 956 if (q->elevator) {
954 ret = elv_register_queue(q); 957 ret = elv_register_queue(q);
955 if (ret) { 958 if (ret) {
956 mutex_unlock(&q->sysfs_lock); 959 mutex_unlock(&q->sysfs_lock);
@@ -999,7 +1002,7 @@ void blk_unregister_queue(struct gendisk *disk)
999 * Remove the sysfs attributes before unregistering the queue data 1002 * Remove the sysfs attributes before unregistering the queue data
1000 * structures that can be modified through sysfs. 1003 * structures that can be modified through sysfs.
1001 */ 1004 */
1002 if (q->mq_ops) 1005 if (queue_is_mq(q))
1003 blk_mq_unregister_dev(disk_to_dev(disk), q); 1006 blk_mq_unregister_dev(disk_to_dev(disk), q);
1004 mutex_unlock(&q->sysfs_lock); 1007 mutex_unlock(&q->sysfs_lock);
1005 1008
@@ -1008,7 +1011,7 @@ void blk_unregister_queue(struct gendisk *disk)
1008 blk_trace_remove_sysfs(disk_to_dev(disk)); 1011 blk_trace_remove_sysfs(disk_to_dev(disk));
1009 1012
1010 mutex_lock(&q->sysfs_lock); 1013 mutex_lock(&q->sysfs_lock);
1011 if (q->request_fn || (q->mq_ops && q->elevator)) 1014 if (q->elevator)
1012 elv_unregister_queue(q); 1015 elv_unregister_queue(q);
1013 mutex_unlock(&q->sysfs_lock); 1016 mutex_unlock(&q->sysfs_lock);
1014 1017
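
The new io_timeout attribute added in the blk-sysfs.c hunk reports and accepts the queue request timeout in milliseconds (jiffies_to_msecs() on show, msecs_to_jiffies() on store, with 0 rejected as -EINVAL). Assuming it lands in the usual per-queue location, e.g. /sys/block/sda/queue/io_timeout, a small userspace C snippet to read and retune it could look like this; the device name is only an example:

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/block/sda/queue/io_timeout";  /* example device */
        unsigned int msecs;
        FILE *f;

        f = fopen(path, "r");
        if (!f) {
            perror("open io_timeout for read");
            return 1;
        }
        if (fscanf(f, "%u", &msecs) == 1)
            printf("current io_timeout: %u ms\n", msecs);
        fclose(f);

        f = fopen(path, "w");               /* writing needs root */
        if (!f) {
            perror("open io_timeout for write");
            return 1;
        }
        fprintf(f, "60000\n");              /* 60 s; "0" is rejected with -EINVAL */
        fclose(f);
        return 0;
    }
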
diff --git a/block/blk-tag.c b/block/blk-tag.c
deleted file mode 100644
index fbc153aef166..000000000000
--- a/block/blk-tag.c
+++ /dev/null
@@ -1,378 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Functions related to tagged command queuing
4 */
5#include <linux/kernel.h>
6#include <linux/module.h>
7#include <linux/bio.h>
8#include <linux/blkdev.h>
9#include <linux/slab.h>
10
11#include "blk.h"
12
13/**
14 * blk_queue_find_tag - find a request by its tag and queue
15 * @q: The request queue for the device
16 * @tag: The tag of the request
17 *
18 * Notes:
19 * Should be used when a device returns a tag and you want to match
20 * it with a request.
21 *
22 * no locks need be held.
23 **/
24struct request *blk_queue_find_tag(struct request_queue *q, int tag)
25{
26 return blk_map_queue_find_tag(q->queue_tags, tag);
27}
28EXPORT_SYMBOL(blk_queue_find_tag);
29
30/**
31 * blk_free_tags - release a given set of tag maintenance info
32 * @bqt: the tag map to free
33 *
34 * Drop the reference count on @bqt and frees it when the last reference
35 * is dropped.
36 */
37void blk_free_tags(struct blk_queue_tag *bqt)
38{
39 if (atomic_dec_and_test(&bqt->refcnt)) {
40 BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
41 bqt->max_depth);
42
43 kfree(bqt->tag_index);
44 bqt->tag_index = NULL;
45
46 kfree(bqt->tag_map);
47 bqt->tag_map = NULL;
48
49 kfree(bqt);
50 }
51}
52EXPORT_SYMBOL(blk_free_tags);
53
54/**
55 * __blk_queue_free_tags - release tag maintenance info
56 * @q: the request queue for the device
57 *
58 * Notes:
59 * blk_cleanup_queue() will take care of calling this function, if tagging
60 * has been used. So there's no need to call this directly.
61 **/
62void __blk_queue_free_tags(struct request_queue *q)
63{
64 struct blk_queue_tag *bqt = q->queue_tags;
65
66 if (!bqt)
67 return;
68
69 blk_free_tags(bqt);
70
71 q->queue_tags = NULL;
72 queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
73}
74
75/**
76 * blk_queue_free_tags - release tag maintenance info
77 * @q: the request queue for the device
78 *
79 * Notes:
80 * This is used to disable tagged queuing to a device, yet leave
81 * queue in function.
82 **/
83void blk_queue_free_tags(struct request_queue *q)
84{
85 queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
86}
87EXPORT_SYMBOL(blk_queue_free_tags);
88
89static int
90init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
91{
92 struct request **tag_index;
93 unsigned long *tag_map;
94 int nr_ulongs;
95
96 if (q && depth > q->nr_requests * 2) {
97 depth = q->nr_requests * 2;
98 printk(KERN_ERR "%s: adjusted depth to %d\n",
99 __func__, depth);
100 }
101
102 tag_index = kcalloc(depth, sizeof(struct request *), GFP_ATOMIC);
103 if (!tag_index)
104 goto fail;
105
106 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
107 tag_map = kcalloc(nr_ulongs, sizeof(unsigned long), GFP_ATOMIC);
108 if (!tag_map)
109 goto fail;
110
111 tags->real_max_depth = depth;
112 tags->max_depth = depth;
113 tags->tag_index = tag_index;
114 tags->tag_map = tag_map;
115
116 return 0;
117fail:
118 kfree(tag_index);
119 return -ENOMEM;
120}
121
122static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
123 int depth, int alloc_policy)
124{
125 struct blk_queue_tag *tags;
126
127 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
128 if (!tags)
129 goto fail;
130
131 if (init_tag_map(q, tags, depth))
132 goto fail;
133
134 atomic_set(&tags->refcnt, 1);
135 tags->alloc_policy = alloc_policy;
136 tags->next_tag = 0;
137 return tags;
138fail:
139 kfree(tags);
140 return NULL;
141}
142
143/**
144 * blk_init_tags - initialize the tag info for an external tag map
145 * @depth: the maximum queue depth supported
146 * @alloc_policy: tag allocation policy
147 **/
148struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy)
149{
150 return __blk_queue_init_tags(NULL, depth, alloc_policy);
151}
152EXPORT_SYMBOL(blk_init_tags);
153
154/**
155 * blk_queue_init_tags - initialize the queue tag info
156 * @q: the request queue for the device
157 * @depth: the maximum queue depth supported
158 * @tags: the tag to use
159 * @alloc_policy: tag allocation policy
160 *
161 * Queue lock must be held here if the function is called to resize an
162 * existing map.
163 **/
164int blk_queue_init_tags(struct request_queue *q, int depth,
165 struct blk_queue_tag *tags, int alloc_policy)
166{
167 int rc;
168
169 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
170
171 if (!tags && !q->queue_tags) {
172 tags = __blk_queue_init_tags(q, depth, alloc_policy);
173
174 if (!tags)
175 return -ENOMEM;
176
177 } else if (q->queue_tags) {
178 rc = blk_queue_resize_tags(q, depth);
179 if (rc)
180 return rc;
181 queue_flag_set(QUEUE_FLAG_QUEUED, q);
182 return 0;
183 } else
184 atomic_inc(&tags->refcnt);
185
186 /*
187 * assign it, all done
188 */
189 q->queue_tags = tags;
190 queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
191 return 0;
192}
193EXPORT_SYMBOL(blk_queue_init_tags);
194
195/**
196 * blk_queue_resize_tags - change the queueing depth
197 * @q: the request queue for the device
198 * @new_depth: the new max command queueing depth
199 *
200 * Notes:
201 * Must be called with the queue lock held.
202 **/
203int blk_queue_resize_tags(struct request_queue *q, int new_depth)
204{
205 struct blk_queue_tag *bqt = q->queue_tags;
206 struct request **tag_index;
207 unsigned long *tag_map;
208 int max_depth, nr_ulongs;
209
210 if (!bqt)
211 return -ENXIO;
212
213 /*
214 * if we already have large enough real_max_depth. just
215 * adjust max_depth. *NOTE* as requests with tag value
216 * between new_depth and real_max_depth can be in-flight, tag
217 * map can not be shrunk blindly here.
218 */
219 if (new_depth <= bqt->real_max_depth) {
220 bqt->max_depth = new_depth;
221 return 0;
222 }
223
224 /*
225 * Currently cannot replace a shared tag map with a new
226 * one, so error out if this is the case
227 */
228 if (atomic_read(&bqt->refcnt) != 1)
229 return -EBUSY;
230
231 /*
232 * save the old state info, so we can copy it back
233 */
234 tag_index = bqt->tag_index;
235 tag_map = bqt->tag_map;
236 max_depth = bqt->real_max_depth;
237
238 if (init_tag_map(q, bqt, new_depth))
239 return -ENOMEM;
240
241 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
242 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
243 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
244
245 kfree(tag_index);
246 kfree(tag_map);
247 return 0;
248}
249EXPORT_SYMBOL(blk_queue_resize_tags);
250
251/**
252 * blk_queue_end_tag - end tag operations for a request
253 * @q: the request queue for the device
254 * @rq: the request that has completed
255 *
256 * Description:
257 * Typically called when end_that_request_first() returns %0, meaning
258 * all transfers have been done for a request. It's important to call
259 * this function before end_that_request_last(), as that will put the
260 * request back on the free list thus corrupting the internal tag list.
261 **/
262void blk_queue_end_tag(struct request_queue *q, struct request *rq)
263{
264 struct blk_queue_tag *bqt = q->queue_tags;
265 unsigned tag = rq->tag; /* negative tags invalid */
266
267 lockdep_assert_held(q->queue_lock);
268
269 BUG_ON(tag >= bqt->real_max_depth);
270
271 list_del_init(&rq->queuelist);
272 rq->rq_flags &= ~RQF_QUEUED;
273 rq->tag = -1;
274 rq->internal_tag = -1;
275
276 if (unlikely(bqt->tag_index[tag] == NULL))
277 printk(KERN_ERR "%s: tag %d is missing\n",
278 __func__, tag);
279
280 bqt->tag_index[tag] = NULL;
281
282 if (unlikely(!test_bit(tag, bqt->tag_map))) {
283 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
284 __func__, tag);
285 return;
286 }
287 /*
288 * The tag_map bit acts as a lock for tag_index[bit], so we need
289 * unlock memory barrier semantics.
290 */
291 clear_bit_unlock(tag, bqt->tag_map);
292}
293
294/**
295 * blk_queue_start_tag - find a free tag and assign it
296 * @q: the request queue for the device
297 * @rq: the block request that needs tagging
298 *
299 * Description:
300 * This can either be used as a stand-alone helper, or possibly be
301 * assigned as the queue &prep_rq_fn (in which case &struct request
302 * automagically gets a tag assigned). Note that this function
303 * assumes that any type of request can be queued! if this is not
304 * true for your device, you must check the request type before
305 * calling this function. The request will also be removed from
306 * the request queue, so it's the drivers responsibility to readd
307 * it if it should need to be restarted for some reason.
308 **/
309int blk_queue_start_tag(struct request_queue *q, struct request *rq)
310{
311 struct blk_queue_tag *bqt = q->queue_tags;
312 unsigned max_depth;
313 int tag;
314
315 lockdep_assert_held(q->queue_lock);
316
317 if (unlikely((rq->rq_flags & RQF_QUEUED))) {
318 printk(KERN_ERR
319 "%s: request %p for device [%s] already tagged %d",
320 __func__, rq,
321 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
322 BUG();
323 }
324
325 /*
326 * Protect against shared tag maps, as we may not have exclusive
327 * access to the tag map.
328 *
329 * We reserve a few tags just for sync IO, since we don't want
330 * to starve sync IO on behalf of flooding async IO.
331 */
332 max_depth = bqt->max_depth;
333 if (!rq_is_sync(rq) && max_depth > 1) {
334 switch (max_depth) {
335 case 2:
336 max_depth = 1;
337 break;
338 case 3:
339 max_depth = 2;
340 break;
341 default:
342 max_depth -= 2;
343 }
344 if (q->in_flight[BLK_RW_ASYNC] > max_depth)
345 return 1;
346 }
347
348 do {
349 if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) {
350 tag = find_first_zero_bit(bqt->tag_map, max_depth);
351 if (tag >= max_depth)
352 return 1;
353 } else {
354 int start = bqt->next_tag;
355 int size = min_t(int, bqt->max_depth, max_depth + start);
356 tag = find_next_zero_bit(bqt->tag_map, size, start);
357 if (tag >= size && start + size > bqt->max_depth) {
358 size = start + size - bqt->max_depth;
359 tag = find_first_zero_bit(bqt->tag_map, size);
360 }
361 if (tag >= size)
362 return 1;
363 }
364
365 } while (test_and_set_bit_lock(tag, bqt->tag_map));
366 /*
367 * We need lock ordering semantics given by test_and_set_bit_lock.
368 * See blk_queue_end_tag for details.
369 */
370
371 bqt->next_tag = (tag + 1) % bqt->max_depth;
372 rq->rq_flags |= RQF_QUEUED;
373 rq->tag = tag;
374 bqt->tag_index[tag] = rq;
375 blk_start_request(rq);
376 return 0;
377}
378EXPORT_SYMBOL(blk_queue_start_tag);
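
The deleted blk-tag.c was the legacy bitmap tag allocator behind blk_queue_start_tag()/blk_queue_end_tag(); blk-mq drivers get their tags from the per-tagset sbitmap machinery instead. At its core the removed code was a find-lowest-free-bit / test-and-set loop, which the stripped-down userspace model below reproduces; it has no shared maps, sync reservations or request bookkeeping, and the atomic builtins stand in for test_and_set_bit_lock()/clear_bit_unlock():

    #include <stdio.h>

    #define MAX_DEPTH 8                     /* depth <= bits in one unsigned long */

    struct tag_map {
        unsigned long map;                  /* one bit per tag */
        int depth;
    };

    /* Lowest-free-bit policy, as the removed BLK_TAG_ALLOC_FIFO path did. */
    static int tag_alloc(struct tag_map *t)
    {
        for (int tag = 0; tag < t->depth; tag++) {
            unsigned long bit = 1UL << tag;

            /* test_and_set_bit_lock() in the kernel; an atomic OR here */
            if (!(__atomic_fetch_or(&t->map, bit, __ATOMIC_ACQUIRE) & bit))
                return tag;
        }
        return -1;                          /* map full: caller must retry later */
    }

    static void tag_free(struct tag_map *t, int tag)
    {
        /* clear_bit_unlock() pairs with the acquire above */
        __atomic_fetch_and(&t->map, ~(1UL << tag), __ATOMIC_RELEASE);
    }

    int main(void)
    {
        struct tag_map t = { .map = 0, .depth = MAX_DEPTH };
        int a = tag_alloc(&t);
        int b = tag_alloc(&t);

        printf("got tags %d and %d\n", a, b);       /* 0 and 1 */
        tag_free(&t, a);
        printf("reused tag %d\n", tag_alloc(&t));   /* 0 again */
        return 0;
    }
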
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index db1a3a2ae006..1b97a73d2fb1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1243,7 +1243,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
1243 bool dispatched; 1243 bool dispatched;
1244 int ret; 1244 int ret;
1245 1245
1246 spin_lock_irq(q->queue_lock); 1246 spin_lock_irq(&q->queue_lock);
1247 if (throtl_can_upgrade(td, NULL)) 1247 if (throtl_can_upgrade(td, NULL))
1248 throtl_upgrade_state(td); 1248 throtl_upgrade_state(td);
1249 1249
@@ -1266,9 +1266,9 @@ again:
1266 break; 1266 break;
1267 1267
1268 /* this dispatch windows is still open, relax and repeat */ 1268 /* this dispatch windows is still open, relax and repeat */
1269 spin_unlock_irq(q->queue_lock); 1269 spin_unlock_irq(&q->queue_lock);
1270 cpu_relax(); 1270 cpu_relax();
1271 spin_lock_irq(q->queue_lock); 1271 spin_lock_irq(&q->queue_lock);
1272 } 1272 }
1273 1273
1274 if (!dispatched) 1274 if (!dispatched)
@@ -1290,7 +1290,7 @@ again:
1290 queue_work(kthrotld_workqueue, &td->dispatch_work); 1290 queue_work(kthrotld_workqueue, &td->dispatch_work);
1291 } 1291 }
1292out_unlock: 1292out_unlock:
1293 spin_unlock_irq(q->queue_lock); 1293 spin_unlock_irq(&q->queue_lock);
1294} 1294}
1295 1295
1296/** 1296/**
@@ -1314,11 +1314,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1314 1314
1315 bio_list_init(&bio_list_on_stack); 1315 bio_list_init(&bio_list_on_stack);
1316 1316
1317 spin_lock_irq(q->queue_lock); 1317 spin_lock_irq(&q->queue_lock);
1318 for (rw = READ; rw <= WRITE; rw++) 1318 for (rw = READ; rw <= WRITE; rw++)
1319 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1319 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
1320 bio_list_add(&bio_list_on_stack, bio); 1320 bio_list_add(&bio_list_on_stack, bio);
1321 spin_unlock_irq(q->queue_lock); 1321 spin_unlock_irq(&q->queue_lock);
1322 1322
1323 if (!bio_list_empty(&bio_list_on_stack)) { 1323 if (!bio_list_empty(&bio_list_on_stack)) {
1324 blk_start_plug(&plug); 1324 blk_start_plug(&plug);
@@ -2115,16 +2115,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
2115} 2115}
2116#endif 2116#endif
2117 2117
2118static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
2119{
2120#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2121 /* fallback to root_blkg if we fail to get a blkg ref */
2122 if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
2123 bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
2124 bio_issue_init(&bio->bi_issue, bio_sectors(bio));
2125#endif
2126}
2127
2128bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, 2118bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
2129 struct bio *bio) 2119 struct bio *bio)
2130{ 2120{
@@ -2141,14 +2131,10 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
2141 if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) 2131 if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
2142 goto out; 2132 goto out;
2143 2133
2144 spin_lock_irq(q->queue_lock); 2134 spin_lock_irq(&q->queue_lock);
2145 2135
2146 throtl_update_latency_buckets(td); 2136 throtl_update_latency_buckets(td);
2147 2137
2148 if (unlikely(blk_queue_bypass(q)))
2149 goto out_unlock;
2150
2151 blk_throtl_assoc_bio(tg, bio);
2152 blk_throtl_update_idletime(tg); 2138 blk_throtl_update_idletime(tg);
2153 2139
2154 sq = &tg->service_queue; 2140 sq = &tg->service_queue;
@@ -2227,7 +2213,7 @@ again:
2227 } 2213 }
2228 2214
2229out_unlock: 2215out_unlock:
2230 spin_unlock_irq(q->queue_lock); 2216 spin_unlock_irq(&q->queue_lock);
2231out: 2217out:
2232 bio_set_flag(bio, BIO_THROTTLED); 2218 bio_set_flag(bio, BIO_THROTTLED);
2233 2219
@@ -2348,7 +2334,7 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
2348 * Dispatch all currently throttled bios on @q through ->make_request_fn(). 2334 * Dispatch all currently throttled bios on @q through ->make_request_fn().
2349 */ 2335 */
2350void blk_throtl_drain(struct request_queue *q) 2336void blk_throtl_drain(struct request_queue *q)
2351 __releases(q->queue_lock) __acquires(q->queue_lock) 2337 __releases(&q->queue_lock) __acquires(&q->queue_lock)
2352{ 2338{
2353 struct throtl_data *td = q->td; 2339 struct throtl_data *td = q->td;
2354 struct blkcg_gq *blkg; 2340 struct blkcg_gq *blkg;
@@ -2356,7 +2342,6 @@ void blk_throtl_drain(struct request_queue *q)
2356 struct bio *bio; 2342 struct bio *bio;
2357 int rw; 2343 int rw;
2358 2344
2359 queue_lockdep_assert_held(q);
2360 rcu_read_lock(); 2345 rcu_read_lock();
2361 2346
2362 /* 2347 /*
@@ -2372,7 +2357,7 @@ void blk_throtl_drain(struct request_queue *q)
2372 tg_drain_bios(&td->service_queue); 2357 tg_drain_bios(&td->service_queue);
2373 2358
2374 rcu_read_unlock(); 2359 rcu_read_unlock();
2375 spin_unlock_irq(q->queue_lock); 2360 spin_unlock_irq(&q->queue_lock);
2376 2361
2377 /* all bios now should be in td->service_queue, issue them */ 2362 /* all bios now should be in td->service_queue, issue them */
2378 for (rw = READ; rw <= WRITE; rw++) 2363 for (rw = READ; rw <= WRITE; rw++)
@@ -2380,7 +2365,7 @@ void blk_throtl_drain(struct request_queue *q)
2380 NULL))) 2365 NULL)))
2381 generic_make_request(bio); 2366 generic_make_request(bio);
2382 2367
2383 spin_lock_irq(q->queue_lock); 2368 spin_lock_irq(&q->queue_lock);
2384} 2369}
2385 2370
2386int blk_throtl_init(struct request_queue *q) 2371int blk_throtl_init(struct request_queue *q)
@@ -2460,7 +2445,7 @@ void blk_throtl_register_queue(struct request_queue *q)
2460 td->throtl_slice = DFL_THROTL_SLICE_HD; 2445 td->throtl_slice = DFL_THROTL_SLICE_HD;
2461#endif 2446#endif
2462 2447
2463 td->track_bio_latency = !queue_is_rq_based(q); 2448 td->track_bio_latency = !queue_is_mq(q);
2464 if (!td->track_bio_latency) 2449 if (!td->track_bio_latency)
2465 blk_stat_enable_accounting(q); 2450 blk_stat_enable_accounting(q);
2466} 2451}
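
The mechanical q->queue_lock versus &q->queue_lock churn through blk-throttle.c (and blk-sysfs.c earlier) comes from queue_lock changing from a pointer, which a driver could aim at its own lock, to a spinlock embedded directly in struct request_queue. A trimmed userspace illustration of the two shapes, with a pthread mutex standing in for the kernel spinlock and names chosen only to mirror the struct:

    /* build: cc -pthread lock_shapes.c */
    #include <pthread.h>
    #include <stdio.h>

    /* Before: the queue borrowed a lock through a pointer (possibly the
     * driver's own lock). */
    struct request_queue_old {
        pthread_mutex_t *queue_lock;
    };

    /* After: every queue owns an embedded lock; no indirection, no sharing. */
    struct request_queue_new {
        pthread_mutex_t queue_lock;
    };

    int main(void)
    {
        pthread_mutex_t driver_lock = PTHREAD_MUTEX_INITIALIZER;
        struct request_queue_old oldq = { .queue_lock = &driver_lock };
        struct request_queue_new newq = { .queue_lock = PTHREAD_MUTEX_INITIALIZER };

        pthread_mutex_lock(oldq.queue_lock);        /* spin_lock_irq(q->queue_lock)  */
        pthread_mutex_unlock(oldq.queue_lock);

        pthread_mutex_lock(&newq.queue_lock);       /* spin_lock_irq(&q->queue_lock) */
        pthread_mutex_unlock(&newq.queue_lock);

        puts("both locking shapes exercised");
        return 0;
    }
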
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index f2cfd56e1606..124c26128bf6 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -68,80 +68,6 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
68 68
69#endif /* CONFIG_FAIL_IO_TIMEOUT */ 69#endif /* CONFIG_FAIL_IO_TIMEOUT */
70 70
71/*
72 * blk_delete_timer - Delete/cancel timer for a given function.
73 * @req: request that we are canceling timer for
74 *
75 */
76void blk_delete_timer(struct request *req)
77{
78 list_del_init(&req->timeout_list);
79}
80
81static void blk_rq_timed_out(struct request *req)
82{
83 struct request_queue *q = req->q;
84 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
85
86 if (q->rq_timed_out_fn)
87 ret = q->rq_timed_out_fn(req);
88 switch (ret) {
89 case BLK_EH_RESET_TIMER:
90 blk_add_timer(req);
91 blk_clear_rq_complete(req);
92 break;
93 case BLK_EH_DONE:
94 /*
95 * LLD handles this for now but in the future
96 * we can send a request msg to abort the command
97 * and we can move more of the generic scsi eh code to
98 * the blk layer.
99 */
100 break;
101 default:
102 printk(KERN_ERR "block: bad eh return: %d\n", ret);
103 break;
104 }
105}
106
107static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
108 unsigned int *next_set)
109{
110 const unsigned long deadline = blk_rq_deadline(rq);
111
112 if (time_after_eq(jiffies, deadline)) {
113 list_del_init(&rq->timeout_list);
114
115 /*
116 * Check if we raced with end io completion
117 */
118 if (!blk_mark_rq_complete(rq))
119 blk_rq_timed_out(rq);
120 } else if (!*next_set || time_after(*next_timeout, deadline)) {
121 *next_timeout = deadline;
122 *next_set = 1;
123 }
124}
125
126void blk_timeout_work(struct work_struct *work)
127{
128 struct request_queue *q =
129 container_of(work, struct request_queue, timeout_work);
130 unsigned long flags, next = 0;
131 struct request *rq, *tmp;
132 int next_set = 0;
133
134 spin_lock_irqsave(q->queue_lock, flags);
135
136 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
137 blk_rq_check_expired(rq, &next, &next_set);
138
139 if (next_set)
140 mod_timer(&q->timeout, round_jiffies_up(next));
141
142 spin_unlock_irqrestore(q->queue_lock, flags);
143}
144
145/** 71/**
146 * blk_abort_request -- Request request recovery for the specified command 72 * blk_abort_request -- Request request recovery for the specified command
147 * @req: pointer to the request of interest 73 * @req: pointer to the request of interest
@@ -149,24 +75,17 @@ void blk_timeout_work(struct work_struct *work)
149 * This function requests that the block layer start recovery for the 75 * This function requests that the block layer start recovery for the
150 * request by deleting the timer and calling the q's timeout function. 76 * request by deleting the timer and calling the q's timeout function.
151 * LLDDs who implement their own error recovery MAY ignore the timeout 77 * LLDDs who implement their own error recovery MAY ignore the timeout
152 * event if they generated blk_abort_req. Must hold queue lock. 78 * event if they generated blk_abort_request.
153 */ 79 */
154void blk_abort_request(struct request *req) 80void blk_abort_request(struct request *req)
155{ 81{
156 if (req->q->mq_ops) { 82 /*
157 /* 83 * All we need to ensure is that timeout scan takes place
158 * All we need to ensure is that timeout scan takes place 84 * immediately and that scan sees the new timeout value.
159 * immediately and that scan sees the new timeout value. 85 * No need for fancy synchronizations.
160 * No need for fancy synchronizations. 86 */
161 */ 87 WRITE_ONCE(req->deadline, jiffies);
162 blk_rq_set_deadline(req, jiffies); 88 kblockd_schedule_work(&req->q->timeout_work);
163 kblockd_schedule_work(&req->q->timeout_work);
164 } else {
165 if (blk_mark_rq_complete(req))
166 return;
167 blk_delete_timer(req);
168 blk_rq_timed_out(req);
169 }
170} 89}
171EXPORT_SYMBOL_GPL(blk_abort_request); 90EXPORT_SYMBOL_GPL(blk_abort_request);
172 91
@@ -194,15 +113,6 @@ void blk_add_timer(struct request *req)
194 struct request_queue *q = req->q; 113 struct request_queue *q = req->q;
195 unsigned long expiry; 114 unsigned long expiry;
196 115
197 if (!q->mq_ops)
198 lockdep_assert_held(q->queue_lock);
199
200 /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
201 if (!q->mq_ops && !q->rq_timed_out_fn)
202 return;
203
204 BUG_ON(!list_empty(&req->timeout_list));
205
206 /* 116 /*
207 * Some LLDs, like scsi, peek at the timeout to prevent a 117 * Some LLDs, like scsi, peek at the timeout to prevent a
208 * command from being retried forever. 118 * command from being retried forever.
@@ -211,21 +121,16 @@ void blk_add_timer(struct request *req)
211 req->timeout = q->rq_timeout; 121 req->timeout = q->rq_timeout;
212 122
213 req->rq_flags &= ~RQF_TIMED_OUT; 123 req->rq_flags &= ~RQF_TIMED_OUT;
214 blk_rq_set_deadline(req, jiffies + req->timeout);
215 124
216 /* 125 expiry = jiffies + req->timeout;
217 * Only the non-mq case needs to add the request to a protected list. 126 WRITE_ONCE(req->deadline, expiry);
218 * For the mq case we simply scan the tag map.
219 */
220 if (!q->mq_ops)
221 list_add_tail(&req->timeout_list, &req->q->timeout_list);
222 127
223 /* 128 /*
224 * If the timer isn't already pending or this timeout is earlier 129 * If the timer isn't already pending or this timeout is earlier
225 * than an existing one, modify the timer. Round up to next nearest 130 * than an existing one, modify the timer. Round up to next nearest
226 * second. 131 * second.
227 */ 132 */
228 expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); 133 expiry = blk_rq_timeout(round_jiffies_up(expiry));
229 134
230 if (!timer_pending(&q->timeout) || 135 if (!timer_pending(&q->timeout) ||
231 time_before(expiry, q->timeout.expires)) { 136 time_before(expiry, q->timeout.expires)) {
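
With the legacy timeout_list handling gone, blk_add_timer() simply publishes the request deadline with WRITE_ONCE() and re-arms the queue timer on second granularity so nearby timeouts batch into one timer fire. The arithmetic is easy to model in isolation; in the sketch below jiffies is a plain integer at an assumed HZ=250, and the blk_rq_timeout() clamp is a simplified stand-in rather than the real helper:

    #include <stdio.h>

    #define HZ              250UL
    #define BLK_MAX_TIMEOUT (5 * HZ)        /* max future timer expiry, per blk.h */

    /* Round up to the next whole-second jiffies boundary (simplified stand-in
     * for the kernel's round_jiffies_up()). */
    static unsigned long round_jiffies_up(unsigned long j)
    {
        return ((j + HZ - 1) / HZ) * HZ;
    }

    /* Simplified stand-in for blk_rq_timeout(): cap how far in the future the
     * queue timer may be pushed; the timeout scan re-arms it as needed. */
    static unsigned long blk_rq_timeout(unsigned long timeout, unsigned long now)
    {
        unsigned long maximum = round_jiffies_up(now + BLK_MAX_TIMEOUT);

        return timeout < maximum ? timeout : maximum;
    }

    int main(void)
    {
        unsigned long jiffies = 100003;             /* "now" */
        unsigned long rq_timeout = 30 * HZ;         /* 30 s request timeout */
        unsigned long deadline = jiffies + rq_timeout;  /* WRITE_ONCE(req->deadline, ...) */
        unsigned long expiry = blk_rq_timeout(round_jiffies_up(deadline), jiffies);
        unsigned long timer_expires = 0;            /* 0: queue timer not pending */

        /* Re-arm only if nothing is pending or the new expiry is earlier. */
        if (!timer_expires || expiry < timer_expires)
            timer_expires = expiry;

        printf("deadline=%lu expiry=%lu timer=%lu\n",
               deadline, expiry, timer_expires);
        return 0;
    }
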
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 8ac93fcbaa2e..f0c56649775f 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -489,31 +489,21 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
489} 489}
490 490
491struct wbt_wait_data { 491struct wbt_wait_data {
492 struct wait_queue_entry wq;
493 struct task_struct *task;
494 struct rq_wb *rwb; 492 struct rq_wb *rwb;
495 struct rq_wait *rqw; 493 enum wbt_flags wb_acct;
496 unsigned long rw; 494 unsigned long rw;
497 bool got_token;
498}; 495};
499 496
500static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, 497static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
501 int wake_flags, void *key)
502{ 498{
503 struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, 499 struct wbt_wait_data *data = private_data;
504 wq); 500 return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
505 501}
506 /*
507 * If we fail to get a budget, return -1 to interrupt the wake up
508 * loop in __wake_up_common.
509 */
510 if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
511 return -1;
512 502
513 data->got_token = true; 503static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
514 list_del_init(&curr->entry); 504{
515 wake_up_process(data->task); 505 struct wbt_wait_data *data = private_data;
516 return 1; 506 wbt_rqw_done(data->rwb, rqw, data->wb_acct);
517} 507}
518 508
519/* 509/*
@@ -521,57 +511,16 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
521 * the timer to kick off queuing again. 511 * the timer to kick off queuing again.
522 */ 512 */
523static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, 513static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
524 unsigned long rw, spinlock_t *lock) 514 unsigned long rw)
525 __releases(lock)
526 __acquires(lock)
527{ 515{
528 struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); 516 struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
529 struct wbt_wait_data data = { 517 struct wbt_wait_data data = {
530 .wq = {
531 .func = wbt_wake_function,
532 .entry = LIST_HEAD_INIT(data.wq.entry),
533 },
534 .task = current,
535 .rwb = rwb, 518 .rwb = rwb,
536 .rqw = rqw, 519 .wb_acct = wb_acct,
537 .rw = rw, 520 .rw = rw,
538 }; 521 };
539 bool has_sleeper;
540
541 has_sleeper = wq_has_sleeper(&rqw->wait);
542 if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
543 return;
544 522
545 prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); 523 rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
546 do {
547 if (data.got_token)
548 break;
549
550 if (!has_sleeper &&
551 rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
552 finish_wait(&rqw->wait, &data.wq);
553
554 /*
555 * We raced with wbt_wake_function() getting a token,
556 * which means we now have two. Put our local token
557 * and wake anyone else potentially waiting for one.
558 */
559 if (data.got_token)
560 wbt_rqw_done(rwb, rqw, wb_acct);
561 break;
562 }
563
564 if (lock) {
565 spin_unlock_irq(lock);
566 io_schedule();
567 spin_lock_irq(lock);
568 } else
569 io_schedule();
570
571 has_sleeper = false;
572 } while (1);
573
574 finish_wait(&rqw->wait, &data.wq);
575} 524}
576 525
577static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) 526static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -624,7 +573,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
624 * in an irq held spinlock, if it holds one when calling this function. 573 * in an irq held spinlock, if it holds one when calling this function.
625 * If we do sleep, we'll release and re-grab it. 574 * If we do sleep, we'll release and re-grab it.
626 */ 575 */
627static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) 576static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
628{ 577{
629 struct rq_wb *rwb = RQWB(rqos); 578 struct rq_wb *rwb = RQWB(rqos);
630 enum wbt_flags flags; 579 enum wbt_flags flags;
@@ -636,7 +585,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
636 return; 585 return;
637 } 586 }
638 587
639 __wbt_wait(rwb, flags, bio->bi_opf, lock); 588 __wbt_wait(rwb, flags, bio->bi_opf);
640 589
641 if (!blk_stat_is_active(rwb->cb)) 590 if (!blk_stat_is_active(rwb->cb))
642 rwb_arm_timer(rwb); 591 rwb_arm_timer(rwb);
@@ -709,8 +658,7 @@ void wbt_enable_default(struct request_queue *q)
709 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) 658 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
710 return; 659 return;
711 660
712 if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || 661 if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
713 (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
714 wbt_init(q); 662 wbt_init(q);
715} 663}
716EXPORT_SYMBOL_GPL(wbt_enable_default); 664EXPORT_SYMBOL_GPL(wbt_enable_default);
@@ -760,11 +708,100 @@ void wbt_disable_default(struct request_queue *q)
760 if (!rqos) 708 if (!rqos)
761 return; 709 return;
762 rwb = RQWB(rqos); 710 rwb = RQWB(rqos);
763 if (rwb->enable_state == WBT_STATE_ON_DEFAULT) 711 if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
712 blk_stat_deactivate(rwb->cb);
764 rwb->wb_normal = 0; 713 rwb->wb_normal = 0;
714 }
765} 715}
766EXPORT_SYMBOL_GPL(wbt_disable_default); 716EXPORT_SYMBOL_GPL(wbt_disable_default);
767 717
718#ifdef CONFIG_BLK_DEBUG_FS
719static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
720{
721 struct rq_qos *rqos = data;
722 struct rq_wb *rwb = RQWB(rqos);
723
724 seq_printf(m, "%llu\n", rwb->cur_win_nsec);
725 return 0;
726}
727
728static int wbt_enabled_show(void *data, struct seq_file *m)
729{
730 struct rq_qos *rqos = data;
731 struct rq_wb *rwb = RQWB(rqos);
732
733 seq_printf(m, "%d\n", rwb->enable_state);
734 return 0;
735}
736
737static int wbt_id_show(void *data, struct seq_file *m)
738{
739 struct rq_qos *rqos = data;
740
741 seq_printf(m, "%u\n", rqos->id);
742 return 0;
743}
744
745static int wbt_inflight_show(void *data, struct seq_file *m)
746{
747 struct rq_qos *rqos = data;
748 struct rq_wb *rwb = RQWB(rqos);
749 int i;
750
751 for (i = 0; i < WBT_NUM_RWQ; i++)
752 seq_printf(m, "%d: inflight %d\n", i,
753 atomic_read(&rwb->rq_wait[i].inflight));
754 return 0;
755}
756
757static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
758{
759 struct rq_qos *rqos = data;
760 struct rq_wb *rwb = RQWB(rqos);
761
762 seq_printf(m, "%lu\n", rwb->min_lat_nsec);
763 return 0;
764}
765
766static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
767{
768 struct rq_qos *rqos = data;
769 struct rq_wb *rwb = RQWB(rqos);
770
771 seq_printf(m, "%u\n", rwb->unknown_cnt);
772 return 0;
773}
774
775static int wbt_normal_show(void *data, struct seq_file *m)
776{
777 struct rq_qos *rqos = data;
778 struct rq_wb *rwb = RQWB(rqos);
779
780 seq_printf(m, "%u\n", rwb->wb_normal);
781 return 0;
782}
783
784static int wbt_background_show(void *data, struct seq_file *m)
785{
786 struct rq_qos *rqos = data;
787 struct rq_wb *rwb = RQWB(rqos);
788
789 seq_printf(m, "%u\n", rwb->wb_background);
790 return 0;
791}
792
793static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
794 {"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
795 {"enabled", 0400, wbt_enabled_show},
796 {"id", 0400, wbt_id_show},
797 {"inflight", 0400, wbt_inflight_show},
798 {"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
799 {"unknown_cnt", 0400, wbt_unknown_cnt_show},
800 {"wb_normal", 0400, wbt_normal_show},
801 {"wb_background", 0400, wbt_background_show},
802 {},
803};
804#endif
768 805
769static struct rq_qos_ops wbt_rqos_ops = { 806static struct rq_qos_ops wbt_rqos_ops = {
770 .throttle = wbt_wait, 807 .throttle = wbt_wait,
@@ -774,6 +811,9 @@ static struct rq_qos_ops wbt_rqos_ops = {
774 .done = wbt_done, 811 .done = wbt_done,
775 .cleanup = wbt_cleanup, 812 .cleanup = wbt_cleanup,
776 .exit = wbt_exit, 813 .exit = wbt_exit,
814#ifdef CONFIG_BLK_DEBUG_FS
815 .debugfs_attrs = wbt_debugfs_attrs,
816#endif
777}; 817};
778 818
779int wbt_init(struct request_queue *q) 819int wbt_init(struct request_queue *q)
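
The blk-wbt.c rewrite above replaces the open-coded prepare_to_wait_exclusive() loop with the generic rq_qos_wait() declared in the blk-rq-qos.h hunk earlier: the caller supplies an acquire_inflight_cb() that tries to take an inflight slot and reports success, plus a cleanup_cb() that gives a slot back if a woken waiter has to bail out. The single-threaded userspace caricature below only shows that callback contract; there is no real sleeping, and "waiting" is faked by retiring one in-flight IO per retry:

    #include <stdio.h>
    #include <stdbool.h>

    struct rq_wait {
        int inflight;
        int limit;
    };

    typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
    typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);

    /* Toy stand-in for rq_qos_wait(): keep retrying the acquire callback,
     * "sleeping" (here: retiring one in-flight IO) between attempts. */
    static void rq_qos_wait(struct rq_wait *rqw, void *private_data,
                            acquire_inflight_cb_t *acquire_inflight_cb,
                            cleanup_cb_t *cleanup_cb)
    {
        (void)cleanup_cb;   /* only needed when a woken waiter has to bail out */

        while (!acquire_inflight_cb(rqw, private_data)) {
            printf("  limit hit (%d/%d), waiting...\n",
                   rqw->inflight, rqw->limit);
            rqw->inflight--;                /* pretend a completion woke us */
        }
    }

    /* Roughly what wbt_inflight_cb() boils down to: take a slot if below limit. */
    static bool inflight_cb(struct rq_wait *rqw, void *private_data)
    {
        (void)private_data;
        if (rqw->inflight >= rqw->limit)
            return false;
        rqw->inflight++;
        return true;
    }

    static void cleanup_cb(struct rq_wait *rqw, void *private_data)
    {
        (void)private_data;
        rqw->inflight--;
    }

    int main(void)
    {
        struct rq_wait rqw = { .inflight = 0, .limit = 2 };

        for (int i = 0; i < 4; i++) {
            printf("throttling bio %d\n", i);
            rq_qos_wait(&rqw, NULL, inflight_cb, cleanup_cb);
        }
        printf("inflight now %d\n", rqw.inflight);
        return 0;
    }
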
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index a327bef07642..2d98803faec2 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -421,7 +421,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
421 * BIO based queues do not use a scheduler so only q->nr_zones 421 * BIO based queues do not use a scheduler so only q->nr_zones
422 * needs to be updated so that the sysfs exposed value is correct. 422 * needs to be updated so that the sysfs exposed value is correct.
423 */ 423 */
424 if (!queue_is_rq_based(q)) { 424 if (!queue_is_mq(q)) {
425 q->nr_zones = nr_zones; 425 q->nr_zones = nr_zones;
426 return 0; 426 return 0;
427 } 427 }
diff --git a/block/blk.h b/block/blk.h
index 0089fefdf771..848278c52030 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -7,12 +7,6 @@
7#include <xen/xen.h> 7#include <xen/xen.h>
8#include "blk-mq.h" 8#include "blk-mq.h"
9 9
10/* Amount of time in which a process may batch requests */
11#define BLK_BATCH_TIME (HZ/50UL)
12
13/* Number of requests a "batching" process may submit */
14#define BLK_BATCH_REQ 32
15
16/* Max future timer expiry for timeouts */ 10/* Max future timer expiry for timeouts */
17#define BLK_MAX_TIMEOUT (5 * HZ) 11#define BLK_MAX_TIMEOUT (5 * HZ)
18 12
@@ -38,85 +32,13 @@ struct blk_flush_queue {
38}; 32};
39 33
40extern struct kmem_cache *blk_requestq_cachep; 34extern struct kmem_cache *blk_requestq_cachep;
41extern struct kmem_cache *request_cachep;
42extern struct kobj_type blk_queue_ktype; 35extern struct kobj_type blk_queue_ktype;
43extern struct ida blk_queue_ida; 36extern struct ida blk_queue_ida;
44 37
45/* 38static inline struct blk_flush_queue *
46 * @q->queue_lock is set while a queue is being initialized. Since we know 39blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
47 * that no other threads access the queue object before @q->queue_lock has
48 * been set, it is safe to manipulate queue flags without holding the
49 * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and
50 * blk_init_allocated_queue().
51 */
52static inline void queue_lockdep_assert_held(struct request_queue *q)
53{
54 if (q->queue_lock)
55 lockdep_assert_held(q->queue_lock);
56}
57
58static inline void queue_flag_set_unlocked(unsigned int flag,
59 struct request_queue *q)
60{
61 if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
62 kref_read(&q->kobj.kref))
63 lockdep_assert_held(q->queue_lock);
64 __set_bit(flag, &q->queue_flags);
65}
66
67static inline void queue_flag_clear_unlocked(unsigned int flag,
68 struct request_queue *q)
69{
70 if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
71 kref_read(&q->kobj.kref))
72 lockdep_assert_held(q->queue_lock);
73 __clear_bit(flag, &q->queue_flags);
74}
75
76static inline int queue_flag_test_and_clear(unsigned int flag,
77 struct request_queue *q)
78{
79 queue_lockdep_assert_held(q);
80
81 if (test_bit(flag, &q->queue_flags)) {
82 __clear_bit(flag, &q->queue_flags);
83 return 1;
84 }
85
86 return 0;
87}
88
89static inline int queue_flag_test_and_set(unsigned int flag,
90 struct request_queue *q)
91{
92 queue_lockdep_assert_held(q);
93
94 if (!test_bit(flag, &q->queue_flags)) {
95 __set_bit(flag, &q->queue_flags);
96 return 0;
97 }
98
99 return 1;
100}
101
102static inline void queue_flag_set(unsigned int flag, struct request_queue *q)
103{
104 queue_lockdep_assert_held(q);
105 __set_bit(flag, &q->queue_flags);
106}
107
108static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
109{
110 queue_lockdep_assert_held(q);
111 __clear_bit(flag, &q->queue_flags);
112}
113
114static inline struct blk_flush_queue *blk_get_flush_queue(
115 struct request_queue *q, struct blk_mq_ctx *ctx)
116{ 40{
117 if (q->mq_ops) 41 return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq;
118 return blk_mq_map_queue(q, ctx->cpu)->fq;
119 return q->fq;
120} 42}
121 43
122static inline void __blk_get_queue(struct request_queue *q) 44static inline void __blk_get_queue(struct request_queue *q)
@@ -128,15 +50,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
128 int node, int cmd_size, gfp_t flags); 50 int node, int cmd_size, gfp_t flags);
129void blk_free_flush_queue(struct blk_flush_queue *q); 51void blk_free_flush_queue(struct blk_flush_queue *q);
130 52
131int blk_init_rl(struct request_list *rl, struct request_queue *q,
132 gfp_t gfp_mask);
133void blk_exit_rl(struct request_queue *q, struct request_list *rl);
134void blk_exit_queue(struct request_queue *q); 53void blk_exit_queue(struct request_queue *q);
135void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 54void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
136 struct bio *bio); 55 struct bio *bio);
137void blk_queue_bypass_start(struct request_queue *q);
138void blk_queue_bypass_end(struct request_queue *q);
139void __blk_queue_free_tags(struct request_queue *q);
140void blk_freeze_queue(struct request_queue *q); 56void blk_freeze_queue(struct request_queue *q);
141 57
142static inline void blk_queue_enter_live(struct request_queue *q) 58static inline void blk_queue_enter_live(struct request_queue *q)
@@ -235,11 +151,8 @@ static inline bool bio_integrity_endio(struct bio *bio)
235} 151}
236#endif /* CONFIG_BLK_DEV_INTEGRITY */ 152#endif /* CONFIG_BLK_DEV_INTEGRITY */
237 153
238void blk_timeout_work(struct work_struct *work);
239unsigned long blk_rq_timeout(unsigned long timeout); 154unsigned long blk_rq_timeout(unsigned long timeout);
240void blk_add_timer(struct request *req); 155void blk_add_timer(struct request *req);
241void blk_delete_timer(struct request *);
242
243 156
244bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 157bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
245 struct bio *bio); 158 struct bio *bio);
@@ -248,58 +161,19 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
248bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, 161bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
249 struct bio *bio); 162 struct bio *bio);
250bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 163bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
251 unsigned int *request_count,
252 struct request **same_queue_rq); 164 struct request **same_queue_rq);
253unsigned int blk_plug_queued_count(struct request_queue *q);
254 165
255void blk_account_io_start(struct request *req, bool new_io); 166void blk_account_io_start(struct request *req, bool new_io);
256void blk_account_io_completion(struct request *req, unsigned int bytes); 167void blk_account_io_completion(struct request *req, unsigned int bytes);
257void blk_account_io_done(struct request *req, u64 now); 168void blk_account_io_done(struct request *req, u64 now);
258 169
259/* 170/*
260 * EH timer and IO completion will both attempt to 'grab' the request, make
261 * sure that only one of them succeeds. Steal the bottom bit of the
262 * __deadline field for this.
263 */
264static inline int blk_mark_rq_complete(struct request *rq)
265{
266 return test_and_set_bit(0, &rq->__deadline);
267}
268
269static inline void blk_clear_rq_complete(struct request *rq)
270{
271 clear_bit(0, &rq->__deadline);
272}
273
274static inline bool blk_rq_is_complete(struct request *rq)
275{
276 return test_bit(0, &rq->__deadline);
277}
278
279/*
280 * Internal elevator interface 171 * Internal elevator interface
281 */ 172 */
282#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) 173#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
283 174
284void blk_insert_flush(struct request *rq); 175void blk_insert_flush(struct request *rq);
285 176
286static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
287{
288 struct elevator_queue *e = q->elevator;
289
290 if (e->type->ops.sq.elevator_activate_req_fn)
291 e->type->ops.sq.elevator_activate_req_fn(q, rq);
292}
293
294static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
295{
296 struct elevator_queue *e = q->elevator;
297
298 if (e->type->ops.sq.elevator_deactivate_req_fn)
299 e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
300}
301
302int elevator_init(struct request_queue *);
303int elevator_init_mq(struct request_queue *q); 177int elevator_init_mq(struct request_queue *q);
304int elevator_switch_mq(struct request_queue *q, 178int elevator_switch_mq(struct request_queue *q,
305 struct elevator_type *new_e); 179 struct elevator_type *new_e);
@@ -334,31 +208,8 @@ void blk_rq_set_mixed_merge(struct request *rq);
334bool blk_rq_merge_ok(struct request *rq, struct bio *bio); 208bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
335enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); 209enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
336 210
337void blk_queue_congestion_threshold(struct request_queue *q);
338
339int blk_dev_init(void); 211int blk_dev_init(void);
340 212
341
342/*
343 * Return the threshold (number of used requests) at which the queue is
344 * considered to be congested. It includes a little hysteresis to keep the
345 * context switch rate down.
346 */
347static inline int queue_congestion_on_threshold(struct request_queue *q)
348{
349 return q->nr_congestion_on;
350}
351
352/*
353 * The threshold at which a queue is considered to be uncongested
354 */
355static inline int queue_congestion_off_threshold(struct request_queue *q)
356{
357 return q->nr_congestion_off;
358}
359
360extern int blk_update_nr_requests(struct request_queue *, unsigned int);
361
362/* 213/*
363 * Contribute to IO statistics IFF: 214 * Contribute to IO statistics IFF:
364 * 215 *
@@ -381,21 +232,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
381} 232}
382 233
383/* 234/*
384 * Steal a bit from this field for legacy IO path atomic IO marking. Note that
385 * setting the deadline clears the bottom bit, potentially clearing the
386 * completed bit. The user has to be OK with this (current ones are fine).
387 */
388static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
389{
390 rq->__deadline = time & ~0x1UL;
391}
392
393static inline unsigned long blk_rq_deadline(struct request *rq)
394{
395 return rq->__deadline & ~0x1UL;
396}
397
398/*
399 * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size 235
400 * is defined as 'unsigned int'; meanwhile it has to be aligned to the logical 236
401 * block size, which is the minimum unit accepted by hardware. 237
@@ -417,22 +253,6 @@ void ioc_clear_queue(struct request_queue *q);
417int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); 253int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
418 254
419/** 255/**
420 * rq_ioc - determine io_context for request allocation
421 * @bio: request being allocated is for this bio (can be %NULL)
422 *
423 * Determine io_context to use for request allocation for @bio. May return
424 * %NULL if %current->io_context doesn't exist.
425 */
426static inline struct io_context *rq_ioc(struct bio *bio)
427{
428#ifdef CONFIG_BLK_CGROUP
429 if (bio && bio->bi_ioc)
430 return bio->bi_ioc;
431#endif
432 return current->io_context;
433}
434
435/**
436 * create_io_context - try to create task->io_context 256 * create_io_context - try to create task->io_context
437 * @gfp_mask: allocation mask 257 * @gfp_mask: allocation mask
438 * @node: allocation node 258 * @node: allocation node
@@ -490,8 +310,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
490} 310}
491#endif /* CONFIG_BOUNCE */ 311#endif /* CONFIG_BOUNCE */
492 312
493extern void blk_drain_queue(struct request_queue *q);
494
495#ifdef CONFIG_BLK_CGROUP_IOLATENCY 313#ifdef CONFIG_BLK_CGROUP_IOLATENCY
496extern int blk_iolatency_init(struct request_queue *q); 314extern int blk_iolatency_init(struct request_queue *q);
497#else 315#else
diff --git a/block/bounce.c b/block/bounce.c
index 559c55bda040..ffb9e9ecfa7e 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -277,7 +277,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
277 } 277 }
278 } 278 }
279 279
280 bio_clone_blkcg_association(bio, bio_src); 280 bio_clone_blkg_association(bio, bio_src);
281 blkcg_bio_issue_init(bio);
281 282
282 return bio; 283 return bio;
283} 284}
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f3501cdaf1a6..192129856342 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -21,7 +21,7 @@
21 * 21 *
22 */ 22 */
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/blkdev.h> 24#include <linux/blk-mq.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/scatterlist.h> 26#include <linux/scatterlist.h>
27#include <linux/bsg-lib.h> 27#include <linux/bsg-lib.h>
@@ -31,6 +31,12 @@
31 31
32#define uptr64(val) ((void __user *)(uintptr_t)(val)) 32#define uptr64(val) ((void __user *)(uintptr_t)(val))
33 33
34struct bsg_set {
35 struct blk_mq_tag_set tag_set;
36 bsg_job_fn *job_fn;
37 bsg_timeout_fn *timeout_fn;
38};
39
34static int bsg_transport_check_proto(struct sg_io_v4 *hdr) 40static int bsg_transport_check_proto(struct sg_io_v4 *hdr)
35{ 41{
36 if (hdr->protocol != BSG_PROTOCOL_SCSI || 42 if (hdr->protocol != BSG_PROTOCOL_SCSI ||
@@ -129,7 +135,7 @@ static void bsg_teardown_job(struct kref *kref)
129 kfree(job->request_payload.sg_list); 135 kfree(job->request_payload.sg_list);
130 kfree(job->reply_payload.sg_list); 136 kfree(job->reply_payload.sg_list);
131 137
132 blk_end_request_all(rq, BLK_STS_OK); 138 blk_mq_end_request(rq, BLK_STS_OK);
133} 139}
134 140
135void bsg_job_put(struct bsg_job *job) 141void bsg_job_put(struct bsg_job *job)
@@ -157,15 +163,15 @@ void bsg_job_done(struct bsg_job *job, int result,
157{ 163{
158 job->result = result; 164 job->result = result;
159 job->reply_payload_rcv_len = reply_payload_rcv_len; 165 job->reply_payload_rcv_len = reply_payload_rcv_len;
160 blk_complete_request(blk_mq_rq_from_pdu(job)); 166 blk_mq_complete_request(blk_mq_rq_from_pdu(job));
161} 167}
162EXPORT_SYMBOL_GPL(bsg_job_done); 168EXPORT_SYMBOL_GPL(bsg_job_done);
163 169
164/** 170/**
165 * bsg_softirq_done - softirq done routine for destroying the bsg requests 171 * bsg_complete - softirq done routine for destroying the bsg requests
166 * @rq: BSG request that holds the job to be destroyed 172 * @rq: BSG request that holds the job to be destroyed
167 */ 173 */
168static void bsg_softirq_done(struct request *rq) 174static void bsg_complete(struct request *rq)
169{ 175{
170 struct bsg_job *job = blk_mq_rq_to_pdu(rq); 176 struct bsg_job *job = blk_mq_rq_to_pdu(rq);
171 177
@@ -224,54 +230,48 @@ failjob_rls_job:
224} 230}
225 231
226/** 232/**
227 * bsg_request_fn - generic handler for bsg requests 233 * bsg_queue_rq - generic handler for bsg requests
228 * @q: request queue to manage 234 * @hctx: hardware queue
235 * @bd: queue data
229 * 236 *
230 * On error the create_bsg_job function should return a -Exyz error value 237 * On error the create_bsg_job function should return a -Exyz error value
231 * that will be set to ->result. 238 * that will be set to ->result.
232 * 239 *
233 * Drivers/subsys should pass this to the queue init function. 240 * Drivers/subsys should pass this to the queue init function.
234 */ 241 */
235static void bsg_request_fn(struct request_queue *q) 242static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
236 __releases(q->queue_lock) 243 const struct blk_mq_queue_data *bd)
237 __acquires(q->queue_lock)
238{ 244{
245 struct request_queue *q = hctx->queue;
239 struct device *dev = q->queuedata; 246 struct device *dev = q->queuedata;
240 struct request *req; 247 struct request *req = bd->rq;
248 struct bsg_set *bset =
249 container_of(q->tag_set, struct bsg_set, tag_set);
241 int ret; 250 int ret;
242 251
252 blk_mq_start_request(req);
253
243 if (!get_device(dev)) 254 if (!get_device(dev))
244 return; 255 return BLK_STS_IOERR;
245 256
246 while (1) { 257 if (!bsg_prepare_job(dev, req))
247 req = blk_fetch_request(q); 258 return BLK_STS_IOERR;
248 if (!req) 259
249 break; 260 ret = bset->job_fn(blk_mq_rq_to_pdu(req));
250 spin_unlock_irq(q->queue_lock); 261 if (ret)
251 262 return BLK_STS_IOERR;
252 if (!bsg_prepare_job(dev, req)) {
253 blk_end_request_all(req, BLK_STS_OK);
254 spin_lock_irq(q->queue_lock);
255 continue;
256 }
257
258 ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req));
259 spin_lock_irq(q->queue_lock);
260 if (ret)
261 break;
262 }
263 263
264 spin_unlock_irq(q->queue_lock);
265 put_device(dev); 264 put_device(dev);
266 spin_lock_irq(q->queue_lock); 265 return BLK_STS_OK;
267} 266}
268 267
269/* called right after the request is allocated for the request_queue */ 268/* called right after the request is allocated for the request_queue */
270static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) 269static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
270 unsigned int hctx_idx, unsigned int numa_node)
271{ 271{
272 struct bsg_job *job = blk_mq_rq_to_pdu(req); 272 struct bsg_job *job = blk_mq_rq_to_pdu(req);
273 273
274 job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); 274 job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
275 if (!job->reply) 275 if (!job->reply)
276 return -ENOMEM; 276 return -ENOMEM;
277 return 0; 277 return 0;
@@ -289,13 +289,47 @@ static void bsg_initialize_rq(struct request *req)
289 job->dd_data = job + 1; 289 job->dd_data = job + 1;
290} 290}
291 291
292static void bsg_exit_rq(struct request_queue *q, struct request *req) 292static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
293 unsigned int hctx_idx)
293{ 294{
294 struct bsg_job *job = blk_mq_rq_to_pdu(req); 295 struct bsg_job *job = blk_mq_rq_to_pdu(req);
295 296
296 kfree(job->reply); 297 kfree(job->reply);
297} 298}
298 299
300void bsg_remove_queue(struct request_queue *q)
301{
302 if (q) {
303 struct bsg_set *bset =
304 container_of(q->tag_set, struct bsg_set, tag_set);
305
306 bsg_unregister_queue(q);
307 blk_cleanup_queue(q);
308 blk_mq_free_tag_set(&bset->tag_set);
309 kfree(bset);
310 }
311}
312EXPORT_SYMBOL_GPL(bsg_remove_queue);
313
314static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
315{
316 struct bsg_set *bset =
317 container_of(rq->q->tag_set, struct bsg_set, tag_set);
318
319 if (!bset->timeout_fn)
320 return BLK_EH_DONE;
321 return bset->timeout_fn(rq);
322}
323
324static const struct blk_mq_ops bsg_mq_ops = {
325 .queue_rq = bsg_queue_rq,
326 .init_request = bsg_init_rq,
327 .exit_request = bsg_exit_rq,
328 .initialize_rq_fn = bsg_initialize_rq,
329 .complete = bsg_complete,
330 .timeout = bsg_timeout,
331};
332
299/** 333/**
300 * bsg_setup_queue - Create and add the bsg hooks so we can receive requests 334 * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
301 * @dev: device to attach bsg device to 335 * @dev: device to attach bsg device to
@@ -304,28 +338,38 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
304 * @dd_job_size: size of LLD data needed for each job 338 * @dd_job_size: size of LLD data needed for each job
305 */ 339 */
306struct request_queue *bsg_setup_queue(struct device *dev, const char *name, 340struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
307 bsg_job_fn *job_fn, int dd_job_size) 341 bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size)
308{ 342{
343 struct bsg_set *bset;
344 struct blk_mq_tag_set *set;
309 struct request_queue *q; 345 struct request_queue *q;
310 int ret; 346 int ret = -ENOMEM;
311 347
312 q = blk_alloc_queue(GFP_KERNEL); 348 bset = kzalloc(sizeof(*bset), GFP_KERNEL);
313 if (!q) 349 if (!bset)
314 return ERR_PTR(-ENOMEM); 350 return ERR_PTR(-ENOMEM);
315 q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
316 q->init_rq_fn = bsg_init_rq;
317 q->exit_rq_fn = bsg_exit_rq;
318 q->initialize_rq_fn = bsg_initialize_rq;
319 q->request_fn = bsg_request_fn;
320 351
321 ret = blk_init_allocated_queue(q); 352 bset->job_fn = job_fn;
322 if (ret) 353 bset->timeout_fn = timeout;
323 goto out_cleanup_queue; 354
355 set = &bset->tag_set;
356 set->ops = &bsg_mq_ops,
357 set->nr_hw_queues = 1;
358 set->queue_depth = 128;
359 set->numa_node = NUMA_NO_NODE;
360 set->cmd_size = sizeof(struct bsg_job) + dd_job_size;
361 set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING;
362 if (blk_mq_alloc_tag_set(set))
363 goto out_tag_set;
364
365 q = blk_mq_init_queue(set);
366 if (IS_ERR(q)) {
367 ret = PTR_ERR(q);
368 goto out_queue;
369 }
324 370
325 q->queuedata = dev; 371 q->queuedata = dev;
326 q->bsg_job_fn = job_fn;
327 blk_queue_flag_set(QUEUE_FLAG_BIDI, q); 372 blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
328 blk_queue_softirq_done(q, bsg_softirq_done);
329 blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); 373 blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
330 374
331 ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); 375 ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
@@ -338,6 +382,10 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
338 return q; 382 return q;
339out_cleanup_queue: 383out_cleanup_queue:
340 blk_cleanup_queue(q); 384 blk_cleanup_queue(q);
385out_queue:
386 blk_mq_free_tag_set(set);
387out_tag_set:
388 kfree(bset);
341 return ERR_PTR(ret); 389 return ERR_PTR(ret);
342} 390}
343EXPORT_SYMBOL_GPL(bsg_setup_queue); 391EXPORT_SYMBOL_GPL(bsg_setup_queue);
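With the conversion above, a transport driver no longer provides a request_fn or pokes a job handler into the queue; it passes both callbacks to bsg_setup_queue() and tears everything down with bsg_remove_queue(). A minimal sketch of a caller against the new interface (the my_* names are placeholders, not part of this patch, and error handling is trimmed):

#include <linux/blkdev.h>
#include <linux/bsg-lib.h>
#include <linux/device.h>
#include <linux/err.h>

static int my_bsg_job_fn(struct bsg_job *job)
{
	/* hand job->request / job->request_payload to the transport */
	return 0;
}

static enum blk_eh_timer_return my_bsg_timeout(struct request *rq)
{
	/* optional; a NULL timeout handler is accepted by bsg_setup_queue() */
	return BLK_EH_DONE;
}

static struct request_queue *my_attach_bsg(struct device *dev)
{
	struct request_queue *q;

	q = bsg_setup_queue(dev, dev_name(dev), my_bsg_job_fn,
			    my_bsg_timeout, 0 /* dd_job_size */);
	return IS_ERR(q) ? NULL : q;
}

static void my_detach_bsg(struct request_queue *q)
{
	/* unregisters the bsg device, cleans up the queue and frees the tag set */
	bsg_remove_queue(q);
}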
diff --git a/block/bsg.c b/block/bsg.c
index 9a442c23a715..44f6028b9567 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -471,7 +471,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
471 /* 471 /*
472 * we need a proper transport to send commands, not a stacked device 472 * we need a proper transport to send commands, not a stacked device
473 */ 473 */
474 if (!queue_is_rq_based(q)) 474 if (!queue_is_mq(q))
475 return 0; 475 return 0;
476 476
477 bcd = &q->bsg_dev; 477 bcd = &q->bsg_dev;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
deleted file mode 100644
index ed41aa978c4a..000000000000
--- a/block/cfq-iosched.c
+++ /dev/null
@@ -1,4916 +0,0 @@
1/*
2 * CFQ, or complete fairness queueing, disk scheduler.
3 *
4 * Based on ideas from a previously unfinished io
5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 *
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */
9#include <linux/module.h>
10#include <linux/slab.h>
11#include <linux/sched/clock.h>
12#include <linux/blkdev.h>
13#include <linux/elevator.h>
14#include <linux/ktime.h>
15#include <linux/rbtree.h>
16#include <linux/ioprio.h>
17#include <linux/blktrace_api.h>
18#include <linux/blk-cgroup.h>
19#include "blk.h"
20#include "blk-wbt.h"
21
22/*
23 * tunables
24 */
25/* max queue in one round of service */
26static const int cfq_quantum = 8;
27static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
28/* maximum backwards seek, in KiB */
29static const int cfq_back_max = 16 * 1024;
30/* penalty of a backwards seek */
31static const int cfq_back_penalty = 2;
32static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
33static u64 cfq_slice_async = NSEC_PER_SEC / 25;
34static const int cfq_slice_async_rq = 2;
35static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
36static u64 cfq_group_idle = NSEC_PER_SEC / 125;
37static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
38static const int cfq_hist_divisor = 4;
39
40/*
41 * offset from end of queue service tree for idle class
42 */
43#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5)
44/* offset from end of group service tree under time slice mode */
45#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
46/* offset from end of group service under IOPS mode */
47#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
48
49/*
50 * below this threshold, we consider thinktime immediate
51 */
52#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ)
53
54#define CFQ_SLICE_SCALE (5)
55#define CFQ_HW_QUEUE_MIN (5)
56#define CFQ_SERVICE_SHIFT 12
57
58#define CFQQ_SEEK_THR (sector_t)(8 * 100)
59#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
60#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
61#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
62
63#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
64#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
65#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
66
67static struct kmem_cache *cfq_pool;
68
69#define CFQ_PRIO_LISTS IOPRIO_BE_NR
70#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
71#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
72
73#define sample_valid(samples) ((samples) > 80)
74#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
75
76/* blkio-related constants */
77#define CFQ_WEIGHT_LEGACY_MIN 10
78#define CFQ_WEIGHT_LEGACY_DFL 500
79#define CFQ_WEIGHT_LEGACY_MAX 1000
80
81struct cfq_ttime {
82 u64 last_end_request;
83
84 u64 ttime_total;
85 u64 ttime_mean;
86 unsigned long ttime_samples;
87};
88
89/*
90 * Most of our rbtree usage is for sorting with min extraction, so
91 * if we cache the leftmost node we don't have to walk down the tree
92 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
93 * move this into the elevator for the rq sorting as well.
94 */
95struct cfq_rb_root {
96 struct rb_root_cached rb;
97 struct rb_node *rb_rightmost;
98 unsigned count;
99 u64 min_vdisktime;
100 struct cfq_ttime ttime;
101};
102#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
103 .rb_rightmost = NULL, \
104 .ttime = {.last_end_request = ktime_get_ns(),},}
105
106/*
107 * Per process-grouping structure
108 */
109struct cfq_queue {
110 /* reference count */
111 int ref;
112 /* various state flags, see below */
113 unsigned int flags;
114 /* parent cfq_data */
115 struct cfq_data *cfqd;
116 /* service_tree member */
117 struct rb_node rb_node;
118 /* service_tree key */
119 u64 rb_key;
120 /* prio tree member */
121 struct rb_node p_node;
122 /* prio tree root we belong to, if any */
123 struct rb_root *p_root;
124 /* sorted list of pending requests */
125 struct rb_root sort_list;
126 /* if fifo isn't expired, next request to serve */
127 struct request *next_rq;
128 /* requests queued in sort_list */
129 int queued[2];
130 /* currently allocated requests */
131 int allocated[2];
132 /* fifo list of requests in sort_list */
133 struct list_head fifo;
134
135 /* time when queue got scheduled in to dispatch first request. */
136 u64 dispatch_start;
137 u64 allocated_slice;
138 u64 slice_dispatch;
139 /* time when first request from queue completed and slice started. */
140 u64 slice_start;
141 u64 slice_end;
142 s64 slice_resid;
143
144 /* pending priority requests */
145 int prio_pending;
146 /* number of requests that are on the dispatch list or inside driver */
147 int dispatched;
148
149 /* io prio of this group */
150 unsigned short ioprio, org_ioprio;
151 unsigned short ioprio_class, org_ioprio_class;
152
153 pid_t pid;
154
155 u32 seek_history;
156 sector_t last_request_pos;
157
158 struct cfq_rb_root *service_tree;
159 struct cfq_queue *new_cfqq;
160 struct cfq_group *cfqg;
161 /* Number of sectors dispatched from queue in single dispatch round */
162 unsigned long nr_sectors;
163};
164
165/*
166 * First index in the service_trees.
167 * IDLE is handled separately, so it has negative index
168 */
169enum wl_class_t {
170 BE_WORKLOAD = 0,
171 RT_WORKLOAD = 1,
172 IDLE_WORKLOAD = 2,
173 CFQ_PRIO_NR,
174};
175
176/*
177 * Second index in the service_trees.
178 */
179enum wl_type_t {
180 ASYNC_WORKLOAD = 0,
181 SYNC_NOIDLE_WORKLOAD = 1,
182 SYNC_WORKLOAD = 2
183};
184
185struct cfqg_stats {
186#ifdef CONFIG_CFQ_GROUP_IOSCHED
187 /* number of ios merged */
188 struct blkg_rwstat merged;
189 /* total time spent on device in ns, may not be accurate w/ queueing */
190 struct blkg_rwstat service_time;
191 /* total time spent waiting in scheduler queue in ns */
192 struct blkg_rwstat wait_time;
193 /* number of IOs queued up */
194 struct blkg_rwstat queued;
195 /* total disk time and nr sectors dispatched by this group */
196 struct blkg_stat time;
197#ifdef CONFIG_DEBUG_BLK_CGROUP
198 /* time not charged to this cgroup */
199 struct blkg_stat unaccounted_time;
200 /* sum of number of ios queued across all samples */
201 struct blkg_stat avg_queue_size_sum;
202 /* count of samples taken for average */
203 struct blkg_stat avg_queue_size_samples;
204 /* how many times this group has been removed from service tree */
205 struct blkg_stat dequeue;
206 /* total time spent waiting for it to be assigned a timeslice. */
207 struct blkg_stat group_wait_time;
208 /* time spent idling for this blkcg_gq */
209 struct blkg_stat idle_time;
210 /* total time with empty current active q with other requests queued */
211 struct blkg_stat empty_time;
212 /* fields after this shouldn't be cleared on stat reset */
213 u64 start_group_wait_time;
214 u64 start_idle_time;
215 u64 start_empty_time;
216 uint16_t flags;
217#endif /* CONFIG_DEBUG_BLK_CGROUP */
218#endif /* CONFIG_CFQ_GROUP_IOSCHED */
219};
220
221/* Per-cgroup data */
222struct cfq_group_data {
223 /* must be the first member */
224 struct blkcg_policy_data cpd;
225
226 unsigned int weight;
227 unsigned int leaf_weight;
228};
229
230/* This is per cgroup per device grouping structure */
231struct cfq_group {
232 /* must be the first member */
233 struct blkg_policy_data pd;
234
235 /* group service_tree member */
236 struct rb_node rb_node;
237
238 /* group service_tree key */
239 u64 vdisktime;
240
241 /*
242 * The number of active cfqgs and sum of their weights under this
243 * cfqg. This covers this cfqg's leaf_weight and all children's
244 * weights, but does not cover weights of further descendants.
245 *
246 * If a cfqg is on the service tree, it's active. An active cfqg
247 * also activates its parent and contributes to the children_weight
248 * of the parent.
249 */
250 int nr_active;
251 unsigned int children_weight;
252
253 /*
254 * vfraction is the fraction of vdisktime that the tasks in this
255 * cfqg are entitled to. This is determined by compounding the
256 * ratios walking up from this cfqg to the root.
257 *
258 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
259 * vfractions on a service tree is approximately 1. The sum may
260 * deviate a bit due to rounding errors and fluctuations caused by
261 * cfqgs entering and leaving the service tree.
262 */
263 unsigned int vfraction;
264
265 /*
266 * There are two weights - (internal) weight is the weight of this
267 * cfqg against the sibling cfqgs. leaf_weight is the wight of
268 * this cfqg against the child cfqgs. For the root cfqg, both
269 * weights are kept in sync for backward compatibility.
270 */
271 unsigned int weight;
272 unsigned int new_weight;
273 unsigned int dev_weight;
274
275 unsigned int leaf_weight;
276 unsigned int new_leaf_weight;
277 unsigned int dev_leaf_weight;
278
279 /* number of cfqq currently on this group */
280 int nr_cfqq;
281
282 /*
283 * Per group busy queues average. Useful for workload slice calc. We
284 * create the array for each prio class but at run time it is used
285 * only for RT and BE class and slot for IDLE class remains unused.
286 * This is primarily done to avoid confusion and a gcc warning.
287 */
288 unsigned int busy_queues_avg[CFQ_PRIO_NR];
289 /*
290 * rr lists of queues with requests. We maintain service trees for
291 * RT and BE classes. These trees are subdivided in subclasses
292 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
293 * class there is no subclassification and all the cfq queues go on
294 * a single tree service_tree_idle.
295 * Counts are embedded in the cfq_rb_root
296 */
297 struct cfq_rb_root service_trees[2][3];
298 struct cfq_rb_root service_tree_idle;
299
300 u64 saved_wl_slice;
301 enum wl_type_t saved_wl_type;
302 enum wl_class_t saved_wl_class;
303
304 /* number of requests that are on the dispatch list or inside driver */
305 int dispatched;
306 struct cfq_ttime ttime;
307 struct cfqg_stats stats; /* stats for this cfqg */
308
309 /* async queue for each priority case */
310 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
311 struct cfq_queue *async_idle_cfqq;
312
313};
314
315struct cfq_io_cq {
316 struct io_cq icq; /* must be the first member */
317 struct cfq_queue *cfqq[2];
318 struct cfq_ttime ttime;
319 int ioprio; /* the current ioprio */
320#ifdef CONFIG_CFQ_GROUP_IOSCHED
321 uint64_t blkcg_serial_nr; /* the current blkcg serial */
322#endif
323};
324
325/*
326 * Per block device queue structure
327 */
328struct cfq_data {
329 struct request_queue *queue;
330 /* Root service tree for cfq_groups */
331 struct cfq_rb_root grp_service_tree;
332 struct cfq_group *root_group;
333
334 /*
335 * The priority currently being served
336 */
337 enum wl_class_t serving_wl_class;
338 enum wl_type_t serving_wl_type;
339 u64 workload_expires;
340 struct cfq_group *serving_group;
341
342 /*
343 * Each priority tree is sorted by next_request position. These
344 * trees are used when determining if two or more queues are
345 * interleaving requests (see cfq_close_cooperator).
346 */
347 struct rb_root prio_trees[CFQ_PRIO_LISTS];
348
349 unsigned int busy_queues;
350 unsigned int busy_sync_queues;
351
352 int rq_in_driver;
353 int rq_in_flight[2];
354
355 /*
356 * queue-depth detection
357 */
358 int rq_queued;
359 int hw_tag;
360 /*
361 * hw_tag can be
362 * -1 => indeterminate (cfq will behave as if NCQ is present, to allow better detection)
363 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
364 * 0 => no NCQ
365 */
366 int hw_tag_est_depth;
367 unsigned int hw_tag_samples;
368
369 /*
370 * idle window management
371 */
372 struct hrtimer idle_slice_timer;
373 struct work_struct unplug_work;
374
375 struct cfq_queue *active_queue;
376 struct cfq_io_cq *active_cic;
377
378 sector_t last_position;
379
380 /*
381 * tunables, see top of file
382 */
383 unsigned int cfq_quantum;
384 unsigned int cfq_back_penalty;
385 unsigned int cfq_back_max;
386 unsigned int cfq_slice_async_rq;
387 unsigned int cfq_latency;
388 u64 cfq_fifo_expire[2];
389 u64 cfq_slice[2];
390 u64 cfq_slice_idle;
391 u64 cfq_group_idle;
392 u64 cfq_target_latency;
393
394 /*
395 * Fallback dummy cfqq for extreme OOM conditions
396 */
397 struct cfq_queue oom_cfqq;
398
399 u64 last_delayed_sync;
400};
401
402static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
403static void cfq_put_queue(struct cfq_queue *cfqq);
404
405static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
406 enum wl_class_t class,
407 enum wl_type_t type)
408{
409 if (!cfqg)
410 return NULL;
411
412 if (class == IDLE_WORKLOAD)
413 return &cfqg->service_tree_idle;
414
415 return &cfqg->service_trees[class][type];
416}
417
418enum cfqq_state_flags {
419 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
420 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
421 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
422 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
423 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
424 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
425 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
426 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
427 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
428 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
429 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
430 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
431 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
432};
433
434#define CFQ_CFQQ_FNS(name) \
435static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
436{ \
437 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
438} \
439static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
440{ \
441 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
442} \
443static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
444{ \
445 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
446}
447
448CFQ_CFQQ_FNS(on_rr);
449CFQ_CFQQ_FNS(wait_request);
450CFQ_CFQQ_FNS(must_dispatch);
451CFQ_CFQQ_FNS(must_alloc_slice);
452CFQ_CFQQ_FNS(fifo_expire);
453CFQ_CFQQ_FNS(idle_window);
454CFQ_CFQQ_FNS(prio_changed);
455CFQ_CFQQ_FNS(slice_new);
456CFQ_CFQQ_FNS(sync);
457CFQ_CFQQ_FNS(coop);
458CFQ_CFQQ_FNS(split_coop);
459CFQ_CFQQ_FNS(deep);
460CFQ_CFQQ_FNS(wait_busy);
461#undef CFQ_CFQQ_FNS
462
463#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
464
465/* cfqg stats flags */
466enum cfqg_stats_flags {
467 CFQG_stats_waiting = 0,
468 CFQG_stats_idling,
469 CFQG_stats_empty,
470};
471
472#define CFQG_FLAG_FNS(name) \
473static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
474{ \
475 stats->flags |= (1 << CFQG_stats_##name); \
476} \
477static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
478{ \
479 stats->flags &= ~(1 << CFQG_stats_##name); \
480} \
481static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
482{ \
483 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
484} \
485
486CFQG_FLAG_FNS(waiting)
487CFQG_FLAG_FNS(idling)
488CFQG_FLAG_FNS(empty)
489#undef CFQG_FLAG_FNS
490
491/* This should be called with the queue_lock held. */
492static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
493{
494 u64 now;
495
496 if (!cfqg_stats_waiting(stats))
497 return;
498
499 now = ktime_get_ns();
500 if (now > stats->start_group_wait_time)
501 blkg_stat_add(&stats->group_wait_time,
502 now - stats->start_group_wait_time);
503 cfqg_stats_clear_waiting(stats);
504}
505
506/* This should be called with the queue_lock held. */
507static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
508 struct cfq_group *curr_cfqg)
509{
510 struct cfqg_stats *stats = &cfqg->stats;
511
512 if (cfqg_stats_waiting(stats))
513 return;
514 if (cfqg == curr_cfqg)
515 return;
516 stats->start_group_wait_time = ktime_get_ns();
517 cfqg_stats_mark_waiting(stats);
518}
519
520/* This should be called with the queue_lock held. */
521static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
522{
523 u64 now;
524
525 if (!cfqg_stats_empty(stats))
526 return;
527
528 now = ktime_get_ns();
529 if (now > stats->start_empty_time)
530 blkg_stat_add(&stats->empty_time,
531 now - stats->start_empty_time);
532 cfqg_stats_clear_empty(stats);
533}
534
535static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
536{
537 blkg_stat_add(&cfqg->stats.dequeue, 1);
538}
539
540static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
541{
542 struct cfqg_stats *stats = &cfqg->stats;
543
544 if (blkg_rwstat_total(&stats->queued))
545 return;
546
547 /*
548 * group is already marked empty. This can happen if a cfqq got a new
549 * request in the parent group and moved to this group while being added
550 * to the service tree. Just ignore the event and move on.
551 */
552 if (cfqg_stats_empty(stats))
553 return;
554
555 stats->start_empty_time = ktime_get_ns();
556 cfqg_stats_mark_empty(stats);
557}
558
559static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
560{
561 struct cfqg_stats *stats = &cfqg->stats;
562
563 if (cfqg_stats_idling(stats)) {
564 u64 now = ktime_get_ns();
565
566 if (now > stats->start_idle_time)
567 blkg_stat_add(&stats->idle_time,
568 now - stats->start_idle_time);
569 cfqg_stats_clear_idling(stats);
570 }
571}
572
573static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
574{
575 struct cfqg_stats *stats = &cfqg->stats;
576
577 BUG_ON(cfqg_stats_idling(stats));
578
579 stats->start_idle_time = ktime_get_ns();
580 cfqg_stats_mark_idling(stats);
581}
582
583static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
584{
585 struct cfqg_stats *stats = &cfqg->stats;
586
587 blkg_stat_add(&stats->avg_queue_size_sum,
588 blkg_rwstat_total(&stats->queued));
589 blkg_stat_add(&stats->avg_queue_size_samples, 1);
590 cfqg_stats_update_group_wait_time(stats);
591}
592
593#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
594
595static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
596static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
597static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
598static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
599static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
600static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
601static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
602
603#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
604
605#ifdef CONFIG_CFQ_GROUP_IOSCHED
606
607static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
608{
609 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
610}
611
612static struct cfq_group_data
613*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
614{
615 return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
616}
617
618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
619{
620 return pd_to_blkg(&cfqg->pd);
621}
622
623static struct blkcg_policy blkcg_policy_cfq;
624
625static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
626{
627 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
628}
629
630static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
631{
632 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
633}
634
635static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
636{
637 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
638
639 return pblkg ? blkg_to_cfqg(pblkg) : NULL;
640}
641
642static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
643 struct cfq_group *ancestor)
644{
645 return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
646 cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
647}
648
649static inline void cfqg_get(struct cfq_group *cfqg)
650{
651 return blkg_get(cfqg_to_blkg(cfqg));
652}
653
654static inline void cfqg_put(struct cfq_group *cfqg)
655{
656 return blkg_put(cfqg_to_blkg(cfqg));
657}
658
659#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
660 blk_add_cgroup_trace_msg((cfqd)->queue, \
661 cfqg_to_blkg((cfqq)->cfqg)->blkcg, \
662 "cfq%d%c%c " fmt, (cfqq)->pid, \
663 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
664 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
665 ##args); \
666} while (0)
667
668#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
669 blk_add_cgroup_trace_msg((cfqd)->queue, \
670 cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \
671} while (0)
672
673static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
674 struct cfq_group *curr_cfqg,
675 unsigned int op)
676{
677 blkg_rwstat_add(&cfqg->stats.queued, op, 1);
678 cfqg_stats_end_empty_time(&cfqg->stats);
679 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
680}
681
682static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
683 uint64_t time, unsigned long unaccounted_time)
684{
685 blkg_stat_add(&cfqg->stats.time, time);
686#ifdef CONFIG_DEBUG_BLK_CGROUP
687 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
688#endif
689}
690
691static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
692 unsigned int op)
693{
694 blkg_rwstat_add(&cfqg->stats.queued, op, -1);
695}
696
697static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
698 unsigned int op)
699{
700 blkg_rwstat_add(&cfqg->stats.merged, op, 1);
701}
702
703static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
704 u64 start_time_ns,
705 u64 io_start_time_ns,
706 unsigned int op)
707{
708 struct cfqg_stats *stats = &cfqg->stats;
709 u64 now = ktime_get_ns();
710
711 if (now > io_start_time_ns)
712 blkg_rwstat_add(&stats->service_time, op,
713 now - io_start_time_ns);
714 if (io_start_time_ns > start_time_ns)
715 blkg_rwstat_add(&stats->wait_time, op,
716 io_start_time_ns - start_time_ns);
717}
718
719/* @stats = 0 */
720static void cfqg_stats_reset(struct cfqg_stats *stats)
721{
722 /* queued stats shouldn't be cleared */
723 blkg_rwstat_reset(&stats->merged);
724 blkg_rwstat_reset(&stats->service_time);
725 blkg_rwstat_reset(&stats->wait_time);
726 blkg_stat_reset(&stats->time);
727#ifdef CONFIG_DEBUG_BLK_CGROUP
728 blkg_stat_reset(&stats->unaccounted_time);
729 blkg_stat_reset(&stats->avg_queue_size_sum);
730 blkg_stat_reset(&stats->avg_queue_size_samples);
731 blkg_stat_reset(&stats->dequeue);
732 blkg_stat_reset(&stats->group_wait_time);
733 blkg_stat_reset(&stats->idle_time);
734 blkg_stat_reset(&stats->empty_time);
735#endif
736}
737
738/* @to += @from */
739static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
740{
741 /* queued stats shouldn't be cleared */
742 blkg_rwstat_add_aux(&to->merged, &from->merged);
743 blkg_rwstat_add_aux(&to->service_time, &from->service_time);
744 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
745 blkg_stat_add_aux(&from->time, &from->time);
746#ifdef CONFIG_DEBUG_BLK_CGROUP
747 blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
748 blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
749 blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
750 blkg_stat_add_aux(&to->dequeue, &from->dequeue);
751 blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
752 blkg_stat_add_aux(&to->idle_time, &from->idle_time);
753 blkg_stat_add_aux(&to->empty_time, &from->empty_time);
754#endif
755}
756
757/*
758 * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
759 * recursive stats can still account for the amount used by this cfqg after
760 * it's gone.
761 */
762static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
763{
764 struct cfq_group *parent = cfqg_parent(cfqg);
765
766 lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
767
768 if (unlikely(!parent))
769 return;
770
771 cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
772 cfqg_stats_reset(&cfqg->stats);
773}
774
775#else /* CONFIG_CFQ_GROUP_IOSCHED */
776
777static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
778static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
779 struct cfq_group *ancestor)
780{
781 return true;
782}
783static inline void cfqg_get(struct cfq_group *cfqg) { }
784static inline void cfqg_put(struct cfq_group *cfqg) { }
785
786#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
787 blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
788 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
789 cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
790 ##args)
791#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
792
793static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
794 struct cfq_group *curr_cfqg, unsigned int op) { }
795static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
796 uint64_t time, unsigned long unaccounted_time) { }
797static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
798 unsigned int op) { }
799static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
800 unsigned int op) { }
801static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
802 u64 start_time_ns,
803 u64 io_start_time_ns,
804 unsigned int op) { }
805
806#endif /* CONFIG_CFQ_GROUP_IOSCHED */
807
808#define cfq_log(cfqd, fmt, args...) \
809 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
810
811/* Traverses through cfq group service trees */
812#define for_each_cfqg_st(cfqg, i, j, st) \
813 for (i = 0; i <= IDLE_WORKLOAD; i++) \
814 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
815 : &cfqg->service_tree_idle; \
816 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
817 (i == IDLE_WORKLOAD && j == 0); \
818 j++, st = i < IDLE_WORKLOAD ? \
819 &cfqg->service_trees[i][j]: NULL) \
820
821static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
822 struct cfq_ttime *ttime, bool group_idle)
823{
824 u64 slice;
825 if (!sample_valid(ttime->ttime_samples))
826 return false;
827 if (group_idle)
828 slice = cfqd->cfq_group_idle;
829 else
830 slice = cfqd->cfq_slice_idle;
831 return ttime->ttime_mean > slice;
832}
833
834static inline bool iops_mode(struct cfq_data *cfqd)
835{
836 /*
837 * If we are not idling on queues and it is an NCQ drive, requests are
838 * executed in parallel and measuring time is not possible in most
839 * cases unless we drive shallower queue depths, which then becomes
840 * a performance bottleneck. In such cases, switch to providing
841 * fairness in terms of the number of IOs.
842 */
843 if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
844 return true;
845 else
846 return false;
847}
848
849static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
850{
851 if (cfq_class_idle(cfqq))
852 return IDLE_WORKLOAD;
853 if (cfq_class_rt(cfqq))
854 return RT_WORKLOAD;
855 return BE_WORKLOAD;
856}
857
858
859static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
860{
861 if (!cfq_cfqq_sync(cfqq))
862 return ASYNC_WORKLOAD;
863 if (!cfq_cfqq_idle_window(cfqq))
864 return SYNC_NOIDLE_WORKLOAD;
865 return SYNC_WORKLOAD;
866}
867
868static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
869 struct cfq_data *cfqd,
870 struct cfq_group *cfqg)
871{
872 if (wl_class == IDLE_WORKLOAD)
873 return cfqg->service_tree_idle.count;
874
875 return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
876 cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
877 cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
878}
879
880static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
881 struct cfq_group *cfqg)
882{
883 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
884 cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
885}
886
887static void cfq_dispatch_insert(struct request_queue *, struct request *);
888static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
889 struct cfq_io_cq *cic, struct bio *bio);
890
891static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
892{
893 /* cic->icq is the first member, %NULL will convert to %NULL */
894 return container_of(icq, struct cfq_io_cq, icq);
895}
896
897static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
898 struct io_context *ioc)
899{
900 if (ioc)
901 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
902 return NULL;
903}
904
905static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
906{
907 return cic->cfqq[is_sync];
908}
909
910static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
911 bool is_sync)
912{
913 cic->cfqq[is_sync] = cfqq;
914}
915
916static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
917{
918 return cic->icq.q->elevator->elevator_data;
919}
920
921/*
922 * scheduler run of queue, if there are requests pending and no one in the
923 * driver that will restart queueing
924 */
925static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
926{
927 if (cfqd->busy_queues) {
928 cfq_log(cfqd, "schedule dispatch");
929 kblockd_schedule_work(&cfqd->unplug_work);
930 }
931}
932
933/*
934 * Scale schedule slice based on io priority. Use the sync time slice only
935 * if a queue is marked sync and has sync io queued. A sync queue with only
936 * async io should not get the full sync slice length.
937 */
938static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
939 unsigned short prio)
940{
941 u64 base_slice = cfqd->cfq_slice[sync];
942 u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
943
944 WARN_ON(prio >= IOPRIO_BE_NR);
945
946 return base_slice + (slice * (4 - prio));
947}
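With the default tunables at the top of this file (cfq_slice_sync of 100ms, CFQ_SLICE_SCALE of 5), each ioprio step moves the sync slice by 20ms around the prio-4 default. A small userspace sketch of the same arithmetic, using signed math for readability:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int64_t base_slice = 100LL * 1000 * 1000;	/* 100ms sync slice, in ns */
	const int64_t step = base_slice / 5;		/* CFQ_SLICE_SCALE */

	/* same shape as cfq_prio_slice(): base + step * (4 - prio) */
	for (int prio = 0; prio < 8; prio++)
		printf("ioprio %d -> %lld ms\n", prio,
		       (long long)((base_slice + step * (4 - prio)) / 1000000));
	return 0;	/* 180 ms at prio 0, 100 ms at the prio-4 default, 40 ms at prio 7 */
}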
948
949static inline u64
950cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
951{
952 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
953}
954
955/**
956 * cfqg_scale_charge - scale disk time charge according to cfqg weight
957 * @charge: disk time being charged
958 * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
959 *
960 * Scale @charge according to @vfraction, which is in range (0, 1]. The
961 * scaling is inversely proportional.
962 *
963 * scaled = charge / vfraction
964 *
965 * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
966 */
967static inline u64 cfqg_scale_charge(u64 charge,
968 unsigned int vfraction)
969{
970 u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
971
972 /* charge / vfraction */
973 c <<= CFQ_SERVICE_SHIFT;
974 return div_u64(c, vfraction);
975}
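The double shift is deliberate: @vfraction is itself fixed point with CFQ_SERVICE_SHIFT, so the first shift turns the charge into fixed point and the second cancels the fixed point in the divisor, leaving the result in fixed point as the comment states. A quick worked example with CFQ_SERVICE_SHIFT == 12:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int shift = 12;		/* CFQ_SERVICE_SHIFT */
	uint64_t charge = 1000;			/* disk time used */
	unsigned int vfraction = 1u << 11;	/* entitled to half the service */
	uint64_t c = charge << shift;		/* make it fixed point */

	c <<= shift;				/* charge / vfraction */
	/* prints 8192000, i.e. 2000 << 12: half the entitlement, twice the charge */
	printf("scaled = %llu\n", (unsigned long long)(c / vfraction));
	return 0;
}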
976
977static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
978{
979 s64 delta = (s64)(vdisktime - min_vdisktime);
980 if (delta > 0)
981 min_vdisktime = vdisktime;
982
983 return min_vdisktime;
984}
985
986static void update_min_vdisktime(struct cfq_rb_root *st)
987{
988 if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
989 struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
990
991 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
992 cfqg->vdisktime);
993 }
994}
995
996/*
997 * get the averaged number of queues of RT/BE priority.
998 * the average is updated with a formula that gives more weight to higher numbers,
999 * so it follows sudden increases quickly and decreases slowly
1000 */
1001
1002static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
1003 struct cfq_group *cfqg, bool rt)
1004{
1005 unsigned min_q, max_q;
1006 unsigned mult = cfq_hist_divisor - 1;
1007 unsigned round = cfq_hist_divisor / 2;
1008 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
1009
1010 min_q = min(cfqg->busy_queues_avg[rt], busy);
1011 max_q = max(cfqg->busy_queues_avg[rt], busy);
1012 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
1013 cfq_hist_divisor;
1014 return cfqg->busy_queues_avg[rt];
1015}
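Plugging numbers into the update above shows the intended asymmetry: with cfq_hist_divisor of 4 the average reaches a burst of busy queues within two samples but only decays one step per sample afterwards. A short sketch feeding it a burst of 6 followed by a drop back to 1:

#include <stdio.h>

int main(void)
{
	const unsigned int div = 4;			/* cfq_hist_divisor */
	const unsigned int mult = div - 1, round = div / 2;
	unsigned int avg = 1;
	unsigned int samples[] = { 6, 6, 1, 1, 1, 1 };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned int busy = samples[i];
		unsigned int min_q = avg < busy ? avg : busy;
		unsigned int max_q = avg > busy ? avg : busy;

		avg = (mult * max_q + min_q + round) / div;
		printf("busy=%u avg=%u\n", busy, avg);	/* 5, 6, then 5, 4, 3, 3 */
	}
	return 0;
}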
1016
1017static inline u64
1018cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
1019{
1020 return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
1021}
1022
1023static inline u64
1024cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1025{
1026 u64 slice = cfq_prio_to_slice(cfqd, cfqq);
1027 if (cfqd->cfq_latency) {
1028 /*
1029 * interested queues (we consider only the ones with the same
1030 * priority class in the cfq group)
1031 */
1032 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
1033 cfq_class_rt(cfqq));
1034 u64 sync_slice = cfqd->cfq_slice[1];
1035 u64 expect_latency = sync_slice * iq;
1036 u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
1037
1038 if (expect_latency > group_slice) {
1039 u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
1040 u64 low_slice;
1041
1042 /* scale low_slice according to IO priority
1043 * and sync vs async */
1044 low_slice = div64_u64(base_low_slice*slice, sync_slice);
1045 low_slice = min(slice, low_slice);
1046 /* the adapted slice value is scaled to fit all iqs
1047 * into the target latency */
1048 slice = div64_u64(slice*group_slice, expect_latency);
1049 slice = max(slice, low_slice);
1050 }
1051 }
1052 return slice;
1053}
1054
1055static inline void
1056cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1057{
1058 u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
1059 u64 now = ktime_get_ns();
1060
1061 cfqq->slice_start = now;
1062 cfqq->slice_end = now + slice;
1063 cfqq->allocated_slice = slice;
1064 cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
1065}
1066
1067/*
1068 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
1069 * isn't valid until the first request from the dispatch is activated
1070 * and the slice time set.
1071 */
1072static inline bool cfq_slice_used(struct cfq_queue *cfqq)
1073{
1074 if (cfq_cfqq_slice_new(cfqq))
1075 return false;
1076 if (ktime_get_ns() < cfqq->slice_end)
1077 return false;
1078
1079 return true;
1080}
1081
1082/*
1083 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1084 * We choose the request that is closest to the head right now. Distance
1085 * behind the head is penalized and only allowed to a certain extent.
1086 */
1087static struct request *
1088cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
1089{
1090 sector_t s1, s2, d1 = 0, d2 = 0;
1091 unsigned long back_max;
1092#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1093#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1094 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1095
1096 if (rq1 == NULL || rq1 == rq2)
1097 return rq2;
1098 if (rq2 == NULL)
1099 return rq1;
1100
1101 if (rq_is_sync(rq1) != rq_is_sync(rq2))
1102 return rq_is_sync(rq1) ? rq1 : rq2;
1103
1104 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
1105 return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
1106
1107 s1 = blk_rq_pos(rq1);
1108 s2 = blk_rq_pos(rq2);
1109
1110 /*
1111 * by definition, 1KiB is 2 sectors
1112 */
1113 back_max = cfqd->cfq_back_max * 2;
1114
1115 /*
1116 * Strict one way elevator _except_ in the case where we allow
1117 * short backward seeks which are biased as twice the cost of a
1118 * similar forward seek.
1119 */
1120 if (s1 >= last)
1121 d1 = s1 - last;
1122 else if (s1 + back_max >= last)
1123 d1 = (last - s1) * cfqd->cfq_back_penalty;
1124 else
1125 wrap |= CFQ_RQ1_WRAP;
1126
1127 if (s2 >= last)
1128 d2 = s2 - last;
1129 else if (s2 + back_max >= last)
1130 d2 = (last - s2) * cfqd->cfq_back_penalty;
1131 else
1132 wrap |= CFQ_RQ2_WRAP;
1133
1134 /* Found required data */
1135
1136 /*
1137 * By doing switch() on the bit mask "wrap" we avoid having to
1138 * check two variables for all permutations: --> faster!
1139 */
1140 switch (wrap) {
1141 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1142 if (d1 < d2)
1143 return rq1;
1144 else if (d2 < d1)
1145 return rq2;
1146 else {
1147 if (s1 >= s2)
1148 return rq1;
1149 else
1150 return rq2;
1151 }
1152
1153 case CFQ_RQ2_WRAP:
1154 return rq1;
1155 case CFQ_RQ1_WRAP:
1156 return rq2;
1157 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
1158 default:
1159 /*
1160 * Since both rqs are wrapped,
1161 * start with the one that's further behind head
1162 * (--> only *one* back seek required),
1163 * since back seek takes more time than forward.
1164 */
1165 if (s1 <= s2)
1166 return rq1;
1167 else
1168 return rq2;
1169 }
1170}
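For the common case where neither request wraps, the weighting works out as follows with the default tunables (cfq_back_max of 16MB, cfq_back_penalty of 2): a request a short distance behind the head costs twice its seek distance, so a somewhat farther forward request can still win. A small sketch with made-up sector numbers:

#include <stdio.h>

int main(void)
{
	const unsigned long back_max = 16 * 1024 * 2;	/* cfq_back_max, KiB -> sectors */
	const unsigned long penalty = 2;		/* cfq_back_penalty */
	unsigned long last = 10000;			/* last dispatched sector */
	unsigned long s1 = 10150;			/* rq1: 150 sectors ahead */
	unsigned long s2 = 9900;			/* rq2: 100 sectors behind */
	unsigned long d1 = s1 - last;
	unsigned long d2 = (last - s2 <= back_max) ?
			   (last - s2) * penalty : ~0UL;	/* too far back: never preferred */

	/* prints d1=150 d2=200 -> rq1, the forward request, is served first */
	printf("d1=%lu d2=%lu -> %s is served first\n",
	       d1, d2, d1 < d2 ? "rq1" : "rq2");
	return 0;
}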
1171
1172static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
1173{
1174 /* Service tree is empty */
1175 if (!root->count)
1176 return NULL;
1177
1178 return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
1179}
1180
1181static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
1182{
1183 return rb_entry_cfqg(rb_first_cached(&root->rb));
1184}
1185
1186static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
1187{
1188 if (root->rb_rightmost == n)
1189 root->rb_rightmost = rb_prev(n);
1190
1191 rb_erase_cached(n, &root->rb);
1192 RB_CLEAR_NODE(n);
1193
1194 --root->count;
1195}
1196
1197/*
1198 * would be nice to take fifo expire time into account as well
1199 */
1200static struct request *
1201cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1202 struct request *last)
1203{
1204 struct rb_node *rbnext = rb_next(&last->rb_node);
1205 struct rb_node *rbprev = rb_prev(&last->rb_node);
1206 struct request *next = NULL, *prev = NULL;
1207
1208 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1209
1210 if (rbprev)
1211 prev = rb_entry_rq(rbprev);
1212
1213 if (rbnext)
1214 next = rb_entry_rq(rbnext);
1215 else {
1216 rbnext = rb_first(&cfqq->sort_list);
1217 if (rbnext && rbnext != &last->rb_node)
1218 next = rb_entry_rq(rbnext);
1219 }
1220
1221 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
1222}
1223
1224static u64 cfq_slice_offset(struct cfq_data *cfqd,
1225 struct cfq_queue *cfqq)
1226{
1227 /*
1228 * just an approximation, should be ok.
1229 */
1230 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
1231 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
1232}
1233
1234static inline s64
1235cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
1236{
1237 return cfqg->vdisktime - st->min_vdisktime;
1238}
1239
1240static void
1241__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1242{
1243 struct rb_node **node = &st->rb.rb_root.rb_node;
1244 struct rb_node *parent = NULL;
1245 struct cfq_group *__cfqg;
1246 s64 key = cfqg_key(st, cfqg);
1247 bool leftmost = true, rightmost = true;
1248
1249 while (*node != NULL) {
1250 parent = *node;
1251 __cfqg = rb_entry_cfqg(parent);
1252
1253 if (key < cfqg_key(st, __cfqg)) {
1254 node = &parent->rb_left;
1255 rightmost = false;
1256 } else {
1257 node = &parent->rb_right;
1258 leftmost = false;
1259 }
1260 }
1261
1262 if (rightmost)
1263 st->rb_rightmost = &cfqg->rb_node;
1264
1265 rb_link_node(&cfqg->rb_node, parent, node);
1266 rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
1267}
1268
1269/*
1270 * This has to be called only on activation of cfqg
1271 */
1272static void
1273cfq_update_group_weight(struct cfq_group *cfqg)
1274{
1275 if (cfqg->new_weight) {
1276 cfqg->weight = cfqg->new_weight;
1277 cfqg->new_weight = 0;
1278 }
1279}
1280
1281static void
1282cfq_update_group_leaf_weight(struct cfq_group *cfqg)
1283{
1284 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1285
1286 if (cfqg->new_leaf_weight) {
1287 cfqg->leaf_weight = cfqg->new_leaf_weight;
1288 cfqg->new_leaf_weight = 0;
1289 }
1290}
1291
1292static void
1293cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
1294{
1295 unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
1296 struct cfq_group *pos = cfqg;
1297 struct cfq_group *parent;
1298 bool propagate;
1299
1300 /* add to the service tree */
1301 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1302
1303 /*
1304 * Update leaf_weight. We cannot update weight at this point
1305 * because cfqg might already have been activated and is
1306 * contributing its current weight to the parent's child_weight.
1307 */
1308 cfq_update_group_leaf_weight(cfqg);
1309 __cfq_group_service_tree_add(st, cfqg);
1310
1311 /*
1312 * Activate @cfqg and calculate the portion of vfraction @cfqg is
1313 * entitled to. vfraction is calculated by walking the tree
1314 * towards the root calculating the fraction it has at each level.
1315 * The compounded ratio is how much vfraction @cfqg owns.
1316 *
1317 * Start with the proportion tasks in this cfqg has against active
1318 * children cfqgs - its leaf_weight against children_weight.
1319 */
1320 propagate = !pos->nr_active++;
1321 pos->children_weight += pos->leaf_weight;
1322 vfr = vfr * pos->leaf_weight / pos->children_weight;
1323
1324 /*
1325 * Compound ->weight walking up the tree. Both activation and
1326 * vfraction calculation are done in the same loop. Propagation
1327 * stops once an already activated node is met. vfraction
1328 * calculation should always continue to the root.
1329 */
1330 while ((parent = cfqg_parent(pos))) {
1331 if (propagate) {
1332 cfq_update_group_weight(pos);
1333 propagate = !parent->nr_active++;
1334 parent->children_weight += pos->weight;
1335 }
1336 vfr = vfr * pos->weight / parent->children_weight;
1337 pos = parent;
1338 }
1339
1340 cfqg->vfraction = max_t(unsigned, vfr, 1);
1341}
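
/*
 * Illustrative sketch (userspace, not part of this file): the vfraction
 * computed above is simply this group's share at its own level, compounded
 * with its ancestors' shares up to the root, kept as a fixed-point fraction.
 * Everything below is invented for the example -- the 12-bit shift stands in
 * for CFQ_SERVICE_SHIFT and the parent[] layout replaces the real cfqg tree.
 */
#include <stdio.h>

#define TOY_SERVICE_SHIFT 12

struct toy_group {
	int parent;			/* index of the parent, -1 for the root */
	unsigned int weight;		/* this group's weight among its siblings */
	unsigned int children_weight;	/* sum of its active children's weights */
};

static unsigned int toy_vfraction(const struct toy_group *g, int idx,
				  unsigned int leaf_weight)
{
	unsigned int vfr = 1u << TOY_SERVICE_SHIFT;	/* start with 1.0 */

	/* share this group's queues hold among its active children */
	vfr = vfr * leaf_weight / g[idx].children_weight;

	/* compound the group's share at every ancestor up to the root */
	while (g[idx].parent >= 0) {
		int p = g[idx].parent;

		vfr = vfr * g[idx].weight / g[p].children_weight;
		idx = p;
	}
	return vfr ? vfr : 1;		/* mirrors max_t(unsigned, vfr, 1) */
}

int main(void)
{
	struct toy_group g[] = {
		{ .parent = -1, .weight = 0, .children_weight = 1000 },	/* root */
		{ .parent = 0, .weight = 500, .children_weight = 300 },	/* child */
	};

	/* 100/300 of 500/1000 of the device: ~1/6, i.e. ~682 of 4096 */
	printf("vfraction = %u\n", toy_vfraction(g, 1, 100));
	return 0;
}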
1342
1343static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
1344{
1345 if (!iops_mode(cfqd))
1346 return CFQ_SLICE_MODE_GROUP_DELAY;
1347 else
1348 return CFQ_IOPS_MODE_GROUP_DELAY;
1349}
1350
1351static void
1352cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
1353{
1354 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1355 struct cfq_group *__cfqg;
1356 struct rb_node *n;
1357
1358 cfqg->nr_cfqq++;
1359 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1360 return;
1361
1362 /*
1363 * Currently put the group at the end. Later, implement something so
1364 * that groups get a smaller vtime based on their weights, and a group
1365 * does not lose everything if it was not continuously backlogged.
1366 */
1367 n = st->rb_rightmost;
1368 if (n) {
1369 __cfqg = rb_entry_cfqg(n);
1370 cfqg->vdisktime = __cfqg->vdisktime +
1371 cfq_get_cfqg_vdisktime_delay(cfqd);
1372 } else
1373 cfqg->vdisktime = st->min_vdisktime;
1374 cfq_group_service_tree_add(st, cfqg);
1375}
1376
1377static void
1378cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
1379{
1380 struct cfq_group *pos = cfqg;
1381 bool propagate;
1382
1383 /*
1384 * Undo activation from cfq_group_service_tree_add(). Deactivate
1385 * @cfqg and propagate deactivation upwards.
1386 */
1387 propagate = !--pos->nr_active;
1388 pos->children_weight -= pos->leaf_weight;
1389
1390 while (propagate) {
1391 struct cfq_group *parent = cfqg_parent(pos);
1392
1393 /* @pos has 0 nr_active at this point */
1394 WARN_ON_ONCE(pos->children_weight);
1395 pos->vfraction = 0;
1396
1397 if (!parent)
1398 break;
1399
1400 propagate = !--parent->nr_active;
1401 parent->children_weight -= pos->weight;
1402 pos = parent;
1403 }
1404
1405 /* remove from the service tree */
1406 if (!RB_EMPTY_NODE(&cfqg->rb_node))
1407 cfq_rb_erase(&cfqg->rb_node, st);
1408}
1409
1410static void
1411cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1412{
1413 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1414
1415 BUG_ON(cfqg->nr_cfqq < 1);
1416 cfqg->nr_cfqq--;
1417
1418 /* If there are other cfq queues under this group, don't delete it */
1419 if (cfqg->nr_cfqq)
1420 return;
1421
1422 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1423 cfq_group_service_tree_del(st, cfqg);
1424 cfqg->saved_wl_slice = 0;
1425 cfqg_stats_update_dequeue(cfqg);
1426}
1427
1428static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
1429 u64 *unaccounted_time)
1430{
1431 u64 slice_used;
1432 u64 now = ktime_get_ns();
1433
1434 /*
1435 * Queue got expired before even a single request completed or
1436 * got expired immediately after first request completion.
1437 */
1438 if (!cfqq->slice_start || cfqq->slice_start == now) {
1439 /*
1440 * Also charge the seek time incurred to the group, otherwise
1441 * if there are multiple queues in the group, each can dispatch
1442 * a single request on seeky media and cause lots of seek time,
1443 * and the group will never know it.
1444 */
1445 slice_used = max_t(u64, (now - cfqq->dispatch_start),
1446 jiffies_to_nsecs(1));
1447 } else {
1448 slice_used = now - cfqq->slice_start;
1449 if (slice_used > cfqq->allocated_slice) {
1450 *unaccounted_time = slice_used - cfqq->allocated_slice;
1451 slice_used = cfqq->allocated_slice;
1452 }
1453 if (cfqq->slice_start > cfqq->dispatch_start)
1454 *unaccounted_time += cfqq->slice_start -
1455 cfqq->dispatch_start;
1456 }
1457
1458 return slice_used;
1459}
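
/*
 * Illustrative sketch (userspace): the accounting above charges real slice
 * time, capped at the allocated slice, and reports the overrun plus any gap
 * between dispatch_start and slice_start as "unaccounted". The toy below
 * mirrors that arithmetic on plain nanosecond values; the names and the
 * numbers in main() are invented, and the one-jiffy minimum charge of the
 * original is trivialised to 1ns.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_queue {
	uint64_t dispatch_start;
	uint64_t slice_start;		/* 0 if the slice never really started */
	uint64_t allocated_slice;
};

static uint64_t toy_slice_usage(const struct toy_queue *q, uint64_t now,
				uint64_t *unaccounted)
{
	uint64_t used;

	*unaccounted = 0;
	if (!q->slice_start || q->slice_start == now) {
		/* expired before doing real work: charge the wait itself */
		used = now - q->dispatch_start;
		if (!used)
			used = 1;
	} else {
		used = now - q->slice_start;
		if (used > q->allocated_slice) {
			*unaccounted = used - q->allocated_slice;
			used = q->allocated_slice;
		}
		if (q->slice_start > q->dispatch_start)
			*unaccounted += q->slice_start - q->dispatch_start;
	}
	return used;
}

int main(void)
{
	struct toy_queue q = {
		.dispatch_start = 1000,
		.slice_start = 1400,
		.allocated_slice = 8000,
	};
	uint64_t unaccounted;
	uint64_t used = toy_slice_usage(&q, 11000, &unaccounted);

	/* used = 8000 (capped), unaccounted = 1600 + 400 = 2000 */
	printf("used=%llu unaccounted=%llu\n",
	       (unsigned long long)used, (unsigned long long)unaccounted);
	return 0;
}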
1460
1461static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1462 struct cfq_queue *cfqq)
1463{
1464 struct cfq_rb_root *st = &cfqd->grp_service_tree;
1465 u64 used_sl, charge, unaccounted_sl = 0;
1466 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
1467 - cfqg->service_tree_idle.count;
1468 unsigned int vfr;
1469 u64 now = ktime_get_ns();
1470
1471 BUG_ON(nr_sync < 0);
1472 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
1473
1474 if (iops_mode(cfqd))
1475 charge = cfqq->slice_dispatch;
1476 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
1477 charge = cfqq->allocated_slice;
1478
1479 /*
1480 * Can't update vdisktime while on service tree and cfqg->vfraction
1481 * is valid only while on it. Cache vfr, leave the service tree,
1482 * update vdisktime and go back on. The re-addition to the tree
1483 * will also update the weights as necessary.
1484 */
1485 vfr = cfqg->vfraction;
1486 cfq_group_service_tree_del(st, cfqg);
1487 cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
1488 cfq_group_service_tree_add(st, cfqg);
1489
1490 /* This group is being expired. Save the context */
1491 if (cfqd->workload_expires > now) {
1492 cfqg->saved_wl_slice = cfqd->workload_expires - now;
1493 cfqg->saved_wl_type = cfqd->serving_wl_type;
1494 cfqg->saved_wl_class = cfqd->serving_wl_class;
1495 } else
1496 cfqg->saved_wl_slice = 0;
1497
1498 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
1499 st->min_vdisktime);
1500 cfq_log_cfqq(cfqq->cfqd, cfqq,
1501 "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
1502 used_sl, cfqq->slice_dispatch, charge,
1503 iops_mode(cfqd), cfqq->nr_sectors);
1504 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1505 cfqg_stats_set_start_empty_time(cfqg);
1506}
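
/*
 * Illustrative sketch (userspace): cfqg_scale_charge() is defined earlier in
 * this file and is not reproduced here; the sketch below only shows the idea
 * the comment above relies on -- the time (or IOPS) charge is scaled up by
 * the inverse of the group's vfraction before being added to vdisktime, so a
 * group that owns a small share of the device ages faster in virtual time.
 * The 12-bit fixed point and the example numbers are assumptions.
 */
#include <stdint.h>

#define TOY_SERVICE_SHIFT 12

/* vfraction is a fixed-point fraction of the whole device (1.0 == 1 << 12) */
static uint64_t toy_scale_charge(uint64_t charge_ns, unsigned int vfraction)
{
	return (charge_ns << TOY_SERVICE_SHIFT) / vfraction;
}

/*
 * Example: a group holding ~1/6 of the device (682 of 4096) that used an 8ms
 * slice is charged toy_scale_charge(8000000, 682) ~= 48ms of vdisktime, i.e.
 * it is rescheduled as if it had consumed six times that long on a device it
 * owned outright.
 */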
1507
1508/**
1509 * cfq_init_cfqg_base - initialize base part of a cfq_group
1510 * @cfqg: cfq_group to initialize
1511 *
1512 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1513 * is enabled or not.
1514 */
1515static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1516{
1517 struct cfq_rb_root *st;
1518 int i, j;
1519
1520 for_each_cfqg_st(cfqg, i, j, st)
1521 *st = CFQ_RB_ROOT;
1522 RB_CLEAR_NODE(&cfqg->rb_node);
1523
1524 cfqg->ttime.last_end_request = ktime_get_ns();
1525}
1526
1527#ifdef CONFIG_CFQ_GROUP_IOSCHED
1528static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1529 bool on_dfl, bool reset_dev, bool is_leaf_weight);
1530
1531static void cfqg_stats_exit(struct cfqg_stats *stats)
1532{
1533 blkg_rwstat_exit(&stats->merged);
1534 blkg_rwstat_exit(&stats->service_time);
1535 blkg_rwstat_exit(&stats->wait_time);
1536 blkg_rwstat_exit(&stats->queued);
1537 blkg_stat_exit(&stats->time);
1538#ifdef CONFIG_DEBUG_BLK_CGROUP
1539 blkg_stat_exit(&stats->unaccounted_time);
1540 blkg_stat_exit(&stats->avg_queue_size_sum);
1541 blkg_stat_exit(&stats->avg_queue_size_samples);
1542 blkg_stat_exit(&stats->dequeue);
1543 blkg_stat_exit(&stats->group_wait_time);
1544 blkg_stat_exit(&stats->idle_time);
1545 blkg_stat_exit(&stats->empty_time);
1546#endif
1547}
1548
1549static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
1550{
1551 if (blkg_rwstat_init(&stats->merged, gfp) ||
1552 blkg_rwstat_init(&stats->service_time, gfp) ||
1553 blkg_rwstat_init(&stats->wait_time, gfp) ||
1554 blkg_rwstat_init(&stats->queued, gfp) ||
1555 blkg_stat_init(&stats->time, gfp))
1556 goto err;
1557
1558#ifdef CONFIG_DEBUG_BLK_CGROUP
1559 if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
1560 blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
1561 blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
1562 blkg_stat_init(&stats->dequeue, gfp) ||
1563 blkg_stat_init(&stats->group_wait_time, gfp) ||
1564 blkg_stat_init(&stats->idle_time, gfp) ||
1565 blkg_stat_init(&stats->empty_time, gfp))
1566 goto err;
1567#endif
1568 return 0;
1569err:
1570 cfqg_stats_exit(stats);
1571 return -ENOMEM;
1572}
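
/*
 * Illustrative sketch (userspace): the init/exit pair above follows the usual
 * "allocate a set of counters, and on any failure release whatever was already
 * set up" pattern, relying on the exit path tolerating half-initialised state.
 * The stripped-down equivalent below uses invented names and plain calloc().
 */
#include <stdlib.h>

struct toy_stats {
	void *merged, *service_time, *wait_time;
};

static void toy_stats_exit(struct toy_stats *s)
{
	/* free() accepts NULL, so partial initialisation is harmless here */
	free(s->merged);
	free(s->service_time);
	free(s->wait_time);
}

static int toy_stats_init(struct toy_stats *s)
{
	s->merged = calloc(1, 64);
	s->service_time = calloc(1, 64);
	s->wait_time = calloc(1, 64);
	if (!s->merged || !s->service_time || !s->wait_time) {
		toy_stats_exit(s);
		return -1;		/* -ENOMEM in the original */
	}
	return 0;
}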
1573
1574static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
1575{
1576 struct cfq_group_data *cgd;
1577
1578 cgd = kzalloc(sizeof(*cgd), gfp);
1579 if (!cgd)
1580 return NULL;
1581 return &cgd->cpd;
1582}
1583
1584static void cfq_cpd_init(struct blkcg_policy_data *cpd)
1585{
1586 struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
1587 unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
1588 CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1589
1590 if (cpd_to_blkcg(cpd) == &blkcg_root)
1591 weight *= 2;
1592
1593 cgd->weight = weight;
1594 cgd->leaf_weight = weight;
1595}
1596
1597static void cfq_cpd_free(struct blkcg_policy_data *cpd)
1598{
1599 kfree(cpd_to_cfqgd(cpd));
1600}
1601
1602static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
1603{
1604 struct blkcg *blkcg = cpd_to_blkcg(cpd);
1605 bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
1606 unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
1607
1608 if (blkcg == &blkcg_root)
1609 weight *= 2;
1610
1611 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
1612 WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
1613}
1614
1615static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
1616{
1617 struct cfq_group *cfqg;
1618
1619 cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
1620 if (!cfqg)
1621 return NULL;
1622
1623 cfq_init_cfqg_base(cfqg);
1624 if (cfqg_stats_init(&cfqg->stats, gfp)) {
1625 kfree(cfqg);
1626 return NULL;
1627 }
1628
1629 return &cfqg->pd;
1630}
1631
1632static void cfq_pd_init(struct blkg_policy_data *pd)
1633{
1634 struct cfq_group *cfqg = pd_to_cfqg(pd);
1635 struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
1636
1637 cfqg->weight = cgd->weight;
1638 cfqg->leaf_weight = cgd->leaf_weight;
1639}
1640
1641static void cfq_pd_offline(struct blkg_policy_data *pd)
1642{
1643 struct cfq_group *cfqg = pd_to_cfqg(pd);
1644 int i;
1645
1646 for (i = 0; i < IOPRIO_BE_NR; i++) {
1647 if (cfqg->async_cfqq[0][i]) {
1648 cfq_put_queue(cfqg->async_cfqq[0][i]);
1649 cfqg->async_cfqq[0][i] = NULL;
1650 }
1651 if (cfqg->async_cfqq[1][i]) {
1652 cfq_put_queue(cfqg->async_cfqq[1][i]);
1653 cfqg->async_cfqq[1][i] = NULL;
1654 }
1655 }
1656
1657 if (cfqg->async_idle_cfqq) {
1658 cfq_put_queue(cfqg->async_idle_cfqq);
1659 cfqg->async_idle_cfqq = NULL;
1660 }
1661
1662 /*
1663 * @blkg is going offline and will be ignored by
1664 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
1665 * that they don't get lost. If IOs complete after this point, the
1666 * stats for them will be lost. Oh well...
1667 */
1668 cfqg_stats_xfer_dead(cfqg);
1669}
1670
1671static void cfq_pd_free(struct blkg_policy_data *pd)
1672{
1673 struct cfq_group *cfqg = pd_to_cfqg(pd);
1674
1675 cfqg_stats_exit(&cfqg->stats);
1676 return kfree(cfqg);
1677}
1678
1679static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
1680{
1681 struct cfq_group *cfqg = pd_to_cfqg(pd);
1682
1683 cfqg_stats_reset(&cfqg->stats);
1684}
1685
1686static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
1687 struct blkcg *blkcg)
1688{
1689 struct blkcg_gq *blkg;
1690
1691 blkg = blkg_lookup(blkcg, cfqd->queue);
1692 if (likely(blkg))
1693 return blkg_to_cfqg(blkg);
1694 return NULL;
1695}
1696
1697static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1698{
1699 cfqq->cfqg = cfqg;
1700 /* cfqq reference on cfqg */
1701 cfqg_get(cfqg);
1702}
1703
1704static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1705 struct blkg_policy_data *pd, int off)
1706{
1707 struct cfq_group *cfqg = pd_to_cfqg(pd);
1708
1709 if (!cfqg->dev_weight)
1710 return 0;
1711 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1712}
1713
1714static int cfqg_print_weight_device(struct seq_file *sf, void *v)
1715{
1716 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1717 cfqg_prfill_weight_device, &blkcg_policy_cfq,
1718 0, false);
1719 return 0;
1720}
1721
1722static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
1723 struct blkg_policy_data *pd, int off)
1724{
1725 struct cfq_group *cfqg = pd_to_cfqg(pd);
1726
1727 if (!cfqg->dev_leaf_weight)
1728 return 0;
1729 return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
1730}
1731
1732static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1733{
1734 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1735 cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
1736 0, false);
1737 return 0;
1738}
1739
1740static int cfq_print_weight(struct seq_file *sf, void *v)
1741{
1742 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1743 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1744 unsigned int val = 0;
1745
1746 if (cgd)
1747 val = cgd->weight;
1748
1749 seq_printf(sf, "%u\n", val);
1750 return 0;
1751}
1752
1753static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1754{
1755 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1756 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1757 unsigned int val = 0;
1758
1759 if (cgd)
1760 val = cgd->leaf_weight;
1761
1762 seq_printf(sf, "%u\n", val);
1763 return 0;
1764}
1765
1766static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1767 char *buf, size_t nbytes, loff_t off,
1768 bool on_dfl, bool is_leaf_weight)
1769{
1770 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1771 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1772 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1773 struct blkg_conf_ctx ctx;
1774 struct cfq_group *cfqg;
1775 struct cfq_group_data *cfqgd;
1776 int ret;
1777 u64 v;
1778
1779 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1780 if (ret)
1781 return ret;
1782
1783 if (sscanf(ctx.body, "%llu", &v) == 1) {
1784 /* require "default" on dfl */
1785 ret = -ERANGE;
1786 if (!v && on_dfl)
1787 goto out_finish;
1788 } else if (!strcmp(strim(ctx.body), "default")) {
1789 v = 0;
1790 } else {
1791 ret = -EINVAL;
1792 goto out_finish;
1793 }
1794
1795 cfqg = blkg_to_cfqg(ctx.blkg);
1796 cfqgd = blkcg_to_cfqgd(blkcg);
1797
1798 ret = -ERANGE;
1799 if (!v || (v >= min && v <= max)) {
1800 if (!is_leaf_weight) {
1801 cfqg->dev_weight = v;
1802 cfqg->new_weight = v ?: cfqgd->weight;
1803 } else {
1804 cfqg->dev_leaf_weight = v;
1805 cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
1806 }
1807 ret = 0;
1808 }
1809out_finish:
1810 blkg_conf_finish(&ctx);
1811 return ret ?: nbytes;
1812}
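
/*
 * Illustrative sketch (userspace): the write handler above accepts either a
 * plain number or the literal "default", which clears the per-device override
 * (stored as 0). The toy parser below mirrors that decision and the range
 * check; toy_parse_weight() and its error reporting are invented, and the
 * cgroup/blkg plumbing is left out entirely.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool toy_parse_weight(const char *buf, unsigned long long min,
			     unsigned long long max, unsigned long long *out)
{
	unsigned long long v;

	if (sscanf(buf, "%llu", &v) == 1) {
		/* 0 is allowed as an explicit "no override", like "default" */
		if (v && (v < min || v > max))
			return false;		/* -ERANGE in the original */
		*out = v;
		return true;
	}
	if (!strcmp(buf, "default")) {
		*out = 0;
		return true;
	}
	return false;				/* -EINVAL in the original */
}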
1813
1814static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
1815 char *buf, size_t nbytes, loff_t off)
1816{
1817 return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
1818}
1819
1820static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
1821 char *buf, size_t nbytes, loff_t off)
1822{
1823 return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
1824}
1825
1826static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
1827 bool on_dfl, bool reset_dev, bool is_leaf_weight)
1828{
1829 unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
1830 unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
1831 struct blkcg *blkcg = css_to_blkcg(css);
1832 struct blkcg_gq *blkg;
1833 struct cfq_group_data *cfqgd;
1834 int ret = 0;
1835
1836 if (val < min || val > max)
1837 return -ERANGE;
1838
1839 spin_lock_irq(&blkcg->lock);
1840 cfqgd = blkcg_to_cfqgd(blkcg);
1841 if (!cfqgd) {
1842 ret = -EINVAL;
1843 goto out;
1844 }
1845
1846 if (!is_leaf_weight)
1847 cfqgd->weight = val;
1848 else
1849 cfqgd->leaf_weight = val;
1850
1851 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1852 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1853
1854 if (!cfqg)
1855 continue;
1856
1857 if (!is_leaf_weight) {
1858 if (reset_dev)
1859 cfqg->dev_weight = 0;
1860 if (!cfqg->dev_weight)
1861 cfqg->new_weight = cfqgd->weight;
1862 } else {
1863 if (reset_dev)
1864 cfqg->dev_leaf_weight = 0;
1865 if (!cfqg->dev_leaf_weight)
1866 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1867 }
1868 }
1869
1870out:
1871 spin_unlock_irq(&blkcg->lock);
1872 return ret;
1873}
1874
1875static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1876 u64 val)
1877{
1878 return __cfq_set_weight(css, val, false, false, false);
1879}
1880
1881static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
1882 struct cftype *cft, u64 val)
1883{
1884 return __cfq_set_weight(css, val, false, false, true);
1885}
1886
1887static int cfqg_print_stat(struct seq_file *sf, void *v)
1888{
1889 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1890 &blkcg_policy_cfq, seq_cft(sf)->private, false);
1891 return 0;
1892}
1893
1894static int cfqg_print_rwstat(struct seq_file *sf, void *v)
1895{
1896 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1897 &blkcg_policy_cfq, seq_cft(sf)->private, true);
1898 return 0;
1899}
1900
1901static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
1902 struct blkg_policy_data *pd, int off)
1903{
1904 u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
1905 &blkcg_policy_cfq, off);
1906 return __blkg_prfill_u64(sf, pd, sum);
1907}
1908
1909static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
1910 struct blkg_policy_data *pd, int off)
1911{
1912 struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
1913 &blkcg_policy_cfq, off);
1914 return __blkg_prfill_rwstat(sf, pd, &sum);
1915}
1916
1917static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
1918{
1919 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1920 cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
1921 seq_cft(sf)->private, false);
1922 return 0;
1923}
1924
1925static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1926{
1927 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1928 cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
1929 seq_cft(sf)->private, true);
1930 return 0;
1931}
1932
1933static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
1934 int off)
1935{
1936 u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
1937
1938 return __blkg_prfill_u64(sf, pd, sum >> 9);
1939}
1940
1941static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
1942{
1943 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1944 cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
1945 return 0;
1946}
1947
1948static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
1949 struct blkg_policy_data *pd, int off)
1950{
1951 struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
1952 offsetof(struct blkcg_gq, stat_bytes));
1953 u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
1954 atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
1955
1956 return __blkg_prfill_u64(sf, pd, sum >> 9);
1957}
1958
1959static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
1960{
1961 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1962 cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
1963 false);
1964 return 0;
1965}
1966
1967#ifdef CONFIG_DEBUG_BLK_CGROUP
1968static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1969 struct blkg_policy_data *pd, int off)
1970{
1971 struct cfq_group *cfqg = pd_to_cfqg(pd);
1972 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1973 u64 v = 0;
1974
1975 if (samples) {
1976 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1977 v = div64_u64(v, samples);
1978 }
1979 __blkg_prfill_u64(sf, pd, v);
1980 return 0;
1981}
1982
1983/* print avg_queue_size */
1984static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1985{
1986 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1987 cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
1988 0, false);
1989 return 0;
1990}
1991#endif /* CONFIG_DEBUG_BLK_CGROUP */
1992
1993static struct cftype cfq_blkcg_legacy_files[] = {
1994 /* on root, weight is mapped to leaf_weight */
1995 {
1996 .name = "weight_device",
1997 .flags = CFTYPE_ONLY_ON_ROOT,
1998 .seq_show = cfqg_print_leaf_weight_device,
1999 .write = cfqg_set_leaf_weight_device,
2000 },
2001 {
2002 .name = "weight",
2003 .flags = CFTYPE_ONLY_ON_ROOT,
2004 .seq_show = cfq_print_leaf_weight,
2005 .write_u64 = cfq_set_leaf_weight,
2006 },
2007
2008 /* no such mapping necessary for !roots */
2009 {
2010 .name = "weight_device",
2011 .flags = CFTYPE_NOT_ON_ROOT,
2012 .seq_show = cfqg_print_weight_device,
2013 .write = cfqg_set_weight_device,
2014 },
2015 {
2016 .name = "weight",
2017 .flags = CFTYPE_NOT_ON_ROOT,
2018 .seq_show = cfq_print_weight,
2019 .write_u64 = cfq_set_weight,
2020 },
2021
2022 {
2023 .name = "leaf_weight_device",
2024 .seq_show = cfqg_print_leaf_weight_device,
2025 .write = cfqg_set_leaf_weight_device,
2026 },
2027 {
2028 .name = "leaf_weight",
2029 .seq_show = cfq_print_leaf_weight,
2030 .write_u64 = cfq_set_leaf_weight,
2031 },
2032
2033 /* statistics, covers only the tasks in the cfqg */
2034 {
2035 .name = "time",
2036 .private = offsetof(struct cfq_group, stats.time),
2037 .seq_show = cfqg_print_stat,
2038 },
2039 {
2040 .name = "sectors",
2041 .seq_show = cfqg_print_stat_sectors,
2042 },
2043 {
2044 .name = "io_service_bytes",
2045 .private = (unsigned long)&blkcg_policy_cfq,
2046 .seq_show = blkg_print_stat_bytes,
2047 },
2048 {
2049 .name = "io_serviced",
2050 .private = (unsigned long)&blkcg_policy_cfq,
2051 .seq_show = blkg_print_stat_ios,
2052 },
2053 {
2054 .name = "io_service_time",
2055 .private = offsetof(struct cfq_group, stats.service_time),
2056 .seq_show = cfqg_print_rwstat,
2057 },
2058 {
2059 .name = "io_wait_time",
2060 .private = offsetof(struct cfq_group, stats.wait_time),
2061 .seq_show = cfqg_print_rwstat,
2062 },
2063 {
2064 .name = "io_merged",
2065 .private = offsetof(struct cfq_group, stats.merged),
2066 .seq_show = cfqg_print_rwstat,
2067 },
2068 {
2069 .name = "io_queued",
2070 .private = offsetof(struct cfq_group, stats.queued),
2071 .seq_show = cfqg_print_rwstat,
2072 },
2073
2074 /* the same statistics which cover the cfqg and its descendants */
2075 {
2076 .name = "time_recursive",
2077 .private = offsetof(struct cfq_group, stats.time),
2078 .seq_show = cfqg_print_stat_recursive,
2079 },
2080 {
2081 .name = "sectors_recursive",
2082 .seq_show = cfqg_print_stat_sectors_recursive,
2083 },
2084 {
2085 .name = "io_service_bytes_recursive",
2086 .private = (unsigned long)&blkcg_policy_cfq,
2087 .seq_show = blkg_print_stat_bytes_recursive,
2088 },
2089 {
2090 .name = "io_serviced_recursive",
2091 .private = (unsigned long)&blkcg_policy_cfq,
2092 .seq_show = blkg_print_stat_ios_recursive,
2093 },
2094 {
2095 .name = "io_service_time_recursive",
2096 .private = offsetof(struct cfq_group, stats.service_time),
2097 .seq_show = cfqg_print_rwstat_recursive,
2098 },
2099 {
2100 .name = "io_wait_time_recursive",
2101 .private = offsetof(struct cfq_group, stats.wait_time),
2102 .seq_show = cfqg_print_rwstat_recursive,
2103 },
2104 {
2105 .name = "io_merged_recursive",
2106 .private = offsetof(struct cfq_group, stats.merged),
2107 .seq_show = cfqg_print_rwstat_recursive,
2108 },
2109 {
2110 .name = "io_queued_recursive",
2111 .private = offsetof(struct cfq_group, stats.queued),
2112 .seq_show = cfqg_print_rwstat_recursive,
2113 },
2114#ifdef CONFIG_DEBUG_BLK_CGROUP
2115 {
2116 .name = "avg_queue_size",
2117 .seq_show = cfqg_print_avg_queue_size,
2118 },
2119 {
2120 .name = "group_wait_time",
2121 .private = offsetof(struct cfq_group, stats.group_wait_time),
2122 .seq_show = cfqg_print_stat,
2123 },
2124 {
2125 .name = "idle_time",
2126 .private = offsetof(struct cfq_group, stats.idle_time),
2127 .seq_show = cfqg_print_stat,
2128 },
2129 {
2130 .name = "empty_time",
2131 .private = offsetof(struct cfq_group, stats.empty_time),
2132 .seq_show = cfqg_print_stat,
2133 },
2134 {
2135 .name = "dequeue",
2136 .private = offsetof(struct cfq_group, stats.dequeue),
2137 .seq_show = cfqg_print_stat,
2138 },
2139 {
2140 .name = "unaccounted_time",
2141 .private = offsetof(struct cfq_group, stats.unaccounted_time),
2142 .seq_show = cfqg_print_stat,
2143 },
2144#endif /* CONFIG_DEBUG_BLK_CGROUP */
2145 { } /* terminate */
2146};
2147
2148static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
2149{
2150 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2151 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
2152
2153 seq_printf(sf, "default %u\n", cgd->weight);
2154 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
2155 &blkcg_policy_cfq, 0, false);
2156 return 0;
2157}
2158
2159static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
2160 char *buf, size_t nbytes, loff_t off)
2161{
2162 char *endp;
2163 int ret;
2164 u64 v;
2165
2166 buf = strim(buf);
2167
2168 /* "WEIGHT" or "default WEIGHT" sets the default weight */
2169 v = simple_strtoull(buf, &endp, 0);
2170 if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
2171 ret = __cfq_set_weight(of_css(of), v, true, false, false);
2172 return ret ?: nbytes;
2173 }
2174
2175 /* "MAJ:MIN WEIGHT" */
2176 return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
2177}
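
/*
 * Illustrative sketch (userspace): on the unified hierarchy the "weight" file
 * accepts a bare weight ("100"), "default 100", or a per-device value such as
 * "8:0 100". The toy classifier below only shows how those spellings can be
 * told apart; the names, enum and return handling are all invented.
 */
#include <stdio.h>
#include <stdlib.h>

enum toy_weight_kind { TOY_DEFAULT_WEIGHT, TOY_DEVICE_WEIGHT, TOY_BAD_INPUT };

static enum toy_weight_kind toy_classify_weight(const char *buf,
						unsigned long long *weight)
{
	unsigned int maj, min;
	char *end;

	*weight = strtoull(buf, &end, 0);
	if (*end == '\0')				/* "WEIGHT" */
		return TOY_DEFAULT_WEIGHT;
	if (sscanf(buf, "default %llu", weight) == 1)	/* "default WEIGHT" */
		return TOY_DEFAULT_WEIGHT;
	if (sscanf(buf, "%u:%u %llu", &maj, &min, weight) == 3)
		return TOY_DEVICE_WEIGHT;		/* "MAJ:MIN WEIGHT" */
	return TOY_BAD_INPUT;
}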
2178
2179static struct cftype cfq_blkcg_files[] = {
2180 {
2181 .name = "weight",
2182 .flags = CFTYPE_NOT_ON_ROOT,
2183 .seq_show = cfq_print_weight_on_dfl,
2184 .write = cfq_set_weight_on_dfl,
2185 },
2186 { } /* terminate */
2187};
2188
2189#else /* GROUP_IOSCHED */
2190static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
2191 struct blkcg *blkcg)
2192{
2193 return cfqd->root_group;
2194}
2195
2196static inline void
2197cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
2198 cfqq->cfqg = cfqg;
2199}
2200
2201#endif /* GROUP_IOSCHED */
2202
2203/*
2204 * The cfqd->service_trees holds all pending cfq_queue's that have
2205 * requests waiting to be processed. It is sorted in the order that
2206 * we will service the queues.
2207 */
2208static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2209 bool add_front)
2210{
2211 struct rb_node **p, *parent;
2212 struct cfq_queue *__cfqq;
2213 u64 rb_key;
2214 struct cfq_rb_root *st;
2215 bool leftmost = true;
2216 int new_cfqq = 1;
2217 u64 now = ktime_get_ns();
2218
2219 st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
2220 if (cfq_class_idle(cfqq)) {
2221 rb_key = CFQ_IDLE_DELAY;
2222 parent = st->rb_rightmost;
2223 if (parent && parent != &cfqq->rb_node) {
2224 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2225 rb_key += __cfqq->rb_key;
2226 } else
2227 rb_key += now;
2228 } else if (!add_front) {
2229 /*
2230 * Get our rb key offset. Subtract any residual slice
2231 * value carried from last service. A negative resid
2232 * count indicates slice overrun, and this should position
2233 * the next service time further away in the tree.
2234 */
2235 rb_key = cfq_slice_offset(cfqd, cfqq) + now;
2236 rb_key -= cfqq->slice_resid;
2237 cfqq->slice_resid = 0;
2238 } else {
2239 rb_key = -NSEC_PER_SEC;
2240 __cfqq = cfq_rb_first(st);
2241 rb_key += __cfqq ? __cfqq->rb_key : now;
2242 }
2243
2244 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2245 new_cfqq = 0;
2246 /*
2247 * same position, nothing more to do
2248 */
2249 if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
2250 return;
2251
2252 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2253 cfqq->service_tree = NULL;
2254 }
2255
2256 parent = NULL;
2257 cfqq->service_tree = st;
2258 p = &st->rb.rb_root.rb_node;
2259 while (*p) {
2260 parent = *p;
2261 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
2262
2263 /*
2264 * sort by key, that represents service time.
2265 */
2266 if (rb_key < __cfqq->rb_key)
2267 p = &parent->rb_left;
2268 else {
2269 p = &parent->rb_right;
2270 leftmost = false;
2271 }
2272 }
2273
2274 cfqq->rb_key = rb_key;
2275 rb_link_node(&cfqq->rb_node, parent, p);
2276 rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
2277 st->count++;
2278 if (add_front || !new_cfqq)
2279 return;
2280 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
2281}
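
/*
 * Illustrative sketch (userspace): the rb_key computed above is essentially a
 * service deadline in nanoseconds. Idle-class queues are keyed behind the
 * current rightmost entry, add_front queues ahead of the current first entry,
 * and everything else at "now" plus a priority offset minus any residual
 * slice credit. The constants and helper below are stand-ins, not the ones
 * used by this file.
 */
#include <stdbool.h>
#include <stdint.h>

#define TOY_IDLE_DELAY		5000000ull	/* stand-in for CFQ_IDLE_DELAY */
#define TOY_NSEC_PER_SEC	1000000000ull

static uint64_t toy_rb_key(bool idle_class, bool add_front, uint64_t now,
			   uint64_t rightmost_key, uint64_t leftmost_key,
			   uint64_t prio_offset, int64_t slice_resid)
{
	if (idle_class)		/* always served after everyone else */
		return (rightmost_key ? rightmost_key : now) + TOY_IDLE_DELAY;
	if (add_front)		/* jump ahead of the current head of the tree */
		return (leftmost_key ? leftmost_key : now) - TOY_NSEC_PER_SEC;
	/* normal case: later deadline for lower priority, minus unused slice */
	return now + prio_offset - (uint64_t)slice_resid;
}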
2282
2283static struct cfq_queue *
2284cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
2285 sector_t sector, struct rb_node **ret_parent,
2286 struct rb_node ***rb_link)
2287{
2288 struct rb_node **p, *parent;
2289 struct cfq_queue *cfqq = NULL;
2290
2291 parent = NULL;
2292 p = &root->rb_node;
2293 while (*p) {
2294 struct rb_node **n;
2295
2296 parent = *p;
2297 cfqq = rb_entry(parent, struct cfq_queue, p_node);
2298
2299 /*
2300 * Sort strictly based on sector. Smallest to the left,
2301 * largest to the right.
2302 */
2303 if (sector > blk_rq_pos(cfqq->next_rq))
2304 n = &(*p)->rb_right;
2305 else if (sector < blk_rq_pos(cfqq->next_rq))
2306 n = &(*p)->rb_left;
2307 else
2308 break;
2309 p = n;
2310 cfqq = NULL;
2311 }
2312
2313 *ret_parent = parent;
2314 if (rb_link)
2315 *rb_link = p;
2316 return cfqq;
2317}
2318
2319static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2320{
2321 struct rb_node **p, *parent;
2322 struct cfq_queue *__cfqq;
2323
2324 if (cfqq->p_root) {
2325 rb_erase(&cfqq->p_node, cfqq->p_root);
2326 cfqq->p_root = NULL;
2327 }
2328
2329 if (cfq_class_idle(cfqq))
2330 return;
2331 if (!cfqq->next_rq)
2332 return;
2333
2334 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
2335 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
2336 blk_rq_pos(cfqq->next_rq), &parent, &p);
2337 if (!__cfqq) {
2338 rb_link_node(&cfqq->p_node, parent, p);
2339 rb_insert_color(&cfqq->p_node, cfqq->p_root);
2340 } else
2341 cfqq->p_root = NULL;
2342}
2343
2344/*
2345 * Update cfqq's position in the service tree.
2346 */
2347static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2348{
2349 /*
2350 * Resorting requires the cfqq to be on the RR list already.
2351 */
2352 if (cfq_cfqq_on_rr(cfqq)) {
2353 cfq_service_tree_add(cfqd, cfqq, 0);
2354 cfq_prio_tree_add(cfqd, cfqq);
2355 }
2356}
2357
2358/*
2359 * add to busy list of queues for service, trying to be fair in ordering
2360 * the pending list according to last request service
2361 */
2362static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2363{
2364 cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
2365 BUG_ON(cfq_cfqq_on_rr(cfqq));
2366 cfq_mark_cfqq_on_rr(cfqq);
2367 cfqd->busy_queues++;
2368 if (cfq_cfqq_sync(cfqq))
2369 cfqd->busy_sync_queues++;
2370
2371 cfq_resort_rr_list(cfqd, cfqq);
2372}
2373
2374/*
2375 * Called when the cfqq no longer has requests pending, remove it from
2376 * the service tree.
2377 */
2378static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2379{
2380 cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
2381 BUG_ON(!cfq_cfqq_on_rr(cfqq));
2382 cfq_clear_cfqq_on_rr(cfqq);
2383
2384 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
2385 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
2386 cfqq->service_tree = NULL;
2387 }
2388 if (cfqq->p_root) {
2389 rb_erase(&cfqq->p_node, cfqq->p_root);
2390 cfqq->p_root = NULL;
2391 }
2392
2393 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
2394 BUG_ON(!cfqd->busy_queues);
2395 cfqd->busy_queues--;
2396 if (cfq_cfqq_sync(cfqq))
2397 cfqd->busy_sync_queues--;
2398}
2399
2400/*
2401 * rb tree support functions
2402 */
2403static void cfq_del_rq_rb(struct request *rq)
2404{
2405 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2406 const int sync = rq_is_sync(rq);
2407
2408 BUG_ON(!cfqq->queued[sync]);
2409 cfqq->queued[sync]--;
2410
2411 elv_rb_del(&cfqq->sort_list, rq);
2412
2413 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
2414 /*
2415 * Queue will be deleted from service tree when we actually
2416 * expire it later. Right now just remove it from prio tree
2417 * as it is empty.
2418 */
2419 if (cfqq->p_root) {
2420 rb_erase(&cfqq->p_node, cfqq->p_root);
2421 cfqq->p_root = NULL;
2422 }
2423 }
2424}
2425
2426static void cfq_add_rq_rb(struct request *rq)
2427{
2428 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2429 struct cfq_data *cfqd = cfqq->cfqd;
2430 struct request *prev;
2431
2432 cfqq->queued[rq_is_sync(rq)]++;
2433
2434 elv_rb_add(&cfqq->sort_list, rq);
2435
2436 if (!cfq_cfqq_on_rr(cfqq))
2437 cfq_add_cfqq_rr(cfqd, cfqq);
2438
2439 /*
2440 * check if this request is a better next-serve candidate
2441 */
2442 prev = cfqq->next_rq;
2443 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
2444
2445 /*
2446 * adjust priority tree position, if ->next_rq changes
2447 */
2448 if (prev != cfqq->next_rq)
2449 cfq_prio_tree_add(cfqd, cfqq);
2450
2451 BUG_ON(!cfqq->next_rq);
2452}
2453
2454static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
2455{
2456 elv_rb_del(&cfqq->sort_list, rq);
2457 cfqq->queued[rq_is_sync(rq)]--;
2458 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2459 cfq_add_rq_rb(rq);
2460 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
2461 rq->cmd_flags);
2462}
2463
2464static struct request *
2465cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
2466{
2467 struct task_struct *tsk = current;
2468 struct cfq_io_cq *cic;
2469 struct cfq_queue *cfqq;
2470
2471 cic = cfq_cic_lookup(cfqd, tsk->io_context);
2472 if (!cic)
2473 return NULL;
2474
2475 cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
2476 if (cfqq)
2477 return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
2478
2479 return NULL;
2480}
2481
2482static void cfq_activate_request(struct request_queue *q, struct request *rq)
2483{
2484 struct cfq_data *cfqd = q->elevator->elevator_data;
2485
2486 cfqd->rq_in_driver++;
2487 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
2488 cfqd->rq_in_driver);
2489
2490 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2491}
2492
2493static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
2494{
2495 struct cfq_data *cfqd = q->elevator->elevator_data;
2496
2497 WARN_ON(!cfqd->rq_in_driver);
2498 cfqd->rq_in_driver--;
2499 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
2500 cfqd->rq_in_driver);
2501}
2502
2503static void cfq_remove_request(struct request *rq)
2504{
2505 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2506
2507 if (cfqq->next_rq == rq)
2508 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
2509
2510 list_del_init(&rq->queuelist);
2511 cfq_del_rq_rb(rq);
2512
2513 cfqq->cfqd->rq_queued--;
2514 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
2515 if (rq->cmd_flags & REQ_PRIO) {
2516 WARN_ON(!cfqq->prio_pending);
2517 cfqq->prio_pending--;
2518 }
2519}
2520
2521static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
2522 struct bio *bio)
2523{
2524 struct cfq_data *cfqd = q->elevator->elevator_data;
2525 struct request *__rq;
2526
2527 __rq = cfq_find_rq_fmerge(cfqd, bio);
2528 if (__rq && elv_bio_merge_ok(__rq, bio)) {
2529 *req = __rq;
2530 return ELEVATOR_FRONT_MERGE;
2531 }
2532
2533 return ELEVATOR_NO_MERGE;
2534}
2535
2536static void cfq_merged_request(struct request_queue *q, struct request *req,
2537 enum elv_merge type)
2538{
2539 if (type == ELEVATOR_FRONT_MERGE) {
2540 struct cfq_queue *cfqq = RQ_CFQQ(req);
2541
2542 cfq_reposition_rq_rb(cfqq, req);
2543 }
2544}
2545
2546static void cfq_bio_merged(struct request_queue *q, struct request *req,
2547 struct bio *bio)
2548{
2549 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
2550}
2551
2552static void
2553cfq_merged_requests(struct request_queue *q, struct request *rq,
2554 struct request *next)
2555{
2556 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2557 struct cfq_data *cfqd = q->elevator->elevator_data;
2558
2559 /*
2560 * reposition in fifo if next is older than rq
2561 */
2562 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2563 next->fifo_time < rq->fifo_time &&
2564 cfqq == RQ_CFQQ(next)) {
2565 list_move(&rq->queuelist, &next->queuelist);
2566 rq->fifo_time = next->fifo_time;
2567 }
2568
2569 if (cfqq->next_rq == next)
2570 cfqq->next_rq = rq;
2571 cfq_remove_request(next);
2572 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
2573
2574 cfqq = RQ_CFQQ(next);
2575 /*
2576 * all requests of this queue are merged into other queues, so delete
2577 * it from the service tree. If it's the active_queue,
2578 * cfq_dispatch_requests() will choose whether to expire it or idle.
2579 */
2580 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
2581 cfqq != cfqd->active_queue)
2582 cfq_del_cfqq_rr(cfqd, cfqq);
2583}
2584
2585static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2586 struct bio *bio)
2587{
2588 struct cfq_data *cfqd = q->elevator->elevator_data;
2589 bool is_sync = op_is_sync(bio->bi_opf);
2590 struct cfq_io_cq *cic;
2591 struct cfq_queue *cfqq;
2592
2593 /*
2594 * Disallow merge of a sync bio into an async request.
2595 */
2596 if (is_sync && !rq_is_sync(rq))
2597 return false;
2598
2599 /*
2600 * Lookup the cfqq that this bio will be queued with and allow
2601 * merge only if rq is queued there.
2602 */
2603 cic = cfq_cic_lookup(cfqd, current->io_context);
2604 if (!cic)
2605 return false;
2606
2607 cfqq = cic_to_cfqq(cic, is_sync);
2608 return cfqq == RQ_CFQQ(rq);
2609}
2610
2611static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
2612 struct request *next)
2613{
2614 return RQ_CFQQ(rq) == RQ_CFQQ(next);
2615}
2616
2617static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2618{
2619 hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
2620 cfqg_stats_update_idle_time(cfqq->cfqg);
2621}
2622
2623static void __cfq_set_active_queue(struct cfq_data *cfqd,
2624 struct cfq_queue *cfqq)
2625{
2626 if (cfqq) {
2627 cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
2628 cfqd->serving_wl_class, cfqd->serving_wl_type);
2629 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
2630 cfqq->slice_start = 0;
2631 cfqq->dispatch_start = ktime_get_ns();
2632 cfqq->allocated_slice = 0;
2633 cfqq->slice_end = 0;
2634 cfqq->slice_dispatch = 0;
2635 cfqq->nr_sectors = 0;
2636
2637 cfq_clear_cfqq_wait_request(cfqq);
2638 cfq_clear_cfqq_must_dispatch(cfqq);
2639 cfq_clear_cfqq_must_alloc_slice(cfqq);
2640 cfq_clear_cfqq_fifo_expire(cfqq);
2641 cfq_mark_cfqq_slice_new(cfqq);
2642
2643 cfq_del_timer(cfqd, cfqq);
2644 }
2645
2646 cfqd->active_queue = cfqq;
2647}
2648
2649/*
2650 * current cfqq expired its slice (or was too idle), select new one
2651 */
2652static void
2653__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2654 bool timed_out)
2655{
2656 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
2657
2658 if (cfq_cfqq_wait_request(cfqq))
2659 cfq_del_timer(cfqd, cfqq);
2660
2661 cfq_clear_cfqq_wait_request(cfqq);
2662 cfq_clear_cfqq_wait_busy(cfqq);
2663
2664 /*
2665 * If this cfqq is shared between multiple processes, check to
2666 * make sure that those processes are still issuing I/Os within
2667 * the mean seek distance. If not, it may be time to break the
2668 * queues apart again.
2669 */
2670 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
2671 cfq_mark_cfqq_split_coop(cfqq);
2672
2673 /*
2674 * store what was left of this slice, if the queue idled/timed out
2675 */
2676 if (timed_out) {
2677 if (cfq_cfqq_slice_new(cfqq))
2678 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
2679 else
2680 cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
2681 cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
2682 }
2683
2684 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
2685
2686 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
2687 cfq_del_cfqq_rr(cfqd, cfqq);
2688
2689 cfq_resort_rr_list(cfqd, cfqq);
2690
2691 if (cfqq == cfqd->active_queue)
2692 cfqd->active_queue = NULL;
2693
2694 if (cfqd->active_cic) {
2695 put_io_context(cfqd->active_cic->icq.ioc);
2696 cfqd->active_cic = NULL;
2697 }
2698}
2699
2700static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
2701{
2702 struct cfq_queue *cfqq = cfqd->active_queue;
2703
2704 if (cfqq)
2705 __cfq_slice_expired(cfqd, cfqq, timed_out);
2706}
2707
2708/*
2709 * Get next queue for service. Unless we have a queue preemption,
2710 * we'll simply select the first cfqq in the service tree.
2711 */
2712static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
2713{
2714 struct cfq_rb_root *st = st_for(cfqd->serving_group,
2715 cfqd->serving_wl_class, cfqd->serving_wl_type);
2716
2717 if (!cfqd->rq_queued)
2718 return NULL;
2719
2720 /* There is nothing to dispatch */
2721 if (!st)
2722 return NULL;
2723 if (RB_EMPTY_ROOT(&st->rb.rb_root))
2724 return NULL;
2725 return cfq_rb_first(st);
2726}
2727
2728static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
2729{
2730 struct cfq_group *cfqg;
2731 struct cfq_queue *cfqq;
2732 int i, j;
2733 struct cfq_rb_root *st;
2734
2735 if (!cfqd->rq_queued)
2736 return NULL;
2737
2738 cfqg = cfq_get_next_cfqg(cfqd);
2739 if (!cfqg)
2740 return NULL;
2741
2742 for_each_cfqg_st(cfqg, i, j, st) {
2743 cfqq = cfq_rb_first(st);
2744 if (cfqq)
2745 return cfqq;
2746 }
2747 return NULL;
2748}
2749
2750/*
2751 * Get and set a new active queue for service.
2752 */
2753static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
2754 struct cfq_queue *cfqq)
2755{
2756 if (!cfqq)
2757 cfqq = cfq_get_next_queue(cfqd);
2758
2759 __cfq_set_active_queue(cfqd, cfqq);
2760 return cfqq;
2761}
2762
2763static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
2764 struct request *rq)
2765{
2766 if (blk_rq_pos(rq) >= cfqd->last_position)
2767 return blk_rq_pos(rq) - cfqd->last_position;
2768 else
2769 return cfqd->last_position - blk_rq_pos(rq);
2770}
2771
2772static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2773 struct request *rq)
2774{
2775 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
2776}
2777
2778static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
2779 struct cfq_queue *cur_cfqq)
2780{
2781 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
2782 struct rb_node *parent, *node;
2783 struct cfq_queue *__cfqq;
2784 sector_t sector = cfqd->last_position;
2785
2786 if (RB_EMPTY_ROOT(root))
2787 return NULL;
2788
2789 /*
2790 * First, if we find a request starting at the end of the last
2791 * request, choose it.
2792 */
2793 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
2794 if (__cfqq)
2795 return __cfqq;
2796
2797 /*
2798 * If the exact sector wasn't found, the parent of the NULL leaf
2799 * will contain the closest sector.
2800 */
2801 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
2802 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2803 return __cfqq;
2804
2805 if (blk_rq_pos(__cfqq->next_rq) < sector)
2806 node = rb_next(&__cfqq->p_node);
2807 else
2808 node = rb_prev(&__cfqq->p_node);
2809 if (!node)
2810 return NULL;
2811
2812 __cfqq = rb_entry(node, struct cfq_queue, p_node);
2813 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
2814 return __cfqq;
2815
2816 return NULL;
2817}
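
/*
 * Illustrative sketch (userspace): cfqq_close() is a nearest-neighbour query,
 * looking for a queue whose next request starts at, or close to, the last
 * dispatched sector. The toy below does the same thing over a sorted array of
 * sector positions with a binary search plus a look at the two neighbouring
 * entries; the 8-sector threshold is a stand-in for CFQQ_CLOSE_THR.
 */
#include <stddef.h>
#include <stdint.h>

#define TOY_CLOSE_THR	8u

static size_t toy_lower_bound(const uint64_t *pos, size_t n, uint64_t sector)
{
	size_t lo = 0, hi = n;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (pos[mid] < sector)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;		/* first index with pos[lo] >= sector */
}

/* returns index of a "close" entry, or -1 when nothing is near enough */
static long toy_find_close(const uint64_t *pos, size_t n, uint64_t sector)
{
	size_t i = toy_lower_bound(pos, n, sector);

	if (i < n && pos[i] - sector <= TOY_CLOSE_THR)
		return (long)i;			/* exact match or just ahead */
	if (i > 0 && sector - pos[i - 1] <= TOY_CLOSE_THR)
		return (long)(i - 1);		/* just behind */
	return -1;
}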
2818
2819/*
2820 * cfqd - obvious
2821 * cur_cfqq - passed in so that we don't decide that the current queue is
2822 * closely cooperating with itself.
2823 *
2824 * So, basically we're assuming that cur_cfqq has dispatched at least
2825 * one request, and that cfqd->last_position reflects a position on the disk
2826 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
2827 * assumption.
2828 */
2829static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
2830 struct cfq_queue *cur_cfqq)
2831{
2832 struct cfq_queue *cfqq;
2833
2834 if (cfq_class_idle(cur_cfqq))
2835 return NULL;
2836 if (!cfq_cfqq_sync(cur_cfqq))
2837 return NULL;
2838 if (CFQQ_SEEKY(cur_cfqq))
2839 return NULL;
2840
2841 /*
2842 * Don't search priority tree if it's the only queue in the group.
2843 */
2844 if (cur_cfqq->cfqg->nr_cfqq == 1)
2845 return NULL;
2846
2847 /*
2848 * We should notice if some of the queues are cooperating, e.g.
2849 * working closely on the same area of the disk. In that case,
2850 * we can group them together and not waste time idling.
2851 */
2852 cfqq = cfqq_close(cfqd, cur_cfqq);
2853 if (!cfqq)
2854 return NULL;
2855
2856 /* If new queue belongs to different cfq_group, don't choose it */
2857 if (cur_cfqq->cfqg != cfqq->cfqg)
2858 return NULL;
2859
2860 /*
2861 * It only makes sense to merge sync queues.
2862 */
2863 if (!cfq_cfqq_sync(cfqq))
2864 return NULL;
2865 if (CFQQ_SEEKY(cfqq))
2866 return NULL;
2867
2868 /*
2869 * Do not merge queues of different priority classes
2870 */
2871 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
2872 return NULL;
2873
2874 return cfqq;
2875}
2876
2877/*
2878 * Determine whether we should enforce idle window for this queue.
2879 */
2880
2881static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2882{
2883 enum wl_class_t wl_class = cfqq_class(cfqq);
2884 struct cfq_rb_root *st = cfqq->service_tree;
2885
2886 BUG_ON(!st);
2887 BUG_ON(!st->count);
2888
2889 if (!cfqd->cfq_slice_idle)
2890 return false;
2891
2892 /* We never do for idle class queues. */
2893 if (wl_class == IDLE_WORKLOAD)
2894 return false;
2895
2896 /* We do for queues that were marked with idle window flag. */
2897 if (cfq_cfqq_idle_window(cfqq) &&
2898 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
2899 return true;
2900
2901 /*
2902 * Otherwise, we idle only if the queue is the last one
2903 * in its service tree.
2904 */
2905 if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
2906 !cfq_io_thinktime_big(cfqd, &st->ttime, false))
2907 return true;
2908 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
2909 return false;
2910}
2911
2912static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2913{
2914 struct cfq_queue *cfqq = cfqd->active_queue;
2915 struct cfq_rb_root *st = cfqq->service_tree;
2916 struct cfq_io_cq *cic;
2917 u64 sl, group_idle = 0;
2918 u64 now = ktime_get_ns();
2919
2920 /*
2921 * SSD device without seek penalty, disable idling. But only do so
2922 * for devices that support queuing, otherwise we still have a problem
2923 * with sync vs async workloads.
2924 */
2925 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
2926 !cfqd->cfq_group_idle)
2927 return;
2928
2929 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
2930 WARN_ON(cfq_cfqq_slice_new(cfqq));
2931
2932 /*
2933 * idle is disabled, either manually or by past process history
2934 */
2935 if (!cfq_should_idle(cfqd, cfqq)) {
2936 /* no queue idling. Check for group idling */
2937 if (cfqd->cfq_group_idle)
2938 group_idle = cfqd->cfq_group_idle;
2939 else
2940 return;
2941 }
2942
2943 /*
2944 * still active requests from this queue, don't idle
2945 */
2946 if (cfqq->dispatched)
2947 return;
2948
2949 /*
2950 * task has exited, don't wait
2951 */
2952 cic = cfqd->active_cic;
2953 if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2954 return;
2955
2956 /*
2957 * If our average think time is larger than the remaining time
2958 * slice, then don't idle. This avoids overrunning the allotted
2959 * time slice.
2960 */
2961 if (sample_valid(cic->ttime.ttime_samples) &&
2962 (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
2963 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
2964 cic->ttime.ttime_mean);
2965 return;
2966 }
2967
2968 /*
2969 * If there are other queues in the group, or this is the only group
2970 * and its thinktime is too big, don't do group idle.
2971 */
2972 if (group_idle &&
2973 (cfqq->cfqg->nr_cfqq > 1 ||
2974 cfq_io_thinktime_big(cfqd, &st->ttime, true)))
2975 return;
2976
2977 cfq_mark_cfqq_wait_request(cfqq);
2978
2979 if (group_idle)
2980 sl = cfqd->cfq_group_idle;
2981 else
2982 sl = cfqd->cfq_slice_idle;
2983
2984 hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
2985 HRTIMER_MODE_REL);
2986 cfqg_stats_set_start_idle_time(cfqq->cfqg);
2987 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
2988 group_idle ? 1 : 0);
2989}
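
/*
 * Illustrative sketch: the think-time guard above only arms the idle timer
 * when the task's average think time fits into what is left of the slice.
 * A minimal restatement with invented names:
 */
#include <stdbool.h>
#include <stdint.h>

static bool toy_worth_idling(uint64_t slice_end, uint64_t now,
			     uint64_t think_time_mean)
{
	if (now >= slice_end)
		return false;			/* slice is already gone */
	return slice_end - now >= think_time_mean;
}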
2990
2991/*
2992 * Move request from internal lists to the request queue dispatch list.
2993 */
2994static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2995{
2996 struct cfq_data *cfqd = q->elevator->elevator_data;
2997 struct cfq_queue *cfqq = RQ_CFQQ(rq);
2998
2999 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
3000
3001 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
3002 cfq_remove_request(rq);
3003 cfqq->dispatched++;
3004 (RQ_CFQG(rq))->dispatched++;
3005 elv_dispatch_sort(q, rq);
3006
3007 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
3008 cfqq->nr_sectors += blk_rq_sectors(rq);
3009}
3010
3011/*
3012 * return expired entry, or NULL to just start from scratch in rbtree
3013 */
3014static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
3015{
3016 struct request *rq = NULL;
3017
3018 if (cfq_cfqq_fifo_expire(cfqq))
3019 return NULL;
3020
3021 cfq_mark_cfqq_fifo_expire(cfqq);
3022
3023 if (list_empty(&cfqq->fifo))
3024 return NULL;
3025
3026 rq = rq_entry_fifo(cfqq->fifo.next);
3027 if (ktime_get_ns() < rq->fifo_time)
3028 rq = NULL;
3029
3030 return rq;
3031}
3032
3033static inline int
3034cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3035{
3036 const int base_rq = cfqd->cfq_slice_async_rq;
3037
3038 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
3039
3040 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
3041}
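
/*
 * Illustrative sketch: with the usual defaults (cfq_slice_async_rq == 2 and
 * IOPRIO_BE_NR == 8 best-effort levels) the formula above gives an ioprio-4
 * queue 2 * 2 * (8 - 4) = 16 requests per slice and an ioprio-0 queue 32.
 * The helper below just restates that arithmetic outside the kernel types.
 */
static inline int toy_prio_to_maxrq(int base_rq, int be_levels, int ioprio)
{
	return 2 * base_rq * (be_levels - ioprio);
}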
3042
3043/*
3044 * Must be called with the queue_lock held.
3045 */
3046static int cfqq_process_refs(struct cfq_queue *cfqq)
3047{
3048 int process_refs, io_refs;
3049
3050 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
3051 process_refs = cfqq->ref - io_refs;
3052 BUG_ON(process_refs < 0);
3053 return process_refs;
3054}
3055
3056static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
3057{
3058 int process_refs, new_process_refs;
3059 struct cfq_queue *__cfqq;
3060
3061 /*
3062 * If there are no process references on the new_cfqq, then it is
3063 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
3064 * chain may have dropped their last reference (not just their
3065 * last process reference).
3066 */
3067 if (!cfqq_process_refs(new_cfqq))
3068 return;
3069
3070 /* Avoid a circular list and skip interim queue merges */
3071 while ((__cfqq = new_cfqq->new_cfqq)) {
3072 if (__cfqq == cfqq)
3073 return;
3074 new_cfqq = __cfqq;
3075 }
3076
3077 process_refs = cfqq_process_refs(cfqq);
3078 new_process_refs = cfqq_process_refs(new_cfqq);
3079 /*
3080 * If the process for the cfqq has gone away, there is no
3081 * sense in merging the queues.
3082 */
3083 if (process_refs == 0 || new_process_refs == 0)
3084 return;
3085
3086 /*
3087 * Merge in the direction of the lesser amount of work.
3088 */
3089 if (new_process_refs >= process_refs) {
3090 cfqq->new_cfqq = new_cfqq;
3091 new_cfqq->ref += process_refs;
3092 } else {
3093 new_cfqq->new_cfqq = cfqq;
3094 cfqq->ref += new_process_refs;
3095 }
3096}
3097
3098static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
3099 struct cfq_group *cfqg, enum wl_class_t wl_class)
3100{
3101 struct cfq_queue *queue;
3102 int i;
3103 bool key_valid = false;
3104 u64 lowest_key = 0;
3105 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
3106
3107 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
3108 /* select the one with lowest rb_key */
3109 queue = cfq_rb_first(st_for(cfqg, wl_class, i));
3110 if (queue &&
3111 (!key_valid || queue->rb_key < lowest_key)) {
3112 lowest_key = queue->rb_key;
3113 cur_best = i;
3114 key_valid = true;
3115 }
3116 }
3117
3118 return cur_best;
3119}
3120
3121static void
3122choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
3123{
3124 u64 slice;
3125 unsigned count;
3126 struct cfq_rb_root *st;
3127 u64 group_slice;
3128 enum wl_class_t original_class = cfqd->serving_wl_class;
3129 u64 now = ktime_get_ns();
3130
3131 /* Choose next priority. RT > BE > IDLE */
3132 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
3133 cfqd->serving_wl_class = RT_WORKLOAD;
3134 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
3135 cfqd->serving_wl_class = BE_WORKLOAD;
3136 else {
3137 cfqd->serving_wl_class = IDLE_WORKLOAD;
3138 cfqd->workload_expires = now + jiffies_to_nsecs(1);
3139 return;
3140 }
3141
3142 if (original_class != cfqd->serving_wl_class)
3143 goto new_workload;
3144
3145 /*
3146 * For RT and BE, we have to choose also the type
3147 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
3148 * expiration time
3149 */
3150 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3151 count = st->count;
3152
3153 /*
3154 * check workload expiration, and that we still have other queues ready
3155 */
3156 if (count && !(now > cfqd->workload_expires))
3157 return;
3158
3159new_workload:
3160 /* otherwise select new workload type */
3161 cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
3162 cfqd->serving_wl_class);
3163 st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
3164 count = st->count;
3165
3166 /*
3167 * the workload slice is computed as a fraction of target latency
3168 * proportional to the number of queues in that workload, over
3169 * all the queues in the same priority class
3170 */
3171 group_slice = cfq_group_slice(cfqd, cfqg);
3172
3173 slice = div_u64(group_slice * count,
3174 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
3175 cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
3176 cfqg)));
3177
3178 if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
3179 u64 tmp;
3180
3181 /*
3182 * Async queues are currently system wide. Just taking the
3183 * proportion of queues within the same group will lead to a
3184 * higher async ratio system wide, as the root group generally
3185 * has a higher weight. A more accurate approach would be to
3186 * calculate the system wide async/sync ratio.
3187 */
3188 tmp = cfqd->cfq_target_latency *
3189 cfqg_busy_async_queues(cfqd, cfqg);
3190 tmp = div_u64(tmp, cfqd->busy_queues);
3191 slice = min_t(u64, slice, tmp);
3192
3193 /* async workload slice is scaled down according to
3194 * the sync/async slice ratio. */
3195 slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
3196 } else
3197 /* sync workload slice is at least 2 * cfq_slice_idle */
3198 slice = max(slice, 2 * cfqd->cfq_slice_idle);
3199
3200 slice = max_t(u64, slice, CFQ_MIN_TT);
3201 cfq_log(cfqd, "workload slice:%llu", slice);
3202 cfqd->workload_expires = now + slice;
3203}
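
/*
 * Illustrative sketch (userspace): the workload slice above is a share of the
 * group's slice proportional to how many of its busy queues belong to the
 * chosen workload type. The toy below keeps only that proportion and the
 * "at least twice the idle window" floor for sync workloads; the async
 * scaling and the busy-queue averaging of the original are left out, and the
 * example numbers are made up.
 */
#include <stdint.h>

static uint64_t toy_workload_slice(uint64_t group_slice_ns,
				   unsigned int type_count,
				   unsigned int class_count,
				   uint64_t slice_idle_ns)
{
	uint64_t slice = group_slice_ns * type_count / class_count;

	if (slice < 2 * slice_idle_ns)
		slice = 2 * slice_idle_ns;
	return slice;
}

/* e.g. toy_workload_slice(300000000, 2, 6, 8000000) == 100000000, i.e. 100ms */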
3204
3205static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
3206{
3207 struct cfq_rb_root *st = &cfqd->grp_service_tree;
3208 struct cfq_group *cfqg;
3209
3210 if (RB_EMPTY_ROOT(&st->rb.rb_root))
3211 return NULL;
3212 cfqg = cfq_rb_first_group(st);
3213 update_min_vdisktime(st);
3214 return cfqg;
3215}
3216
3217static void cfq_choose_cfqg(struct cfq_data *cfqd)
3218{
3219 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
3220 u64 now = ktime_get_ns();
3221
3222 cfqd->serving_group = cfqg;
3223
3224 /* Restore the workload type data */
3225 if (cfqg->saved_wl_slice) {
3226 cfqd->workload_expires = now + cfqg->saved_wl_slice;
3227 cfqd->serving_wl_type = cfqg->saved_wl_type;
3228 cfqd->serving_wl_class = cfqg->saved_wl_class;
3229 } else
3230 cfqd->workload_expires = now - 1;
3231
3232 choose_wl_class_and_type(cfqd, cfqg);
3233}
3234
3235/*
3236 * Select a queue for service. If we have a current active queue,
3237 * check whether to continue servicing it, or retrieve and set a new one.
3238 */
3239static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
3240{
3241 struct cfq_queue *cfqq, *new_cfqq = NULL;
3242 u64 now = ktime_get_ns();
3243
3244 cfqq = cfqd->active_queue;
3245 if (!cfqq)
3246 goto new_queue;
3247
3248 if (!cfqd->rq_queued)
3249 return NULL;
3250
3251 /*
3252 * We were waiting for group to get backlogged. Expire the queue
3253 */
3254 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
3255 goto expire;
3256
3257 /*
3258 * The active queue has run out of time, expire it and select new.
3259 */
3260 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
3261 /*
3262 * If slice had not expired at the completion of last request
3263 * we might not have turned on wait_busy flag. Don't expire
3264 * the queue yet. Allow the group to get backlogged.
3265 *
3266		 * The very fact that we have used the slice means we
3267		 * have been idling all along on this queue and it should be
3268		 * OK to wait for this request to complete.
3269 */
3270 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
3271 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3272 cfqq = NULL;
3273 goto keep_queue;
3274 } else
3275 goto check_group_idle;
3276 }
3277
3278 /*
3279 * The active queue has requests and isn't expired, allow it to
3280 * dispatch.
3281 */
3282 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3283 goto keep_queue;
3284
3285 /*
3286 * If another queue has a request waiting within our mean seek
3287 * distance, let it run. The expire code will check for close
3288 * cooperators and put the close queue at the front of the service
3289 * tree. If possible, merge the expiring queue with the new cfqq.
3290 */
3291 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
3292 if (new_cfqq) {
3293 if (!cfqq->new_cfqq)
3294 cfq_setup_merge(cfqq, new_cfqq);
3295 goto expire;
3296 }
3297
3298 /*
3299 * No requests pending. If the active queue still has requests in
3300 * flight or is idling for a new request, allow either of these
3301 * conditions to happen (or time out) before selecting a new queue.
3302 */
3303 if (hrtimer_active(&cfqd->idle_slice_timer)) {
3304 cfqq = NULL;
3305 goto keep_queue;
3306 }
3307
3308 /*
3309 * This is a deep seek queue, but the device is much faster than
3310	 * the queue can deliver, so don't idle.
3311	 */
3312 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
3313 (cfq_cfqq_slice_new(cfqq) ||
3314 (cfqq->slice_end - now > now - cfqq->slice_start))) {
3315 cfq_clear_cfqq_deep(cfqq);
3316 cfq_clear_cfqq_idle_window(cfqq);
3317 }
3318
3319 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
3320 cfqq = NULL;
3321 goto keep_queue;
3322 }
3323
3324 /*
3325 * If group idle is enabled and there are requests dispatched from
3326 * this group, wait for requests to complete.
3327 */
3328check_group_idle:
3329 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
3330 cfqq->cfqg->dispatched &&
3331 !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
3332 cfqq = NULL;
3333 goto keep_queue;
3334 }
3335
3336expire:
3337 cfq_slice_expired(cfqd, 0);
3338new_queue:
3339 /*
3340 * Current queue expired. Check if we have to switch to a new
3341 * service tree
3342 */
3343 if (!new_cfqq)
3344 cfq_choose_cfqg(cfqd);
3345
3346 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
3347keep_queue:
3348 return cfqq;
3349}
3350
3351static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
3352{
3353 int dispatched = 0;
3354
3355 while (cfqq->next_rq) {
3356 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
3357 dispatched++;
3358 }
3359
3360 BUG_ON(!list_empty(&cfqq->fifo));
3361
3362 /* By default cfqq is not expired if it is empty. Do it explicitly */
3363 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
3364 return dispatched;
3365}
3366
3367/*
3368 * Drain our current requests. Used for barriers and when switching
3369 * io schedulers on-the-fly.
3370 */
3371static int cfq_forced_dispatch(struct cfq_data *cfqd)
3372{
3373 struct cfq_queue *cfqq;
3374 int dispatched = 0;
3375
3376 /* Expire the timeslice of the current active queue first */
3377 cfq_slice_expired(cfqd, 0);
3378 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
3379 __cfq_set_active_queue(cfqd, cfqq);
3380 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
3381 }
3382
3383 BUG_ON(cfqd->busy_queues);
3384
3385 cfq_log(cfqd, "forced_dispatch=%d", dispatched);
3386 return dispatched;
3387}
3388
3389static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
3390 struct cfq_queue *cfqq)
3391{
3392 u64 now = ktime_get_ns();
3393
3394 /* the queue hasn't finished any request, can't estimate */
3395 if (cfq_cfqq_slice_new(cfqq))
3396 return true;
3397 if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
3398 return true;
3399
3400 return false;
3401}
3402
3403static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3404{
3405 unsigned int max_dispatch;
3406
3407 if (cfq_cfqq_must_dispatch(cfqq))
3408 return true;
3409
3410 /*
3411 * Drain async requests before we start sync IO
3412 */
3413 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
3414 return false;
3415
3416 /*
3417 * If this is an async queue and we have sync IO in flight, let it wait
3418 */
3419 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
3420 return false;
3421
3422 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
3423 if (cfq_class_idle(cfqq))
3424 max_dispatch = 1;
3425
3426 /*
3427 * Does this cfqq already have too much IO in flight?
3428 */
3429 if (cfqq->dispatched >= max_dispatch) {
3430 bool promote_sync = false;
3431 /*
3432 * idle queue must always only have a single IO in flight
3433 */
3434 if (cfq_class_idle(cfqq))
3435 return false;
3436
3437 /*
3438		 * If there is only one sync queue,
3439		 * we can ignore the async queue here and give the sync
3440		 * queue no dispatch limit. Since a sync queue can
3441		 * preempt an async queue, limiting the sync queue doesn't make
3442		 * sense. This is useful for the aiostress test.
3443 */
3444 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
3445 promote_sync = true;
3446
3447 /*
3448 * We have other queues, don't allow more IO from this one
3449 */
3450 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
3451 !promote_sync)
3452 return false;
3453
3454 /*
3455 * Sole queue user, no limit
3456 */
3457 if (cfqd->busy_queues == 1 || promote_sync)
3458 max_dispatch = -1;
3459 else
3460 /*
3461 * Normally we start throttling cfqq when cfq_quantum/2
3462 * requests have been dispatched. But we can drive
3463			 * deeper queue depths at the beginning of the slice,
3464			 * subject to the upper limit of cfq_quantum.
3465			 */
3466 max_dispatch = cfqd->cfq_quantum;
3467 }
3468
3469 /*
3470	 * Async queues must wait a bit before being allowed to dispatch.
3471 * We also ramp up the dispatch depth gradually for async IO,
3472 * based on the last sync IO we serviced
3473 */
3474 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
3475 u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
3476 unsigned int depth;
3477
3478 depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
3479 if (!depth && !cfqq->dispatched)
3480 depth = 1;
3481 if (depth < max_dispatch)
3482 max_dispatch = depth;
3483 }
3484
3485 /*
3486 * If we're below the current max, allow a dispatch
3487 */
3488 return cfqq->dispatched < max_dispatch;
3489}
3490
3491/*
3492 * Dispatch a request from cfqq, moving it to the request queue
3493 * dispatch list.
3494 */
3495static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3496{
3497 struct request *rq;
3498
3499 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
3500
3501 rq = cfq_check_fifo(cfqq);
3502 if (rq)
3503 cfq_mark_cfqq_must_dispatch(cfqq);
3504
3505 if (!cfq_may_dispatch(cfqd, cfqq))
3506 return false;
3507
3508 /*
3509	 * follow the expired path, else get the next available request
3510 */
3511 if (!rq)
3512 rq = cfqq->next_rq;
3513 else
3514 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
3515
3516 /*
3517 * insert request into driver dispatch list
3518 */
3519 cfq_dispatch_insert(cfqd->queue, rq);
3520
3521 if (!cfqd->active_cic) {
3522 struct cfq_io_cq *cic = RQ_CIC(rq);
3523
3524 atomic_long_inc(&cic->icq.ioc->refcount);
3525 cfqd->active_cic = cic;
3526 }
3527
3528 return true;
3529}
3530
3531/*
3532 * Find the cfqq that we need to service and move a request from that to the
3533 * dispatch list
3534 */
3535static int cfq_dispatch_requests(struct request_queue *q, int force)
3536{
3537 struct cfq_data *cfqd = q->elevator->elevator_data;
3538 struct cfq_queue *cfqq;
3539
3540 if (!cfqd->busy_queues)
3541 return 0;
3542
3543 if (unlikely(force))
3544 return cfq_forced_dispatch(cfqd);
3545
3546 cfqq = cfq_select_queue(cfqd);
3547 if (!cfqq)
3548 return 0;
3549
3550 /*
3551 * Dispatch a request from this cfqq, if it is allowed
3552 */
3553 if (!cfq_dispatch_request(cfqd, cfqq))
3554 return 0;
3555
3556 cfqq->slice_dispatch++;
3557 cfq_clear_cfqq_must_dispatch(cfqq);
3558
3559 /*
3560	 * Expire an async queue immediately if it has used up its slice. An idle
3561	 * queue always expires after 1 dispatch round.
3562 */
3563 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
3564 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
3565 cfq_class_idle(cfqq))) {
3566 cfqq->slice_end = ktime_get_ns() + 1;
3567 cfq_slice_expired(cfqd, 0);
3568 }
3569
3570 cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
3571 return 1;
3572}
3573
3574/*
3575 * The task holds one reference to the queue, dropped when the task exits. Each
3576 * rq in flight on this queue also holds a reference, dropped when the rq is freed.
3577 *
3578 * Each cfq queue took a reference on the parent group. Drop it now.
3579 * queue lock must be held here.
3580 */
3581static void cfq_put_queue(struct cfq_queue *cfqq)
3582{
3583 struct cfq_data *cfqd = cfqq->cfqd;
3584 struct cfq_group *cfqg;
3585
3586 BUG_ON(cfqq->ref <= 0);
3587
3588 cfqq->ref--;
3589 if (cfqq->ref)
3590 return;
3591
3592 cfq_log_cfqq(cfqd, cfqq, "put_queue");
3593 BUG_ON(rb_first(&cfqq->sort_list));
3594 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
3595 cfqg = cfqq->cfqg;
3596
3597 if (unlikely(cfqd->active_queue == cfqq)) {
3598 __cfq_slice_expired(cfqd, cfqq, 0);
3599 cfq_schedule_dispatch(cfqd);
3600 }
3601
3602 BUG_ON(cfq_cfqq_on_rr(cfqq));
3603 kmem_cache_free(cfq_pool, cfqq);
3604 cfqg_put(cfqg);
3605}
3606
3607static void cfq_put_cooperator(struct cfq_queue *cfqq)
3608{
3609 struct cfq_queue *__cfqq, *next;
3610
3611 /*
3612 * If this queue was scheduled to merge with another queue, be
3613 * sure to drop the reference taken on that queue (and others in
3614 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
3615 */
3616 __cfqq = cfqq->new_cfqq;
3617 while (__cfqq) {
3618 if (__cfqq == cfqq) {
3619 WARN(1, "cfqq->new_cfqq loop detected\n");
3620 break;
3621 }
3622 next = __cfqq->new_cfqq;
3623 cfq_put_queue(__cfqq);
3624 __cfqq = next;
3625 }
3626}
3627
3628static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3629{
3630 if (unlikely(cfqq == cfqd->active_queue)) {
3631 __cfq_slice_expired(cfqd, cfqq, 0);
3632 cfq_schedule_dispatch(cfqd);
3633 }
3634
3635 cfq_put_cooperator(cfqq);
3636
3637 cfq_put_queue(cfqq);
3638}
3639
3640static void cfq_init_icq(struct io_cq *icq)
3641{
3642 struct cfq_io_cq *cic = icq_to_cic(icq);
3643
3644 cic->ttime.last_end_request = ktime_get_ns();
3645}
3646
3647static void cfq_exit_icq(struct io_cq *icq)
3648{
3649 struct cfq_io_cq *cic = icq_to_cic(icq);
3650 struct cfq_data *cfqd = cic_to_cfqd(cic);
3651
3652 if (cic_to_cfqq(cic, false)) {
3653 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
3654 cic_set_cfqq(cic, NULL, false);
3655 }
3656
3657 if (cic_to_cfqq(cic, true)) {
3658 cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
3659 cic_set_cfqq(cic, NULL, true);
3660 }
3661}
3662
3663static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3664{
3665 struct task_struct *tsk = current;
3666 int ioprio_class;
3667
3668 if (!cfq_cfqq_prio_changed(cfqq))
3669 return;
3670
3671 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3672 switch (ioprio_class) {
3673 default:
3674 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3675 /* fall through */
3676 case IOPRIO_CLASS_NONE:
3677 /*
3678 * no prio set, inherit CPU scheduling settings
3679 */
3680 cfqq->ioprio = task_nice_ioprio(tsk);
3681 cfqq->ioprio_class = task_nice_ioclass(tsk);
3682 break;
3683 case IOPRIO_CLASS_RT:
3684 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3685 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3686 break;
3687 case IOPRIO_CLASS_BE:
3688 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3689 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3690 break;
3691 case IOPRIO_CLASS_IDLE:
3692 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
3693 cfqq->ioprio = 7;
3694 cfq_clear_cfqq_idle_window(cfqq);
3695 break;
3696 }
3697
3698 /*
3699 * keep track of original prio settings in case we have to temporarily
3700 * elevate the priority of this queue
3701 */
3702 cfqq->org_ioprio = cfqq->ioprio;
3703 cfqq->org_ioprio_class = cfqq->ioprio_class;
3704 cfq_clear_cfqq_prio_changed(cfqq);
3705}
3706
3707static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3708{
3709 int ioprio = cic->icq.ioc->ioprio;
3710 struct cfq_data *cfqd = cic_to_cfqd(cic);
3711 struct cfq_queue *cfqq;
3712
3713 /*
3714 * Check whether ioprio has changed. The condition may trigger
3715 * spuriously on a newly created cic but there's no harm.
3716 */
3717 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3718 return;
3719
3720 cfqq = cic_to_cfqq(cic, false);
3721 if (cfqq) {
3722 cfq_put_queue(cfqq);
3723 cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
3724 cic_set_cfqq(cic, cfqq, false);
3725 }
3726
3727 cfqq = cic_to_cfqq(cic, true);
3728 if (cfqq)
3729 cfq_mark_cfqq_prio_changed(cfqq);
3730
3731 cic->ioprio = ioprio;
3732}
3733
3734static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3735 pid_t pid, bool is_sync)
3736{
3737 RB_CLEAR_NODE(&cfqq->rb_node);
3738 RB_CLEAR_NODE(&cfqq->p_node);
3739 INIT_LIST_HEAD(&cfqq->fifo);
3740
3741 cfqq->ref = 0;
3742 cfqq->cfqd = cfqd;
3743
3744 cfq_mark_cfqq_prio_changed(cfqq);
3745
3746 if (is_sync) {
3747 if (!cfq_class_idle(cfqq))
3748 cfq_mark_cfqq_idle_window(cfqq);
3749 cfq_mark_cfqq_sync(cfqq);
3750 }
3751 cfqq->pid = pid;
3752}
3753
3754#ifdef CONFIG_CFQ_GROUP_IOSCHED
3755static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3756{
3757 struct cfq_data *cfqd = cic_to_cfqd(cic);
3758 struct cfq_queue *cfqq;
3759 uint64_t serial_nr;
3760
3761 rcu_read_lock();
3762 serial_nr = bio_blkcg(bio)->css.serial_nr;
3763 rcu_read_unlock();
3764
3765 /*
3766 * Check whether blkcg has changed. The condition may trigger
3767 * spuriously on a newly created cic but there's no harm.
3768 */
3769 if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
3770 return;
3771
3772 /*
3773 * Drop reference to queues. New queues will be assigned in new
3774 * group upon arrival of fresh requests.
3775 */
3776 cfqq = cic_to_cfqq(cic, false);
3777 if (cfqq) {
3778 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3779 cic_set_cfqq(cic, NULL, false);
3780 cfq_put_queue(cfqq);
3781 }
3782
3783 cfqq = cic_to_cfqq(cic, true);
3784 if (cfqq) {
3785 cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
3786 cic_set_cfqq(cic, NULL, true);
3787 cfq_put_queue(cfqq);
3788 }
3789
3790 cic->blkcg_serial_nr = serial_nr;
3791}
3792#else
3793static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3794{
3795}
3796#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3797
3798static struct cfq_queue **
3799cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
3800{
3801 switch (ioprio_class) {
3802 case IOPRIO_CLASS_RT:
3803 return &cfqg->async_cfqq[0][ioprio];
3804 case IOPRIO_CLASS_NONE:
3805 ioprio = IOPRIO_NORM;
3806 /* fall through */
3807 case IOPRIO_CLASS_BE:
3808 return &cfqg->async_cfqq[1][ioprio];
3809 case IOPRIO_CLASS_IDLE:
3810 return &cfqg->async_idle_cfqq;
3811 default:
3812 BUG();
3813 }
3814}
3815
3816static struct cfq_queue *
3817cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3818 struct bio *bio)
3819{
3820 int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
3821 int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
3822 struct cfq_queue **async_cfqq = NULL;
3823 struct cfq_queue *cfqq;
3824 struct cfq_group *cfqg;
3825
3826 rcu_read_lock();
3827 cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
3828 if (!cfqg) {
3829 cfqq = &cfqd->oom_cfqq;
3830 goto out;
3831 }
3832
3833 if (!is_sync) {
3834 if (!ioprio_valid(cic->ioprio)) {
3835 struct task_struct *tsk = current;
3836 ioprio = task_nice_ioprio(tsk);
3837 ioprio_class = task_nice_ioclass(tsk);
3838 }
3839 async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
3840 cfqq = *async_cfqq;
3841 if (cfqq)
3842 goto out;
3843 }
3844
3845 cfqq = kmem_cache_alloc_node(cfq_pool,
3846 GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
3847 cfqd->queue->node);
3848 if (!cfqq) {
3849 cfqq = &cfqd->oom_cfqq;
3850 goto out;
3851 }
3852
3853 /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
3854 cfqq->ioprio_class = IOPRIO_CLASS_NONE;
3855 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3856 cfq_init_prio_data(cfqq, cic);
3857 cfq_link_cfqq_cfqg(cfqq, cfqg);
3858 cfq_log_cfqq(cfqd, cfqq, "alloced");
3859
3860 if (async_cfqq) {
3861 /* a new async queue is created, pin and remember */
3862 cfqq->ref++;
3863 *async_cfqq = cfqq;
3864 }
3865out:
3866 cfqq->ref++;
3867 rcu_read_unlock();
3868 return cfqq;
3869}
3870
3871static void
3872__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
3873{
3874 u64 elapsed = ktime_get_ns() - ttime->last_end_request;
3875 elapsed = min(elapsed, 2UL * slice_idle);
3876
3877 ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
3878 ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
3879 ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
3880 ttime->ttime_samples);
3881}
3882
3883static void
3884cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3885 struct cfq_io_cq *cic)
3886{
3887 if (cfq_cfqq_sync(cfqq)) {
3888 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
3889 __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
3890 cfqd->cfq_slice_idle);
3891 }
3892#ifdef CONFIG_CFQ_GROUP_IOSCHED
3893 __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
3894#endif
3895}
3896
3897static void
3898cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3899 struct request *rq)
3900{
3901 sector_t sdist = 0;
3902 sector_t n_sec = blk_rq_sectors(rq);
3903 if (cfqq->last_request_pos) {
3904 if (cfqq->last_request_pos < blk_rq_pos(rq))
3905 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3906 else
3907 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3908 }
3909
3910 cfqq->seek_history <<= 1;
3911 if (blk_queue_nonrot(cfqd->queue))
3912 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3913 else
3914 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3915}
3916
3917static inline bool req_noidle(struct request *req)
3918{
3919 return req_op(req) == REQ_OP_WRITE &&
3920 (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
3921}
3922
3923/*
3924 * Disable idle window if the process thinks too long or seeks so much that
3925 * it doesn't matter
3926 */
3927static void
3928cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3929 struct cfq_io_cq *cic)
3930{
3931 int old_idle, enable_idle;
3932
3933 /*
3934 * Don't idle for async or idle io prio class
3935 */
3936 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3937 return;
3938
3939 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3940
3941 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3942 cfq_mark_cfqq_deep(cfqq);
3943
3944 if (cfqq->next_rq && req_noidle(cfqq->next_rq))
3945 enable_idle = 0;
3946 else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3947 !cfqd->cfq_slice_idle ||
3948 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3949 enable_idle = 0;
3950 else if (sample_valid(cic->ttime.ttime_samples)) {
3951 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
3952 enable_idle = 0;
3953 else
3954 enable_idle = 1;
3955 }
3956
3957 if (old_idle != enable_idle) {
3958 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3959 if (enable_idle)
3960 cfq_mark_cfqq_idle_window(cfqq);
3961 else
3962 cfq_clear_cfqq_idle_window(cfqq);
3963 }
3964}
3965
3966/*
3967 * Check if new_cfqq should preempt the currently active queue. Return false
3968 * for no (or if we aren't sure); returning true will cause a preempt.
3969 */
3970static bool
3971cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3972 struct request *rq)
3973{
3974 struct cfq_queue *cfqq;
3975
3976 cfqq = cfqd->active_queue;
3977 if (!cfqq)
3978 return false;
3979
3980 if (cfq_class_idle(new_cfqq))
3981 return false;
3982
3983 if (cfq_class_idle(cfqq))
3984 return true;
3985
3986 /*
3987 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3988 */
3989 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3990 return false;
3991
3992 /*
3993 * if the new request is sync, but the currently running queue is
3994 * not, let the sync request have priority.
3995 */
3996 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
3997 return true;
3998
3999 /*
4000 * Treat ancestors of current cgroup the same way as current cgroup.
4001 * For anybody else we disallow preemption to guarantee service
4002 * fairness among cgroups.
4003 */
4004 if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
4005 return false;
4006
4007 if (cfq_slice_used(cfqq))
4008 return true;
4009
4010 /*
4011	 * Allow an RT request to preempt an ongoing non-RT cfqq timeslice.
4012 */
4013 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
4014 return true;
4015
4016 WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
4017 /* Allow preemption only if we are idling on sync-noidle tree */
4018 if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
4019 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
4020 RB_EMPTY_ROOT(&cfqq->sort_list))
4021 return true;
4022
4023 /*
4024 * So both queues are sync. Let the new request get disk time if
4025 * it's a metadata request and the current queue is doing regular IO.
4026 */
4027 if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
4028 return true;
4029
4030 /* An idle queue should not be idle now for some reason */
4031 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
4032 return true;
4033
4034 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
4035 return false;
4036
4037 /*
4038 * if this request is as-good as one we would expect from the
4039 * current cfqq, let it preempt
4040 */
4041 if (cfq_rq_close(cfqd, cfqq, rq))
4042 return true;
4043
4044 return false;
4045}
4046
4047/*
4048 * cfqq preempts the active queue. If we allowed the preempt with no slice left,
4049 * let it have half of its nominal slice.
4050 */
4051static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4052{
4053 enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
4054
4055 cfq_log_cfqq(cfqd, cfqq, "preempt");
4056 cfq_slice_expired(cfqd, 1);
4057
4058 /*
4059	 * The workload type has changed; don't save the slice, otherwise the
4060	 * preempt doesn't happen.
4061 */
4062 if (old_type != cfqq_type(cfqq))
4063 cfqq->cfqg->saved_wl_slice = 0;
4064
4065 /*
4066	 * Put the new queue at the front of the current list,
4067 * so we know that it will be selected next.
4068 */
4069 BUG_ON(!cfq_cfqq_on_rr(cfqq));
4070
4071 cfq_service_tree_add(cfqd, cfqq, 1);
4072
4073 cfqq->slice_end = 0;
4074 cfq_mark_cfqq_slice_new(cfqq);
4075}
4076
4077/*
4078 * Called when a new fs request (rq) is added (to cfqq). Check if there's
4079 * something we should do about it
4080 */
4081static void
4082cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
4083 struct request *rq)
4084{
4085 struct cfq_io_cq *cic = RQ_CIC(rq);
4086
4087 cfqd->rq_queued++;
4088 if (rq->cmd_flags & REQ_PRIO)
4089 cfqq->prio_pending++;
4090
4091 cfq_update_io_thinktime(cfqd, cfqq, cic);
4092 cfq_update_io_seektime(cfqd, cfqq, rq);
4093 cfq_update_idle_window(cfqd, cfqq, cic);
4094
4095 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
4096
4097 if (cfqq == cfqd->active_queue) {
4098 /*
4099 * Remember that we saw a request from this process, but
4100 * don't start queuing just yet. Otherwise we risk seeing lots
4101 * of tiny requests, because we disrupt the normal plugging
4102 * and merging. If the request is already larger than a single
4103 * page, let it rip immediately. For that case we assume that
4104 * merging is already done. Ditto for a busy system that
4105 * has other work pending, don't risk delaying until the
4106 * idle timer unplug to continue working.
4107 */
4108 if (cfq_cfqq_wait_request(cfqq)) {
4109 if (blk_rq_bytes(rq) > PAGE_SIZE ||
4110 cfqd->busy_queues > 1) {
4111 cfq_del_timer(cfqd, cfqq);
4112 cfq_clear_cfqq_wait_request(cfqq);
4113 __blk_run_queue(cfqd->queue);
4114 } else {
4115 cfqg_stats_update_idle_time(cfqq->cfqg);
4116 cfq_mark_cfqq_must_dispatch(cfqq);
4117 }
4118 }
4119 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
4120 /*
4121		 * Not the active queue - expire the current slice if it is
4122		 * idle and has expired its mean thinktime, or if this new queue
4123		 * has some old slice time left and is of higher priority, or
4124		 * this new queue is RT and the current one is BE.
4125 */
4126 cfq_preempt_queue(cfqd, cfqq);
4127 __blk_run_queue(cfqd->queue);
4128 }
4129}
4130
4131static void cfq_insert_request(struct request_queue *q, struct request *rq)
4132{
4133 struct cfq_data *cfqd = q->elevator->elevator_data;
4134 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4135
4136 cfq_log_cfqq(cfqd, cfqq, "insert_request");
4137 cfq_init_prio_data(cfqq, RQ_CIC(rq));
4138
4139 rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
4140 list_add_tail(&rq->queuelist, &cfqq->fifo);
4141 cfq_add_rq_rb(rq);
4142 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
4143 rq->cmd_flags);
4144 cfq_rq_enqueued(cfqd, cfqq, rq);
4145}
4146
4147/*
4148 * Update hw_tag based on peak queue depth over 50 samples under
4149 * sufficient load.
4150 */
4151static void cfq_update_hw_tag(struct cfq_data *cfqd)
4152{
4153 struct cfq_queue *cfqq = cfqd->active_queue;
4154
4155 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
4156 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
4157
4158 if (cfqd->hw_tag == 1)
4159 return;
4160
4161 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
4162 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
4163 return;
4164
4165 /*
4166	 * If the active queue doesn't have enough requests and can idle, cfq might
4167	 * not dispatch sufficient requests to the hardware. Don't zero hw_tag in
4168	 * this case.
4169 */
4170 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
4171 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
4172 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
4173 return;
4174
4175 if (cfqd->hw_tag_samples++ < 50)
4176 return;
4177
4178 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
4179 cfqd->hw_tag = 1;
4180 else
4181 cfqd->hw_tag = 0;
4182}
4183
4184static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
4185{
4186 struct cfq_io_cq *cic = cfqd->active_cic;
4187 u64 now = ktime_get_ns();
4188
4189 /* If the queue already has requests, don't wait */
4190 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4191 return false;
4192
4193 /* If there are other queues in the group, don't wait */
4194 if (cfqq->cfqg->nr_cfqq > 1)
4195 return false;
4196
4197 /* the only queue in the group, but think time is big */
4198 if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
4199 return false;
4200
4201 if (cfq_slice_used(cfqq))
4202 return true;
4203
4204 /* if slice left is less than think time, wait busy */
4205 if (cic && sample_valid(cic->ttime.ttime_samples)
4206 && (cfqq->slice_end - now < cic->ttime.ttime_mean))
4207 return true;
4208
4209 /*
4210	 * If the think time is less than a jiffy then ttime_mean=0 and the above
4211	 * will not be true. It might happen that the slice has not expired yet
4212	 * but will expire soon (4-5 ns) during select_queue(). To cover the
4213 * case where think time is less than a jiffy, mark the queue wait
4214 * busy if only 1 jiffy is left in the slice.
4215 */
4216 if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
4217 return true;
4218
4219 return false;
4220}
4221
4222static void cfq_completed_request(struct request_queue *q, struct request *rq)
4223{
4224 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4225 struct cfq_data *cfqd = cfqq->cfqd;
4226 const int sync = rq_is_sync(rq);
4227 u64 now = ktime_get_ns();
4228
4229 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
4230
4231 cfq_update_hw_tag(cfqd);
4232
4233 WARN_ON(!cfqd->rq_in_driver);
4234 WARN_ON(!cfqq->dispatched);
4235 cfqd->rq_in_driver--;
4236 cfqq->dispatched--;
4237 (RQ_CFQG(rq))->dispatched--;
4238 cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
4239 rq->io_start_time_ns, rq->cmd_flags);
4240
4241 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
4242
4243 if (sync) {
4244 struct cfq_rb_root *st;
4245
4246 RQ_CIC(rq)->ttime.last_end_request = now;
4247
4248 if (cfq_cfqq_on_rr(cfqq))
4249 st = cfqq->service_tree;
4250 else
4251 st = st_for(cfqq->cfqg, cfqq_class(cfqq),
4252 cfqq_type(cfqq));
4253
4254 st->ttime.last_end_request = now;
4255 if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
4256 cfqd->last_delayed_sync = now;
4257 }
4258
4259#ifdef CONFIG_CFQ_GROUP_IOSCHED
4260 cfqq->cfqg->ttime.last_end_request = now;
4261#endif
4262
4263 /*
4264 * If this is the active queue, check if it needs to be expired,
4265 * or if we want to idle in case it has no pending requests.
4266 */
4267 if (cfqd->active_queue == cfqq) {
4268 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
4269
4270 if (cfq_cfqq_slice_new(cfqq)) {
4271 cfq_set_prio_slice(cfqd, cfqq);
4272 cfq_clear_cfqq_slice_new(cfqq);
4273 }
4274
4275 /*
4276		 * Should we wait for the next request to come in before we expire
4277		 * the queue?
4278 */
4279 if (cfq_should_wait_busy(cfqd, cfqq)) {
4280 u64 extend_sl = cfqd->cfq_slice_idle;
4281 if (!cfqd->cfq_slice_idle)
4282 extend_sl = cfqd->cfq_group_idle;
4283 cfqq->slice_end = now + extend_sl;
4284 cfq_mark_cfqq_wait_busy(cfqq);
4285 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
4286 }
4287
4288 /*
4289 * Idling is not enabled on:
4290 * - expired queues
4291 * - idle-priority queues
4292 * - async queues
4293 * - queues with still some requests queued
4294 * - when there is a close cooperator
4295 */
4296 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
4297 cfq_slice_expired(cfqd, 1);
4298 else if (sync && cfqq_empty &&
4299 !cfq_close_cooperator(cfqd, cfqq)) {
4300 cfq_arm_slice_timer(cfqd);
4301 }
4302 }
4303
4304 if (!cfqd->rq_in_driver)
4305 cfq_schedule_dispatch(cfqd);
4306}
4307
4308static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
4309{
4310 /*
4311 * If REQ_PRIO is set, boost class and prio level, if it's below
4312 * BE/NORM. If prio is not set, restore the potentially boosted
4313 * class/prio level.
4314 */
4315 if (!(op & REQ_PRIO)) {
4316 cfqq->ioprio_class = cfqq->org_ioprio_class;
4317 cfqq->ioprio = cfqq->org_ioprio;
4318 } else {
4319 if (cfq_class_idle(cfqq))
4320 cfqq->ioprio_class = IOPRIO_CLASS_BE;
4321 if (cfqq->ioprio > IOPRIO_NORM)
4322 cfqq->ioprio = IOPRIO_NORM;
4323 }
4324}
4325
4326static inline int __cfq_may_queue(struct cfq_queue *cfqq)
4327{
4328 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
4329 cfq_mark_cfqq_must_alloc_slice(cfqq);
4330 return ELV_MQUEUE_MUST;
4331 }
4332
4333 return ELV_MQUEUE_MAY;
4334}
4335
4336static int cfq_may_queue(struct request_queue *q, unsigned int op)
4337{
4338 struct cfq_data *cfqd = q->elevator->elevator_data;
4339 struct task_struct *tsk = current;
4340 struct cfq_io_cq *cic;
4341 struct cfq_queue *cfqq;
4342
4343 /*
4344	 * Don't force setup of a queue from here, as a call to may_queue
4345	 * does not necessarily imply that a request actually will be queued.
4346	 * So just look up a possibly existing queue, or return 'may queue'
4347	 * if that fails.
4348 */
4349 cic = cfq_cic_lookup(cfqd, tsk->io_context);
4350 if (!cic)
4351 return ELV_MQUEUE_MAY;
4352
4353 cfqq = cic_to_cfqq(cic, op_is_sync(op));
4354 if (cfqq) {
4355 cfq_init_prio_data(cfqq, cic);
4356 cfqq_boost_on_prio(cfqq, op);
4357
4358 return __cfq_may_queue(cfqq);
4359 }
4360
4361 return ELV_MQUEUE_MAY;
4362}
4363
4364/*
4365 * queue lock held here
4366 */
4367static void cfq_put_request(struct request *rq)
4368{
4369 struct cfq_queue *cfqq = RQ_CFQQ(rq);
4370
4371 if (cfqq) {
4372 const int rw = rq_data_dir(rq);
4373
4374 BUG_ON(!cfqq->allocated[rw]);
4375 cfqq->allocated[rw]--;
4376
4377 /* Put down rq reference on cfqg */
4378 cfqg_put(RQ_CFQG(rq));
4379 rq->elv.priv[0] = NULL;
4380 rq->elv.priv[1] = NULL;
4381
4382 cfq_put_queue(cfqq);
4383 }
4384}
4385
4386static struct cfq_queue *
4387cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
4388 struct cfq_queue *cfqq)
4389{
4390 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
4391 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
4392 cfq_mark_cfqq_coop(cfqq->new_cfqq);
4393 cfq_put_queue(cfqq);
4394 return cic_to_cfqq(cic, 1);
4395}
4396
4397/*
4398 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
4399 * was the last process referring to said cfqq.
4400 */
4401static struct cfq_queue *
4402split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
4403{
4404 if (cfqq_process_refs(cfqq) == 1) {
4405 cfqq->pid = current->pid;
4406 cfq_clear_cfqq_coop(cfqq);
4407 cfq_clear_cfqq_split_coop(cfqq);
4408 return cfqq;
4409 }
4410
4411 cic_set_cfqq(cic, NULL, 1);
4412
4413 cfq_put_cooperator(cfqq);
4414
4415 cfq_put_queue(cfqq);
4416 return NULL;
4417}
4418/*
4419 * Allocate cfq data structures associated with this request.
4420 */
4421static int
4422cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
4423 gfp_t gfp_mask)
4424{
4425 struct cfq_data *cfqd = q->elevator->elevator_data;
4426 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
4427 const int rw = rq_data_dir(rq);
4428 const bool is_sync = rq_is_sync(rq);
4429 struct cfq_queue *cfqq;
4430
4431 spin_lock_irq(q->queue_lock);
4432
4433 check_ioprio_changed(cic, bio);
4434 check_blkcg_changed(cic, bio);
4435new_queue:
4436 cfqq = cic_to_cfqq(cic, is_sync);
4437 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
4438 if (cfqq)
4439 cfq_put_queue(cfqq);
4440 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
4441 cic_set_cfqq(cic, cfqq, is_sync);
4442 } else {
4443 /*
4444 * If the queue was seeky for too long, break it apart.
4445 */
4446 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
4447 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
4448 cfqq = split_cfqq(cic, cfqq);
4449 if (!cfqq)
4450 goto new_queue;
4451 }
4452
4453 /*
4454 * Check to see if this queue is scheduled to merge with
4455 * another, closely cooperating queue. The merging of
4456 * queues happens here as it must be done in process context.
4457 * The reference on new_cfqq was taken in merge_cfqqs.
4458 */
4459 if (cfqq->new_cfqq)
4460 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
4461 }
4462
4463 cfqq->allocated[rw]++;
4464
4465 cfqq->ref++;
4466 cfqg_get(cfqq->cfqg);
4467 rq->elv.priv[0] = cfqq;
4468 rq->elv.priv[1] = cfqq->cfqg;
4469 spin_unlock_irq(q->queue_lock);
4470
4471 return 0;
4472}
4473
4474static void cfq_kick_queue(struct work_struct *work)
4475{
4476 struct cfq_data *cfqd =
4477 container_of(work, struct cfq_data, unplug_work);
4478 struct request_queue *q = cfqd->queue;
4479
4480 spin_lock_irq(q->queue_lock);
4481 __blk_run_queue(cfqd->queue);
4482 spin_unlock_irq(q->queue_lock);
4483}
4484
4485/*
4486 * Timer running if the active_queue is currently idling inside its time slice
4487 */
4488static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
4489{
4490 struct cfq_data *cfqd = container_of(timer, struct cfq_data,
4491 idle_slice_timer);
4492 struct cfq_queue *cfqq;
4493 unsigned long flags;
4494 int timed_out = 1;
4495
4496 cfq_log(cfqd, "idle timer fired");
4497
4498 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
4499
4500 cfqq = cfqd->active_queue;
4501 if (cfqq) {
4502 timed_out = 0;
4503
4504 /*
4505 * We saw a request before the queue expired, let it through
4506 */
4507 if (cfq_cfqq_must_dispatch(cfqq))
4508 goto out_kick;
4509
4510 /*
4511 * expired
4512 */
4513 if (cfq_slice_used(cfqq))
4514 goto expire;
4515
4516 /*
4517 * only expire and reinvoke request handler, if there are
4518 * other queues with pending requests
4519 */
4520 if (!cfqd->busy_queues)
4521 goto out_cont;
4522
4523 /*
4524 * not expired and it has a request pending, let it dispatch
4525 */
4526 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
4527 goto out_kick;
4528
4529 /*
4530 * Queue depth flag is reset only when the idle didn't succeed
4531 */
4532 cfq_clear_cfqq_deep(cfqq);
4533 }
4534expire:
4535 cfq_slice_expired(cfqd, timed_out);
4536out_kick:
4537 cfq_schedule_dispatch(cfqd);
4538out_cont:
4539 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
4540 return HRTIMER_NORESTART;
4541}
4542
4543static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
4544{
4545 hrtimer_cancel(&cfqd->idle_slice_timer);
4546 cancel_work_sync(&cfqd->unplug_work);
4547}
4548
4549static void cfq_exit_queue(struct elevator_queue *e)
4550{
4551 struct cfq_data *cfqd = e->elevator_data;
4552 struct request_queue *q = cfqd->queue;
4553
4554 cfq_shutdown_timer_wq(cfqd);
4555
4556 spin_lock_irq(q->queue_lock);
4557
4558 if (cfqd->active_queue)
4559 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
4560
4561 spin_unlock_irq(q->queue_lock);
4562
4563 cfq_shutdown_timer_wq(cfqd);
4564
4565#ifdef CONFIG_CFQ_GROUP_IOSCHED
4566 blkcg_deactivate_policy(q, &blkcg_policy_cfq);
4567#else
4568 kfree(cfqd->root_group);
4569#endif
4570 kfree(cfqd);
4571}
4572
4573static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
4574{
4575 struct cfq_data *cfqd;
4576 struct blkcg_gq *blkg __maybe_unused;
4577 int i, ret;
4578 struct elevator_queue *eq;
4579
4580 eq = elevator_alloc(q, e);
4581 if (!eq)
4582 return -ENOMEM;
4583
4584 cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
4585 if (!cfqd) {
4586 kobject_put(&eq->kobj);
4587 return -ENOMEM;
4588 }
4589 eq->elevator_data = cfqd;
4590
4591 cfqd->queue = q;
4592 spin_lock_irq(q->queue_lock);
4593 q->elevator = eq;
4594 spin_unlock_irq(q->queue_lock);
4595
4596 /* Init root service tree */
4597 cfqd->grp_service_tree = CFQ_RB_ROOT;
4598
4599 /* Init root group and prefer root group over other groups by default */
4600#ifdef CONFIG_CFQ_GROUP_IOSCHED
4601 ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
4602 if (ret)
4603 goto out_free;
4604
4605 cfqd->root_group = blkg_to_cfqg(q->root_blkg);
4606#else
4607 ret = -ENOMEM;
4608 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
4609 GFP_KERNEL, cfqd->queue->node);
4610 if (!cfqd->root_group)
4611 goto out_free;
4612
4613 cfq_init_cfqg_base(cfqd->root_group);
4614 cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4615 cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
4616#endif
4617
4618 /*
4619 * Not strictly needed (since RB_ROOT just clears the node and we
4620 * zeroed cfqd on alloc), but better be safe in case someone decides
4621 * to add magic to the rb code
4622 */
4623 for (i = 0; i < CFQ_PRIO_LISTS; i++)
4624 cfqd->prio_trees[i] = RB_ROOT;
4625
4626 /*
4627 * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
4628 * Grab a permanent reference to it, so that the normal code flow
4629 * will not attempt to free it. oom_cfqq is linked to root_group
4630 * but shouldn't hold a reference as it'll never be unlinked. Lose
4631 * the reference from linking right away.
4632 */
4633 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4634 cfqd->oom_cfqq.ref++;
4635
4636 spin_lock_irq(q->queue_lock);
4637 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4638 cfqg_put(cfqd->root_group);
4639 spin_unlock_irq(q->queue_lock);
4640
4641 hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
4642 HRTIMER_MODE_REL);
4643 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
4644
4645 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
4646
4647 cfqd->cfq_quantum = cfq_quantum;
4648 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
4649 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
4650 cfqd->cfq_back_max = cfq_back_max;
4651 cfqd->cfq_back_penalty = cfq_back_penalty;
4652 cfqd->cfq_slice[0] = cfq_slice_async;
4653 cfqd->cfq_slice[1] = cfq_slice_sync;
4654 cfqd->cfq_target_latency = cfq_target_latency;
4655 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4656 cfqd->cfq_slice_idle = cfq_slice_idle;
4657 cfqd->cfq_group_idle = cfq_group_idle;
4658 cfqd->cfq_latency = 1;
4659 cfqd->hw_tag = -1;
4660 /*
4661	 * We optimistically start out assuming sync ops weren't delayed in the
4662	 * last second, in order to have a larger depth for async operations.
4663 */
4664 cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
4665 return 0;
4666
4667out_free:
4668 kfree(cfqd);
4669 kobject_put(&eq->kobj);
4670 return ret;
4671}
4672
4673static void cfq_registered_queue(struct request_queue *q)
4674{
4675 struct elevator_queue *e = q->elevator;
4676 struct cfq_data *cfqd = e->elevator_data;
4677
4678 /*
4679 * Default to IOPS mode with no idling for SSDs
4680 */
4681 if (blk_queue_nonrot(q))
4682 cfqd->cfq_slice_idle = 0;
4683 wbt_disable_default(q);
4684}
4685
4686/*
4687 * sysfs parts below -->
4688 */
4689static ssize_t
4690cfq_var_show(unsigned int var, char *page)
4691{
4692 return sprintf(page, "%u\n", var);
4693}
4694
4695static void
4696cfq_var_store(unsigned int *var, const char *page)
4697{
4698 char *p = (char *) page;
4699
4700 *var = simple_strtoul(p, &p, 10);
4701}
4702
4703#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4704static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4705{ \
4706 struct cfq_data *cfqd = e->elevator_data; \
4707 u64 __data = __VAR; \
4708 if (__CONV) \
4709 __data = div_u64(__data, NSEC_PER_MSEC); \
4710 return cfq_var_show(__data, (page)); \
4711}
4712SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
4713SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
4714SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
4715SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
4716SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
4717SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
4718SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
4719SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4720SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4721SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4722SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4723SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4724#undef SHOW_FUNCTION
4725
4726#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
4727static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4728{ \
4729 struct cfq_data *cfqd = e->elevator_data; \
4730 u64 __data = __VAR; \
4731 __data = div_u64(__data, NSEC_PER_USEC); \
4732 return cfq_var_show(__data, (page)); \
4733}
4734USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
4735USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
4736USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
4737USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
4738USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4739#undef USEC_SHOW_FUNCTION
4740
4741#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4742static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4743{ \
4744 struct cfq_data *cfqd = e->elevator_data; \
4745 unsigned int __data, __min = (MIN), __max = (MAX); \
4746 \
4747 cfq_var_store(&__data, (page)); \
4748 if (__data < __min) \
4749 __data = __min; \
4750 else if (__data > __max) \
4751 __data = __max; \
4752 if (__CONV) \
4753 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
4754 else \
4755 *(__PTR) = __data; \
4756 return count; \
4757}
4758STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
4759STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
4760 UINT_MAX, 1);
4761STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
4762 UINT_MAX, 1);
4763STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
4764STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
4765 UINT_MAX, 0);
4766STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4767STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
4768STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
4769STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4770STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4771 UINT_MAX, 0);
4772STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4773STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4774#undef STORE_FUNCTION
4775
4776#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
4777static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4778{ \
4779 struct cfq_data *cfqd = e->elevator_data; \
4780 unsigned int __data, __min = (MIN), __max = (MAX); \
4781 \
4782 cfq_var_store(&__data, (page)); \
4783 if (__data < __min) \
4784 __data = __min; \
4785 else if (__data > __max) \
4786 __data = __max; \
4787 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
4788 return count; \
4789}
4790USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
4791USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
4792USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
4793USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
4794USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
4795#undef USEC_STORE_FUNCTION
4796
4797#define CFQ_ATTR(name) \
4798 __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
4799
4800static struct elv_fs_entry cfq_attrs[] = {
4801 CFQ_ATTR(quantum),
4802 CFQ_ATTR(fifo_expire_sync),
4803 CFQ_ATTR(fifo_expire_async),
4804 CFQ_ATTR(back_seek_max),
4805 CFQ_ATTR(back_seek_penalty),
4806 CFQ_ATTR(slice_sync),
4807 CFQ_ATTR(slice_sync_us),
4808 CFQ_ATTR(slice_async),
4809 CFQ_ATTR(slice_async_us),
4810 CFQ_ATTR(slice_async_rq),
4811 CFQ_ATTR(slice_idle),
4812 CFQ_ATTR(slice_idle_us),
4813 CFQ_ATTR(group_idle),
4814 CFQ_ATTR(group_idle_us),
4815 CFQ_ATTR(low_latency),
4816 CFQ_ATTR(target_latency),
4817 CFQ_ATTR(target_latency_us),
4818 __ATTR_NULL
4819};
4820
4821static struct elevator_type iosched_cfq = {
4822 .ops.sq = {
4823 .elevator_merge_fn = cfq_merge,
4824 .elevator_merged_fn = cfq_merged_request,
4825 .elevator_merge_req_fn = cfq_merged_requests,
4826 .elevator_allow_bio_merge_fn = cfq_allow_bio_merge,
4827 .elevator_allow_rq_merge_fn = cfq_allow_rq_merge,
4828 .elevator_bio_merged_fn = cfq_bio_merged,
4829 .elevator_dispatch_fn = cfq_dispatch_requests,
4830 .elevator_add_req_fn = cfq_insert_request,
4831 .elevator_activate_req_fn = cfq_activate_request,
4832 .elevator_deactivate_req_fn = cfq_deactivate_request,
4833 .elevator_completed_req_fn = cfq_completed_request,
4834 .elevator_former_req_fn = elv_rb_former_request,
4835 .elevator_latter_req_fn = elv_rb_latter_request,
4836 .elevator_init_icq_fn = cfq_init_icq,
4837 .elevator_exit_icq_fn = cfq_exit_icq,
4838 .elevator_set_req_fn = cfq_set_request,
4839 .elevator_put_req_fn = cfq_put_request,
4840 .elevator_may_queue_fn = cfq_may_queue,
4841 .elevator_init_fn = cfq_init_queue,
4842 .elevator_exit_fn = cfq_exit_queue,
4843 .elevator_registered_fn = cfq_registered_queue,
4844 },
4845 .icq_size = sizeof(struct cfq_io_cq),
4846 .icq_align = __alignof__(struct cfq_io_cq),
4847 .elevator_attrs = cfq_attrs,
4848 .elevator_name = "cfq",
4849 .elevator_owner = THIS_MODULE,
4850};
4851
4852#ifdef CONFIG_CFQ_GROUP_IOSCHED
4853static struct blkcg_policy blkcg_policy_cfq = {
4854 .dfl_cftypes = cfq_blkcg_files,
4855 .legacy_cftypes = cfq_blkcg_legacy_files,
4856
4857 .cpd_alloc_fn = cfq_cpd_alloc,
4858 .cpd_init_fn = cfq_cpd_init,
4859 .cpd_free_fn = cfq_cpd_free,
4860 .cpd_bind_fn = cfq_cpd_bind,
4861
4862 .pd_alloc_fn = cfq_pd_alloc,
4863 .pd_init_fn = cfq_pd_init,
4864 .pd_offline_fn = cfq_pd_offline,
4865 .pd_free_fn = cfq_pd_free,
4866 .pd_reset_stats_fn = cfq_pd_reset_stats,
4867};
4868#endif
4869
4870static int __init cfq_init(void)
4871{
4872 int ret;
4873
4874#ifdef CONFIG_CFQ_GROUP_IOSCHED
4875 ret = blkcg_policy_register(&blkcg_policy_cfq);
4876 if (ret)
4877 return ret;
4878#else
4879 cfq_group_idle = 0;
4880#endif
4881
4882 ret = -ENOMEM;
4883 cfq_pool = KMEM_CACHE(cfq_queue, 0);
4884 if (!cfq_pool)
4885 goto err_pol_unreg;
4886
4887 ret = elv_register(&iosched_cfq);
4888 if (ret)
4889 goto err_free_pool;
4890
4891 return 0;
4892
4893err_free_pool:
4894 kmem_cache_destroy(cfq_pool);
4895err_pol_unreg:
4896#ifdef CONFIG_CFQ_GROUP_IOSCHED
4897 blkcg_policy_unregister(&blkcg_policy_cfq);
4898#endif
4899 return ret;
4900}
4901
4902static void __exit cfq_exit(void)
4903{
4904#ifdef CONFIG_CFQ_GROUP_IOSCHED
4905 blkcg_policy_unregister(&blkcg_policy_cfq);
4906#endif
4907 elv_unregister(&iosched_cfq);
4908 kmem_cache_destroy(cfq_pool);
4909}
4910
4911module_init(cfq_init);
4912module_exit(cfq_exit);
4913
4914MODULE_AUTHOR("Jens Axboe");
4915MODULE_LICENSE("GPL");
4916MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
deleted file mode 100644
index ef2f1f09e9b3..000000000000
--- a/block/deadline-iosched.c
+++ /dev/null
@@ -1,560 +0,0 @@
1/*
2 * Deadline i/o scheduler.
3 *
4 * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
5 */
6#include <linux/kernel.h>
7#include <linux/fs.h>
8#include <linux/blkdev.h>
9#include <linux/elevator.h>
10#include <linux/bio.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/init.h>
14#include <linux/compiler.h>
15#include <linux/rbtree.h>
16
17/*
18 * See Documentation/block/deadline-iosched.txt
19 */
20static const int read_expire = HZ / 2; /* max time before a read is submitted. */
21static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
22static const int writes_starved = 2; /* max times reads can starve a write */
23static const int fifo_batch = 16; /* # of sequential requests treated as one
24 by the above parameters. For throughput. */
25
26struct deadline_data {
27 /*
28 * run time data
29 */
30
31 /*
32 * requests (deadline_rq s) are present on both sort_list and fifo_list
33 */
34 struct rb_root sort_list[2];
35 struct list_head fifo_list[2];
36
37 /*
38	 * next in sort order; read, write, or both may be NULL
39 */
40 struct request *next_rq[2];
41 unsigned int batching; /* number of sequential requests made */
42 unsigned int starved; /* times reads have starved writes */
43
44 /*
45 * settings that change how the i/o scheduler behaves
46 */
47 int fifo_expire[2];
48 int fifo_batch;
49 int writes_starved;
50 int front_merges;
51};
52
53static inline struct rb_root *
54deadline_rb_root(struct deadline_data *dd, struct request *rq)
55{
56 return &dd->sort_list[rq_data_dir(rq)];
57}
58
59/*
60 * get the request after `rq' in sector-sorted order
61 */
62static inline struct request *
63deadline_latter_request(struct request *rq)
64{
65 struct rb_node *node = rb_next(&rq->rb_node);
66
67 if (node)
68 return rb_entry_rq(node);
69
70 return NULL;
71}
72
73static void
74deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
75{
76 struct rb_root *root = deadline_rb_root(dd, rq);
77
78 elv_rb_add(root, rq);
79}
80
81static inline void
82deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
83{
84 const int data_dir = rq_data_dir(rq);
85
86 if (dd->next_rq[data_dir] == rq)
87 dd->next_rq[data_dir] = deadline_latter_request(rq);
88
89 elv_rb_del(deadline_rb_root(dd, rq), rq);
90}
91
92/*
93 * add rq to rbtree and fifo
94 */
95static void
96deadline_add_request(struct request_queue *q, struct request *rq)
97{
98 struct deadline_data *dd = q->elevator->elevator_data;
99 const int data_dir = rq_data_dir(rq);
100
101 /*
102 * This may be a requeue of a write request that has locked its
103	 * target zone. If that is the case, this releases the zone lock.
104 */
105 blk_req_zone_write_unlock(rq);
106
107 deadline_add_rq_rb(dd, rq);
108
109 /*
110 * set expire time and add to fifo list
111 */
112 rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
113 list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
114}
115
116/*
117 * remove rq from rbtree and fifo.
118 */
119static void deadline_remove_request(struct request_queue *q, struct request *rq)
120{
121 struct deadline_data *dd = q->elevator->elevator_data;
122
123 rq_fifo_clear(rq);
124 deadline_del_rq_rb(dd, rq);
125}
126
127static enum elv_merge
128deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
129{
130 struct deadline_data *dd = q->elevator->elevator_data;
131 struct request *__rq;
132
133 /*
134 * check for front merge
135 */
136 if (dd->front_merges) {
137 sector_t sector = bio_end_sector(bio);
138
139 __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
140 if (__rq) {
141 BUG_ON(sector != blk_rq_pos(__rq));
142
143 if (elv_bio_merge_ok(__rq, bio)) {
144 *req = __rq;
145 return ELEVATOR_FRONT_MERGE;
146 }
147 }
148 }
149
150 return ELEVATOR_NO_MERGE;
151}
152
153static void deadline_merged_request(struct request_queue *q,
154 struct request *req, enum elv_merge type)
155{
156 struct deadline_data *dd = q->elevator->elevator_data;
157
158 /*
159 * if the merge was a front merge, we need to reposition request
160 */
161 if (type == ELEVATOR_FRONT_MERGE) {
162 elv_rb_del(deadline_rb_root(dd, req), req);
163 deadline_add_rq_rb(dd, req);
164 }
165}
166
167static void
168deadline_merged_requests(struct request_queue *q, struct request *req,
169 struct request *next)
170{
171 /*
172	 * if next expires before rq, assign its expire time to rq and move
173	 * rq into next's position in the fifo (next will be deleted)
174 */
175 if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
176 if (time_before((unsigned long)next->fifo_time,
177 (unsigned long)req->fifo_time)) {
178 list_move(&req->queuelist, &next->queuelist);
179 req->fifo_time = next->fifo_time;
180 }
181 }
182
183 /*
184 * kill knowledge of next, this one is a goner
185 */
186 deadline_remove_request(q, next);
187}
188
189/*
190 * move request from sort list to dispatch queue.
191 */
192static inline void
193deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
194{
195 struct request_queue *q = rq->q;
196
197 /*
198 * For a zoned block device, write requests must write lock their
199 * target zone.
200 */
201 blk_req_zone_write_lock(rq);
202
203 deadline_remove_request(q, rq);
204 elv_dispatch_add_tail(q, rq);
205}
206
207/*
208 * move an entry to dispatch queue
209 */
210static void
211deadline_move_request(struct deadline_data *dd, struct request *rq)
212{
213 const int data_dir = rq_data_dir(rq);
214
215 dd->next_rq[READ] = NULL;
216 dd->next_rq[WRITE] = NULL;
217 dd->next_rq[data_dir] = deadline_latter_request(rq);
218
219 /*
220 * take it off the sort and fifo list, move
221 * to dispatch queue
222 */
223 deadline_move_to_dispatch(dd, rq);
224}
225
226/*
227 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
228 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
229 */
230static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
231{
232 struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
233
234 /*
235 * rq is expired!
236 */
237 if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
238 return 1;
239
240 return 0;
241}
242
243/*
244 * For the specified data direction, return the next request to dispatch using
245 * arrival ordered lists.
246 */
247static struct request *
248deadline_fifo_request(struct deadline_data *dd, int data_dir)
249{
250 struct request *rq;
251
252 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
253 return NULL;
254
255 if (list_empty(&dd->fifo_list[data_dir]))
256 return NULL;
257
258 rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
259 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
260 return rq;
261
262 /*
263 * Look for a write request that can be dispatched, that is one with
264 * an unlocked target zone.
265 */
266 list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
267 if (blk_req_can_dispatch_to_zone(rq))
268 return rq;
269 }
270
271 return NULL;
272}
273
274/*
275 * For the specified data direction, return the next request to dispatch using
276 * sector position sorted lists.
277 */
278static struct request *
279deadline_next_request(struct deadline_data *dd, int data_dir)
280{
281 struct request *rq;
282
283 if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
284 return NULL;
285
286 rq = dd->next_rq[data_dir];
287 if (!rq)
288 return NULL;
289
290 if (data_dir == READ || !blk_queue_is_zoned(rq->q))
291 return rq;
292
293 /*
294 * Look for a write request that can be dispatched, that is one with
295 * an unlocked target zone.
296 */
297 while (rq) {
298 if (blk_req_can_dispatch_to_zone(rq))
299 return rq;
300 rq = deadline_latter_request(rq);
301 }
302
303 return NULL;
304}
305
306/*
307 * deadline_dispatch_requests selects the best request according to
308 * read/write expire, fifo_batch, etc
309 */
310static int deadline_dispatch_requests(struct request_queue *q, int force)
311{
312 struct deadline_data *dd = q->elevator->elevator_data;
313 const int reads = !list_empty(&dd->fifo_list[READ]);
314 const int writes = !list_empty(&dd->fifo_list[WRITE]);
315 struct request *rq, *next_rq;
316 int data_dir;
317
318 /*
319 * batches are currently reads XOR writes
320 */
321 rq = deadline_next_request(dd, WRITE);
322 if (!rq)
323 rq = deadline_next_request(dd, READ);
324
325 if (rq && dd->batching < dd->fifo_batch)
326 /* we have a next request and are still entitled to batch */
327 goto dispatch_request;
328
329 /*
330 * at this point we are not running a batch. select the appropriate
331 * data direction (read / write)
332 */
333
334 if (reads) {
335 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
336
337 if (deadline_fifo_request(dd, WRITE) &&
338 (dd->starved++ >= dd->writes_starved))
339 goto dispatch_writes;
340
341 data_dir = READ;
342
343 goto dispatch_find_request;
344 }
345
346 /*
347 * either there are no reads, or writes have been starved
348 */
349
350 if (writes) {
351dispatch_writes:
352 BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
353
354 dd->starved = 0;
355
356 data_dir = WRITE;
357
358 goto dispatch_find_request;
359 }
360
361 return 0;
362
363dispatch_find_request:
364 /*
365 * we are not running a batch, find best request for selected data_dir
366 */
367 next_rq = deadline_next_request(dd, data_dir);
368 if (deadline_check_fifo(dd, data_dir) || !next_rq) {
369 /*
370 * A deadline has expired, the last request was in the other
371 * direction, or we have run out of higher-sectored requests.
372 * Start again from the request with the earliest expiry time.
373 */
374 rq = deadline_fifo_request(dd, data_dir);
375 } else {
376 /*
377 * The last req was the same dir and we have a next request in
378 * sort order. No expired requests so continue on from here.
379 */
380 rq = next_rq;
381 }
382
383 /*
384 * For a zoned block device, if we only have writes queued and none of
385 * them can be dispatched, rq will be NULL.
386 */
387 if (!rq)
388 return 0;
389
390 dd->batching = 0;
391
392dispatch_request:
393 /*
394 * rq is the selected appropriate request.
395 */
396 dd->batching++;
397 deadline_move_request(dd, rq);
398
399 return 1;
400}
401
402/*
403 * For zoned block devices, write unlock the target zone of completed
404 * write requests.
405 */
406static void
407deadline_completed_request(struct request_queue *q, struct request *rq)
408{
409 blk_req_zone_write_unlock(rq);
410}
411
412static void deadline_exit_queue(struct elevator_queue *e)
413{
414 struct deadline_data *dd = e->elevator_data;
415
416 BUG_ON(!list_empty(&dd->fifo_list[READ]));
417 BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
418
419 kfree(dd);
420}
421
422/*
423 * initialize elevator private data (deadline_data).
424 */
425static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
426{
427 struct deadline_data *dd;
428 struct elevator_queue *eq;
429
430 eq = elevator_alloc(q, e);
431 if (!eq)
432 return -ENOMEM;
433
434 dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
435 if (!dd) {
436 kobject_put(&eq->kobj);
437 return -ENOMEM;
438 }
439 eq->elevator_data = dd;
440
441 INIT_LIST_HEAD(&dd->fifo_list[READ]);
442 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
443 dd->sort_list[READ] = RB_ROOT;
444 dd->sort_list[WRITE] = RB_ROOT;
445 dd->fifo_expire[READ] = read_expire;
446 dd->fifo_expire[WRITE] = write_expire;
447 dd->writes_starved = writes_starved;
448 dd->front_merges = 1;
449 dd->fifo_batch = fifo_batch;
450
451 spin_lock_irq(q->queue_lock);
452 q->elevator = eq;
453 spin_unlock_irq(q->queue_lock);
454 return 0;
455}
456
457/*
458 * sysfs parts below
459 */
460
461static ssize_t
462deadline_var_show(int var, char *page)
463{
464 return sprintf(page, "%d\n", var);
465}
466
467static void
468deadline_var_store(int *var, const char *page)
469{
470 char *p = (char *) page;
471
472 *var = simple_strtol(p, &p, 10);
473}
474
475#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
476static ssize_t __FUNC(struct elevator_queue *e, char *page) \
477{ \
478 struct deadline_data *dd = e->elevator_data; \
479 int __data = __VAR; \
480 if (__CONV) \
481 __data = jiffies_to_msecs(__data); \
482 return deadline_var_show(__data, (page)); \
483}
484SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
485SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
486SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
487SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
488SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
489#undef SHOW_FUNCTION
490
491#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
492static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
493{ \
494 struct deadline_data *dd = e->elevator_data; \
495 int __data; \
496 deadline_var_store(&__data, (page)); \
497 if (__data < (MIN)) \
498 __data = (MIN); \
499 else if (__data > (MAX)) \
500 __data = (MAX); \
501 if (__CONV) \
502 *(__PTR) = msecs_to_jiffies(__data); \
503 else \
504 *(__PTR) = __data; \
505 return count; \
506}
507STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
508STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
509STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
510STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
511STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
512#undef STORE_FUNCTION
513
514#define DD_ATTR(name) \
515 __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
516
517static struct elv_fs_entry deadline_attrs[] = {
518 DD_ATTR(read_expire),
519 DD_ATTR(write_expire),
520 DD_ATTR(writes_starved),
521 DD_ATTR(front_merges),
522 DD_ATTR(fifo_batch),
523 __ATTR_NULL
524};
525
526static struct elevator_type iosched_deadline = {
527 .ops.sq = {
528 .elevator_merge_fn = deadline_merge,
529 .elevator_merged_fn = deadline_merged_request,
530 .elevator_merge_req_fn = deadline_merged_requests,
531 .elevator_dispatch_fn = deadline_dispatch_requests,
532 .elevator_completed_req_fn = deadline_completed_request,
533 .elevator_add_req_fn = deadline_add_request,
534 .elevator_former_req_fn = elv_rb_former_request,
535 .elevator_latter_req_fn = elv_rb_latter_request,
536 .elevator_init_fn = deadline_init_queue,
537 .elevator_exit_fn = deadline_exit_queue,
538 },
539
540 .elevator_attrs = deadline_attrs,
541 .elevator_name = "deadline",
542 .elevator_owner = THIS_MODULE,
543};
544
545static int __init deadline_init(void)
546{
547 return elv_register(&iosched_deadline);
548}
549
550static void __exit deadline_exit(void)
551{
552 elv_unregister(&iosched_deadline);
553}
554
555module_init(deadline_init);
556module_exit(deadline_exit);
557
558MODULE_AUTHOR("Jens Axboe");
559MODULE_LICENSE("GPL");
560MODULE_DESCRIPTION("deadline IO scheduler");
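
The file deleted above is the legacy deadline elevator; the same policy lives on in mq-deadline. Its core decision sits in deadline_dispatch_requests(): keep streaming in sector order while dd->batching is under fifo_batch, and fall back to FIFO (arrival) order once the oldest request's deadline has passed. The following is a toy user-space sketch of that selection only, with invented names; it is not kernel code, and the read/write split and write-starvation handling are left out.

    /*
     * Toy model of the dispatch policy removed above: batch sequentially,
     * restart from the FIFO when a deadline expires. Illustrative only.
     */
    #include <stdbool.h>
    #include <time.h>

    struct toy_rq {
        time_t expiry;              /* absolute deadline for this request */
    };

    struct toy_dd {
        struct toy_rq *fifo_head;   /* oldest queued request (arrival order) */
        struct toy_rq *next_rq;     /* next request in sector order */
        int batching;               /* requests dispatched in current batch */
        int fifo_batch;             /* batch quota before deadlines are checked */
    };

    static bool toy_fifo_expired(const struct toy_dd *dd, time_t now)
    {
        return dd->fifo_head && now >= dd->fifo_head->expiry;
    }

    /* choose which request to hand to the dispatch queue next */
    static struct toy_rq *toy_pick(struct toy_dd *dd, time_t now)
    {
        struct toy_rq *rq;

        if (dd->next_rq && dd->batching < dd->fifo_batch) {
            rq = dd->next_rq;               /* keep the sequential batch going */
        } else if (!dd->next_rq || toy_fifo_expired(dd, now)) {
            rq = dd->fifo_head;             /* deadline hit: restart from FIFO */
            dd->batching = 0;
        } else {
            rq = dd->next_rq;               /* new batch, still sequential */
            dd->batching = 0;
        }

        if (rq)
            dd->batching++;
        return rq;
    }

Removing the chosen request from both lists and advancing next_rq is what deadline_move_request() above takes care of in the real code.
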
diff --git a/block/elevator.c b/block/elevator.c
index 8fdcd64ae12e..f05e90d4e695 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -61,10 +61,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
61 struct request_queue *q = rq->q; 61 struct request_queue *q = rq->q;
62 struct elevator_queue *e = q->elevator; 62 struct elevator_queue *e = q->elevator;
63 63
64 if (e->uses_mq && e->type->ops.mq.allow_merge) 64 if (e->type->ops.allow_merge)
65 return e->type->ops.mq.allow_merge(q, rq, bio); 65 return e->type->ops.allow_merge(q, rq, bio);
66 else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
67 return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
68 66
69 return 1; 67 return 1;
70} 68}
@@ -95,14 +93,14 @@ static bool elevator_match(const struct elevator_type *e, const char *name)
95} 93}
96 94
97/* 95/*
98 * Return scheduler with name 'name' and with matching 'mq capability 96 * Return scheduler with name 'name'
99 */ 97 */
100static struct elevator_type *elevator_find(const char *name, bool mq) 98static struct elevator_type *elevator_find(const char *name)
101{ 99{
102 struct elevator_type *e; 100 struct elevator_type *e;
103 101
104 list_for_each_entry(e, &elv_list, list) { 102 list_for_each_entry(e, &elv_list, list) {
105 if (elevator_match(e, name) && (mq == e->uses_mq)) 103 if (elevator_match(e, name))
106 return e; 104 return e;
107 } 105 }
108 106
@@ -121,12 +119,12 @@ static struct elevator_type *elevator_get(struct request_queue *q,
121 119
122 spin_lock(&elv_list_lock); 120 spin_lock(&elv_list_lock);
123 121
124 e = elevator_find(name, q->mq_ops != NULL); 122 e = elevator_find(name);
125 if (!e && try_loading) { 123 if (!e && try_loading) {
126 spin_unlock(&elv_list_lock); 124 spin_unlock(&elv_list_lock);
127 request_module("%s-iosched", name); 125 request_module("%s-iosched", name);
128 spin_lock(&elv_list_lock); 126 spin_lock(&elv_list_lock);
129 e = elevator_find(name, q->mq_ops != NULL); 127 e = elevator_find(name);
130 } 128 }
131 129
132 if (e && !try_module_get(e->elevator_owner)) 130 if (e && !try_module_get(e->elevator_owner))
@@ -150,26 +148,6 @@ static int __init elevator_setup(char *str)
150 148
151__setup("elevator=", elevator_setup); 149__setup("elevator=", elevator_setup);
152 150
153/* called during boot to load the elevator chosen by the elevator param */
154void __init load_default_elevator_module(void)
155{
156 struct elevator_type *e;
157
158 if (!chosen_elevator[0])
159 return;
160
161 /*
162 * Boot parameter is deprecated, we haven't supported that for MQ.
163 * Only look for non-mq schedulers from here.
164 */
165 spin_lock(&elv_list_lock);
166 e = elevator_find(chosen_elevator, false);
167 spin_unlock(&elv_list_lock);
168
169 if (!e)
170 request_module("%s-iosched", chosen_elevator);
171}
172
173static struct kobj_type elv_ktype; 151static struct kobj_type elv_ktype;
174 152
175struct elevator_queue *elevator_alloc(struct request_queue *q, 153struct elevator_queue *elevator_alloc(struct request_queue *q,
@@ -185,7 +163,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
185 kobject_init(&eq->kobj, &elv_ktype); 163 kobject_init(&eq->kobj, &elv_ktype);
186 mutex_init(&eq->sysfs_lock); 164 mutex_init(&eq->sysfs_lock);
187 hash_init(eq->hash); 165 hash_init(eq->hash);
188 eq->uses_mq = e->uses_mq;
189 166
190 return eq; 167 return eq;
191} 168}
@@ -200,54 +177,11 @@ static void elevator_release(struct kobject *kobj)
200 kfree(e); 177 kfree(e);
201} 178}
202 179
203/*
204 * Use the default elevator specified by config boot param for non-mq devices,
205 * or by config option. Don't try to load modules as we could be running off
206 * async and request_module() isn't allowed from async.
207 */
208int elevator_init(struct request_queue *q)
209{
210 struct elevator_type *e = NULL;
211 int err = 0;
212
213 /*
214 * q->sysfs_lock must be held to provide mutual exclusion between
215 * elevator_switch() and here.
216 */
217 mutex_lock(&q->sysfs_lock);
218 if (unlikely(q->elevator))
219 goto out_unlock;
220
221 if (*chosen_elevator) {
222 e = elevator_get(q, chosen_elevator, false);
223 if (!e)
224 printk(KERN_ERR "I/O scheduler %s not found\n",
225 chosen_elevator);
226 }
227
228 if (!e)
229 e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
230 if (!e) {
231 printk(KERN_ERR
232 "Default I/O scheduler not found. Using noop.\n");
233 e = elevator_get(q, "noop", false);
234 }
235
236 err = e->ops.sq.elevator_init_fn(q, e);
237 if (err)
238 elevator_put(e);
239out_unlock:
240 mutex_unlock(&q->sysfs_lock);
241 return err;
242}
243
244void elevator_exit(struct request_queue *q, struct elevator_queue *e) 180void elevator_exit(struct request_queue *q, struct elevator_queue *e)
245{ 181{
246 mutex_lock(&e->sysfs_lock); 182 mutex_lock(&e->sysfs_lock);
247 if (e->uses_mq && e->type->ops.mq.exit_sched) 183 if (e->type->ops.exit_sched)
248 blk_mq_exit_sched(q, e); 184 blk_mq_exit_sched(q, e);
249 else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
250 e->type->ops.sq.elevator_exit_fn(e);
251 mutex_unlock(&e->sysfs_lock); 185 mutex_unlock(&e->sysfs_lock);
252 186
253 kobject_put(&e->kobj); 187 kobject_put(&e->kobj);
@@ -356,68 +290,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector)
356} 290}
357EXPORT_SYMBOL(elv_rb_find); 291EXPORT_SYMBOL(elv_rb_find);
358 292
359/*
360 * Insert rq into dispatch queue of q. Queue lock must be held on
361 * entry. rq is sort instead into the dispatch queue. To be used by
362 * specific elevators.
363 */
364void elv_dispatch_sort(struct request_queue *q, struct request *rq)
365{
366 sector_t boundary;
367 struct list_head *entry;
368
369 if (q->last_merge == rq)
370 q->last_merge = NULL;
371
372 elv_rqhash_del(q, rq);
373
374 q->nr_sorted--;
375
376 boundary = q->end_sector;
377 list_for_each_prev(entry, &q->queue_head) {
378 struct request *pos = list_entry_rq(entry);
379
380 if (req_op(rq) != req_op(pos))
381 break;
382 if (rq_data_dir(rq) != rq_data_dir(pos))
383 break;
384 if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
385 break;
386 if (blk_rq_pos(rq) >= boundary) {
387 if (blk_rq_pos(pos) < boundary)
388 continue;
389 } else {
390 if (blk_rq_pos(pos) >= boundary)
391 break;
392 }
393 if (blk_rq_pos(rq) >= blk_rq_pos(pos))
394 break;
395 }
396
397 list_add(&rq->queuelist, entry);
398}
399EXPORT_SYMBOL(elv_dispatch_sort);
400
401/*
402 * Insert rq into dispatch queue of q. Queue lock must be held on
403 * entry. rq is added to the back of the dispatch queue. To be used by
404 * specific elevators.
405 */
406void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
407{
408 if (q->last_merge == rq)
409 q->last_merge = NULL;
410
411 elv_rqhash_del(q, rq);
412
413 q->nr_sorted--;
414
415 q->end_sector = rq_end_sector(rq);
416 q->boundary_rq = rq;
417 list_add_tail(&rq->queuelist, &q->queue_head);
418}
419EXPORT_SYMBOL(elv_dispatch_add_tail);
420
421enum elv_merge elv_merge(struct request_queue *q, struct request **req, 293enum elv_merge elv_merge(struct request_queue *q, struct request **req,
422 struct bio *bio) 294 struct bio *bio)
423{ 295{
@@ -457,10 +329,8 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
457 return ELEVATOR_BACK_MERGE; 329 return ELEVATOR_BACK_MERGE;
458 } 330 }
459 331
460 if (e->uses_mq && e->type->ops.mq.request_merge) 332 if (e->type->ops.request_merge)
461 return e->type->ops.mq.request_merge(q, req, bio); 333 return e->type->ops.request_merge(q, req, bio);
462 else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
463 return e->type->ops.sq.elevator_merge_fn(q, req, bio);
464 334
465 return ELEVATOR_NO_MERGE; 335 return ELEVATOR_NO_MERGE;
466} 336}
@@ -511,10 +381,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq,
511{ 381{
512 struct elevator_queue *e = q->elevator; 382 struct elevator_queue *e = q->elevator;
513 383
514 if (e->uses_mq && e->type->ops.mq.request_merged) 384 if (e->type->ops.request_merged)
515 e->type->ops.mq.request_merged(q, rq, type); 385 e->type->ops.request_merged(q, rq, type);
516 else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
517 e->type->ops.sq.elevator_merged_fn(q, rq, type);
518 386
519 if (type == ELEVATOR_BACK_MERGE) 387 if (type == ELEVATOR_BACK_MERGE)
520 elv_rqhash_reposition(q, rq); 388 elv_rqhash_reposition(q, rq);
@@ -526,176 +394,20 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
526 struct request *next) 394 struct request *next)
527{ 395{
528 struct elevator_queue *e = q->elevator; 396 struct elevator_queue *e = q->elevator;
529 bool next_sorted = false;
530
531 if (e->uses_mq && e->type->ops.mq.requests_merged)
532 e->type->ops.mq.requests_merged(q, rq, next);
533 else if (e->type->ops.sq.elevator_merge_req_fn) {
534 next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
535 if (next_sorted)
536 e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
537 }
538 397
539 elv_rqhash_reposition(q, rq); 398 if (e->type->ops.requests_merged)
540 399 e->type->ops.requests_merged(q, rq, next);
541 if (next_sorted) {
542 elv_rqhash_del(q, next);
543 q->nr_sorted--;
544 }
545 400
401 elv_rqhash_reposition(q, rq);
546 q->last_merge = rq; 402 q->last_merge = rq;
547} 403}
548 404
549void elv_bio_merged(struct request_queue *q, struct request *rq,
550 struct bio *bio)
551{
552 struct elevator_queue *e = q->elevator;
553
554 if (WARN_ON_ONCE(e->uses_mq))
555 return;
556
557 if (e->type->ops.sq.elevator_bio_merged_fn)
558 e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
559}
560
561void elv_requeue_request(struct request_queue *q, struct request *rq)
562{
563 /*
564 * it already went through dequeue, we need to decrement the
565 * in_flight count again
566 */
567 if (blk_account_rq(rq)) {
568 q->in_flight[rq_is_sync(rq)]--;
569 if (rq->rq_flags & RQF_SORTED)
570 elv_deactivate_rq(q, rq);
571 }
572
573 rq->rq_flags &= ~RQF_STARTED;
574
575 blk_pm_requeue_request(rq);
576
577 __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
578}
579
580void elv_drain_elevator(struct request_queue *q)
581{
582 struct elevator_queue *e = q->elevator;
583 static int printed;
584
585 if (WARN_ON_ONCE(e->uses_mq))
586 return;
587
588 lockdep_assert_held(q->queue_lock);
589
590 while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
591 ;
592 if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) {
593 printk(KERN_ERR "%s: forced dispatching is broken "
594 "(nr_sorted=%u), please report this\n",
595 q->elevator->type->elevator_name, q->nr_sorted);
596 }
597}
598
599void __elv_add_request(struct request_queue *q, struct request *rq, int where)
600{
601 trace_block_rq_insert(q, rq);
602
603 blk_pm_add_request(q, rq);
604
605 rq->q = q;
606
607 if (rq->rq_flags & RQF_SOFTBARRIER) {
608 /* barriers are scheduling boundary, update end_sector */
609 if (!blk_rq_is_passthrough(rq)) {
610 q->end_sector = rq_end_sector(rq);
611 q->boundary_rq = rq;
612 }
613 } else if (!(rq->rq_flags & RQF_ELVPRIV) &&
614 (where == ELEVATOR_INSERT_SORT ||
615 where == ELEVATOR_INSERT_SORT_MERGE))
616 where = ELEVATOR_INSERT_BACK;
617
618 switch (where) {
619 case ELEVATOR_INSERT_REQUEUE:
620 case ELEVATOR_INSERT_FRONT:
621 rq->rq_flags |= RQF_SOFTBARRIER;
622 list_add(&rq->queuelist, &q->queue_head);
623 break;
624
625 case ELEVATOR_INSERT_BACK:
626 rq->rq_flags |= RQF_SOFTBARRIER;
627 elv_drain_elevator(q);
628 list_add_tail(&rq->queuelist, &q->queue_head);
629 /*
630 * We kick the queue here for the following reasons.
631 * - The elevator might have returned NULL previously
632 * to delay requests and returned them now. As the
633 * queue wasn't empty before this request, ll_rw_blk
634 * won't run the queue on return, resulting in hang.
635 * - Usually, back inserted requests won't be merged
636 * with anything. There's no point in delaying queue
637 * processing.
638 */
639 __blk_run_queue(q);
640 break;
641
642 case ELEVATOR_INSERT_SORT_MERGE:
643 /*
644 * If we succeed in merging this request with one in the
645 * queue already, we are done - rq has now been freed,
646 * so no need to do anything further.
647 */
648 if (elv_attempt_insert_merge(q, rq))
649 break;
650 /* fall through */
651 case ELEVATOR_INSERT_SORT:
652 BUG_ON(blk_rq_is_passthrough(rq));
653 rq->rq_flags |= RQF_SORTED;
654 q->nr_sorted++;
655 if (rq_mergeable(rq)) {
656 elv_rqhash_add(q, rq);
657 if (!q->last_merge)
658 q->last_merge = rq;
659 }
660
661 /*
662 * Some ioscheds (cfq) run q->request_fn directly, so
663 * rq cannot be accessed after calling
664 * elevator_add_req_fn.
665 */
666 q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
667 break;
668
669 case ELEVATOR_INSERT_FLUSH:
670 rq->rq_flags |= RQF_SOFTBARRIER;
671 blk_insert_flush(rq);
672 break;
673 default:
674 printk(KERN_ERR "%s: bad insertion point %d\n",
675 __func__, where);
676 BUG();
677 }
678}
679EXPORT_SYMBOL(__elv_add_request);
680
681void elv_add_request(struct request_queue *q, struct request *rq, int where)
682{
683 unsigned long flags;
684
685 spin_lock_irqsave(q->queue_lock, flags);
686 __elv_add_request(q, rq, where);
687 spin_unlock_irqrestore(q->queue_lock, flags);
688}
689EXPORT_SYMBOL(elv_add_request);
690
691struct request *elv_latter_request(struct request_queue *q, struct request *rq) 405struct request *elv_latter_request(struct request_queue *q, struct request *rq)
692{ 406{
693 struct elevator_queue *e = q->elevator; 407 struct elevator_queue *e = q->elevator;
694 408
695 if (e->uses_mq && e->type->ops.mq.next_request) 409 if (e->type->ops.next_request)
696 return e->type->ops.mq.next_request(q, rq); 410 return e->type->ops.next_request(q, rq);
697 else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
698 return e->type->ops.sq.elevator_latter_req_fn(q, rq);
699 411
700 return NULL; 412 return NULL;
701} 413}
@@ -704,66 +416,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
704{ 416{
705 struct elevator_queue *e = q->elevator; 417 struct elevator_queue *e = q->elevator;
706 418
707 if (e->uses_mq && e->type->ops.mq.former_request) 419 if (e->type->ops.former_request)
708 return e->type->ops.mq.former_request(q, rq); 420 return e->type->ops.former_request(q, rq);
709 if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
710 return e->type->ops.sq.elevator_former_req_fn(q, rq);
711 return NULL;
712}
713
714int elv_set_request(struct request_queue *q, struct request *rq,
715 struct bio *bio, gfp_t gfp_mask)
716{
717 struct elevator_queue *e = q->elevator;
718
719 if (WARN_ON_ONCE(e->uses_mq))
720 return 0;
721
722 if (e->type->ops.sq.elevator_set_req_fn)
723 return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
724 return 0;
725}
726
727void elv_put_request(struct request_queue *q, struct request *rq)
728{
729 struct elevator_queue *e = q->elevator;
730
731 if (WARN_ON_ONCE(e->uses_mq))
732 return;
733
734 if (e->type->ops.sq.elevator_put_req_fn)
735 e->type->ops.sq.elevator_put_req_fn(rq);
736}
737
738int elv_may_queue(struct request_queue *q, unsigned int op)
739{
740 struct elevator_queue *e = q->elevator;
741
742 if (WARN_ON_ONCE(e->uses_mq))
743 return 0;
744
745 if (e->type->ops.sq.elevator_may_queue_fn)
746 return e->type->ops.sq.elevator_may_queue_fn(q, op);
747
748 return ELV_MQUEUE_MAY;
749}
750
751void elv_completed_request(struct request_queue *q, struct request *rq)
752{
753 struct elevator_queue *e = q->elevator;
754
755 if (WARN_ON_ONCE(e->uses_mq))
756 return;
757 421
758 /* 422 return NULL;
759 * request is released from the driver, io must be done
760 */
761 if (blk_account_rq(rq)) {
762 q->in_flight[rq_is_sync(rq)]--;
763 if ((rq->rq_flags & RQF_SORTED) &&
764 e->type->ops.sq.elevator_completed_req_fn)
765 e->type->ops.sq.elevator_completed_req_fn(q, rq);
766 }
767} 423}
768 424
769#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 425#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
@@ -832,8 +488,6 @@ int elv_register_queue(struct request_queue *q)
832 } 488 }
833 kobject_uevent(&e->kobj, KOBJ_ADD); 489 kobject_uevent(&e->kobj, KOBJ_ADD);
834 e->registered = 1; 490 e->registered = 1;
835 if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
836 e->type->ops.sq.elevator_registered_fn(q);
837 } 491 }
838 return error; 492 return error;
839} 493}
@@ -873,7 +527,7 @@ int elv_register(struct elevator_type *e)
873 527
874 /* register, don't allow duplicate names */ 528 /* register, don't allow duplicate names */
875 spin_lock(&elv_list_lock); 529 spin_lock(&elv_list_lock);
876 if (elevator_find(e->elevator_name, e->uses_mq)) { 530 if (elevator_find(e->elevator_name)) {
877 spin_unlock(&elv_list_lock); 531 spin_unlock(&elv_list_lock);
878 kmem_cache_destroy(e->icq_cache); 532 kmem_cache_destroy(e->icq_cache);
879 return -EBUSY; 533 return -EBUSY;
@@ -881,12 +535,6 @@ int elv_register(struct elevator_type *e)
881 list_add_tail(&e->list, &elv_list); 535 list_add_tail(&e->list, &elv_list);
882 spin_unlock(&elv_list_lock); 536 spin_unlock(&elv_list_lock);
883 537
884 /* print pretty message */
885 if (elevator_match(e, chosen_elevator) ||
886 (!*chosen_elevator &&
887 elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
888 def = " (default)";
889
890 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 538 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
891 def); 539 def);
892 return 0; 540 return 0;
@@ -989,71 +637,17 @@ out_unlock:
989 */ 637 */
990static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 638static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
991{ 639{
992 struct elevator_queue *old = q->elevator;
993 bool old_registered = false;
994 int err; 640 int err;
995 641
996 lockdep_assert_held(&q->sysfs_lock); 642 lockdep_assert_held(&q->sysfs_lock);
997 643
998 if (q->mq_ops) { 644 blk_mq_freeze_queue(q);
999 blk_mq_freeze_queue(q); 645 blk_mq_quiesce_queue(q);
1000 blk_mq_quiesce_queue(q);
1001
1002 err = elevator_switch_mq(q, new_e);
1003
1004 blk_mq_unquiesce_queue(q);
1005 blk_mq_unfreeze_queue(q);
1006
1007 return err;
1008 }
1009
1010 /*
1011 * Turn on BYPASS and drain all requests w/ elevator private data.
1012 * Block layer doesn't call into a quiesced elevator - all requests
1013 * are directly put on the dispatch list without elevator data
1014 * using INSERT_BACK. All requests have SOFTBARRIER set and no
1015 * merge happens either.
1016 */
1017 if (old) {
1018 old_registered = old->registered;
1019
1020 blk_queue_bypass_start(q);
1021
1022 /* unregister and clear all auxiliary data of the old elevator */
1023 if (old_registered)
1024 elv_unregister_queue(q);
1025
1026 ioc_clear_queue(q);
1027 }
1028 646
1029 /* allocate, init and register new elevator */ 647 err = elevator_switch_mq(q, new_e);
1030 err = new_e->ops.sq.elevator_init_fn(q, new_e);
1031 if (err)
1032 goto fail_init;
1033
1034 err = elv_register_queue(q);
1035 if (err)
1036 goto fail_register;
1037
1038 /* done, kill the old one and finish */
1039 if (old) {
1040 elevator_exit(q, old);
1041 blk_queue_bypass_end(q);
1042 }
1043 648
1044 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 649 blk_mq_unquiesce_queue(q);
1045 650 blk_mq_unfreeze_queue(q);
1046 return 0;
1047
1048fail_register:
1049 elevator_exit(q, q->elevator);
1050fail_init:
1051 /* switch failed, restore and re-register old elevator */
1052 if (old) {
1053 q->elevator = old;
1054 elv_register_queue(q);
1055 blk_queue_bypass_end(q);
1056 }
1057 651
1058 return err; 652 return err;
1059} 653}
@@ -1073,7 +667,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
1073 /* 667 /*
1074 * Special case for mq, turn off scheduling 668 * Special case for mq, turn off scheduling
1075 */ 669 */
1076 if (q->mq_ops && !strncmp(name, "none", 4)) 670 if (!strncmp(name, "none", 4))
1077 return elevator_switch(q, NULL); 671 return elevator_switch(q, NULL);
1078 672
1079 strlcpy(elevator_name, name, sizeof(elevator_name)); 673 strlcpy(elevator_name, name, sizeof(elevator_name));
@@ -1091,8 +685,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
1091 685
1092static inline bool elv_support_iosched(struct request_queue *q) 686static inline bool elv_support_iosched(struct request_queue *q)
1093{ 687{
1094 if (q->mq_ops && q->tag_set && (q->tag_set->flags & 688 if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
1095 BLK_MQ_F_NO_SCHED))
1096 return false; 689 return false;
1097 return true; 690 return true;
1098} 691}
@@ -1102,7 +695,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1102{ 695{
1103 int ret; 696 int ret;
1104 697
1105 if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q)) 698 if (!queue_is_mq(q) || !elv_support_iosched(q))
1106 return count; 699 return count;
1107 700
1108 ret = __elevator_change(q, name); 701 ret = __elevator_change(q, name);
@@ -1117,10 +710,9 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1117 struct elevator_queue *e = q->elevator; 710 struct elevator_queue *e = q->elevator;
1118 struct elevator_type *elv = NULL; 711 struct elevator_type *elv = NULL;
1119 struct elevator_type *__e; 712 struct elevator_type *__e;
1120 bool uses_mq = q->mq_ops != NULL;
1121 int len = 0; 713 int len = 0;
1122 714
1123 if (!queue_is_rq_based(q)) 715 if (!queue_is_mq(q))
1124 return sprintf(name, "none\n"); 716 return sprintf(name, "none\n");
1125 717
1126 if (!q->elevator) 718 if (!q->elevator)
@@ -1130,19 +722,16 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
1130 722
1131 spin_lock(&elv_list_lock); 723 spin_lock(&elv_list_lock);
1132 list_for_each_entry(__e, &elv_list, list) { 724 list_for_each_entry(__e, &elv_list, list) {
1133 if (elv && elevator_match(elv, __e->elevator_name) && 725 if (elv && elevator_match(elv, __e->elevator_name)) {
1134 (__e->uses_mq == uses_mq)) {
1135 len += sprintf(name+len, "[%s] ", elv->elevator_name); 726 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1136 continue; 727 continue;
1137 } 728 }
1138 if (__e->uses_mq && q->mq_ops && elv_support_iosched(q)) 729 if (elv_support_iosched(q))
1139 len += sprintf(name+len, "%s ", __e->elevator_name);
1140 else if (!__e->uses_mq && !q->mq_ops)
1141 len += sprintf(name+len, "%s ", __e->elevator_name); 730 len += sprintf(name+len, "%s ", __e->elevator_name);
1142 } 731 }
1143 spin_unlock(&elv_list_lock); 732 spin_unlock(&elv_list_lock);
1144 733
1145 if (q->mq_ops && q->elevator) 734 if (q->elevator)
1146 len += sprintf(name+len, "none"); 735 len += sprintf(name+len, "none");
1147 736
1148 len += sprintf(len+name, "\n"); 737 len += sprintf(len+name, "\n");
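
With the single-queue path gone, every hook in elevator.c above collapses from an "if (e->uses_mq) ... else if (...sq...)" pair into one indirection through a single ops table. A minimal user-space sketch of that shape follows; the names are invented and this is not the kernel structure layout.

    #include <stdio.h>

    struct toy_ops {
        int (*allow_merge)(int a, int b);
    };

    struct toy_elevator {
        struct toy_ops ops;             /* one table: blk-mq is the only path */
    };

    static int always(int a, int b)
    {
        (void)a; (void)b;
        return 1;
    }

    /* every elevator.c hook now reduces to this single indirection */
    static int toy_allow_merge(struct toy_elevator *e, int a, int b)
    {
        if (e->ops.allow_merge)
            return e->ops.allow_merge(a, b);
        return 1;
    }

    int main(void)
    {
        struct toy_elevator e = { .ops = { .allow_merge = always } };

        printf("%d\n", toy_allow_merge(&e, 0, 0));
        return 0;
    }
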
diff --git a/block/genhd.c b/block/genhd.c
index cff6bdf27226..1dd8fd6613b8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -47,51 +47,64 @@ static void disk_release_events(struct gendisk *disk);
47 47
48void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) 48void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
49{ 49{
50 if (q->mq_ops) 50 if (queue_is_mq(q))
51 return; 51 return;
52 52
53 atomic_inc(&part->in_flight[rw]); 53 part_stat_local_inc(part, in_flight[rw]);
54 if (part->partno) 54 if (part->partno)
55 atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); 55 part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]);
56} 56}
57 57
58void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) 58void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
59{ 59{
60 if (q->mq_ops) 60 if (queue_is_mq(q))
61 return; 61 return;
62 62
63 atomic_dec(&part->in_flight[rw]); 63 part_stat_local_dec(part, in_flight[rw]);
64 if (part->partno) 64 if (part->partno)
65 atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); 65 part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
66} 66}
67 67
68void part_in_flight(struct request_queue *q, struct hd_struct *part, 68unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
69 unsigned int inflight[2])
70{ 69{
71 if (q->mq_ops) { 70 int cpu;
72 blk_mq_in_flight(q, part, inflight); 71 unsigned int inflight;
73 return; 72
73 if (queue_is_mq(q)) {
74 return blk_mq_in_flight(q, part);
74 } 75 }
75 76
76 inflight[0] = atomic_read(&part->in_flight[0]) + 77 inflight = 0;
77 atomic_read(&part->in_flight[1]); 78 for_each_possible_cpu(cpu) {
78 if (part->partno) { 79 inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
79 part = &part_to_disk(part)->part0; 80 part_stat_local_read_cpu(part, in_flight[1], cpu);
80 inflight[1] = atomic_read(&part->in_flight[0]) +
81 atomic_read(&part->in_flight[1]);
82 } 81 }
82 if ((int)inflight < 0)
83 inflight = 0;
84
85 return inflight;
83} 86}
84 87
85void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, 88void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
86 unsigned int inflight[2]) 89 unsigned int inflight[2])
87{ 90{
88 if (q->mq_ops) { 91 int cpu;
92
93 if (queue_is_mq(q)) {
89 blk_mq_in_flight_rw(q, part, inflight); 94 blk_mq_in_flight_rw(q, part, inflight);
90 return; 95 return;
91 } 96 }
92 97
93 inflight[0] = atomic_read(&part->in_flight[0]); 98 inflight[0] = 0;
94 inflight[1] = atomic_read(&part->in_flight[1]); 99 inflight[1] = 0;
100 for_each_possible_cpu(cpu) {
101 inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
102 inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
103 }
104 if ((int)inflight[0] < 0)
105 inflight[0] = 0;
106 if ((int)inflight[1] < 0)
107 inflight[1] = 0;
95} 108}
96 109
97struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) 110struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
@@ -1325,8 +1338,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1325 struct disk_part_iter piter; 1338 struct disk_part_iter piter;
1326 struct hd_struct *hd; 1339 struct hd_struct *hd;
1327 char buf[BDEVNAME_SIZE]; 1340 char buf[BDEVNAME_SIZE];
1328 unsigned int inflight[2]; 1341 unsigned int inflight;
1329 int cpu;
1330 1342
1331 /* 1343 /*
1332 if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) 1344 if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1338,10 +1350,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1338 1350
1339 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); 1351 disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
1340 while ((hd = disk_part_iter_next(&piter))) { 1352 while ((hd = disk_part_iter_next(&piter))) {
1341 cpu = part_stat_lock(); 1353 inflight = part_in_flight(gp->queue, hd);
1342 part_round_stats(gp->queue, cpu, hd);
1343 part_stat_unlock();
1344 part_in_flight(gp->queue, hd, inflight);
1345 seq_printf(seqf, "%4d %7d %s " 1354 seq_printf(seqf, "%4d %7d %s "
1346 "%lu %lu %lu %u " 1355 "%lu %lu %lu %u "
1347 "%lu %lu %lu %u " 1356 "%lu %lu %lu %u "
@@ -1357,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1357 part_stat_read(hd, merges[STAT_WRITE]), 1366 part_stat_read(hd, merges[STAT_WRITE]),
1358 part_stat_read(hd, sectors[STAT_WRITE]), 1367 part_stat_read(hd, sectors[STAT_WRITE]),
1359 (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), 1368 (unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
1360 inflight[0], 1369 inflight,
1361 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1370 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1362 jiffies_to_msecs(part_stat_read(hd, time_in_queue)), 1371 jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
1363 part_stat_read(hd, ios[STAT_DISCARD]), 1372 part_stat_read(hd, ios[STAT_DISCARD]),
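
The genhd.c hunks above replace the atomic in_flight counters with per-cpu counters (part_stat_local_inc()/part_stat_local_read_cpu() in the diff) that are only summed when the statistics are read. Since a request can start on one CPU and complete on another, individual counters may go negative, which is why the summed total is clamped at zero. Below is a minimal user-space sketch of that idea with invented names; it is a model, not the kernel implementation.

    #include <stdio.h>

    #define NR_CPUS 4

    static int inflight[NR_CPUS];       /* one counter per cpu, no locking */

    static void start_io(int cpu) { inflight[cpu]++; }
    static void end_io(int cpu)   { inflight[cpu]--; }

    /*
     * Like part_in_flight(): sum all cpus and clamp, because submission and
     * completion may hit different cpus and leave some counters negative.
     */
    static unsigned int total_in_flight(void)
    {
        int cpu, sum = 0;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
            sum += inflight[cpu];
        return sum < 0 ? 0 : (unsigned int)sum;
    }

    int main(void)
    {
        start_io(0);                    /* submitted on cpu 0 */
        start_io(1);                    /* submitted on cpu 1 */
        end_io(3);                      /* completed on cpu 3: inflight[3] == -1 */
        printf("%u\n", total_in_flight());      /* prints 1 */
        return 0;
    }
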
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index eccac01a10b6..ec6a04e01bc1 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -195,7 +195,7 @@ struct kyber_hctx_data {
195 unsigned int batching; 195 unsigned int batching;
196 struct kyber_ctx_queue *kcqs; 196 struct kyber_ctx_queue *kcqs;
197 struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; 197 struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
198 wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; 198 struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
199 struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; 199 struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
200 atomic_t wait_index[KYBER_NUM_DOMAINS]; 200 atomic_t wait_index[KYBER_NUM_DOMAINS];
201}; 201};
@@ -501,10 +501,11 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
501 501
502 for (i = 0; i < KYBER_NUM_DOMAINS; i++) { 502 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
503 INIT_LIST_HEAD(&khd->rqs[i]); 503 INIT_LIST_HEAD(&khd->rqs[i]);
504 init_waitqueue_func_entry(&khd->domain_wait[i], 504 khd->domain_wait[i].sbq = NULL;
505 init_waitqueue_func_entry(&khd->domain_wait[i].wait,
505 kyber_domain_wake); 506 kyber_domain_wake);
506 khd->domain_wait[i].private = hctx; 507 khd->domain_wait[i].wait.private = hctx;
507 INIT_LIST_HEAD(&khd->domain_wait[i].entry); 508 INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
508 atomic_set(&khd->wait_index[i], 0); 509 atomic_set(&khd->wait_index[i], 0);
509 } 510 }
510 511
@@ -576,7 +577,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
576{ 577{
577 struct kyber_hctx_data *khd = hctx->sched_data; 578 struct kyber_hctx_data *khd = hctx->sched_data;
578 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); 579 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
579 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; 580 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
580 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); 581 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
581 struct list_head *rq_list = &kcq->rq_list[sched_domain]; 582 struct list_head *rq_list = &kcq->rq_list[sched_domain];
582 bool merged; 583 bool merged;
@@ -602,7 +603,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
602 603
603 list_for_each_entry_safe(rq, next, rq_list, queuelist) { 604 list_for_each_entry_safe(rq, next, rq_list, queuelist) {
604 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); 605 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
605 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; 606 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
606 struct list_head *head = &kcq->rq_list[sched_domain]; 607 struct list_head *head = &kcq->rq_list[sched_domain];
607 608
608 spin_lock(&kcq->lock); 609 spin_lock(&kcq->lock);
@@ -611,7 +612,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
611 else 612 else
612 list_move_tail(&rq->queuelist, head); 613 list_move_tail(&rq->queuelist, head);
613 sbitmap_set_bit(&khd->kcq_map[sched_domain], 614 sbitmap_set_bit(&khd->kcq_map[sched_domain],
614 rq->mq_ctx->index_hw); 615 rq->mq_ctx->index_hw[hctx->type]);
615 blk_mq_sched_request_inserted(rq); 616 blk_mq_sched_request_inserted(rq);
616 spin_unlock(&kcq->lock); 617 spin_unlock(&kcq->lock);
617 } 618 }
@@ -698,12 +699,13 @@ static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
698 flush_busy_kcq, &data); 699 flush_busy_kcq, &data);
699} 700}
700 701
701static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, 702static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
702 void *key) 703 void *key)
703{ 704{
704 struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); 705 struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
706 struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);
705 707
706 list_del_init(&wait->entry); 708 sbitmap_del_wait_queue(wait);
707 blk_mq_run_hw_queue(hctx, true); 709 blk_mq_run_hw_queue(hctx, true);
708 return 1; 710 return 1;
709} 711}
@@ -714,7 +716,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
714{ 716{
715 unsigned int sched_domain = khd->cur_domain; 717 unsigned int sched_domain = khd->cur_domain;
716 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; 718 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
717 wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; 719 struct sbq_wait *wait = &khd->domain_wait[sched_domain];
718 struct sbq_wait_state *ws; 720 struct sbq_wait_state *ws;
719 int nr; 721 int nr;
720 722
@@ -725,11 +727,11 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
725 * run when one becomes available. Note that this is serialized on 727 * run when one becomes available. Note that this is serialized on
726 * khd->lock, but we still need to be careful about the waker. 728 * khd->lock, but we still need to be careful about the waker.
727 */ 729 */
728 if (nr < 0 && list_empty_careful(&wait->entry)) { 730 if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
729 ws = sbq_wait_ptr(domain_tokens, 731 ws = sbq_wait_ptr(domain_tokens,
730 &khd->wait_index[sched_domain]); 732 &khd->wait_index[sched_domain]);
731 khd->domain_ws[sched_domain] = ws; 733 khd->domain_ws[sched_domain] = ws;
732 add_wait_queue(&ws->wait, wait); 734 sbitmap_add_wait_queue(domain_tokens, ws, wait);
733 735
734 /* 736 /*
735 * Try again in case a token was freed before we got on the wait 737 * Try again in case a token was freed before we got on the wait
@@ -745,10 +747,10 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
745 * between the !list_empty_careful() check and us grabbing the lock, but 747 * between the !list_empty_careful() check and us grabbing the lock, but
746 * list_del_init() is okay with that. 748 * list_del_init() is okay with that.
747 */ 749 */
748 if (nr >= 0 && !list_empty_careful(&wait->entry)) { 750 if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
749 ws = khd->domain_ws[sched_domain]; 751 ws = khd->domain_ws[sched_domain];
750 spin_lock_irq(&ws->wait.lock); 752 spin_lock_irq(&ws->wait.lock);
751 list_del_init(&wait->entry); 753 sbitmap_del_wait_queue(wait);
752 spin_unlock_irq(&ws->wait.lock); 754 spin_unlock_irq(&ws->wait.lock);
753 } 755 }
754 756
@@ -951,7 +953,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
951{ \ 953{ \
952 struct blk_mq_hw_ctx *hctx = data; \ 954 struct blk_mq_hw_ctx *hctx = data; \
953 struct kyber_hctx_data *khd = hctx->sched_data; \ 955 struct kyber_hctx_data *khd = hctx->sched_data; \
954 wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ 956 wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
955 \ 957 \
956 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ 958 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
957 return 0; \ 959 return 0; \
@@ -1017,7 +1019,7 @@ static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
1017#endif 1019#endif
1018 1020
1019static struct elevator_type kyber_sched = { 1021static struct elevator_type kyber_sched = {
1020 .ops.mq = { 1022 .ops = {
1021 .init_sched = kyber_init_sched, 1023 .init_sched = kyber_init_sched,
1022 .exit_sched = kyber_exit_sched, 1024 .exit_sched = kyber_exit_sched,
1023 .init_hctx = kyber_init_hctx, 1025 .init_hctx = kyber_init_hctx,
@@ -1032,7 +1034,6 @@ static struct elevator_type kyber_sched = {
1032 .dispatch_request = kyber_dispatch_request, 1034 .dispatch_request = kyber_dispatch_request,
1033 .has_work = kyber_has_work, 1035 .has_work = kyber_has_work,
1034 }, 1036 },
1035 .uses_mq = true,
1036#ifdef CONFIG_BLK_DEBUG_FS 1037#ifdef CONFIG_BLK_DEBUG_FS
1037 .queue_debugfs_attrs = kyber_queue_debugfs_attrs, 1038 .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
1038 .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, 1039 .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
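
The kyber hunks above wrap each per-domain wait_queue_entry_t in a struct sbq_wait so the new sbitmap_add_wait_queue()/sbitmap_del_wait_queue() helpers can track it, and kyber_domain_wake() recovers the wrapper from the embedded entry with container_of(). Here is a user-space sketch of just that wrapping pattern; all names are invented, and the macro simply mirrors the kernel's container_of().

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct toy_wait_entry {
        int (*func)(struct toy_wait_entry *we);
    };

    struct toy_sbq_wait {
        void *sbq;                      /* which queue we are parked on, if any */
        struct toy_wait_entry wait;     /* embedded entry handed to the waitqueue */
    };

    /* the wake callback only sees the embedded entry, like kyber_domain_wake() */
    static int toy_wake(struct toy_wait_entry *we)
    {
        struct toy_sbq_wait *w = container_of(we, struct toy_sbq_wait, wait);

        w->sbq = NULL;                  /* "removed from wait queue" bookkeeping */
        return 1;
    }

    int main(void)
    {
        struct toy_sbq_wait w = { .sbq = (void *)0x1, .wait = { .func = toy_wake } };

        w.wait.func(&w.wait);
        printf("parked: %s\n", w.sbq ? "yes" : "no");   /* prints "no" */
        return 0;
    }
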
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 099a9e05854c..14288f864e94 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -373,9 +373,16 @@ done:
373 373
374/* 374/*
375 * One confusing aspect here is that we get called for a specific 375 * One confusing aspect here is that we get called for a specific
376 * hardware queue, but we return a request that may not be for a 376 * hardware queue, but we may return a request that is for a
377 * different hardware queue. This is because mq-deadline has shared 377 * different hardware queue. This is because mq-deadline has shared
378 * state for all hardware queues, in terms of sorting, FIFOs, etc. 378 * state for all hardware queues, in terms of sorting, FIFOs, etc.
379 *
380 * For a zoned block device, __dd_dispatch_request() may return NULL
381 * if all the queued write requests are directed at zones that are already
382 * locked due to on-going write requests. In this case, make sure to mark
383 * the queue as needing a restart to ensure that the queue is run again
384 * and the pending writes dispatched once the target zones for the ongoing
385 * write requests are unlocked in dd_finish_request().
379 */ 386 */
380static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) 387static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
381{ 388{
@@ -384,6 +391,9 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
384 391
385 spin_lock(&dd->lock); 392 spin_lock(&dd->lock);
386 rq = __dd_dispatch_request(dd); 393 rq = __dd_dispatch_request(dd);
394 if (!rq && blk_queue_is_zoned(hctx->queue) &&
395 !list_empty(&dd->fifo_list[WRITE]))
396 blk_mq_sched_mark_restart_hctx(hctx);
387 spin_unlock(&dd->lock); 397 spin_unlock(&dd->lock);
388 398
389 return rq; 399 return rq;
@@ -761,7 +771,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
761#endif 771#endif
762 772
763static struct elevator_type mq_deadline = { 773static struct elevator_type mq_deadline = {
764 .ops.mq = { 774 .ops = {
765 .insert_requests = dd_insert_requests, 775 .insert_requests = dd_insert_requests,
766 .dispatch_request = dd_dispatch_request, 776 .dispatch_request = dd_dispatch_request,
767 .prepare_request = dd_prepare_request, 777 .prepare_request = dd_prepare_request,
@@ -777,7 +787,6 @@ static struct elevator_type mq_deadline = {
777 .exit_sched = dd_exit_queue, 787 .exit_sched = dd_exit_queue,
778 }, 788 },
779 789
780 .uses_mq = true,
781#ifdef CONFIG_BLK_DEBUG_FS 790#ifdef CONFIG_BLK_DEBUG_FS
782 .queue_debugfs_attrs = deadline_queue_debugfs_attrs, 791 .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
783#endif 792#endif
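
The mq-deadline hunks above add the long comment and the blk_mq_sched_mark_restart_hctx() call for the zoned case: when nothing could be dispatched because every queued write targets a locked zone, the hctx is marked so it is run again once a completing write unlocks its zone. A toy user-space model of that decision follows; the names are invented and no kernel API is used.

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_queue {
        bool zoned;
        int pending_writes;
        bool needs_restart;
    };

    /* returns 1 if a request was dispatched, 0 otherwise */
    static int toy_dispatch(struct toy_queue *q, bool zone_locked)
    {
        if (q->pending_writes && !zone_locked) {
            q->pending_writes--;
            return 1;
        }
        /*
         * Like dd_dispatch_request(): nothing went out but writes remain,
         * so ask to be run again after a completion unlocks a zone.
         */
        if (q->zoned && q->pending_writes)
            q->needs_restart = true;
        return 0;
    }

    int main(void)
    {
        struct toy_queue q = { .zoned = true, .pending_writes = 1 };

        toy_dispatch(&q, true);                 /* zone busy: nothing dispatched */
        printf("restart: %d\n", q.needs_restart);       /* prints 1 */
        return 0;
    }
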
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
deleted file mode 100644
index 2d1b15d89b45..000000000000
--- a/block/noop-iosched.c
+++ /dev/null
@@ -1,124 +0,0 @@
1/*
2 * elevator noop
3 */
4#include <linux/blkdev.h>
5#include <linux/elevator.h>
6#include <linux/bio.h>
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/init.h>
10
11struct noop_data {
12 struct list_head queue;
13};
14
15static void noop_merged_requests(struct request_queue *q, struct request *rq,
16 struct request *next)
17{
18 list_del_init(&next->queuelist);
19}
20
21static int noop_dispatch(struct request_queue *q, int force)
22{
23 struct noop_data *nd = q->elevator->elevator_data;
24 struct request *rq;
25
26 rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
27 if (rq) {
28 list_del_init(&rq->queuelist);
29 elv_dispatch_sort(q, rq);
30 return 1;
31 }
32 return 0;
33}
34
35static void noop_add_request(struct request_queue *q, struct request *rq)
36{
37 struct noop_data *nd = q->elevator->elevator_data;
38
39 list_add_tail(&rq->queuelist, &nd->queue);
40}
41
42static struct request *
43noop_former_request(struct request_queue *q, struct request *rq)
44{
45 struct noop_data *nd = q->elevator->elevator_data;
46
47 if (rq->queuelist.prev == &nd->queue)
48 return NULL;
49 return list_prev_entry(rq, queuelist);
50}
51
52static struct request *
53noop_latter_request(struct request_queue *q, struct request *rq)
54{
55 struct noop_data *nd = q->elevator->elevator_data;
56
57 if (rq->queuelist.next == &nd->queue)
58 return NULL;
59 return list_next_entry(rq, queuelist);
60}
61
62static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
63{
64 struct noop_data *nd;
65 struct elevator_queue *eq;
66
67 eq = elevator_alloc(q, e);
68 if (!eq)
69 return -ENOMEM;
70
71 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
72 if (!nd) {
73 kobject_put(&eq->kobj);
74 return -ENOMEM;
75 }
76 eq->elevator_data = nd;
77
78 INIT_LIST_HEAD(&nd->queue);
79
80 spin_lock_irq(q->queue_lock);
81 q->elevator = eq;
82 spin_unlock_irq(q->queue_lock);
83 return 0;
84}
85
86static void noop_exit_queue(struct elevator_queue *e)
87{
88 struct noop_data *nd = e->elevator_data;
89
90 BUG_ON(!list_empty(&nd->queue));
91 kfree(nd);
92}
93
94static struct elevator_type elevator_noop = {
95 .ops.sq = {
96 .elevator_merge_req_fn = noop_merged_requests,
97 .elevator_dispatch_fn = noop_dispatch,
98 .elevator_add_req_fn = noop_add_request,
99 .elevator_former_req_fn = noop_former_request,
100 .elevator_latter_req_fn = noop_latter_request,
101 .elevator_init_fn = noop_init_queue,
102 .elevator_exit_fn = noop_exit_queue,
103 },
104 .elevator_name = "noop",
105 .elevator_owner = THIS_MODULE,
106};
107
108static int __init noop_init(void)
109{
110 return elv_register(&elevator_noop);
111}
112
113static void __exit noop_exit(void)
114{
115 elv_unregister(&elevator_noop);
116}
117
118module_init(noop_init);
119module_exit(noop_exit);
120
121
122MODULE_AUTHOR("Jens Axboe");
123MODULE_LICENSE("GPL");
124MODULE_DESCRIPTION("No-op IO scheduler");
diff --git a/block/partition-generic.c b/block/partition-generic.c
index d3d14e81fb12..8e596a8dff32 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -120,13 +120,9 @@ ssize_t part_stat_show(struct device *dev,
120{ 120{
121 struct hd_struct *p = dev_to_part(dev); 121 struct hd_struct *p = dev_to_part(dev);
122 struct request_queue *q = part_to_disk(p)->queue; 122 struct request_queue *q = part_to_disk(p)->queue;
123 unsigned int inflight[2]; 123 unsigned int inflight;
124 int cpu;
125 124
126 cpu = part_stat_lock(); 125 inflight = part_in_flight(q, p);
127 part_round_stats(q, cpu, p);
128 part_stat_unlock();
129 part_in_flight(q, p, inflight);
130 return sprintf(buf, 126 return sprintf(buf,
131 "%8lu %8lu %8llu %8u " 127 "%8lu %8lu %8llu %8u "
132 "%8lu %8lu %8llu %8u " 128 "%8lu %8lu %8llu %8u "
@@ -141,7 +137,7 @@ ssize_t part_stat_show(struct device *dev,
141 part_stat_read(p, merges[STAT_WRITE]), 137 part_stat_read(p, merges[STAT_WRITE]),
142 (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), 138 (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
143 (unsigned int)part_stat_read_msecs(p, STAT_WRITE), 139 (unsigned int)part_stat_read_msecs(p, STAT_WRITE),
144 inflight[0], 140 inflight,
145 jiffies_to_msecs(part_stat_read(p, io_ticks)), 141 jiffies_to_msecs(part_stat_read(p, io_ticks)),
146 jiffies_to_msecs(part_stat_read(p, time_in_queue)), 142 jiffies_to_msecs(part_stat_read(p, time_in_queue)),
147 part_stat_read(p, ios[STAT_DISCARD]), 143 part_stat_read(p, ios[STAT_DISCARD]),
@@ -249,9 +245,10 @@ struct device_type part_type = {
249 .uevent = part_uevent, 245 .uevent = part_uevent,
250}; 246};
251 247
252static void delete_partition_rcu_cb(struct rcu_head *head) 248static void delete_partition_work_fn(struct work_struct *work)
253{ 249{
254 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); 250 struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct,
251 rcu_work);
255 252
256 part->start_sect = 0; 253 part->start_sect = 0;
257 part->nr_sects = 0; 254 part->nr_sects = 0;
@@ -262,7 +259,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
262void __delete_partition(struct percpu_ref *ref) 259void __delete_partition(struct percpu_ref *ref)
263{ 260{
264 struct hd_struct *part = container_of(ref, struct hd_struct, ref); 261 struct hd_struct *part = container_of(ref, struct hd_struct, ref);
265 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 262 INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn);
263 queue_rcu_work(system_wq, &part->rcu_work);
266} 264}
267 265
268/* 266/*
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 01306c018398..938ed513b070 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -919,8 +919,6 @@ static void ata_eh_set_pending(struct ata_port *ap, int fastdrain)
919void ata_qc_schedule_eh(struct ata_queued_cmd *qc) 919void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
920{ 920{
921 struct ata_port *ap = qc->ap; 921 struct ata_port *ap = qc->ap;
922 struct request_queue *q = qc->scsicmd->device->request_queue;
923 unsigned long flags;
924 922
925 WARN_ON(!ap->ops->error_handler); 923 WARN_ON(!ap->ops->error_handler);
926 924
@@ -932,9 +930,7 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
932 * Note that ATA_QCFLAG_FAILED is unconditionally set after 930 * Note that ATA_QCFLAG_FAILED is unconditionally set after
933 * this function completes. 931 * this function completes.
934 */ 932 */
935 spin_lock_irqsave(q->queue_lock, flags);
936 blk_abort_request(qc->scsicmd->request); 933 blk_abort_request(qc->scsicmd->request);
937 spin_unlock_irqrestore(q->queue_lock, flags);
938} 934}
939 935
940/** 936/**
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 7ca76ed2e71a..84d0fcebd6af 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -100,6 +100,10 @@ enum {
100 MAX_TAINT = 1000, /* cap on aoetgt taint */ 100 MAX_TAINT = 1000, /* cap on aoetgt taint */
101}; 101};
102 102
103struct aoe_req {
104 unsigned long nr_bios;
105};
106
103struct buf { 107struct buf {
104 ulong nframesout; 108 ulong nframesout;
105 struct bio *bio; 109 struct bio *bio;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index ed26b7287256..e2c6aae2d636 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -387,6 +387,7 @@ aoeblk_gdalloc(void *vp)
387 387
388 set = &d->tag_set; 388 set = &d->tag_set;
389 set->ops = &aoeblk_mq_ops; 389 set->ops = &aoeblk_mq_ops;
390 set->cmd_size = sizeof(struct aoe_req);
390 set->nr_hw_queues = 1; 391 set->nr_hw_queues = 1;
391 set->queue_depth = 128; 392 set->queue_depth = 128;
392 set->numa_node = NUMA_NO_NODE; 393 set->numa_node = NUMA_NO_NODE;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index bb2fba651bd2..3cf9bc5d8d95 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -822,17 +822,6 @@ out:
822 spin_unlock_irqrestore(&d->lock, flags); 822 spin_unlock_irqrestore(&d->lock, flags);
823} 823}
824 824
825static unsigned long
826rqbiocnt(struct request *r)
827{
828 struct bio *bio;
829 unsigned long n = 0;
830
831 __rq_for_each_bio(bio, r)
832 n++;
833 return n;
834}
835
836static void 825static void
837bufinit(struct buf *buf, struct request *rq, struct bio *bio) 826bufinit(struct buf *buf, struct request *rq, struct bio *bio)
838{ 827{
@@ -847,6 +836,7 @@ nextbuf(struct aoedev *d)
847{ 836{
848 struct request *rq; 837 struct request *rq;
849 struct request_queue *q; 838 struct request_queue *q;
839 struct aoe_req *req;
850 struct buf *buf; 840 struct buf *buf;
851 struct bio *bio; 841 struct bio *bio;
852 842
@@ -865,7 +855,11 @@ nextbuf(struct aoedev *d)
865 blk_mq_start_request(rq); 855 blk_mq_start_request(rq);
866 d->ip.rq = rq; 856 d->ip.rq = rq;
867 d->ip.nxbio = rq->bio; 857 d->ip.nxbio = rq->bio;
868 rq->special = (void *) rqbiocnt(rq); 858
859 req = blk_mq_rq_to_pdu(rq);
860 req->nr_bios = 0;
861 __rq_for_each_bio(bio, rq)
862 req->nr_bios++;
869 } 863 }
870 buf = mempool_alloc(d->bufpool, GFP_ATOMIC); 864 buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
871 if (buf == NULL) { 865 if (buf == NULL) {
@@ -1069,16 +1063,13 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
1069static void 1063static void
1070aoe_end_buf(struct aoedev *d, struct buf *buf) 1064aoe_end_buf(struct aoedev *d, struct buf *buf)
1071{ 1065{
1072 struct request *rq; 1066 struct request *rq = buf->rq;
1073 unsigned long n; 1067 struct aoe_req *req = blk_mq_rq_to_pdu(rq);
1074 1068
1075 if (buf == d->ip.buf) 1069 if (buf == d->ip.buf)
1076 d->ip.buf = NULL; 1070 d->ip.buf = NULL;
1077 rq = buf->rq;
1078 mempool_free(buf, d->bufpool); 1071 mempool_free(buf, d->bufpool);
1079 n = (unsigned long) rq->special; 1072 if (--req->nr_bios == 0)
1080 rq->special = (void *) --n;
1081 if (n == 0)
1082 aoe_end_request(d, rq, 0); 1073 aoe_end_request(d, rq, 0);
1083} 1074}
1084 1075
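
The aoe hunks above and below stop abusing rq->special as a bio counter and instead keep nr_bios in a per-request PDU sized by set->cmd_size and reached through blk_mq_rq_to_pdu(). A user-space sketch of the "request plus trailing driver data in one allocation" layout follows; the names are invented, and the real helper also accounts for the request's internal layout and alignment.

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_request {
        unsigned long tag;          /* keeps the trailing pdu naturally aligned */
        /* cmd_size bytes of driver data follow immediately after */
    };

    struct toy_aoe_req {
        unsigned long nr_bios;
    };

    /* like blk_mq_rq_to_pdu(): the driver pdu sits right behind the request */
    static void *toy_rq_to_pdu(struct toy_request *rq)
    {
        return rq + 1;
    }

    int main(void)
    {
        struct toy_request *rq = calloc(1, sizeof(*rq) + sizeof(struct toy_aoe_req));
        struct toy_aoe_req *req;

        if (!rq)
            return 1;
        req = toy_rq_to_pdu(rq);
        req->nr_bios = 3;               /* counted once when the request starts */
        while (req->nr_bios)
            req->nr_bios--;             /* one decrement per completed bio */
        printf("remaining: %lu\n", req->nr_bios);
        free(rq);
        return 0;
    }
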
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 9063f8efbd3b..5b49f1b33ebe 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -160,21 +160,22 @@ static void
160aoe_failip(struct aoedev *d) 160aoe_failip(struct aoedev *d)
161{ 161{
162 struct request *rq; 162 struct request *rq;
163 struct aoe_req *req;
163 struct bio *bio; 164 struct bio *bio;
164 unsigned long n;
165 165
166 aoe_failbuf(d, d->ip.buf); 166 aoe_failbuf(d, d->ip.buf);
167
168 rq = d->ip.rq; 167 rq = d->ip.rq;
169 if (rq == NULL) 168 if (rq == NULL)
170 return; 169 return;
170
171 req = blk_mq_rq_to_pdu(rq);
171 while ((bio = d->ip.nxbio)) { 172 while ((bio = d->ip.nxbio)) {
172 bio->bi_status = BLK_STS_IOERR; 173 bio->bi_status = BLK_STS_IOERR;
173 d->ip.nxbio = bio->bi_next; 174 d->ip.nxbio = bio->bi_next;
174 n = (unsigned long) rq->special; 175 req->nr_bios--;
175 rq->special = (void *) --n;
176 } 176 }
177 if ((unsigned long) rq->special == 0) 177
178 if (!req->nr_bios)
178 aoe_end_request(d, rq, 0); 179 aoe_end_request(d, rq, 0);
179} 180}
180 181
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index 251482066977..1e4e2971171c 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -24,7 +24,7 @@ static void discover_timer(struct timer_list *t)
24 aoecmd_cfg(0xffff, 0xff); 24 aoecmd_cfg(0xffff, 0xff);
25} 25}
26 26
27static void 27static void __exit
28aoe_exit(void) 28aoe_exit(void)
29{ 29{
30 del_timer_sync(&timer); 30 del_timer_sync(&timer);
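
Marking aoe_exit() __exit is a small cleanup: functions used only at module unload can live in the exit section, which is discarded entirely when the driver is built into the kernel. A generic sketch with illustrative names:

#include <linux/init.h>
#include <linux/module.h>

static int __init example_init(void)
{
	/* registration would go here */
	return 0;
}

static void __exit example_exit(void)
{
	/* teardown only runs for modular builds; dropped when built in */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
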
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index f88b4c26d422..b0dbbdfeb33e 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1471,6 +1471,15 @@ static void setup_req_params( int drive )
1471 ReqTrack, ReqSector, (unsigned long)ReqData )); 1471 ReqTrack, ReqSector, (unsigned long)ReqData ));
1472} 1472}
1473 1473
1474static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx)
1475{
1476 spin_lock_irq(&ataflop_lock);
1477 atari_disable_irq(IRQ_MFP_FDC);
1478 finish_fdc();
1479 atari_enable_irq(IRQ_MFP_FDC);
1480 spin_unlock_irq(&ataflop_lock);
1481}
1482
1474static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, 1483static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
1475 const struct blk_mq_queue_data *bd) 1484 const struct blk_mq_queue_data *bd)
1476{ 1485{
@@ -1947,6 +1956,7 @@ static const struct block_device_operations floppy_fops = {
1947 1956
1948static const struct blk_mq_ops ataflop_mq_ops = { 1957static const struct blk_mq_ops ataflop_mq_ops = {
1949 .queue_rq = ataflop_queue_rq, 1958 .queue_rq = ataflop_queue_rq,
1959 .commit_rqs = ataflop_commit_rqs,
1950}; 1960};
1951 1961
1952static struct kobject *floppy_find(dev_t dev, int *part, void *data) 1962static struct kobject *floppy_find(dev_t dev, int *part, void *data)
@@ -1982,6 +1992,7 @@ static int __init atari_floppy_init (void)
1982 &ataflop_mq_ops, 2, 1992 &ataflop_mq_ops, 2,
1983 BLK_MQ_F_SHOULD_MERGE); 1993 BLK_MQ_F_SHOULD_MERGE);
1984 if (IS_ERR(unit[i].disk->queue)) { 1994 if (IS_ERR(unit[i].disk->queue)) {
1995 put_disk(unit[i].disk);
1985 ret = PTR_ERR(unit[i].disk->queue); 1996 ret = PTR_ERR(unit[i].disk->queue);
1986 unit[i].disk->queue = NULL; 1997 unit[i].disk->queue = NULL;
1987 goto err; 1998 goto err;
@@ -2033,18 +2044,13 @@ static int __init atari_floppy_init (void)
2033 return 0; 2044 return 0;
2034 2045
2035err: 2046err:
2036 do { 2047 while (--i >= 0) {
2037 struct gendisk *disk = unit[i].disk; 2048 struct gendisk *disk = unit[i].disk;
2038 2049
2039 if (disk) { 2050 blk_cleanup_queue(disk->queue);
2040 if (disk->queue) { 2051 blk_mq_free_tag_set(&unit[i].tag_set);
2041 blk_cleanup_queue(disk->queue); 2052 put_disk(unit[i].disk);
2042 disk->queue = NULL; 2053 }
2043 }
2044 blk_mq_free_tag_set(&unit[i].tag_set);
2045 put_disk(unit[i].disk);
2046 }
2047 } while (i--);
2048 2054
2049 unregister_blkdev(FLOPPY_MAJOR, "fd"); 2055 unregister_blkdev(FLOPPY_MAJOR, "fd");
2050 return ret; 2056 return ret;
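
The new ataflop_commit_rqs() implements the .commit_rqs hook added to blk-mq in this series: when ->queue_rq() was called with bd->last cleared and the closing request of the batch never arrives (for example after a dispatch error), ->commit_rqs() gives the driver a chance to kick the hardware anyway. A hedged sketch of the pairing, not the ataflop code itself; my_kick_hardware() stands in for whatever starts queued work:

#include <linux/blk-mq.h>

static void my_kick_hardware(void)
{
	/* e.g. ring a doorbell or start the controller state machine */
}

static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
				const struct blk_mq_queue_data *bd)
{
	blk_mq_start_request(bd->rq);
	/* ... queue bd->rq internally ... */

	if (bd->last)			/* end of the current batch */
		my_kick_hardware();
	return BLK_STS_OK;
}

static void my_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	/* called when a batch ended without a bd->last == true request */
	my_kick_hardware();
}

static const struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
	.commit_rqs	= my_commit_rqs,
};
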
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa8204214ac0..f973a2a845c8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2792,7 +2792,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
2792 2792
2793 drbd_init_set_defaults(device); 2793 drbd_init_set_defaults(device);
2794 2794
2795 q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock); 2795 q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
2796 if (!q) 2796 if (!q)
2797 goto out_no_q; 2797 goto out_no_q;
2798 device->rq_queue = q; 2798 device->rq_queue = q;
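
The drbd hunk reflects an API change that comes with the legacy-queue removal: blk_alloc_queue_node() no longer takes a request_lock spinlock, so bio-based drivers simply allocate the queue and install their make_request handler. A sketch against the 4.21-era interface; my_make_request() is illustrative and just completes the bio:

#include <linux/blkdev.h>
#include <linux/bio.h>

static blk_qc_t my_make_request(struct request_queue *q, struct bio *bio)
{
	bio_endio(bio);			/* placeholder: complete immediately */
	return BLK_QC_T_NONE;
}

static struct request_queue *my_alloc_queue(void)
{
	struct request_queue *q;

	/* no queue_lock argument any more; there is no legacy request path */
	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
	if (!q)
		return NULL;

	blk_queue_make_request(q, my_make_request);
	return q;
}
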
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index fb23578e9a41..6f2856c6d0f2 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2231,7 +2231,6 @@ static void request_done(int uptodate)
2231{ 2231{
2232 struct request *req = current_req; 2232 struct request *req = current_req;
2233 struct request_queue *q; 2233 struct request_queue *q;
2234 unsigned long flags;
2235 int block; 2234 int block;
2236 char msg[sizeof("request done ") + sizeof(int) * 3]; 2235 char msg[sizeof("request done ") + sizeof(int) * 3];
2237 2236
@@ -2254,10 +2253,7 @@ static void request_done(int uptodate)
2254 if (block > _floppy->sect) 2253 if (block > _floppy->sect)
2255 DRS->maxtrack = 1; 2254 DRS->maxtrack = 1;
2256 2255
2257 /* unlock chained buffers */
2258 spin_lock_irqsave(q->queue_lock, flags);
2259 floppy_end_request(req, 0); 2256 floppy_end_request(req, 0);
2260 spin_unlock_irqrestore(q->queue_lock, flags);
2261 } else { 2257 } else {
2262 if (rq_data_dir(req) == WRITE) { 2258 if (rq_data_dir(req) == WRITE) {
2263 /* record write error information */ 2259 /* record write error information */
@@ -2269,9 +2265,7 @@ static void request_done(int uptodate)
2269 DRWE->last_error_sector = blk_rq_pos(req); 2265 DRWE->last_error_sector = blk_rq_pos(req);
2270 DRWE->last_error_generation = DRS->generation; 2266 DRWE->last_error_generation = DRS->generation;
2271 } 2267 }
2272 spin_lock_irqsave(q->queue_lock, flags);
2273 floppy_end_request(req, BLK_STS_IOERR); 2268 floppy_end_request(req, BLK_STS_IOERR);
2274 spin_unlock_irqrestore(q->queue_lock, flags);
2275 } 2269 }
2276} 2270}
2277 2271
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cb0cc8685076..0939f36548c9 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,13 +77,14 @@
77#include <linux/falloc.h> 77#include <linux/falloc.h>
78#include <linux/uio.h> 78#include <linux/uio.h>
79#include <linux/ioprio.h> 79#include <linux/ioprio.h>
80#include <linux/blk-cgroup.h>
80 81
81#include "loop.h" 82#include "loop.h"
82 83
83#include <linux/uaccess.h> 84#include <linux/uaccess.h>
84 85
85static DEFINE_IDR(loop_index_idr); 86static DEFINE_IDR(loop_index_idr);
86static DEFINE_MUTEX(loop_index_mutex); 87static DEFINE_MUTEX(loop_ctl_mutex);
87 88
88static int max_part; 89static int max_part;
89static int part_shift; 90static int part_shift;
@@ -630,18 +631,7 @@ static void loop_reread_partitions(struct loop_device *lo,
630{ 631{
631 int rc; 632 int rc;
632 633
633 /* 634 rc = blkdev_reread_part(bdev);
634 * bd_mutex has been held already in release path, so don't
635 * acquire it if this function is called in such case.
636 *
637 * If the reread partition isn't from release path, lo_refcnt
638 * must be at least one and it can only become zero when the
639 * current holder is released.
640 */
641 if (!atomic_read(&lo->lo_refcnt))
642 rc = __blkdev_reread_part(bdev);
643 else
644 rc = blkdev_reread_part(bdev);
645 if (rc) 635 if (rc)
646 pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", 636 pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
647 __func__, lo->lo_number, lo->lo_file_name, rc); 637 __func__, lo->lo_number, lo->lo_file_name, rc);
@@ -688,26 +678,30 @@ static int loop_validate_file(struct file *file, struct block_device *bdev)
688static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, 678static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
689 unsigned int arg) 679 unsigned int arg)
690{ 680{
691 struct file *file, *old_file; 681 struct file *file = NULL, *old_file;
692 int error; 682 int error;
683 bool partscan;
693 684
685 error = mutex_lock_killable(&loop_ctl_mutex);
686 if (error)
687 return error;
694 error = -ENXIO; 688 error = -ENXIO;
695 if (lo->lo_state != Lo_bound) 689 if (lo->lo_state != Lo_bound)
696 goto out; 690 goto out_err;
697 691
698 /* the loop device has to be read-only */ 692 /* the loop device has to be read-only */
699 error = -EINVAL; 693 error = -EINVAL;
700 if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) 694 if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
701 goto out; 695 goto out_err;
702 696
703 error = -EBADF; 697 error = -EBADF;
704 file = fget(arg); 698 file = fget(arg);
705 if (!file) 699 if (!file)
706 goto out; 700 goto out_err;
707 701
708 error = loop_validate_file(file, bdev); 702 error = loop_validate_file(file, bdev);
709 if (error) 703 if (error)
710 goto out_putf; 704 goto out_err;
711 705
712 old_file = lo->lo_backing_file; 706 old_file = lo->lo_backing_file;
713 707
@@ -715,7 +709,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
715 709
716 /* size of the new backing store needs to be the same */ 710 /* size of the new backing store needs to be the same */
717 if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) 711 if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
718 goto out_putf; 712 goto out_err;
719 713
720 /* and ... switch */ 714 /* and ... switch */
721 blk_mq_freeze_queue(lo->lo_queue); 715 blk_mq_freeze_queue(lo->lo_queue);
@@ -726,15 +720,22 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
726 lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); 720 lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
727 loop_update_dio(lo); 721 loop_update_dio(lo);
728 blk_mq_unfreeze_queue(lo->lo_queue); 722 blk_mq_unfreeze_queue(lo->lo_queue);
729 723 partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
724 mutex_unlock(&loop_ctl_mutex);
725 /*
726 * We must drop file reference outside of loop_ctl_mutex as dropping
727 * the file ref can take bd_mutex which creates circular locking
728 * dependency.
729 */
730 fput(old_file); 730 fput(old_file);
731 if (lo->lo_flags & LO_FLAGS_PARTSCAN) 731 if (partscan)
732 loop_reread_partitions(lo, bdev); 732 loop_reread_partitions(lo, bdev);
733 return 0; 733 return 0;
734 734
735 out_putf: 735out_err:
736 fput(file); 736 mutex_unlock(&loop_ctl_mutex);
737 out: 737 if (file)
738 fput(file);
738 return error; 739 return error;
739} 740}
740 741
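
The loop_change_fd() rework above shows the locking discipline used throughout the rewritten loop driver: change state under loop_ctl_mutex, note any follow-up work in local variables, then drop the mutex before calling fput() or rereading partitions, since both can take bd_mutex and would otherwise invert the lock order. A generic sketch of that shape; struct my_dev, MY_DEV_PARTSCAN and rescan_partitions_of() are illustrative, not loop driver symbols:

#include <linux/mutex.h>
#include <linux/fs.h>

struct my_dev {
	struct file	*backing_file;
	unsigned int	flags;
#define MY_DEV_PARTSCAN	0x1
};

static DEFINE_MUTEX(ctl_mutex);

static void rescan_partitions_of(struct my_dev *dev)
{
	/* would end up in blkdev_reread_part(), which takes bd_mutex */
}

static int change_backing_file(struct my_dev *dev, struct file *new_file)
{
	struct file *old_file;
	bool rescan;
	int err;

	err = mutex_lock_killable(&ctl_mutex);
	if (err)
		return err;
	old_file = dev->backing_file;
	dev->backing_file = new_file;
	rescan = dev->flags & MY_DEV_PARTSCAN;	/* sample state under the lock */
	mutex_unlock(&ctl_mutex);

	/* fput() may take bd_mutex, so it happens after ctl_mutex is dropped */
	fput(old_file);
	if (rescan)
		rescan_partitions_of(dev);
	return 0;
}
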
@@ -909,6 +910,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
909 int lo_flags = 0; 910 int lo_flags = 0;
910 int error; 911 int error;
911 loff_t size; 912 loff_t size;
913 bool partscan;
912 914
913 /* This is safe, since we have a reference from open(). */ 915 /* This is safe, since we have a reference from open(). */
914 __module_get(THIS_MODULE); 916 __module_get(THIS_MODULE);
@@ -918,13 +920,17 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
918 if (!file) 920 if (!file)
919 goto out; 921 goto out;
920 922
923 error = mutex_lock_killable(&loop_ctl_mutex);
924 if (error)
925 goto out_putf;
926
921 error = -EBUSY; 927 error = -EBUSY;
922 if (lo->lo_state != Lo_unbound) 928 if (lo->lo_state != Lo_unbound)
923 goto out_putf; 929 goto out_unlock;
924 930
925 error = loop_validate_file(file, bdev); 931 error = loop_validate_file(file, bdev);
926 if (error) 932 if (error)
927 goto out_putf; 933 goto out_unlock;
928 934
929 mapping = file->f_mapping; 935 mapping = file->f_mapping;
930 inode = mapping->host; 936 inode = mapping->host;
@@ -936,10 +942,10 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
936 error = -EFBIG; 942 error = -EFBIG;
937 size = get_loop_size(lo, file); 943 size = get_loop_size(lo, file);
938 if ((loff_t)(sector_t)size != size) 944 if ((loff_t)(sector_t)size != size)
939 goto out_putf; 945 goto out_unlock;
940 error = loop_prepare_queue(lo); 946 error = loop_prepare_queue(lo);
941 if (error) 947 if (error)
942 goto out_putf; 948 goto out_unlock;
943 949
944 error = 0; 950 error = 0;
945 951
@@ -971,18 +977,22 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
971 lo->lo_state = Lo_bound; 977 lo->lo_state = Lo_bound;
972 if (part_shift) 978 if (part_shift)
973 lo->lo_flags |= LO_FLAGS_PARTSCAN; 979 lo->lo_flags |= LO_FLAGS_PARTSCAN;
974 if (lo->lo_flags & LO_FLAGS_PARTSCAN) 980 partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
975 loop_reread_partitions(lo, bdev);
976 981
977 /* Grab the block_device to prevent its destruction after we 982 /* Grab the block_device to prevent its destruction after we
978 * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). 983 * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev).
979 */ 984 */
980 bdgrab(bdev); 985 bdgrab(bdev);
986 mutex_unlock(&loop_ctl_mutex);
987 if (partscan)
988 loop_reread_partitions(lo, bdev);
981 return 0; 989 return 0;
982 990
983 out_putf: 991out_unlock:
992 mutex_unlock(&loop_ctl_mutex);
993out_putf:
984 fput(file); 994 fput(file);
985 out: 995out:
986 /* This is safe: open() is still holding a reference. */ 996 /* This is safe: open() is still holding a reference. */
987 module_put(THIS_MODULE); 997 module_put(THIS_MODULE);
988 return error; 998 return error;
@@ -1025,39 +1035,31 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
1025 return err; 1035 return err;
1026} 1036}
1027 1037
1028static int loop_clr_fd(struct loop_device *lo) 1038static int __loop_clr_fd(struct loop_device *lo, bool release)
1029{ 1039{
1030 struct file *filp = lo->lo_backing_file; 1040 struct file *filp = NULL;
1031 gfp_t gfp = lo->old_gfp_mask; 1041 gfp_t gfp = lo->old_gfp_mask;
1032 struct block_device *bdev = lo->lo_device; 1042 struct block_device *bdev = lo->lo_device;
1043 int err = 0;
1044 bool partscan = false;
1045 int lo_number;
1033 1046
1034 if (lo->lo_state != Lo_bound) 1047 mutex_lock(&loop_ctl_mutex);
1035 return -ENXIO; 1048 if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
1036 1049 err = -ENXIO;
1037 /* 1050 goto out_unlock;
1038 * If we've explicitly asked to tear down the loop device,
1039 * and it has an elevated reference count, set it for auto-teardown when
1040 * the last reference goes away. This stops $!~#$@ udev from
1041 * preventing teardown because it decided that it needs to run blkid on
1042 * the loopback device whenever they appear. xfstests is notorious for
1043 * failing tests because blkid via udev races with a losetup
1044 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
1045 * command to fail with EBUSY.
1046 */
1047 if (atomic_read(&lo->lo_refcnt) > 1) {
1048 lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
1049 mutex_unlock(&lo->lo_ctl_mutex);
1050 return 0;
1051 } 1051 }
1052 1052
1053 if (filp == NULL) 1053 filp = lo->lo_backing_file;
1054 return -EINVAL; 1054 if (filp == NULL) {
1055 err = -EINVAL;
1056 goto out_unlock;
1057 }
1055 1058
1056 /* freeze request queue during the transition */ 1059 /* freeze request queue during the transition */
1057 blk_mq_freeze_queue(lo->lo_queue); 1060 blk_mq_freeze_queue(lo->lo_queue);
1058 1061
1059 spin_lock_irq(&lo->lo_lock); 1062 spin_lock_irq(&lo->lo_lock);
1060 lo->lo_state = Lo_rundown;
1061 lo->lo_backing_file = NULL; 1063 lo->lo_backing_file = NULL;
1062 spin_unlock_irq(&lo->lo_lock); 1064 spin_unlock_irq(&lo->lo_lock);
1063 1065
@@ -1093,21 +1095,73 @@ static int loop_clr_fd(struct loop_device *lo)
1093 module_put(THIS_MODULE); 1095 module_put(THIS_MODULE);
1094 blk_mq_unfreeze_queue(lo->lo_queue); 1096 blk_mq_unfreeze_queue(lo->lo_queue);
1095 1097
1096 if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) 1098 partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
1097 loop_reread_partitions(lo, bdev); 1099 lo_number = lo->lo_number;
1098 lo->lo_flags = 0; 1100 lo->lo_flags = 0;
1099 if (!part_shift) 1101 if (!part_shift)
1100 lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; 1102 lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
1101 loop_unprepare_queue(lo); 1103 loop_unprepare_queue(lo);
1102 mutex_unlock(&lo->lo_ctl_mutex); 1104out_unlock:
1105 mutex_unlock(&loop_ctl_mutex);
1106 if (partscan) {
1107 /*
1108 * bd_mutex has been held already in release path, so don't
1109 * acquire it if this function is called in such case.
1110 *
1111 * If the reread partition isn't from release path, lo_refcnt
1112 * must be at least one and it can only become zero when the
1113 * current holder is released.
1114 */
1115 if (release)
1116 err = __blkdev_reread_part(bdev);
1117 else
1118 err = blkdev_reread_part(bdev);
1119 pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
1120 __func__, lo_number, err);
1121 /* Device is gone, no point in returning error */
1122 err = 0;
1123 }
1103 /* 1124 /*
1104 * Need not hold lo_ctl_mutex to fput backing file. 1125 * Need not hold loop_ctl_mutex to fput backing file.
1105 * Calling fput holding lo_ctl_mutex triggers a circular 1126 * Calling fput holding loop_ctl_mutex triggers a circular
1106 * lock dependency possibility warning as fput can take 1127 * lock dependency possibility warning as fput can take
1107 * bd_mutex which is usually taken before lo_ctl_mutex. 1128 * bd_mutex which is usually taken before loop_ctl_mutex.
1108 */ 1129 */
1109 fput(filp); 1130 if (filp)
1110 return 0; 1131 fput(filp);
1132 return err;
1133}
1134
1135static int loop_clr_fd(struct loop_device *lo)
1136{
1137 int err;
1138
1139 err = mutex_lock_killable(&loop_ctl_mutex);
1140 if (err)
1141 return err;
1142 if (lo->lo_state != Lo_bound) {
1143 mutex_unlock(&loop_ctl_mutex);
1144 return -ENXIO;
1145 }
1146 /*
1147 * If we've explicitly asked to tear down the loop device,
1148 * and it has an elevated reference count, set it for auto-teardown when
1149 * the last reference goes away. This stops $!~#$@ udev from
1150 * preventing teardown because it decided that it needs to run blkid on
1151 * the loopback device whenever they appear. xfstests is notorious for
1152 * failing tests because blkid via udev races with a losetup
1153 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
1154 * command to fail with EBUSY.
1155 */
1156 if (atomic_read(&lo->lo_refcnt) > 1) {
1157 lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
1158 mutex_unlock(&loop_ctl_mutex);
1159 return 0;
1160 }
1161 lo->lo_state = Lo_rundown;
1162 mutex_unlock(&loop_ctl_mutex);
1163
1164 return __loop_clr_fd(lo, false);
1111} 1165}
1112 1166
1113static int 1167static int
@@ -1116,47 +1170,58 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
1116 int err; 1170 int err;
1117 struct loop_func_table *xfer; 1171 struct loop_func_table *xfer;
1118 kuid_t uid = current_uid(); 1172 kuid_t uid = current_uid();
1173 struct block_device *bdev;
1174 bool partscan = false;
1119 1175
1176 err = mutex_lock_killable(&loop_ctl_mutex);
1177 if (err)
1178 return err;
1120 if (lo->lo_encrypt_key_size && 1179 if (lo->lo_encrypt_key_size &&
1121 !uid_eq(lo->lo_key_owner, uid) && 1180 !uid_eq(lo->lo_key_owner, uid) &&
1122 !capable(CAP_SYS_ADMIN)) 1181 !capable(CAP_SYS_ADMIN)) {
1123 return -EPERM; 1182 err = -EPERM;
1124 if (lo->lo_state != Lo_bound) 1183 goto out_unlock;
1125 return -ENXIO; 1184 }
1126 if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) 1185 if (lo->lo_state != Lo_bound) {
1127 return -EINVAL; 1186 err = -ENXIO;
1187 goto out_unlock;
1188 }
1189 if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) {
1190 err = -EINVAL;
1191 goto out_unlock;
1192 }
1128 1193
1129 /* I/O need to be drained during transfer transition */ 1194 /* I/O need to be drained during transfer transition */
1130 blk_mq_freeze_queue(lo->lo_queue); 1195 blk_mq_freeze_queue(lo->lo_queue);
1131 1196
1132 err = loop_release_xfer(lo); 1197 err = loop_release_xfer(lo);
1133 if (err) 1198 if (err)
1134 goto exit; 1199 goto out_unfreeze;
1135 1200
1136 if (info->lo_encrypt_type) { 1201 if (info->lo_encrypt_type) {
1137 unsigned int type = info->lo_encrypt_type; 1202 unsigned int type = info->lo_encrypt_type;
1138 1203
1139 if (type >= MAX_LO_CRYPT) { 1204 if (type >= MAX_LO_CRYPT) {
1140 err = -EINVAL; 1205 err = -EINVAL;
1141 goto exit; 1206 goto out_unfreeze;
1142 } 1207 }
1143 xfer = xfer_funcs[type]; 1208 xfer = xfer_funcs[type];
1144 if (xfer == NULL) { 1209 if (xfer == NULL) {
1145 err = -EINVAL; 1210 err = -EINVAL;
1146 goto exit; 1211 goto out_unfreeze;
1147 } 1212 }
1148 } else 1213 } else
1149 xfer = NULL; 1214 xfer = NULL;
1150 1215
1151 err = loop_init_xfer(lo, xfer, info); 1216 err = loop_init_xfer(lo, xfer, info);
1152 if (err) 1217 if (err)
1153 goto exit; 1218 goto out_unfreeze;
1154 1219
1155 if (lo->lo_offset != info->lo_offset || 1220 if (lo->lo_offset != info->lo_offset ||
1156 lo->lo_sizelimit != info->lo_sizelimit) { 1221 lo->lo_sizelimit != info->lo_sizelimit) {
1157 if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { 1222 if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
1158 err = -EFBIG; 1223 err = -EFBIG;
1159 goto exit; 1224 goto out_unfreeze;
1160 } 1225 }
1161 } 1226 }
1162 1227
@@ -1188,15 +1253,20 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
1188 /* update dio if lo_offset or transfer is changed */ 1253 /* update dio if lo_offset or transfer is changed */
1189 __loop_update_dio(lo, lo->use_dio); 1254 __loop_update_dio(lo, lo->use_dio);
1190 1255
1191 exit: 1256out_unfreeze:
1192 blk_mq_unfreeze_queue(lo->lo_queue); 1257 blk_mq_unfreeze_queue(lo->lo_queue);
1193 1258
1194 if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && 1259 if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
1195 !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { 1260 !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
1196 lo->lo_flags |= LO_FLAGS_PARTSCAN; 1261 lo->lo_flags |= LO_FLAGS_PARTSCAN;
1197 lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; 1262 lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
1198 loop_reread_partitions(lo, lo->lo_device); 1263 bdev = lo->lo_device;
1264 partscan = true;
1199 } 1265 }
1266out_unlock:
1267 mutex_unlock(&loop_ctl_mutex);
1268 if (partscan)
1269 loop_reread_partitions(lo, bdev);
1200 1270
1201 return err; 1271 return err;
1202} 1272}
@@ -1204,12 +1274,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
1204static int 1274static int
1205loop_get_status(struct loop_device *lo, struct loop_info64 *info) 1275loop_get_status(struct loop_device *lo, struct loop_info64 *info)
1206{ 1276{
1207 struct file *file; 1277 struct path path;
1208 struct kstat stat; 1278 struct kstat stat;
1209 int ret; 1279 int ret;
1210 1280
1281 ret = mutex_lock_killable(&loop_ctl_mutex);
1282 if (ret)
1283 return ret;
1211 if (lo->lo_state != Lo_bound) { 1284 if (lo->lo_state != Lo_bound) {
1212 mutex_unlock(&lo->lo_ctl_mutex); 1285 mutex_unlock(&loop_ctl_mutex);
1213 return -ENXIO; 1286 return -ENXIO;
1214 } 1287 }
1215 1288
@@ -1228,17 +1301,17 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
1228 lo->lo_encrypt_key_size); 1301 lo->lo_encrypt_key_size);
1229 } 1302 }
1230 1303
1231 /* Drop lo_ctl_mutex while we call into the filesystem. */ 1304 /* Drop loop_ctl_mutex while we call into the filesystem. */
1232 file = get_file(lo->lo_backing_file); 1305 path = lo->lo_backing_file->f_path;
1233 mutex_unlock(&lo->lo_ctl_mutex); 1306 path_get(&path);
1234 ret = vfs_getattr(&file->f_path, &stat, STATX_INO, 1307 mutex_unlock(&loop_ctl_mutex);
1235 AT_STATX_SYNC_AS_STAT); 1308 ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
1236 if (!ret) { 1309 if (!ret) {
1237 info->lo_device = huge_encode_dev(stat.dev); 1310 info->lo_device = huge_encode_dev(stat.dev);
1238 info->lo_inode = stat.ino; 1311 info->lo_inode = stat.ino;
1239 info->lo_rdevice = huge_encode_dev(stat.rdev); 1312 info->lo_rdevice = huge_encode_dev(stat.rdev);
1240 } 1313 }
1241 fput(file); 1314 path_put(&path);
1242 return ret; 1315 return ret;
1243} 1316}
1244 1317
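
loop_get_status() now pins only the backing file's struct path rather than taking a whole file reference before dropping loop_ctl_mutex for the vfs_getattr() call. A minimal sketch of that idiom, assuming the file pointer is stable while the caller's lock is held; stat_backing_file() is an illustrative helper:

#include <linux/fs.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/fcntl.h>

static int stat_backing_file(struct file *file, struct kstat *stat)
{
	struct path path;
	int ret;

	/* take the path reference while still protected by the caller's lock */
	path = file->f_path;
	path_get(&path);

	/* the caller can unlock here; the path keeps dentry and mount alive */
	ret = vfs_getattr(&path, stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
	path_put(&path);
	return ret;
}
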
@@ -1322,10 +1395,8 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
1322 struct loop_info64 info64; 1395 struct loop_info64 info64;
1323 int err; 1396 int err;
1324 1397
1325 if (!arg) { 1398 if (!arg)
1326 mutex_unlock(&lo->lo_ctl_mutex);
1327 return -EINVAL; 1399 return -EINVAL;
1328 }
1329 err = loop_get_status(lo, &info64); 1400 err = loop_get_status(lo, &info64);
1330 if (!err) 1401 if (!err)
1331 err = loop_info64_to_old(&info64, &info); 1402 err = loop_info64_to_old(&info64, &info);
@@ -1340,10 +1411,8 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
1340 struct loop_info64 info64; 1411 struct loop_info64 info64;
1341 int err; 1412 int err;
1342 1413
1343 if (!arg) { 1414 if (!arg)
1344 mutex_unlock(&lo->lo_ctl_mutex);
1345 return -EINVAL; 1415 return -EINVAL;
1346 }
1347 err = loop_get_status(lo, &info64); 1416 err = loop_get_status(lo, &info64);
1348 if (!err && copy_to_user(arg, &info64, sizeof(info64))) 1417 if (!err && copy_to_user(arg, &info64, sizeof(info64)))
1349 err = -EFAULT; 1418 err = -EFAULT;
@@ -1393,70 +1462,73 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
1393 return 0; 1462 return 0;
1394} 1463}
1395 1464
1465static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
1466 unsigned long arg)
1467{
1468 int err;
1469
1470 err = mutex_lock_killable(&loop_ctl_mutex);
1471 if (err)
1472 return err;
1473 switch (cmd) {
1474 case LOOP_SET_CAPACITY:
1475 err = loop_set_capacity(lo);
1476 break;
1477 case LOOP_SET_DIRECT_IO:
1478 err = loop_set_dio(lo, arg);
1479 break;
1480 case LOOP_SET_BLOCK_SIZE:
1481 err = loop_set_block_size(lo, arg);
1482 break;
1483 default:
1484 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
1485 }
1486 mutex_unlock(&loop_ctl_mutex);
1487 return err;
1488}
1489
1396static int lo_ioctl(struct block_device *bdev, fmode_t mode, 1490static int lo_ioctl(struct block_device *bdev, fmode_t mode,
1397 unsigned int cmd, unsigned long arg) 1491 unsigned int cmd, unsigned long arg)
1398{ 1492{
1399 struct loop_device *lo = bdev->bd_disk->private_data; 1493 struct loop_device *lo = bdev->bd_disk->private_data;
1400 int err; 1494 int err;
1401 1495
1402 err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1);
1403 if (err)
1404 goto out_unlocked;
1405
1406 switch (cmd) { 1496 switch (cmd) {
1407 case LOOP_SET_FD: 1497 case LOOP_SET_FD:
1408 err = loop_set_fd(lo, mode, bdev, arg); 1498 return loop_set_fd(lo, mode, bdev, arg);
1409 break;
1410 case LOOP_CHANGE_FD: 1499 case LOOP_CHANGE_FD:
1411 err = loop_change_fd(lo, bdev, arg); 1500 return loop_change_fd(lo, bdev, arg);
1412 break;
1413 case LOOP_CLR_FD: 1501 case LOOP_CLR_FD:
1414 /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ 1502 return loop_clr_fd(lo);
1415 err = loop_clr_fd(lo);
1416 if (!err)
1417 goto out_unlocked;
1418 break;
1419 case LOOP_SET_STATUS: 1503 case LOOP_SET_STATUS:
1420 err = -EPERM; 1504 err = -EPERM;
1421 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) 1505 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
1422 err = loop_set_status_old(lo, 1506 err = loop_set_status_old(lo,
1423 (struct loop_info __user *)arg); 1507 (struct loop_info __user *)arg);
1508 }
1424 break; 1509 break;
1425 case LOOP_GET_STATUS: 1510 case LOOP_GET_STATUS:
1426 err = loop_get_status_old(lo, (struct loop_info __user *) arg); 1511 return loop_get_status_old(lo, (struct loop_info __user *) arg);
1427 /* loop_get_status() unlocks lo_ctl_mutex */
1428 goto out_unlocked;
1429 case LOOP_SET_STATUS64: 1512 case LOOP_SET_STATUS64:
1430 err = -EPERM; 1513 err = -EPERM;
1431 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) 1514 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
1432 err = loop_set_status64(lo, 1515 err = loop_set_status64(lo,
1433 (struct loop_info64 __user *) arg); 1516 (struct loop_info64 __user *) arg);
1517 }
1434 break; 1518 break;
1435 case LOOP_GET_STATUS64: 1519 case LOOP_GET_STATUS64:
1436 err = loop_get_status64(lo, (struct loop_info64 __user *) arg); 1520 return loop_get_status64(lo, (struct loop_info64 __user *) arg);
1437 /* loop_get_status() unlocks lo_ctl_mutex */
1438 goto out_unlocked;
1439 case LOOP_SET_CAPACITY: 1521 case LOOP_SET_CAPACITY:
1440 err = -EPERM;
1441 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
1442 err = loop_set_capacity(lo);
1443 break;
1444 case LOOP_SET_DIRECT_IO: 1522 case LOOP_SET_DIRECT_IO:
1445 err = -EPERM;
1446 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
1447 err = loop_set_dio(lo, arg);
1448 break;
1449 case LOOP_SET_BLOCK_SIZE: 1523 case LOOP_SET_BLOCK_SIZE:
1450 err = -EPERM; 1524 if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
1451 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) 1525 return -EPERM;
1452 err = loop_set_block_size(lo, arg); 1526 /* Fall through */
1453 break;
1454 default: 1527 default:
1455 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; 1528 err = lo_simple_ioctl(lo, cmd, arg);
1529 break;
1456 } 1530 }
1457 mutex_unlock(&lo->lo_ctl_mutex);
1458 1531
1459out_unlocked:
1460 return err; 1532 return err;
1461} 1533}
1462 1534
@@ -1570,10 +1642,8 @@ loop_get_status_compat(struct loop_device *lo,
1570 struct loop_info64 info64; 1642 struct loop_info64 info64;
1571 int err; 1643 int err;
1572 1644
1573 if (!arg) { 1645 if (!arg)
1574 mutex_unlock(&lo->lo_ctl_mutex);
1575 return -EINVAL; 1646 return -EINVAL;
1576 }
1577 err = loop_get_status(lo, &info64); 1647 err = loop_get_status(lo, &info64);
1578 if (!err) 1648 if (!err)
1579 err = loop_info64_to_compat(&info64, arg); 1649 err = loop_info64_to_compat(&info64, arg);
@@ -1588,20 +1658,12 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
1588 1658
1589 switch(cmd) { 1659 switch(cmd) {
1590 case LOOP_SET_STATUS: 1660 case LOOP_SET_STATUS:
1591 err = mutex_lock_killable(&lo->lo_ctl_mutex); 1661 err = loop_set_status_compat(lo,
1592 if (!err) { 1662 (const struct compat_loop_info __user *)arg);
1593 err = loop_set_status_compat(lo,
1594 (const struct compat_loop_info __user *)arg);
1595 mutex_unlock(&lo->lo_ctl_mutex);
1596 }
1597 break; 1663 break;
1598 case LOOP_GET_STATUS: 1664 case LOOP_GET_STATUS:
1599 err = mutex_lock_killable(&lo->lo_ctl_mutex); 1665 err = loop_get_status_compat(lo,
1600 if (!err) { 1666 (struct compat_loop_info __user *)arg);
1601 err = loop_get_status_compat(lo,
1602 (struct compat_loop_info __user *)arg);
1603 /* loop_get_status() unlocks lo_ctl_mutex */
1604 }
1605 break; 1667 break;
1606 case LOOP_SET_CAPACITY: 1668 case LOOP_SET_CAPACITY:
1607 case LOOP_CLR_FD: 1669 case LOOP_CLR_FD:
@@ -1625,9 +1687,11 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
1625static int lo_open(struct block_device *bdev, fmode_t mode) 1687static int lo_open(struct block_device *bdev, fmode_t mode)
1626{ 1688{
1627 struct loop_device *lo; 1689 struct loop_device *lo;
1628 int err = 0; 1690 int err;
1629 1691
1630 mutex_lock(&loop_index_mutex); 1692 err = mutex_lock_killable(&loop_ctl_mutex);
1693 if (err)
1694 return err;
1631 lo = bdev->bd_disk->private_data; 1695 lo = bdev->bd_disk->private_data;
1632 if (!lo) { 1696 if (!lo) {
1633 err = -ENXIO; 1697 err = -ENXIO;
@@ -1636,26 +1700,30 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
1636 1700
1637 atomic_inc(&lo->lo_refcnt); 1701 atomic_inc(&lo->lo_refcnt);
1638out: 1702out:
1639 mutex_unlock(&loop_index_mutex); 1703 mutex_unlock(&loop_ctl_mutex);
1640 return err; 1704 return err;
1641} 1705}
1642 1706
1643static void __lo_release(struct loop_device *lo) 1707static void lo_release(struct gendisk *disk, fmode_t mode)
1644{ 1708{
1645 int err; 1709 struct loop_device *lo;
1646 1710
1711 mutex_lock(&loop_ctl_mutex);
1712 lo = disk->private_data;
1647 if (atomic_dec_return(&lo->lo_refcnt)) 1713 if (atomic_dec_return(&lo->lo_refcnt))
1648 return; 1714 goto out_unlock;
1649 1715
1650 mutex_lock(&lo->lo_ctl_mutex);
1651 if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { 1716 if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
1717 if (lo->lo_state != Lo_bound)
1718 goto out_unlock;
1719 lo->lo_state = Lo_rundown;
1720 mutex_unlock(&loop_ctl_mutex);
1652 /* 1721 /*
1653 * In autoclear mode, stop the loop thread 1722 * In autoclear mode, stop the loop thread
1654 * and remove configuration after last close. 1723 * and remove configuration after last close.
1655 */ 1724 */
1656 err = loop_clr_fd(lo); 1725 __loop_clr_fd(lo, true);
1657 if (!err) 1726 return;
1658 return;
1659 } else if (lo->lo_state == Lo_bound) { 1727 } else if (lo->lo_state == Lo_bound) {
1660 /* 1728 /*
1661 * Otherwise keep thread (if running) and config, 1729 * Otherwise keep thread (if running) and config,
@@ -1665,14 +1733,8 @@ static void __lo_release(struct loop_device *lo)
1665 blk_mq_unfreeze_queue(lo->lo_queue); 1733 blk_mq_unfreeze_queue(lo->lo_queue);
1666 } 1734 }
1667 1735
1668 mutex_unlock(&lo->lo_ctl_mutex); 1736out_unlock:
1669} 1737 mutex_unlock(&loop_ctl_mutex);
1670
1671static void lo_release(struct gendisk *disk, fmode_t mode)
1672{
1673 mutex_lock(&loop_index_mutex);
1674 __lo_release(disk->private_data);
1675 mutex_unlock(&loop_index_mutex);
1676} 1738}
1677 1739
1678static const struct block_device_operations lo_fops = { 1740static const struct block_device_operations lo_fops = {
@@ -1711,10 +1773,10 @@ static int unregister_transfer_cb(int id, void *ptr, void *data)
1711 struct loop_device *lo = ptr; 1773 struct loop_device *lo = ptr;
1712 struct loop_func_table *xfer = data; 1774 struct loop_func_table *xfer = data;
1713 1775
1714 mutex_lock(&lo->lo_ctl_mutex); 1776 mutex_lock(&loop_ctl_mutex);
1715 if (lo->lo_encryption == xfer) 1777 if (lo->lo_encryption == xfer)
1716 loop_release_xfer(lo); 1778 loop_release_xfer(lo);
1717 mutex_unlock(&lo->lo_ctl_mutex); 1779 mutex_unlock(&loop_ctl_mutex);
1718 return 0; 1780 return 0;
1719} 1781}
1720 1782
@@ -1759,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
1759 1821
1760 /* always use the first bio's css */ 1822 /* always use the first bio's css */
1761#ifdef CONFIG_BLK_CGROUP 1823#ifdef CONFIG_BLK_CGROUP
1762 if (cmd->use_aio && rq->bio && rq->bio->bi_css) { 1824 if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
1763 cmd->css = rq->bio->bi_css; 1825 cmd->css = &bio_blkcg(rq->bio)->css;
1764 css_get(cmd->css); 1826 css_get(cmd->css);
1765 } else 1827 } else
1766#endif 1828#endif
@@ -1853,7 +1915,7 @@ static int loop_add(struct loop_device **l, int i)
1853 goto out_free_idr; 1915 goto out_free_idr;
1854 1916
1855 lo->lo_queue = blk_mq_init_queue(&lo->tag_set); 1917 lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
1856 if (IS_ERR_OR_NULL(lo->lo_queue)) { 1918 if (IS_ERR(lo->lo_queue)) {
1857 err = PTR_ERR(lo->lo_queue); 1919 err = PTR_ERR(lo->lo_queue);
1858 goto out_cleanup_tags; 1920 goto out_cleanup_tags;
1859 } 1921 }
@@ -1895,7 +1957,6 @@ static int loop_add(struct loop_device **l, int i)
1895 if (!part_shift) 1957 if (!part_shift)
1896 disk->flags |= GENHD_FL_NO_PART_SCAN; 1958 disk->flags |= GENHD_FL_NO_PART_SCAN;
1897 disk->flags |= GENHD_FL_EXT_DEVT; 1959 disk->flags |= GENHD_FL_EXT_DEVT;
1898 mutex_init(&lo->lo_ctl_mutex);
1899 atomic_set(&lo->lo_refcnt, 0); 1960 atomic_set(&lo->lo_refcnt, 0);
1900 lo->lo_number = i; 1961 lo->lo_number = i;
1901 spin_lock_init(&lo->lo_lock); 1962 spin_lock_init(&lo->lo_lock);
@@ -1974,7 +2035,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1974 struct kobject *kobj; 2035 struct kobject *kobj;
1975 int err; 2036 int err;
1976 2037
1977 mutex_lock(&loop_index_mutex); 2038 mutex_lock(&loop_ctl_mutex);
1978 err = loop_lookup(&lo, MINOR(dev) >> part_shift); 2039 err = loop_lookup(&lo, MINOR(dev) >> part_shift);
1979 if (err < 0) 2040 if (err < 0)
1980 err = loop_add(&lo, MINOR(dev) >> part_shift); 2041 err = loop_add(&lo, MINOR(dev) >> part_shift);
@@ -1982,7 +2043,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1982 kobj = NULL; 2043 kobj = NULL;
1983 else 2044 else
1984 kobj = get_disk_and_module(lo->lo_disk); 2045 kobj = get_disk_and_module(lo->lo_disk);
1985 mutex_unlock(&loop_index_mutex); 2046 mutex_unlock(&loop_ctl_mutex);
1986 2047
1987 *part = 0; 2048 *part = 0;
1988 return kobj; 2049 return kobj;
@@ -1992,9 +2053,13 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
1992 unsigned long parm) 2053 unsigned long parm)
1993{ 2054{
1994 struct loop_device *lo; 2055 struct loop_device *lo;
1995 int ret = -ENOSYS; 2056 int ret;
1996 2057
1997 mutex_lock(&loop_index_mutex); 2058 ret = mutex_lock_killable(&loop_ctl_mutex);
2059 if (ret)
2060 return ret;
2061
2062 ret = -ENOSYS;
1998 switch (cmd) { 2063 switch (cmd) {
1999 case LOOP_CTL_ADD: 2064 case LOOP_CTL_ADD:
2000 ret = loop_lookup(&lo, parm); 2065 ret = loop_lookup(&lo, parm);
@@ -2008,21 +2073,15 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
2008 ret = loop_lookup(&lo, parm); 2073 ret = loop_lookup(&lo, parm);
2009 if (ret < 0) 2074 if (ret < 0)
2010 break; 2075 break;
2011 ret = mutex_lock_killable(&lo->lo_ctl_mutex);
2012 if (ret)
2013 break;
2014 if (lo->lo_state != Lo_unbound) { 2076 if (lo->lo_state != Lo_unbound) {
2015 ret = -EBUSY; 2077 ret = -EBUSY;
2016 mutex_unlock(&lo->lo_ctl_mutex);
2017 break; 2078 break;
2018 } 2079 }
2019 if (atomic_read(&lo->lo_refcnt) > 0) { 2080 if (atomic_read(&lo->lo_refcnt) > 0) {
2020 ret = -EBUSY; 2081 ret = -EBUSY;
2021 mutex_unlock(&lo->lo_ctl_mutex);
2022 break; 2082 break;
2023 } 2083 }
2024 lo->lo_disk->private_data = NULL; 2084 lo->lo_disk->private_data = NULL;
2025 mutex_unlock(&lo->lo_ctl_mutex);
2026 idr_remove(&loop_index_idr, lo->lo_number); 2085 idr_remove(&loop_index_idr, lo->lo_number);
2027 loop_remove(lo); 2086 loop_remove(lo);
2028 break; 2087 break;
@@ -2032,7 +2091,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
2032 break; 2091 break;
2033 ret = loop_add(&lo, -1); 2092 ret = loop_add(&lo, -1);
2034 } 2093 }
2035 mutex_unlock(&loop_index_mutex); 2094 mutex_unlock(&loop_ctl_mutex);
2036 2095
2037 return ret; 2096 return ret;
2038} 2097}
@@ -2116,10 +2175,10 @@ static int __init loop_init(void)
2116 THIS_MODULE, loop_probe, NULL, NULL); 2175 THIS_MODULE, loop_probe, NULL, NULL);
2117 2176
2118 /* pre-create number of devices given by config or max_loop */ 2177 /* pre-create number of devices given by config or max_loop */
2119 mutex_lock(&loop_index_mutex); 2178 mutex_lock(&loop_ctl_mutex);
2120 for (i = 0; i < nr; i++) 2179 for (i = 0; i < nr; i++)
2121 loop_add(&lo, i); 2180 loop_add(&lo, i);
2122 mutex_unlock(&loop_index_mutex); 2181 mutex_unlock(&loop_ctl_mutex);
2123 2182
2124 printk(KERN_INFO "loop: module loaded\n"); 2183 printk(KERN_INFO "loop: module loaded\n");
2125 return 0; 2184 return 0;
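
One more loop.c detail worth noting: with the ref-counted blkcg series in this pull, a bio's cgroup association lives in bio->bi_blkg, so the AIO path reaches the css through bio_blkcg() instead of the removed bio->bi_css. A hedged sketch of taking that reference for deferred work; get_bio_css() is an illustrative helper, not a kernel function:

#include <linux/bio.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>

static struct cgroup_subsys_state *get_bio_css(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
	if (bio && bio->bi_blkg) {
		struct cgroup_subsys_state *css = &bio_blkcg(bio)->css;

		css_get(css);	/* dropped later with css_put() */
		return css;
	}
#endif
	return NULL;
}
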
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 4d42c7af7de7..af75a5ee4094 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -54,7 +54,6 @@ struct loop_device {
54 54
55 spinlock_t lo_lock; 55 spinlock_t lo_lock;
56 int lo_state; 56 int lo_state;
57 struct mutex lo_ctl_mutex;
58 struct kthread_worker worker; 57 struct kthread_worker worker;
59 struct task_struct *worker_task; 58 struct task_struct *worker_task;
60 bool use_dio; 59 bool use_dio;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index a7daa8acbab3..88e8440e75c3 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -168,41 +168,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
168 return false; /* device present */ 168 return false; /* device present */
169} 169}
170 170
171/* we have to use runtime tag to setup command header */
172static void mtip_init_cmd_header(struct request *rq)
173{
174 struct driver_data *dd = rq->q->queuedata;
175 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
176
177 /* Point the command headers at the command tables. */
178 cmd->command_header = dd->port->command_list +
179 (sizeof(struct mtip_cmd_hdr) * rq->tag);
180 cmd->command_header_dma = dd->port->command_list_dma +
181 (sizeof(struct mtip_cmd_hdr) * rq->tag);
182
183 if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
184 cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
185
186 cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
187}
188
189static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
190{
191 struct request *rq;
192
193 if (mtip_check_surprise_removal(dd->pdev))
194 return NULL;
195
196 rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
197 if (IS_ERR(rq))
198 return NULL;
199
200 /* Internal cmd isn't submitted via .queue_rq */
201 mtip_init_cmd_header(rq);
202
203 return blk_mq_rq_to_pdu(rq);
204}
205
206static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, 171static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
207 unsigned int tag) 172 unsigned int tag)
208{ 173{
@@ -1023,13 +988,14 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1023 return -EFAULT; 988 return -EFAULT;
1024 } 989 }
1025 990
1026 int_cmd = mtip_get_int_command(dd); 991 if (mtip_check_surprise_removal(dd->pdev))
1027 if (!int_cmd) { 992 return -EFAULT;
993
994 rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
995 if (IS_ERR(rq)) {
1028 dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n"); 996 dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n");
1029 return -EFAULT; 997 return -EFAULT;
1030 } 998 }
1031 rq = blk_mq_rq_from_pdu(int_cmd);
1032 rq->special = &icmd;
1033 999
1034 set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); 1000 set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
1035 1001
@@ -1050,6 +1016,8 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1050 } 1016 }
1051 1017
1052 /* Copy the command to the command table */ 1018 /* Copy the command to the command table */
1019 int_cmd = blk_mq_rq_to_pdu(rq);
1020 int_cmd->icmd = &icmd;
1053 memcpy(int_cmd->command, fis, fis_len*4); 1021 memcpy(int_cmd->command, fis, fis_len*4);
1054 1022
1055 rq->timeout = timeout; 1023 rq->timeout = timeout;
@@ -1423,23 +1391,19 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
1423 * @dd pointer to driver_data structure 1391 * @dd pointer to driver_data structure
1424 * @lba starting lba 1392 * @lba starting lba
1425 * @len # of 512b sectors to trim 1393 * @len # of 512b sectors to trim
1426 *
1427 * return value
1428 * -ENOMEM Out of dma memory
1429 * -EINVAL Invalid parameters passed in, trim not supported
1430 * -EIO Error submitting trim request to hw
1431 */ 1394 */
1432static int mtip_send_trim(struct driver_data *dd, unsigned int lba, 1395static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
1433 unsigned int len) 1396 unsigned int len)
1434{ 1397{
1435 int i, rv = 0;
1436 u64 tlba, tlen, sect_left; 1398 u64 tlba, tlen, sect_left;
1437 struct mtip_trim_entry *buf; 1399 struct mtip_trim_entry *buf;
1438 dma_addr_t dma_addr; 1400 dma_addr_t dma_addr;
1439 struct host_to_dev_fis fis; 1401 struct host_to_dev_fis fis;
1402 blk_status_t ret = BLK_STS_OK;
1403 int i;
1440 1404
1441 if (!len || dd->trim_supp == false) 1405 if (!len || dd->trim_supp == false)
1442 return -EINVAL; 1406 return BLK_STS_IOERR;
1443 1407
1444 /* Trim request too big */ 1408 /* Trim request too big */
1445 WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES)); 1409 WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES));
@@ -1454,7 +1418,7 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
1454 buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, 1418 buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
1455 GFP_KERNEL); 1419 GFP_KERNEL);
1456 if (!buf) 1420 if (!buf)
1457 return -ENOMEM; 1421 return BLK_STS_RESOURCE;
1458 memset(buf, 0, ATA_SECT_SIZE); 1422 memset(buf, 0, ATA_SECT_SIZE);
1459 1423
1460 for (i = 0, sect_left = len, tlba = lba; 1424 for (i = 0, sect_left = len, tlba = lba;
@@ -1463,8 +1427,8 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
1463 tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ? 1427 tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ?
1464 MTIP_MAX_TRIM_ENTRY_LEN : 1428 MTIP_MAX_TRIM_ENTRY_LEN :
1465 sect_left); 1429 sect_left);
1466 buf[i].lba = __force_bit2int cpu_to_le32(tlba); 1430 buf[i].lba = cpu_to_le32(tlba);
1467 buf[i].range = __force_bit2int cpu_to_le16(tlen); 1431 buf[i].range = cpu_to_le16(tlen);
1468 tlba += tlen; 1432 tlba += tlen;
1469 sect_left -= tlen; 1433 sect_left -= tlen;
1470 } 1434 }
@@ -1486,10 +1450,10 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
1486 ATA_SECT_SIZE, 1450 ATA_SECT_SIZE,
1487 0, 1451 0,
1488 MTIP_TRIM_TIMEOUT_MS) < 0) 1452 MTIP_TRIM_TIMEOUT_MS) < 0)
1489 rv = -EIO; 1453 ret = BLK_STS_IOERR;
1490 1454
1491 dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); 1455 dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
1492 return rv; 1456 return ret;
1493} 1457}
1494 1458
1495/* 1459/*
@@ -1585,23 +1549,20 @@ static inline void fill_command_sg(struct driver_data *dd,
1585 int n; 1549 int n;
1586 unsigned int dma_len; 1550 unsigned int dma_len;
1587 struct mtip_cmd_sg *command_sg; 1551 struct mtip_cmd_sg *command_sg;
1588 struct scatterlist *sg = command->sg; 1552 struct scatterlist *sg;
1589 1553
1590 command_sg = command->command + AHCI_CMD_TBL_HDR_SZ; 1554 command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
1591 1555
1592 for (n = 0; n < nents; n++) { 1556 for_each_sg(command->sg, sg, nents, n) {
1593 dma_len = sg_dma_len(sg); 1557 dma_len = sg_dma_len(sg);
1594 if (dma_len > 0x400000) 1558 if (dma_len > 0x400000)
1595 dev_err(&dd->pdev->dev, 1559 dev_err(&dd->pdev->dev,
1596 "DMA segment length truncated\n"); 1560 "DMA segment length truncated\n");
1597 command_sg->info = __force_bit2int 1561 command_sg->info = cpu_to_le32((dma_len-1) & 0x3FFFFF);
1598 cpu_to_le32((dma_len-1) & 0x3FFFFF); 1562 command_sg->dba = cpu_to_le32(sg_dma_address(sg));
1599 command_sg->dba = __force_bit2int 1563 command_sg->dba_upper =
1600 cpu_to_le32(sg_dma_address(sg));
1601 command_sg->dba_upper = __force_bit2int
1602 cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); 1564 cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
1603 command_sg++; 1565 command_sg++;
1604 sg++;
1605 } 1566 }
1606} 1567}
1607 1568
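
fill_command_sg() drops the bare sg++ walk in favour of for_each_sg(), which follows chained scatterlists correctly. A small sketch of the iterator over an already DMA-mapped list; walk_mapped_sgl() is illustrative:

#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/printk.h>

static void walk_mapped_sgl(struct scatterlist *sgl, unsigned int nents)
{
	struct scatterlist *sg;
	unsigned int i;

	/* safe across chained scatterlists, unlike incrementing sg directly */
	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t dba = sg_dma_address(sg);
		unsigned int len = sg_dma_len(sg);

		pr_debug("segment %u: %pad + %u bytes\n", i, &dba, len);
	}
}
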
@@ -2171,7 +2132,6 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
2171 * @dd Pointer to the driver data structure. 2132 * @dd Pointer to the driver data structure.
2172 * @start First sector to read. 2133 * @start First sector to read.
2173 * @nsect Number of sectors to read. 2134 * @nsect Number of sectors to read.
2174 * @nents Number of entries in scatter list for the read command.
2175 * @tag The tag of this read command. 2135 * @tag The tag of this read command.
2176 * @callback Pointer to the function that should be called 2136 * @callback Pointer to the function that should be called
2177 * when the read completes. 2137 * when the read completes.
@@ -2183,16 +2143,20 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
2183 * None 2143 * None
2184 */ 2144 */
2185static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, 2145static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
2186 struct mtip_cmd *command, int nents, 2146 struct mtip_cmd *command,
2187 struct blk_mq_hw_ctx *hctx) 2147 struct blk_mq_hw_ctx *hctx)
2188{ 2148{
2149 struct mtip_cmd_hdr *hdr =
2150 dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
2189 struct host_to_dev_fis *fis; 2151 struct host_to_dev_fis *fis;
2190 struct mtip_port *port = dd->port; 2152 struct mtip_port *port = dd->port;
2191 int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; 2153 int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2192 u64 start = blk_rq_pos(rq); 2154 u64 start = blk_rq_pos(rq);
2193 unsigned int nsect = blk_rq_sectors(rq); 2155 unsigned int nsect = blk_rq_sectors(rq);
2156 unsigned int nents;
2194 2157
2195 /* Map the scatter list for DMA access */ 2158 /* Map the scatter list for DMA access */
2159 nents = blk_rq_map_sg(hctx->queue, rq, command->sg);
2196 nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); 2160 nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
2197 2161
2198 prefetch(&port->flags); 2162 prefetch(&port->flags);
@@ -2233,10 +2197,11 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
2233 fis->device |= 1 << 7; 2197 fis->device |= 1 << 7;
2234 2198
2235 /* Populate the command header */ 2199 /* Populate the command header */
2236 command->command_header->opts = 2200 hdr->ctba = cpu_to_le32(command->command_dma & 0xFFFFFFFF);
2237 __force_bit2int cpu_to_le32( 2201 if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
2238 (nents << 16) | 5 | AHCI_CMD_PREFETCH); 2202 hdr->ctbau = cpu_to_le32((command->command_dma >> 16) >> 16);
2239 command->command_header->byte_count = 0; 2203 hdr->opts = cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH);
2204 hdr->byte_count = 0;
2240 2205
2241 command->direction = dma_dir; 2206 command->direction = dma_dir;
2242 2207
@@ -2715,12 +2680,12 @@ static void mtip_softirq_done_fn(struct request *rq)
2715 cmd->direction); 2680 cmd->direction);
2716 2681
2717 if (unlikely(cmd->unaligned)) 2682 if (unlikely(cmd->unaligned))
2718 up(&dd->port->cmd_slot_unal); 2683 atomic_inc(&dd->port->cmd_slot_unal);
2719 2684
2720 blk_mq_end_request(rq, cmd->status); 2685 blk_mq_end_request(rq, cmd->status);
2721} 2686}
2722 2687
2723static void mtip_abort_cmd(struct request *req, void *data, bool reserved) 2688static bool mtip_abort_cmd(struct request *req, void *data, bool reserved)
2724{ 2689{
2725 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); 2690 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
2726 struct driver_data *dd = data; 2691 struct driver_data *dd = data;
@@ -2730,14 +2695,16 @@ static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
2730 clear_bit(req->tag, dd->port->cmds_to_issue); 2695 clear_bit(req->tag, dd->port->cmds_to_issue);
2731 cmd->status = BLK_STS_IOERR; 2696 cmd->status = BLK_STS_IOERR;
2732 mtip_softirq_done_fn(req); 2697 mtip_softirq_done_fn(req);
2698 return true;
2733} 2699}
2734 2700
2735static void mtip_queue_cmd(struct request *req, void *data, bool reserved) 2701static bool mtip_queue_cmd(struct request *req, void *data, bool reserved)
2736{ 2702{
2737 struct driver_data *dd = data; 2703 struct driver_data *dd = data;
2738 2704
2739 set_bit(req->tag, dd->port->cmds_to_issue); 2705 set_bit(req->tag, dd->port->cmds_to_issue);
2740 blk_abort_request(req); 2706 blk_abort_request(req);
2707 return true;
2741} 2708}
2742 2709
2743/* 2710/*
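
mtip_abort_cmd() and mtip_queue_cmd() now return bool because blk_mq_tagset_busy_iter() callbacks gained the ability to stop the walk early; returning true means "keep iterating". A sketch of such a callback that fails every in-flight request, assuming a driver PDU with a status field (struct my_cmd is illustrative):

#include <linux/blk-mq.h>

struct my_cmd {
	blk_status_t status;
};

/* busy_tag_iter_fn: return true to continue the walk, false to stop it */
static bool my_fail_inflight(struct request *rq, void *data, bool reserved)
{
	struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->status = BLK_STS_IOERR;
	blk_mq_complete_request(rq);
	return true;
}

static void my_fail_all(struct blk_mq_tag_set *set)
{
	blk_mq_tagset_busy_iter(set, my_fail_inflight, NULL);
}
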
@@ -2803,10 +2770,7 @@ restart_eh:
2803 2770
2804 blk_mq_quiesce_queue(dd->queue); 2771 blk_mq_quiesce_queue(dd->queue);
2805 2772
2806 spin_lock(dd->queue->queue_lock); 2773 blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd);
2807 blk_mq_tagset_busy_iter(&dd->tags,
2808 mtip_queue_cmd, dd);
2809 spin_unlock(dd->queue->queue_lock);
2810 2774
2811 set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags); 2775 set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags);
2812 2776
@@ -3026,7 +2990,7 @@ static int mtip_hw_init(struct driver_data *dd)
3026 else 2990 else
3027 dd->unal_qdepth = 0; 2991 dd->unal_qdepth = 0;
3028 2992
3029 sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); 2993 atomic_set(&dd->port->cmd_slot_unal, dd->unal_qdepth);
3030 2994
3031 /* Spinlock to prevent concurrent issue */ 2995 /* Spinlock to prevent concurrent issue */
3032 for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) 2996 for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
@@ -3531,58 +3495,24 @@ static inline bool is_se_active(struct driver_data *dd)
3531 return false; 3495 return false;
3532} 3496}
3533 3497
3534/* 3498static inline bool is_stopped(struct driver_data *dd, struct request *rq)
3535 * Block layer make request function.
3536 *
3537 * This function is called by the kernel to process a BIO for
3538 * the P320 device.
3539 *
3540 * @queue Pointer to the request queue. Unused other than to obtain
3541 * the driver data structure.
3542 * @rq Pointer to the request.
3543 *
3544 */
3545static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
3546{ 3499{
3547 struct driver_data *dd = hctx->queue->queuedata; 3500 if (likely(!(dd->dd_flag & MTIP_DDF_STOP_IO)))
3548 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); 3501 return false;
3549 unsigned int nents;
3550
3551 if (is_se_active(dd))
3552 return -ENODATA;
3553
3554 if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
3555 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
3556 &dd->dd_flag))) {
3557 return -ENXIO;
3558 }
3559 if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
3560 return -ENODATA;
3561 }
3562 if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
3563 &dd->dd_flag) &&
3564 rq_data_dir(rq))) {
3565 return -ENODATA;
3566 }
3567 if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag) ||
3568 test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)))
3569 return -ENODATA;
3570 }
3571
3572 if (req_op(rq) == REQ_OP_DISCARD) {
3573 int err;
3574
3575 err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
3576 blk_mq_end_request(rq, err ? BLK_STS_IOERR : BLK_STS_OK);
3577 return 0;
3578 }
3579 3502
3580 /* Create the scatter list for this request. */ 3503 if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
3581 nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg); 3504 return true;
3505 if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
3506 return true;
3507 if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) &&
3508 rq_data_dir(rq))
3509 return true;
3510 if (test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
3511 return true;
3512 if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
3513 return true;
3582 3514
3583 /* Issue the read/write. */ 3515 return false;
3584 mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
3585 return 0;
3586} 3516}
3587 3517
3588static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, 3518static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
@@ -3603,7 +3533,7 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
3603 cmd->unaligned = 1; 3533 cmd->unaligned = 1;
3604 } 3534 }
3605 3535
3606 if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal)) 3536 if (cmd->unaligned && atomic_dec_if_positive(&dd->port->cmd_slot_unal) >= 0)
3607 return true; 3537 return true;
3608 3538
3609 return false; 3539 return false;
@@ -3613,32 +3543,33 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
3613 struct request *rq) 3543 struct request *rq)
3614{ 3544{
3615 struct driver_data *dd = hctx->queue->queuedata; 3545 struct driver_data *dd = hctx->queue->queuedata;
3616 struct mtip_int_cmd *icmd = rq->special;
3617 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); 3546 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
3547 struct mtip_int_cmd *icmd = cmd->icmd;
3548 struct mtip_cmd_hdr *hdr =
3549 dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
3618 struct mtip_cmd_sg *command_sg; 3550 struct mtip_cmd_sg *command_sg;
3619 3551
3620 if (mtip_commands_active(dd->port)) 3552 if (mtip_commands_active(dd->port))
3621 return BLK_STS_RESOURCE; 3553 return BLK_STS_DEV_RESOURCE;
3622 3554
3555 hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
3556 if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
3557 hdr->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16);
3623 /* Populate the SG list */ 3558 /* Populate the SG list */
3624 cmd->command_header->opts = 3559 hdr->opts = cpu_to_le32(icmd->opts | icmd->fis_len);
3625 __force_bit2int cpu_to_le32(icmd->opts | icmd->fis_len);
3626 if (icmd->buf_len) { 3560 if (icmd->buf_len) {
3627 command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ; 3561 command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ;
3628 3562
3629 command_sg->info = 3563 command_sg->info = cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF);
3630 __force_bit2int cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF); 3564 command_sg->dba = cpu_to_le32(icmd->buffer & 0xFFFFFFFF);
3631 command_sg->dba =
3632 __force_bit2int cpu_to_le32(icmd->buffer & 0xFFFFFFFF);
3633 command_sg->dba_upper = 3565 command_sg->dba_upper =
3634 __force_bit2int cpu_to_le32((icmd->buffer >> 16) >> 16); 3566 cpu_to_le32((icmd->buffer >> 16) >> 16);
3635 3567
3636 cmd->command_header->opts |= 3568 hdr->opts |= cpu_to_le32((1 << 16));
3637 __force_bit2int cpu_to_le32((1 << 16));
3638 } 3569 }
3639 3570
3640 /* Populate the command header */ 3571 /* Populate the command header */
3641 cmd->command_header->byte_count = 0; 3572 hdr->byte_count = 0;
3642 3573
3643 blk_mq_start_request(rq); 3574 blk_mq_start_request(rq);
3644 mtip_issue_non_ncq_command(dd->port, rq->tag); 3575 mtip_issue_non_ncq_command(dd->port, rq->tag);
@@ -3648,23 +3579,25 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
3648static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx, 3579static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
3649 const struct blk_mq_queue_data *bd) 3580 const struct blk_mq_queue_data *bd)
3650{ 3581{
3582 struct driver_data *dd = hctx->queue->queuedata;
3651 struct request *rq = bd->rq; 3583 struct request *rq = bd->rq;
3652 int ret; 3584 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
3653
3654 mtip_init_cmd_header(rq);
3655 3585
3656 if (blk_rq_is_passthrough(rq)) 3586 if (blk_rq_is_passthrough(rq))
3657 return mtip_issue_reserved_cmd(hctx, rq); 3587 return mtip_issue_reserved_cmd(hctx, rq);
3658 3588
3659 if (unlikely(mtip_check_unal_depth(hctx, rq))) 3589 if (unlikely(mtip_check_unal_depth(hctx, rq)))
3660 return BLK_STS_RESOURCE; 3590 return BLK_STS_DEV_RESOURCE;
3591
3592 if (is_se_active(dd) || is_stopped(dd, rq))
3593 return BLK_STS_IOERR;
3661 3594
3662 blk_mq_start_request(rq); 3595 blk_mq_start_request(rq);
3663 3596
3664 ret = mtip_submit_request(hctx, rq); 3597 if (req_op(rq) == REQ_OP_DISCARD)
3665 if (likely(!ret)) 3598 return mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
3666 return BLK_STS_OK; 3599 mtip_hw_submit_io(dd, rq, cmd, hctx);
3667 return BLK_STS_IOERR; 3600 return BLK_STS_OK;
3668} 3601}
3669 3602
3670static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, 3603static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq,
@@ -3920,12 +3853,13 @@ protocol_init_error:
3920 return rv; 3853 return rv;
3921} 3854}
3922 3855
3923static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) 3856static bool mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
3924{ 3857{
3925 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); 3858 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
3926 3859
3927 cmd->status = BLK_STS_IOERR; 3860 cmd->status = BLK_STS_IOERR;
3928 blk_mq_complete_request(rq); 3861 blk_mq_complete_request(rq);
3862 return true;
3929} 3863}
3930 3864
3931/* 3865/*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index e20e55dab443..abce25f27f57 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -126,8 +126,6 @@
126 126
127#define MTIP_DFS_MAX_BUF_SIZE 1024 127#define MTIP_DFS_MAX_BUF_SIZE 1024
128 128
129#define __force_bit2int (unsigned int __force)
130
131enum { 129enum {
132 /* below are bit numbers in 'flags' defined in mtip_port */ 130 /* below are bit numbers in 'flags' defined in mtip_port */
133 MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */ 131 MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */
@@ -174,10 +172,10 @@ enum {
174 172
175struct smart_attr { 173struct smart_attr {
176 u8 attr_id; 174 u8 attr_id;
177 u16 flags; 175 __le16 flags;
178 u8 cur; 176 u8 cur;
179 u8 worst; 177 u8 worst;
180 u32 data; 178 __le32 data;
181 u8 res[3]; 179 u8 res[3];
182} __packed; 180} __packed;
183 181
@@ -200,9 +198,9 @@ struct mtip_work {
200#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8 198#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8
201 199
202struct mtip_trim_entry { 200struct mtip_trim_entry {
203 u32 lba; /* starting lba of region */ 201 __le32 lba; /* starting lba of region */
204 u16 rsvd; /* unused */ 202 __le16 rsvd; /* unused */
205 u16 range; /* # of 512b blocks to trim */ 203 __le16 range; /* # of 512b blocks to trim */
206} __packed; 204} __packed;
207 205
208struct mtip_trim { 206struct mtip_trim {
@@ -278,24 +276,24 @@ struct mtip_cmd_hdr {
278 * - Bit 5 Unused in this implementation. 276 * - Bit 5 Unused in this implementation.
279 * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes). 277 * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
280 */ 278 */
281 unsigned int opts; 279 __le32 opts;
282	/* This field is unused when using NCQ. */ 280
283 union { 281 union {
284 unsigned int byte_count; 282 __le32 byte_count;
285 unsigned int status; 283 __le32 status;
286 }; 284 };
287 /* 285 /*
288 * Lower 32 bits of the command table address associated with this 286 * Lower 32 bits of the command table address associated with this
289 * header. The command table addresses must be 128 byte aligned. 287 * header. The command table addresses must be 128 byte aligned.
290 */ 288 */
291 unsigned int ctba; 289 __le32 ctba;
292 /* 290 /*
293 * If 64 bit addressing is used this field is the upper 32 bits 291 * If 64 bit addressing is used this field is the upper 32 bits
294 * of the command table address associated with this command. 292 * of the command table address associated with this command.
295 */ 293 */
296 unsigned int ctbau; 294 __le32 ctbau;
297 /* Reserved and unused. */ 295 /* Reserved and unused. */
298 unsigned int res[4]; 296 u32 res[4];
299}; 297};
300 298
301/* Command scatter gather structure (PRD). */ 299/* Command scatter gather structure (PRD). */
@@ -305,31 +303,28 @@ struct mtip_cmd_sg {
305 * address must be 8 byte aligned signified by bits 2:0 being 303 * address must be 8 byte aligned signified by bits 2:0 being
306 * set to 0. 304 * set to 0.
307 */ 305 */
308 unsigned int dba; 306 __le32 dba;
309 /* 307 /*
310 * When 64 bit addressing is used this field is the upper 308 * When 64 bit addressing is used this field is the upper
311 * 32 bits of the data buffer address. 309 * 32 bits of the data buffer address.
312 */ 310 */
313 unsigned int dba_upper; 311 __le32 dba_upper;
314 /* Unused. */ 312 /* Unused. */
315 unsigned int reserved; 313 __le32 reserved;
316 /* 314 /*
317 * Bit 31: interrupt when this data block has been transferred. 315 * Bit 31: interrupt when this data block has been transferred.
318 * Bits 30..22: reserved 316 * Bits 30..22: reserved
319 * Bits 21..0: byte count (minus 1). For P320 the byte count must be 317 * Bits 21..0: byte count (minus 1). For P320 the byte count must be
320 * 8 byte aligned signified by bits 2:0 being set to 1. 318 * 8 byte aligned signified by bits 2:0 being set to 1.
321 */ 319 */
322 unsigned int info; 320 __le32 info;
323}; 321};
324struct mtip_port; 322struct mtip_port;
325 323
324struct mtip_int_cmd;
325
326/* Structure used to describe a command. */ 326/* Structure used to describe a command. */
327struct mtip_cmd { 327struct mtip_cmd {
328
329 struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
330
331 dma_addr_t command_header_dma; /* corresponding physical address */
332
333 void *command; /* ptr to command table entry */ 328 void *command; /* ptr to command table entry */
334 329
335 dma_addr_t command_dma; /* corresponding physical address */ 330 dma_addr_t command_dma; /* corresponding physical address */
@@ -338,7 +333,10 @@ struct mtip_cmd {
338 333
339 int unaligned; /* command is unaligned on 4k boundary */ 334 int unaligned; /* command is unaligned on 4k boundary */
340 335
341 struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ 336 union {
337 struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
338 struct mtip_int_cmd *icmd;
339 };
342 340
343 int retries; /* The number of retries left for this command. */ 341 int retries; /* The number of retries left for this command. */
344 342
@@ -435,8 +433,8 @@ struct mtip_port {
435 */ 433 */
436 unsigned long ic_pause_timer; 434 unsigned long ic_pause_timer;
437 435
438 /* Semaphore to control queue depth of unaligned IOs */ 436 /* Counter to control queue depth of unaligned IOs */
439 struct semaphore cmd_slot_unal; 437 atomic_t cmd_slot_unal;
440 438
441 /* Spinlock for working around command-issue bug. */ 439 /* Spinlock for working around command-issue bug. */
442 spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; 440 spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
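The union of sg[] and icmd in struct mtip_cmd works because blk-mq already allocates a driver-private payload behind every request (tag_set.cmd_size bytes), reachable with blk_mq_rq_to_pdu(); internal commands no longer need rq->special. A minimal sketch of that pattern, using an invented driver "foo" rather than code from this patch:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/scatterlist.h>

struct foo_internal_cmd {
	u32 opcode;			/* driver-defined admin command */
};

/* per-request payload, sized via tag_set.cmd_size = sizeof(struct foo_cmd) */
struct foo_cmd {
	union {
		struct scatterlist sg[16];	/* normal read/write I/O */
		struct foo_internal_cmd *icmd;	/* passthrough commands */
	};
	blk_status_t status;
};

static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);	/* no rq->special */

	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq) && !cmd->icmd)
		return BLK_STS_IOERR;

	/* a real driver would build the command and hand it to hardware */
	cmd->status = BLK_STS_OK;
	blk_mq_end_request(rq, cmd->status);
	return BLK_STS_OK;
}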
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4d4d6129ff66..08696f5f00bb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -734,12 +734,13 @@ static void recv_work(struct work_struct *work)
734 kfree(args); 734 kfree(args);
735} 735}
736 736
737static void nbd_clear_req(struct request *req, void *data, bool reserved) 737static bool nbd_clear_req(struct request *req, void *data, bool reserved)
738{ 738{
739 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 739 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
740 740
741 cmd->status = BLK_STS_IOERR; 741 cmd->status = BLK_STS_IOERR;
742 blk_mq_complete_request(req); 742 blk_mq_complete_request(req);
743 return true;
743} 744}
744 745
745static void nbd_clear_que(struct nbd_device *nbd) 746static void nbd_clear_que(struct nbd_device *nbd)
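The bool return type is the new contract for blk_mq_tagset_busy_iter() callbacks: returning true continues the walk over busy tags, returning false stops it early. A hypothetical in-flight counter in that shape (illustrative only, not taken from the patch); note that the counter value itself is incremented, not the pointer:

#include <linux/blk-mq.h>

static bool foo_count_inflight(struct request *rq, void *data, bool reserved)
{
	unsigned int *count = data;

	(*count)++;
	return true;		/* keep iterating over busy requests */
}

static unsigned int foo_inflight(struct blk_mq_tag_set *set)
{
	unsigned int count = 0;

	blk_mq_tagset_busy_iter(set, foo_count_inflight, &count);
	return count;
}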
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 7685df43f1ef..b3df2793e7cd 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -49,6 +49,7 @@ struct nullb_device {
49 unsigned long completion_nsec; /* time in ns to complete a request */ 49 unsigned long completion_nsec; /* time in ns to complete a request */
50 unsigned long cache_size; /* disk cache size in MB */ 50 unsigned long cache_size; /* disk cache size in MB */
51 unsigned long zone_size; /* zone size in MB if device is zoned */ 51 unsigned long zone_size; /* zone size in MB if device is zoned */
52 unsigned int zone_nr_conv; /* number of conventional zones */
52 unsigned int submit_queues; /* number of submission queues */ 53 unsigned int submit_queues; /* number of submission queues */
53 unsigned int home_node; /* home node for the device */ 54 unsigned int home_node; /* home node for the device */
54 unsigned int queue_mode; /* block interface */ 55 unsigned int queue_mode; /* block interface */
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 09339203dfba..62c9654b9ce8 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -188,6 +188,10 @@ static unsigned long g_zone_size = 256;
188module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); 188module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
189MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); 189MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
190 190
191static unsigned int g_zone_nr_conv;
192module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
193MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
194
191static struct nullb_device *null_alloc_dev(void); 195static struct nullb_device *null_alloc_dev(void);
192static void null_free_dev(struct nullb_device *dev); 196static void null_free_dev(struct nullb_device *dev);
193static void null_del_dev(struct nullb *nullb); 197static void null_del_dev(struct nullb *nullb);
@@ -293,6 +297,7 @@ NULLB_DEVICE_ATTR(mbps, uint);
293NULLB_DEVICE_ATTR(cache_size, ulong); 297NULLB_DEVICE_ATTR(cache_size, ulong);
294NULLB_DEVICE_ATTR(zoned, bool); 298NULLB_DEVICE_ATTR(zoned, bool);
295NULLB_DEVICE_ATTR(zone_size, ulong); 299NULLB_DEVICE_ATTR(zone_size, ulong);
300NULLB_DEVICE_ATTR(zone_nr_conv, uint);
296 301
297static ssize_t nullb_device_power_show(struct config_item *item, char *page) 302static ssize_t nullb_device_power_show(struct config_item *item, char *page)
298{ 303{
@@ -407,6 +412,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
407 &nullb_device_attr_badblocks, 412 &nullb_device_attr_badblocks,
408 &nullb_device_attr_zoned, 413 &nullb_device_attr_zoned,
409 &nullb_device_attr_zone_size, 414 &nullb_device_attr_zone_size,
415 &nullb_device_attr_zone_nr_conv,
410 NULL, 416 NULL,
411}; 417};
412 418
@@ -520,6 +526,7 @@ static struct nullb_device *null_alloc_dev(void)
520 dev->use_per_node_hctx = g_use_per_node_hctx; 526 dev->use_per_node_hctx = g_use_per_node_hctx;
521 dev->zoned = g_zoned; 527 dev->zoned = g_zoned;
522 dev->zone_size = g_zone_size; 528 dev->zone_size = g_zone_size;
529 dev->zone_nr_conv = g_zone_nr_conv;
523 return dev; 530 return dev;
524} 531}
525 532
@@ -635,14 +642,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
635 hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); 642 hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
636} 643}
637 644
638static void null_softirq_done_fn(struct request *rq) 645static void null_complete_rq(struct request *rq)
639{ 646{
640 struct nullb *nullb = rq->q->queuedata; 647 end_cmd(blk_mq_rq_to_pdu(rq));
641
642 if (nullb->dev->queue_mode == NULL_Q_MQ)
643 end_cmd(blk_mq_rq_to_pdu(rq));
644 else
645 end_cmd(rq->special);
646} 648}
647 649
648static struct nullb_page *null_alloc_page(gfp_t gfp_flags) 650static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
@@ -1350,7 +1352,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
1350 1352
1351static const struct blk_mq_ops null_mq_ops = { 1353static const struct blk_mq_ops null_mq_ops = {
1352 .queue_rq = null_queue_rq, 1354 .queue_rq = null_queue_rq,
1353 .complete = null_softirq_done_fn, 1355 .complete = null_complete_rq,
1354 .timeout = null_timeout_rq, 1356 .timeout = null_timeout_rq,
1355}; 1357};
1356 1358
@@ -1657,8 +1659,7 @@ static int null_add_dev(struct nullb_device *dev)
1657 } 1659 }
1658 null_init_queues(nullb); 1660 null_init_queues(nullb);
1659 } else if (dev->queue_mode == NULL_Q_BIO) { 1661 } else if (dev->queue_mode == NULL_Q_BIO) {
1660 nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node, 1662 nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node);
1661 NULL);
1662 if (!nullb->q) { 1663 if (!nullb->q) {
1663 rv = -ENOMEM; 1664 rv = -ENOMEM;
1664 goto out_cleanup_queues; 1665 goto out_cleanup_queues;
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index c0b0e4a3fa8f..5d1c261a2cfd 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -29,7 +29,25 @@ int null_zone_init(struct nullb_device *dev)
29 if (!dev->zones) 29 if (!dev->zones)
30 return -ENOMEM; 30 return -ENOMEM;
31 31
32 for (i = 0; i < dev->nr_zones; i++) { 32 if (dev->zone_nr_conv >= dev->nr_zones) {
33 dev->zone_nr_conv = dev->nr_zones - 1;
34 pr_info("null_blk: changed the number of conventional zones to %u",
35 dev->zone_nr_conv);
36 }
37
38 for (i = 0; i < dev->zone_nr_conv; i++) {
39 struct blk_zone *zone = &dev->zones[i];
40
41 zone->start = sector;
42 zone->len = dev->zone_size_sects;
43 zone->wp = zone->start + zone->len;
44 zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
45 zone->cond = BLK_ZONE_COND_NOT_WP;
46
47 sector += dev->zone_size_sects;
48 }
49
50 for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
33 struct blk_zone *zone = &dev->zones[i]; 51 struct blk_zone *zone = &dev->zones[i];
34 52
35 zone->start = zone->wp = sector; 53 zone->start = zone->wp = sector;
@@ -98,6 +116,8 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
98 if (zone->wp == zone->start + zone->len) 116 if (zone->wp == zone->start + zone->len)
99 zone->cond = BLK_ZONE_COND_FULL; 117 zone->cond = BLK_ZONE_COND_FULL;
100 break; 118 break;
119 case BLK_ZONE_COND_NOT_WP:
120 break;
101 default: 121 default:
102 /* Invalid zone condition */ 122 /* Invalid zone condition */
103 cmd->error = BLK_STS_IOERR; 123 cmd->error = BLK_STS_IOERR;
@@ -111,6 +131,11 @@ void null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
111 unsigned int zno = null_zone_no(dev, sector); 131 unsigned int zno = null_zone_no(dev, sector);
112 struct blk_zone *zone = &dev->zones[zno]; 132 struct blk_zone *zone = &dev->zones[zno];
113 133
134 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
135 cmd->error = BLK_STS_IOERR;
136 return;
137 }
138
114 zone->cond = BLK_ZONE_COND_EMPTY; 139 zone->cond = BLK_ZONE_COND_EMPTY;
115 zone->wp = zone->start; 140 zone->wp = zone->start;
116} 141}
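With zone_nr_conv conventional zones laid out ahead of the sequential ones, mapping a sector to its zone and deciding whether writes are write-pointer-checked is simple arithmetic. A hypothetical helper, assuming a power-of-two zone size (illustrative, not part of the patch):

#include <linux/types.h>

/* zone size in sectors is 1 << zone_shift; conventional zones come first */
static inline unsigned int foo_zone_no(sector_t sector, unsigned int zone_shift)
{
	return sector >> zone_shift;
}

static inline bool foo_zone_is_conv(sector_t sector, unsigned int zone_shift,
				    unsigned int zone_nr_conv)
{
	return foo_zone_no(sector, zone_shift) < zone_nr_conv;
}

Loading the driver with something like "modprobe null_blk zoned=1 zone_size=256 zone_nr_conv=4" (parameter names as added above) would then expose four conventional zones followed by sequential write required zones.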
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index ae4971e5d9a8..0ff9b12d0e35 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -242,6 +242,11 @@ struct pd_unit {
242 242
243static struct pd_unit pd[PD_UNITS]; 243static struct pd_unit pd[PD_UNITS];
244 244
245struct pd_req {
246 /* for REQ_OP_DRV_IN: */
247 enum action (*func)(struct pd_unit *disk);
248};
249
245static char pd_scratch[512]; /* scratch block buffer */ 250static char pd_scratch[512]; /* scratch block buffer */
246 251
247static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR", 252static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR",
@@ -502,8 +507,9 @@ static enum action do_pd_io_start(void)
502 507
503static enum action pd_special(void) 508static enum action pd_special(void)
504{ 509{
505 enum action (*func)(struct pd_unit *) = pd_req->special; 510 struct pd_req *req = blk_mq_rq_to_pdu(pd_req);
506 return func(pd_current); 511
512 return req->func(pd_current);
507} 513}
508 514
509static int pd_next_buf(void) 515static int pd_next_buf(void)
@@ -767,12 +773,14 @@ static int pd_special_command(struct pd_unit *disk,
767 enum action (*func)(struct pd_unit *disk)) 773 enum action (*func)(struct pd_unit *disk))
768{ 774{
769 struct request *rq; 775 struct request *rq;
776 struct pd_req *req;
770 777
771 rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0); 778 rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
772 if (IS_ERR(rq)) 779 if (IS_ERR(rq))
773 return PTR_ERR(rq); 780 return PTR_ERR(rq);
781 req = blk_mq_rq_to_pdu(rq);
774 782
775 rq->special = func; 783 req->func = func;
776 blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); 784 blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
777 blk_put_request(rq); 785 blk_put_request(rq);
778 return 0; 786 return 0;
@@ -892,9 +900,21 @@ static void pd_probe_drive(struct pd_unit *disk)
892 disk->gd = p; 900 disk->gd = p;
893 p->private_data = disk; 901 p->private_data = disk;
894 902
895 p->queue = blk_mq_init_sq_queue(&disk->tag_set, &pd_mq_ops, 2, 903 memset(&disk->tag_set, 0, sizeof(disk->tag_set));
896 BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); 904 disk->tag_set.ops = &pd_mq_ops;
905 disk->tag_set.cmd_size = sizeof(struct pd_req);
906 disk->tag_set.nr_hw_queues = 1;
907 disk->tag_set.nr_maps = 1;
908 disk->tag_set.queue_depth = 2;
909 disk->tag_set.numa_node = NUMA_NO_NODE;
910 disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
911
912 if (blk_mq_alloc_tag_set(&disk->tag_set))
913 return;
914
915 p->queue = blk_mq_init_queue(&disk->tag_set);
897 if (IS_ERR(p->queue)) { 916 if (IS_ERR(p->queue)) {
917 blk_mq_free_tag_set(&disk->tag_set);
898 p->queue = NULL; 918 p->queue = NULL;
899 return; 919 return;
900 } 920 }
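The pd conversion shows the replacement for rq->special on driver-internal commands: allocate a REQ_OP_DRV_IN request, stash the work in the request PDU, and execute it. Sketched below with invented names; only the block-layer calls are real, and tag_set.cmd_size is assumed to be sizeof(struct foo_req):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/err.h>

struct foo_req {
	int (*func)(void *ctx);		/* what ->queue_rq() will run */
	void *ctx;
};

static int foo_special_command(struct request_queue *q,
			       int (*func)(void *ctx), void *ctx)
{
	struct request *rq;
	struct foo_req *fr;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	fr = blk_mq_rq_to_pdu(rq);	/* per-request payload, no rq->special */
	fr->func = func;
	fr->ctx = ctx;

	blk_execute_rq(q, NULL, rq, 0);	/* ->queue_rq() reads the PDU */
	blk_put_request(rq);
	return 0;
}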
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 9381f4e3b221..f5a71023f76c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2203,9 +2203,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2203 * Some CDRW drives can not handle writes larger than one packet, 2203 * Some CDRW drives can not handle writes larger than one packet,
2204 * even if the size is a multiple of the packet size. 2204 * even if the size is a multiple of the packet size.
2205 */ 2205 */
2206 spin_lock_irq(q->queue_lock);
2207 blk_queue_max_hw_sectors(q, pd->settings.size); 2206 blk_queue_max_hw_sectors(q, pd->settings.size);
2208 spin_unlock_irq(q->queue_lock);
2209 set_bit(PACKET_WRITABLE, &pd->flags); 2207 set_bit(PACKET_WRITABLE, &pd->flags);
2210 } else { 2208 } else {
2211 pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); 2209 pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 2459dcc04b1c..a10d5736d8f7 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -181,6 +181,7 @@ struct skd_request_context {
181 struct fit_completion_entry_v1 completion; 181 struct fit_completion_entry_v1 completion;
182 182
183 struct fit_comp_error_info err_info; 183 struct fit_comp_error_info err_info;
184 int retries;
184 185
185 blk_status_t status; 186 blk_status_t status;
186}; 187};
@@ -382,11 +383,12 @@ static void skd_log_skreq(struct skd_device *skdev,
382 * READ/WRITE REQUESTS 383 * READ/WRITE REQUESTS
383 ***************************************************************************** 384 *****************************************************************************
384 */ 385 */
385static void skd_inc_in_flight(struct request *rq, void *data, bool reserved) 386static bool skd_inc_in_flight(struct request *rq, void *data, bool reserved)
386{ 387{
387 int *count = data; 388 int *count = data;
388 389
389 count++; 390 count++;
391 return true;
390} 392}
391 393
392static int skd_in_flight(struct skd_device *skdev) 394static int skd_in_flight(struct skd_device *skdev)
@@ -494,6 +496,11 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
494 if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE)) 496 if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE))
495 return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE; 497 return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE;
496 498
499 if (!(req->rq_flags & RQF_DONTPREP)) {
500 skreq->retries = 0;
501 req->rq_flags |= RQF_DONTPREP;
502 }
503
497 blk_mq_start_request(req); 504 blk_mq_start_request(req);
498 505
499 WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n", 506 WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n",
@@ -1425,7 +1432,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
1425 break; 1432 break;
1426 1433
1427 case SKD_CHECK_STATUS_REQUEUE_REQUEST: 1434 case SKD_CHECK_STATUS_REQUEUE_REQUEST:
1428 if ((unsigned long) ++req->special < SKD_MAX_RETRIES) { 1435 if (++skreq->retries < SKD_MAX_RETRIES) {
1429 skd_log_skreq(skdev, skreq, "retry"); 1436 skd_log_skreq(skdev, skreq, "retry");
1430 blk_mq_requeue_request(req, true); 1437 blk_mq_requeue_request(req, true);
1431 break; 1438 break;
@@ -1887,13 +1894,13 @@ static void skd_isr_fwstate(struct skd_device *skdev)
1887 skd_skdev_state_to_str(skdev->state), skdev->state); 1894 skd_skdev_state_to_str(skdev->state), skdev->state);
1888} 1895}
1889 1896
1890static void skd_recover_request(struct request *req, void *data, bool reserved) 1897static bool skd_recover_request(struct request *req, void *data, bool reserved)
1891{ 1898{
1892 struct skd_device *const skdev = data; 1899 struct skd_device *const skdev = data;
1893 struct skd_request_context *skreq = blk_mq_rq_to_pdu(req); 1900 struct skd_request_context *skreq = blk_mq_rq_to_pdu(req);
1894 1901
1895 if (skreq->state != SKD_REQ_STATE_BUSY) 1902 if (skreq->state != SKD_REQ_STATE_BUSY)
1896 return; 1903 return true;
1897 1904
1898 skd_log_skreq(skdev, skreq, "recover"); 1905 skd_log_skreq(skdev, skreq, "recover");
1899 1906
@@ -1904,6 +1911,7 @@ static void skd_recover_request(struct request *req, void *data, bool reserved)
1904 skreq->state = SKD_REQ_STATE_IDLE; 1911 skreq->state = SKD_REQ_STATE_IDLE;
1905 skreq->status = BLK_STS_IOERR; 1912 skreq->status = BLK_STS_IOERR;
1906 blk_mq_complete_request(req); 1913 blk_mq_complete_request(req);
1914 return true;
1907} 1915}
1908 1916
1909static void skd_recover_requests(struct skd_device *skdev) 1917static void skd_recover_requests(struct skd_device *skdev)
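The RQF_DONTPREP check replaces the old abuse of req->special as a retry counter: the flag survives blk_mq_requeue_request(), so per-request state in the PDU is initialised exactly once. A minimal, hypothetical version of that idiom:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

struct foo_cmd {
	int retries;
};

static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);

	if (!(rq->rq_flags & RQF_DONTPREP)) {
		cmd->retries = 0;		/* first pass through queue_rq only */
		rq->rq_flags |= RQF_DONTPREP;
	}

	blk_mq_start_request(rq);
	/*
	 * ... issue to hardware; on a retryable error the completion path
	 * bumps cmd->retries and calls blk_mq_requeue_request(rq, true) ...
	 */
	return BLK_STS_OK;
}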
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index b54fa6726303..9c0553dd13e7 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -6,7 +6,7 @@
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/kernel.h> 7#include <linux/kernel.h>
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/blkdev.h> 9#include <linux/blk-mq.h>
10#include <linux/hdreg.h> 10#include <linux/hdreg.h>
11#include <linux/genhd.h> 11#include <linux/genhd.h>
12#include <linux/cdrom.h> 12#include <linux/cdrom.h>
@@ -45,6 +45,8 @@ MODULE_VERSION(DRV_MODULE_VERSION);
45#define WAITING_FOR_GEN_CMD 0x04 45#define WAITING_FOR_GEN_CMD 0x04
46#define WAITING_FOR_ANY -1 46#define WAITING_FOR_ANY -1
47 47
48#define VDC_MAX_RETRIES 10
49
48static struct workqueue_struct *sunvdc_wq; 50static struct workqueue_struct *sunvdc_wq;
49 51
50struct vdc_req_entry { 52struct vdc_req_entry {
@@ -66,9 +68,10 @@ struct vdc_port {
66 68
67 u64 max_xfer_size; 69 u64 max_xfer_size;
68 u32 vdisk_block_size; 70 u32 vdisk_block_size;
71 u32 drain;
69 72
70 u64 ldc_timeout; 73 u64 ldc_timeout;
71 struct timer_list ldc_reset_timer; 74 struct delayed_work ldc_reset_timer_work;
72 struct work_struct ldc_reset_work; 75 struct work_struct ldc_reset_work;
73 76
74 /* The server fills these in for us in the disk attribute 77 /* The server fills these in for us in the disk attribute
@@ -80,12 +83,14 @@ struct vdc_port {
80 u8 vdisk_mtype; 83 u8 vdisk_mtype;
81 u32 vdisk_phys_blksz; 84 u32 vdisk_phys_blksz;
82 85
86 struct blk_mq_tag_set tag_set;
87
83 char disk_name[32]; 88 char disk_name[32];
84}; 89};
85 90
86static void vdc_ldc_reset(struct vdc_port *port); 91static void vdc_ldc_reset(struct vdc_port *port);
87static void vdc_ldc_reset_work(struct work_struct *work); 92static void vdc_ldc_reset_work(struct work_struct *work);
88static void vdc_ldc_reset_timer(struct timer_list *t); 93static void vdc_ldc_reset_timer_work(struct work_struct *work);
89 94
90static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) 95static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
91{ 96{
@@ -175,11 +180,8 @@ static void vdc_blk_queue_start(struct vdc_port *port)
175 * handshake completes, so check for initial handshake before we've 180 * handshake completes, so check for initial handshake before we've
176 * allocated a disk. 181 * allocated a disk.
177 */ 182 */
178 if (port->disk && blk_queue_stopped(port->disk->queue) && 183 if (port->disk && vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50)
179 vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) { 184 blk_mq_start_hw_queues(port->disk->queue);
180 blk_start_queue(port->disk->queue);
181 }
182
183} 185}
184 186
185static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) 187static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
@@ -197,7 +199,7 @@ static void vdc_handshake_complete(struct vio_driver_state *vio)
197{ 199{
198 struct vdc_port *port = to_vdc_port(vio); 200 struct vdc_port *port = to_vdc_port(vio);
199 201
200 del_timer(&port->ldc_reset_timer); 202 cancel_delayed_work(&port->ldc_reset_timer_work);
201 vdc_finish(vio, 0, WAITING_FOR_LINK_UP); 203 vdc_finish(vio, 0, WAITING_FOR_LINK_UP);
202 vdc_blk_queue_start(port); 204 vdc_blk_queue_start(port);
203} 205}
@@ -320,7 +322,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
320 322
321 rqe->req = NULL; 323 rqe->req = NULL;
322 324
323 __blk_end_request(req, (desc->status ? BLK_STS_IOERR : 0), desc->size); 325 blk_mq_end_request(req, desc->status ? BLK_STS_IOERR : 0);
324 326
325 vdc_blk_queue_start(port); 327 vdc_blk_queue_start(port);
326} 328}
@@ -431,6 +433,7 @@ static int __vdc_tx_trigger(struct vdc_port *port)
431 .end_idx = dr->prod, 433 .end_idx = dr->prod,
432 }; 434 };
433 int err, delay; 435 int err, delay;
436 int retries = 0;
434 437
435 hdr.seq = dr->snd_nxt; 438 hdr.seq = dr->snd_nxt;
436 delay = 1; 439 delay = 1;
@@ -443,6 +446,8 @@ static int __vdc_tx_trigger(struct vdc_port *port)
443 udelay(delay); 446 udelay(delay);
444 if ((delay <<= 1) > 128) 447 if ((delay <<= 1) > 128)
445 delay = 128; 448 delay = 128;
449 if (retries++ > VDC_MAX_RETRIES)
450 break;
446 } while (err == -EAGAIN); 451 } while (err == -EAGAIN);
447 452
448 if (err == -ENOTCONN) 453 if (err == -ENOTCONN)
@@ -525,29 +530,40 @@ static int __send_request(struct request *req)
525 return err; 530 return err;
526} 531}
527 532
528static void do_vdc_request(struct request_queue *rq) 533static blk_status_t vdc_queue_rq(struct blk_mq_hw_ctx *hctx,
534 const struct blk_mq_queue_data *bd)
529{ 535{
530 struct request *req; 536 struct vdc_port *port = hctx->queue->queuedata;
537 struct vio_dring_state *dr;
538 unsigned long flags;
531 539
532 while ((req = blk_peek_request(rq)) != NULL) { 540 dr = &port->vio.drings[VIO_DRIVER_TX_RING];
533 struct vdc_port *port;
534 struct vio_dring_state *dr;
535 541
536 port = req->rq_disk->private_data; 542 blk_mq_start_request(bd->rq);
537 dr = &port->vio.drings[VIO_DRIVER_TX_RING];
538 if (unlikely(vdc_tx_dring_avail(dr) < 1))
539 goto wait;
540 543
541 blk_start_request(req); 544 spin_lock_irqsave(&port->vio.lock, flags);
542 545
543 if (__send_request(req) < 0) { 546 /*
544 blk_requeue_request(rq, req); 547 * Doing drain, just end the request in error
545wait: 548 */
546 /* Avoid pointless unplugs. */ 549 if (unlikely(port->drain)) {
547 blk_stop_queue(rq); 550 spin_unlock_irqrestore(&port->vio.lock, flags);
548 break; 551 return BLK_STS_IOERR;
549 } 552 }
553
554 if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
555 spin_unlock_irqrestore(&port->vio.lock, flags);
556 blk_mq_stop_hw_queue(hctx);
557 return BLK_STS_DEV_RESOURCE;
558 }
559
560 if (__send_request(bd->rq) < 0) {
561 spin_unlock_irqrestore(&port->vio.lock, flags);
562 return BLK_STS_IOERR;
550 } 563 }
564
565 spin_unlock_irqrestore(&port->vio.lock, flags);
566 return BLK_STS_OK;
551} 567}
552 568
553static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) 569static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
@@ -759,6 +775,31 @@ static void vdc_port_down(struct vdc_port *port)
759 vio_ldc_free(&port->vio); 775 vio_ldc_free(&port->vio);
760} 776}
761 777
778static const struct blk_mq_ops vdc_mq_ops = {
779 .queue_rq = vdc_queue_rq,
780};
781
782static void cleanup_queue(struct request_queue *q)
783{
784 struct vdc_port *port = q->queuedata;
785
786 blk_cleanup_queue(q);
787 blk_mq_free_tag_set(&port->tag_set);
788}
789
790static struct request_queue *init_queue(struct vdc_port *port)
791{
792 struct request_queue *q;
793
794 q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE,
795 BLK_MQ_F_SHOULD_MERGE);
796 if (IS_ERR(q))
797 return q;
798
799 q->queuedata = port;
800 return q;
801}
802
762static int probe_disk(struct vdc_port *port) 803static int probe_disk(struct vdc_port *port)
763{ 804{
764 struct request_queue *q; 805 struct request_queue *q;
@@ -796,17 +837,17 @@ static int probe_disk(struct vdc_port *port)
796 (u64)geom.num_sec); 837 (u64)geom.num_sec);
797 } 838 }
798 839
799 q = blk_init_queue(do_vdc_request, &port->vio.lock); 840 q = init_queue(port);
800 if (!q) { 841 if (IS_ERR(q)) {
801 printk(KERN_ERR PFX "%s: Could not allocate queue.\n", 842 printk(KERN_ERR PFX "%s: Could not allocate queue.\n",
802 port->vio.name); 843 port->vio.name);
803 return -ENOMEM; 844 return PTR_ERR(q);
804 } 845 }
805 g = alloc_disk(1 << PARTITION_SHIFT); 846 g = alloc_disk(1 << PARTITION_SHIFT);
806 if (!g) { 847 if (!g) {
807 printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", 848 printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
808 port->vio.name); 849 port->vio.name);
809 blk_cleanup_queue(q); 850 cleanup_queue(q);
810 return -ENOMEM; 851 return -ENOMEM;
811 } 852 }
812 853
@@ -981,7 +1022,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
981 */ 1022 */
982 ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL); 1023 ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
983 port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0; 1024 port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
984 timer_setup(&port->ldc_reset_timer, vdc_ldc_reset_timer, 0); 1025 INIT_DELAYED_WORK(&port->ldc_reset_timer_work, vdc_ldc_reset_timer_work);
985 INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work); 1026 INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work);
986 1027
987 err = vio_driver_init(&port->vio, vdev, VDEV_DISK, 1028 err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
@@ -1034,18 +1075,14 @@ static int vdc_port_remove(struct vio_dev *vdev)
1034 struct vdc_port *port = dev_get_drvdata(&vdev->dev); 1075 struct vdc_port *port = dev_get_drvdata(&vdev->dev);
1035 1076
1036 if (port) { 1077 if (port) {
1037 unsigned long flags; 1078 blk_mq_stop_hw_queues(port->disk->queue);
1038
1039 spin_lock_irqsave(&port->vio.lock, flags);
1040 blk_stop_queue(port->disk->queue);
1041 spin_unlock_irqrestore(&port->vio.lock, flags);
1042 1079
1043 flush_work(&port->ldc_reset_work); 1080 flush_work(&port->ldc_reset_work);
1044 del_timer_sync(&port->ldc_reset_timer); 1081 cancel_delayed_work_sync(&port->ldc_reset_timer_work);
1045 del_timer_sync(&port->vio.timer); 1082 del_timer_sync(&port->vio.timer);
1046 1083
1047 del_gendisk(port->disk); 1084 del_gendisk(port->disk);
1048 blk_cleanup_queue(port->disk->queue); 1085 cleanup_queue(port->disk->queue);
1049 put_disk(port->disk); 1086 put_disk(port->disk);
1050 port->disk = NULL; 1087 port->disk = NULL;
1051 1088
@@ -1080,32 +1117,46 @@ static void vdc_requeue_inflight(struct vdc_port *port)
1080 } 1117 }
1081 1118
1082 rqe->req = NULL; 1119 rqe->req = NULL;
1083 blk_requeue_request(port->disk->queue, req); 1120 blk_mq_requeue_request(req, false);
1084 } 1121 }
1085} 1122}
1086 1123
1087static void vdc_queue_drain(struct vdc_port *port) 1124static void vdc_queue_drain(struct vdc_port *port)
1088{ 1125{
1089 struct request *req; 1126 struct request_queue *q = port->disk->queue;
1127
1128 /*
1129 * Mark the queue as draining, then freeze/quiesce to ensure
1130 * that all existing requests are seen in ->queue_rq() and killed
1131 */
1132 port->drain = 1;
1133 spin_unlock_irq(&port->vio.lock);
1090 1134
1091 while ((req = blk_fetch_request(port->disk->queue)) != NULL) 1135 blk_mq_freeze_queue(q);
1092 __blk_end_request_all(req, BLK_STS_IOERR); 1136 blk_mq_quiesce_queue(q);
1137
1138 spin_lock_irq(&port->vio.lock);
1139 port->drain = 0;
1140 blk_mq_unquiesce_queue(q);
1141 blk_mq_unfreeze_queue(q);
1093} 1142}
1094 1143
1095static void vdc_ldc_reset_timer(struct timer_list *t) 1144static void vdc_ldc_reset_timer_work(struct work_struct *work)
1096{ 1145{
1097 struct vdc_port *port = from_timer(port, t, ldc_reset_timer); 1146 struct vdc_port *port;
1098 struct vio_driver_state *vio = &port->vio; 1147 struct vio_driver_state *vio;
1099 unsigned long flags;
1100 1148
1101 spin_lock_irqsave(&vio->lock, flags); 1149 port = container_of(work, struct vdc_port, ldc_reset_timer_work.work);
1150 vio = &port->vio;
1151
1152 spin_lock_irq(&vio->lock);
1102 if (!(port->vio.hs_state & VIO_HS_COMPLETE)) { 1153 if (!(port->vio.hs_state & VIO_HS_COMPLETE)) {
1103 pr_warn(PFX "%s ldc down %llu seconds, draining queue\n", 1154 pr_warn(PFX "%s ldc down %llu seconds, draining queue\n",
1104 port->disk_name, port->ldc_timeout); 1155 port->disk_name, port->ldc_timeout);
1105 vdc_queue_drain(port); 1156 vdc_queue_drain(port);
1106 vdc_blk_queue_start(port); 1157 vdc_blk_queue_start(port);
1107 } 1158 }
1108 spin_unlock_irqrestore(&vio->lock, flags); 1159 spin_unlock_irq(&vio->lock);
1109} 1160}
1110 1161
1111static void vdc_ldc_reset_work(struct work_struct *work) 1162static void vdc_ldc_reset_work(struct work_struct *work)
@@ -1129,7 +1180,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
1129 assert_spin_locked(&port->vio.lock); 1180 assert_spin_locked(&port->vio.lock);
1130 1181
1131 pr_warn(PFX "%s ldc link reset\n", port->disk_name); 1182 pr_warn(PFX "%s ldc link reset\n", port->disk_name);
1132 blk_stop_queue(port->disk->queue); 1183 blk_mq_stop_hw_queues(port->disk->queue);
1133 vdc_requeue_inflight(port); 1184 vdc_requeue_inflight(port);
1134 vdc_port_down(port); 1185 vdc_port_down(port);
1135 1186
@@ -1146,7 +1197,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
1146 } 1197 }
1147 1198
1148 if (port->ldc_timeout) 1199 if (port->ldc_timeout)
1149 mod_timer(&port->ldc_reset_timer, 1200 mod_delayed_work(system_wq, &port->ldc_reset_timer_work,
1150 round_jiffies(jiffies + HZ * port->ldc_timeout)); 1201 round_jiffies(jiffies + HZ * port->ldc_timeout));
1151 mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); 1202 mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
1152 return; 1203 return;
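The sunvdc conversion illustrates the standard blk-mq backpressure pattern: when the transmit ring has no room, ->queue_rq() stops the hardware queue and returns BLK_STS_DEV_RESOURCE, which obliges the driver to restart the queue itself once descriptors complete. A condensed, hypothetical sketch of both sides:

#include <linux/blk-mq.h>
#include <linux/spinlock.h>

struct foo_port {
	spinlock_t lock;
	unsigned int ring_free;		/* free transmit descriptors */
	struct request_queue *queue;
};

static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct foo_port *port = hctx->queue->queuedata;
	unsigned long flags;

	blk_mq_start_request(bd->rq);

	spin_lock_irqsave(&port->lock, flags);
	if (!port->ring_free) {
		spin_unlock_irqrestore(&port->lock, flags);
		blk_mq_stop_hw_queue(hctx);
		return BLK_STS_DEV_RESOURCE;	/* completion path restarts us */
	}
	port->ring_free--;
	/* ... post bd->rq to the ring ... */
	spin_unlock_irqrestore(&port->lock, flags);
	return BLK_STS_OK;
}

static void foo_complete_one(struct foo_port *port, struct request *rq)
{
	unsigned long flags;

	blk_mq_end_request(rq, BLK_STS_OK);

	spin_lock_irqsave(&port->lock, flags);
	port->ring_free++;
	spin_unlock_irqrestore(&port->lock, flags);

	blk_mq_start_hw_queues(port->queue);	/* kick the queue if we stopped it */
}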
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 064b8c5c7a32..4478eb7efee0 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -243,7 +243,6 @@ struct carm_port {
243 unsigned int port_no; 243 unsigned int port_no;
244 struct gendisk *disk; 244 struct gendisk *disk;
245 struct carm_host *host; 245 struct carm_host *host;
246 struct blk_mq_tag_set tag_set;
247 246
248 /* attached device characteristics */ 247 /* attached device characteristics */
249 u64 capacity; 248 u64 capacity;
@@ -254,13 +253,10 @@ struct carm_port {
254}; 253};
255 254
256struct carm_request { 255struct carm_request {
257 unsigned int tag;
258 int n_elem; 256 int n_elem;
259 unsigned int msg_type; 257 unsigned int msg_type;
260 unsigned int msg_subtype; 258 unsigned int msg_subtype;
261 unsigned int msg_bucket; 259 unsigned int msg_bucket;
262 struct request *rq;
263 struct carm_port *port;
264 struct scatterlist sg[CARM_MAX_REQ_SG]; 260 struct scatterlist sg[CARM_MAX_REQ_SG];
265}; 261};
266 262
@@ -291,9 +287,6 @@ struct carm_host {
291 unsigned int wait_q_cons; 287 unsigned int wait_q_cons;
292 struct request_queue *wait_q[CARM_MAX_WAIT_Q]; 288 struct request_queue *wait_q[CARM_MAX_WAIT_Q];
293 289
294 unsigned int n_msgs;
295 u64 msg_alloc;
296 struct carm_request req[CARM_MAX_REQ];
297 void *msg_base; 290 void *msg_base;
298 dma_addr_t msg_dma; 291 dma_addr_t msg_dma;
299 292
@@ -478,10 +471,10 @@ static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host,
478} 471}
479 472
480static int carm_send_msg(struct carm_host *host, 473static int carm_send_msg(struct carm_host *host,
481 struct carm_request *crq) 474 struct carm_request *crq, unsigned tag)
482{ 475{
483 void __iomem *mmio = host->mmio; 476 void __iomem *mmio = host->mmio;
484 u32 msg = (u32) carm_ref_msg_dma(host, crq->tag); 477 u32 msg = (u32) carm_ref_msg_dma(host, tag);
485 u32 cm_bucket = crq->msg_bucket; 478 u32 cm_bucket = crq->msg_bucket;
486 u32 tmp; 479 u32 tmp;
487 int rc = 0; 480 int rc = 0;
@@ -506,99 +499,24 @@ static int carm_send_msg(struct carm_host *host,
506 return rc; 499 return rc;
507} 500}
508 501
509static struct carm_request *carm_get_request(struct carm_host *host)
510{
511 unsigned int i;
512
513 /* obey global hardware limit on S/G entries */
514 if (host->hw_sg_used >= (CARM_MAX_HOST_SG - CARM_MAX_REQ_SG))
515 return NULL;
516
517 for (i = 0; i < max_queue; i++)
518 if ((host->msg_alloc & (1ULL << i)) == 0) {
519 struct carm_request *crq = &host->req[i];
520 crq->port = NULL;
521 crq->n_elem = 0;
522
523 host->msg_alloc |= (1ULL << i);
524 host->n_msgs++;
525
526 assert(host->n_msgs <= CARM_MAX_REQ);
527 sg_init_table(crq->sg, CARM_MAX_REQ_SG);
528 return crq;
529 }
530
531 DPRINTK("no request available, returning NULL\n");
532 return NULL;
533}
534
535static int carm_put_request(struct carm_host *host, struct carm_request *crq)
536{
537 assert(crq->tag < max_queue);
538
539 if (unlikely((host->msg_alloc & (1ULL << crq->tag)) == 0))
540 return -EINVAL; /* tried to clear a tag that was not active */
541
542 assert(host->hw_sg_used >= crq->n_elem);
543
544 host->msg_alloc &= ~(1ULL << crq->tag);
545 host->hw_sg_used -= crq->n_elem;
546 host->n_msgs--;
547
548 return 0;
549}
550
551static struct carm_request *carm_get_special(struct carm_host *host)
552{
553 unsigned long flags;
554 struct carm_request *crq = NULL;
555 struct request *rq;
556 int tries = 5000;
557
558 while (tries-- > 0) {
559 spin_lock_irqsave(&host->lock, flags);
560 crq = carm_get_request(host);
561 spin_unlock_irqrestore(&host->lock, flags);
562
563 if (crq)
564 break;
565 msleep(10);
566 }
567
568 if (!crq)
569 return NULL;
570
571 rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0);
572 if (IS_ERR(rq)) {
573 spin_lock_irqsave(&host->lock, flags);
574 carm_put_request(host, crq);
575 spin_unlock_irqrestore(&host->lock, flags);
576 return NULL;
577 }
578
579 crq->rq = rq;
580 return crq;
581}
582
583static int carm_array_info (struct carm_host *host, unsigned int array_idx) 502static int carm_array_info (struct carm_host *host, unsigned int array_idx)
584{ 503{
585 struct carm_msg_ioctl *ioc; 504 struct carm_msg_ioctl *ioc;
586 unsigned int idx;
587 u32 msg_data; 505 u32 msg_data;
588 dma_addr_t msg_dma; 506 dma_addr_t msg_dma;
589 struct carm_request *crq; 507 struct carm_request *crq;
508 struct request *rq;
590 int rc; 509 int rc;
591 510
592 crq = carm_get_special(host); 511 rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
593 if (!crq) { 512 if (IS_ERR(rq)) {
594 rc = -ENOMEM; 513 rc = -ENOMEM;
595 goto err_out; 514 goto err_out;
596 } 515 }
516 crq = blk_mq_rq_to_pdu(rq);
597 517
598 idx = crq->tag; 518 ioc = carm_ref_msg(host, rq->tag);
599 519 msg_dma = carm_ref_msg_dma(host, rq->tag);
600 ioc = carm_ref_msg(host, idx);
601 msg_dma = carm_ref_msg_dma(host, idx);
602 msg_data = (u32) (msg_dma + sizeof(struct carm_array_info)); 520 msg_data = (u32) (msg_dma + sizeof(struct carm_array_info));
603 521
604 crq->msg_type = CARM_MSG_ARRAY; 522 crq->msg_type = CARM_MSG_ARRAY;
@@ -612,7 +530,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
612 ioc->type = CARM_MSG_ARRAY; 530 ioc->type = CARM_MSG_ARRAY;
613 ioc->subtype = CARM_ARRAY_INFO; 531 ioc->subtype = CARM_ARRAY_INFO;
614 ioc->array_id = (u8) array_idx; 532 ioc->array_id = (u8) array_idx;
615 ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); 533 ioc->handle = cpu_to_le32(TAG_ENCODE(rq->tag));
616 ioc->data_addr = cpu_to_le32(msg_data); 534 ioc->data_addr = cpu_to_le32(msg_data);
617 535
618 spin_lock_irq(&host->lock); 536 spin_lock_irq(&host->lock);
@@ -620,9 +538,8 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
620 host->state == HST_DEV_SCAN); 538 host->state == HST_DEV_SCAN);
621 spin_unlock_irq(&host->lock); 539 spin_unlock_irq(&host->lock);
622 540
623 DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); 541 DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
624 crq->rq->special = crq; 542 blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
625 blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
626 543
627 return 0; 544 return 0;
628 545
@@ -637,21 +554,21 @@ typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *);
637 554
638static int carm_send_special (struct carm_host *host, carm_sspc_t func) 555static int carm_send_special (struct carm_host *host, carm_sspc_t func)
639{ 556{
557 struct request *rq;
640 struct carm_request *crq; 558 struct carm_request *crq;
641 struct carm_msg_ioctl *ioc; 559 struct carm_msg_ioctl *ioc;
642 void *mem; 560 void *mem;
643 unsigned int idx, msg_size; 561 unsigned int msg_size;
644 int rc; 562 int rc;
645 563
646 crq = carm_get_special(host); 564 rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
647 if (!crq) 565 if (IS_ERR(rq))
648 return -ENOMEM; 566 return -ENOMEM;
567 crq = blk_mq_rq_to_pdu(rq);
649 568
650 idx = crq->tag; 569 mem = carm_ref_msg(host, rq->tag);
651 570
652 mem = carm_ref_msg(host, idx); 571 msg_size = func(host, rq->tag, mem);
653
654 msg_size = func(host, idx, mem);
655 572
656 ioc = mem; 573 ioc = mem;
657 crq->msg_type = ioc->type; 574 crq->msg_type = ioc->type;
@@ -660,9 +577,8 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
660 BUG_ON(rc < 0); 577 BUG_ON(rc < 0);
661 crq->msg_bucket = (u32) rc; 578 crq->msg_bucket = (u32) rc;
662 579
663 DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); 580 DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
664 crq->rq->special = crq; 581 blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
665 blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
666 582
667 return 0; 583 return 0;
668} 584}
@@ -744,19 +660,6 @@ static unsigned int carm_fill_get_fw_ver(struct carm_host *host,
744 sizeof(struct carm_fw_ver); 660 sizeof(struct carm_fw_ver);
745} 661}
746 662
747static inline void carm_end_request_queued(struct carm_host *host,
748 struct carm_request *crq,
749 blk_status_t error)
750{
751 struct request *req = crq->rq;
752 int rc;
753
754 blk_mq_end_request(req, error);
755
756 rc = carm_put_request(host, crq);
757 assert(rc == 0);
758}
759
760static inline void carm_push_q (struct carm_host *host, struct request_queue *q) 663static inline void carm_push_q (struct carm_host *host, struct request_queue *q)
761{ 664{
762 unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q; 665 unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q;
@@ -791,101 +694,50 @@ static inline void carm_round_robin(struct carm_host *host)
791 } 694 }
792} 695}
793 696
794static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq, 697static inline enum dma_data_direction carm_rq_dir(struct request *rq)
795 blk_status_t error)
796{
797 carm_end_request_queued(host, crq, error);
798 if (max_queue == 1)
799 carm_round_robin(host);
800 else if ((host->n_msgs <= CARM_MSG_LOW_WATER) &&
801 (host->hw_sg_used <= CARM_SG_LOW_WATER)) {
802 carm_round_robin(host);
803 }
804}
805
806static blk_status_t carm_oob_queue_rq(struct blk_mq_hw_ctx *hctx,
807 const struct blk_mq_queue_data *bd)
808{ 698{
809 struct request_queue *q = hctx->queue; 699 return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
810 struct carm_host *host = q->queuedata;
811 struct carm_request *crq;
812 int rc;
813
814 blk_mq_start_request(bd->rq);
815
816 spin_lock_irq(&host->lock);
817
818 crq = bd->rq->special;
819 assert(crq != NULL);
820 assert(crq->rq == bd->rq);
821
822 crq->n_elem = 0;
823
824 DPRINTK("send req\n");
825 rc = carm_send_msg(host, crq);
826 if (rc) {
827 carm_push_q(host, q);
828 spin_unlock_irq(&host->lock);
829 return BLK_STS_DEV_RESOURCE;
830 }
831
832 spin_unlock_irq(&host->lock);
833 return BLK_STS_OK;
834} 700}
835 701
836static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx, 702static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
837 const struct blk_mq_queue_data *bd) 703 const struct blk_mq_queue_data *bd)
838{ 704{
839 struct request_queue *q = hctx->queue; 705 struct request_queue *q = hctx->queue;
706 struct request *rq = bd->rq;
840 struct carm_port *port = q->queuedata; 707 struct carm_port *port = q->queuedata;
841 struct carm_host *host = port->host; 708 struct carm_host *host = port->host;
709 struct carm_request *crq = blk_mq_rq_to_pdu(rq);
842 struct carm_msg_rw *msg; 710 struct carm_msg_rw *msg;
843 struct carm_request *crq;
844 struct request *rq = bd->rq;
845 struct scatterlist *sg; 711 struct scatterlist *sg;
846 int writing = 0, pci_dir, i, n_elem, rc; 712 int i, n_elem = 0, rc;
847 u32 tmp;
848 unsigned int msg_size; 713 unsigned int msg_size;
714 u32 tmp;
715
716 crq->n_elem = 0;
717 sg_init_table(crq->sg, CARM_MAX_REQ_SG);
849 718
850 blk_mq_start_request(rq); 719 blk_mq_start_request(rq);
851 720
852 spin_lock_irq(&host->lock); 721 spin_lock_irq(&host->lock);
853 722 if (req_op(rq) == REQ_OP_DRV_OUT)
854 crq = carm_get_request(host); 723 goto send_msg;
855 if (!crq) {
856 carm_push_q(host, q);
857 spin_unlock_irq(&host->lock);
858 return BLK_STS_DEV_RESOURCE;
859 }
860 crq->rq = rq;
861
862 if (rq_data_dir(rq) == WRITE) {
863 writing = 1;
864 pci_dir = DMA_TO_DEVICE;
865 } else {
866 pci_dir = DMA_FROM_DEVICE;
867 }
868 724
869 /* get scatterlist from block layer */ 725 /* get scatterlist from block layer */
870 sg = &crq->sg[0]; 726 sg = &crq->sg[0];
871 n_elem = blk_rq_map_sg(q, rq, sg); 727 n_elem = blk_rq_map_sg(q, rq, sg);
872 if (n_elem <= 0) { 728 if (n_elem <= 0)
873 /* request with no s/g entries? */ 729 goto out_ioerr;
874 carm_end_rq(host, crq, BLK_STS_IOERR);
875 spin_unlock_irq(&host->lock);
876 return BLK_STS_IOERR;
877 }
878 730
879 /* map scatterlist to PCI bus addresses */ 731 /* map scatterlist to PCI bus addresses */
880 n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, pci_dir); 732 n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq));
881 if (n_elem <= 0) { 733 if (n_elem <= 0)
882 /* request with no s/g entries? */ 734 goto out_ioerr;
883 carm_end_rq(host, crq, BLK_STS_IOERR); 735
884 spin_unlock_irq(&host->lock); 736 /* obey global hardware limit on S/G entries */
885 return BLK_STS_IOERR; 737 if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem)
886 } 738 goto out_resource;
739
887 crq->n_elem = n_elem; 740 crq->n_elem = n_elem;
888 crq->port = port;
889 host->hw_sg_used += n_elem; 741 host->hw_sg_used += n_elem;
890 742
891 /* 743 /*
@@ -893,9 +745,9 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
893 */ 745 */
894 746
895 VPRINTK("build msg\n"); 747 VPRINTK("build msg\n");
896 msg = (struct carm_msg_rw *) carm_ref_msg(host, crq->tag); 748 msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag);
897 749
898 if (writing) { 750 if (rq_data_dir(rq) == WRITE) {
899 msg->type = CARM_MSG_WRITE; 751 msg->type = CARM_MSG_WRITE;
900 crq->msg_type = CARM_MSG_WRITE; 752 crq->msg_type = CARM_MSG_WRITE;
901 } else { 753 } else {
@@ -906,7 +758,7 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
906 msg->id = port->port_no; 758 msg->id = port->port_no;
907 msg->sg_count = n_elem; 759 msg->sg_count = n_elem;
908 msg->sg_type = SGT_32BIT; 760 msg->sg_type = SGT_32BIT;
909 msg->handle = cpu_to_le32(TAG_ENCODE(crq->tag)); 761 msg->handle = cpu_to_le32(TAG_ENCODE(rq->tag));
910 msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff); 762 msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
911 tmp = (blk_rq_pos(rq) >> 16) >> 16; 763 tmp = (blk_rq_pos(rq) >> 16) >> 16;
912 msg->lba_high = cpu_to_le16( (u16) tmp ); 764 msg->lba_high = cpu_to_le16( (u16) tmp );
@@ -923,22 +775,28 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
923 rc = carm_lookup_bucket(msg_size); 775 rc = carm_lookup_bucket(msg_size);
924 BUG_ON(rc < 0); 776 BUG_ON(rc < 0);
925 crq->msg_bucket = (u32) rc; 777 crq->msg_bucket = (u32) rc;
926 778send_msg:
927 /* 779 /*
928 * queue read/write message to hardware 780 * queue read/write message to hardware
929 */ 781 */
930 782 VPRINTK("send msg, tag == %u\n", rq->tag);
931 VPRINTK("send msg, tag == %u\n", crq->tag); 783 rc = carm_send_msg(host, crq, rq->tag);
932 rc = carm_send_msg(host, crq);
933 if (rc) { 784 if (rc) {
934 carm_put_request(host, crq); 785 host->hw_sg_used -= n_elem;
935 carm_push_q(host, q); 786 goto out_resource;
936 spin_unlock_irq(&host->lock);
937 return BLK_STS_DEV_RESOURCE;
938 } 787 }
939 788
940 spin_unlock_irq(&host->lock); 789 spin_unlock_irq(&host->lock);
941 return BLK_STS_OK; 790 return BLK_STS_OK;
791out_resource:
792 dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq));
793 carm_push_q(host, q);
794 spin_unlock_irq(&host->lock);
795 return BLK_STS_DEV_RESOURCE;
796out_ioerr:
797 carm_round_robin(host);
798 spin_unlock_irq(&host->lock);
799 return BLK_STS_IOERR;
942} 800}
943 801
944static void carm_handle_array_info(struct carm_host *host, 802static void carm_handle_array_info(struct carm_host *host,
@@ -954,8 +812,6 @@ static void carm_handle_array_info(struct carm_host *host,
954 812
955 DPRINTK("ENTER\n"); 813 DPRINTK("ENTER\n");
956 814
957 carm_end_rq(host, crq, error);
958
959 if (error) 815 if (error)
960 goto out; 816 goto out;
961 if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST) 817 if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST)
@@ -1011,8 +867,6 @@ static void carm_handle_scan_chan(struct carm_host *host,
1011 867
1012 DPRINTK("ENTER\n"); 868 DPRINTK("ENTER\n");
1013 869
1014 carm_end_rq(host, crq, error);
1015
1016 if (error) { 870 if (error) {
1017 new_state = HST_ERROR; 871 new_state = HST_ERROR;
1018 goto out; 872 goto out;
@@ -1040,8 +894,6 @@ static void carm_handle_generic(struct carm_host *host,
1040{ 894{
1041 DPRINTK("ENTER\n"); 895 DPRINTK("ENTER\n");
1042 896
1043 carm_end_rq(host, crq, error);
1044
1045 assert(host->state == cur_state); 897 assert(host->state == cur_state);
1046 if (error) 898 if (error)
1047 host->state = HST_ERROR; 899 host->state = HST_ERROR;
@@ -1050,28 +902,12 @@ static void carm_handle_generic(struct carm_host *host,
1050 schedule_work(&host->fsm_task); 902 schedule_work(&host->fsm_task);
1051} 903}
1052 904
1053static inline void carm_handle_rw(struct carm_host *host,
1054 struct carm_request *crq, blk_status_t error)
1055{
1056 int pci_dir;
1057
1058 VPRINTK("ENTER\n");
1059
1060 if (rq_data_dir(crq->rq) == WRITE)
1061 pci_dir = DMA_TO_DEVICE;
1062 else
1063 pci_dir = DMA_FROM_DEVICE;
1064
1065 dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, pci_dir);
1066
1067 carm_end_rq(host, crq, error);
1068}
1069
1070static inline void carm_handle_resp(struct carm_host *host, 905static inline void carm_handle_resp(struct carm_host *host,
1071 __le32 ret_handle_le, u32 status) 906 __le32 ret_handle_le, u32 status)
1072{ 907{
1073 u32 handle = le32_to_cpu(ret_handle_le); 908 u32 handle = le32_to_cpu(ret_handle_le);
1074 unsigned int msg_idx; 909 unsigned int msg_idx;
910 struct request *rq;
1075 struct carm_request *crq; 911 struct carm_request *crq;
1076 blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR; 912 blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR;
1077 u8 *mem; 913 u8 *mem;
@@ -1087,13 +923,15 @@ static inline void carm_handle_resp(struct carm_host *host,
1087 msg_idx = TAG_DECODE(handle); 923 msg_idx = TAG_DECODE(handle);
1088 VPRINTK("tag == %u\n", msg_idx); 924 VPRINTK("tag == %u\n", msg_idx);
1089 925
1090 crq = &host->req[msg_idx]; 926 rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx);
927 crq = blk_mq_rq_to_pdu(rq);
1091 928
1092 /* fast path */ 929 /* fast path */
1093 if (likely(crq->msg_type == CARM_MSG_READ || 930 if (likely(crq->msg_type == CARM_MSG_READ ||
1094 crq->msg_type == CARM_MSG_WRITE)) { 931 crq->msg_type == CARM_MSG_WRITE)) {
1095 carm_handle_rw(host, crq, error); 932 dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem,
1096 return; 933 carm_rq_dir(rq));
934 goto done;
1097 } 935 }
1098 936
1099 mem = carm_ref_msg(host, msg_idx); 937 mem = carm_ref_msg(host, msg_idx);
@@ -1103,7 +941,7 @@ static inline void carm_handle_resp(struct carm_host *host,
1103 switch (crq->msg_subtype) { 941 switch (crq->msg_subtype) {
1104 case CARM_IOC_SCAN_CHAN: 942 case CARM_IOC_SCAN_CHAN:
1105 carm_handle_scan_chan(host, crq, mem, error); 943 carm_handle_scan_chan(host, crq, mem, error);
1106 break; 944 goto done;
1107 default: 945 default:
1108 /* unknown / invalid response */ 946 /* unknown / invalid response */
1109 goto err_out; 947 goto err_out;
@@ -1116,11 +954,11 @@ static inline void carm_handle_resp(struct carm_host *host,
1116 case MISC_ALLOC_MEM: 954 case MISC_ALLOC_MEM:
1117 carm_handle_generic(host, crq, error, 955 carm_handle_generic(host, crq, error,
1118 HST_ALLOC_BUF, HST_SYNC_TIME); 956 HST_ALLOC_BUF, HST_SYNC_TIME);
1119 break; 957 goto done;
1120 case MISC_SET_TIME: 958 case MISC_SET_TIME:
1121 carm_handle_generic(host, crq, error, 959 carm_handle_generic(host, crq, error,
1122 HST_SYNC_TIME, HST_GET_FW_VER); 960 HST_SYNC_TIME, HST_GET_FW_VER);
1123 break; 961 goto done;
1124 case MISC_GET_FW_VER: { 962 case MISC_GET_FW_VER: {
1125 struct carm_fw_ver *ver = (struct carm_fw_ver *) 963 struct carm_fw_ver *ver = (struct carm_fw_ver *)
1126 (mem + sizeof(struct carm_msg_get_fw_ver)); 964 (mem + sizeof(struct carm_msg_get_fw_ver));
@@ -1130,7 +968,7 @@ static inline void carm_handle_resp(struct carm_host *host,
1130 } 968 }
1131 carm_handle_generic(host, crq, error, 969 carm_handle_generic(host, crq, error,
1132 HST_GET_FW_VER, HST_PORT_SCAN); 970 HST_GET_FW_VER, HST_PORT_SCAN);
1133 break; 971 goto done;
1134 } 972 }
1135 default: 973 default:
1136 /* unknown / invalid response */ 974 /* unknown / invalid response */
@@ -1161,7 +999,13 @@ static inline void carm_handle_resp(struct carm_host *host,
1161err_out: 999err_out:
1162 printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", 1000 printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n",
1163 pci_name(host->pdev), crq->msg_type, crq->msg_subtype); 1001 pci_name(host->pdev), crq->msg_type, crq->msg_subtype);
1164 carm_end_rq(host, crq, BLK_STS_IOERR); 1002 error = BLK_STS_IOERR;
1003done:
1004 host->hw_sg_used -= crq->n_elem;
1005 blk_mq_end_request(blk_mq_rq_from_pdu(crq), error);
1006
1007 if (host->hw_sg_used <= CARM_SG_LOW_WATER)
1008 carm_round_robin(host);
1165} 1009}
1166 1010
1167static inline void carm_handle_responses(struct carm_host *host) 1011static inline void carm_handle_responses(struct carm_host *host)
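With the per-host tag set, the hardware completion handler can go straight from the echoed tag to the request with blk_mq_tag_to_rq() and to the driver data with blk_mq_rq_to_pdu(), instead of indexing a private request array. A hypothetical completion path in that style (not code from this patch):

#include <linux/blk-mq.h>

struct foo_cmd {
	unsigned int n_elem;		/* mapped S/G entries to release */
};

static void foo_handle_completion(struct blk_mq_tag_set *set,
				  unsigned int tag, bool ok)
{
	struct request *rq = blk_mq_tag_to_rq(set->tags[0], tag);
	struct foo_cmd *cmd;

	if (!rq)
		return;			/* stale or bogus tag from hardware */

	cmd = blk_mq_rq_to_pdu(rq);
	/* ... undo the DMA mapping using cmd->n_elem ... */
	blk_mq_end_request(rq, ok ? BLK_STS_OK : BLK_STS_IOERR);
}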
@@ -1491,78 +1335,56 @@ static int carm_init_host(struct carm_host *host)
1491 return 0; 1335 return 0;
1492} 1336}
1493 1337
1494static const struct blk_mq_ops carm_oob_mq_ops = {
1495 .queue_rq = carm_oob_queue_rq,
1496};
1497
1498static const struct blk_mq_ops carm_mq_ops = { 1338static const struct blk_mq_ops carm_mq_ops = {
1499 .queue_rq = carm_queue_rq, 1339 .queue_rq = carm_queue_rq,
1500}; 1340};
1501 1341
1502static int carm_init_disks(struct carm_host *host) 1342static int carm_init_disk(struct carm_host *host, unsigned int port_no)
1503{ 1343{
1504 unsigned int i; 1344 struct carm_port *port = &host->port[port_no];
1505 int rc = 0; 1345 struct gendisk *disk;
1346 struct request_queue *q;
1506 1347
1507 for (i = 0; i < CARM_MAX_PORTS; i++) { 1348 port->host = host;
1508 struct gendisk *disk; 1349 port->port_no = port_no;
1509 struct request_queue *q;
1510 struct carm_port *port;
1511 1350
1512 port = &host->port[i]; 1351 disk = alloc_disk(CARM_MINORS_PER_MAJOR);
1513 port->host = host; 1352 if (!disk)
1514 port->port_no = i; 1353 return -ENOMEM;
1515 1354
1516 disk = alloc_disk(CARM_MINORS_PER_MAJOR); 1355 port->disk = disk;
1517 if (!disk) { 1356 sprintf(disk->disk_name, DRV_NAME "/%u",
1518 rc = -ENOMEM; 1357 (unsigned int)host->id * CARM_MAX_PORTS + port_no);
1519 break; 1358 disk->major = host->major;
1520 } 1359 disk->first_minor = port_no * CARM_MINORS_PER_MAJOR;
1360 disk->fops = &carm_bd_ops;
1361 disk->private_data = port;
1521 1362
1522 port->disk = disk; 1363 q = blk_mq_init_queue(&host->tag_set);
1523 sprintf(disk->disk_name, DRV_NAME "/%u", 1364 if (IS_ERR(q))
1524 (unsigned int) (host->id * CARM_MAX_PORTS) + i); 1365 return PTR_ERR(q);
1525 disk->major = host->major;
1526 disk->first_minor = i * CARM_MINORS_PER_MAJOR;
1527 disk->fops = &carm_bd_ops;
1528 disk->private_data = port;
1529
1530 q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops,
1531 max_queue, BLK_MQ_F_SHOULD_MERGE);
1532 if (IS_ERR(q)) {
1533 rc = PTR_ERR(q);
1534 break;
1535 }
1536 disk->queue = q;
1537 blk_queue_max_segments(q, CARM_MAX_REQ_SG);
1538 blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
1539 1366
1540 q->queuedata = port; 1367 blk_queue_max_segments(q, CARM_MAX_REQ_SG);
1541 } 1368 blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
1542 1369
1543 return rc; 1370 q->queuedata = port;
1371 disk->queue = q;
1372 return 0;
1544} 1373}
1545 1374
1546static void carm_free_disks(struct carm_host *host) 1375static void carm_free_disk(struct carm_host *host, unsigned int port_no)
1547{ 1376{
1548 unsigned int i; 1377 struct carm_port *port = &host->port[port_no];
1549 1378 struct gendisk *disk = port->disk;
1550 for (i = 0; i < CARM_MAX_PORTS; i++) {
1551 struct carm_port *port = &host->port[i];
1552 struct gendisk *disk = port->disk;
1553 1379
1554 if (disk) { 1380 if (!disk)
1555 struct request_queue *q = disk->queue; 1381 return;
1556 1382
1557 if (disk->flags & GENHD_FL_UP) 1383 if (disk->flags & GENHD_FL_UP)
1558 del_gendisk(disk); 1384 del_gendisk(disk);
1559 if (q) { 1385 if (disk->queue)
1560 blk_mq_free_tag_set(&port->tag_set); 1386 blk_cleanup_queue(disk->queue);
1561 blk_cleanup_queue(q); 1387 put_disk(disk);
1562 }
1563 put_disk(disk);
1564 }
1565 }
1566} 1388}
1567 1389
1568static int carm_init_shm(struct carm_host *host) 1390static int carm_init_shm(struct carm_host *host)
@@ -1618,9 +1440,6 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
1618 INIT_WORK(&host->fsm_task, carm_fsm_task); 1440 INIT_WORK(&host->fsm_task, carm_fsm_task);
1619 init_completion(&host->probe_comp); 1441 init_completion(&host->probe_comp);
1620 1442
1621 for (i = 0; i < ARRAY_SIZE(host->req); i++)
1622 host->req[i].tag = i;
1623
1624 host->mmio = ioremap(pci_resource_start(pdev, 0), 1443 host->mmio = ioremap(pci_resource_start(pdev, 0),
1625 pci_resource_len(pdev, 0)); 1444 pci_resource_len(pdev, 0));
1626 if (!host->mmio) { 1445 if (!host->mmio) {
@@ -1637,14 +1456,26 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
1637 goto err_out_iounmap; 1456 goto err_out_iounmap;
1638 } 1457 }
1639 1458
1640 q = blk_mq_init_sq_queue(&host->tag_set, &carm_oob_mq_ops, 1, 1459 memset(&host->tag_set, 0, sizeof(host->tag_set));
1641 BLK_MQ_F_NO_SCHED); 1460 host->tag_set.ops = &carm_mq_ops;
1461 host->tag_set.cmd_size = sizeof(struct carm_request);
1462 host->tag_set.nr_hw_queues = 1;
1463 host->tag_set.nr_maps = 1;
1464 host->tag_set.queue_depth = max_queue;
1465 host->tag_set.numa_node = NUMA_NO_NODE;
1466 host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1467
1468 rc = blk_mq_alloc_tag_set(&host->tag_set);
1469 if (rc)
1470 goto err_out_dma_free;
1471
1472 q = blk_mq_init_queue(&host->tag_set);
1642 if (IS_ERR(q)) { 1473 if (IS_ERR(q)) {
1643 printk(KERN_ERR DRV_NAME "(%s): OOB queue alloc failure\n",
1644 pci_name(pdev));
1645 rc = PTR_ERR(q); 1474 rc = PTR_ERR(q);
1475 blk_mq_free_tag_set(&host->tag_set);
1646 goto err_out_dma_free; 1476 goto err_out_dma_free;
1647 } 1477 }
1478
1648 host->oob_q = q; 1479 host->oob_q = q;
1649 q->queuedata = host; 1480 q->queuedata = host;
1650 1481
@@ -1667,9 +1498,11 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
1667 if (host->flags & FL_DYN_MAJOR) 1498 if (host->flags & FL_DYN_MAJOR)
1668 host->major = rc; 1499 host->major = rc;
1669 1500
1670 rc = carm_init_disks(host); 1501 for (i = 0; i < CARM_MAX_PORTS; i++) {
1671 if (rc) 1502 rc = carm_init_disk(host, i);
1672 goto err_out_blkdev_disks; 1503 if (rc)
1504 goto err_out_blkdev_disks;
1505 }
1673 1506
1674 pci_set_master(pdev); 1507 pci_set_master(pdev);
1675 1508
@@ -1699,7 +1532,8 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
1699err_out_free_irq: 1532err_out_free_irq:
1700 free_irq(pdev->irq, host); 1533 free_irq(pdev->irq, host);
1701err_out_blkdev_disks: 1534err_out_blkdev_disks:
1702 carm_free_disks(host); 1535 for (i = 0; i < CARM_MAX_PORTS; i++)
1536 carm_free_disk(host, i);
1703 unregister_blkdev(host->major, host->name); 1537 unregister_blkdev(host->major, host->name);
1704err_out_free_majors: 1538err_out_free_majors:
1705 if (host->major == 160) 1539 if (host->major == 160)
@@ -1724,6 +1558,7 @@ err_out:
1724static void carm_remove_one (struct pci_dev *pdev) 1558static void carm_remove_one (struct pci_dev *pdev)
1725{ 1559{
1726 struct carm_host *host = pci_get_drvdata(pdev); 1560 struct carm_host *host = pci_get_drvdata(pdev);
1561 unsigned int i;
1727 1562
1728 if (!host) { 1563 if (!host) {
1729 printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n", 1564 printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n",
@@ -1732,7 +1567,8 @@ static void carm_remove_one (struct pci_dev *pdev)
1732 } 1567 }
1733 1568
1734 free_irq(pdev->irq, host); 1569 free_irq(pdev->irq, host);
1735 carm_free_disks(host); 1570 for (i = 0; i < CARM_MAX_PORTS; i++)
1571 carm_free_disk(host, i);
1736 unregister_blkdev(host->major, host->name); 1572 unregister_blkdev(host->major, host->name);
1737 if (host->major == 160) 1573 if (host->major == 160)
1738 clear_bit(0, &carm_major_alloc); 1574 clear_bit(0, &carm_major_alloc);
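
The sx8 hunks above collapse the old per-port tag sets and the separate OOB queue into one per-host blk_mq_tag_set that every port queue shares, with completion going through blk_mq_end_request() on the request recovered via blk_mq_rq_from_pdu(). A minimal sketch of that shared-set shape, using hypothetical my_mq_ops / struct my_cmd names rather than the driver's own:

        #include <linux/blk-mq.h>

        struct my_cmd { int n_elem; };                  /* stand-in per-request PDU */
        extern const struct blk_mq_ops my_mq_ops;       /* supplies .queue_rq */

        static int init_shared_queues(struct blk_mq_tag_set *set,
                                      struct request_queue **q, int nr)
        {
                int i, ret;

                memset(set, 0, sizeof(*set));
                set->ops = &my_mq_ops;
                set->nr_hw_queues = 1;
                set->queue_depth = 64;
                set->cmd_size = sizeof(struct my_cmd);  /* allocated behind each request */
                set->numa_node = NUMA_NO_NODE;
                set->flags = BLK_MQ_F_SHOULD_MERGE;

                ret = blk_mq_alloc_tag_set(set);
                if (ret)
                        return ret;

                for (i = 0; i < nr; i++) {
                        q[i] = blk_mq_init_queue(set);  /* all queues draw from the same tags */
                        if (IS_ERR(q[i])) {
                                ret = PTR_ERR(q[i]);
                                goto err;
                        }
                }
                return 0;
        err:
                while (i--)
                        blk_cleanup_queue(q[i]);
                blk_mq_free_tag_set(set);
                return ret;
        }

Teardown is the reverse order, blk_cleanup_queue() on each queue and then blk_mq_free_tag_set(), as the carm_free_disk()/carm_remove_one() hunks above do.
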
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index be3e3ab79950..aa035cf8a51d 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -888,8 +888,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
888 card->biotail = &card->bio; 888 card->biotail = &card->bio;
889 spin_lock_init(&card->lock); 889 spin_lock_init(&card->lock);
890 890
891 card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, 891 card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
892 &card->lock);
893 if (!card->queue) 892 if (!card->queue)
894 goto failed_alloc; 893 goto failed_alloc;
895 894
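
blk_alloc_queue_node() drops its spinlock argument here: with the legacy request path gone there is no external queue_lock to pass, and a bio-based driver like umem keeps using its own card->lock. The minimal setup for such a make_request driver stays tiny; a sketch with hypothetical names (my_make_request, my_card):

        q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
        if (!q)
                return -ENOMEM;
        blk_queue_make_request(q, my_make_request);     /* bio handler, no tag set needed */
        q->queuedata = my_card;
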
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 086c6bb12baa..912c4265e592 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -214,6 +214,20 @@ static void virtblk_done(struct virtqueue *vq)
214 spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); 214 spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
215} 215}
216 216
217static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
218{
219 struct virtio_blk *vblk = hctx->queue->queuedata;
220 struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
221 bool kick;
222
223 spin_lock_irq(&vq->lock);
224 kick = virtqueue_kick_prepare(vq->vq);
225 spin_unlock_irq(&vq->lock);
226
227 if (kick)
228 virtqueue_notify(vq->vq);
229}
230
217static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, 231static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
218 const struct blk_mq_queue_data *bd) 232 const struct blk_mq_queue_data *bd)
219{ 233{
@@ -624,7 +638,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
624{ 638{
625 struct virtio_blk *vblk = set->driver_data; 639 struct virtio_blk *vblk = set->driver_data;
626 640
627 return blk_mq_virtio_map_queues(set, vblk->vdev, 0); 641 return blk_mq_virtio_map_queues(&set->map[0], vblk->vdev, 0);
628} 642}
629 643
630#ifdef CONFIG_VIRTIO_BLK_SCSI 644#ifdef CONFIG_VIRTIO_BLK_SCSI
@@ -638,6 +652,7 @@ static void virtblk_initialize_rq(struct request *req)
638 652
639static const struct blk_mq_ops virtio_mq_ops = { 653static const struct blk_mq_ops virtio_mq_ops = {
640 .queue_rq = virtio_queue_rq, 654 .queue_rq = virtio_queue_rq,
655 .commit_rqs = virtio_commit_rqs,
641 .complete = virtblk_request_done, 656 .complete = virtblk_request_done,
642 .init_request = virtblk_init_request, 657 .init_request = virtblk_init_request,
643#ifdef CONFIG_VIRTIO_BLK_SCSI 658#ifdef CONFIG_VIRTIO_BLK_SCSI
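
The new .commit_rqs hook is the driver side of batched dispatch: when ->queue_rq is called with bd->last == false the driver may defer the doorbell, and if the core then fails to issue the rest of the batch it calls ->commit_rqs so everything already placed on the ring still gets kicked. A rough sketch of the queue_rq half of that contract, with hypothetical my_dev/my_ring types and my_ring_add()/my_ring_kick_prepare()/my_ring_kick() helpers standing in for the virtqueue calls:

        static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
                                        const struct blk_mq_queue_data *bd)
        {
                struct my_dev *dev = hctx->queue->queuedata;
                struct my_ring *ring = &dev->rings[hctx->queue_num];
                unsigned long flags;
                bool notify = false;

                blk_mq_start_request(bd->rq);

                spin_lock_irqsave(&ring->lock, flags);
                if (!my_ring_add(ring, bd->rq)) {
                        spin_unlock_irqrestore(&ring->lock, flags);
                        return BLK_STS_DEV_RESOURCE;    /* core re-runs the queue later */
                }
                if (bd->last)                           /* only ring the doorbell for the batch tail */
                        notify = my_ring_kick_prepare(ring);
                spin_unlock_irqrestore(&ring->lock, flags);

                if (notify)
                        my_ring_kick(ring);
                return BLK_STS_OK;
        }

virtio_commit_rqs() above is then just the kick_prepare/notify pair under the same lock, run when no bd->last request made it out.
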
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 8b2b72b93885..da58020a144e 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -94,7 +94,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
94 94
95 rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); 95 rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
96 ide_req(rq)->type = ATA_PRIV_MISC; 96 ide_req(rq)->type = ATA_PRIV_MISC;
97 rq->special = (char *)pc; 97 ide_req(rq)->special = pc;
98 98
99 if (buf && bufflen) { 99 if (buf && bufflen) {
100 error = blk_rq_map_kern(drive->queue, rq, buf, bufflen, 100 error = blk_rq_map_kern(drive->queue, rq, buf, bufflen,
@@ -172,8 +172,8 @@ EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
172void ide_prep_sense(ide_drive_t *drive, struct request *rq) 172void ide_prep_sense(ide_drive_t *drive, struct request *rq)
173{ 173{
174 struct request_sense *sense = &drive->sense_data; 174 struct request_sense *sense = &drive->sense_data;
175 struct request *sense_rq = drive->sense_rq; 175 struct request *sense_rq;
176 struct scsi_request *req = scsi_req(sense_rq); 176 struct scsi_request *req;
177 unsigned int cmd_len, sense_len; 177 unsigned int cmd_len, sense_len;
178 int err; 178 int err;
179 179
@@ -196,9 +196,16 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
196 if (ata_sense_request(rq) || drive->sense_rq_armed) 196 if (ata_sense_request(rq) || drive->sense_rq_armed)
197 return; 197 return;
198 198
199 sense_rq = drive->sense_rq;
200 if (!sense_rq) {
201 sense_rq = blk_mq_alloc_request(drive->queue, REQ_OP_DRV_IN,
202 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
203 drive->sense_rq = sense_rq;
204 }
205 req = scsi_req(sense_rq);
206
199 memset(sense, 0, sizeof(*sense)); 207 memset(sense, 0, sizeof(*sense));
200 208
201 blk_rq_init(rq->q, sense_rq);
202 scsi_req_init(req); 209 scsi_req_init(req);
203 210
204 err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, 211 err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
@@ -207,6 +214,8 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
207 if (printk_ratelimit()) 214 if (printk_ratelimit())
208 printk(KERN_WARNING PFX "%s: failed to map sense " 215 printk(KERN_WARNING PFX "%s: failed to map sense "
209 "buffer\n", drive->name); 216 "buffer\n", drive->name);
217 blk_mq_free_request(sense_rq);
218 drive->sense_rq = NULL;
210 return; 219 return;
211 } 220 }
212 221
@@ -226,6 +235,8 @@ EXPORT_SYMBOL_GPL(ide_prep_sense);
226 235
227int ide_queue_sense_rq(ide_drive_t *drive, void *special) 236int ide_queue_sense_rq(ide_drive_t *drive, void *special)
228{ 237{
238 struct request *sense_rq = drive->sense_rq;
239
229 /* deferred failure from ide_prep_sense() */ 240 /* deferred failure from ide_prep_sense() */
230 if (!drive->sense_rq_armed) { 241 if (!drive->sense_rq_armed) {
231 printk(KERN_WARNING PFX "%s: error queuing a sense request\n", 242 printk(KERN_WARNING PFX "%s: error queuing a sense request\n",
@@ -233,12 +244,12 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special)
233 return -ENOMEM; 244 return -ENOMEM;
234 } 245 }
235 246
236 drive->sense_rq->special = special; 247 ide_req(sense_rq)->special = special;
237 drive->sense_rq_armed = false; 248 drive->sense_rq_armed = false;
238 249
239 drive->hwif->rq = NULL; 250 drive->hwif->rq = NULL;
240 251
241 elv_add_request(drive->queue, drive->sense_rq, ELEVATOR_INSERT_FRONT); 252 ide_insert_request_head(drive, sense_rq);
242 return 0; 253 return 0;
243} 254}
244EXPORT_SYMBOL_GPL(ide_queue_sense_rq); 255EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
@@ -270,10 +281,8 @@ void ide_retry_pc(ide_drive_t *drive)
270 */ 281 */
271 drive->hwif->rq = NULL; 282 drive->hwif->rq = NULL;
272 ide_requeue_and_plug(drive, failed_rq); 283 ide_requeue_and_plug(drive, failed_rq);
273 if (ide_queue_sense_rq(drive, pc)) { 284 if (ide_queue_sense_rq(drive, pc))
274 blk_start_request(failed_rq);
275 ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq)); 285 ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq));
276 }
277} 286}
278EXPORT_SYMBOL_GPL(ide_retry_pc); 287EXPORT_SYMBOL_GPL(ide_retry_pc);
279 288
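
Instead of carrying a permanently embedded sense request, the driver now pulls one from the tag set on demand, using the reserved tag that ide-probe.c (further down) sets aside via set->reserved_tags = 1, so the allocation cannot starve behind normal reads and writes. A condensed sketch of that allocate-on-demand pattern; the names and error handling are illustrative only:

        static struct request *get_sense_rq(struct request_queue *q,
                                            struct request **cached)
        {
                struct request *rq = *cached;

                if (!rq) {
                        rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN,
                                        BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
                        if (IS_ERR(rq))
                                return NULL;    /* reserved tag busy or queue dying */
                        *cached = rq;
                }
                return rq;
        }

        static void put_sense_rq(struct request **cached)
        {
                if (*cached) {
                        blk_mq_free_request(*cached);
                        *cached = NULL;
                }
        }
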
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index f9b59d41813f..1f03884a6808 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -211,12 +211,12 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
211static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) 211static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
212{ 212{
213 /* 213 /*
214 * For ATA_PRIV_SENSE, "rq->special" points to the original 214 * For ATA_PRIV_SENSE, "ide_req(rq)->special" points to the original
215 * failed request. Also, the sense data should be read 215 * failed request. Also, the sense data should be read
216 * directly from rq which might be different from the original 216 * directly from rq which might be different from the original
217 * sense buffer if it got copied during mapping. 217 * sense buffer if it got copied during mapping.
218 */ 218 */
219 struct request *failed = (struct request *)rq->special; 219 struct request *failed = ide_req(rq)->special;
220 void *sense = bio_data(rq->bio); 220 void *sense = bio_data(rq->bio);
221 221
222 if (failed) { 222 if (failed) {
@@ -258,11 +258,22 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
258 /* 258 /*
259 * take a breather 259 * take a breather
260 */ 260 */
261 blk_delay_queue(drive->queue, 1); 261 blk_mq_requeue_request(rq, false);
262 blk_mq_delay_kick_requeue_list(drive->queue, 1);
262 return 1; 263 return 1;
263 } 264 }
264} 265}
265 266
267static void ide_cd_free_sense(ide_drive_t *drive)
268{
269 if (!drive->sense_rq)
270 return;
271
272 blk_mq_free_request(drive->sense_rq);
273 drive->sense_rq = NULL;
274 drive->sense_rq_armed = false;
275}
276
266/** 277/**
267 * Returns: 278 * Returns:
268 * 0: if the request should be continued. 279 * 0: if the request should be continued.
@@ -516,6 +527,82 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
516 return false; 527 return false;
517} 528}
518 529
530/* standard prep_rq that builds 10 byte cmds */
531static bool ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
532{
533 int hard_sect = queue_logical_block_size(q);
534 long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
535 unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
536 struct scsi_request *req = scsi_req(rq);
537
538 if (rq_data_dir(rq) == READ)
539 req->cmd[0] = GPCMD_READ_10;
540 else
541 req->cmd[0] = GPCMD_WRITE_10;
542
543 /*
544 * fill in lba
545 */
546 req->cmd[2] = (block >> 24) & 0xff;
547 req->cmd[3] = (block >> 16) & 0xff;
548 req->cmd[4] = (block >> 8) & 0xff;
549 req->cmd[5] = block & 0xff;
550
551 /*
552 * and transfer length
553 */
554 req->cmd[7] = (blocks >> 8) & 0xff;
555 req->cmd[8] = blocks & 0xff;
556 req->cmd_len = 10;
557 return true;
558}
559
560/*
561 * Most of the SCSI commands are supported directly by ATAPI devices.
562 * This transform handles the few exceptions.
563 */
564static bool ide_cdrom_prep_pc(struct request *rq)
565{
566 u8 *c = scsi_req(rq)->cmd;
567
568 /* transform 6-byte read/write commands to the 10-byte version */
569 if (c[0] == READ_6 || c[0] == WRITE_6) {
570 c[8] = c[4];
571 c[5] = c[3];
572 c[4] = c[2];
573 c[3] = c[1] & 0x1f;
574 c[2] = 0;
575 c[1] &= 0xe0;
576 c[0] += (READ_10 - READ_6);
577 scsi_req(rq)->cmd_len = 10;
578 return true;
579 }
580
581 /*
582 * it's silly to pretend we understand 6-byte sense commands, just
583 * reject with ILLEGAL_REQUEST and the caller should take the
584 * appropriate action
585 */
586 if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
587 scsi_req(rq)->result = ILLEGAL_REQUEST;
588 return false;
589 }
590
591 return true;
592}
593
594static bool ide_cdrom_prep_rq(ide_drive_t *drive, struct request *rq)
595{
596 if (!blk_rq_is_passthrough(rq)) {
597 scsi_req_init(scsi_req(rq));
598
599 return ide_cdrom_prep_fs(drive->queue, rq);
600 } else if (blk_rq_is_scsi(rq))
601 return ide_cdrom_prep_pc(rq);
602
603 return true;
604}
605
519static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) 606static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
520{ 607{
521 ide_hwif_t *hwif = drive->hwif; 608 ide_hwif_t *hwif = drive->hwif;
@@ -675,7 +762,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
675out_end: 762out_end:
676 if (blk_rq_is_scsi(rq) && rc == 0) { 763 if (blk_rq_is_scsi(rq) && rc == 0) {
677 scsi_req(rq)->resid_len = 0; 764 scsi_req(rq)->resid_len = 0;
678 blk_end_request_all(rq, BLK_STS_OK); 765 blk_mq_end_request(rq, BLK_STS_OK);
679 hwif->rq = NULL; 766 hwif->rq = NULL;
680 } else { 767 } else {
681 if (sense && uptodate) 768 if (sense && uptodate)
@@ -705,6 +792,8 @@ out_end:
705 if (sense && rc == 2) 792 if (sense && rc == 2)
706 ide_error(drive, "request sense failure", stat); 793 ide_error(drive, "request sense failure", stat);
707 } 794 }
795
796 ide_cd_free_sense(drive);
708 return ide_stopped; 797 return ide_stopped;
709} 798}
710 799
@@ -729,7 +818,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
729 * We may be retrying this request after an error. Fix up any 818 * We may be retrying this request after an error. Fix up any
730 * weirdness which might be present in the request packet. 819 * weirdness which might be present in the request packet.
731 */ 820 */
732 q->prep_rq_fn(q, rq); 821 ide_cdrom_prep_rq(drive, rq);
733 } 822 }
734 823
735 /* fs requests *must* be hardware frame aligned */ 824 /* fs requests *must* be hardware frame aligned */
@@ -1323,82 +1412,6 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
1323 return nslots; 1412 return nslots;
1324} 1413}
1325 1414
1326/* standard prep_rq_fn that builds 10 byte cmds */
1327static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
1328{
1329 int hard_sect = queue_logical_block_size(q);
1330 long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
1331 unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
1332 struct scsi_request *req = scsi_req(rq);
1333
1334 q->initialize_rq_fn(rq);
1335
1336 if (rq_data_dir(rq) == READ)
1337 req->cmd[0] = GPCMD_READ_10;
1338 else
1339 req->cmd[0] = GPCMD_WRITE_10;
1340
1341 /*
1342 * fill in lba
1343 */
1344 req->cmd[2] = (block >> 24) & 0xff;
1345 req->cmd[3] = (block >> 16) & 0xff;
1346 req->cmd[4] = (block >> 8) & 0xff;
1347 req->cmd[5] = block & 0xff;
1348
1349 /*
1350 * and transfer length
1351 */
1352 req->cmd[7] = (blocks >> 8) & 0xff;
1353 req->cmd[8] = blocks & 0xff;
1354 req->cmd_len = 10;
1355 return BLKPREP_OK;
1356}
1357
1358/*
1359 * Most of the SCSI commands are supported directly by ATAPI devices.
1360 * This transform handles the few exceptions.
1361 */
1362static int ide_cdrom_prep_pc(struct request *rq)
1363{
1364 u8 *c = scsi_req(rq)->cmd;
1365
1366 /* transform 6-byte read/write commands to the 10-byte version */
1367 if (c[0] == READ_6 || c[0] == WRITE_6) {
1368 c[8] = c[4];
1369 c[5] = c[3];
1370 c[4] = c[2];
1371 c[3] = c[1] & 0x1f;
1372 c[2] = 0;
1373 c[1] &= 0xe0;
1374 c[0] += (READ_10 - READ_6);
1375 scsi_req(rq)->cmd_len = 10;
1376 return BLKPREP_OK;
1377 }
1378
1379 /*
1380 * it's silly to pretend we understand 6-byte sense commands, just
1381 * reject with ILLEGAL_REQUEST and the caller should take the
1382 * appropriate action
1383 */
1384 if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
1385 scsi_req(rq)->result = ILLEGAL_REQUEST;
1386 return BLKPREP_KILL;
1387 }
1388
1389 return BLKPREP_OK;
1390}
1391
1392static int ide_cdrom_prep_fn(struct request_queue *q, struct request *rq)
1393{
1394 if (!blk_rq_is_passthrough(rq))
1395 return ide_cdrom_prep_fs(q, rq);
1396 else if (blk_rq_is_scsi(rq))
1397 return ide_cdrom_prep_pc(rq);
1398
1399 return 0;
1400}
1401
1402struct cd_list_entry { 1415struct cd_list_entry {
1403 const char *id_model; 1416 const char *id_model;
1404 const char *id_firmware; 1417 const char *id_firmware;
@@ -1508,7 +1521,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
1508 1521
1509 ide_debug_log(IDE_DBG_PROBE, "enter"); 1522 ide_debug_log(IDE_DBG_PROBE, "enter");
1510 1523
1511 blk_queue_prep_rq(q, ide_cdrom_prep_fn); 1524 drive->prep_rq = ide_cdrom_prep_rq;
1512 blk_queue_dma_alignment(q, 31); 1525 blk_queue_dma_alignment(q, 31);
1513 blk_queue_update_dma_pad(q, 15); 1526 blk_queue_update_dma_pad(q, 15);
1514 1527
@@ -1569,7 +1582,7 @@ static void ide_cd_release(struct device *dev)
1569 if (devinfo->handle == drive) 1582 if (devinfo->handle == drive)
1570 unregister_cdrom(devinfo); 1583 unregister_cdrom(devinfo);
1571 drive->driver_data = NULL; 1584 drive->driver_data = NULL;
1572 blk_queue_prep_rq(drive->queue, NULL); 1585 drive->prep_rq = NULL;
1573 g->private_data = NULL; 1586 g->private_data = NULL;
1574 put_disk(g); 1587 put_disk(g);
1575 kfree(info); 1588 kfree(info);
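
The block-layer prep_rq_fn hook is gone, so the READ_6/WRITE_6 translation and the 10-byte CDB builder become ordinary driver code hung off drive->prep_rq, returning bool instead of BLKPREP_* codes. The call site lives in the ide-io.c hunk below and reduces to:

        /* registered at probe time: drive->prep_rq = ide_cdrom_prep_rq; */
        if (drive->prep_rq && !drive->prep_rq(drive, rq))
                return ide_stopped;     /* prep rejected the request (e.g. ILLEGAL_REQUEST) */
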
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index f4f8afdf8bbe..f2f93ed40356 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -171,7 +171,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
171 scsi_req(rq)->cmd_len = 5; 171 scsi_req(rq)->cmd_len = 5;
172 scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC; 172 scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
173 *(int *)&scsi_req(rq)->cmd[1] = arg; 173 *(int *)&scsi_req(rq)->cmd[1] = arg;
174 rq->special = setting->set; 174 ide_req(rq)->special = setting->set;
175 175
176 blk_execute_rq(q, NULL, rq, 0); 176 blk_execute_rq(q, NULL, rq, 0);
177 ret = scsi_req(rq)->result; 177 ret = scsi_req(rq)->result;
@@ -182,7 +182,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
182 182
183ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) 183ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
184{ 184{
185 int err, (*setfunc)(ide_drive_t *, int) = rq->special; 185 int err, (*setfunc)(ide_drive_t *, int) = ide_req(rq)->special;
186 186
187 err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]); 187 err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
188 if (err) 188 if (err)
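
The recurring rq->special -> ide_req(rq)->special change across these IDE files moves the driver-private pointer out of struct request and into the per-request PDU that the tag set sizes via cmd_size. Roughly (layout simplified; the scsi_request must stay first so scsi_req() keeps working):

        struct ide_request {                    /* the PDU behind every request */
                struct scsi_request sreq;       /* accessed through scsi_req(rq) */
                u8 sense[SCSI_SENSE_BUFFERSIZE];
                u8 type;                        /* ATA_PRIV_* */
                void *special;                  /* replaces the old rq->special */
        };

        static inline struct ide_request *ide_req(struct request *rq)
        {
                return blk_mq_rq_to_pdu(rq);    /* memory allocated right after struct request */
        }
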
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index e3b4e659082d..197912af5c2f 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -427,16 +427,15 @@ static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
427 drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */ 427 drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
428} 428}
429 429
430static int idedisk_prep_fn(struct request_queue *q, struct request *rq) 430static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq)
431{ 431{
432 ide_drive_t *drive = q->queuedata;
433 struct ide_cmd *cmd; 432 struct ide_cmd *cmd;
434 433
435 if (req_op(rq) != REQ_OP_FLUSH) 434 if (req_op(rq) != REQ_OP_FLUSH)
436 return BLKPREP_OK; 435 return true;
437 436
438 if (rq->special) { 437 if (ide_req(rq)->special) {
439 cmd = rq->special; 438 cmd = ide_req(rq)->special;
440 memset(cmd, 0, sizeof(*cmd)); 439 memset(cmd, 0, sizeof(*cmd));
441 } else { 440 } else {
442 cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); 441 cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);
@@ -456,10 +455,10 @@ static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
456 rq->cmd_flags &= ~REQ_OP_MASK; 455 rq->cmd_flags &= ~REQ_OP_MASK;
457 rq->cmd_flags |= REQ_OP_DRV_OUT; 456 rq->cmd_flags |= REQ_OP_DRV_OUT;
458 ide_req(rq)->type = ATA_PRIV_TASKFILE; 457 ide_req(rq)->type = ATA_PRIV_TASKFILE;
459 rq->special = cmd; 458 ide_req(rq)->special = cmd;
460 cmd->rq = rq; 459 cmd->rq = rq;
461 460
462 return BLKPREP_OK; 461 return true;
463} 462}
464 463
465ide_devset_get(multcount, mult_count); 464ide_devset_get(multcount, mult_count);
@@ -548,7 +547,7 @@ static void update_flush(ide_drive_t *drive)
548 547
549 if (barrier) { 548 if (barrier) {
550 wc = true; 549 wc = true;
551 blk_queue_prep_rq(drive->queue, idedisk_prep_fn); 550 drive->prep_rq = idedisk_prep_rq;
552 } 551 }
553 } 552 }
554 553
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 47d5f3379748..e1323e058454 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -125,7 +125,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
125 /* retry only "normal" I/O: */ 125 /* retry only "normal" I/O: */
126 if (blk_rq_is_passthrough(rq)) { 126 if (blk_rq_is_passthrough(rq)) {
127 if (ata_taskfile_request(rq)) { 127 if (ata_taskfile_request(rq)) {
128 struct ide_cmd *cmd = rq->special; 128 struct ide_cmd *cmd = ide_req(rq)->special;
129 129
130 if (cmd) 130 if (cmd)
131 ide_complete_cmd(drive, cmd, stat, err); 131 ide_complete_cmd(drive, cmd, stat, err);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index a8df300f949c..780d33ccc5d8 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -276,7 +276,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
276 switch (ide_req(rq)->type) { 276 switch (ide_req(rq)->type) {
277 case ATA_PRIV_MISC: 277 case ATA_PRIV_MISC:
278 case ATA_PRIV_SENSE: 278 case ATA_PRIV_SENSE:
279 pc = (struct ide_atapi_pc *)rq->special; 279 pc = (struct ide_atapi_pc *)ide_req(rq)->special;
280 break; 280 break;
281 default: 281 default:
282 BUG(); 282 BUG();
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 0d93e0cfbeaf..8445b484ae69 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -67,7 +67,15 @@ int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error,
67 ide_dma_on(drive); 67 ide_dma_on(drive);
68 } 68 }
69 69
70 return blk_end_request(rq, error, nr_bytes); 70 if (!blk_update_request(rq, error, nr_bytes)) {
71 if (rq == drive->sense_rq)
72 drive->sense_rq = NULL;
73
74 __blk_mq_end_request(rq, error);
75 return 0;
76 }
77
78 return 1;
71} 79}
72EXPORT_SYMBOL_GPL(ide_end_rq); 80EXPORT_SYMBOL_GPL(ide_end_rq);
73 81
@@ -103,7 +111,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
103 } 111 }
104 112
105 if (rq && ata_taskfile_request(rq)) { 113 if (rq && ata_taskfile_request(rq)) {
106 struct ide_cmd *orig_cmd = rq->special; 114 struct ide_cmd *orig_cmd = ide_req(rq)->special;
107 115
108 if (cmd->tf_flags & IDE_TFLAG_DYN) 116 if (cmd->tf_flags & IDE_TFLAG_DYN)
109 kfree(orig_cmd); 117 kfree(orig_cmd);
@@ -253,7 +261,7 @@ EXPORT_SYMBOL_GPL(ide_init_sg_cmd);
253static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, 261static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
254 struct request *rq) 262 struct request *rq)
255{ 263{
256 struct ide_cmd *cmd = rq->special; 264 struct ide_cmd *cmd = ide_req(rq)->special;
257 265
258 if (cmd) { 266 if (cmd) {
259 if (cmd->protocol == ATA_PROT_PIO) { 267 if (cmd->protocol == ATA_PROT_PIO) {
@@ -307,8 +315,6 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
307{ 315{
308 ide_startstop_t startstop; 316 ide_startstop_t startstop;
309 317
310 BUG_ON(!(rq->rq_flags & RQF_STARTED));
311
312#ifdef DEBUG 318#ifdef DEBUG
313 printk("%s: start_request: current=0x%08lx\n", 319 printk("%s: start_request: current=0x%08lx\n",
314 drive->hwif->name, (unsigned long) rq); 320 drive->hwif->name, (unsigned long) rq);
@@ -320,6 +326,9 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
320 goto kill_rq; 326 goto kill_rq;
321 } 327 }
322 328
329 if (drive->prep_rq && !drive->prep_rq(drive, rq))
330 return ide_stopped;
331
323 if (ata_pm_request(rq)) 332 if (ata_pm_request(rq))
324 ide_check_pm_state(drive, rq); 333 ide_check_pm_state(drive, rq);
325 334
@@ -343,7 +352,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
343 if (ata_taskfile_request(rq)) 352 if (ata_taskfile_request(rq))
344 return execute_drive_cmd(drive, rq); 353 return execute_drive_cmd(drive, rq);
345 else if (ata_pm_request(rq)) { 354 else if (ata_pm_request(rq)) {
346 struct ide_pm_state *pm = rq->special; 355 struct ide_pm_state *pm = ide_req(rq)->special;
347#ifdef DEBUG_PM 356#ifdef DEBUG_PM
348 printk("%s: start_power_step(step: %d)\n", 357 printk("%s: start_power_step(step: %d)\n",
349 drive->name, pm->pm_step); 358 drive->name, pm->pm_step);
@@ -430,44 +439,42 @@ static inline void ide_unlock_host(struct ide_host *host)
430 } 439 }
431} 440}
432 441
433static void __ide_requeue_and_plug(struct request_queue *q, struct request *rq)
434{
435 if (rq)
436 blk_requeue_request(q, rq);
437 if (rq || blk_peek_request(q)) {
438 /* Use 3ms as that was the old plug delay */
439 blk_delay_queue(q, 3);
440 }
441}
442
443void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq) 442void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
444{ 443{
445 struct request_queue *q = drive->queue; 444 struct request_queue *q = drive->queue;
446 unsigned long flags;
447 445
448 spin_lock_irqsave(q->queue_lock, flags); 446 /* Use 3ms as that was the old plug delay */
449 __ide_requeue_and_plug(q, rq); 447 if (rq) {
450 spin_unlock_irqrestore(q->queue_lock, flags); 448 blk_mq_requeue_request(rq, false);
449 blk_mq_delay_kick_requeue_list(q, 3);
450 } else
451 blk_mq_delay_run_hw_queue(q->queue_hw_ctx[0], 3);
451} 452}
452 453
453/* 454/*
454 * Issue a new request to a device. 455 * Issue a new request to a device.
455 */ 456 */
456void do_ide_request(struct request_queue *q) 457blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx,
458 const struct blk_mq_queue_data *bd)
457{ 459{
458 ide_drive_t *drive = q->queuedata; 460 ide_drive_t *drive = hctx->queue->queuedata;
459 ide_hwif_t *hwif = drive->hwif; 461 ide_hwif_t *hwif = drive->hwif;
460 struct ide_host *host = hwif->host; 462 struct ide_host *host = hwif->host;
461 struct request *rq = NULL; 463 struct request *rq = bd->rq;
462 ide_startstop_t startstop; 464 ide_startstop_t startstop;
463 465
464 spin_unlock_irq(q->queue_lock); 466 if (!blk_rq_is_passthrough(rq) && !(rq->rq_flags & RQF_DONTPREP)) {
467 rq->rq_flags |= RQF_DONTPREP;
468 ide_req(rq)->special = NULL;
469 }
465 470
466 /* HLD do_request() callback might sleep, make sure it's okay */ 471 /* HLD do_request() callback might sleep, make sure it's okay */
467 might_sleep(); 472 might_sleep();
468 473
469 if (ide_lock_host(host, hwif)) 474 if (ide_lock_host(host, hwif))
470 goto plug_device_2; 475 return BLK_STS_DEV_RESOURCE;
476
477 blk_mq_start_request(rq);
471 478
472 spin_lock_irq(&hwif->lock); 479 spin_lock_irq(&hwif->lock);
473 480
@@ -503,21 +510,16 @@ repeat:
503 hwif->cur_dev = drive; 510 hwif->cur_dev = drive;
504 drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED); 511 drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED);
505 512
506 spin_unlock_irq(&hwif->lock);
507 spin_lock_irq(q->queue_lock);
508 /* 513 /*
509 * we know that the queue isn't empty, but this can happen 514 * we know that the queue isn't empty, but this can happen
510 * if the q->prep_rq_fn() decides to kill a request 515 * if ->prep_rq() decides to kill a request
511 */ 516 */
512 if (!rq)
513 rq = blk_fetch_request(drive->queue);
514
515 spin_unlock_irq(q->queue_lock);
516 spin_lock_irq(&hwif->lock);
517
518 if (!rq) { 517 if (!rq) {
519 ide_unlock_port(hwif); 518 rq = bd->rq;
520 goto out; 519 if (!rq) {
520 ide_unlock_port(hwif);
521 goto out;
522 }
521 } 523 }
522 524
523 /* 525 /*
@@ -551,23 +553,24 @@ repeat:
551 if (startstop == ide_stopped) { 553 if (startstop == ide_stopped) {
552 rq = hwif->rq; 554 rq = hwif->rq;
553 hwif->rq = NULL; 555 hwif->rq = NULL;
554 goto repeat; 556 if (rq)
557 goto repeat;
558 ide_unlock_port(hwif);
559 goto out;
555 } 560 }
556 } else 561 } else {
557 goto plug_device; 562plug_device:
563 spin_unlock_irq(&hwif->lock);
564 ide_unlock_host(host);
565 ide_requeue_and_plug(drive, rq);
566 return BLK_STS_OK;
567 }
568
558out: 569out:
559 spin_unlock_irq(&hwif->lock); 570 spin_unlock_irq(&hwif->lock);
560 if (rq == NULL) 571 if (rq == NULL)
561 ide_unlock_host(host); 572 ide_unlock_host(host);
562 spin_lock_irq(q->queue_lock); 573 return BLK_STS_OK;
563 return;
564
565plug_device:
566 spin_unlock_irq(&hwif->lock);
567 ide_unlock_host(host);
568plug_device_2:
569 spin_lock_irq(q->queue_lock);
570 __ide_requeue_and_plug(q, rq);
571} 574}
572 575
573static int drive_is_ready(ide_drive_t *drive) 576static int drive_is_ready(ide_drive_t *drive)
@@ -887,3 +890,16 @@ void ide_pad_transfer(ide_drive_t *drive, int write, int len)
887 } 890 }
888} 891}
889EXPORT_SYMBOL_GPL(ide_pad_transfer); 892EXPORT_SYMBOL_GPL(ide_pad_transfer);
893
894void ide_insert_request_head(ide_drive_t *drive, struct request *rq)
895{
896 ide_hwif_t *hwif = drive->hwif;
897 unsigned long flags;
898
899 spin_lock_irqsave(&hwif->lock, flags);
900 list_add_tail(&rq->queuelist, &drive->rq_list);
901 spin_unlock_irqrestore(&hwif->lock, flags);
902
903 kblockd_schedule_work(&drive->rq_work);
904}
905EXPORT_SYMBOL_GPL(ide_insert_request_head);
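
ide_requeue_and_plug() above is the blk-mq replacement for the old requeue-and-delay idiom: a request that could not be started goes back through the requeue list and the list is kicked after a short delay, while "nothing to requeue, just try again later" becomes a delayed run of the (single) hardware queue. The same shape in isolation:

        static void requeue_or_poke(struct request_queue *q, struct request *rq,
                                    unsigned long delay_ms)
        {
                if (rq) {
                        blk_mq_requeue_request(rq, false);              /* park it ... */
                        blk_mq_delay_kick_requeue_list(q, delay_ms);    /* ... re-dispatch later */
                } else {
                        blk_mq_delay_run_hw_queue(q->queue_hw_ctx[0], delay_ms);
                }
        }
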
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 622f0edb3945..102aa3bc3e7f 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -27,7 +27,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
27 spin_unlock_irq(&hwif->lock); 27 spin_unlock_irq(&hwif->lock);
28 28
29 if (start_queue) 29 if (start_queue)
30 blk_run_queue(q); 30 blk_mq_run_hw_queues(q, true);
31 return; 31 return;
32 } 32 }
33 spin_unlock_irq(&hwif->lock); 33 spin_unlock_irq(&hwif->lock);
@@ -36,7 +36,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
36 scsi_req(rq)->cmd[0] = REQ_PARK_HEADS; 36 scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
37 scsi_req(rq)->cmd_len = 1; 37 scsi_req(rq)->cmd_len = 1;
38 ide_req(rq)->type = ATA_PRIV_MISC; 38 ide_req(rq)->type = ATA_PRIV_MISC;
39 rq->special = &timeout; 39 ide_req(rq)->special = &timeout;
40 blk_execute_rq(q, NULL, rq, 1); 40 blk_execute_rq(q, NULL, rq, 1);
41 rc = scsi_req(rq)->result ? -EIO : 0; 41 rc = scsi_req(rq)->result ? -EIO : 0;
42 blk_put_request(rq); 42 blk_put_request(rq);
@@ -54,7 +54,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
54 scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS; 54 scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS;
55 scsi_req(rq)->cmd_len = 1; 55 scsi_req(rq)->cmd_len = 1;
56 ide_req(rq)->type = ATA_PRIV_MISC; 56 ide_req(rq)->type = ATA_PRIV_MISC;
57 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); 57 ide_insert_request_head(drive, rq);
58 58
59out: 59out:
60 return; 60 return;
@@ -67,7 +67,7 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq)
67 67
68 memset(&cmd, 0, sizeof(cmd)); 68 memset(&cmd, 0, sizeof(cmd));
69 if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) { 69 if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) {
70 drive->sleep = *(unsigned long *)rq->special; 70 drive->sleep = *(unsigned long *)ide_req(rq)->special;
71 drive->dev_flags |= IDE_DFLAG_SLEEPING; 71 drive->dev_flags |= IDE_DFLAG_SLEEPING;
72 tf->command = ATA_CMD_IDLEIMMEDIATE; 72 tf->command = ATA_CMD_IDLEIMMEDIATE;
73 tf->feature = 0x44; 73 tf->feature = 0x44;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 59217aa1d1fb..192e6c65d34e 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -21,7 +21,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
21 memset(&rqpm, 0, sizeof(rqpm)); 21 memset(&rqpm, 0, sizeof(rqpm));
22 rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0); 22 rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
23 ide_req(rq)->type = ATA_PRIV_PM_SUSPEND; 23 ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
24 rq->special = &rqpm; 24 ide_req(rq)->special = &rqpm;
25 rqpm.pm_step = IDE_PM_START_SUSPEND; 25 rqpm.pm_step = IDE_PM_START_SUSPEND;
26 if (mesg.event == PM_EVENT_PRETHAW) 26 if (mesg.event == PM_EVENT_PRETHAW)
27 mesg.event = PM_EVENT_FREEZE; 27 mesg.event = PM_EVENT_FREEZE;
@@ -40,32 +40,17 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
40 return ret; 40 return ret;
41} 41}
42 42
43static void ide_end_sync_rq(struct request *rq, blk_status_t error)
44{
45 complete(rq->end_io_data);
46}
47
48static int ide_pm_execute_rq(struct request *rq) 43static int ide_pm_execute_rq(struct request *rq)
49{ 44{
50 struct request_queue *q = rq->q; 45 struct request_queue *q = rq->q;
51 DECLARE_COMPLETION_ONSTACK(wait);
52 46
53 rq->end_io_data = &wait;
54 rq->end_io = ide_end_sync_rq;
55
56 spin_lock_irq(q->queue_lock);
57 if (unlikely(blk_queue_dying(q))) { 47 if (unlikely(blk_queue_dying(q))) {
58 rq->rq_flags |= RQF_QUIET; 48 rq->rq_flags |= RQF_QUIET;
59 scsi_req(rq)->result = -ENXIO; 49 scsi_req(rq)->result = -ENXIO;
60 __blk_end_request_all(rq, BLK_STS_OK); 50 blk_mq_end_request(rq, BLK_STS_OK);
61 spin_unlock_irq(q->queue_lock);
62 return -ENXIO; 51 return -ENXIO;
63 } 52 }
64 __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); 53 blk_execute_rq(q, NULL, rq, true);
65 __blk_run_queue_uncond(q);
66 spin_unlock_irq(q->queue_lock);
67
68 wait_for_completion_io(&wait);
69 54
70 return scsi_req(rq)->result ? -EIO : 0; 55 return scsi_req(rq)->result ? -EIO : 0;
71} 56}
@@ -79,6 +64,8 @@ int generic_ide_resume(struct device *dev)
79 struct ide_pm_state rqpm; 64 struct ide_pm_state rqpm;
80 int err; 65 int err;
81 66
67 blk_mq_start_stopped_hw_queues(drive->queue, true);
68
82 if (ide_port_acpi(hwif)) { 69 if (ide_port_acpi(hwif)) {
83 /* call ACPI _PS0 / _STM only once */ 70 /* call ACPI _PS0 / _STM only once */
84 if ((drive->dn & 1) == 0 || pair == NULL) { 71 if ((drive->dn & 1) == 0 || pair == NULL) {
@@ -92,7 +79,7 @@ int generic_ide_resume(struct device *dev)
92 memset(&rqpm, 0, sizeof(rqpm)); 79 memset(&rqpm, 0, sizeof(rqpm));
93 rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT); 80 rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
94 ide_req(rq)->type = ATA_PRIV_PM_RESUME; 81 ide_req(rq)->type = ATA_PRIV_PM_RESUME;
95 rq->special = &rqpm; 82 ide_req(rq)->special = &rqpm;
96 rqpm.pm_step = IDE_PM_START_RESUME; 83 rqpm.pm_step = IDE_PM_START_RESUME;
97 rqpm.pm_state = PM_EVENT_ON; 84 rqpm.pm_state = PM_EVENT_ON;
98 85
@@ -111,7 +98,7 @@ int generic_ide_resume(struct device *dev)
111 98
112void ide_complete_power_step(ide_drive_t *drive, struct request *rq) 99void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
113{ 100{
114 struct ide_pm_state *pm = rq->special; 101 struct ide_pm_state *pm = ide_req(rq)->special;
115 102
116#ifdef DEBUG_PM 103#ifdef DEBUG_PM
117 printk(KERN_INFO "%s: complete_power_step(step: %d)\n", 104 printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -141,7 +128,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
141 128
142ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) 129ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
143{ 130{
144 struct ide_pm_state *pm = rq->special; 131 struct ide_pm_state *pm = ide_req(rq)->special;
145 struct ide_cmd cmd = { }; 132 struct ide_cmd cmd = { };
146 133
147 switch (pm->pm_step) { 134 switch (pm->pm_step) {
@@ -213,8 +200,7 @@ out_do_tf:
213void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) 200void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
214{ 201{
215 struct request_queue *q = drive->queue; 202 struct request_queue *q = drive->queue;
216 struct ide_pm_state *pm = rq->special; 203 struct ide_pm_state *pm = ide_req(rq)->special;
217 unsigned long flags;
218 204
219 ide_complete_power_step(drive, rq); 205 ide_complete_power_step(drive, rq);
220 if (pm->pm_step != IDE_PM_COMPLETED) 206 if (pm->pm_step != IDE_PM_COMPLETED)
@@ -224,22 +210,19 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
224 printk("%s: completing PM request, %s\n", drive->name, 210 printk("%s: completing PM request, %s\n", drive->name,
225 (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume"); 211 (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume");
226#endif 212#endif
227 spin_lock_irqsave(q->queue_lock, flags);
228 if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) 213 if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND)
229 blk_stop_queue(q); 214 blk_mq_stop_hw_queues(q);
230 else 215 else
231 drive->dev_flags &= ~IDE_DFLAG_BLOCKED; 216 drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
232 spin_unlock_irqrestore(q->queue_lock, flags);
233 217
234 drive->hwif->rq = NULL; 218 drive->hwif->rq = NULL;
235 219
236 if (blk_end_request(rq, BLK_STS_OK, 0)) 220 blk_mq_end_request(rq, BLK_STS_OK);
237 BUG();
238} 221}
239 222
240void ide_check_pm_state(ide_drive_t *drive, struct request *rq) 223void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
241{ 224{
242 struct ide_pm_state *pm = rq->special; 225 struct ide_pm_state *pm = ide_req(rq)->special;
243 226
244 if (blk_rq_is_private(rq) && 227 if (blk_rq_is_private(rq) &&
245 ide_req(rq)->type == ATA_PRIV_PM_SUSPEND && 228 ide_req(rq)->type == ATA_PRIV_PM_SUSPEND &&
@@ -260,7 +243,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
260 ide_hwif_t *hwif = drive->hwif; 243 ide_hwif_t *hwif = drive->hwif;
261 const struct ide_tp_ops *tp_ops = hwif->tp_ops; 244 const struct ide_tp_ops *tp_ops = hwif->tp_ops;
262 struct request_queue *q = drive->queue; 245 struct request_queue *q = drive->queue;
263 unsigned long flags;
264 int rc; 246 int rc;
265#ifdef DEBUG_PM 247#ifdef DEBUG_PM
266 printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name); 248 printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
@@ -274,8 +256,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
274 if (rc) 256 if (rc)
275 printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name); 257 printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
276 258
277 spin_lock_irqsave(q->queue_lock, flags); 259 blk_mq_start_hw_queues(q);
278 blk_start_queue(q);
279 spin_unlock_irqrestore(q->queue_lock, flags);
280 } 260 }
281} 261}
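
The hand-rolled end_io_data/completion dance in ide_pm_execute_rq() disappears because blk_execute_rq() is already synchronous on blk-mq queues; the suspend/resume state machine then only needs blk_mq_stop_hw_queues() and blk_mq_start_stopped_hw_queues() around it. A minimal synchronous passthrough issue in that style (assuming a queue that accepts REQ_OP_DRV_IN requests, as the IDE queues here do; type setup omitted):

        static int issue_pm_rq(struct request_queue *q, void *pm_state)
        {
                struct request *rq;
                int ret;

                rq = blk_get_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
                if (IS_ERR(rq))
                        return PTR_ERR(rq);

                ide_req(rq)->special = pm_state;        /* driver-private payload */
                blk_execute_rq(q, NULL, rq, true);      /* at_head, blocks until done */
                ret = scsi_req(rq)->result ? -EIO : 0;

                blk_put_request(rq);
                return ret;
        }
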
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 3b75a7b7a284..63627be0811a 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -746,10 +746,16 @@ static void ide_initialize_rq(struct request *rq)
746{ 746{
747 struct ide_request *req = blk_mq_rq_to_pdu(rq); 747 struct ide_request *req = blk_mq_rq_to_pdu(rq);
748 748
749 req->special = NULL;
749 scsi_req_init(&req->sreq); 750 scsi_req_init(&req->sreq);
750 req->sreq.sense = req->sense; 751 req->sreq.sense = req->sense;
751} 752}
752 753
754static const struct blk_mq_ops ide_mq_ops = {
755 .queue_rq = ide_queue_rq,
756 .initialize_rq_fn = ide_initialize_rq,
757};
758
753/* 759/*
754 * init request queue 760 * init request queue
755 */ 761 */
@@ -759,6 +765,7 @@ static int ide_init_queue(ide_drive_t *drive)
759 ide_hwif_t *hwif = drive->hwif; 765 ide_hwif_t *hwif = drive->hwif;
760 int max_sectors = 256; 766 int max_sectors = 256;
761 int max_sg_entries = PRD_ENTRIES; 767 int max_sg_entries = PRD_ENTRIES;
768 struct blk_mq_tag_set *set;
762 769
763 /* 770 /*
764 * Our default set up assumes the normal IDE case, 771 * Our default set up assumes the normal IDE case,
@@ -767,19 +774,26 @@ static int ide_init_queue(ide_drive_t *drive)
767 * limits and LBA48 we could raise it but as yet 774 * limits and LBA48 we could raise it but as yet
768 * do not. 775 * do not.
769 */ 776 */
770 q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif), NULL); 777
771 if (!q) 778 set = &drive->tag_set;
779 set->ops = &ide_mq_ops;
780 set->nr_hw_queues = 1;
781 set->queue_depth = 32;
782 set->reserved_tags = 1;
783 set->cmd_size = sizeof(struct ide_request);
784 set->numa_node = hwif_to_node(hwif);
785 set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
786 if (blk_mq_alloc_tag_set(set))
772 return 1; 787 return 1;
773 788
774 q->request_fn = do_ide_request; 789 q = blk_mq_init_queue(set);
775 q->initialize_rq_fn = ide_initialize_rq; 790 if (IS_ERR(q)) {
776 q->cmd_size = sizeof(struct ide_request); 791 blk_mq_free_tag_set(set);
777 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
778 if (blk_init_allocated_queue(q) < 0) {
779 blk_cleanup_queue(q);
780 return 1; 792 return 1;
781 } 793 }
782 794
795 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
796
783 q->queuedata = drive; 797 q->queuedata = drive;
784 blk_queue_segment_boundary(q, 0xffff); 798 blk_queue_segment_boundary(q, 0xffff);
785 799
@@ -965,8 +979,12 @@ static void drive_release_dev (struct device *dev)
965 979
966 ide_proc_unregister_device(drive); 980 ide_proc_unregister_device(drive);
967 981
982 if (drive->sense_rq)
983 blk_mq_free_request(drive->sense_rq);
984
968 blk_cleanup_queue(drive->queue); 985 blk_cleanup_queue(drive->queue);
969 drive->queue = NULL; 986 drive->queue = NULL;
987 blk_mq_free_tag_set(&drive->tag_set);
970 988
971 drive->dev_flags &= ~IDE_DFLAG_PRESENT; 989 drive->dev_flags &= ~IDE_DFLAG_PRESENT;
972 990
@@ -1133,6 +1151,28 @@ static void ide_port_cable_detect(ide_hwif_t *hwif)
1133 } 1151 }
1134} 1152}
1135 1153
1154/*
1155 * Deferred request list insertion handler
1156 */
1157static void drive_rq_insert_work(struct work_struct *work)
1158{
1159 ide_drive_t *drive = container_of(work, ide_drive_t, rq_work);
1160 ide_hwif_t *hwif = drive->hwif;
1161 struct request *rq;
1162 LIST_HEAD(list);
1163
1164 spin_lock_irq(&hwif->lock);
1165 if (!list_empty(&drive->rq_list))
1166 list_splice_init(&drive->rq_list, &list);
1167 spin_unlock_irq(&hwif->lock);
1168
1169 while (!list_empty(&list)) {
1170 rq = list_first_entry(&list, struct request, queuelist);
1171 list_del_init(&rq->queuelist);
1172 blk_execute_rq_nowait(drive->queue, rq->rq_disk, rq, true, NULL);
1173 }
1174}
1175
1136static const u8 ide_hwif_to_major[] = 1176static const u8 ide_hwif_to_major[] =
1137 { IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR, 1177 { IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR,
1138 IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR }; 1178 IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR };
@@ -1145,12 +1185,10 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif)
1145 ide_port_for_each_dev(i, drive, hwif) { 1185 ide_port_for_each_dev(i, drive, hwif) {
1146 u8 j = (hwif->index * MAX_DRIVES) + i; 1186 u8 j = (hwif->index * MAX_DRIVES) + i;
1147 u16 *saved_id = drive->id; 1187 u16 *saved_id = drive->id;
1148 struct request *saved_sense_rq = drive->sense_rq;
1149 1188
1150 memset(drive, 0, sizeof(*drive)); 1189 memset(drive, 0, sizeof(*drive));
1151 memset(saved_id, 0, SECTOR_SIZE); 1190 memset(saved_id, 0, SECTOR_SIZE);
1152 drive->id = saved_id; 1191 drive->id = saved_id;
1153 drive->sense_rq = saved_sense_rq;
1154 1192
1155 drive->media = ide_disk; 1193 drive->media = ide_disk;
1156 drive->select = (i << 4) | ATA_DEVICE_OBS; 1194 drive->select = (i << 4) | ATA_DEVICE_OBS;
@@ -1166,6 +1204,9 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif)
1166 1204
1167 INIT_LIST_HEAD(&drive->list); 1205 INIT_LIST_HEAD(&drive->list);
1168 init_completion(&drive->gendev_rel_comp); 1206 init_completion(&drive->gendev_rel_comp);
1207
1208 INIT_WORK(&drive->rq_work, drive_rq_insert_work);
1209 INIT_LIST_HEAD(&drive->rq_list);
1169 } 1210 }
1170} 1211}
1171 1212
@@ -1255,7 +1296,6 @@ static void ide_port_free_devices(ide_hwif_t *hwif)
1255 int i; 1296 int i;
1256 1297
1257 ide_port_for_each_dev(i, drive, hwif) { 1298 ide_port_for_each_dev(i, drive, hwif) {
1258 kfree(drive->sense_rq);
1259 kfree(drive->id); 1299 kfree(drive->id);
1260 kfree(drive); 1300 kfree(drive);
1261 } 1301 }
@@ -1283,17 +1323,10 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node)
1283 if (drive->id == NULL) 1323 if (drive->id == NULL)
1284 goto out_free_drive; 1324 goto out_free_drive;
1285 1325
1286 drive->sense_rq = kmalloc(sizeof(struct request) +
1287 sizeof(struct ide_request), GFP_KERNEL);
1288 if (!drive->sense_rq)
1289 goto out_free_id;
1290
1291 hwif->devices[i] = drive; 1326 hwif->devices[i] = drive;
1292 } 1327 }
1293 return 0; 1328 return 0;
1294 1329
1295out_free_id:
1296 kfree(drive->id);
1297out_free_drive: 1330out_free_drive:
1298 kfree(drive); 1331 kfree(drive);
1299out_nomem: 1332out_nomem:
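
Front-of-queue insertion (the old elv_add_request(..., ELEVATOR_INSERT_FRONT)) has no direct equivalent callable from the IDE completion paths, so the series parks such requests on a per-drive list and lets a kblockd work item hand them to blk-mq; the producer sits in ide-io.c (ide_insert_request_head) and the consumer is drive_rq_insert_work() above. Folded together, the pattern is:

        /* producer: callable from atomic context */
        static void queue_head_rq(ide_drive_t *drive, struct request *rq)
        {
                unsigned long flags;

                spin_lock_irqsave(&drive->hwif->lock, flags);
                list_add_tail(&rq->queuelist, &drive->rq_list);
                spin_unlock_irqrestore(&drive->hwif->lock, flags);

                kblockd_schedule_work(&drive->rq_work);
        }

        /* work item: process context, safe to call into blk-mq */
        static void head_rq_work(struct work_struct *work)
        {
                ide_drive_t *drive = container_of(work, ide_drive_t, rq_work);
                LIST_HEAD(list);

                spin_lock_irq(&drive->hwif->lock);
                list_splice_init(&drive->rq_list, &list);
                spin_unlock_irq(&drive->hwif->lock);

                while (!list_empty(&list)) {
                        struct request *rq = list_first_entry(&list, struct request,
                                                              queuelist);

                        list_del_init(&rq->queuelist);
                        blk_execute_rq_nowait(drive->queue, rq->rq_disk, rq,
                                              true, NULL);      /* at_head, no done callback */
                }
        }
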
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 34c1165226a4..db1a65f4b490 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -639,7 +639,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
639 goto out; 639 goto out;
640 } 640 }
641 if (req->cmd[13] & REQ_IDETAPE_PC1) { 641 if (req->cmd[13] & REQ_IDETAPE_PC1) {
642 pc = (struct ide_atapi_pc *)rq->special; 642 pc = (struct ide_atapi_pc *)ide_req(rq)->special;
643 req->cmd[13] &= ~(REQ_IDETAPE_PC1); 643 req->cmd[13] &= ~(REQ_IDETAPE_PC1);
644 req->cmd[13] |= REQ_IDETAPE_PC2; 644 req->cmd[13] |= REQ_IDETAPE_PC2;
645 goto out; 645 goto out;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index c21d5c50ae3a..17b2e379e872 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -440,7 +440,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
440 goto put_req; 440 goto put_req;
441 } 441 }
442 442
443 rq->special = cmd; 443 ide_req(rq)->special = cmd;
444 cmd->rq = rq; 444 cmd->rq = rq;
445 445
446 blk_execute_rq(drive->queue, NULL, rq, 0); 446 blk_execute_rq(drive->queue, NULL, rq, 0);
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index efb976a863d2..5f82036fe322 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -389,7 +389,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
389 goto err_dev; 389 goto err_dev;
390 } 390 }
391 391
392 tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL); 392 tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
393 if (!tqueue) { 393 if (!tqueue) {
394 ret = -ENOMEM; 394 ret = -ENOMEM;
395 goto err_disk; 395 goto err_disk;
@@ -974,7 +974,7 @@ static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba,
974 struct ppa_addr ppa; 974 struct ppa_addr ppa;
975 u8 *blks; 975 u8 *blks;
976 int ch, lun, nr_blks; 976 int ch, lun, nr_blks;
977 int ret; 977 int ret = 0;
978 978
979 ppa.ppa = slba; 979 ppa.ppa = slba;
980 ppa = dev_to_generic_addr(dev, ppa); 980 ppa = dev_to_generic_addr(dev, ppa);
@@ -1140,30 +1140,33 @@ EXPORT_SYMBOL(nvm_alloc_dev);
1140 1140
1141int nvm_register(struct nvm_dev *dev) 1141int nvm_register(struct nvm_dev *dev)
1142{ 1142{
1143 int ret; 1143 int ret, exp_pool_size;
1144 1144
1145 if (!dev->q || !dev->ops) 1145 if (!dev->q || !dev->ops)
1146 return -EINVAL; 1146 return -EINVAL;
1147 1147
1148 dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist"); 1148 ret = nvm_init(dev);
1149 if (ret)
1150 return ret;
1151
1152 exp_pool_size = max_t(int, PAGE_SIZE,
1153 (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos)));
1154 exp_pool_size = round_up(exp_pool_size, PAGE_SIZE);
1155
1156 dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist",
1157 exp_pool_size);
1149 if (!dev->dma_pool) { 1158 if (!dev->dma_pool) {
1150 pr_err("nvm: could not create dma pool\n"); 1159 pr_err("nvm: could not create dma pool\n");
1160 nvm_free(dev);
1151 return -ENOMEM; 1161 return -ENOMEM;
1152 } 1162 }
1153 1163
1154 ret = nvm_init(dev);
1155 if (ret)
1156 goto err_init;
1157
1158 /* register device with a supported media manager */ 1164 /* register device with a supported media manager */
1159 down_write(&nvm_lock); 1165 down_write(&nvm_lock);
1160 list_add(&dev->devices, &nvm_devices); 1166 list_add(&dev->devices, &nvm_devices);
1161 up_write(&nvm_lock); 1167 up_write(&nvm_lock);
1162 1168
1163 return 0; 1169 return 0;
1164err_init:
1165 dev->ops->destroy_dma_pool(dev->dma_pool);
1166 return ret;
1167} 1170}
1168EXPORT_SYMBOL(nvm_register); 1171EXPORT_SYMBOL(nvm_register);
1169 1172
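
nvm_init() now runs before the DMA pool is created so that dev->geo.sos (the per-sector OOB metadata size) is known when sizing the pool; the element size is simply the worst-case ppa list plus packed metadata, capped below at one page:

        exp_pool_size = max_t(int, PAGE_SIZE,
                                (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos)));
        exp_pool_size = round_up(exp_pool_size, PAGE_SIZE);

As a worked example (values assumed, not taken from this diff): with NVM_MAX_VLBA = 64 and sos = 16, the product is 64 * (8 + 16) = 1536 bytes, so both the max_t() and the round_up() land on PAGE_SIZE (4096 on common configurations).
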
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 6944aac43b01..1ff165351180 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -250,8 +250,8 @@ int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd)
250 if (rqd->nr_ppas == 1) 250 if (rqd->nr_ppas == 1)
251 return 0; 251 return 0;
252 252
253 rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; 253 rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk);
254 rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; 254 rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk);
255 255
256 return 0; 256 return 0;
257} 257}
@@ -376,7 +376,7 @@ void pblk_write_should_kick(struct pblk *pblk)
376{ 376{
377 unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); 377 unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb);
378 378
379 if (secs_avail >= pblk->min_write_pgs) 379 if (secs_avail >= pblk->min_write_pgs_data)
380 pblk_write_kick(pblk); 380 pblk_write_kick(pblk);
381} 381}
382 382
@@ -407,7 +407,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
407 struct pblk_line_meta *lm = &pblk->lm; 407 struct pblk_line_meta *lm = &pblk->lm;
408 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 408 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
409 struct list_head *move_list = NULL; 409 struct list_head *move_list = NULL;
410 int vsc = le32_to_cpu(*line->vsc); 410 int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
411 * (pblk->min_write_pgs - pblk->min_write_pgs_data);
412 int vsc = le32_to_cpu(*line->vsc) + packed_meta;
411 413
412 lockdep_assert_held(&line->lock); 414 lockdep_assert_held(&line->lock);
413 415
@@ -531,7 +533,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
531 if (caddr == 0) 533 if (caddr == 0)
532 trace_pblk_chunk_state(pblk_disk_name(pblk), 534 trace_pblk_chunk_state(pblk_disk_name(pblk),
533 ppa, NVM_CHK_ST_OPEN); 535 ppa, NVM_CHK_ST_OPEN);
534 else if (caddr == chunk->cnlb) 536 else if (caddr == (chunk->cnlb - 1))
535 trace_pblk_chunk_state(pblk_disk_name(pblk), 537 trace_pblk_chunk_state(pblk_disk_name(pblk),
536 ppa, NVM_CHK_ST_CLOSED); 538 ppa, NVM_CHK_ST_CLOSED);
537 } 539 }
@@ -620,12 +622,15 @@ out:
620} 622}
621 623
622int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, 624int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
623 unsigned long secs_to_flush) 625 unsigned long secs_to_flush, bool skip_meta)
624{ 626{
625 int max = pblk->sec_per_write; 627 int max = pblk->sec_per_write;
626 int min = pblk->min_write_pgs; 628 int min = pblk->min_write_pgs;
627 int secs_to_sync = 0; 629 int secs_to_sync = 0;
628 630
631 if (skip_meta && pblk->min_write_pgs_data != pblk->min_write_pgs)
632 min = max = pblk->min_write_pgs_data;
633
629 if (secs_avail >= max) 634 if (secs_avail >= max)
630 secs_to_sync = max; 635 secs_to_sync = max;
631 else if (secs_avail >= min) 636 else if (secs_avail >= min)
@@ -796,10 +801,11 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
796 rqd.is_seq = 1; 801 rqd.is_seq = 1;
797 802
798 for (i = 0; i < lm->smeta_sec; i++, paddr++) { 803 for (i = 0; i < lm->smeta_sec; i++, paddr++) {
799 struct pblk_sec_meta *meta_list = rqd.meta_list; 804 struct pblk_sec_meta *meta = pblk_get_meta(pblk,
805 rqd.meta_list, i);
800 806
801 rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); 807 rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
802 meta_list[i].lba = lba_list[paddr] = addr_empty; 808 meta->lba = lba_list[paddr] = addr_empty;
803 } 809 }
804 810
805 ret = pblk_submit_io_sync_sem(pblk, &rqd); 811 ret = pblk_submit_io_sync_sem(pblk, &rqd);
@@ -845,13 +851,13 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
845 if (!meta_list) 851 if (!meta_list)
846 return -ENOMEM; 852 return -ENOMEM;
847 853
848 ppa_list = meta_list + pblk_dma_meta_size; 854 ppa_list = meta_list + pblk_dma_meta_size(pblk);
849 dma_ppa_list = dma_meta_list + pblk_dma_meta_size; 855 dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
850 856
851next_rq: 857next_rq:
852 memset(&rqd, 0, sizeof(struct nvm_rq)); 858 memset(&rqd, 0, sizeof(struct nvm_rq));
853 859
854 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); 860 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
855 rq_len = rq_ppas * geo->csecs; 861 rq_len = rq_ppas * geo->csecs;
856 862
857 bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len, 863 bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
@@ -1276,6 +1282,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
1276 return 0; 1282 return 0;
1277} 1283}
1278 1284
1285/* Line allocations in the recovery path are always single threaded */
1279int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) 1286int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
1280{ 1287{
1281 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1288 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1295,15 +1302,22 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
1295 1302
1296 ret = pblk_line_alloc_bitmaps(pblk, line); 1303 ret = pblk_line_alloc_bitmaps(pblk, line);
1297 if (ret) 1304 if (ret)
1298 return ret; 1305 goto fail;
1299 1306
1300 if (!pblk_line_init_bb(pblk, line, 0)) { 1307 if (!pblk_line_init_bb(pblk, line, 0)) {
1301 list_add(&line->list, &l_mg->free_list); 1308 ret = -EINTR;
1302 return -EINTR; 1309 goto fail;
1303 } 1310 }
1304 1311
1305 pblk_rl_free_lines_dec(&pblk->rl, line, true); 1312 pblk_rl_free_lines_dec(&pblk->rl, line, true);
1306 return 0; 1313 return 0;
1314
1315fail:
1316 spin_lock(&l_mg->free_lock);
1317 list_add(&line->list, &l_mg->free_list);
1318 spin_unlock(&l_mg->free_lock);
1319
1320 return ret;
1307} 1321}
1308 1322
1309void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) 1323void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
@@ -2160,3 +2174,38 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
2160 } 2174 }
2161 spin_unlock(&pblk->trans_lock); 2175 spin_unlock(&pblk->trans_lock);
2162} 2176}
2177
2178void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd)
2179{
2180 void *buffer;
2181
2182 if (pblk_is_oob_meta_supported(pblk)) {
2183 /* Just use OOB metadata buffer as always */
2184 buffer = rqd->meta_list;
2185 } else {
 2186		/* We need to reuse the last page of the request (packed
 2187		 * metadata) in a similar way to traditional OOB metadata
 2188		 */
2189 buffer = page_to_virt(
2190 rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
2191 }
2192
2193 return buffer;
2194}
2195
2196void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
2197{
2198 void *meta_list = rqd->meta_list;
2199 void *page;
2200 int i = 0;
2201
2202 if (pblk_is_oob_meta_supported(pblk))
2203 return;
2204
2205 page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
2206 /* We need to fill oob meta buffer with data from packed metadata */
2207 for (; i < rqd->nr_ppas; i++)
2208 memcpy(pblk_get_meta(pblk, meta_list, i),
2209 page + (i * sizeof(struct pblk_sec_meta)),
2210 sizeof(struct pblk_sec_meta));
2211}
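
[Editor's note] The two helpers added above decide where per-sector metadata lives: with OOB support it stays in rqd->meta_list, otherwise it is packed into the last page of the request's bio. The following stand-alone sketch (simplified types and sizes, not the kernel definitions) illustrates the copy that pblk_get_packed_meta() performs when unpacking that last page into a per-sector metadata array:

/* Illustrative sketch only: unpack a packed-metadata page into a
 * per-sector metadata array, mirroring what pblk_get_packed_meta() does.
 * The struct below is a simplified stand-in for struct pblk_sec_meta. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct sec_meta { uint64_t lba; };

static void unpack_meta(struct sec_meta *meta_list, const void *packed_page,
			int nr_ppas)
{
	int i;

	for (i = 0; i < nr_ppas; i++)
		memcpy(&meta_list[i],
		       (const char *)packed_page + i * sizeof(struct sec_meta),
		       sizeof(struct sec_meta));
}

int main(void)
{
	struct sec_meta packed[4] = { {1}, {2}, {3}, {4} }; /* "last bio page" */
	struct sec_meta meta[4];

	unpack_meta(meta, packed, 4);
	printf("lba[2] = %llu\n", (unsigned long long)meta[2].lba);
	return 0;
}
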
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 13822594647c..f9a3e47b6a93 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -207,9 +207,6 @@ static int pblk_rwb_init(struct pblk *pblk)
207 return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs); 207 return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs);
208} 208}
209 209
210/* Minimum pages needed within a lun */
211#define ADDR_POOL_SIZE 64
212
213static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, 210static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
214 struct nvm_addrf_12 *dst) 211 struct nvm_addrf_12 *dst)
215{ 212{
@@ -350,23 +347,19 @@ fail_destroy_ws:
350 347
351static int pblk_get_global_caches(void) 348static int pblk_get_global_caches(void)
352{ 349{
353 int ret; 350 int ret = 0;
354 351
355 mutex_lock(&pblk_caches.mutex); 352 mutex_lock(&pblk_caches.mutex);
356 353
357 if (kref_read(&pblk_caches.kref) > 0) { 354 if (kref_get_unless_zero(&pblk_caches.kref))
358 kref_get(&pblk_caches.kref); 355 goto out;
359 mutex_unlock(&pblk_caches.mutex);
360 return 0;
361 }
362 356
363 ret = pblk_create_global_caches(); 357 ret = pblk_create_global_caches();
364
365 if (!ret) 358 if (!ret)
366 kref_get(&pblk_caches.kref); 359 kref_init(&pblk_caches.kref);
367 360
361out:
368 mutex_unlock(&pblk_caches.mutex); 362 mutex_unlock(&pblk_caches.mutex);
369
370 return ret; 363 return ret;
371} 364}
372 365
@@ -406,12 +399,45 @@ static int pblk_core_init(struct pblk *pblk)
406 pblk->nr_flush_rst = 0; 399 pblk->nr_flush_rst = 0;
407 400
408 pblk->min_write_pgs = geo->ws_opt; 401 pblk->min_write_pgs = geo->ws_opt;
402 pblk->min_write_pgs_data = pblk->min_write_pgs;
409 max_write_ppas = pblk->min_write_pgs * geo->all_luns; 403 max_write_ppas = pblk->min_write_pgs * geo->all_luns;
410 pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); 404 pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
411 pblk->max_write_pgs = min_t(int, pblk->max_write_pgs, 405 pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
412 queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT)); 406 queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
413 pblk_set_sec_per_write(pblk, pblk->min_write_pgs); 407 pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
414 408
409 pblk->oob_meta_size = geo->sos;
410 if (!pblk_is_oob_meta_supported(pblk)) {
 411	/* For drives which do not have the OOB metadata feature,
 412	 * in order to support the recovery feature we need to use
 413	 * so-called packed metadata. Packed metadata stores
 414	 * the same information as OOB metadata (l2p table mapping),
 415	 * but in the form of a single page at the end of
 416	 * every write request.
 417	 */
418 if (pblk->min_write_pgs
419 * sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
 420	/* We want to keep all the packed metadata on a single
 421	 * page per write request. So we need to ensure that
 422	 * it will fit.
 423	 *
 424	 * This is more of a sanity check, since there is
 425	 * no device with such a big minimal write size
 426	 * (above 1 megabyte).
 427	 */
428 pblk_err(pblk, "Not supported min write size\n");
429 return -EINVAL;
430 }
 431	/* For the packed meta approach we make a simplification.
 432	 * On the read path we always issue requests whose size is
 433	 * equal to max_write_pgs, with all pages filled with
 434	 * user payload except the last page, which will be
 435	 * filled with packed metadata.
 436	 */
437 pblk->max_write_pgs = pblk->min_write_pgs;
438 pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
439 }
440
415 pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), 441 pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
416 GFP_KERNEL); 442 GFP_KERNEL);
417 if (!pblk->pad_dist) 443 if (!pblk->pad_dist)
@@ -635,40 +661,61 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
635 return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); 661 return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
636} 662}
637 663
638static void pblk_set_provision(struct pblk *pblk, long nr_free_blks) 664static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
639{ 665{
640 struct nvm_tgt_dev *dev = pblk->dev; 666 struct nvm_tgt_dev *dev = pblk->dev;
641 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 667 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
642 struct pblk_line_meta *lm = &pblk->lm; 668 struct pblk_line_meta *lm = &pblk->lm;
643 struct nvm_geo *geo = &dev->geo; 669 struct nvm_geo *geo = &dev->geo;
644 sector_t provisioned; 670 sector_t provisioned;
645 int sec_meta, blk_meta; 671 int sec_meta, blk_meta, clba;
672 int minimum;
646 673
647 if (geo->op == NVM_TARGET_DEFAULT_OP) 674 if (geo->op == NVM_TARGET_DEFAULT_OP)
648 pblk->op = PBLK_DEFAULT_OP; 675 pblk->op = PBLK_DEFAULT_OP;
649 else 676 else
650 pblk->op = geo->op; 677 pblk->op = geo->op;
651 678
652 provisioned = nr_free_blks; 679 minimum = pblk_get_min_chks(pblk);
680 provisioned = nr_free_chks;
653 provisioned *= (100 - pblk->op); 681 provisioned *= (100 - pblk->op);
654 sector_div(provisioned, 100); 682 sector_div(provisioned, 100);
655 683
656 pblk->op_blks = nr_free_blks - provisioned; 684 if ((nr_free_chks - provisioned) < minimum) {
685 if (geo->op != NVM_TARGET_DEFAULT_OP) {
686 pblk_err(pblk, "OP too small to create a sane instance\n");
687 return -EINTR;
688 }
689
690 /* If the user did not specify an OP value, and PBLK_DEFAULT_OP
691 * is not enough, calculate and set sane value
692 */
693
694 provisioned = nr_free_chks - minimum;
695 pblk->op = (100 * minimum) / nr_free_chks;
696 pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n",
697 pblk->op);
698 }
699
700 pblk->op_blks = nr_free_chks - provisioned;
657 701
658 /* Internally pblk manages all free blocks, but all calculations based 702 /* Internally pblk manages all free blocks, but all calculations based
659 * on user capacity consider only provisioned blocks 703 * on user capacity consider only provisioned blocks
660 */ 704 */
661 pblk->rl.total_blocks = nr_free_blks; 705 pblk->rl.total_blocks = nr_free_chks;
662 pblk->rl.nr_secs = nr_free_blks * geo->clba; 706 pblk->rl.nr_secs = nr_free_chks * geo->clba;
663 707
664 /* Consider sectors used for metadata */ 708 /* Consider sectors used for metadata */
665 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; 709 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
666 blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); 710 blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
667 711
668 pblk->capacity = (provisioned - blk_meta) * geo->clba; 712 clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data;
713 pblk->capacity = (provisioned - blk_meta) * clba;
669 714
670 atomic_set(&pblk->rl.free_blocks, nr_free_blks); 715 atomic_set(&pblk->rl.free_blocks, nr_free_chks);
671 atomic_set(&pblk->rl.free_user_blocks, nr_free_blks); 716 atomic_set(&pblk->rl.free_user_blocks, nr_free_chks);
717
718 return 0;
672} 719}
673 720
674static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line, 721static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line,
@@ -984,7 +1031,7 @@ static int pblk_lines_init(struct pblk *pblk)
984 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 1031 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
985 struct pblk_line *line; 1032 struct pblk_line *line;
986 void *chunk_meta; 1033 void *chunk_meta;
987 long nr_free_chks = 0; 1034 int nr_free_chks = 0;
988 int i, ret; 1035 int i, ret;
989 1036
990 ret = pblk_line_meta_init(pblk); 1037 ret = pblk_line_meta_init(pblk);
@@ -1031,7 +1078,9 @@ static int pblk_lines_init(struct pblk *pblk)
1031 goto fail_free_lines; 1078 goto fail_free_lines;
1032 } 1079 }
1033 1080
1034 pblk_set_provision(pblk, nr_free_chks); 1081 ret = pblk_set_provision(pblk, nr_free_chks);
1082 if (ret)
1083 goto fail_free_lines;
1035 1084
1036 vfree(chunk_meta); 1085 vfree(chunk_meta);
1037 return 0; 1086 return 0;
@@ -1041,7 +1090,7 @@ fail_free_lines:
1041 pblk_line_meta_free(l_mg, &pblk->lines[i]); 1090 pblk_line_meta_free(l_mg, &pblk->lines[i]);
1042 kfree(pblk->lines); 1091 kfree(pblk->lines);
1043fail_free_chunk_meta: 1092fail_free_chunk_meta:
1044 kfree(chunk_meta); 1093 vfree(chunk_meta);
1045fail_free_luns: 1094fail_free_luns:
1046 kfree(pblk->luns); 1095 kfree(pblk->luns);
1047fail_free_meta: 1096fail_free_meta:
@@ -1154,6 +1203,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1154 return ERR_PTR(-EINVAL); 1203 return ERR_PTR(-EINVAL);
1155 } 1204 }
1156 1205
1206 if (geo->ext) {
1207 pblk_err(pblk, "extended metadata not supported\n");
1208 kfree(pblk);
1209 return ERR_PTR(-EINVAL);
1210 }
1211
1157 spin_lock_init(&pblk->resubmit_lock); 1212 spin_lock_init(&pblk->resubmit_lock);
1158 spin_lock_init(&pblk->trans_lock); 1213 spin_lock_init(&pblk->trans_lock);
1159 spin_lock_init(&pblk->lock); 1214 spin_lock_init(&pblk->lock);
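
[Editor's note] pblk_set_provision() above now falls back to a computed over-provisioning (OP) value when the default would leave fewer free chunks than pblk_get_min_chks() requires. A minimal sketch of that arithmetic under made-up geometry numbers (the default OP percentage is only assumed here):

/* Hedged sketch of the OP fallback arithmetic in pblk_set_provision();
 * all numbers are illustrative, not taken from a real device. */
#include <stdio.h>

int main(void)
{
	int nr_free_chks = 1000;   /* free chunks reported by the device */
	int minimum = 120;         /* assumed pblk_get_min_chks() result */
	int op = 11;               /* assumed default OP, in percent */
	long long provisioned = (long long)nr_free_chks * (100 - op) / 100;

	if (nr_free_chks - provisioned < minimum) {
		/* Default OP keeps too few chunks back: recompute it */
		provisioned = nr_free_chks - minimum;
		op = (100 * minimum) / nr_free_chks;
	}
	printf("op=%d%% provisioned=%lld op_blks=%lld\n",
	       op, provisioned, nr_free_chks - provisioned);
	return 0;
}
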
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 6dcbd44e3acb..79df583ea709 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -22,7 +22,7 @@
22static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, 22static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
23 struct ppa_addr *ppa_list, 23 struct ppa_addr *ppa_list,
24 unsigned long *lun_bitmap, 24 unsigned long *lun_bitmap,
25 struct pblk_sec_meta *meta_list, 25 void *meta_list,
26 unsigned int valid_secs) 26 unsigned int valid_secs)
27{ 27{
28 struct pblk_line *line = pblk_line_get_data(pblk); 28 struct pblk_line *line = pblk_line_get_data(pblk);
@@ -33,6 +33,9 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
33 int nr_secs = pblk->min_write_pgs; 33 int nr_secs = pblk->min_write_pgs;
34 int i; 34 int i;
35 35
36 if (!line)
37 return -ENOSPC;
38
36 if (pblk_line_is_full(line)) { 39 if (pblk_line_is_full(line)) {
37 struct pblk_line *prev_line = line; 40 struct pblk_line *prev_line = line;
38 41
@@ -42,8 +45,11 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
42 line = pblk_line_replace_data(pblk); 45 line = pblk_line_replace_data(pblk);
43 pblk_line_close_meta(pblk, prev_line); 46 pblk_line_close_meta(pblk, prev_line);
44 47
45 if (!line) 48 if (!line) {
46 return -EINTR; 49 pblk_pipeline_stop(pblk);
50 return -ENOSPC;
51 }
52
47 } 53 }
48 54
49 emeta = line->emeta; 55 emeta = line->emeta;
@@ -52,6 +58,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
52 paddr = pblk_alloc_page(pblk, line, nr_secs); 58 paddr = pblk_alloc_page(pblk, line, nr_secs);
53 59
54 for (i = 0; i < nr_secs; i++, paddr++) { 60 for (i = 0; i < nr_secs; i++, paddr++) {
61 struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
55 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); 62 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
56 63
57 /* ppa to be sent to the device */ 64 /* ppa to be sent to the device */
@@ -68,14 +75,15 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
68 kref_get(&line->ref); 75 kref_get(&line->ref);
69 w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); 76 w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
70 w_ctx->ppa = ppa_list[i]; 77 w_ctx->ppa = ppa_list[i];
71 meta_list[i].lba = cpu_to_le64(w_ctx->lba); 78 meta->lba = cpu_to_le64(w_ctx->lba);
72 lba_list[paddr] = cpu_to_le64(w_ctx->lba); 79 lba_list[paddr] = cpu_to_le64(w_ctx->lba);
73 if (lba_list[paddr] != addr_empty) 80 if (lba_list[paddr] != addr_empty)
74 line->nr_valid_lbas++; 81 line->nr_valid_lbas++;
75 else 82 else
76 atomic64_inc(&pblk->pad_wa); 83 atomic64_inc(&pblk->pad_wa);
77 } else { 84 } else {
78 lba_list[paddr] = meta_list[i].lba = addr_empty; 85 lba_list[paddr] = addr_empty;
86 meta->lba = addr_empty;
79 __pblk_map_invalidate(pblk, line, paddr); 87 __pblk_map_invalidate(pblk, line, paddr);
80 } 88 }
81 } 89 }
@@ -84,50 +92,57 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
84 return 0; 92 return 0;
85} 93}
86 94
87void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, 95int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
88 unsigned long *lun_bitmap, unsigned int valid_secs, 96 unsigned long *lun_bitmap, unsigned int valid_secs,
89 unsigned int off) 97 unsigned int off)
90{ 98{
91 struct pblk_sec_meta *meta_list = rqd->meta_list; 99 void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
100 void *meta_buffer;
92 struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); 101 struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
93 unsigned int map_secs; 102 unsigned int map_secs;
94 int min = pblk->min_write_pgs; 103 int min = pblk->min_write_pgs;
95 int i; 104 int i;
105 int ret;
96 106
97 for (i = off; i < rqd->nr_ppas; i += min) { 107 for (i = off; i < rqd->nr_ppas; i += min) {
98 map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; 108 map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
99 if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i], 109 meta_buffer = pblk_get_meta(pblk, meta_list, i);
100 lun_bitmap, &meta_list[i], map_secs)) { 110
101 bio_put(rqd->bio); 111 ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
102 pblk_free_rqd(pblk, rqd, PBLK_WRITE); 112 lun_bitmap, meta_buffer, map_secs);
103 pblk_pipeline_stop(pblk); 113 if (ret)
104 } 114 return ret;
105 } 115 }
116
117 return 0;
106} 118}
107 119
108/* only if erase_ppa is set, acquire erase semaphore */ 120/* only if erase_ppa is set, acquire erase semaphore */
109void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, 121int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
110 unsigned int sentry, unsigned long *lun_bitmap, 122 unsigned int sentry, unsigned long *lun_bitmap,
111 unsigned int valid_secs, struct ppa_addr *erase_ppa) 123 unsigned int valid_secs, struct ppa_addr *erase_ppa)
112{ 124{
113 struct nvm_tgt_dev *dev = pblk->dev; 125 struct nvm_tgt_dev *dev = pblk->dev;
114 struct nvm_geo *geo = &dev->geo; 126 struct nvm_geo *geo = &dev->geo;
115 struct pblk_line_meta *lm = &pblk->lm; 127 struct pblk_line_meta *lm = &pblk->lm;
116 struct pblk_sec_meta *meta_list = rqd->meta_list; 128 void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
129 void *meta_buffer;
117 struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); 130 struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
118 struct pblk_line *e_line, *d_line; 131 struct pblk_line *e_line, *d_line;
119 unsigned int map_secs; 132 unsigned int map_secs;
120 int min = pblk->min_write_pgs; 133 int min = pblk->min_write_pgs;
121 int i, erase_lun; 134 int i, erase_lun;
135 int ret;
136
122 137
123 for (i = 0; i < rqd->nr_ppas; i += min) { 138 for (i = 0; i < rqd->nr_ppas; i += min) {
124 map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; 139 map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
125 if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i], 140 meta_buffer = pblk_get_meta(pblk, meta_list, i);
126 lun_bitmap, &meta_list[i], map_secs)) { 141
127 bio_put(rqd->bio); 142 ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
128 pblk_free_rqd(pblk, rqd, PBLK_WRITE); 143 lun_bitmap, meta_buffer, map_secs);
129 pblk_pipeline_stop(pblk); 144 if (ret)
130 } 145 return ret;
131 146
132 erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]); 147 erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]);
133 148
@@ -163,7 +178,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
163 */ 178 */
164 e_line = pblk_line_get_erase(pblk); 179 e_line = pblk_line_get_erase(pblk);
165 if (!e_line) 180 if (!e_line)
166 return; 181 return -ENOSPC;
167 182
168 /* Erase blocks that are bad in this line but might not be in next */ 183 /* Erase blocks that are bad in this line but might not be in next */
169 if (unlikely(pblk_ppa_empty(*erase_ppa)) && 184 if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
@@ -174,7 +189,7 @@ retry:
174 bit = find_next_bit(d_line->blk_bitmap, 189 bit = find_next_bit(d_line->blk_bitmap,
175 lm->blk_per_line, bit + 1); 190 lm->blk_per_line, bit + 1);
176 if (bit >= lm->blk_per_line) 191 if (bit >= lm->blk_per_line)
177 return; 192 return 0;
178 193
179 spin_lock(&e_line->lock); 194 spin_lock(&e_line->lock);
180 if (test_bit(bit, e_line->erase_bitmap)) { 195 if (test_bit(bit, e_line->erase_bitmap)) {
@@ -188,4 +203,6 @@ retry:
188 *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ 203 *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */
189 erase_ppa->a.blk = e_line->id; 204 erase_ppa->a.blk = e_line->id;
190 } 205 }
206
207 return 0;
191} 208}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index b1f4b51783f4..d4ca8c64ee0f 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -147,7 +147,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
147 147
148 /* 148 /*
149 * Initialize rate-limiter, which controls access to the write buffer 149 * Initialize rate-limiter, which controls access to the write buffer
150 * but user and GC I/O 150 * by user and GC I/O
151 */ 151 */
152 pblk_rl_init(&pblk->rl, rb->nr_entries); 152 pblk_rl_init(&pblk->rl, rb->nr_entries);
153 153
@@ -552,6 +552,9 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
552 to_read = count; 552 to_read = count;
553 } 553 }
554 554
 555	/* Add space for packed metadata if in use */
556 pad += (pblk->min_write_pgs - pblk->min_write_pgs_data);
557
555 c_ctx->sentry = pos; 558 c_ctx->sentry = pos;
556 c_ctx->nr_valid = to_read; 559 c_ctx->nr_valid = to_read;
557 c_ctx->nr_padded = pad; 560 c_ctx->nr_padded = pad;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 9fba614adeeb..3789185144da 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -43,7 +43,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
43 struct bio *bio, sector_t blba, 43 struct bio *bio, sector_t blba,
44 unsigned long *read_bitmap) 44 unsigned long *read_bitmap)
45{ 45{
46 struct pblk_sec_meta *meta_list = rqd->meta_list; 46 void *meta_list = rqd->meta_list;
47 struct ppa_addr ppas[NVM_MAX_VLBA]; 47 struct ppa_addr ppas[NVM_MAX_VLBA];
48 int nr_secs = rqd->nr_ppas; 48 int nr_secs = rqd->nr_ppas;
49 bool advanced_bio = false; 49 bool advanced_bio = false;
@@ -53,12 +53,15 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
53 53
54 for (i = 0; i < nr_secs; i++) { 54 for (i = 0; i < nr_secs; i++) {
55 struct ppa_addr p = ppas[i]; 55 struct ppa_addr p = ppas[i];
56 struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
56 sector_t lba = blba + i; 57 sector_t lba = blba + i;
57 58
58retry: 59retry:
59 if (pblk_ppa_empty(p)) { 60 if (pblk_ppa_empty(p)) {
61 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
62
60 WARN_ON(test_and_set_bit(i, read_bitmap)); 63 WARN_ON(test_and_set_bit(i, read_bitmap));
61 meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); 64 meta->lba = addr_empty;
62 65
63 if (unlikely(!advanced_bio)) { 66 if (unlikely(!advanced_bio)) {
64 bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); 67 bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
@@ -78,7 +81,7 @@ retry:
78 goto retry; 81 goto retry;
79 } 82 }
80 WARN_ON(test_and_set_bit(i, read_bitmap)); 83 WARN_ON(test_and_set_bit(i, read_bitmap));
81 meta_list[i].lba = cpu_to_le64(lba); 84 meta->lba = cpu_to_le64(lba);
82 advanced_bio = true; 85 advanced_bio = true;
83#ifdef CONFIG_NVM_PBLK_DEBUG 86#ifdef CONFIG_NVM_PBLK_DEBUG
84 atomic_long_inc(&pblk->cache_reads); 87 atomic_long_inc(&pblk->cache_reads);
@@ -105,12 +108,16 @@ next:
105static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, 108static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
106 sector_t blba) 109 sector_t blba)
107{ 110{
108 struct pblk_sec_meta *meta_lba_list = rqd->meta_list; 111 void *meta_list = rqd->meta_list;
109 int nr_lbas = rqd->nr_ppas; 112 int nr_lbas = rqd->nr_ppas;
110 int i; 113 int i;
111 114
115 if (!pblk_is_oob_meta_supported(pblk))
116 return;
117
112 for (i = 0; i < nr_lbas; i++) { 118 for (i = 0; i < nr_lbas; i++) {
113 u64 lba = le64_to_cpu(meta_lba_list[i].lba); 119 struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
120 u64 lba = le64_to_cpu(meta->lba);
114 121
115 if (lba == ADDR_EMPTY) 122 if (lba == ADDR_EMPTY)
116 continue; 123 continue;
@@ -134,17 +141,22 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
134static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, 141static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
135 u64 *lba_list, int nr_lbas) 142 u64 *lba_list, int nr_lbas)
136{ 143{
137 struct pblk_sec_meta *meta_lba_list = rqd->meta_list; 144 void *meta_lba_list = rqd->meta_list;
138 int i, j; 145 int i, j;
139 146
147 if (!pblk_is_oob_meta_supported(pblk))
148 return;
149
140 for (i = 0, j = 0; i < nr_lbas; i++) { 150 for (i = 0, j = 0; i < nr_lbas; i++) {
151 struct pblk_sec_meta *meta = pblk_get_meta(pblk,
152 meta_lba_list, j);
141 u64 lba = lba_list[i]; 153 u64 lba = lba_list[i];
142 u64 meta_lba; 154 u64 meta_lba;
143 155
144 if (lba == ADDR_EMPTY) 156 if (lba == ADDR_EMPTY)
145 continue; 157 continue;
146 158
147 meta_lba = le64_to_cpu(meta_lba_list[j].lba); 159 meta_lba = le64_to_cpu(meta->lba);
148 160
149 if (lba != meta_lba) { 161 if (lba != meta_lba) {
150#ifdef CONFIG_NVM_PBLK_DEBUG 162#ifdef CONFIG_NVM_PBLK_DEBUG
@@ -216,15 +228,15 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
216 struct pblk *pblk = rqd->private; 228 struct pblk *pblk = rqd->private;
217 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); 229 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
218 struct pblk_pr_ctx *pr_ctx = r_ctx->private; 230 struct pblk_pr_ctx *pr_ctx = r_ctx->private;
231 struct pblk_sec_meta *meta;
219 struct bio *new_bio = rqd->bio; 232 struct bio *new_bio = rqd->bio;
220 struct bio *bio = pr_ctx->orig_bio; 233 struct bio *bio = pr_ctx->orig_bio;
221 struct bio_vec src_bv, dst_bv; 234 struct bio_vec src_bv, dst_bv;
222 struct pblk_sec_meta *meta_list = rqd->meta_list; 235 void *meta_list = rqd->meta_list;
223 int bio_init_idx = pr_ctx->bio_init_idx; 236 int bio_init_idx = pr_ctx->bio_init_idx;
224 unsigned long *read_bitmap = pr_ctx->bitmap; 237 unsigned long *read_bitmap = pr_ctx->bitmap;
225 int nr_secs = pr_ctx->orig_nr_secs; 238 int nr_secs = pr_ctx->orig_nr_secs;
226 int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); 239 int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
227 __le64 *lba_list_mem, *lba_list_media;
228 void *src_p, *dst_p; 240 void *src_p, *dst_p;
229 int hole, i; 241 int hole, i;
230 242
@@ -237,13 +249,10 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
237 rqd->ppa_list[0] = ppa; 249 rqd->ppa_list[0] = ppa;
238 } 250 }
239 251
240 /* Re-use allocated memory for intermediate lbas */
241 lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
242 lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
243
244 for (i = 0; i < nr_secs; i++) { 252 for (i = 0; i < nr_secs; i++) {
245 lba_list_media[i] = meta_list[i].lba; 253 meta = pblk_get_meta(pblk, meta_list, i);
246 meta_list[i].lba = lba_list_mem[i]; 254 pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba);
255 meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]);
247 } 256 }
248 257
249 /* Fill the holes in the original bio */ 258 /* Fill the holes in the original bio */
@@ -255,7 +264,8 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
255 line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]); 264 line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
256 kref_put(&line->ref, pblk_line_put); 265 kref_put(&line->ref, pblk_line_put);
257 266
258 meta_list[hole].lba = lba_list_media[i]; 267 meta = pblk_get_meta(pblk, meta_list, hole);
268 meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]);
259 269
260 src_bv = new_bio->bi_io_vec[i++]; 270 src_bv = new_bio->bi_io_vec[i++];
261 dst_bv = bio->bi_io_vec[bio_init_idx + hole]; 271 dst_bv = bio->bi_io_vec[bio_init_idx + hole];
@@ -291,17 +301,13 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
291 unsigned long *read_bitmap, 301 unsigned long *read_bitmap,
292 int nr_holes) 302 int nr_holes)
293{ 303{
294 struct pblk_sec_meta *meta_list = rqd->meta_list; 304 void *meta_list = rqd->meta_list;
295 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); 305 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
296 struct pblk_pr_ctx *pr_ctx; 306 struct pblk_pr_ctx *pr_ctx;
297 struct bio *new_bio, *bio = r_ctx->private; 307 struct bio *new_bio, *bio = r_ctx->private;
298 __le64 *lba_list_mem;
299 int nr_secs = rqd->nr_ppas; 308 int nr_secs = rqd->nr_ppas;
300 int i; 309 int i;
301 310
302 /* Re-use allocated memory for intermediate lbas */
303 lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
304
305 new_bio = bio_alloc(GFP_KERNEL, nr_holes); 311 new_bio = bio_alloc(GFP_KERNEL, nr_holes);
306 312
307 if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) 313 if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
@@ -312,12 +318,15 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
312 goto fail_free_pages; 318 goto fail_free_pages;
313 } 319 }
314 320
315 pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL); 321 pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
316 if (!pr_ctx) 322 if (!pr_ctx)
317 goto fail_free_pages; 323 goto fail_free_pages;
318 324
319 for (i = 0; i < nr_secs; i++) 325 for (i = 0; i < nr_secs; i++) {
320 lba_list_mem[i] = meta_list[i].lba; 326 struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
327
328 pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba);
329 }
321 330
322 new_bio->bi_iter.bi_sector = 0; /* internal bio */ 331 new_bio->bi_iter.bi_sector = 0; /* internal bio */
323 bio_set_op_attrs(new_bio, REQ_OP_READ, 0); 332 bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
@@ -325,7 +334,6 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
325 rqd->bio = new_bio; 334 rqd->bio = new_bio;
326 rqd->nr_ppas = nr_holes; 335 rqd->nr_ppas = nr_holes;
327 336
328 pr_ctx->ppa_ptr = NULL;
329 pr_ctx->orig_bio = bio; 337 pr_ctx->orig_bio = bio;
330 bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA); 338 bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
331 pr_ctx->bio_init_idx = bio_init_idx; 339 pr_ctx->bio_init_idx = bio_init_idx;
@@ -383,7 +391,7 @@ err:
383static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, 391static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
384 sector_t lba, unsigned long *read_bitmap) 392 sector_t lba, unsigned long *read_bitmap)
385{ 393{
386 struct pblk_sec_meta *meta_list = rqd->meta_list; 394 struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0);
387 struct ppa_addr ppa; 395 struct ppa_addr ppa;
388 396
389 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); 397 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
@@ -394,8 +402,10 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
394 402
395retry: 403retry:
396 if (pblk_ppa_empty(ppa)) { 404 if (pblk_ppa_empty(ppa)) {
405 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
406
397 WARN_ON(test_and_set_bit(0, read_bitmap)); 407 WARN_ON(test_and_set_bit(0, read_bitmap));
398 meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); 408 meta->lba = addr_empty;
399 return; 409 return;
400 } 410 }
401 411
@@ -409,7 +419,7 @@ retry:
409 } 419 }
410 420
411 WARN_ON(test_and_set_bit(0, read_bitmap)); 421 WARN_ON(test_and_set_bit(0, read_bitmap));
412 meta_list[0].lba = cpu_to_le64(lba); 422 meta->lba = cpu_to_le64(lba);
413 423
414#ifdef CONFIG_NVM_PBLK_DEBUG 424#ifdef CONFIG_NVM_PBLK_DEBUG
415 atomic_long_inc(&pblk->cache_reads); 425 atomic_long_inc(&pblk->cache_reads);
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 5740b7509bd8..3fcf062d752c 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -13,6 +13,9 @@
13 * General Public License for more details. 13 * General Public License for more details.
14 * 14 *
15 * pblk-recovery.c - pblk's recovery path 15 * pblk-recovery.c - pblk's recovery path
16 *
17 * The L2P recovery path is single threaded as the L2P table is updated in order
18 * following the line sequence ID.
16 */ 19 */
17 20
18#include "pblk.h" 21#include "pblk.h"
@@ -124,7 +127,7 @@ static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line)
124 127
125struct pblk_recov_alloc { 128struct pblk_recov_alloc {
126 struct ppa_addr *ppa_list; 129 struct ppa_addr *ppa_list;
127 struct pblk_sec_meta *meta_list; 130 void *meta_list;
128 struct nvm_rq *rqd; 131 struct nvm_rq *rqd;
129 void *data; 132 void *data;
130 dma_addr_t dma_ppa_list; 133 dma_addr_t dma_ppa_list;
@@ -158,7 +161,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
158{ 161{
159 struct nvm_tgt_dev *dev = pblk->dev; 162 struct nvm_tgt_dev *dev = pblk->dev;
160 struct nvm_geo *geo = &dev->geo; 163 struct nvm_geo *geo = &dev->geo;
161 struct pblk_sec_meta *meta_list; 164 void *meta_list;
162 struct pblk_pad_rq *pad_rq; 165 struct pblk_pad_rq *pad_rq;
163 struct nvm_rq *rqd; 166 struct nvm_rq *rqd;
164 struct bio *bio; 167 struct bio *bio;
@@ -188,7 +191,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
188 kref_init(&pad_rq->ref); 191 kref_init(&pad_rq->ref);
189 192
190next_pad_rq: 193next_pad_rq:
191 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); 194 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
192 if (rq_ppas < pblk->min_write_pgs) { 195 if (rq_ppas < pblk->min_write_pgs) {
193 pblk_err(pblk, "corrupted pad line %d\n", line->id); 196 pblk_err(pblk, "corrupted pad line %d\n", line->id);
194 goto fail_free_pad; 197 goto fail_free_pad;
@@ -237,12 +240,15 @@ next_pad_rq:
237 240
238 for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { 241 for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
239 struct ppa_addr dev_ppa; 242 struct ppa_addr dev_ppa;
243 struct pblk_sec_meta *meta;
240 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); 244 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
241 245
242 dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); 246 dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
243 247
244 pblk_map_invalidate(pblk, dev_ppa); 248 pblk_map_invalidate(pblk, dev_ppa);
245 lba_list[w_ptr] = meta_list[i].lba = addr_empty; 249 lba_list[w_ptr] = addr_empty;
250 meta = pblk_get_meta(pblk, meta_list, i);
251 meta->lba = addr_empty;
246 rqd->ppa_list[i] = dev_ppa; 252 rqd->ppa_list[i] = dev_ppa;
247 } 253 }
248 } 254 }
@@ -334,20 +340,21 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
334 struct pblk_recov_alloc p) 340 struct pblk_recov_alloc p)
335{ 341{
336 struct nvm_tgt_dev *dev = pblk->dev; 342 struct nvm_tgt_dev *dev = pblk->dev;
343 struct pblk_line_meta *lm = &pblk->lm;
337 struct nvm_geo *geo = &dev->geo; 344 struct nvm_geo *geo = &dev->geo;
338 struct ppa_addr *ppa_list; 345 struct ppa_addr *ppa_list;
339 struct pblk_sec_meta *meta_list; 346 void *meta_list;
340 struct nvm_rq *rqd; 347 struct nvm_rq *rqd;
341 struct bio *bio; 348 struct bio *bio;
342 void *data; 349 void *data;
343 dma_addr_t dma_ppa_list, dma_meta_list; 350 dma_addr_t dma_ppa_list, dma_meta_list;
344 __le64 *lba_list; 351 __le64 *lba_list;
345 u64 paddr = 0; 352 u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
346 bool padded = false; 353 bool padded = false;
347 int rq_ppas, rq_len; 354 int rq_ppas, rq_len;
348 int i, j; 355 int i, j;
349 int ret; 356 int ret;
350 u64 left_ppas = pblk_sec_in_open_line(pblk, line); 357 u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
351 358
352 if (pblk_line_wp_is_unbalanced(pblk, line)) 359 if (pblk_line_wp_is_unbalanced(pblk, line))
353 pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); 360 pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id);
@@ -364,17 +371,19 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
364next_rq: 371next_rq:
365 memset(rqd, 0, pblk_g_rq_size); 372 memset(rqd, 0, pblk_g_rq_size);
366 373
367 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); 374 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
368 if (!rq_ppas) 375 if (!rq_ppas)
369 rq_ppas = pblk->min_write_pgs; 376 rq_ppas = pblk->min_write_pgs;
370 rq_len = rq_ppas * geo->csecs; 377 rq_len = rq_ppas * geo->csecs;
371 378
379retry_rq:
372 bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL); 380 bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
373 if (IS_ERR(bio)) 381 if (IS_ERR(bio))
374 return PTR_ERR(bio); 382 return PTR_ERR(bio);
375 383
376 bio->bi_iter.bi_sector = 0; /* internal bio */ 384 bio->bi_iter.bi_sector = 0; /* internal bio */
377 bio_set_op_attrs(bio, REQ_OP_READ, 0); 385 bio_set_op_attrs(bio, REQ_OP_READ, 0);
386 bio_get(bio);
378 387
379 rqd->bio = bio; 388 rqd->bio = bio;
380 rqd->opcode = NVM_OP_PREAD; 389 rqd->opcode = NVM_OP_PREAD;
@@ -387,7 +396,6 @@ next_rq:
387 if (pblk_io_aligned(pblk, rq_ppas)) 396 if (pblk_io_aligned(pblk, rq_ppas))
388 rqd->is_seq = 1; 397 rqd->is_seq = 1;
389 398
390retry_rq:
391 for (i = 0; i < rqd->nr_ppas; ) { 399 for (i = 0; i < rqd->nr_ppas; ) {
392 struct ppa_addr ppa; 400 struct ppa_addr ppa;
393 int pos; 401 int pos;
@@ -410,6 +418,7 @@ retry_rq:
410 if (ret) { 418 if (ret) {
411 pblk_err(pblk, "I/O submission failed: %d\n", ret); 419 pblk_err(pblk, "I/O submission failed: %d\n", ret);
412 bio_put(bio); 420 bio_put(bio);
421 bio_put(bio);
413 return ret; 422 return ret;
414 } 423 }
415 424
@@ -421,20 +430,28 @@ retry_rq:
421 430
422 if (padded) { 431 if (padded) {
423 pblk_log_read_err(pblk, rqd); 432 pblk_log_read_err(pblk, rqd);
433 bio_put(bio);
424 return -EINTR; 434 return -EINTR;
425 } 435 }
426 436
427 pad_distance = pblk_pad_distance(pblk, line); 437 pad_distance = pblk_pad_distance(pblk, line);
428 ret = pblk_recov_pad_line(pblk, line, pad_distance); 438 ret = pblk_recov_pad_line(pblk, line, pad_distance);
429 if (ret) 439 if (ret) {
440 bio_put(bio);
430 return ret; 441 return ret;
442 }
431 443
432 padded = true; 444 padded = true;
445 bio_put(bio);
433 goto retry_rq; 446 goto retry_rq;
434 } 447 }
435 448
449 pblk_get_packed_meta(pblk, rqd);
450 bio_put(bio);
451
436 for (i = 0; i < rqd->nr_ppas; i++) { 452 for (i = 0; i < rqd->nr_ppas; i++) {
437 u64 lba = le64_to_cpu(meta_list[i].lba); 453 struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
454 u64 lba = le64_to_cpu(meta->lba);
438 455
439 lba_list[paddr++] = cpu_to_le64(lba); 456 lba_list[paddr++] = cpu_to_le64(lba);
440 457
@@ -463,7 +480,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
463 struct nvm_geo *geo = &dev->geo; 480 struct nvm_geo *geo = &dev->geo;
464 struct nvm_rq *rqd; 481 struct nvm_rq *rqd;
465 struct ppa_addr *ppa_list; 482 struct ppa_addr *ppa_list;
466 struct pblk_sec_meta *meta_list; 483 void *meta_list;
467 struct pblk_recov_alloc p; 484 struct pblk_recov_alloc p;
468 void *data; 485 void *data;
469 dma_addr_t dma_ppa_list, dma_meta_list; 486 dma_addr_t dma_ppa_list, dma_meta_list;
@@ -473,8 +490,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
473 if (!meta_list) 490 if (!meta_list)
474 return -ENOMEM; 491 return -ENOMEM;
475 492
476 ppa_list = (void *)(meta_list) + pblk_dma_meta_size; 493 ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk);
477 dma_ppa_list = dma_meta_list + pblk_dma_meta_size; 494 dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
478 495
479 data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); 496 data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL);
480 if (!data) { 497 if (!data) {
@@ -804,7 +821,6 @@ next:
804 WARN_ON_ONCE(!test_and_clear_bit(meta_line, 821 WARN_ON_ONCE(!test_and_clear_bit(meta_line,
805 &l_mg->meta_bitmap)); 822 &l_mg->meta_bitmap));
806 spin_unlock(&l_mg->free_lock); 823 spin_unlock(&l_mg->free_lock);
807 pblk_line_replace_data(pblk);
808 } else { 824 } else {
809 spin_lock(&l_mg->free_lock); 825 spin_lock(&l_mg->free_lock);
810 /* Allocate next line for preparation */ 826 /* Allocate next line for preparation */
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index db55a1c89997..76116d5f78e4 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -214,11 +214,10 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
214 struct nvm_geo *geo = &dev->geo; 214 struct nvm_geo *geo = &dev->geo;
215 struct pblk_line_mgmt *l_mg = &pblk->l_mg; 215 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
216 struct pblk_line_meta *lm = &pblk->lm; 216 struct pblk_line_meta *lm = &pblk->lm;
217 int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
218 int sec_meta, blk_meta; 217 int sec_meta, blk_meta;
219
220 unsigned int rb_windows; 218 unsigned int rb_windows;
221 219
220
222 /* Consider sectors used for metadata */ 221 /* Consider sectors used for metadata */
223 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; 222 sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
224 blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); 223 blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
@@ -226,7 +225,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
226 rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; 225 rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
227 rl->high_pw = get_count_order(rl->high); 226 rl->high_pw = get_count_order(rl->high);
228 227
229 rl->rsv_blocks = min_blocks; 228 rl->rsv_blocks = pblk_get_min_chks(pblk);
230 229
231 /* This will always be a power-of-2 */ 230 /* This will always be a power-of-2 */
232 rb_windows = budget / NVM_MAX_VLBA; 231 rb_windows = budget / NVM_MAX_VLBA;
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 2d2818155aa8..7d8958df9472 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -479,6 +479,13 @@ static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk,
479 if (kstrtouint(page, 0, &sec_per_write)) 479 if (kstrtouint(page, 0, &sec_per_write))
480 return -EINVAL; 480 return -EINVAL;
481 481
482 if (!pblk_is_oob_meta_supported(pblk)) {
 483	/* For the packed metadata case it is
 484	 * not allowed to change sec_per_write.
 485	 */
486 return -EINVAL;
487 }
488
482 if (sec_per_write < pblk->min_write_pgs 489 if (sec_per_write < pblk->min_write_pgs
483 || sec_per_write > pblk->max_write_pgs 490 || sec_per_write > pblk->max_write_pgs
484 || sec_per_write % pblk->min_write_pgs != 0) 491 || sec_per_write % pblk->min_write_pgs != 0)
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index fa8726493b39..06d56deb645d 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -105,14 +105,20 @@ retry:
105} 105}
106 106
107/* Map remaining sectors in chunk, starting from ppa */ 107/* Map remaining sectors in chunk, starting from ppa */
108static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) 108static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa,
109 int rqd_ppas)
109{ 110{
110 struct pblk_line *line; 111 struct pblk_line *line;
111 struct ppa_addr map_ppa = *ppa; 112 struct ppa_addr map_ppa = *ppa;
113 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
114 __le64 *lba_list;
112 u64 paddr; 115 u64 paddr;
113 int done = 0; 116 int done = 0;
117 int n = 0;
114 118
115 line = pblk_ppa_to_line(pblk, *ppa); 119 line = pblk_ppa_to_line(pblk, *ppa);
120 lba_list = emeta_to_lbas(pblk, line->emeta->buf);
121
116 spin_lock(&line->lock); 122 spin_lock(&line->lock);
117 123
118 while (!done) { 124 while (!done) {
@@ -121,10 +127,17 @@ static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
121 if (!test_and_set_bit(paddr, line->map_bitmap)) 127 if (!test_and_set_bit(paddr, line->map_bitmap))
122 line->left_msecs--; 128 line->left_msecs--;
123 129
130 if (n < rqd_ppas && lba_list[paddr] != addr_empty)
131 line->nr_valid_lbas--;
132
133 lba_list[paddr] = addr_empty;
134
124 if (!test_and_set_bit(paddr, line->invalid_bitmap)) 135 if (!test_and_set_bit(paddr, line->invalid_bitmap))
125 le32_add_cpu(line->vsc, -1); 136 le32_add_cpu(line->vsc, -1);
126 137
127 done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa); 138 done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa);
139
140 n++;
128 } 141 }
129 142
130 line->w_err_gc->has_write_err = 1; 143 line->w_err_gc->has_write_err = 1;
@@ -148,9 +161,11 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
148 w_ctx = &entry->w_ctx; 161 w_ctx = &entry->w_ctx;
149 162
150 /* Check if the lba has been overwritten */ 163 /* Check if the lba has been overwritten */
151 ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); 164 if (w_ctx->lba != ADDR_EMPTY) {
152 if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) 165 ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
153 w_ctx->lba = ADDR_EMPTY; 166 if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
167 w_ctx->lba = ADDR_EMPTY;
168 }
154 169
155 /* Mark up the entry as submittable again */ 170 /* Mark up the entry as submittable again */
156 flags = READ_ONCE(w_ctx->flags); 171 flags = READ_ONCE(w_ctx->flags);
@@ -200,7 +215,7 @@ static void pblk_submit_rec(struct work_struct *work)
200 215
201 pblk_log_write_err(pblk, rqd); 216 pblk_log_write_err(pblk, rqd);
202 217
203 pblk_map_remaining(pblk, ppa_list); 218 pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas);
204 pblk_queue_resubmit(pblk, c_ctx); 219 pblk_queue_resubmit(pblk, c_ctx);
205 220
206 pblk_up_rq(pblk, c_ctx->lun_bitmap); 221 pblk_up_rq(pblk, c_ctx->lun_bitmap);
@@ -319,12 +334,13 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
319 } 334 }
320 335
321 if (likely(!e_line || !atomic_read(&e_line->left_eblks))) 336 if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
322 pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0); 337 ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
338 valid, 0);
323 else 339 else
324 pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, 340 ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
325 valid, erase_ppa); 341 valid, erase_ppa);
326 342
327 return 0; 343 return ret;
328} 344}
329 345
330static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, 346static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
@@ -332,7 +348,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
332{ 348{
333 int secs_to_sync; 349 int secs_to_sync;
334 350
335 secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); 351 secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true);
336 352
337#ifdef CONFIG_NVM_PBLK_DEBUG 353#ifdef CONFIG_NVM_PBLK_DEBUG
338 if ((!secs_to_sync && secs_to_flush) 354 if ((!secs_to_sync && secs_to_flush)
@@ -548,15 +564,17 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
548 c_ctx->nr_padded); 564 c_ctx->nr_padded);
549} 565}
550 566
551static int pblk_submit_write(struct pblk *pblk) 567static int pblk_submit_write(struct pblk *pblk, int *secs_left)
552{ 568{
553 struct bio *bio; 569 struct bio *bio;
554 struct nvm_rq *rqd; 570 struct nvm_rq *rqd;
555 unsigned int secs_avail, secs_to_sync, secs_to_com; 571 unsigned int secs_avail, secs_to_sync, secs_to_com;
556 unsigned int secs_to_flush; 572 unsigned int secs_to_flush, packed_meta_pgs;
557 unsigned long pos; 573 unsigned long pos;
558 unsigned int resubmit; 574 unsigned int resubmit;
559 575
576 *secs_left = 0;
577
560 spin_lock(&pblk->resubmit_lock); 578 spin_lock(&pblk->resubmit_lock);
561 resubmit = !list_empty(&pblk->resubmit_list); 579 resubmit = !list_empty(&pblk->resubmit_list);
562 spin_unlock(&pblk->resubmit_lock); 580 spin_unlock(&pblk->resubmit_lock);
@@ -586,17 +604,17 @@ static int pblk_submit_write(struct pblk *pblk)
586 */ 604 */
587 secs_avail = pblk_rb_read_count(&pblk->rwb); 605 secs_avail = pblk_rb_read_count(&pblk->rwb);
588 if (!secs_avail) 606 if (!secs_avail)
589 return 1; 607 return 0;
590 608
591 secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); 609 secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
592 if (!secs_to_flush && secs_avail < pblk->min_write_pgs) 610 if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data)
593 return 1; 611 return 0;
594 612
595 secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, 613 secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
596 secs_to_flush); 614 secs_to_flush);
597 if (secs_to_sync > pblk->max_write_pgs) { 615 if (secs_to_sync > pblk->max_write_pgs) {
598 pblk_err(pblk, "bad buffer sync calculation\n"); 616 pblk_err(pblk, "bad buffer sync calculation\n");
599 return 1; 617 return 0;
600 } 618 }
601 619
602 secs_to_com = (secs_to_sync > secs_avail) ? 620 secs_to_com = (secs_to_sync > secs_avail) ?
@@ -604,7 +622,8 @@ static int pblk_submit_write(struct pblk *pblk)
604 pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); 622 pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
605 } 623 }
606 624
607 bio = bio_alloc(GFP_KERNEL, secs_to_sync); 625 packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data);
626 bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs);
608 627
609 bio->bi_iter.bi_sector = 0; /* internal bio */ 628 bio->bi_iter.bi_sector = 0; /* internal bio */
610 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 629 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -625,6 +644,7 @@ static int pblk_submit_write(struct pblk *pblk)
625 atomic_long_add(secs_to_sync, &pblk->sub_writes); 644 atomic_long_add(secs_to_sync, &pblk->sub_writes);
626#endif 645#endif
627 646
647 *secs_left = 1;
628 return 0; 648 return 0;
629 649
630fail_free_bio: 650fail_free_bio:
@@ -633,16 +653,22 @@ fail_put_bio:
633 bio_put(bio); 653 bio_put(bio);
634 pblk_free_rqd(pblk, rqd, PBLK_WRITE); 654 pblk_free_rqd(pblk, rqd, PBLK_WRITE);
635 655
636 return 1; 656 return -EINTR;
637} 657}
638 658
639int pblk_write_ts(void *data) 659int pblk_write_ts(void *data)
640{ 660{
641 struct pblk *pblk = data; 661 struct pblk *pblk = data;
662 int secs_left;
663 int write_failure = 0;
642 664
643 while (!kthread_should_stop()) { 665 while (!kthread_should_stop()) {
644 if (!pblk_submit_write(pblk)) 666 if (!write_failure) {
645 continue; 667 write_failure = pblk_submit_write(pblk, &secs_left);
668
669 if (secs_left)
670 continue;
671 }
646 set_current_state(TASK_INTERRUPTIBLE); 672 set_current_state(TASK_INTERRUPTIBLE);
647 io_schedule(); 673 io_schedule();
648 } 674 }
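
[Editor's note] Together with the ring-buffer change in pblk-rb.c (pad += min_write_pgs - min_write_pgs_data) and the extra bio page reserved here, packed metadata means each write of min_write_pgs pages carries one page of metadata and only min_write_pgs_data pages of payload, which is also why pblk_set_provision() derates capacity. A small sketch with hypothetical geometry:

/* Sketch of the packed-metadata accounting on the write path; the
 * geometry values below are hypothetical. */
#include <stdio.h>

int main(void)
{
	int min_write_pgs = 8;                       /* assumed ws_opt */
	int min_write_pgs_data = min_write_pgs - 1;  /* one page for metadata */
	int clba = 4096;                             /* assumed sectors/chunk */

	int packed_meta_pgs = min_write_pgs - min_write_pgs_data;
	int data_clba = (clba / min_write_pgs) * min_write_pgs_data;

	printf("extra bio pages per write: %d\n", packed_meta_pgs);
	printf("user-visible sectors per chunk: %d of %d\n", data_clba, clba);
	return 0;
}
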
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 02bb2e98f8a9..85e38ed62f85 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -104,7 +104,6 @@ enum {
104 PBLK_RL_LOW = 4 104 PBLK_RL_LOW = 4
105}; 105};
106 106
107#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * NVM_MAX_VLBA)
108#define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA) 107#define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA)
109 108
110/* write buffer completion context */ 109/* write buffer completion context */
@@ -132,6 +131,8 @@ struct pblk_pr_ctx {
132 unsigned int bio_init_idx; 131 unsigned int bio_init_idx;
133 void *ppa_ptr; 132 void *ppa_ptr;
134 dma_addr_t dma_ppa_list; 133 dma_addr_t dma_ppa_list;
134 __le64 lba_list_mem[NVM_MAX_VLBA];
135 __le64 lba_list_media[NVM_MAX_VLBA];
135}; 136};
136 137
137/* Pad context */ 138/* Pad context */
@@ -631,7 +632,9 @@ struct pblk {
631 int state; /* pblk line state */ 632 int state; /* pblk line state */
632 633
633 int min_write_pgs; /* Minimum amount of pages required by controller */ 634 int min_write_pgs; /* Minimum amount of pages required by controller */
635 int min_write_pgs_data; /* Minimum amount of payload pages */
634 int max_write_pgs; /* Maximum amount of pages supported by controller */ 636 int max_write_pgs; /* Maximum amount of pages supported by controller */
637 int oob_meta_size; /* Size of OOB sector metadata */
635 638
636 sector_t capacity; /* Device capacity when bad blocks are subtracted */ 639 sector_t capacity; /* Device capacity when bad blocks are subtracted */
637 640
@@ -836,7 +839,7 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
836u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); 839u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
837u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); 840u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
838int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, 841int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
839 unsigned long secs_to_flush); 842 unsigned long secs_to_flush, bool skip_meta);
840void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, 843void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa,
841 unsigned long *lun_bitmap); 844 unsigned long *lun_bitmap);
842void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa); 845void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa);
@@ -860,6 +863,8 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
860 u64 *lba_list, int nr_secs); 863 u64 *lba_list, int nr_secs);
861void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, 864void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
862 sector_t blba, int nr_secs); 865 sector_t blba, int nr_secs);
866void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd);
867void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd);
863 868
864/* 869/*
865 * pblk user I/O write path 870 * pblk user I/O write path
@@ -871,10 +876,10 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
871/* 876/*
872 * pblk map 877 * pblk map
873 */ 878 */
874void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, 879int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
875 unsigned int sentry, unsigned long *lun_bitmap, 880 unsigned int sentry, unsigned long *lun_bitmap,
876 unsigned int valid_secs, struct ppa_addr *erase_ppa); 881 unsigned int valid_secs, struct ppa_addr *erase_ppa);
877void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, 882int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
878 unsigned long *lun_bitmap, unsigned int valid_secs, 883 unsigned long *lun_bitmap, unsigned int valid_secs,
879 unsigned int off); 884 unsigned int off);
880 885
@@ -905,7 +910,6 @@ int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
905#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ 910#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */
906#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ 911#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */
907#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ 912#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */
908#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
909 913
910int pblk_gc_init(struct pblk *pblk); 914int pblk_gc_init(struct pblk *pblk);
911void pblk_gc_exit(struct pblk *pblk, bool graceful); 915void pblk_gc_exit(struct pblk *pblk, bool graceful);
@@ -1370,4 +1374,33 @@ static inline char *pblk_disk_name(struct pblk *pblk)
1370 1374
1371 return disk->disk_name; 1375 return disk->disk_name;
1372} 1376}
1377
1378static inline unsigned int pblk_get_min_chks(struct pblk *pblk)
1379{
1380 struct pblk_line_meta *lm = &pblk->lm;
1381	/* In a worst-case scenario every line will have OP% invalid sectors.
1382	 * We will then need a minimum of 100/OP lines to free up a single line.
1383	 */
1384
1385 return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line;
1386}
1387
1388static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk,
1389 void *meta, int index)
1390{
1391 return meta +
1392 max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
1393 * index;
1394}
1395
1396static inline int pblk_dma_meta_size(struct pblk *pblk)
1397{
1398 return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
1399 * NVM_MAX_VLBA;
1400}
1401
1402static inline int pblk_is_oob_meta_supported(struct pblk *pblk)
1403{
1404 return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta);
1405}
1373#endif /* PBLK_H_ */ 1406#endif /* PBLK_H_ */
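
[Editor's note] The new inline helpers above hide the fact that the per-sector metadata stride is now max(sizeof(struct pblk_sec_meta), geo->sos) rather than a fixed struct size. A stand-alone sketch of the same arithmetic (the constants are assumptions for illustration):

/* Sketch of the pblk_get_meta()/pblk_dma_meta_size()/pblk_get_min_chks()
 * arithmetic; the constants are assumptions for illustration. */
#include <stdio.h>

#define NVM_MAX_VLBA 64

int main(void)
{
	int sec_meta_size = 16;   /* assumed sizeof(struct pblk_sec_meta) */
	int oob_meta_size = 0;    /* drive without OOB metadata */
	int stride = oob_meta_size > sec_meta_size ? oob_meta_size : sec_meta_size;

	/* pblk_get_meta(): entry i starts at meta + stride * i */
	printf("entry 5 offset: %d bytes\n", stride * 5);

	/* pblk_dma_meta_size(): DMA buffer sized for a full request */
	printf("dma meta size: %d bytes\n", stride * NVM_MAX_VLBA);

	/* pblk_get_min_chks(): with op percent invalid sectors per line in
	 * the worst case, roughly 100/op lines are needed to free one line */
	int op = 11, blk_per_line = 32;
	int min_chks = ((100 + op - 1) / op) * blk_per_line; /* DIV_ROUND_UP */
	printf("min reserved chunks: %d\n", min_chks);
	return 0;
}
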
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b61b83bbcfff..fdf75352e16a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -627,6 +627,20 @@ struct cache_set {
627 struct bkey gc_done; 627 struct bkey gc_done;
628 628
629 /* 629 /*
 630	 * For automatic garbage collection after writeback completes, this
 631	 * variable is used as a bit field:
 632	 * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
 633	 * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback
 634	 * This is an optimization for write requests that follow after
 635	 * writeback finishes, but the read hit rate drops because clean data
 636	 * on the cache is discarded. Unless the user explicitly sets it via
 637	 * sysfs, it won't be enabled.
638 */
639#define BCH_ENABLE_AUTO_GC 1
640#define BCH_DO_AUTO_GC 2
641 uint8_t gc_after_writeback;
642
643 /*
630 * The allocation code needs gc_mark in struct bucket to be correct, but 644 * The allocation code needs gc_mark in struct bucket to be correct, but
631 * it's not while a gc is in progress. Protected by bucket_lock. 645 * it's not while a gc is in progress. Protected by bucket_lock.
632 */ 646 */
@@ -658,7 +672,11 @@ struct cache_set {
658 672
659 /* 673 /*
660 * A btree node on disk could have too many bsets for an iterator to fit 674 * A btree node on disk could have too many bsets for an iterator to fit
661 * on the stack - have to dynamically allocate them 675 * on the stack - have to dynamically allocate them.
 676	 * bch_cache_set_alloc() will make sure the pool can allocate iterators
 677	 * with enough room to host
 678	 * (sb.bucket_size / sb.block_size)
 679	 * btree_iter_sets, which is more than the static MAX_BSETS.
662 */ 680 */
663 mempool_t fill_iter; 681 mempool_t fill_iter;
664 682
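
[Editor's note] The BCH_ENABLE_AUTO_GC/BCH_DO_AUTO_GC pair above acts as an opt-in flag plus a one-shot trigger. The sketch below shows how such a two-bit field could be driven; the call sites and the force_wake_up_gc() hand-off are assumptions, not the exact bcache code paths:

/* Hedged sketch of an enable/trigger bit field like gc_after_writeback;
 * the call sites below are assumed, not the actual bcache functions. */
#include <stdio.h>

#define BCH_ENABLE_AUTO_GC 1   /* user opted in via sysfs */
#define BCH_DO_AUTO_GC     2   /* writeback requested a gc run */

static unsigned char gc_after_writeback;

static void writeback_arms_gc(void)
{
	if (gc_after_writeback & BCH_ENABLE_AUTO_GC)
		gc_after_writeback |= BCH_DO_AUTO_GC;   /* arm the trigger */
}

static void cache_became_clean(void)
{
	if (gc_after_writeback & BCH_DO_AUTO_GC) {
		gc_after_writeback &= ~BCH_DO_AUTO_GC;  /* one-shot */
		printf("would call force_wake_up_gc() here\n");
	}
}

int main(void)
{
	gc_after_writeback = BCH_ENABLE_AUTO_GC;  /* as if set from sysfs */
	writeback_arms_gc();
	cache_became_clean();
	return 0;
}
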
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3f4211b5cd33..23cb1dc7296b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
207 struct bset *i = btree_bset_first(b); 207 struct bset *i = btree_bset_first(b);
208 struct btree_iter *iter; 208 struct btree_iter *iter;
209 209
210 /*
211 * c->fill_iter can allocate an iterator with more memory space
 212	 * than the static MAX_BSETS.
 213	 * See the comment around cache_set->fill_iter.
214 */
210 iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); 215 iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
211 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 216 iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
212 iter->used = 0; 217 iter->used = 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a68d6c55783b..d1c72ef64edf 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c)
266 wake_up(&c->gc_wait); 266 wake_up(&c->gc_wait);
267} 267}
268 268
269static inline void force_wake_up_gc(struct cache_set *c)
270{
271 /*
272 * The garbage collection thread only runs when sectors_to_gc < 0;
273 * calling wake_up_gc() won't start the gc thread if sectors_to_gc
274 * is not a negative value.
275 * Therefore sectors_to_gc is set to -1 here, before waking up the
276 * gc thread by calling wake_up_gc(). Then gc_should_run() will get
277 * a chance to permit the gc thread to run. "Get a chance" means
278 * that before gc_should_run() is entered, c->sectors_to_gc may
279 * still be set to some other positive value, so this routine does
280 * not guarantee 100% that the gc thread will be woken up
281 * to run.
282 */
283 atomic_set(&c->sectors_to_gc, -1);
284 wake_up_gc(c);
285}
286
269#define MAP_DONE 0 287#define MAP_DONE 0
270#define MAP_CONTINUE 1 288#define MAP_CONTINUE 1
271 289
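
The wake-up pattern that force_wake_up_gc() relies on can be shown compactly in user-space terms with a C11 atomic in place of the kernel's atomic_t; the names below are illustrative, only the "drive the counter to -1, then wake" order comes from the code above.

#include <stdatomic.h>
#include <stdbool.h>

/* Hedged sketch: the gc thread only runs when the counter is negative, so
 * a forced trigger first stores -1 and then wakes the thread. A concurrent
 * writer may still overwrite the counter before the thread re-checks it,
 * so the trigger is best-effort, exactly as the comment above notes.
 */
static atomic_int sectors_to_gc;

static bool gc_should_run_example(void)
{
        return atomic_load(&sectors_to_gc) < 0;
}

static void force_trigger_gc_example(void (*wake)(void))
{
        atomic_store(&sectors_to_gc, -1);
        wake();
}
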
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8f448b9c96a1..8b123be05254 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
249 249
250void bch_debug_exit(void) 250void bch_debug_exit(void)
251{ 251{
252 if (!IS_ERR_OR_NULL(bcache_debug)) 252 debugfs_remove_recursive(bcache_debug);
253 debugfs_remove_recursive(bcache_debug);
254} 253}
255 254
256void __init bch_debug_init(void) 255void __init bch_debug_init(void)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 522c7426f3a0..b2fd412715b1 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl)
663 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); 663 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
664 bch_bio_map(bio, w->data); 664 bch_bio_map(bio, w->data);
665 665
666 trace_bcache_journal_write(bio); 666 trace_bcache_journal_write(bio, w->data->keys);
667 bio_list_add(&list, bio); 667 bio_list_add(&list, bio);
668 668
669 SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); 669 SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3bf35914bb57..15070412a32e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@ err:
311 * data is written it calls bch_journal, and after the keys have been added to 311 * data is written it calls bch_journal, and after the keys have been added to
312 * the next journal write they're inserted into the btree. 312 * the next journal write they're inserted into the btree.
313 * 313 *
314 * It inserts the data in s->cache_bio; bi_sector is used for the key offset, 314 * It inserts the data in op->bio; bi_sector is used for the key offset,
315 * and op->inode is used for the key inode. 315 * and op->inode is used for the key inode.
316 * 316 *
317 * If s->bypass is true, instead of inserting the data it invalidates the 317 * If op->bypass is true, instead of inserting the data it invalidates the
318 * region of the cache represented by s->cache_bio and op->inode. 318 * region of the cache represented by op->bio and op->inode.
319 */ 319 */
320void bch_data_insert(struct closure *cl) 320void bch_data_insert(struct closure *cl)
321{ 321{
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 7bbd670a5a84..4dee119c3664 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,8 +25,8 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/sysfs.h> 26#include <linux/sysfs.h>
27 27
28MODULE_LICENSE("GPL"); 28unsigned int bch_cutoff_writeback;
29MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); 29unsigned int bch_cutoff_writeback_sync;
30 30
31static const char bcache_magic[] = { 31static const char bcache_magic[] = {
32 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, 32 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
@@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl)
1510 struct cache *ca; 1510 struct cache *ca;
1511 unsigned int i; 1511 unsigned int i;
1512 1512
1513 if (!IS_ERR_OR_NULL(c->debug)) 1513 debugfs_remove(c->debug);
1514 debugfs_remove(c->debug);
1515 1514
1516 bch_open_buckets_free(c); 1515 bch_open_buckets_free(c);
1517 bch_btree_cache_free(c); 1516 bch_btree_cache_free(c);
@@ -2424,6 +2423,32 @@ static void bcache_exit(void)
2424 mutex_destroy(&bch_register_lock); 2423 mutex_destroy(&bch_register_lock);
2425} 2424}
2426 2425
2426/* Check and fixup module parameters */
2427static void check_module_parameters(void)
2428{
2429 if (bch_cutoff_writeback_sync == 0)
2430 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
2431 else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
2432 pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
2433 bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
2434 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
2435 }
2436
2437 if (bch_cutoff_writeback == 0)
2438 bch_cutoff_writeback = CUTOFF_WRITEBACK;
2439 else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
2440 pr_warn("set bch_cutoff_writeback (%u) to max value %u",
2441 bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
2442 bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
2443 }
2444
2445 if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
2446 pr_warn("set bch_cutoff_writeback (%u) to %u",
2447 bch_cutoff_writeback, bch_cutoff_writeback_sync);
2448 bch_cutoff_writeback = bch_cutoff_writeback_sync;
2449 }
2450}
2451
2427static int __init bcache_init(void) 2452static int __init bcache_init(void)
2428{ 2453{
2429 static const struct attribute *files[] = { 2454 static const struct attribute *files[] = {
@@ -2432,6 +2457,8 @@ static int __init bcache_init(void)
2432 NULL 2457 NULL
2433 }; 2458 };
2434 2459
2460 check_module_parameters();
2461
2435 mutex_init(&bch_register_lock); 2462 mutex_init(&bch_register_lock);
2436 init_waitqueue_head(&unregister_wait); 2463 init_waitqueue_head(&unregister_wait);
2437 register_reboot_notifier(&reboot); 2464 register_reboot_notifier(&reboot);
@@ -2468,5 +2495,18 @@ err:
2468 return -ENOMEM; 2495 return -ENOMEM;
2469} 2496}
2470 2497
2498/*
2499 * Module hooks
2500 */
2471module_exit(bcache_exit); 2501module_exit(bcache_exit);
2472module_init(bcache_init); 2502module_init(bcache_init);
2503
2504module_param(bch_cutoff_writeback, uint, 0);
2505MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
2506
2507module_param(bch_cutoff_writeback_sync, uint, 0);
2508MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
2509
2510MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
2511MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
2512MODULE_LICENSE("GPL");
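
The parameter handling added to super.c boils down to "use the compile-time default when unset, clamp to the per-parameter maximum otherwise, and never let the soft cutoff exceed the hard one". A minimal sketch of that policy follows; the helper name is illustrative, only the CUTOFF_* defaults and the two module parameter names appear in the patch.

/* Hedged sketch of the clamp policy in check_module_parameters(). */
static unsigned int clamp_cutoff(unsigned int val, unsigned int def,
                                 unsigned int max)
{
        if (val == 0)
                return def;     /* unset: fall back to compile-time default */
        if (val > max)
                return max;     /* too large: clamp (the driver also warns) */
        return val;
}

With the parameters exported at the bottom of super.c, the cutoffs can be tuned at module load time by passing bch_cutoff_writeback and bch_cutoff_writeback_sync; the exact invocation depends on how the module is loaded and is not shown in the patch.
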
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 26f035a0c5b9..557a8a3270a1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,7 +16,7 @@
16#include <linux/sort.h> 16#include <linux/sort.h>
17#include <linux/sched/clock.h> 17#include <linux/sched/clock.h>
18 18
19/* Default is -1; we skip past it for struct cached_dev's cache mode */ 19/* Default is 0 ("writethrough") */
20static const char * const bch_cache_modes[] = { 20static const char * const bch_cache_modes[] = {
21 "writethrough", 21 "writethrough",
22 "writeback", 22 "writeback",
@@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = {
25 NULL 25 NULL
26}; 26};
27 27
28/* Default is -1; we skip past it for stop_when_cache_set_failed */ 28/* Default is 0 ("auto") */
29static const char * const bch_stop_on_failure_modes[] = { 29static const char * const bch_stop_on_failure_modes[] = {
30 "auto", 30 "auto",
31 "always", 31 "always",
@@ -88,6 +88,8 @@ read_attribute(writeback_keys_done);
88read_attribute(writeback_keys_failed); 88read_attribute(writeback_keys_failed);
89read_attribute(io_errors); 89read_attribute(io_errors);
90read_attribute(congested); 90read_attribute(congested);
91read_attribute(cutoff_writeback);
92read_attribute(cutoff_writeback_sync);
91rw_attribute(congested_read_threshold_us); 93rw_attribute(congested_read_threshold_us);
92rw_attribute(congested_write_threshold_us); 94rw_attribute(congested_write_threshold_us);
93 95
@@ -128,6 +130,7 @@ rw_attribute(expensive_debug_checks);
128rw_attribute(cache_replacement_policy); 130rw_attribute(cache_replacement_policy);
129rw_attribute(btree_shrinker_disabled); 131rw_attribute(btree_shrinker_disabled);
130rw_attribute(copy_gc_enabled); 132rw_attribute(copy_gc_enabled);
133rw_attribute(gc_after_writeback);
131rw_attribute(size); 134rw_attribute(size);
132 135
133static ssize_t bch_snprint_string_list(char *buf, 136static ssize_t bch_snprint_string_list(char *buf,
@@ -264,7 +267,8 @@ STORE(__cached_dev)
264 d_strtoul(writeback_running); 267 d_strtoul(writeback_running);
265 d_strtoul(writeback_delay); 268 d_strtoul(writeback_delay);
266 269
267 sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); 270 sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
271 0, bch_cutoff_writeback);
268 272
269 if (attr == &sysfs_writeback_rate) { 273 if (attr == &sysfs_writeback_rate) {
270 ssize_t ret; 274 ssize_t ret;
@@ -384,8 +388,25 @@ STORE(bch_cached_dev)
384 mutex_lock(&bch_register_lock); 388 mutex_lock(&bch_register_lock);
385 size = __cached_dev_store(kobj, attr, buf, size); 389 size = __cached_dev_store(kobj, attr, buf, size);
386 390
387 if (attr == &sysfs_writeback_running) 391 if (attr == &sysfs_writeback_running) {
388 bch_writeback_queue(dc); 392 /* dc->writeback_running changed in __cached_dev_store() */
393 if (IS_ERR_OR_NULL(dc->writeback_thread)) {
394 /*
395 * Reject setting it to 1 via sysfs if the writeback
396 * kthread has not been created yet.
397 */
398 if (dc->writeback_running) {
399 dc->writeback_running = false;
400 pr_err("%s: failed to run non-existent writeback thread",
401 dc->disk.disk->disk_name);
402 }
403 } else
404 /*
405 * The writeback kthread will check whether
406 * dc->writeback_running is true or false.
407 */
408 bch_writeback_queue(dc);
409 }
389 410
390 if (attr == &sysfs_writeback_percent) 411 if (attr == &sysfs_writeback_percent)
391 if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) 412 if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
@@ -668,6 +689,9 @@ SHOW(__bch_cache_set)
668 sysfs_print(congested_write_threshold_us, 689 sysfs_print(congested_write_threshold_us,
669 c->congested_write_threshold_us); 690 c->congested_write_threshold_us);
670 691
692 sysfs_print(cutoff_writeback, bch_cutoff_writeback);
693 sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
694
671 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); 695 sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
672 sysfs_printf(verify, "%i", c->verify); 696 sysfs_printf(verify, "%i", c->verify);
673 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); 697 sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
@@ -676,6 +700,7 @@ SHOW(__bch_cache_set)
676 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); 700 sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
677 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); 701 sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
678 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); 702 sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
703 sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
679 sysfs_printf(io_disable, "%i", 704 sysfs_printf(io_disable, "%i",
680 test_bit(CACHE_SET_IO_DISABLE, &c->flags)); 705 test_bit(CACHE_SET_IO_DISABLE, &c->flags));
681 706
@@ -725,21 +750,8 @@ STORE(__bch_cache_set)
725 bch_cache_accounting_clear(&c->accounting); 750 bch_cache_accounting_clear(&c->accounting);
726 } 751 }
727 752
728 if (attr == &sysfs_trigger_gc) { 753 if (attr == &sysfs_trigger_gc)
729 /* 754 force_wake_up_gc(c);
730 * Garbage collection thread only works when sectors_to_gc < 0,
731 * when users write to sysfs entry trigger_gc, most of time
732 * they want to forcibly triger gargage collection. Here -1 is
733 * set to c->sectors_to_gc, to make gc_should_run() give a
734 * chance to permit gc thread to run. "give a chance" means
735 * before going into gc_should_run(), there is still chance
736 * that c->sectors_to_gc being set to other positive value. So
737 * writing sysfs entry trigger_gc won't always make sure gc
738 * thread takes effect.
739 */
740 atomic_set(&c->sectors_to_gc, -1);
741 wake_up_gc(c);
742 }
743 755
744 if (attr == &sysfs_prune_cache) { 756 if (attr == &sysfs_prune_cache) {
745 struct shrink_control sc; 757 struct shrink_control sc;
@@ -789,6 +801,12 @@ STORE(__bch_cache_set)
789 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); 801 sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
790 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); 802 sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
791 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); 803 sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
804 /*
805 * Writing gc_after_writeback here may overwrite an already-set
806 * BCH_DO_AUTO_GC; it doesn't matter because the flag will be
807 * set again at the next opportunity.
808 */
809 sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
792 810
793 return size; 811 return size;
794} 812}
@@ -869,7 +887,10 @@ static struct attribute *bch_cache_set_internal_files[] = {
869 &sysfs_gc_always_rewrite, 887 &sysfs_gc_always_rewrite,
870 &sysfs_btree_shrinker_disabled, 888 &sysfs_btree_shrinker_disabled,
871 &sysfs_copy_gc_enabled, 889 &sysfs_copy_gc_enabled,
890 &sysfs_gc_after_writeback,
872 &sysfs_io_disable, 891 &sysfs_io_disable,
892 &sysfs_cutoff_writeback,
893 &sysfs_cutoff_writeback_sync,
873 NULL 894 NULL
874}; 895};
875KTYPE(bch_cache_set_internal); 896KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 08c3a9f9676c..73f0efac2b9f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
17#include <linux/sched/clock.h> 17#include <linux/sched/clock.h>
18#include <trace/events/bcache.h> 18#include <trace/events/bcache.h>
19 19
20static void update_gc_after_writeback(struct cache_set *c)
21{
22 if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
23 c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
24 return;
25
26 c->gc_after_writeback |= BCH_DO_AUTO_GC;
27}
28
20/* Rate limiting */ 29/* Rate limiting */
21static uint64_t __calc_target_rate(struct cached_dev *dc) 30static uint64_t __calc_target_rate(struct cached_dev *dc)
22{ 31{
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
191 if (!set_at_max_writeback_rate(c, dc)) { 200 if (!set_at_max_writeback_rate(c, dc)) {
192 down_read(&dc->writeback_lock); 201 down_read(&dc->writeback_lock);
193 __update_writeback_rate(dc); 202 __update_writeback_rate(dc);
203 update_gc_after_writeback(c);
194 up_read(&dc->writeback_lock); 204 up_read(&dc->writeback_lock);
195 } 205 }
196 } 206 }
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
689 up_write(&dc->writeback_lock); 699 up_write(&dc->writeback_lock);
690 break; 700 break;
691 } 701 }
702
703 /*
704 * When the dirty data rate is high (e.g. 50%+), there might
705 * be heavy bucket fragmentation after writeback
706 * finishes, which hurts subsequent write performance.
707 * If users really care about write performance they
708 * may set BCH_ENABLE_AUTO_GC via sysfs; then, when
709 * BCH_DO_AUTO_GC is set, the garbage collection thread
710 * is woken up here. After the moving gc, the shrunk
711 * btree and the SSD space freed by discarded buckets may
712 * be helpful for subsequent write requests.
713 */
714 if (c->gc_after_writeback ==
715 (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
716 c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
717 force_wake_up_gc(c);
718 }
692 } 719 }
693 720
694 up_write(&dc->writeback_lock); 721 up_write(&dc->writeback_lock);
@@ -777,7 +804,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
777 bch_keybuf_init(&dc->writeback_keys); 804 bch_keybuf_init(&dc->writeback_keys);
778 805
779 dc->writeback_metadata = true; 806 dc->writeback_metadata = true;
780 dc->writeback_running = true; 807 dc->writeback_running = false;
781 dc->writeback_percent = 10; 808 dc->writeback_percent = 10;
782 dc->writeback_delay = 30; 809 dc->writeback_delay = 30;
783 atomic_long_set(&dc->writeback_rate.rate, 1024); 810 atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -805,6 +832,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
805 cached_dev_put(dc); 832 cached_dev_put(dc);
806 return PTR_ERR(dc->writeback_thread); 833 return PTR_ERR(dc->writeback_thread);
807 } 834 }
835 dc->writeback_running = true;
808 836
809 WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); 837 WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
810 schedule_delayed_work(&dc->writeback_rate_update, 838 schedule_delayed_work(&dc->writeback_rate_update,
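
Putting the writeback.c changes together: the rate-update path arms the "do gc" bit when the cache is dirty enough, and the writeback thread clears it and wakes gc once writeback drains. A compact, self-contained sketch of the arming side follows; the EX_* names mirror the flags and threshold from the patch, everything else is illustrative.

#define EX_ENABLE_AUTO_GC       1       /* mirrors BCH_ENABLE_AUTO_GC */
#define EX_DO_AUTO_GC           2       /* mirrors BCH_DO_AUTO_GC */
#define EX_DIRTY_THRESHOLD      50      /* mirrors BCH_AUTO_GC_DIRTY_THRESHOLD */

/* Hedged sketch: called from the writeback rate-update path; arms the
 * "do gc" bit only when the user has opted in and in_use is high enough.
 */
static void arm_auto_gc(unsigned char *flags, unsigned int in_use_percent)
{
        if (*flags != EX_ENABLE_AUTO_GC || in_use_percent < EX_DIRTY_THRESHOLD)
                return;

        *flags |= EX_DO_AUTO_GC;        /* gc fires once writeback finishes */
}
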
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index d2b9fdbc8994..6a743d3bb338 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,12 +5,17 @@
5#define CUTOFF_WRITEBACK 40 5#define CUTOFF_WRITEBACK 40
6#define CUTOFF_WRITEBACK_SYNC 70 6#define CUTOFF_WRITEBACK_SYNC 70
7 7
8#define CUTOFF_WRITEBACK_MAX 70
9#define CUTOFF_WRITEBACK_SYNC_MAX 90
10
8#define MAX_WRITEBACKS_IN_PASS 5 11#define MAX_WRITEBACKS_IN_PASS 5
9#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ 12#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
10 13
11#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 14#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
12#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 15#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
13 16
17#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
18
14/* 19/*
15 * 14 (16384ths) is chosen here as something that each backing device 20 * 14 (16384ths) is chosen here as something that each backing device
16 * should be a reasonable fraction of the share, and not to blow up 21 * should be a reasonable fraction of the share, and not to blow up
@@ -53,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
53 } 58 }
54} 59}
55 60
61extern unsigned int bch_cutoff_writeback;
62extern unsigned int bch_cutoff_writeback_sync;
63
56static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, 64static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
57 unsigned int cache_mode, bool would_skip) 65 unsigned int cache_mode, bool would_skip)
58{ 66{
@@ -60,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
60 68
61 if (cache_mode != CACHE_MODE_WRITEBACK || 69 if (cache_mode != CACHE_MODE_WRITEBACK ||
62 test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || 70 test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
63 in_use > CUTOFF_WRITEBACK_SYNC) 71 in_use > bch_cutoff_writeback_sync)
64 return false; 72 return false;
65 73
66 if (dc->partial_stripes_expensive && 74 if (dc->partial_stripes_expensive &&
@@ -73,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
73 81
74 return (op_is_sync(bio->bi_opf) || 82 return (op_is_sync(bio->bi_opf) ||
75 bio->bi_opf & (REQ_META|REQ_PRIO) || 83 bio->bi_opf & (REQ_META|REQ_PRIO) ||
76 in_use <= CUTOFF_WRITEBACK); 84 in_use <= bch_cutoff_writeback);
77} 85}
78 86
79static inline void bch_writeback_queue(struct cached_dev *dc) 87static inline void bch_writeback_queue(struct cached_dev *dc)
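
Taken together, the writeback thresholds now form a chain: the sysfs change earlier in this series clamps writeback_percent to bch_cutoff_writeback, and check_module_parameters() keeps bch_cutoff_writeback at or below bch_cutoff_writeback_sync, which is itself capped at CUTOFF_WRITEBACK_SYNC_MAX. A tiny sketch of the resulting ordering, assuming the values have already passed through check_module_parameters(); the function name below is illustrative.

#include <assert.h>

/* Hedged sketch: ordering expected after check_module_parameters(). */
static void check_cutoff_ordering(unsigned int cutoff_writeback,
                                  unsigned int cutoff_writeback_sync)
{
        assert(cutoff_writeback <= 70);                 /* CUTOFF_WRITEBACK_MAX */
        assert(cutoff_writeback_sync <= 90);            /* CUTOFF_WRITEBACK_SYNC_MAX */
        assert(cutoff_writeback <= cutoff_writeback_sync);
}
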
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 224d44503a06..95c6d86ab5e8 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -65,7 +65,6 @@ struct mapped_device {
65 */ 65 */
66 struct work_struct work; 66 struct work_struct work;
67 wait_queue_head_t wait; 67 wait_queue_head_t wait;
68 atomic_t pending[2];
69 spinlock_t deferred_lock; 68 spinlock_t deferred_lock;
70 struct bio_list deferred; 69 struct bio_list deferred;
71 70
@@ -107,9 +106,6 @@ struct mapped_device {
107 106
108 struct block_device *bdev; 107 struct block_device *bdev;
109 108
110 /* zero-length flush that will be cloned and submitted to targets */
111 struct bio flush_bio;
112
113 struct dm_stats stats; 109 struct dm_stats stats;
114 110
115 /* for blk-mq request-based DM support */ 111 /* for blk-mq request-based DM support */
@@ -119,7 +115,6 @@ struct mapped_device {
119 struct srcu_struct io_barrier; 115 struct srcu_struct io_barrier;
120}; 116};
121 117
122int md_in_flight(struct mapped_device *md);
123void disable_write_same(struct mapped_device *md); 118void disable_write_same(struct mapped_device *md);
124void disable_write_zeroes(struct mapped_device *md); 119void disable_write_zeroes(struct mapped_device *md);
125 120
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7cd36e4d1310..4e06be4f0a62 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
43 43
44int dm_request_based(struct mapped_device *md) 44int dm_request_based(struct mapped_device *md)
45{ 45{
46 return queue_is_rq_based(md->queue); 46 return queue_is_mq(md->queue);
47} 47}
48 48
49void dm_start_queue(struct request_queue *q) 49void dm_start_queue(struct request_queue *q)
@@ -130,10 +130,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
130 */ 130 */
131static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 131static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
132{ 132{
133 atomic_dec(&md->pending[rw]);
134
135 /* nudge anyone waiting on suspend queue */ 133 /* nudge anyone waiting on suspend queue */
136 if (!md_in_flight(md)) 134 if (unlikely(waitqueue_active(&md->wait)))
137 wake_up(&md->wait); 135 wake_up(&md->wait);
138 136
139 /* 137 /*
@@ -436,7 +434,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
436static void dm_start_request(struct mapped_device *md, struct request *orig) 434static void dm_start_request(struct mapped_device *md, struct request *orig)
437{ 435{
438 blk_mq_start_request(orig); 436 blk_mq_start_request(orig);
439 atomic_inc(&md->pending[rq_data_dir(orig)]);
440 437
441 if (unlikely(dm_stats_used(&md->stats))) { 438 if (unlikely(dm_stats_used(&md->stats))) {
442 struct dm_rq_target_io *tio = tio_from_request(orig); 439 struct dm_rq_target_io *tio = tio_from_request(orig);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9038c302d5c2..844f7d0f2ef8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
919 struct request_queue *q = bdev_get_queue(dev->bdev); 919 struct request_queue *q = bdev_get_queue(dev->bdev);
920 struct verify_rq_based_data *v = data; 920 struct verify_rq_based_data *v = data;
921 921
922 if (q->mq_ops) 922 if (queue_is_mq(q))
923 v->mq_count++; 923 v->mq_count++;
924 else 924 else
925 v->sq_count++; 925 v->sq_count++;
926 926
927 return queue_is_rq_based(q); 927 return queue_is_mq(q);
928} 928}
929 929
930static int dm_table_determine_type(struct dm_table *t) 930static int dm_table_determine_type(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 63a7c416b224..a4a06982ed91 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio)
646 bio_put(&tio->clone); 646 bio_put(&tio->clone);
647} 647}
648 648
649int md_in_flight(struct mapped_device *md) 649static bool md_in_flight_bios(struct mapped_device *md)
650{ 650{
651 return atomic_read(&md->pending[READ]) + 651 int cpu;
652 atomic_read(&md->pending[WRITE]); 652 struct hd_struct *part = &dm_disk(md)->part0;
653 long sum = 0;
654
655 for_each_possible_cpu(cpu) {
656 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
657 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
658 }
659
660 return sum != 0;
661}
662
663static bool md_in_flight(struct mapped_device *md)
664{
665 if (queue_is_mq(md->queue))
666 return blk_mq_queue_inflight(md->queue);
667 else
668 return md_in_flight_bios(md);
653} 669}
654 670
655static void start_io_acct(struct dm_io *io) 671static void start_io_acct(struct dm_io *io)
656{ 672{
657 struct mapped_device *md = io->md; 673 struct mapped_device *md = io->md;
658 struct bio *bio = io->orig_bio; 674 struct bio *bio = io->orig_bio;
659 int rw = bio_data_dir(bio);
660 675
661 io->start_time = jiffies; 676 io->start_time = jiffies;
662 677
663 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio), 678 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
664 &dm_disk(md)->part0); 679 &dm_disk(md)->part0);
665 680
666 atomic_set(&dm_disk(md)->part0.in_flight[rw],
667 atomic_inc_return(&md->pending[rw]));
668
669 if (unlikely(dm_stats_used(&md->stats))) 681 if (unlikely(dm_stats_used(&md->stats)))
670 dm_stats_account_io(&md->stats, bio_data_dir(bio), 682 dm_stats_account_io(&md->stats, bio_data_dir(bio),
671 bio->bi_iter.bi_sector, bio_sectors(bio), 683 bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io)
677 struct mapped_device *md = io->md; 689 struct mapped_device *md = io->md;
678 struct bio *bio = io->orig_bio; 690 struct bio *bio = io->orig_bio;
679 unsigned long duration = jiffies - io->start_time; 691 unsigned long duration = jiffies - io->start_time;
680 int pending;
681 int rw = bio_data_dir(bio);
682 692
683 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, 693 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
684 io->start_time); 694 io->start_time);
@@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io)
688 bio->bi_iter.bi_sector, bio_sectors(bio), 698 bio->bi_iter.bi_sector, bio_sectors(bio),
689 true, duration, &io->stats_aux); 699 true, duration, &io->stats_aux);
690 700
691 /*
692 * After this is decremented the bio must not be touched if it is
693 * a flush.
694 */
695 pending = atomic_dec_return(&md->pending[rw]);
696 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
697 pending += atomic_read(&md->pending[rw^0x1]);
698
699 /* nudge anyone waiting on suspend queue */ 701 /* nudge anyone waiting on suspend queue */
700 if (!pending) 702 if (unlikely(waitqueue_active(&md->wait)))
701 wake_up(&md->wait); 703 wake_up(&md->wait);
702} 704}
703 705
@@ -1417,10 +1419,21 @@ static int __send_empty_flush(struct clone_info *ci)
1417 unsigned target_nr = 0; 1419 unsigned target_nr = 0;
1418 struct dm_target *ti; 1420 struct dm_target *ti;
1419 1421
1422 /*
1423 * Empty flush uses a statically initialized bio, as the base for
1424 * cloning. However, blkg association requires that a bdev is
1425 * associated with a gendisk, which doesn't happen until the bdev is
1426 * opened. So, blkg association is done at issue time of the flush
1427 * rather than when the device is created in alloc_dev().
1428 */
1429 bio_set_dev(ci->bio, ci->io->md->bdev);
1430
1420 BUG_ON(bio_has_data(ci->bio)); 1431 BUG_ON(bio_has_data(ci->bio));
1421 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1432 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1422 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1433 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1423 1434
1435 bio_disassociate_blkg(ci->bio);
1436
1424 return 0; 1437 return 0;
1425} 1438}
1426 1439
@@ -1598,7 +1611,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1598 init_clone_info(&ci, md, map, bio); 1611 init_clone_info(&ci, md, map, bio);
1599 1612
1600 if (bio->bi_opf & REQ_PREFLUSH) { 1613 if (bio->bi_opf & REQ_PREFLUSH) {
1601 ci.bio = &ci.io->md->flush_bio; 1614 struct bio flush_bio;
1615
1616 /*
1617 * Use an on-stack bio for this; it's safe since we don't
1618 * need to reference it after submit. It's just used as
1619 * the basis for the clone(s).
1620 */
1621 bio_init(&flush_bio, NULL, 0);
1622 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1623 ci.bio = &flush_bio;
1602 ci.sector_count = 0; 1624 ci.sector_count = 0;
1603 error = __send_empty_flush(&ci); 1625 error = __send_empty_flush(&ci);
1604 /* dec_pending submits any data associated with flush */ 1626 /* dec_pending submits any data associated with flush */
@@ -1654,7 +1676,16 @@ static blk_qc_t __process_bio(struct mapped_device *md,
1654 init_clone_info(&ci, md, map, bio); 1676 init_clone_info(&ci, md, map, bio);
1655 1677
1656 if (bio->bi_opf & REQ_PREFLUSH) { 1678 if (bio->bi_opf & REQ_PREFLUSH) {
1657 ci.bio = &ci.io->md->flush_bio; 1679 struct bio flush_bio;
1680
1681 /*
1682 * Use an on-stack bio for this; it's safe since we don't
1683 * need to reference it after submit. It's just used as
1684 * the basis for the clone(s).
1685 */
1686 bio_init(&flush_bio, NULL, 0);
1687 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1688 ci.bio = &flush_bio;
1658 ci.sector_count = 0; 1689 ci.sector_count = 0;
1659 error = __send_empty_flush(&ci); 1690 error = __send_empty_flush(&ci);
1660 /* dec_pending submits any data associated with flush */ 1691 /* dec_pending submits any data associated with flush */
@@ -1898,7 +1929,7 @@ static struct mapped_device *alloc_dev(int minor)
1898 INIT_LIST_HEAD(&md->table_devices); 1929 INIT_LIST_HEAD(&md->table_devices);
1899 spin_lock_init(&md->uevent_lock); 1930 spin_lock_init(&md->uevent_lock);
1900 1931
1901 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL); 1932 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1902 if (!md->queue) 1933 if (!md->queue)
1903 goto bad; 1934 goto bad;
1904 md->queue->queuedata = md; 1935 md->queue->queuedata = md;
@@ -1908,8 +1939,6 @@ static struct mapped_device *alloc_dev(int minor)
1908 if (!md->disk) 1939 if (!md->disk)
1909 goto bad; 1940 goto bad;
1910 1941
1911 atomic_set(&md->pending[0], 0);
1912 atomic_set(&md->pending[1], 0);
1913 init_waitqueue_head(&md->wait); 1942 init_waitqueue_head(&md->wait);
1914 INIT_WORK(&md->work, dm_wq_work); 1943 INIT_WORK(&md->work, dm_wq_work);
1915 init_waitqueue_head(&md->eventq); 1944 init_waitqueue_head(&md->eventq);
@@ -1940,10 +1969,6 @@ static struct mapped_device *alloc_dev(int minor)
1940 if (!md->bdev) 1969 if (!md->bdev)
1941 goto bad; 1970 goto bad;
1942 1971
1943 bio_init(&md->flush_bio, NULL, 0);
1944 bio_set_dev(&md->flush_bio, md->bdev);
1945 md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1946
1947 dm_stats_init(&md->stats); 1972 dm_stats_init(&md->stats);
1948 1973
1949 /* Populate the mapping, nobody knows we exist yet */ 1974 /* Populate the mapping, nobody knows we exist yet */
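
The md_in_flight_bios() change in dm.c above replaces a pair of global atomics with a sum over per-CPU counters (part_stat_local_read_cpu() across for_each_possible_cpu()). A minimal user-space analogue of that pattern follows, assuming a fixed CPU count; all names are illustrative.

#include <stdbool.h>

#define EXAMPLE_NR_CPUS 4

/* Hedged sketch: each CPU increments its own slot on submit and decrements
 * it on completion; "anything in flight?" is a cheap, slightly racy sum.
 */
struct example_inflight {
        long read[EXAMPLE_NR_CPUS];
        long write[EXAMPLE_NR_CPUS];
};

static bool example_in_flight(const struct example_inflight *s)
{
        long sum = 0;
        int cpu;

        for (cpu = 0; cpu < EXAMPLE_NR_CPUS; cpu++)
                sum += s->read[cpu] + s->write[cpu];

        return sum != 0;
}
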
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fc488cb30a94..9a0a1e0934d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
334 const int sgrp = op_stat_group(bio_op(bio)); 334 const int sgrp = op_stat_group(bio_op(bio));
335 struct mddev *mddev = q->queuedata; 335 struct mddev *mddev = q->queuedata;
336 unsigned int sectors; 336 unsigned int sectors;
337 int cpu;
338 337
339 blk_queue_split(q, &bio); 338 blk_queue_split(q, &bio);
340 339
@@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
359 358
360 md_handle_request(mddev, bio); 359 md_handle_request(mddev, bio);
361 360
362 cpu = part_stat_lock(); 361 part_stat_lock();
363 part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]); 362 part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
364 part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors); 363 part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
365 part_stat_unlock(); 364 part_stat_unlock();
366 365
367 return BLK_QC_T_NONE; 366 return BLK_QC_T_NONE;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ac1cffd2a09b..f3fb5bb8c82a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
542 !discard_bio) 542 !discard_bio)
543 continue; 543 continue;
544 bio_chain(discard_bio, bio); 544 bio_chain(discard_bio, bio);
545 bio_clone_blkcg_association(discard_bio, bio); 545 bio_clone_blkg_association(discard_bio, bio);
546 if (mddev->gendisk) 546 if (mddev->gendisk)
547 trace_block_bio_remap(bdev_get_queue(rdev->bdev), 547 trace_block_bio_remap(bdev_get_queue(rdev->bdev),
548 discard_bio, disk_devt(mddev->gendisk), 548 discard_bio, disk_devt(mddev->gendisk),
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 8a02f11076f9..82daccc9ea62 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -15,7 +15,7 @@
15#define pr_fmt(fmt) DRIVER_NAME ": " fmt 15#define pr_fmt(fmt) DRIVER_NAME ": " fmt
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/blkdev.h> 18#include <linux/blk-mq.h>
19#include <linux/memstick.h> 19#include <linux/memstick.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/hdreg.h> 21#include <linux/hdreg.h>
@@ -1873,69 +1873,65 @@ static void msb_io_work(struct work_struct *work)
1873 struct msb_data *msb = container_of(work, struct msb_data, io_work); 1873 struct msb_data *msb = container_of(work, struct msb_data, io_work);
1874 int page, error, len; 1874 int page, error, len;
1875 sector_t lba; 1875 sector_t lba;
1876 unsigned long flags;
1877 struct scatterlist *sg = msb->prealloc_sg; 1876 struct scatterlist *sg = msb->prealloc_sg;
1877 struct request *req;
1878 1878
1879 dbg_verbose("IO: work started"); 1879 dbg_verbose("IO: work started");
1880 1880
1881 while (1) { 1881 while (1) {
1882 spin_lock_irqsave(&msb->q_lock, flags); 1882 spin_lock_irq(&msb->q_lock);
1883 1883
1884 if (msb->need_flush_cache) { 1884 if (msb->need_flush_cache) {
1885 msb->need_flush_cache = false; 1885 msb->need_flush_cache = false;
1886 spin_unlock_irqrestore(&msb->q_lock, flags); 1886 spin_unlock_irq(&msb->q_lock);
1887 msb_cache_flush(msb); 1887 msb_cache_flush(msb);
1888 continue; 1888 continue;
1889 } 1889 }
1890 1890
1891 if (!msb->req) { 1891 req = msb->req;
1892 msb->req = blk_fetch_request(msb->queue); 1892 if (!req) {
1893 if (!msb->req) { 1893 dbg_verbose("IO: no more requests exiting");
1894 dbg_verbose("IO: no more requests exiting"); 1894 spin_unlock_irq(&msb->q_lock);
1895 spin_unlock_irqrestore(&msb->q_lock, flags); 1895 return;
1896 return;
1897 }
1898 } 1896 }
1899 1897
1900 spin_unlock_irqrestore(&msb->q_lock, flags); 1898 spin_unlock_irq(&msb->q_lock);
1901
1902 /* If card was removed meanwhile */
1903 if (!msb->req)
1904 return;
1905 1899
1906 /* process the request */ 1900 /* process the request */
1907 dbg_verbose("IO: processing new request"); 1901 dbg_verbose("IO: processing new request");
1908 blk_rq_map_sg(msb->queue, msb->req, sg); 1902 blk_rq_map_sg(msb->queue, req, sg);
1909 1903
1910 lba = blk_rq_pos(msb->req); 1904 lba = blk_rq_pos(req);
1911 1905
1912 sector_div(lba, msb->page_size / 512); 1906 sector_div(lba, msb->page_size / 512);
1913 page = sector_div(lba, msb->pages_in_block); 1907 page = sector_div(lba, msb->pages_in_block);
1914 1908
1915 if (rq_data_dir(msb->req) == READ) 1909 if (rq_data_dir(msb->req) == READ)
1916 error = msb_do_read_request(msb, lba, page, sg, 1910 error = msb_do_read_request(msb, lba, page, sg,
1917 blk_rq_bytes(msb->req), &len); 1911 blk_rq_bytes(req), &len);
1918 else 1912 else
1919 error = msb_do_write_request(msb, lba, page, sg, 1913 error = msb_do_write_request(msb, lba, page, sg,
1920 blk_rq_bytes(msb->req), &len); 1914 blk_rq_bytes(req), &len);
1921
1922 spin_lock_irqsave(&msb->q_lock, flags);
1923 1915
1924 if (len) 1916 if (len && !blk_update_request(req, BLK_STS_OK, len)) {
1925 if (!__blk_end_request(msb->req, BLK_STS_OK, len)) 1917 __blk_mq_end_request(req, BLK_STS_OK);
1926 msb->req = NULL; 1918 spin_lock_irq(&msb->q_lock);
1919 msb->req = NULL;
1920 spin_unlock_irq(&msb->q_lock);
1921 }
1927 1922
1928 if (error && msb->req) { 1923 if (error && msb->req) {
1929 blk_status_t ret = errno_to_blk_status(error); 1924 blk_status_t ret = errno_to_blk_status(error);
1925
1930 dbg_verbose("IO: ending one sector of the request with error"); 1926 dbg_verbose("IO: ending one sector of the request with error");
1931 if (!__blk_end_request(msb->req, ret, msb->page_size)) 1927 blk_mq_end_request(req, ret);
1932 msb->req = NULL; 1928 spin_lock_irq(&msb->q_lock);
1929 msb->req = NULL;
1930 spin_unlock_irq(&msb->q_lock);
1933 } 1931 }
1934 1932
1935 if (msb->req) 1933 if (msb->req)
1936 dbg_verbose("IO: request still pending"); 1934 dbg_verbose("IO: request still pending");
1937
1938 spin_unlock_irqrestore(&msb->q_lock, flags);
1939 } 1935 }
1940} 1936}
1941 1937
@@ -2002,29 +1998,40 @@ static int msb_bd_getgeo(struct block_device *bdev,
2002 return 0; 1998 return 0;
2003} 1999}
2004 2000
2005static void msb_submit_req(struct request_queue *q) 2001static blk_status_t msb_queue_rq(struct blk_mq_hw_ctx *hctx,
2002 const struct blk_mq_queue_data *bd)
2006{ 2003{
2007 struct memstick_dev *card = q->queuedata; 2004 struct memstick_dev *card = hctx->queue->queuedata;
2008 struct msb_data *msb = memstick_get_drvdata(card); 2005 struct msb_data *msb = memstick_get_drvdata(card);
2009 struct request *req = NULL; 2006 struct request *req = bd->rq;
2010 2007
2011 dbg_verbose("Submit request"); 2008 dbg_verbose("Submit request");
2012 2009
2010 spin_lock_irq(&msb->q_lock);
2011
2013 if (msb->card_dead) { 2012 if (msb->card_dead) {
2014 dbg("Refusing requests on removed card"); 2013 dbg("Refusing requests on removed card");
2015 2014
2016 WARN_ON(!msb->io_queue_stopped); 2015 WARN_ON(!msb->io_queue_stopped);
2017 2016
2018 while ((req = blk_fetch_request(q)) != NULL) 2017 spin_unlock_irq(&msb->q_lock);
2019 __blk_end_request_all(req, BLK_STS_IOERR); 2018 blk_mq_start_request(req);
2020 return; 2019 return BLK_STS_IOERR;
2021 } 2020 }
2022 2021
2023 if (msb->req) 2022 if (msb->req) {
2024 return; 2023 spin_unlock_irq(&msb->q_lock);
2024 return BLK_STS_DEV_RESOURCE;
2025 }
2026
2027 blk_mq_start_request(req);
2028 msb->req = req;
2025 2029
2026 if (!msb->io_queue_stopped) 2030 if (!msb->io_queue_stopped)
2027 queue_work(msb->io_queue, &msb->io_work); 2031 queue_work(msb->io_queue, &msb->io_work);
2032
2033 spin_unlock_irq(&msb->q_lock);
2034 return BLK_STS_OK;
2028} 2035}
2029 2036
2030static int msb_check_card(struct memstick_dev *card) 2037static int msb_check_card(struct memstick_dev *card)
@@ -2040,21 +2047,20 @@ static void msb_stop(struct memstick_dev *card)
2040 2047
2041 dbg("Stopping all msblock IO"); 2048 dbg("Stopping all msblock IO");
2042 2049
2050 blk_mq_stop_hw_queues(msb->queue);
2043 spin_lock_irqsave(&msb->q_lock, flags); 2051 spin_lock_irqsave(&msb->q_lock, flags);
2044 blk_stop_queue(msb->queue);
2045 msb->io_queue_stopped = true; 2052 msb->io_queue_stopped = true;
2046 spin_unlock_irqrestore(&msb->q_lock, flags); 2053 spin_unlock_irqrestore(&msb->q_lock, flags);
2047 2054
2048 del_timer_sync(&msb->cache_flush_timer); 2055 del_timer_sync(&msb->cache_flush_timer);
2049 flush_workqueue(msb->io_queue); 2056 flush_workqueue(msb->io_queue);
2050 2057
2058 spin_lock_irqsave(&msb->q_lock, flags);
2051 if (msb->req) { 2059 if (msb->req) {
2052 spin_lock_irqsave(&msb->q_lock, flags); 2060 blk_mq_requeue_request(msb->req, false);
2053 blk_requeue_request(msb->queue, msb->req);
2054 msb->req = NULL; 2061 msb->req = NULL;
2055 spin_unlock_irqrestore(&msb->q_lock, flags);
2056 } 2062 }
2057 2063 spin_unlock_irqrestore(&msb->q_lock, flags);
2058} 2064}
2059 2065
2060static void msb_start(struct memstick_dev *card) 2066static void msb_start(struct memstick_dev *card)
@@ -2077,9 +2083,7 @@ static void msb_start(struct memstick_dev *card)
2077 msb->need_flush_cache = true; 2083 msb->need_flush_cache = true;
2078 msb->io_queue_stopped = false; 2084 msb->io_queue_stopped = false;
2079 2085
2080 spin_lock_irqsave(&msb->q_lock, flags); 2086 blk_mq_start_hw_queues(msb->queue);
2081 blk_start_queue(msb->queue);
2082 spin_unlock_irqrestore(&msb->q_lock, flags);
2083 2087
2084 queue_work(msb->io_queue, &msb->io_work); 2088 queue_work(msb->io_queue, &msb->io_work);
2085 2089
@@ -2092,6 +2096,10 @@ static const struct block_device_operations msb_bdops = {
2092 .owner = THIS_MODULE 2096 .owner = THIS_MODULE
2093}; 2097};
2094 2098
2099static const struct blk_mq_ops msb_mq_ops = {
2100 .queue_rq = msb_queue_rq,
2101};
2102
2095/* Registers the block device */ 2103/* Registers the block device */
2096static int msb_init_disk(struct memstick_dev *card) 2104static int msb_init_disk(struct memstick_dev *card)
2097{ 2105{
@@ -2112,9 +2120,11 @@ static int msb_init_disk(struct memstick_dev *card)
2112 goto out_release_id; 2120 goto out_release_id;
2113 } 2121 }
2114 2122
2115 msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock); 2123 msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &msb_mq_ops, 2,
2116 if (!msb->queue) { 2124 BLK_MQ_F_SHOULD_MERGE);
2117 rc = -ENOMEM; 2125 if (IS_ERR(msb->queue)) {
2126 rc = PTR_ERR(msb->queue);
2127 msb->queue = NULL;
2118 goto out_put_disk; 2128 goto out_put_disk;
2119 } 2129 }
2120 2130
@@ -2202,12 +2212,13 @@ static void msb_remove(struct memstick_dev *card)
2202 /* Take care of unhandled + new requests from now on */ 2212 /* Take care of unhandled + new requests from now on */
2203 spin_lock_irqsave(&msb->q_lock, flags); 2213 spin_lock_irqsave(&msb->q_lock, flags);
2204 msb->card_dead = true; 2214 msb->card_dead = true;
2205 blk_start_queue(msb->queue);
2206 spin_unlock_irqrestore(&msb->q_lock, flags); 2215 spin_unlock_irqrestore(&msb->q_lock, flags);
2216 blk_mq_start_hw_queues(msb->queue);
2207 2217
2208 /* Remove the disk */ 2218 /* Remove the disk */
2209 del_gendisk(msb->disk); 2219 del_gendisk(msb->disk);
2210 blk_cleanup_queue(msb->queue); 2220 blk_cleanup_queue(msb->queue);
2221 blk_mq_free_tag_set(&msb->tag_set);
2211 msb->queue = NULL; 2222 msb->queue = NULL;
2212 2223
2213 mutex_lock(&msb_disk_lock); 2224 mutex_lock(&msb_disk_lock);
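
Drivers following the same legacy-to-blk-mq conversion as ms_block above reduce to a tag set plus a .queue_rq handler that hands each request to a worker. A stripped-down sketch of that shape follows; the mydev_* names are illustrative, while blk_mq_init_sq_queue(), blk_mq_start_request() and the status codes are the same API calls used in the patch.

#include <linux/blk-mq.h>
#include <linux/err.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

/* Hedged sketch of a single-hw-queue blk-mq conversion. */
struct mydev {
        spinlock_t lock;
        struct request *req;            /* at most one request in flight */
        struct work_struct work;        /* worker completes dev->req */
        struct request_queue *queue;
        struct blk_mq_tag_set tag_set;
};

static blk_status_t mydev_queue_rq(struct blk_mq_hw_ctx *hctx,
                                   const struct blk_mq_queue_data *bd)
{
        struct mydev *dev = hctx->queue->queuedata;

        spin_lock_irq(&dev->lock);
        if (dev->req) {
                /* busy: ask blk-mq to retry this request later */
                spin_unlock_irq(&dev->lock);
                return BLK_STS_DEV_RESOURCE;
        }
        blk_mq_start_request(bd->rq);
        dev->req = bd->rq;
        schedule_work(&dev->work);
        spin_unlock_irq(&dev->lock);
        return BLK_STS_OK;
}

static const struct blk_mq_ops mydev_mq_ops = {
        .queue_rq = mydev_queue_rq,
};

static int mydev_init_queue(struct mydev *dev)
{
        dev->queue = blk_mq_init_sq_queue(&dev->tag_set, &mydev_mq_ops, 2,
                                          BLK_MQ_F_SHOULD_MERGE);
        if (IS_ERR(dev->queue))
                return PTR_ERR(dev->queue);
        dev->queue->queuedata = dev;
        return 0;
}

Teardown mirrors the patch: del_gendisk(), blk_cleanup_queue(), then blk_mq_free_tag_set().
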
diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h
index 53962c3b21df..9ba84e0ced63 100644
--- a/drivers/memstick/core/ms_block.h
+++ b/drivers/memstick/core/ms_block.h
@@ -152,6 +152,7 @@ struct msb_data {
152 struct gendisk *disk; 152 struct gendisk *disk;
153 struct request_queue *queue; 153 struct request_queue *queue;
154 spinlock_t q_lock; 154 spinlock_t q_lock;
155 struct blk_mq_tag_set tag_set;
155 struct hd_geometry geometry; 156 struct hd_geometry geometry;
156 struct attribute_group attr_group; 157 struct attribute_group attr_group;
157 struct request *req; 158 struct request *req;
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 0cd30dcb6801..aba50ec98b4d 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -12,7 +12,7 @@
12 * 12 *
13 */ 13 */
14 14
15#include <linux/blkdev.h> 15#include <linux/blk-mq.h>
16#include <linux/idr.h> 16#include <linux/idr.h>
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18#include <linux/kthread.h> 18#include <linux/kthread.h>
@@ -142,6 +142,7 @@ struct mspro_block_data {
142 struct gendisk *disk; 142 struct gendisk *disk;
143 struct request_queue *queue; 143 struct request_queue *queue;
144 struct request *block_req; 144 struct request *block_req;
145 struct blk_mq_tag_set tag_set;
145 spinlock_t q_lock; 146 spinlock_t q_lock;
146 147
147 unsigned short page_size; 148 unsigned short page_size;
@@ -152,7 +153,6 @@ struct mspro_block_data {
152 unsigned char system; 153 unsigned char system;
153 unsigned char read_only:1, 154 unsigned char read_only:1,
154 eject:1, 155 eject:1,
155 has_request:1,
156 data_dir:1, 156 data_dir:1,
157 active:1; 157 active:1;
158 unsigned char transfer_cmd; 158 unsigned char transfer_cmd;
@@ -694,13 +694,12 @@ static void h_mspro_block_setup_cmd(struct memstick_dev *card, u64 offset,
694 694
695/*** Data transfer ***/ 695/*** Data transfer ***/
696 696
697static int mspro_block_issue_req(struct memstick_dev *card, int chunk) 697static int mspro_block_issue_req(struct memstick_dev *card, bool chunk)
698{ 698{
699 struct mspro_block_data *msb = memstick_get_drvdata(card); 699 struct mspro_block_data *msb = memstick_get_drvdata(card);
700 u64 t_off; 700 u64 t_off;
701 unsigned int count; 701 unsigned int count;
702 702
703try_again:
704 while (chunk) { 703 while (chunk) {
705 msb->current_page = 0; 704 msb->current_page = 0;
706 msb->current_seg = 0; 705 msb->current_seg = 0;
@@ -709,9 +708,17 @@ try_again:
709 msb->req_sg); 708 msb->req_sg);
710 709
711 if (!msb->seg_count) { 710 if (!msb->seg_count) {
712 chunk = __blk_end_request_cur(msb->block_req, 711 unsigned int bytes = blk_rq_cur_bytes(msb->block_req);
713 BLK_STS_RESOURCE); 712
714 continue; 713 chunk = blk_update_request(msb->block_req,
714 BLK_STS_RESOURCE,
715 bytes);
716 if (chunk)
717 continue;
718 __blk_mq_end_request(msb->block_req,
719 BLK_STS_RESOURCE);
720 msb->block_req = NULL;
721 break;
715 } 722 }
716 723
717 t_off = blk_rq_pos(msb->block_req); 724 t_off = blk_rq_pos(msb->block_req);
@@ -729,30 +736,22 @@ try_again:
729 return 0; 736 return 0;
730 } 737 }
731 738
732 dev_dbg(&card->dev, "blk_fetch\n"); 739 return 1;
733 msb->block_req = blk_fetch_request(msb->queue);
734 if (!msb->block_req) {
735 dev_dbg(&card->dev, "issue end\n");
736 return -EAGAIN;
737 }
738
739 dev_dbg(&card->dev, "trying again\n");
740 chunk = 1;
741 goto try_again;
742} 740}
743 741
744static int mspro_block_complete_req(struct memstick_dev *card, int error) 742static int mspro_block_complete_req(struct memstick_dev *card, int error)
745{ 743{
746 struct mspro_block_data *msb = memstick_get_drvdata(card); 744 struct mspro_block_data *msb = memstick_get_drvdata(card);
747 int chunk, cnt; 745 int cnt;
746 bool chunk;
748 unsigned int t_len = 0; 747 unsigned int t_len = 0;
749 unsigned long flags; 748 unsigned long flags;
750 749
751 spin_lock_irqsave(&msb->q_lock, flags); 750 spin_lock_irqsave(&msb->q_lock, flags);
752 dev_dbg(&card->dev, "complete %d, %d\n", msb->has_request ? 1 : 0, 751 dev_dbg(&card->dev, "complete %d, %d\n", msb->block_req ? 1 : 0,
753 error); 752 error);
754 753
755 if (msb->has_request) { 754 if (msb->block_req) {
756 /* Nothing to do - not really an error */ 755 /* Nothing to do - not really an error */
757 if (error == -EAGAIN) 756 if (error == -EAGAIN)
758 error = 0; 757 error = 0;
@@ -777,15 +776,17 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
777 if (error && !t_len) 776 if (error && !t_len)
778 t_len = blk_rq_cur_bytes(msb->block_req); 777 t_len = blk_rq_cur_bytes(msb->block_req);
779 778
780 chunk = __blk_end_request(msb->block_req, 779 chunk = blk_update_request(msb->block_req,
781 errno_to_blk_status(error), t_len); 780 errno_to_blk_status(error), t_len);
782 781 if (chunk) {
783 error = mspro_block_issue_req(card, chunk); 782 error = mspro_block_issue_req(card, chunk);
784 783 if (!error)
785 if (!error) 784 goto out;
786 goto out; 785 } else {
787 else 786 __blk_mq_end_request(msb->block_req,
788 msb->has_request = 0; 787 errno_to_blk_status(error));
788 msb->block_req = NULL;
789 }
789 } else { 790 } else {
790 if (!error) 791 if (!error)
791 error = -EAGAIN; 792 error = -EAGAIN;
@@ -806,8 +807,8 @@ static void mspro_block_stop(struct memstick_dev *card)
806 807
807 while (1) { 808 while (1) {
808 spin_lock_irqsave(&msb->q_lock, flags); 809 spin_lock_irqsave(&msb->q_lock, flags);
809 if (!msb->has_request) { 810 if (!msb->block_req) {
810 blk_stop_queue(msb->queue); 811 blk_mq_stop_hw_queues(msb->queue);
811 rc = 1; 812 rc = 1;
812 } 813 }
813 spin_unlock_irqrestore(&msb->q_lock, flags); 814 spin_unlock_irqrestore(&msb->q_lock, flags);
@@ -822,32 +823,37 @@ static void mspro_block_stop(struct memstick_dev *card)
822static void mspro_block_start(struct memstick_dev *card) 823static void mspro_block_start(struct memstick_dev *card)
823{ 824{
824 struct mspro_block_data *msb = memstick_get_drvdata(card); 825 struct mspro_block_data *msb = memstick_get_drvdata(card);
825 unsigned long flags;
826 826
827 spin_lock_irqsave(&msb->q_lock, flags); 827 blk_mq_start_hw_queues(msb->queue);
828 blk_start_queue(msb->queue);
829 spin_unlock_irqrestore(&msb->q_lock, flags);
830} 828}
831 829
832static void mspro_block_submit_req(struct request_queue *q) 830static blk_status_t mspro_queue_rq(struct blk_mq_hw_ctx *hctx,
831 const struct blk_mq_queue_data *bd)
833{ 832{
834 struct memstick_dev *card = q->queuedata; 833 struct memstick_dev *card = hctx->queue->queuedata;
835 struct mspro_block_data *msb = memstick_get_drvdata(card); 834 struct mspro_block_data *msb = memstick_get_drvdata(card);
836 struct request *req = NULL;
837 835
838 if (msb->has_request) 836 spin_lock_irq(&msb->q_lock);
839 return;
840 837
841 if (msb->eject) { 838 if (msb->block_req) {
842 while ((req = blk_fetch_request(q)) != NULL) 839 spin_unlock_irq(&msb->q_lock);
843 __blk_end_request_all(req, BLK_STS_IOERR); 840 return BLK_STS_DEV_RESOURCE;
841 }
844 842
845 return; 843 if (msb->eject) {
844 spin_unlock_irq(&msb->q_lock);
845 blk_mq_start_request(bd->rq);
846 return BLK_STS_IOERR;
846 } 847 }
847 848
848 msb->has_request = 1; 849 msb->block_req = bd->rq;
849 if (mspro_block_issue_req(card, 0)) 850 blk_mq_start_request(bd->rq);
850 msb->has_request = 0; 851
852 if (mspro_block_issue_req(card, true))
853 msb->block_req = NULL;
854
855 spin_unlock_irq(&msb->q_lock);
856 return BLK_STS_OK;
851} 857}
852 858
853/*** Initialization ***/ 859/*** Initialization ***/
@@ -1167,6 +1173,10 @@ static int mspro_block_init_card(struct memstick_dev *card)
1167 1173
1168} 1174}
1169 1175
1176static const struct blk_mq_ops mspro_mq_ops = {
1177 .queue_rq = mspro_queue_rq,
1178};
1179
1170static int mspro_block_init_disk(struct memstick_dev *card) 1180static int mspro_block_init_disk(struct memstick_dev *card)
1171{ 1181{
1172 struct mspro_block_data *msb = memstick_get_drvdata(card); 1182 struct mspro_block_data *msb = memstick_get_drvdata(card);
@@ -1206,9 +1216,11 @@ static int mspro_block_init_disk(struct memstick_dev *card)
1206 goto out_release_id; 1216 goto out_release_id;
1207 } 1217 }
1208 1218
1209 msb->queue = blk_init_queue(mspro_block_submit_req, &msb->q_lock); 1219 msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &mspro_mq_ops, 2,
1210 if (!msb->queue) { 1220 BLK_MQ_F_SHOULD_MERGE);
1211 rc = -ENOMEM; 1221 if (IS_ERR(msb->queue)) {
1222 rc = PTR_ERR(msb->queue);
1223 msb->queue = NULL;
1212 goto out_put_disk; 1224 goto out_put_disk;
1213 } 1225 }
1214 1226
@@ -1318,13 +1330,14 @@ static void mspro_block_remove(struct memstick_dev *card)
1318 1330
1319 spin_lock_irqsave(&msb->q_lock, flags); 1331 spin_lock_irqsave(&msb->q_lock, flags);
1320 msb->eject = 1; 1332 msb->eject = 1;
1321 blk_start_queue(msb->queue);
1322 spin_unlock_irqrestore(&msb->q_lock, flags); 1333 spin_unlock_irqrestore(&msb->q_lock, flags);
1334 blk_mq_start_hw_queues(msb->queue);
1323 1335
1324 del_gendisk(msb->disk); 1336 del_gendisk(msb->disk);
1325 dev_dbg(&card->dev, "mspro block remove\n"); 1337 dev_dbg(&card->dev, "mspro block remove\n");
1326 1338
1327 blk_cleanup_queue(msb->queue); 1339 blk_cleanup_queue(msb->queue);
1340 blk_mq_free_tag_set(&msb->tag_set);
1328 msb->queue = NULL; 1341 msb->queue = NULL;
1329 1342
1330 sysfs_remove_group(&card->dev.kobj, &msb->attr_group); 1343 sysfs_remove_group(&card->dev.kobj, &msb->attr_group);
@@ -1344,8 +1357,9 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state)
1344 struct mspro_block_data *msb = memstick_get_drvdata(card); 1357 struct mspro_block_data *msb = memstick_get_drvdata(card);
1345 unsigned long flags; 1358 unsigned long flags;
1346 1359
1360 blk_mq_stop_hw_queues(msb->queue);
1361
1347 spin_lock_irqsave(&msb->q_lock, flags); 1362 spin_lock_irqsave(&msb->q_lock, flags);
1348 blk_stop_queue(msb->queue);
1349 msb->active = 0; 1363 msb->active = 0;
1350 spin_unlock_irqrestore(&msb->q_lock, flags); 1364 spin_unlock_irqrestore(&msb->q_lock, flags);
1351 1365
@@ -1355,7 +1369,6 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state)
1355static int mspro_block_resume(struct memstick_dev *card) 1369static int mspro_block_resume(struct memstick_dev *card)
1356{ 1370{
1357 struct mspro_block_data *msb = memstick_get_drvdata(card); 1371 struct mspro_block_data *msb = memstick_get_drvdata(card);
1358 unsigned long flags;
1359 int rc = 0; 1372 int rc = 0;
1360 1373
1361#ifdef CONFIG_MEMSTICK_UNSAFE_RESUME 1374#ifdef CONFIG_MEMSTICK_UNSAFE_RESUME
@@ -1401,9 +1414,7 @@ out_unlock:
1401 1414
1402#endif /* CONFIG_MEMSTICK_UNSAFE_RESUME */ 1415#endif /* CONFIG_MEMSTICK_UNSAFE_RESUME */
1403 1416
1404 spin_lock_irqsave(&msb->q_lock, flags); 1417 blk_mq_start_hw_queues(msb->queue);
1405 blk_start_queue(msb->queue);
1406 spin_unlock_irqrestore(&msb->q_lock, flags);
1407 return rc; 1418 return rc;
1408} 1419}
1409 1420
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 111934838da2..62e7619d5a4d 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -100,7 +100,6 @@ static DEFINE_IDA(mmc_rpmb_ida);
100 * There is one mmc_blk_data per slot. 100 * There is one mmc_blk_data per slot.
101 */ 101 */
102struct mmc_blk_data { 102struct mmc_blk_data {
103 spinlock_t lock;
104 struct device *parent; 103 struct device *parent;
105 struct gendisk *disk; 104 struct gendisk *disk;
106 struct mmc_queue queue; 105 struct mmc_queue queue;
@@ -1488,7 +1487,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
1488 blk_mq_end_request(req, BLK_STS_OK); 1487 blk_mq_end_request(req, BLK_STS_OK);
1489 } 1488 }
1490 1489
1491 spin_lock_irqsave(q->queue_lock, flags); 1490 spin_lock_irqsave(&mq->lock, flags);
1492 1491
1493 mq->in_flight[mmc_issue_type(mq, req)] -= 1; 1492 mq->in_flight[mmc_issue_type(mq, req)] -= 1;
1494 1493
@@ -1496,7 +1495,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
1496 1495
1497 mmc_cqe_check_busy(mq); 1496 mmc_cqe_check_busy(mq);
1498 1497
1499 spin_unlock_irqrestore(q->queue_lock, flags); 1498 spin_unlock_irqrestore(&mq->lock, flags);
1500 1499
1501 if (!mq->cqe_busy) 1500 if (!mq->cqe_busy)
1502 blk_mq_run_hw_queues(q, true); 1501 blk_mq_run_hw_queues(q, true);
@@ -1993,17 +1992,16 @@ static void mmc_blk_mq_poll_completion(struct mmc_queue *mq,
1993 1992
1994static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) 1993static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req)
1995{ 1994{
1996 struct request_queue *q = req->q;
1997 unsigned long flags; 1995 unsigned long flags;
1998 bool put_card; 1996 bool put_card;
1999 1997
2000 spin_lock_irqsave(q->queue_lock, flags); 1998 spin_lock_irqsave(&mq->lock, flags);
2001 1999
2002 mq->in_flight[mmc_issue_type(mq, req)] -= 1; 2000 mq->in_flight[mmc_issue_type(mq, req)] -= 1;
2003 2001
2004 put_card = (mmc_tot_in_flight(mq) == 0); 2002 put_card = (mmc_tot_in_flight(mq) == 0);
2005 2003
2006 spin_unlock_irqrestore(q->queue_lock, flags); 2004 spin_unlock_irqrestore(&mq->lock, flags);
2007 2005
2008 if (put_card) 2006 if (put_card)
2009 mmc_put_card(mq->card, &mq->ctx); 2007 mmc_put_card(mq->card, &mq->ctx);
@@ -2099,11 +2097,11 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
2099 * request does not need to wait (although it does need to 2097 * request does not need to wait (although it does need to
2100 * complete complete_req first). 2098 * complete complete_req first).
2101 */ 2099 */
2102 spin_lock_irqsave(q->queue_lock, flags); 2100 spin_lock_irqsave(&mq->lock, flags);
2103 mq->complete_req = req; 2101 mq->complete_req = req;
2104 mq->rw_wait = false; 2102 mq->rw_wait = false;
2105 waiting = mq->waiting; 2103 waiting = mq->waiting;
2106 spin_unlock_irqrestore(q->queue_lock, flags); 2104 spin_unlock_irqrestore(&mq->lock, flags);
2107 2105
2108 /* 2106 /*
2109 * If 'waiting' then the waiting task will complete this 2107 * If 'waiting' then the waiting task will complete this
@@ -2122,10 +2120,10 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
2122 /* Take the recovery path for errors or urgent background operations */ 2120 /* Take the recovery path for errors or urgent background operations */
2123 if (mmc_blk_rq_error(&mqrq->brq) || 2121 if (mmc_blk_rq_error(&mqrq->brq) ||
2124 mmc_blk_urgent_bkops_needed(mq, mqrq)) { 2122 mmc_blk_urgent_bkops_needed(mq, mqrq)) {
2125 spin_lock_irqsave(q->queue_lock, flags); 2123 spin_lock_irqsave(&mq->lock, flags);
2126 mq->recovery_needed = true; 2124 mq->recovery_needed = true;
2127 mq->recovery_req = req; 2125 mq->recovery_req = req;
2128 spin_unlock_irqrestore(q->queue_lock, flags); 2126 spin_unlock_irqrestore(&mq->lock, flags);
2129 wake_up(&mq->wait); 2127 wake_up(&mq->wait);
2130 schedule_work(&mq->recovery_work); 2128 schedule_work(&mq->recovery_work);
2131 return; 2129 return;
@@ -2141,7 +2139,6 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
2141 2139
2142static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) 2140static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
2143{ 2141{
2144 struct request_queue *q = mq->queue;
2145 unsigned long flags; 2142 unsigned long flags;
2146 bool done; 2143 bool done;
2147 2144
@@ -2149,7 +2146,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
2149 * Wait while there is another request in progress, but not if recovery 2146 * Wait while there is another request in progress, but not if recovery
2150 * is needed. Also indicate whether there is a request waiting to start. 2147 * is needed. Also indicate whether there is a request waiting to start.
2151 */ 2148 */
2152 spin_lock_irqsave(q->queue_lock, flags); 2149 spin_lock_irqsave(&mq->lock, flags);
2153 if (mq->recovery_needed) { 2150 if (mq->recovery_needed) {
2154 *err = -EBUSY; 2151 *err = -EBUSY;
2155 done = true; 2152 done = true;
@@ -2157,7 +2154,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
2157 done = !mq->rw_wait; 2154 done = !mq->rw_wait;
2158 } 2155 }
2159 mq->waiting = !done; 2156 mq->waiting = !done;
2160 spin_unlock_irqrestore(q->queue_lock, flags); 2157 spin_unlock_irqrestore(&mq->lock, flags);
2161 2158
2162 return done; 2159 return done;
2163} 2160}
@@ -2334,12 +2331,11 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
2334 goto err_kfree; 2331 goto err_kfree;
2335 } 2332 }
2336 2333
2337 spin_lock_init(&md->lock);
2338 INIT_LIST_HEAD(&md->part); 2334 INIT_LIST_HEAD(&md->part);
2339 INIT_LIST_HEAD(&md->rpmbs); 2335 INIT_LIST_HEAD(&md->rpmbs);
2340 md->usage = 1; 2336 md->usage = 1;
2341 2337
2342 ret = mmc_init_queue(&md->queue, card, &md->lock, subname); 2338 ret = mmc_init_queue(&md->queue, card);
2343 if (ret) 2339 if (ret)
2344 goto err_putdisk; 2340 goto err_putdisk;
2345 2341
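
The hunks above drop the spinlock from struct mmc_blk_data and stop passing a lock (and subname) into mmc_init_queue(); the locking moves into the queue itself as mq->lock. What mmc_blk_mq_dec_in_flight() keeps doing with the new lock is the usual "decrement the in-flight count under the lock, decide whether this was the last request, then release the card outside the lock" pattern. Below is a minimal userspace C analogue of that pattern, with a pthread mutex standing in for the kernel spinlock; the type and function names are hypothetical, not the driver's.

	/* Userspace sketch of "decrement in flight, put card outside the lock";
	 * pthread_mutex_t stands in for the spinlock now embedded in the queue. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct toy_queue {
		pthread_mutex_t lock;	/* plays the role of mq->lock */
		int in_flight;
	};

	static void toy_put_card(void)
	{
		/* may sleep; must not run under the lock */
		puts("card released");
	}

	static void toy_dec_in_flight(struct toy_queue *q)
	{
		bool put_card;

		pthread_mutex_lock(&q->lock);
		q->in_flight -= 1;
		put_card = (q->in_flight == 0);
		pthread_mutex_unlock(&q->lock);

		if (put_card)		/* act on the decision outside the lock */
			toy_put_card();
	}

	int main(void)
	{
		struct toy_queue q = { .lock = PTHREAD_MUTEX_INITIALIZER, .in_flight = 2 };

		toy_dec_in_flight(&q);	/* one request still outstanding */
		toy_dec_in_flight(&q);	/* last one -> "card released" */
		return 0;
	}
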
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 6edffeed9953..35cc138b096d 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -89,9 +89,9 @@ void mmc_cqe_recovery_notifier(struct mmc_request *mrq)
89 struct mmc_queue *mq = q->queuedata; 89 struct mmc_queue *mq = q->queuedata;
90 unsigned long flags; 90 unsigned long flags;
91 91
92 spin_lock_irqsave(q->queue_lock, flags); 92 spin_lock_irqsave(&mq->lock, flags);
93 __mmc_cqe_recovery_notifier(mq); 93 __mmc_cqe_recovery_notifier(mq);
94 spin_unlock_irqrestore(q->queue_lock, flags); 94 spin_unlock_irqrestore(&mq->lock, flags);
95} 95}
96 96
97static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req) 97static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
@@ -128,14 +128,14 @@ static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
128 unsigned long flags; 128 unsigned long flags;
129 int ret; 129 int ret;
130 130
131 spin_lock_irqsave(q->queue_lock, flags); 131 spin_lock_irqsave(&mq->lock, flags);
132 132
133 if (mq->recovery_needed || !mq->use_cqe) 133 if (mq->recovery_needed || !mq->use_cqe)
134 ret = BLK_EH_RESET_TIMER; 134 ret = BLK_EH_RESET_TIMER;
135 else 135 else
136 ret = mmc_cqe_timed_out(req); 136 ret = mmc_cqe_timed_out(req);
137 137
138 spin_unlock_irqrestore(q->queue_lock, flags); 138 spin_unlock_irqrestore(&mq->lock, flags);
139 139
140 return ret; 140 return ret;
141} 141}
@@ -157,9 +157,9 @@ static void mmc_mq_recovery_handler(struct work_struct *work)
157 157
158 mq->in_recovery = false; 158 mq->in_recovery = false;
159 159
160 spin_lock_irq(q->queue_lock); 160 spin_lock_irq(&mq->lock);
161 mq->recovery_needed = false; 161 mq->recovery_needed = false;
162 spin_unlock_irq(q->queue_lock); 162 spin_unlock_irq(&mq->lock);
163 163
164 mmc_put_card(mq->card, &mq->ctx); 164 mmc_put_card(mq->card, &mq->ctx);
165 165
@@ -258,10 +258,10 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
258 258
259 issue_type = mmc_issue_type(mq, req); 259 issue_type = mmc_issue_type(mq, req);
260 260
261 spin_lock_irq(q->queue_lock); 261 spin_lock_irq(&mq->lock);
262 262
263 if (mq->recovery_needed || mq->busy) { 263 if (mq->recovery_needed || mq->busy) {
264 spin_unlock_irq(q->queue_lock); 264 spin_unlock_irq(&mq->lock);
265 return BLK_STS_RESOURCE; 265 return BLK_STS_RESOURCE;
266 } 266 }
267 267
@@ -269,7 +269,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
269 case MMC_ISSUE_DCMD: 269 case MMC_ISSUE_DCMD:
270 if (mmc_cqe_dcmd_busy(mq)) { 270 if (mmc_cqe_dcmd_busy(mq)) {
271 mq->cqe_busy |= MMC_CQE_DCMD_BUSY; 271 mq->cqe_busy |= MMC_CQE_DCMD_BUSY;
272 spin_unlock_irq(q->queue_lock); 272 spin_unlock_irq(&mq->lock);
273 return BLK_STS_RESOURCE; 273 return BLK_STS_RESOURCE;
274 } 274 }
275 break; 275 break;
@@ -294,7 +294,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
294 get_card = (mmc_tot_in_flight(mq) == 1); 294 get_card = (mmc_tot_in_flight(mq) == 1);
295 cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1); 295 cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1);
296 296
297 spin_unlock_irq(q->queue_lock); 297 spin_unlock_irq(&mq->lock);
298 298
299 if (!(req->rq_flags & RQF_DONTPREP)) { 299 if (!(req->rq_flags & RQF_DONTPREP)) {
300 req_to_mmc_queue_req(req)->retries = 0; 300 req_to_mmc_queue_req(req)->retries = 0;
@@ -328,12 +328,12 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
328 if (issued != MMC_REQ_STARTED) { 328 if (issued != MMC_REQ_STARTED) {
329 bool put_card = false; 329 bool put_card = false;
330 330
331 spin_lock_irq(q->queue_lock); 331 spin_lock_irq(&mq->lock);
332 mq->in_flight[issue_type] -= 1; 332 mq->in_flight[issue_type] -= 1;
333 if (mmc_tot_in_flight(mq) == 0) 333 if (mmc_tot_in_flight(mq) == 0)
334 put_card = true; 334 put_card = true;
335 mq->busy = false; 335 mq->busy = false;
336 spin_unlock_irq(q->queue_lock); 336 spin_unlock_irq(&mq->lock);
337 if (put_card) 337 if (put_card)
338 mmc_put_card(card, &mq->ctx); 338 mmc_put_card(card, &mq->ctx);
339 } else { 339 } else {
@@ -378,14 +378,37 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
378 init_waitqueue_head(&mq->wait); 378 init_waitqueue_head(&mq->wait);
379} 379}
380 380
381static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth, 381/* Set queue depth to get a reasonable value for q->nr_requests */
382 const struct blk_mq_ops *mq_ops, spinlock_t *lock) 382#define MMC_QUEUE_DEPTH 64
383
384/**
385 * mmc_init_queue - initialise a queue structure.
386 * @mq: mmc queue
387 * @card: mmc card to attach this queue
388 *
389 * Initialise a MMC card request queue.
390 */
391int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
383{ 392{
393 struct mmc_host *host = card->host;
384 int ret; 394 int ret;
385 395
396 mq->card = card;
397 mq->use_cqe = host->cqe_enabled;
398
399 spin_lock_init(&mq->lock);
400
386 memset(&mq->tag_set, 0, sizeof(mq->tag_set)); 401 memset(&mq->tag_set, 0, sizeof(mq->tag_set));
387 mq->tag_set.ops = mq_ops; 402 mq->tag_set.ops = &mmc_mq_ops;
388 mq->tag_set.queue_depth = q_depth; 403 /*
404 * The queue depth for CQE must match the hardware because the request
405 * tag is used to index the hardware queue.
406 */
407 if (mq->use_cqe)
408 mq->tag_set.queue_depth =
409 min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth);
410 else
411 mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
389 mq->tag_set.numa_node = NUMA_NO_NODE; 412 mq->tag_set.numa_node = NUMA_NO_NODE;
390 mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE | 413 mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
391 BLK_MQ_F_BLOCKING; 414 BLK_MQ_F_BLOCKING;
@@ -403,68 +426,17 @@ static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth,
403 goto free_tag_set; 426 goto free_tag_set;
404 } 427 }
405 428
406 mq->queue->queue_lock = lock;
407 mq->queue->queuedata = mq; 429 mq->queue->queuedata = mq;
430 blk_queue_rq_timeout(mq->queue, 60 * HZ);
408 431
432 mmc_setup_queue(mq, card);
409 return 0; 433 return 0;
410 434
411free_tag_set: 435free_tag_set:
412 blk_mq_free_tag_set(&mq->tag_set); 436 blk_mq_free_tag_set(&mq->tag_set);
413
414 return ret; 437 return ret;
415} 438}
416 439
417/* Set queue depth to get a reasonable value for q->nr_requests */
418#define MMC_QUEUE_DEPTH 64
419
420static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
421 spinlock_t *lock)
422{
423 struct mmc_host *host = card->host;
424 int q_depth;
425 int ret;
426
427 /*
428 * The queue depth for CQE must match the hardware because the request
429 * tag is used to index the hardware queue.
430 */
431 if (mq->use_cqe)
432 q_depth = min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth);
433 else
434 q_depth = MMC_QUEUE_DEPTH;
435
436 ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);
437 if (ret)
438 return ret;
439
440 blk_queue_rq_timeout(mq->queue, 60 * HZ);
441
442 mmc_setup_queue(mq, card);
443
444 return 0;
445}
446
447/**
448 * mmc_init_queue - initialise a queue structure.
449 * @mq: mmc queue
450 * @card: mmc card to attach this queue
451 * @lock: queue lock
452 * @subname: partition subname
453 *
454 * Initialise a MMC card request queue.
455 */
456int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
457 spinlock_t *lock, const char *subname)
458{
459 struct mmc_host *host = card->host;
460
461 mq->card = card;
462
463 mq->use_cqe = host->cqe_enabled;
464
465 return mmc_mq_init(mq, card, lock);
466}
467
468void mmc_queue_suspend(struct mmc_queue *mq) 440void mmc_queue_suspend(struct mmc_queue *mq)
469{ 441{
470 blk_mq_quiesce_queue(mq->queue); 442 blk_mq_quiesce_queue(mq->queue);
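
The rewritten mmc_init_queue() above folds the old mmc_mq_init()/mmc_mq_init_queue() pair into one function and picks the tag-set depth inline: with CQE the depth must match the hardware (the request tag indexes the hardware queue), otherwise the fixed MMC_QUEUE_DEPTH of 64 is used. A standalone sketch of that selection, with the constants taken from the hunk:

	/* Sketch of the queue-depth selection done in the new mmc_init_queue(). */
	#include <stdbool.h>
	#include <stdio.h>

	#define MMC_QUEUE_DEPTH 64

	static int pick_queue_depth(bool use_cqe, int cmdq_depth, int cqe_qdepth)
	{
		if (use_cqe)	/* must not exceed what card or host CQE supports */
			return cmdq_depth < cqe_qdepth ? cmdq_depth : cqe_qdepth;
		return MMC_QUEUE_DEPTH;
	}

	int main(void)
	{
		printf("cqe, card 32, host 16 -> %d\n", pick_queue_depth(true, 32, 16));
		printf("no cqe            -> %d\n", pick_queue_depth(false, 32, 16));
		return 0;
	}
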
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 9bf3c9245075..fd11491ced9f 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -77,6 +77,7 @@ struct mmc_queue {
77 struct blk_mq_tag_set tag_set; 77 struct blk_mq_tag_set tag_set;
78 struct mmc_blk_data *blkdata; 78 struct mmc_blk_data *blkdata;
79 struct request_queue *queue; 79 struct request_queue *queue;
80 spinlock_t lock;
80 int in_flight[MMC_ISSUE_MAX]; 81 int in_flight[MMC_ISSUE_MAX];
81 unsigned int cqe_busy; 82 unsigned int cqe_busy;
82#define MMC_CQE_DCMD_BUSY BIT(0) 83#define MMC_CQE_DCMD_BUSY BIT(0)
@@ -95,8 +96,7 @@ struct mmc_queue {
95 struct work_struct complete_work; 96 struct work_struct complete_work;
96}; 97};
97 98
98extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, 99extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *);
99 const char *);
100extern void mmc_cleanup_queue(struct mmc_queue *); 100extern void mmc_cleanup_queue(struct mmc_queue *);
101extern void mmc_queue_suspend(struct mmc_queue *); 101extern void mmc_queue_suspend(struct mmc_queue *);
102extern void mmc_queue_resume(struct mmc_queue *); 102extern void mmc_queue_resume(struct mmc_queue *);
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 59dd50866932..5477a014e1fb 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1322,7 +1322,7 @@ static int ath6kl_cfg80211_set_default_key(struct wiphy *wiphy,
1322 struct ath6kl_vif *vif = netdev_priv(ndev); 1322 struct ath6kl_vif *vif = netdev_priv(ndev);
1323 struct ath6kl_key *key = NULL; 1323 struct ath6kl_key *key = NULL;
1324 u8 key_usage; 1324 u8 key_usage;
1325 enum crypto_type key_type = NONE_CRYPT; 1325 enum ath6kl_crypto_type key_type = NONE_CRYPT;
1326 1326
1327 ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: index %d\n", __func__, key_index); 1327 ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: index %d\n", __func__, key_index);
1328 1328
diff --git a/drivers/net/wireless/ath/ath6kl/common.h b/drivers/net/wireless/ath/ath6kl/common.h
index 4f82e8632d37..d6e5234f67a1 100644
--- a/drivers/net/wireless/ath/ath6kl/common.h
+++ b/drivers/net/wireless/ath/ath6kl/common.h
@@ -67,7 +67,7 @@ struct ath6kl_llc_snap_hdr {
67 __be16 eth_type; 67 __be16 eth_type;
68} __packed; 68} __packed;
69 69
70enum crypto_type { 70enum ath6kl_crypto_type {
71 NONE_CRYPT = 0x01, 71 NONE_CRYPT = 0x01,
72 WEP_CRYPT = 0x02, 72 WEP_CRYPT = 0x02,
73 TKIP_CRYPT = 0x04, 73 TKIP_CRYPT = 0x04,
diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c
index 777acc564ac9..9d7ac1ab2d02 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.c
+++ b/drivers/net/wireless/ath/ath6kl/wmi.c
@@ -1849,9 +1849,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx,
1849 enum network_type nw_type, 1849 enum network_type nw_type,
1850 enum dot11_auth_mode dot11_auth_mode, 1850 enum dot11_auth_mode dot11_auth_mode,
1851 enum auth_mode auth_mode, 1851 enum auth_mode auth_mode,
1852 enum crypto_type pairwise_crypto, 1852 enum ath6kl_crypto_type pairwise_crypto,
1853 u8 pairwise_crypto_len, 1853 u8 pairwise_crypto_len,
1854 enum crypto_type group_crypto, 1854 enum ath6kl_crypto_type group_crypto,
1855 u8 group_crypto_len, int ssid_len, u8 *ssid, 1855 u8 group_crypto_len, int ssid_len, u8 *ssid,
1856 u8 *bssid, u16 channel, u32 ctrl_flags, 1856 u8 *bssid, u16 channel, u32 ctrl_flags,
1857 u8 nw_subtype) 1857 u8 nw_subtype)
@@ -2301,7 +2301,7 @@ int ath6kl_wmi_disctimeout_cmd(struct wmi *wmi, u8 if_idx, u8 timeout)
2301} 2301}
2302 2302
2303int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index, 2303int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index,
2304 enum crypto_type key_type, 2304 enum ath6kl_crypto_type key_type,
2305 u8 key_usage, u8 key_len, 2305 u8 key_usage, u8 key_len,
2306 u8 *key_rsc, unsigned int key_rsc_len, 2306 u8 *key_rsc, unsigned int key_rsc_len,
2307 u8 *key_material, 2307 u8 *key_material,
diff --git a/drivers/net/wireless/ath/ath6kl/wmi.h b/drivers/net/wireless/ath/ath6kl/wmi.h
index a60bb49fe920..784940ba4c90 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.h
+++ b/drivers/net/wireless/ath/ath6kl/wmi.h
@@ -2556,9 +2556,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx,
2556 enum network_type nw_type, 2556 enum network_type nw_type,
2557 enum dot11_auth_mode dot11_auth_mode, 2557 enum dot11_auth_mode dot11_auth_mode,
2558 enum auth_mode auth_mode, 2558 enum auth_mode auth_mode,
2559 enum crypto_type pairwise_crypto, 2559 enum ath6kl_crypto_type pairwise_crypto,
2560 u8 pairwise_crypto_len, 2560 u8 pairwise_crypto_len,
2561 enum crypto_type group_crypto, 2561 enum ath6kl_crypto_type group_crypto,
2562 u8 group_crypto_len, int ssid_len, u8 *ssid, 2562 u8 group_crypto_len, int ssid_len, u8 *ssid,
2563 u8 *bssid, u16 channel, u32 ctrl_flags, 2563 u8 *bssid, u16 channel, u32 ctrl_flags,
2564 u8 nw_subtype); 2564 u8 nw_subtype);
@@ -2610,7 +2610,7 @@ int ath6kl_wmi_config_debug_module_cmd(struct wmi *wmi, u32 valid, u32 config);
2610 2610
2611int ath6kl_wmi_get_stats_cmd(struct wmi *wmi, u8 if_idx); 2611int ath6kl_wmi_get_stats_cmd(struct wmi *wmi, u8 if_idx);
2612int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index, 2612int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index,
2613 enum crypto_type key_type, 2613 enum ath6kl_crypto_type key_type,
2614 u8 key_usage, u8 key_len, 2614 u8 key_usage, u8 key_len,
2615 u8 *key_rsc, unsigned int key_rsc_len, 2615 u8 *key_rsc, unsigned int key_rsc_len,
2616 u8 *key_material, 2616 u8 *key_material,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 0e39e3d1846f..f7019294740c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -393,7 +393,7 @@ static int pmem_attach_disk(struct device *dev,
393 return -EBUSY; 393 return -EBUSY;
394 } 394 }
395 395
396 q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL); 396 q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
397 if (!q) 397 if (!q)
398 return -ENOMEM; 398 return -ENOMEM;
399 399
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 88a8b5916624..0f345e207675 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -57,3 +57,18 @@ config NVME_FC
57 from https://github.com/linux-nvme/nvme-cli. 57 from https://github.com/linux-nvme/nvme-cli.
58 58
59 If unsure, say N. 59 If unsure, say N.
60
61config NVME_TCP
62 tristate "NVM Express over Fabrics TCP host driver"
63 depends on INET
64 depends on BLK_DEV_NVME
65 select NVME_FABRICS
66 help
67 This provides support for the NVMe over Fabrics protocol using
68 the TCP transport. This allows you to use remote block devices
69 exported using the NVMe protocol set.
70
71 To configure a NVMe over Fabrics controller use the nvme-cli tool
72 from https://github.com/linux-nvme/nvme-cli.
73
74 If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index aea459c65ae1..8a4b671c5f0c 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
7obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o 7obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
8obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o 8obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
9obj-$(CONFIG_NVME_FC) += nvme-fc.o 9obj-$(CONFIG_NVME_FC) += nvme-fc.o
10obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
10 11
11nvme-core-y := core.o 12nvme-core-y := core.o
12nvme-core-$(CONFIG_TRACING) += trace.o 13nvme-core-$(CONFIG_TRACING) += trace.o
@@ -21,3 +22,5 @@ nvme-fabrics-y += fabrics.o
21nvme-rdma-y += rdma.o 22nvme-rdma-y += rdma.o
22 23
23nvme-fc-y += fc.o 24nvme-fc-y += fc.o
25
26nvme-tcp-y += tcp.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 962012135b62..08f2c92602f4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -97,7 +97,6 @@ static dev_t nvme_chr_devt;
97static struct class *nvme_class; 97static struct class *nvme_class;
98static struct class *nvme_subsys_class; 98static struct class *nvme_subsys_class;
99 99
100static void nvme_ns_remove(struct nvme_ns *ns);
101static int nvme_revalidate_disk(struct gendisk *disk); 100static int nvme_revalidate_disk(struct gendisk *disk);
102static void nvme_put_subsystem(struct nvme_subsystem *subsys); 101static void nvme_put_subsystem(struct nvme_subsystem *subsys);
103static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 102static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -245,12 +244,31 @@ static inline bool nvme_req_needs_retry(struct request *req)
245 return true; 244 return true;
246} 245}
247 246
247static void nvme_retry_req(struct request *req)
248{
249 struct nvme_ns *ns = req->q->queuedata;
250 unsigned long delay = 0;
251 u16 crd;
252
253 /* The mask and shift result must be <= 3 */
254 crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
255 if (ns && crd)
256 delay = ns->ctrl->crdt[crd - 1] * 100;
257
258 nvme_req(req)->retries++;
259 blk_mq_requeue_request(req, false);
260 blk_mq_delay_kick_requeue_list(req->q, delay);
261}
262
248void nvme_complete_rq(struct request *req) 263void nvme_complete_rq(struct request *req)
249{ 264{
250 blk_status_t status = nvme_error_status(req); 265 blk_status_t status = nvme_error_status(req);
251 266
252 trace_nvme_complete_rq(req); 267 trace_nvme_complete_rq(req);
253 268
269 if (nvme_req(req)->ctrl->kas)
270 nvme_req(req)->ctrl->comp_seen = true;
271
254 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { 272 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
255 if ((req->cmd_flags & REQ_NVME_MPATH) && 273 if ((req->cmd_flags & REQ_NVME_MPATH) &&
256 blk_path_error(status)) { 274 blk_path_error(status)) {
@@ -259,8 +277,7 @@ void nvme_complete_rq(struct request *req)
259 } 277 }
260 278
261 if (!blk_queue_dying(req->q)) { 279 if (!blk_queue_dying(req->q)) {
262 nvme_req(req)->retries++; 280 nvme_retry_req(req);
263 blk_mq_requeue_request(req, true);
264 return; 281 return;
265 } 282 }
266 } 283 }
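
The new nvme_retry_req() above derives a requeue delay from the Command Retry Delay (CRD) field of the completion status: the two CRD bits select one of the controller's crdt[] values, which is scaled to milliseconds before the requeue list is kicked. A standalone sketch of that computation follows; the 0x1800 mask is my reading of the "& NVME_SC_CRD) >> 11" in the hunk (bits 12:11 of the status word) and the *100 scaling is taken from the hunk, so treat both as illustrative rather than authoritative.

	/* Sketch of the CRD -> requeue delay computation from nvme_retry_req().
	 * TOY_SC_CRD is a stand-in for NVME_SC_CRD, inferred from the ">> 11"
	 * shift and the "result must be <= 3" comment above. */
	#include <stdio.h>

	#define TOY_SC_CRD 0x1800	/* assumption: bits 12:11 of the status */

	static unsigned long retry_delay_ms(unsigned short status,
					    const unsigned short crdt[3])
	{
		unsigned short crd = (status & TOY_SC_CRD) >> 11;

		if (!crd)
			return 0;		/* retry immediately */
		return crdt[crd - 1] * 100UL;	/* CRDT appears to be in 100 ms units */
	}

	int main(void)
	{
		const unsigned short crdt[3] = { 1, 5, 20 };

		/* status with CRD = 2 selects crdt[1] = 5 -> 500 ms */
		printf("delay = %lu ms\n", retry_delay_ms(2 << 11, crdt));
		return 0;
	}
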
@@ -268,14 +285,14 @@ void nvme_complete_rq(struct request *req)
268} 285}
269EXPORT_SYMBOL_GPL(nvme_complete_rq); 286EXPORT_SYMBOL_GPL(nvme_complete_rq);
270 287
271void nvme_cancel_request(struct request *req, void *data, bool reserved) 288bool nvme_cancel_request(struct request *req, void *data, bool reserved)
272{ 289{
273 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, 290 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
274 "Cancelling I/O %d", req->tag); 291 "Cancelling I/O %d", req->tag);
275 292
276 nvme_req(req)->status = NVME_SC_ABORT_REQ; 293 nvme_req(req)->status = NVME_SC_ABORT_REQ;
277 blk_mq_complete_request(req); 294 blk_mq_complete_request(req);
278 295 return true;
279} 296}
280EXPORT_SYMBOL_GPL(nvme_cancel_request); 297EXPORT_SYMBOL_GPL(nvme_cancel_request);
281 298
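
nvme_cancel_request() now returns bool; in this series the tagset busy iterators take callbacks whose return value controls whether iteration continues, and returning true here keeps the walk going. A minimal userspace sketch of that callback contract (the iterator and request types below are hypothetical):

	/* Sketch of a busy iterator driven by a bool-returning callback, mirroring
	 * the new nvme_cancel_request() return contract. Types are hypothetical. */
	#include <stdbool.h>
	#include <stdio.h>

	struct toy_req { int tag; bool busy; };

	typedef bool (*busy_iter_fn)(struct toy_req *req, void *data);

	static void busy_iter(struct toy_req *reqs, int n, busy_iter_fn fn, void *data)
	{
		for (int i = 0; i < n; i++) {
			if (!reqs[i].busy)
				continue;
			if (!fn(&reqs[i], data))	/* false -> stop iterating */
				break;
		}
	}

	static bool cancel_request(struct toy_req *req, void *data)
	{
		printf("Cancelling I/O %d\n", req->tag);
		req->busy = false;
		return true;		/* keep going, like nvme_cancel_request() */
	}

	int main(void)
	{
		struct toy_req reqs[] = { {0, true}, {1, false}, {2, true} };

		busy_iter(reqs, 3, cancel_request, NULL);
		return 0;
	}
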
@@ -536,7 +553,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
536static inline void nvme_setup_flush(struct nvme_ns *ns, 553static inline void nvme_setup_flush(struct nvme_ns *ns,
537 struct nvme_command *cmnd) 554 struct nvme_command *cmnd)
538{ 555{
539 memset(cmnd, 0, sizeof(*cmnd));
540 cmnd->common.opcode = nvme_cmd_flush; 556 cmnd->common.opcode = nvme_cmd_flush;
541 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); 557 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
542} 558}
@@ -548,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
548 struct nvme_dsm_range *range; 564 struct nvme_dsm_range *range;
549 struct bio *bio; 565 struct bio *bio;
550 566
551 range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); 567 range = kmalloc_array(segments, sizeof(*range),
552 if (!range) 568 GFP_ATOMIC | __GFP_NOWARN);
553 return BLK_STS_RESOURCE; 569 if (!range) {
570 /*
571 * If we fail allocation our range, fallback to the controller
572 * discard page. If that's also busy, it's safe to return
573 * busy, as we know we can make progress once that's freed.
574 */
575 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
576 return BLK_STS_RESOURCE;
577
578 range = page_address(ns->ctrl->discard_page);
579 }
554 580
555 __rq_for_each_bio(bio, req) { 581 __rq_for_each_bio(bio, req) {
556 u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); 582 u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -565,11 +591,13 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
565 } 591 }
566 592
567 if (WARN_ON_ONCE(n != segments)) { 593 if (WARN_ON_ONCE(n != segments)) {
568 kfree(range); 594 if (virt_to_page(range) == ns->ctrl->discard_page)
595 clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
596 else
597 kfree(range);
569 return BLK_STS_IOERR; 598 return BLK_STS_IOERR;
570 } 599 }
571 600
572 memset(cmnd, 0, sizeof(*cmnd));
573 cmnd->dsm.opcode = nvme_cmd_dsm; 601 cmnd->dsm.opcode = nvme_cmd_dsm;
574 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); 602 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
575 cmnd->dsm.nr = cpu_to_le32(segments - 1); 603 cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -598,7 +626,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
598 if (req->cmd_flags & REQ_RAHEAD) 626 if (req->cmd_flags & REQ_RAHEAD)
599 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; 627 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
600 628
601 memset(cmnd, 0, sizeof(*cmnd));
602 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); 629 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
603 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); 630 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
604 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); 631 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
@@ -650,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req)
650 blk_rq_bytes(req) >> ns->lba_shift); 677 blk_rq_bytes(req) >> ns->lba_shift);
651 } 678 }
652 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { 679 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
653 kfree(page_address(req->special_vec.bv_page) + 680 struct nvme_ns *ns = req->rq_disk->private_data;
654 req->special_vec.bv_offset); 681 struct page *page = req->special_vec.bv_page;
682
683 if (page == ns->ctrl->discard_page)
684 clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
685 else
686 kfree(page_address(page) + req->special_vec.bv_offset);
655 } 687 }
656} 688}
657EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); 689EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
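
The discard hunks above add an out-of-memory fallback: if kmalloc_array() fails, the request borrows a per-controller preallocated page, guarded by test_and_set_bit_lock() so only one request can use it at a time, and nvme_cleanup_cmd() releases whichever buffer was used. A userspace C11 sketch of that "heap first, single preallocated emergency buffer second" shape, with an atomic flag in place of the kernel bit lock (all names invented for illustration):

	/* Userspace analogue of the discard-page fallback: try the heap, fall back
	 * to one preallocated buffer guarded by an atomic flag, release whichever
	 * buffer was used. atomic_flag stands in for test_and_set_bit_lock(). */
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	static char emergency_buf[4096];
	static atomic_flag emergency_busy = ATOMIC_FLAG_INIT;

	static void *get_range_buf(size_t len)
	{
		void *buf = malloc(len);

		if (buf)
			return buf;
		if (atomic_flag_test_and_set(&emergency_busy))
			return NULL;	/* both busy: caller can safely report busy */
		return emergency_buf;
	}

	static void put_range_buf(void *buf)
	{
		if (buf == emergency_buf)
			atomic_flag_clear(&emergency_busy);
		else
			free(buf);
	}

	int main(void)
	{
		void *buf = get_range_buf(256);

		if (!buf)
			return 1;
		puts(buf == emergency_buf ? "used emergency buffer" : "used heap");
		put_range_buf(buf);
		return 0;
	}
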
@@ -663,6 +695,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
663 695
664 nvme_clear_nvme_request(req); 696 nvme_clear_nvme_request(req);
665 697
698 memset(cmd, 0, sizeof(*cmd));
666 switch (req_op(req)) { 699 switch (req_op(req)) {
667 case REQ_OP_DRV_IN: 700 case REQ_OP_DRV_IN:
668 case REQ_OP_DRV_OUT: 701 case REQ_OP_DRV_OUT:
@@ -691,6 +724,31 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
691} 724}
692EXPORT_SYMBOL_GPL(nvme_setup_cmd); 725EXPORT_SYMBOL_GPL(nvme_setup_cmd);
693 726
727static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
728{
729 struct completion *waiting = rq->end_io_data;
730
731 rq->end_io_data = NULL;
732 complete(waiting);
733}
734
735static void nvme_execute_rq_polled(struct request_queue *q,
736 struct gendisk *bd_disk, struct request *rq, int at_head)
737{
738 DECLARE_COMPLETION_ONSTACK(wait);
739
740 WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
741
742 rq->cmd_flags |= REQ_HIPRI;
743 rq->end_io_data = &wait;
744 blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
745
746 while (!completion_done(&wait)) {
747 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
748 cond_resched();
749 }
750}
751
694/* 752/*
695 * Returns 0 on success. If the result is negative, it's a Linux error code; 753 * Returns 0 on success. If the result is negative, it's a Linux error code;
696 * if the result is positive, it's an NVM Express status code 754 * if the result is positive, it's an NVM Express status code
@@ -698,7 +756,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
698int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 756int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
699 union nvme_result *result, void *buffer, unsigned bufflen, 757 union nvme_result *result, void *buffer, unsigned bufflen,
700 unsigned timeout, int qid, int at_head, 758 unsigned timeout, int qid, int at_head,
701 blk_mq_req_flags_t flags) 759 blk_mq_req_flags_t flags, bool poll)
702{ 760{
703 struct request *req; 761 struct request *req;
704 int ret; 762 int ret;
@@ -715,7 +773,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
715 goto out; 773 goto out;
716 } 774 }
717 775
718 blk_execute_rq(req->q, NULL, req, at_head); 776 if (poll)
777 nvme_execute_rq_polled(req->q, NULL, req, at_head);
778 else
779 blk_execute_rq(req->q, NULL, req, at_head);
719 if (result) 780 if (result)
720 *result = nvme_req(req)->result; 781 *result = nvme_req(req)->result;
721 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 782 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -732,7 +793,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
732 void *buffer, unsigned bufflen) 793 void *buffer, unsigned bufflen)
733{ 794{
734 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, 795 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
735 NVME_QID_ANY, 0, 0); 796 NVME_QID_ANY, 0, 0, false);
736} 797}
737EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); 798EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
738 799
@@ -843,6 +904,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
843 return; 904 return;
844 } 905 }
845 906
907 ctrl->comp_seen = false;
846 spin_lock_irqsave(&ctrl->lock, flags); 908 spin_lock_irqsave(&ctrl->lock, flags);
847 if (ctrl->state == NVME_CTRL_LIVE || 909 if (ctrl->state == NVME_CTRL_LIVE ||
848 ctrl->state == NVME_CTRL_CONNECTING) 910 ctrl->state == NVME_CTRL_CONNECTING)
@@ -873,6 +935,15 @@ static void nvme_keep_alive_work(struct work_struct *work)
873{ 935{
874 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), 936 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
875 struct nvme_ctrl, ka_work); 937 struct nvme_ctrl, ka_work);
938 bool comp_seen = ctrl->comp_seen;
939
940 if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
941 dev_dbg(ctrl->device,
942 "reschedule traffic based keep-alive timer\n");
943 ctrl->comp_seen = false;
944 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
945 return;
946 }
876 947
877 if (nvme_keep_alive(ctrl)) { 948 if (nvme_keep_alive(ctrl)) {
878 /* allocation failure, reset the controller */ 949 /* allocation failure, reset the controller */
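
The keep-alive hunks above implement traffic-based keep-alive: completions observed since the last timer tick set ctrl->comp_seen, and when the controller advertises the attribute (ctratt & NVME_CTRL_ATTR_TBKAS), the worker skips sending a keep-alive command and simply rearms the timer. A small C sketch of that decision; the field names below are placeholders:

	/* Sketch of the traffic-based keep-alive decision from the hunks above:
	 * if I/O completions were seen in the last interval, skip the explicit
	 * keep-alive command and just start a fresh interval. */
	#include <stdbool.h>
	#include <stdio.h>

	struct toy_ctrl {
		bool tbkas_supported;	/* controller advertises traffic-based KA */
		bool comp_seen;		/* set by the completion path */
	};

	/* Returns true if an explicit keep-alive command must be sent. */
	static bool keep_alive_tick(struct toy_ctrl *ctrl)
	{
		if (ctrl->tbkas_supported && ctrl->comp_seen) {
			ctrl->comp_seen = false;	/* start a fresh interval */
			return false;	/* recent traffic already proved liveness */
		}
		return true;
	}

	int main(void)
	{
		struct toy_ctrl ctrl = { .tbkas_supported = true, .comp_seen = true };

		printf("send keep-alive? %s\n", keep_alive_tick(&ctrl) ? "yes" : "no");
		printf("send keep-alive? %s\n", keep_alive_tick(&ctrl) ? "yes" : "no");
		return 0;
	}
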
@@ -1041,7 +1112,7 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword
1041 c.features.dword11 = cpu_to_le32(dword11); 1112 c.features.dword11 = cpu_to_le32(dword11);
1042 1113
1043 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, 1114 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1044 buffer, buflen, 0, NVME_QID_ANY, 0, 0); 1115 buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
1045 if (ret >= 0 && result) 1116 if (ret >= 0 && result)
1046 *result = le32_to_cpu(res.u32); 1117 *result = le32_to_cpu(res.u32);
1047 return ret; 1118 return ret;
@@ -1240,12 +1311,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1240 c.common.nsid = cpu_to_le32(cmd.nsid); 1311 c.common.nsid = cpu_to_le32(cmd.nsid);
1241 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 1312 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1242 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 1313 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1243 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); 1314 c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1244 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); 1315 c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1245 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); 1316 c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1246 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); 1317 c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1247 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); 1318 c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1248 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); 1319 c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1249 1320
1250 if (cmd.timeout_ms) 1321 if (cmd.timeout_ms)
1251 timeout = msecs_to_jiffies(cmd.timeout_ms); 1322 timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -1524,8 +1595,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1524 if (ns->noiob) 1595 if (ns->noiob)
1525 nvme_set_chunk_size(ns); 1596 nvme_set_chunk_size(ns);
1526 nvme_update_disk_info(disk, ns, id); 1597 nvme_update_disk_info(disk, ns, id);
1527 if (ns->ndev)
1528 nvme_nvm_update_nvm_info(ns);
1529#ifdef CONFIG_NVME_MULTIPATH 1598#ifdef CONFIG_NVME_MULTIPATH
1530 if (ns->head->disk) { 1599 if (ns->head->disk) {
1531 nvme_update_disk_info(ns->head->disk, ns, id); 1600 nvme_update_disk_info(ns->head->disk, ns, id);
@@ -1608,7 +1677,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1608 memset(&c, 0, sizeof(c)); 1677 memset(&c, 0, sizeof(c));
1609 c.common.opcode = op; 1678 c.common.opcode = op;
1610 c.common.nsid = cpu_to_le32(ns->head->ns_id); 1679 c.common.nsid = cpu_to_le32(ns->head->ns_id);
1611 c.common.cdw10[0] = cpu_to_le32(cdw10); 1680 c.common.cdw10 = cpu_to_le32(cdw10);
1612 1681
1613 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); 1682 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1614 nvme_put_ns_from_disk(head, srcu_idx); 1683 nvme_put_ns_from_disk(head, srcu_idx);
@@ -1682,11 +1751,11 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
1682 else 1751 else
1683 cmd.common.opcode = nvme_admin_security_recv; 1752 cmd.common.opcode = nvme_admin_security_recv;
1684 cmd.common.nsid = 0; 1753 cmd.common.nsid = 0;
1685 cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); 1754 cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1686 cmd.common.cdw10[1] = cpu_to_le32(len); 1755 cmd.common.cdw11 = cpu_to_le32(len);
1687 1756
1688 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 1757 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1689 ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0); 1758 ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
1690} 1759}
1691EXPORT_SYMBOL_GPL(nvme_sec_submit); 1760EXPORT_SYMBOL_GPL(nvme_sec_submit);
1692#endif /* CONFIG_BLK_SED_OPAL */ 1761#endif /* CONFIG_BLK_SED_OPAL */
@@ -1881,6 +1950,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
1881 return ret; 1950 return ret;
1882} 1951}
1883 1952
1953static int nvme_configure_acre(struct nvme_ctrl *ctrl)
1954{
1955 struct nvme_feat_host_behavior *host;
1956 int ret;
1957
1958 /* Don't bother enabling the feature if retry delay is not reported */
1959 if (!ctrl->crdt[0])
1960 return 0;
1961
1962 host = kzalloc(sizeof(*host), GFP_KERNEL);
1963 if (!host)
1964 return 0;
1965
1966 host->acre = NVME_ENABLE_ACRE;
1967 ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
1968 host, sizeof(*host), NULL);
1969 kfree(host);
1970 return ret;
1971}
1972
1884static int nvme_configure_apst(struct nvme_ctrl *ctrl) 1973static int nvme_configure_apst(struct nvme_ctrl *ctrl)
1885{ 1974{
1886 /* 1975 /*
@@ -2402,6 +2491,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2402 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; 2491 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
2403 } 2492 }
2404 2493
2494 ctrl->crdt[0] = le16_to_cpu(id->crdt1);
2495 ctrl->crdt[1] = le16_to_cpu(id->crdt2);
2496 ctrl->crdt[2] = le16_to_cpu(id->crdt3);
2497
2405 ctrl->oacs = le16_to_cpu(id->oacs); 2498 ctrl->oacs = le16_to_cpu(id->oacs);
2406 ctrl->oncs = le16_to_cpup(&id->oncs); 2499 ctrl->oncs = le16_to_cpup(&id->oncs);
2407 ctrl->oaes = le32_to_cpu(id->oaes); 2500 ctrl->oaes = le32_to_cpu(id->oaes);
@@ -2419,6 +2512,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2419 ctrl->sgls = le32_to_cpu(id->sgls); 2512 ctrl->sgls = le32_to_cpu(id->sgls);
2420 ctrl->kas = le16_to_cpu(id->kas); 2513 ctrl->kas = le16_to_cpu(id->kas);
2421 ctrl->max_namespaces = le32_to_cpu(id->mnan); 2514 ctrl->max_namespaces = le32_to_cpu(id->mnan);
2515 ctrl->ctratt = le32_to_cpu(id->ctratt);
2422 2516
2423 if (id->rtd3e) { 2517 if (id->rtd3e) {
2424 /* us -> s */ 2518 /* us -> s */
@@ -2501,6 +2595,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2501 if (ret < 0) 2595 if (ret < 0)
2502 return ret; 2596 return ret;
2503 2597
2598 ret = nvme_configure_acre(ctrl);
2599 if (ret < 0)
2600 return ret;
2601
2504 ctrl->identified = true; 2602 ctrl->identified = true;
2505 2603
2506 return 0; 2604 return 0;
@@ -2776,6 +2874,7 @@ static ssize_t field##_show(struct device *dev, \
2776static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); 2874static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2777 2875
2778nvme_show_int_function(cntlid); 2876nvme_show_int_function(cntlid);
2877nvme_show_int_function(numa_node);
2779 2878
2780static ssize_t nvme_sysfs_delete(struct device *dev, 2879static ssize_t nvme_sysfs_delete(struct device *dev,
2781 struct device_attribute *attr, const char *buf, 2880 struct device_attribute *attr, const char *buf,
@@ -2855,6 +2954,7 @@ static struct attribute *nvme_dev_attrs[] = {
2855 &dev_attr_subsysnqn.attr, 2954 &dev_attr_subsysnqn.attr,
2856 &dev_attr_address.attr, 2955 &dev_attr_address.attr,
2857 &dev_attr_state.attr, 2956 &dev_attr_state.attr,
2957 &dev_attr_numa_node.attr,
2858 NULL 2958 NULL
2859}; 2959};
2860 2960
@@ -3065,7 +3165,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3065 struct gendisk *disk; 3165 struct gendisk *disk;
3066 struct nvme_id_ns *id; 3166 struct nvme_id_ns *id;
3067 char disk_name[DISK_NAME_LEN]; 3167 char disk_name[DISK_NAME_LEN];
3068 int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; 3168 int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
3069 3169
3070 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); 3170 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3071 if (!ns) 3171 if (!ns)
@@ -3100,13 +3200,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3100 nvme_setup_streams_ns(ctrl, ns); 3200 nvme_setup_streams_ns(ctrl, ns);
3101 nvme_set_disk_name(disk_name, ns, ctrl, &flags); 3201 nvme_set_disk_name(disk_name, ns, ctrl, &flags);
3102 3202
3103 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3104 if (nvme_nvm_register(ns, disk_name, node)) {
3105 dev_warn(ctrl->device, "LightNVM init failure\n");
3106 goto out_unlink_ns;
3107 }
3108 }
3109
3110 disk = alloc_disk_node(0, node); 3203 disk = alloc_disk_node(0, node);
3111 if (!disk) 3204 if (!disk)
3112 goto out_unlink_ns; 3205 goto out_unlink_ns;
@@ -3120,6 +3213,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3120 3213
3121 __nvme_revalidate_disk(disk, id); 3214 __nvme_revalidate_disk(disk, id);
3122 3215
3216 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3217 if (nvme_nvm_register(ns, disk_name, node)) {
3218 dev_warn(ctrl->device, "LightNVM init failure\n");
3219 goto out_put_disk;
3220 }
3221 }
3222
3123 down_write(&ctrl->namespaces_rwsem); 3223 down_write(&ctrl->namespaces_rwsem);
3124 list_add_tail(&ns->list, &ctrl->namespaces); 3224 list_add_tail(&ns->list, &ctrl->namespaces);
3125 up_write(&ctrl->namespaces_rwsem); 3225 up_write(&ctrl->namespaces_rwsem);
@@ -3133,6 +3233,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3133 kfree(id); 3233 kfree(id);
3134 3234
3135 return; 3235 return;
3236 out_put_disk:
3237 put_disk(ns->disk);
3136 out_unlink_ns: 3238 out_unlink_ns:
3137 mutex_lock(&ctrl->subsys->lock); 3239 mutex_lock(&ctrl->subsys->lock);
3138 list_del_rcu(&ns->siblings); 3240 list_del_rcu(&ns->siblings);
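
The two hunks above move LightNVM registration to after __nvme_revalidate_disk() and add an out_put_disk: label, so a late registration failure first drops the already-allocated gendisk and then falls through to the existing cleanup labels. A compact userspace sketch of that layered goto-unwind idiom (the resources and names here are invented stand-ins):

	/* Sketch of the layered goto-unwind idiom used in nvme_alloc_ns(): each
	 * later failure jumps to a label that releases everything acquired so
	 * far, in reverse order. Resources are just malloc'd stand-ins. */
	#include <stdio.h>
	#include <stdlib.h>

	static int toy_alloc_ns(int fail_late)
	{
		void *ns, *disk;

		ns = malloc(32);
		if (!ns)
			goto out;
		disk = malloc(64);		/* alloc_disk_node() stand-in */
		if (!disk)
			goto out_free_ns;
		if (fail_late)			/* e.g. late registration failed */
			goto out_put_disk;

		printf("namespace ready\n");
		free(disk);			/* sketch only: avoid leaking */
		free(ns);
		return 0;

	out_put_disk:
		free(disk);			/* put_disk() stand-in */
	out_free_ns:
		free(ns);
	out:
		return -1;
	}

	int main(void)
	{
		toy_alloc_ns(0);	/* success */
		toy_alloc_ns(1);	/* unwinds disk, then ns */
		return 0;
	}
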
@@ -3522,6 +3624,7 @@ static void nvme_free_ctrl(struct device *dev)
3522 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 3624 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3523 kfree(ctrl->effects); 3625 kfree(ctrl->effects);
3524 nvme_mpath_uninit(ctrl); 3626 nvme_mpath_uninit(ctrl);
3627 __free_page(ctrl->discard_page);
3525 3628
3526 if (subsys) { 3629 if (subsys) {
3527 mutex_lock(&subsys->lock); 3630 mutex_lock(&subsys->lock);
@@ -3562,6 +3665,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
3562 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); 3665 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
3563 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; 3666 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
3564 3667
3668 BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
3669 PAGE_SIZE);
3670 ctrl->discard_page = alloc_page(GFP_KERNEL);
3671 if (!ctrl->discard_page) {
3672 ret = -ENOMEM;
3673 goto out;
3674 }
3675
3565 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); 3676 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
3566 if (ret < 0) 3677 if (ret < 0)
3567 goto out; 3678 goto out;
@@ -3599,6 +3710,8 @@ out_free_name:
3599out_release_instance: 3710out_release_instance:
3600 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 3711 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3601out: 3712out:
3713 if (ctrl->discard_page)
3714 __free_page(ctrl->discard_page);
3602 return ret; 3715 return ret;
3603} 3716}
3604EXPORT_SYMBOL_GPL(nvme_init_ctrl); 3717EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -3746,7 +3859,7 @@ out:
3746 return result; 3859 return result;
3747} 3860}
3748 3861
3749void nvme_core_exit(void) 3862void __exit nvme_core_exit(void)
3750{ 3863{
3751 ida_destroy(&nvme_subsystems_ida); 3864 ida_destroy(&nvme_subsystems_ida);
3752 class_destroy(nvme_subsys_class); 3865 class_destroy(nvme_subsys_class);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index bd0969db6225..b2ab213f43de 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -159,7 +159,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
159 cmd.prop_get.offset = cpu_to_le32(off); 159 cmd.prop_get.offset = cpu_to_le32(off);
160 160
161 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, 161 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
162 NVME_QID_ANY, 0, 0); 162 NVME_QID_ANY, 0, 0, false);
163 163
164 if (ret >= 0) 164 if (ret >= 0)
165 *val = le64_to_cpu(res.u64); 165 *val = le64_to_cpu(res.u64);
@@ -206,7 +206,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
206 cmd.prop_get.offset = cpu_to_le32(off); 206 cmd.prop_get.offset = cpu_to_le32(off);
207 207
208 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, 208 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
209 NVME_QID_ANY, 0, 0); 209 NVME_QID_ANY, 0, 0, false);
210 210
211 if (ret >= 0) 211 if (ret >= 0)
212 *val = le64_to_cpu(res.u64); 212 *val = le64_to_cpu(res.u64);
@@ -252,7 +252,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
252 cmd.prop_set.value = cpu_to_le64(val); 252 cmd.prop_set.value = cpu_to_le64(val);
253 253
254 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, 254 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
255 NVME_QID_ANY, 0, 0); 255 NVME_QID_ANY, 0, 0, false);
256 if (unlikely(ret)) 256 if (unlikely(ret))
257 dev_err(ctrl->device, 257 dev_err(ctrl->device,
258 "Property Set error: %d, offset %#x\n", 258 "Property Set error: %d, offset %#x\n",
@@ -392,6 +392,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
392 cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : 392 cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
393 cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); 393 cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
394 394
395 if (ctrl->opts->disable_sqflow)
396 cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
397
395 data = kzalloc(sizeof(*data), GFP_KERNEL); 398 data = kzalloc(sizeof(*data), GFP_KERNEL);
396 if (!data) 399 if (!data)
397 return -ENOMEM; 400 return -ENOMEM;
@@ -403,7 +406,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
403 406
404 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, 407 ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
405 data, sizeof(*data), 0, NVME_QID_ANY, 1, 408 data, sizeof(*data), 0, NVME_QID_ANY, 1,
406 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); 409 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
407 if (ret) { 410 if (ret) {
408 nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), 411 nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
409 &cmd, data); 412 &cmd, data);
@@ -438,7 +441,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_admin_queue);
438 * > 0: NVMe error status code 441 * > 0: NVMe error status code
439 * < 0: Linux errno error code 442 * < 0: Linux errno error code
440 */ 443 */
441int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) 444int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll)
442{ 445{
443 struct nvme_command cmd; 446 struct nvme_command cmd;
444 struct nvmf_connect_data *data; 447 struct nvmf_connect_data *data;
@@ -451,6 +454,9 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
451 cmd.connect.qid = cpu_to_le16(qid); 454 cmd.connect.qid = cpu_to_le16(qid);
452 cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); 455 cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
453 456
457 if (ctrl->opts->disable_sqflow)
458 cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
459
454 data = kzalloc(sizeof(*data), GFP_KERNEL); 460 data = kzalloc(sizeof(*data), GFP_KERNEL);
455 if (!data) 461 if (!data)
456 return -ENOMEM; 462 return -ENOMEM;
@@ -462,7 +468,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
462 468
463 ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, 469 ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
464 data, sizeof(*data), 0, qid, 1, 470 data, sizeof(*data), 0, qid, 1,
465 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); 471 BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, poll);
466 if (ret) { 472 if (ret) {
467 nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), 473 nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
468 &cmd, data); 474 &cmd, data);
@@ -607,6 +613,11 @@ static const match_table_t opt_tokens = {
607 { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, 613 { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
608 { NVMF_OPT_HOST_ID, "hostid=%s" }, 614 { NVMF_OPT_HOST_ID, "hostid=%s" },
609 { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, 615 { NVMF_OPT_DUP_CONNECT, "duplicate_connect" },
616 { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" },
617 { NVMF_OPT_HDR_DIGEST, "hdr_digest" },
618 { NVMF_OPT_DATA_DIGEST, "data_digest" },
619 { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
620 { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
610 { NVMF_OPT_ERR, NULL } 621 { NVMF_OPT_ERR, NULL }
611}; 622};
612 623
@@ -626,6 +637,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
626 opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; 637 opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
627 opts->kato = NVME_DEFAULT_KATO; 638 opts->kato = NVME_DEFAULT_KATO;
628 opts->duplicate_connect = false; 639 opts->duplicate_connect = false;
640 opts->hdr_digest = false;
641 opts->data_digest = false;
629 642
630 options = o = kstrdup(buf, GFP_KERNEL); 643 options = o = kstrdup(buf, GFP_KERNEL);
631 if (!options) 644 if (!options)
@@ -817,6 +830,39 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
817 case NVMF_OPT_DUP_CONNECT: 830 case NVMF_OPT_DUP_CONNECT:
818 opts->duplicate_connect = true; 831 opts->duplicate_connect = true;
819 break; 832 break;
833 case NVMF_OPT_DISABLE_SQFLOW:
834 opts->disable_sqflow = true;
835 break;
836 case NVMF_OPT_HDR_DIGEST:
837 opts->hdr_digest = true;
838 break;
839 case NVMF_OPT_DATA_DIGEST:
840 opts->data_digest = true;
841 break;
842 case NVMF_OPT_NR_WRITE_QUEUES:
843 if (match_int(args, &token)) {
844 ret = -EINVAL;
845 goto out;
846 }
847 if (token <= 0) {
848 pr_err("Invalid nr_write_queues %d\n", token);
849 ret = -EINVAL;
850 goto out;
851 }
852 opts->nr_write_queues = token;
853 break;
854 case NVMF_OPT_NR_POLL_QUEUES:
855 if (match_int(args, &token)) {
856 ret = -EINVAL;
857 goto out;
858 }
859 if (token <= 0) {
860 pr_err("Invalid nr_poll_queues %d\n", token);
861 ret = -EINVAL;
862 goto out;
863 }
864 opts->nr_poll_queues = token;
865 break;
820 default: 866 default:
821 pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", 867 pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
822 p); 868 p);
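
The new nr_write_queues/nr_poll_queues options above follow the parser's usual shape: match_int() extracts the token, and anything that is not a positive integer is rejected with -EINVAL before the value is stored in the options. A standalone C sketch of that validate-then-store step, with strtol standing in for match_int() (the option names are the real ones, the helper is mine):

	/* Sketch of "parse integer token, reject non-positive values" as used for
	 * nr_write_queues / nr_poll_queues; strtol stands in for match_int(). */
	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	static int parse_queue_count(const char *arg, unsigned int *out)
	{
		char *end;
		long val;

		errno = 0;
		val = strtol(arg, &end, 10);
		if (errno || *end != '\0' || val <= 0) {
			fprintf(stderr, "Invalid queue count '%s'\n", arg);
			return -EINVAL;
		}
		*out = (unsigned int)val;
		return 0;
	}

	int main(void)
	{
		unsigned int nr_poll_queues;

		if (parse_queue_count("4", &nr_poll_queues) == 0)
			printf("nr_poll_queues=%u\n", nr_poll_queues);
		parse_queue_count("0", &nr_poll_queues);	/* rejected */
		return 0;
	}
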
@@ -933,7 +979,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
933#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) 979#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
934#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ 980#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
935 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ 981 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
936 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) 982 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
983 NVMF_OPT_DISABLE_SQFLOW)
937 984
938static struct nvme_ctrl * 985static struct nvme_ctrl *
939nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) 986nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 6ea6275f332a..478343b73e38 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -58,6 +58,11 @@ enum {
58 NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, 58 NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
59 NVMF_OPT_HOST_ID = 1 << 12, 59 NVMF_OPT_HOST_ID = 1 << 12,
60 NVMF_OPT_DUP_CONNECT = 1 << 13, 60 NVMF_OPT_DUP_CONNECT = 1 << 13,
61 NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
62 NVMF_OPT_HDR_DIGEST = 1 << 15,
63 NVMF_OPT_DATA_DIGEST = 1 << 16,
64 NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
65 NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
61}; 66};
62 67
63/** 68/**
@@ -85,6 +90,11 @@ enum {
85 * @max_reconnects: maximum number of allowed reconnect attempts before removing 90 * @max_reconnects: maximum number of allowed reconnect attempts before removing
86 * the controller, (-1) means reconnect forever, zero means remove 91 * the controller, (-1) means reconnect forever, zero means remove
87 * immediately; 92 * immediately;
93 * @disable_sqflow: disable controller sq flow control
94 * @hdr_digest: generate/verify header digest (TCP)
95 * @data_digest: generate/verify data digest (TCP)
96 * @nr_write_queues: number of queues for write I/O
97 * @nr_poll_queues: number of queues for polling I/O
88 */ 98 */
89struct nvmf_ctrl_options { 99struct nvmf_ctrl_options {
90 unsigned mask; 100 unsigned mask;
@@ -101,6 +111,11 @@ struct nvmf_ctrl_options {
101 unsigned int kato; 111 unsigned int kato;
102 struct nvmf_host *host; 112 struct nvmf_host *host;
103 int max_reconnects; 113 int max_reconnects;
114 bool disable_sqflow;
115 bool hdr_digest;
116 bool data_digest;
117 unsigned int nr_write_queues;
118 unsigned int nr_poll_queues;
104}; 119};
105 120
106/* 121/*
@@ -156,7 +171,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
156int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); 171int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
157int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); 172int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
158int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl); 173int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
159int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); 174int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll);
160int nvmf_register_transport(struct nvmf_transport_ops *ops); 175int nvmf_register_transport(struct nvmf_transport_ops *ops);
161void nvmf_unregister_transport(struct nvmf_transport_ops *ops); 176void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
162void nvmf_free_options(struct nvmf_ctrl_options *opts); 177void nvmf_free_options(struct nvmf_ctrl_options *opts);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index feb86b59170e..89accc76d71c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1975,7 +1975,7 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
1975 (qsize / 5)); 1975 (qsize / 5));
1976 if (ret) 1976 if (ret)
1977 break; 1977 break;
1978 ret = nvmf_connect_io_queue(&ctrl->ctrl, i); 1978 ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false);
1979 if (ret) 1979 if (ret)
1980 break; 1980 break;
1981 1981
@@ -2326,38 +2326,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
2326 return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir); 2326 return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
2327} 2327}
2328 2328
2329static struct blk_mq_tags *
2330nvme_fc_tagset(struct nvme_fc_queue *queue)
2331{
2332 if (queue->qnum == 0)
2333 return queue->ctrl->admin_tag_set.tags[queue->qnum];
2334
2335 return queue->ctrl->tag_set.tags[queue->qnum - 1];
2336}
2337
2338static int
2339nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
2340
2341{
2342 struct nvme_fc_queue *queue = hctx->driver_data;
2343 struct nvme_fc_ctrl *ctrl = queue->ctrl;
2344 struct request *req;
2345 struct nvme_fc_fcp_op *op;
2346
2347 req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
2348 if (!req)
2349 return 0;
2350
2351 op = blk_mq_rq_to_pdu(req);
2352
2353 if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) &&
2354 (ctrl->lport->ops->poll_queue))
2355 ctrl->lport->ops->poll_queue(&ctrl->lport->localport,
2356 queue->lldd_handle);
2357
2358 return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE));
2359}
2360
2361static void 2329static void
2362nvme_fc_submit_async_event(struct nvme_ctrl *arg) 2330nvme_fc_submit_async_event(struct nvme_ctrl *arg)
2363{ 2331{
@@ -2410,7 +2378,7 @@ nvme_fc_complete_rq(struct request *rq)
2410 * status. The done path will return the io request back to the block 2378 * status. The done path will return the io request back to the block
2411 * layer with an error status. 2379 * layer with an error status.
2412 */ 2380 */
2413static void 2381static bool
2414nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) 2382nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
2415{ 2383{
2416 struct nvme_ctrl *nctrl = data; 2384 struct nvme_ctrl *nctrl = data;
@@ -2418,6 +2386,7 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
2418 struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); 2386 struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
2419 2387
2420 __nvme_fc_abort_op(ctrl, op); 2388 __nvme_fc_abort_op(ctrl, op);
2389 return true;
2421} 2390}
2422 2391
2423 2392
@@ -2427,7 +2396,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = {
2427 .init_request = nvme_fc_init_request, 2396 .init_request = nvme_fc_init_request,
2428 .exit_request = nvme_fc_exit_request, 2397 .exit_request = nvme_fc_exit_request,
2429 .init_hctx = nvme_fc_init_hctx, 2398 .init_hctx = nvme_fc_init_hctx,
2430 .poll = nvme_fc_poll,
2431 .timeout = nvme_fc_timeout, 2399 .timeout = nvme_fc_timeout,
2432}; 2400};
2433 2401
@@ -2457,7 +2425,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
2457 ctrl->tag_set.ops = &nvme_fc_mq_ops; 2425 ctrl->tag_set.ops = &nvme_fc_mq_ops;
2458 ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; 2426 ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
2459 ctrl->tag_set.reserved_tags = 1; /* fabric connect */ 2427 ctrl->tag_set.reserved_tags = 1; /* fabric connect */
2460 ctrl->tag_set.numa_node = NUMA_NO_NODE; 2428 ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
2461 ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 2429 ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
2462 ctrl->tag_set.cmd_size = 2430 ctrl->tag_set.cmd_size =
2463 struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, 2431 struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
@@ -3050,6 +3018,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
3050 3018
3051 ctrl->ctrl.opts = opts; 3019 ctrl->ctrl.opts = opts;
3052 ctrl->ctrl.nr_reconnects = 0; 3020 ctrl->ctrl.nr_reconnects = 0;
3021 ctrl->ctrl.numa_node = dev_to_node(lport->dev);
3053 INIT_LIST_HEAD(&ctrl->ctrl_list); 3022 INIT_LIST_HEAD(&ctrl->ctrl_list);
3054 ctrl->lport = lport; 3023 ctrl->lport = lport;
3055 ctrl->rport = rport; 3024 ctrl->rport = rport;
@@ -3090,7 +3059,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
3090 ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; 3059 ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
3091 ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; 3060 ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
3092 ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ 3061 ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
3093 ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; 3062 ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
3094 ctrl->admin_tag_set.cmd_size = 3063 ctrl->admin_tag_set.cmd_size =
3095 struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, 3064 struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
3096 ctrl->lport->ops->fcprqst_priv_sz); 3065 ctrl->lport->ops->fcprqst_priv_sz);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index a4f3b263cd6c..b759c25c89c8 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -577,7 +577,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
577 struct ppa_addr ppa; 577 struct ppa_addr ppa;
578 size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); 578 size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
579 size_t log_pos, offset, len; 579 size_t log_pos, offset, len;
580 int ret, i, max_len; 580 int i, max_len;
581 int ret = 0;
581 582
582 /* 583 /*
583 * limit requests to maximum 256K to avoid issuing arbitrary large 584 * limit requests to maximum 256K to avoid issuing arbitrary large
@@ -731,11 +732,12 @@ static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
731 return ret; 732 return ret;
732} 733}
733 734
734static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) 735static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name,
736 int size)
735{ 737{
736 struct nvme_ns *ns = nvmdev->q->queuedata; 738 struct nvme_ns *ns = nvmdev->q->queuedata;
737 739
738 return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); 740 return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0);
739} 741}
740 742
741static void nvme_nvm_destroy_dma_pool(void *pool) 743static void nvme_nvm_destroy_dma_pool(void *pool)
@@ -935,9 +937,9 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
935 /* cdw11-12 */ 937 /* cdw11-12 */
936 c.ph_rw.length = cpu_to_le16(vcmd.nppas); 938 c.ph_rw.length = cpu_to_le16(vcmd.nppas);
937 c.ph_rw.control = cpu_to_le16(vcmd.control); 939 c.ph_rw.control = cpu_to_le16(vcmd.control);
938 c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13); 940 c.common.cdw13 = cpu_to_le32(vcmd.cdw13);
939 c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14); 941 c.common.cdw14 = cpu_to_le32(vcmd.cdw14);
940 c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15); 942 c.common.cdw15 = cpu_to_le32(vcmd.cdw15);
941 943
942 if (vcmd.timeout_ms) 944 if (vcmd.timeout_ms)
943 timeout = msecs_to_jiffies(vcmd.timeout_ms); 945 timeout = msecs_to_jiffies(vcmd.timeout_ms);
@@ -972,22 +974,11 @@ int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg)
972 } 974 }
973} 975}
974 976
975void nvme_nvm_update_nvm_info(struct nvme_ns *ns)
976{
977 struct nvm_dev *ndev = ns->ndev;
978 struct nvm_geo *geo = &ndev->geo;
979
980 if (geo->version == NVM_OCSSD_SPEC_12)
981 return;
982
983 geo->csecs = 1 << ns->lba_shift;
984 geo->sos = ns->ms;
985}
986
987int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) 977int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
988{ 978{
989 struct request_queue *q = ns->queue; 979 struct request_queue *q = ns->queue;
990 struct nvm_dev *dev; 980 struct nvm_dev *dev;
981 struct nvm_geo *geo;
991 982
992 _nvme_nvm_check_size(); 983 _nvme_nvm_check_size();
993 984
@@ -995,6 +986,12 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
995 if (!dev) 986 if (!dev)
996 return -ENOMEM; 987 return -ENOMEM;
997 988
989 /* Note that csecs and sos will be overridden if it is a 1.2 drive. */
990 geo = &dev->geo;
991 geo->csecs = 1 << ns->lba_shift;
992 geo->sos = ns->ms;
993 geo->ext = ns->ext;
994
998 dev->q = q; 995 dev->q = q;
999 memcpy(dev->name, disk_name, DISK_NAME_LEN); 996 memcpy(dev->name, disk_name, DISK_NAME_LEN);
1000 dev->ops = &nvme_nvm_dev_ops; 997 dev->ops = &nvme_nvm_dev_ops;
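
The geometry setup that used to live in nvme_nvm_update_nvm_info() now happens unconditionally in nvme_nvm_register(), and the DMA pool helper takes its element size from the caller instead of hardcoding PAGE_SIZE. The short standalone sketch below restates only the geometry arithmetic; the fake_* types and the example values (lba_shift 12, ms 16) are assumptions, not values read from a device:

#include <stdio.h>

struct fake_ns  { unsigned lba_shift; unsigned ms; int ext; };
struct fake_geo { unsigned csecs; unsigned sos; int ext; };

static void init_geo(struct fake_geo *geo, const struct fake_ns *ns)
{
	geo->csecs = 1u << ns->lba_shift;  /* e.g. lba_shift 12 -> 4096-byte sectors */
	geo->sos   = ns->ms;               /* out-of-band (metadata) bytes per sector */
	geo->ext   = ns->ext;              /* extended/interleaved metadata layout flag */
}

int main(void)
{
	struct fake_ns ns = { .lba_shift = 12, .ms = 16, .ext = 0 };
	struct fake_geo geo;

	init_geo(&geo, &ns);
	printf("csecs=%u sos=%u ext=%d\n", geo.csecs, geo.sos, geo.ext);
	return 0;
}
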
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 9901afd804ce..183ec17ba067 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
141 test_bit(NVME_NS_ANA_PENDING, &ns->flags)) 141 test_bit(NVME_NS_ANA_PENDING, &ns->flags))
142 continue; 142 continue;
143 143
144 distance = node_distance(node, dev_to_node(ns->ctrl->dev)); 144 distance = node_distance(node, ns->ctrl->numa_node);
145 145
146 switch (ns->ana_state) { 146 switch (ns->ana_state) {
147 case NVME_ANA_OPTIMIZED: 147 case NVME_ANA_OPTIMIZED:
@@ -220,21 +220,6 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
220 return ret; 220 return ret;
221} 221}
222 222
223static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
224{
225 struct nvme_ns_head *head = q->queuedata;
226 struct nvme_ns *ns;
227 bool found = false;
228 int srcu_idx;
229
230 srcu_idx = srcu_read_lock(&head->srcu);
231 ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu);
232 if (likely(ns && nvme_path_is_optimized(ns)))
233 found = ns->queue->poll_fn(q, qc);
234 srcu_read_unlock(&head->srcu, srcu_idx);
235 return found;
236}
237
238static void nvme_requeue_work(struct work_struct *work) 223static void nvme_requeue_work(struct work_struct *work)
239{ 224{
240 struct nvme_ns_head *head = 225 struct nvme_ns_head *head =
@@ -276,12 +261,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
276 if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) 261 if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
277 return 0; 262 return 0;
278 263
279 q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL); 264 q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
280 if (!q) 265 if (!q)
281 goto out; 266 goto out;
282 q->queuedata = head; 267 q->queuedata = head;
283 blk_queue_make_request(q, nvme_ns_head_make_request); 268 blk_queue_make_request(q, nvme_ns_head_make_request);
284 q->poll_fn = nvme_ns_head_poll;
285 blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 269 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
286 /* set to a default value for 512 until disk is validated */ 270 /* set to a default value for 512 until disk is validated */
287 blk_queue_logical_block_size(q, 512); 271 blk_queue_logical_block_size(q, 512);
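
With nvme_ns_head_poll() gone, the remaining multipath change above is that path selection measures NUMA distance against the cached ctrl->numa_node. The sketch below models the overall selection policy, an optimized path preferred over a non-optimized one and the smallest node distance winning within each class; the types, names and distance values are illustrative only, and real distances come from firmware tables:

#include <limits.h>
#include <stdio.h>

enum ana_state { ANA_OPTIMIZED, ANA_NONOPTIMIZED, ANA_INACCESSIBLE };

struct fake_path { const char *name; enum ana_state ana; int ctrl_node; };

/* stand-in for node_distance(); the values here are made up */
static int node_distance(int a, int b) { return a == b ? 10 : 20; }

static const struct fake_path *find_path(const struct fake_path *p, int n,
					 int local_node)
{
	const struct fake_path *found = NULL, *fallback = NULL;
	int found_dist = INT_MAX, fallback_dist = INT_MAX;

	for (int i = 0; i < n; i++) {
		int dist = node_distance(local_node, p[i].ctrl_node);

		if (p[i].ana == ANA_OPTIMIZED && dist < found_dist) {
			found = &p[i];
			found_dist = dist;
		} else if (p[i].ana == ANA_NONOPTIMIZED && dist < fallback_dist) {
			fallback = &p[i];
			fallback_dist = dist;
		}
	}
	return found ? found : fallback;
}

int main(void)
{
	struct fake_path paths[] = {
		{ "nvme0c0n1", ANA_NONOPTIMIZED, 0 },
		{ "nvme0c1n1", ANA_OPTIMIZED,    1 },
		{ "nvme0c2n1", ANA_OPTIMIZED,    0 },
	};

	/* from node 0, the optimized path whose controller sits on node 0 wins */
	printf("%s\n", find_path(paths, 3, 0)->name);
	return 0;
}
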
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 081cbdcce880..2b36ac922596 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -145,6 +145,7 @@ enum nvme_ctrl_state {
145}; 145};
146 146
147struct nvme_ctrl { 147struct nvme_ctrl {
148 bool comp_seen;
148 enum nvme_ctrl_state state; 149 enum nvme_ctrl_state state;
149 bool identified; 150 bool identified;
150 spinlock_t lock; 151 spinlock_t lock;
@@ -153,6 +154,7 @@ struct nvme_ctrl {
153 struct request_queue *connect_q; 154 struct request_queue *connect_q;
154 struct device *dev; 155 struct device *dev;
155 int instance; 156 int instance;
157 int numa_node;
156 struct blk_mq_tag_set *tagset; 158 struct blk_mq_tag_set *tagset;
157 struct blk_mq_tag_set *admin_tagset; 159 struct blk_mq_tag_set *admin_tagset;
158 struct list_head namespaces; 160 struct list_head namespaces;
@@ -179,6 +181,7 @@ struct nvme_ctrl {
179 u32 page_size; 181 u32 page_size;
180 u32 max_hw_sectors; 182 u32 max_hw_sectors;
181 u32 max_segments; 183 u32 max_segments;
184 u16 crdt[3];
182 u16 oncs; 185 u16 oncs;
183 u16 oacs; 186 u16 oacs;
184 u16 nssa; 187 u16 nssa;
@@ -193,6 +196,7 @@ struct nvme_ctrl {
193 u8 apsta; 196 u8 apsta;
194 u32 oaes; 197 u32 oaes;
195 u32 aen_result; 198 u32 aen_result;
199 u32 ctratt;
196 unsigned int shutdown_timeout; 200 unsigned int shutdown_timeout;
197 unsigned int kato; 201 unsigned int kato;
198 bool subsystem; 202 bool subsystem;
@@ -237,6 +241,9 @@ struct nvme_ctrl {
237 u16 maxcmd; 241 u16 maxcmd;
238 int nr_reconnects; 242 int nr_reconnects;
239 struct nvmf_ctrl_options *opts; 243 struct nvmf_ctrl_options *opts;
244
245 struct page *discard_page;
246 unsigned long discard_page_busy;
240}; 247};
241 248
242struct nvme_subsystem { 249struct nvme_subsystem {
@@ -364,15 +371,6 @@ static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {}
364static inline void nvme_should_fail(struct request *req) {} 371static inline void nvme_should_fail(struct request *req) {}
365#endif 372#endif
366 373
367static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
368{
369 u32 val = 0;
370
371 if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
372 return false;
373 return val & NVME_CSTS_RDY;
374}
375
376static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) 374static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
377{ 375{
378 if (!ctrl->subsystem) 376 if (!ctrl->subsystem)
@@ -408,7 +406,7 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
408} 406}
409 407
410void nvme_complete_rq(struct request *req); 408void nvme_complete_rq(struct request *req);
411void nvme_cancel_request(struct request *req, void *data, bool reserved); 409bool nvme_cancel_request(struct request *req, void *data, bool reserved);
412bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 410bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
413 enum nvme_ctrl_state new_state); 411 enum nvme_ctrl_state new_state);
414int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); 412int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
@@ -449,7 +447,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
449int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 447int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
450 union nvme_result *result, void *buffer, unsigned bufflen, 448 union nvme_result *result, void *buffer, unsigned bufflen,
451 unsigned timeout, int qid, int at_head, 449 unsigned timeout, int qid, int at_head,
452 blk_mq_req_flags_t flags); 450 blk_mq_req_flags_t flags, bool poll);
453int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); 451int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
454void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); 452void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
455int nvme_reset_ctrl(struct nvme_ctrl *ctrl); 453int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
@@ -545,13 +543,11 @@ static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
545#endif /* CONFIG_NVME_MULTIPATH */ 543#endif /* CONFIG_NVME_MULTIPATH */
546 544
547#ifdef CONFIG_NVM 545#ifdef CONFIG_NVM
548void nvme_nvm_update_nvm_info(struct nvme_ns *ns);
549int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); 546int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
550void nvme_nvm_unregister(struct nvme_ns *ns); 547void nvme_nvm_unregister(struct nvme_ns *ns);
551extern const struct attribute_group nvme_nvm_attr_group; 548extern const struct attribute_group nvme_nvm_attr_group;
552int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); 549int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
553#else 550#else
554static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {};
555static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, 551static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
556 int node) 552 int node)
557{ 553{
@@ -572,6 +568,6 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
572} 568}
573 569
574int __init nvme_core_init(void); 570int __init nvme_core_init(void);
575void nvme_core_exit(void); 571void __exit nvme_core_exit(void);
576 572
577#endif /* _NVME_H */ 573#endif /* _NVME_H */
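
Among the new nvme_ctrl fields, discard_page and discard_page_busy back an emergency fallback for discard range payloads: a single page preallocated at controller setup that a request may claim with an atomic test-and-set when the normal allocation fails under memory pressure. The consumer lives in core.c rather than in this hunk, so the following is only a rough userspace model of those semantics, with invented names and C11 atomics standing in for the kernel bit operations:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

struct fake_ctrl {
	void *discard_page;                  /* preallocated at controller init */
	atomic_flag discard_page_busy;
};

/* alloc_fails simulates the normal allocation failing under memory pressure */
static void *get_discard_buf(struct fake_ctrl *ctrl, int alloc_fails)
{
	if (!alloc_fails)
		return malloc(PAGE_SIZE);    /* normal allocation path */
	/* fallback: claim the single reserved page if nobody else holds it */
	if (!atomic_flag_test_and_set(&ctrl->discard_page_busy))
		return ctrl->discard_page;
	return NULL;                         /* caller must back off and retry */
}

static void put_discard_buf(struct fake_ctrl *ctrl, void *buf)
{
	if (buf == ctrl->discard_page)
		atomic_flag_clear(&ctrl->discard_page_busy);
	else
		free(buf);
}

int main(void)
{
	struct fake_ctrl ctrl = {
		.discard_page = malloc(PAGE_SIZE),
		.discard_page_busy = ATOMIC_FLAG_INIT,
	};
	void *a = get_discard_buf(&ctrl, 1); /* pressure: takes the reserved page */
	void *b = get_discard_buf(&ctrl, 1); /* page busy: must be retried later */

	printf("a=%s b=%s\n", a == ctrl.discard_page ? "reserved page" : "heap",
	       b ? "heap" : "retry");
	put_discard_buf(&ctrl, a);
	free(ctrl.discard_page);
	return 0;
}
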
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c33bb201b884..5a0bf6a24d50 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -32,6 +32,7 @@
32#include <linux/sed-opal.h> 32#include <linux/sed-opal.h>
33#include <linux/pci-p2pdma.h> 33#include <linux/pci-p2pdma.h>
34 34
35#include "trace.h"
35#include "nvme.h" 36#include "nvme.h"
36 37
37#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) 38#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
@@ -74,6 +75,22 @@ static int io_queue_depth = 1024;
74module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); 75module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
75MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); 76MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
76 77
78static int queue_count_set(const char *val, const struct kernel_param *kp);
79static const struct kernel_param_ops queue_count_ops = {
80 .set = queue_count_set,
81 .get = param_get_int,
82};
83
84static int write_queues;
85module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
86MODULE_PARM_DESC(write_queues,
87 "Number of queues to use for writes. If not set, reads and writes "
88 "will share a queue set.");
89
90static int poll_queues = 0;
91module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
92MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
93
77struct nvme_dev; 94struct nvme_dev;
78struct nvme_queue; 95struct nvme_queue;
79 96
@@ -92,6 +109,7 @@ struct nvme_dev {
92 struct dma_pool *prp_small_pool; 109 struct dma_pool *prp_small_pool;
93 unsigned online_queues; 110 unsigned online_queues;
94 unsigned max_qid; 111 unsigned max_qid;
112 unsigned io_queues[HCTX_MAX_TYPES];
95 unsigned int num_vecs; 113 unsigned int num_vecs;
96 int q_depth; 114 int q_depth;
97 u32 db_stride; 115 u32 db_stride;
@@ -105,7 +123,6 @@ struct nvme_dev {
105 u32 cmbsz; 123 u32 cmbsz;
106 u32 cmbloc; 124 u32 cmbloc;
107 struct nvme_ctrl ctrl; 125 struct nvme_ctrl ctrl;
108 struct completion ioq_wait;
109 126
110 mempool_t *iod_mempool; 127 mempool_t *iod_mempool;
111 128
@@ -134,6 +151,17 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
134 return param_set_int(val, kp); 151 return param_set_int(val, kp);
135} 152}
136 153
154static int queue_count_set(const char *val, const struct kernel_param *kp)
155{
156 int n = 0, ret;
157
158 ret = kstrtoint(val, 10, &n);
159 if (n > num_possible_cpus())
160 n = num_possible_cpus();
161
162 return param_set_int(val, kp);
163}
164
137static inline unsigned int sq_idx(unsigned int qid, u32 stride) 165static inline unsigned int sq_idx(unsigned int qid, u32 stride)
138{ 166{
139 return qid * 2 * stride; 167 return qid * 2 * stride;
@@ -158,8 +186,8 @@ struct nvme_queue {
158 struct nvme_dev *dev; 186 struct nvme_dev *dev;
159 spinlock_t sq_lock; 187 spinlock_t sq_lock;
160 struct nvme_command *sq_cmds; 188 struct nvme_command *sq_cmds;
161 bool sq_cmds_is_io; 189 /* only used for poll queues: */
162 spinlock_t cq_lock ____cacheline_aligned_in_smp; 190 spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
163 volatile struct nvme_completion *cqes; 191 volatile struct nvme_completion *cqes;
164 struct blk_mq_tags **tags; 192 struct blk_mq_tags **tags;
165 dma_addr_t sq_dma_addr; 193 dma_addr_t sq_dma_addr;
@@ -168,14 +196,20 @@ struct nvme_queue {
168 u16 q_depth; 196 u16 q_depth;
169 s16 cq_vector; 197 s16 cq_vector;
170 u16 sq_tail; 198 u16 sq_tail;
199 u16 last_sq_tail;
171 u16 cq_head; 200 u16 cq_head;
172 u16 last_cq_head; 201 u16 last_cq_head;
173 u16 qid; 202 u16 qid;
174 u8 cq_phase; 203 u8 cq_phase;
204 unsigned long flags;
205#define NVMEQ_ENABLED 0
206#define NVMEQ_SQ_CMB 1
207#define NVMEQ_DELETE_ERROR 2
175 u32 *dbbuf_sq_db; 208 u32 *dbbuf_sq_db;
176 u32 *dbbuf_cq_db; 209 u32 *dbbuf_cq_db;
177 u32 *dbbuf_sq_ei; 210 u32 *dbbuf_sq_ei;
178 u32 *dbbuf_cq_ei; 211 u32 *dbbuf_cq_ei;
212 struct completion delete_done;
179}; 213};
180 214
181/* 215/*
@@ -218,9 +252,20 @@ static inline void _nvme_check_size(void)
218 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); 252 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
219} 253}
220 254
255static unsigned int max_io_queues(void)
256{
257 return num_possible_cpus() + write_queues + poll_queues;
258}
259
260static unsigned int max_queue_count(void)
261{
262 /* IO queues + admin queue */
263 return 1 + max_io_queues();
264}
265
221static inline unsigned int nvme_dbbuf_size(u32 stride) 266static inline unsigned int nvme_dbbuf_size(u32 stride)
222{ 267{
223 return ((num_possible_cpus() + 1) * 8 * stride); 268 return (max_queue_count() * 8 * stride);
224} 269}
225 270
226static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) 271static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
@@ -431,30 +476,90 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
431 return 0; 476 return 0;
432} 477}
433 478
479static int queue_irq_offset(struct nvme_dev *dev)
480{
481 /* if we have more than 1 vec, admin queue offsets us by 1 */
482 if (dev->num_vecs > 1)
483 return 1;
484
485 return 0;
486}
487
434static int nvme_pci_map_queues(struct blk_mq_tag_set *set) 488static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
435{ 489{
436 struct nvme_dev *dev = set->driver_data; 490 struct nvme_dev *dev = set->driver_data;
491 int i, qoff, offset;
492
493 offset = queue_irq_offset(dev);
494 for (i = 0, qoff = 0; i < set->nr_maps; i++) {
495 struct blk_mq_queue_map *map = &set->map[i];
496
497 map->nr_queues = dev->io_queues[i];
498 if (!map->nr_queues) {
499 BUG_ON(i == HCTX_TYPE_DEFAULT);
500 continue;
501 }
437 502
438 return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 503 /*
439 dev->num_vecs > 1 ? 1 /* admin queue */ : 0); 504 * The poll queue(s) doesn't have an IRQ (and hence IRQ
505 * affinity), so use the regular blk-mq cpu mapping
506 */
507 map->queue_offset = qoff;
508 if (i != HCTX_TYPE_POLL)
509 blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
510 else
511 blk_mq_map_queues(map);
512 qoff += map->nr_queues;
513 offset += map->nr_queues;
514 }
515
516 return 0;
517}
518
519/*
520 * Write sq tail if we are asked to, or if the next command would wrap.
521 */
522static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
523{
524 if (!write_sq) {
525 u16 next_tail = nvmeq->sq_tail + 1;
526
527 if (next_tail == nvmeq->q_depth)
528 next_tail = 0;
529 if (next_tail != nvmeq->last_sq_tail)
530 return;
531 }
532
533 if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
534 nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
535 writel(nvmeq->sq_tail, nvmeq->q_db);
536 nvmeq->last_sq_tail = nvmeq->sq_tail;
440} 537}
441 538
442/** 539/**
443 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell 540 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
444 * @nvmeq: The queue to use 541 * @nvmeq: The queue to use
445 * @cmd: The command to send 542 * @cmd: The command to send
543 * @write_sq: whether to write to the SQ doorbell
446 */ 544 */
447static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 545static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
546 bool write_sq)
448{ 547{
449 spin_lock(&nvmeq->sq_lock); 548 spin_lock(&nvmeq->sq_lock);
450
451 memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); 549 memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
452
453 if (++nvmeq->sq_tail == nvmeq->q_depth) 550 if (++nvmeq->sq_tail == nvmeq->q_depth)
454 nvmeq->sq_tail = 0; 551 nvmeq->sq_tail = 0;
455 if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail, 552 nvme_write_sq_db(nvmeq, write_sq);
456 nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) 553 spin_unlock(&nvmeq->sq_lock);
457 writel(nvmeq->sq_tail, nvmeq->q_db); 554}
555
556static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
557{
558 struct nvme_queue *nvmeq = hctx->driver_data;
559
560 spin_lock(&nvmeq->sq_lock);
561 if (nvmeq->sq_tail != nvmeq->last_sq_tail)
562 nvme_write_sq_db(nvmeq, true);
458 spin_unlock(&nvmeq->sq_lock); 563 spin_unlock(&nvmeq->sq_lock);
459} 564}
460 565
@@ -822,7 +927,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
822 * We should not need to do this, but we're still using this to 927 * We should not need to do this, but we're still using this to
823 * ensure we can drain requests on a dying queue. 928 * ensure we can drain requests on a dying queue.
824 */ 929 */
825 if (unlikely(nvmeq->cq_vector < 0)) 930 if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
826 return BLK_STS_IOERR; 931 return BLK_STS_IOERR;
827 932
828 ret = nvme_setup_cmd(ns, req, &cmnd); 933 ret = nvme_setup_cmd(ns, req, &cmnd);
@@ -840,7 +945,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
840 } 945 }
841 946
842 blk_mq_start_request(req); 947 blk_mq_start_request(req);
843 nvme_submit_cmd(nvmeq, &cmnd); 948 nvme_submit_cmd(nvmeq, &cmnd, bd->last);
844 return BLK_STS_OK; 949 return BLK_STS_OK;
845out_cleanup_iod: 950out_cleanup_iod:
846 nvme_free_iod(dev, req); 951 nvme_free_iod(dev, req);
@@ -899,6 +1004,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
899 } 1004 }
900 1005
901 req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); 1006 req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
1007 trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
902 nvme_end_request(req, cqe->status, cqe->result); 1008 nvme_end_request(req, cqe->status, cqe->result);
903} 1009}
904 1010
@@ -919,15 +1025,15 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
919 } 1025 }
920} 1026}
921 1027
922static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, 1028static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
923 u16 *end, int tag) 1029 u16 *end, unsigned int tag)
924{ 1030{
925 bool found = false; 1031 int found = 0;
926 1032
927 *start = nvmeq->cq_head; 1033 *start = nvmeq->cq_head;
928 while (!found && nvme_cqe_pending(nvmeq)) { 1034 while (nvme_cqe_pending(nvmeq)) {
929 if (nvmeq->cqes[nvmeq->cq_head].command_id == tag) 1035 if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
930 found = true; 1036 found++;
931 nvme_update_cq_head(nvmeq); 1037 nvme_update_cq_head(nvmeq);
932 } 1038 }
933 *end = nvmeq->cq_head; 1039 *end = nvmeq->cq_head;
@@ -943,12 +1049,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
943 irqreturn_t ret = IRQ_NONE; 1049 irqreturn_t ret = IRQ_NONE;
944 u16 start, end; 1050 u16 start, end;
945 1051
946 spin_lock(&nvmeq->cq_lock); 1052 /*
1053 * The rmb/wmb pair ensures we see all updates from a previous run of
1054 * the irq handler, even if that was on another CPU.
1055 */
1056 rmb();
947 if (nvmeq->cq_head != nvmeq->last_cq_head) 1057 if (nvmeq->cq_head != nvmeq->last_cq_head)
948 ret = IRQ_HANDLED; 1058 ret = IRQ_HANDLED;
949 nvme_process_cq(nvmeq, &start, &end, -1); 1059 nvme_process_cq(nvmeq, &start, &end, -1);
950 nvmeq->last_cq_head = nvmeq->cq_head; 1060 nvmeq->last_cq_head = nvmeq->cq_head;
951 spin_unlock(&nvmeq->cq_lock); 1061 wmb();
952 1062
953 if (start != end) { 1063 if (start != end) {
954 nvme_complete_cqes(nvmeq, start, end); 1064 nvme_complete_cqes(nvmeq, start, end);
@@ -966,27 +1076,50 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
966 return IRQ_NONE; 1076 return IRQ_NONE;
967} 1077}
968 1078
969static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) 1079/*
 1080 * Poll for completions on any queue, including those not dedicated to polling.
1081 * Can be called from any context.
1082 */
1083static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
970{ 1084{
1085 struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
971 u16 start, end; 1086 u16 start, end;
972 bool found; 1087 int found;
973 1088
974 if (!nvme_cqe_pending(nvmeq)) 1089 /*
975 return 0; 1090 * For a poll queue we need to protect against the polling thread
976 1091 * using the CQ lock. For normal interrupt driven threads we have
977 spin_lock_irq(&nvmeq->cq_lock); 1092 * to disable the interrupt to avoid racing with it.
978 found = nvme_process_cq(nvmeq, &start, &end, tag); 1093 */
979 spin_unlock_irq(&nvmeq->cq_lock); 1094 if (nvmeq->cq_vector == -1) {
1095 spin_lock(&nvmeq->cq_poll_lock);
1096 found = nvme_process_cq(nvmeq, &start, &end, tag);
1097 spin_unlock(&nvmeq->cq_poll_lock);
1098 } else {
1099 disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
1100 found = nvme_process_cq(nvmeq, &start, &end, tag);
1101 enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
1102 }
980 1103
981 nvme_complete_cqes(nvmeq, start, end); 1104 nvme_complete_cqes(nvmeq, start, end);
982 return found; 1105 return found;
983} 1106}
984 1107
985static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) 1108static int nvme_poll(struct blk_mq_hw_ctx *hctx)
986{ 1109{
987 struct nvme_queue *nvmeq = hctx->driver_data; 1110 struct nvme_queue *nvmeq = hctx->driver_data;
1111 u16 start, end;
1112 bool found;
1113
1114 if (!nvme_cqe_pending(nvmeq))
1115 return 0;
1116
1117 spin_lock(&nvmeq->cq_poll_lock);
1118 found = nvme_process_cq(nvmeq, &start, &end, -1);
1119 spin_unlock(&nvmeq->cq_poll_lock);
988 1120
989 return __nvme_poll(nvmeq, tag); 1121 nvme_complete_cqes(nvmeq, start, end);
1122 return found;
990} 1123}
991 1124
992static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) 1125static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
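
nvme_process_cq() above walks the completion ring and counts matching entries; the "is there a new entry" test it relies on (nvme_cqe_pending(), not part of this hunk) is the usual NVMe phase-bit scheme. A minimal standalone model of that scheme, with invented types, a tiny ring and a fake "device" posting completions; it is a sketch of the idea, not driver code:

#include <stdio.h>

#define CQ_DEPTH 4

struct fake_cqe { unsigned short command_id; unsigned char phase; };

struct fake_cq {
	struct fake_cqe cqes[CQ_DEPTH];
	unsigned short head;
	unsigned char  phase;                /* phase value the host expects next */
};

static int cqe_pending(const struct fake_cq *cq)
{
	return cq->cqes[cq->head].phase == cq->phase;
}

/* returns how many completions were reaped in this poll pass */
static int process_cq(struct fake_cq *cq)
{
	int found = 0;

	while (cqe_pending(cq)) {
		printf("completed command_id %u\n", cq->cqes[cq->head].command_id);
		found++;
		if (++cq->head == CQ_DEPTH) { /* wrap: expect the opposite phase */
			cq->head = 0;
			cq->phase ^= 1;
		}
	}
	return found;
}

/* the "device" posting a completion with its current phase value */
static void post_cqe(struct fake_cq *cq, unsigned short tail,
		     unsigned char phase, unsigned short id)
{
	cq->cqes[tail].command_id = id;
	cq->cqes[tail].phase = phase;
}

int main(void)
{
	struct fake_cq cq = { .head = 0, .phase = 1 };   /* ring starts all zero */

	post_cqe(&cq, 0, 1, 7);
	post_cqe(&cq, 1, 1, 9);
	printf("reaped %d\n", process_cq(&cq));          /* reaps 2 */
	printf("reaped %d\n", process_cq(&cq));          /* nothing new: reaps 0 */
	return 0;
}
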
@@ -998,7 +1131,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
998 memset(&c, 0, sizeof(c)); 1131 memset(&c, 0, sizeof(c));
999 c.common.opcode = nvme_admin_async_event; 1132 c.common.opcode = nvme_admin_async_event;
1000 c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; 1133 c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1001 nvme_submit_cmd(nvmeq, &c); 1134 nvme_submit_cmd(nvmeq, &c, true);
1002} 1135}
1003 1136
1004static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) 1137static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1016,7 +1149,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1016 struct nvme_queue *nvmeq, s16 vector) 1149 struct nvme_queue *nvmeq, s16 vector)
1017{ 1150{
1018 struct nvme_command c; 1151 struct nvme_command c;
1019 int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; 1152 int flags = NVME_QUEUE_PHYS_CONTIG;
1153
1154 if (vector != -1)
1155 flags |= NVME_CQ_IRQ_ENABLED;
1020 1156
1021 /* 1157 /*
1022 * Note: we (ab)use the fact that the prp fields survive if no data 1158 * Note: we (ab)use the fact that the prp fields survive if no data
@@ -1028,7 +1164,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1028 c.create_cq.cqid = cpu_to_le16(qid); 1164 c.create_cq.cqid = cpu_to_le16(qid);
1029 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1165 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1030 c.create_cq.cq_flags = cpu_to_le16(flags); 1166 c.create_cq.cq_flags = cpu_to_le16(flags);
1031 c.create_cq.irq_vector = cpu_to_le16(vector); 1167 if (vector != -1)
1168 c.create_cq.irq_vector = cpu_to_le16(vector);
1169 else
1170 c.create_cq.irq_vector = 0;
1032 1171
1033 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1172 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1034} 1173}
@@ -1157,7 +1296,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1157 /* 1296 /*
1158 * Did we miss an interrupt? 1297 * Did we miss an interrupt?
1159 */ 1298 */
1160 if (__nvme_poll(nvmeq, req->tag)) { 1299 if (nvme_poll_irqdisable(nvmeq, req->tag)) {
1161 dev_warn(dev->ctrl.device, 1300 dev_warn(dev->ctrl.device,
1162 "I/O %d QID %d timeout, completion polled\n", 1301 "I/O %d QID %d timeout, completion polled\n",
1163 req->tag, nvmeq->qid); 1302 req->tag, nvmeq->qid);
@@ -1237,17 +1376,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
1237{ 1376{
1238 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1377 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1239 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1378 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1379 if (!nvmeq->sq_cmds)
1380 return;
1240 1381
1241 if (nvmeq->sq_cmds) { 1382 if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
1242 if (nvmeq->sq_cmds_is_io) 1383 pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
1243 pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), 1384 nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
1244 nvmeq->sq_cmds, 1385 } else {
1245 SQ_SIZE(nvmeq->q_depth)); 1386 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
1246 else 1387 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1247 dma_free_coherent(nvmeq->q_dmadev,
1248 SQ_SIZE(nvmeq->q_depth),
1249 nvmeq->sq_cmds,
1250 nvmeq->sq_dma_addr);
1251 } 1388 }
1252} 1389}
1253 1390
@@ -1267,47 +1404,32 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1267 */ 1404 */
1268static int nvme_suspend_queue(struct nvme_queue *nvmeq) 1405static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1269{ 1406{
1270 int vector; 1407 if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
1271
1272 spin_lock_irq(&nvmeq->cq_lock);
1273 if (nvmeq->cq_vector == -1) {
1274 spin_unlock_irq(&nvmeq->cq_lock);
1275 return 1; 1408 return 1;
1276 }
1277 vector = nvmeq->cq_vector;
1278 nvmeq->dev->online_queues--;
1279 nvmeq->cq_vector = -1;
1280 spin_unlock_irq(&nvmeq->cq_lock);
1281 1409
1282 /* 1410 /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
1283 * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without
1284 * having to grab the lock.
1285 */
1286 mb(); 1411 mb();
1287 1412
1413 nvmeq->dev->online_queues--;
1288 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) 1414 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1289 blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); 1415 blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
1290 1416 if (nvmeq->cq_vector == -1)
1291 pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); 1417 return 0;
1292 1418 pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
1419 nvmeq->cq_vector = -1;
1293 return 0; 1420 return 0;
1294} 1421}
1295 1422
1296static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) 1423static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
1297{ 1424{
1298 struct nvme_queue *nvmeq = &dev->queues[0]; 1425 struct nvme_queue *nvmeq = &dev->queues[0];
1299 u16 start, end;
1300 1426
1301 if (shutdown) 1427 if (shutdown)
1302 nvme_shutdown_ctrl(&dev->ctrl); 1428 nvme_shutdown_ctrl(&dev->ctrl);
1303 else 1429 else
1304 nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); 1430 nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
1305 1431
1306 spin_lock_irq(&nvmeq->cq_lock); 1432 nvme_poll_irqdisable(nvmeq, -1);
1307 nvme_process_cq(nvmeq, &start, &end, -1);
1308 spin_unlock_irq(&nvmeq->cq_lock);
1309
1310 nvme_complete_cqes(nvmeq, start, end);
1311} 1433}
1312 1434
1313static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, 1435static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1343,15 +1465,14 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1343 nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); 1465 nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
1344 nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, 1466 nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
1345 nvmeq->sq_cmds); 1467 nvmeq->sq_cmds);
1346 nvmeq->sq_cmds_is_io = true; 1468 if (nvmeq->sq_dma_addr) {
1347 } 1469 set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
1348 1470 return 0;
1349 if (!nvmeq->sq_cmds) { 1471 }
1350 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
1351 &nvmeq->sq_dma_addr, GFP_KERNEL);
1352 nvmeq->sq_cmds_is_io = false;
1353 } 1472 }
1354 1473
1474 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
1475 &nvmeq->sq_dma_addr, GFP_KERNEL);
1355 if (!nvmeq->sq_cmds) 1476 if (!nvmeq->sq_cmds)
1356 return -ENOMEM; 1477 return -ENOMEM;
1357 return 0; 1478 return 0;
@@ -1375,7 +1496,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
1375 nvmeq->q_dmadev = dev->dev; 1496 nvmeq->q_dmadev = dev->dev;
1376 nvmeq->dev = dev; 1497 nvmeq->dev = dev;
1377 spin_lock_init(&nvmeq->sq_lock); 1498 spin_lock_init(&nvmeq->sq_lock);
1378 spin_lock_init(&nvmeq->cq_lock); 1499 spin_lock_init(&nvmeq->cq_poll_lock);
1379 nvmeq->cq_head = 0; 1500 nvmeq->cq_head = 0;
1380 nvmeq->cq_phase = 1; 1501 nvmeq->cq_phase = 1;
1381 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1502 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -1411,28 +1532,34 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
1411{ 1532{
1412 struct nvme_dev *dev = nvmeq->dev; 1533 struct nvme_dev *dev = nvmeq->dev;
1413 1534
1414 spin_lock_irq(&nvmeq->cq_lock);
1415 nvmeq->sq_tail = 0; 1535 nvmeq->sq_tail = 0;
1536 nvmeq->last_sq_tail = 0;
1416 nvmeq->cq_head = 0; 1537 nvmeq->cq_head = 0;
1417 nvmeq->cq_phase = 1; 1538 nvmeq->cq_phase = 1;
1418 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1539 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1419 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); 1540 memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
1420 nvme_dbbuf_init(dev, nvmeq, qid); 1541 nvme_dbbuf_init(dev, nvmeq, qid);
1421 dev->online_queues++; 1542 dev->online_queues++;
1422 spin_unlock_irq(&nvmeq->cq_lock); 1543 wmb(); /* ensure the first interrupt sees the initialization */
1423} 1544}
1424 1545
1425static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) 1546static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1426{ 1547{
1427 struct nvme_dev *dev = nvmeq->dev; 1548 struct nvme_dev *dev = nvmeq->dev;
1428 int result; 1549 int result;
1429 s16 vector; 1550 s16 vector;
1430 1551
1552 clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
1553
1431 /* 1554 /*
1432 * A queue's vector matches the queue identifier unless the controller 1555 * A queue's vector matches the queue identifier unless the controller
1433 * has only one vector available. 1556 * has only one vector available.
1434 */ 1557 */
1435 vector = dev->num_vecs == 1 ? 0 : qid; 1558 if (!polled)
1559 vector = dev->num_vecs == 1 ? 0 : qid;
1560 else
1561 vector = -1;
1562
1436 result = adapter_alloc_cq(dev, qid, nvmeq, vector); 1563 result = adapter_alloc_cq(dev, qid, nvmeq, vector);
1437 if (result) 1564 if (result)
1438 return result; 1565 return result;
@@ -1443,17 +1570,16 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
1443 else if (result) 1570 else if (result)
1444 goto release_cq; 1571 goto release_cq;
1445 1572
1446 /*
1447 * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will
1448 * invoke free_irq for it and cause a 'Trying to free already-free IRQ
1449 * xxx' warning if the create CQ/SQ command times out.
1450 */
1451 nvmeq->cq_vector = vector; 1573 nvmeq->cq_vector = vector;
1452 nvme_init_queue(nvmeq, qid); 1574 nvme_init_queue(nvmeq, qid);
1453 result = queue_request_irq(nvmeq);
1454 if (result < 0)
1455 goto release_sq;
1456 1575
1576 if (vector != -1) {
1577 result = queue_request_irq(nvmeq);
1578 if (result < 0)
1579 goto release_sq;
1580 }
1581
1582 set_bit(NVMEQ_ENABLED, &nvmeq->flags);
1457 return result; 1583 return result;
1458 1584
1459release_sq: 1585release_sq:
@@ -1477,6 +1603,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
1477static const struct blk_mq_ops nvme_mq_ops = { 1603static const struct blk_mq_ops nvme_mq_ops = {
1478 .queue_rq = nvme_queue_rq, 1604 .queue_rq = nvme_queue_rq,
1479 .complete = nvme_pci_complete_rq, 1605 .complete = nvme_pci_complete_rq,
1606 .commit_rqs = nvme_commit_rqs,
1480 .init_hctx = nvme_init_hctx, 1607 .init_hctx = nvme_init_hctx,
1481 .init_request = nvme_init_request, 1608 .init_request = nvme_init_request,
1482 .map_queues = nvme_pci_map_queues, 1609 .map_queues = nvme_pci_map_queues,
@@ -1602,12 +1729,13 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
1602 return result; 1729 return result;
1603 } 1730 }
1604 1731
1732 set_bit(NVMEQ_ENABLED, &nvmeq->flags);
1605 return result; 1733 return result;
1606} 1734}
1607 1735
1608static int nvme_create_io_queues(struct nvme_dev *dev) 1736static int nvme_create_io_queues(struct nvme_dev *dev)
1609{ 1737{
1610 unsigned i, max; 1738 unsigned i, max, rw_queues;
1611 int ret = 0; 1739 int ret = 0;
1612 1740
1613 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { 1741 for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1618,8 +1746,17 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
1618 } 1746 }
1619 1747
1620 max = min(dev->max_qid, dev->ctrl.queue_count - 1); 1748 max = min(dev->max_qid, dev->ctrl.queue_count - 1);
1749 if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
1750 rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
1751 dev->io_queues[HCTX_TYPE_READ];
1752 } else {
1753 rw_queues = max;
1754 }
1755
1621 for (i = dev->online_queues; i <= max; i++) { 1756 for (i = dev->online_queues; i <= max; i++) {
1622 ret = nvme_create_queue(&dev->queues[i], i); 1757 bool polled = i > rw_queues;
1758
1759 ret = nvme_create_queue(&dev->queues[i], i, polled);
1623 if (ret) 1760 if (ret)
1624 break; 1761 break;
1625 } 1762 }
@@ -1891,6 +2028,110 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
1891 return ret; 2028 return ret;
1892} 2029}
1893 2030
2031static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
2032{
2033 unsigned int this_w_queues = write_queues;
2034
2035 /*
2036 * Setup read/write queue split
2037 */
2038 if (irq_queues == 1) {
2039 dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
2040 dev->io_queues[HCTX_TYPE_READ] = 0;
2041 return;
2042 }
2043
2044 /*
2045 * If 'write_queues' is set, ensure it leaves room for at least
2046 * one read queue
2047 */
2048 if (this_w_queues >= irq_queues)
2049 this_w_queues = irq_queues - 1;
2050
2051 /*
2052 * If 'write_queues' is set to zero, reads and writes will share
2053 * a queue set.
2054 */
2055 if (!this_w_queues) {
2056 dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues;
2057 dev->io_queues[HCTX_TYPE_READ] = 0;
2058 } else {
2059 dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
2060 dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues;
2061 }
2062}
2063
2064static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
2065{
2066 struct pci_dev *pdev = to_pci_dev(dev->dev);
2067 int irq_sets[2];
2068 struct irq_affinity affd = {
2069 .pre_vectors = 1,
2070 .nr_sets = ARRAY_SIZE(irq_sets),
2071 .sets = irq_sets,
2072 };
2073 int result = 0;
2074 unsigned int irq_queues, this_p_queues;
2075
2076 /*
2077 * Poll queues don't need interrupts, but we need at least one IO
2078 * queue left over for non-polled IO.
2079 */
2080 this_p_queues = poll_queues;
2081 if (this_p_queues >= nr_io_queues) {
2082 this_p_queues = nr_io_queues - 1;
2083 irq_queues = 1;
2084 } else {
2085 irq_queues = nr_io_queues - this_p_queues;
2086 }
2087 dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
2088
2089 /*
2090 * For irq sets, we have to ask for minvec == maxvec. This passes
2091 * any reduction back to us, so we can adjust our queue counts and
2092 * IRQ vector needs.
2093 */
2094 do {
2095 nvme_calc_io_queues(dev, irq_queues);
2096 irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
2097 irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
2098 if (!irq_sets[1])
2099 affd.nr_sets = 1;
2100
2101 /*
2102 * If we got a failure and we're down to asking for just
2103 * 1 + 1 queues, just ask for a single vector. We'll share
2104 * that between the single IO queue and the admin queue.
2105 */
2106 if (result >= 0 && irq_queues > 1)
2107 irq_queues = irq_sets[0] + irq_sets[1] + 1;
2108
2109 result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
2110 irq_queues,
2111 PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
2112
2113 /*
2114 * Need to reduce our vec counts. If we get ENOSPC, the
 2115 * platform should support multiple vecs, we just need
2116 * to decrease our ask. If we get EINVAL, the platform
2117 * likely does not. Back down to ask for just one vector.
2118 */
2119 if (result == -ENOSPC) {
2120 irq_queues--;
2121 if (!irq_queues)
2122 return result;
2123 continue;
2124 } else if (result == -EINVAL) {
2125 irq_queues = 1;
2126 continue;
2127 } else if (result <= 0)
2128 return -EIO;
2129 break;
2130 } while (1);
2131
2132 return result;
2133}
2134
1894static int nvme_setup_io_queues(struct nvme_dev *dev) 2135static int nvme_setup_io_queues(struct nvme_dev *dev)
1895{ 2136{
1896 struct nvme_queue *adminq = &dev->queues[0]; 2137 struct nvme_queue *adminq = &dev->queues[0];
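
nvme_calc_io_queues() and nvme_setup_irqs() above carve the IO-queue budget into default (write), read and poll sets driven by the write_queues and poll_queues module parameters. The sketch below restates only that arithmetic in plain C; it deliberately ignores the pci_alloc_irq_vectors_affinity() retry loop, and the struct and field names are invented:

#include <stdio.h>

struct queue_split { unsigned deflt, read, poll; };

static struct queue_split calc_split(unsigned nr_io_queues,
				     unsigned write_queues,
				     unsigned poll_queues)
{
	struct queue_split s = { 0, 0, 0 };
	unsigned irq_queues;

	/* poll queues take no IRQ vector, but keep at least one IRQ-driven queue */
	s.poll = poll_queues >= nr_io_queues ? nr_io_queues - 1 : poll_queues;
	irq_queues = nr_io_queues - s.poll;

	if (irq_queues == 1 || write_queues == 0) {
		s.deflt = irq_queues;        /* reads and writes share one set */
		return s;
	}
	if (write_queues >= irq_queues)      /* leave room for at least one read queue */
		write_queues = irq_queues - 1;
	s.deflt = write_queues;
	s.read = irq_queues - write_queues;
	return s;
}

int main(void)
{
	struct queue_split s = calc_split(8, 2, 2);

	printf("default=%u read=%u poll=%u\n", s.deflt, s.read, s.poll);
	/* prints: default=2 read=4 poll=2 */
	return 0;
}
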
@@ -1898,17 +2139,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
1898 int result, nr_io_queues; 2139 int result, nr_io_queues;
1899 unsigned long size; 2140 unsigned long size;
1900 2141
1901 struct irq_affinity affd = { 2142 nr_io_queues = max_io_queues();
1902 .pre_vectors = 1
1903 };
1904
1905 nr_io_queues = num_possible_cpus();
1906 result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); 2143 result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
1907 if (result < 0) 2144 if (result < 0)
1908 return result; 2145 return result;
1909 2146
1910 if (nr_io_queues == 0) 2147 if (nr_io_queues == 0)
1911 return 0; 2148 return 0;
2149
2150 clear_bit(NVMEQ_ENABLED, &adminq->flags);
1912 2151
1913 if (dev->cmb_use_sqes) { 2152 if (dev->cmb_use_sqes) {
1914 result = nvme_cmb_qdepth(dev, nr_io_queues, 2153 result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -1937,12 +2176,19 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
1937 * setting up the full range we need. 2176 * setting up the full range we need.
1938 */ 2177 */
1939 pci_free_irq_vectors(pdev); 2178 pci_free_irq_vectors(pdev);
1940 result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1, 2179
1941 PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); 2180 result = nvme_setup_irqs(dev, nr_io_queues);
1942 if (result <= 0) 2181 if (result <= 0)
1943 return -EIO; 2182 return -EIO;
2183
1944 dev->num_vecs = result; 2184 dev->num_vecs = result;
1945 dev->max_qid = max(result - 1, 1); 2185 result = max(result - 1, 1);
2186 dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
2187
2188 dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
2189 dev->io_queues[HCTX_TYPE_DEFAULT],
2190 dev->io_queues[HCTX_TYPE_READ],
2191 dev->io_queues[HCTX_TYPE_POLL]);
1946 2192
1947 /* 2193 /*
1948 * Should investigate if there's a performance win from allocating 2194 * Should investigate if there's a performance win from allocating
@@ -1956,6 +2202,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
1956 adminq->cq_vector = -1; 2202 adminq->cq_vector = -1;
1957 return result; 2203 return result;
1958 } 2204 }
2205 set_bit(NVMEQ_ENABLED, &adminq->flags);
1959 return nvme_create_io_queues(dev); 2206 return nvme_create_io_queues(dev);
1960} 2207}
1961 2208
@@ -1964,23 +2211,15 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
1964 struct nvme_queue *nvmeq = req->end_io_data; 2211 struct nvme_queue *nvmeq = req->end_io_data;
1965 2212
1966 blk_mq_free_request(req); 2213 blk_mq_free_request(req);
1967 complete(&nvmeq->dev->ioq_wait); 2214 complete(&nvmeq->delete_done);
1968} 2215}
1969 2216
1970static void nvme_del_cq_end(struct request *req, blk_status_t error) 2217static void nvme_del_cq_end(struct request *req, blk_status_t error)
1971{ 2218{
1972 struct nvme_queue *nvmeq = req->end_io_data; 2219 struct nvme_queue *nvmeq = req->end_io_data;
1973 u16 start, end;
1974
1975 if (!error) {
1976 unsigned long flags;
1977 2220
1978 spin_lock_irqsave(&nvmeq->cq_lock, flags); 2221 if (error)
1979 nvme_process_cq(nvmeq, &start, &end, -1); 2222 set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
1980 spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
1981
1982 nvme_complete_cqes(nvmeq, start, end);
1983 }
1984 2223
1985 nvme_del_queue_end(req, error); 2224 nvme_del_queue_end(req, error);
1986} 2225}
@@ -2002,37 +2241,44 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
2002 req->timeout = ADMIN_TIMEOUT; 2241 req->timeout = ADMIN_TIMEOUT;
2003 req->end_io_data = nvmeq; 2242 req->end_io_data = nvmeq;
2004 2243
2244 init_completion(&nvmeq->delete_done);
2005 blk_execute_rq_nowait(q, NULL, req, false, 2245 blk_execute_rq_nowait(q, NULL, req, false,
2006 opcode == nvme_admin_delete_cq ? 2246 opcode == nvme_admin_delete_cq ?
2007 nvme_del_cq_end : nvme_del_queue_end); 2247 nvme_del_cq_end : nvme_del_queue_end);
2008 return 0; 2248 return 0;
2009} 2249}
2010 2250
2011static void nvme_disable_io_queues(struct nvme_dev *dev) 2251static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
2012{ 2252{
2013 int pass, queues = dev->online_queues - 1; 2253 int nr_queues = dev->online_queues - 1, sent = 0;
2014 unsigned long timeout; 2254 unsigned long timeout;
2015 u8 opcode = nvme_admin_delete_sq;
2016
2017 for (pass = 0; pass < 2; pass++) {
2018 int sent = 0, i = queues;
2019 2255
2020 reinit_completion(&dev->ioq_wait);
2021 retry: 2256 retry:
2022 timeout = ADMIN_TIMEOUT; 2257 timeout = ADMIN_TIMEOUT;
2023 for (; i > 0; i--, sent++) 2258 while (nr_queues > 0) {
2024 if (nvme_delete_queue(&dev->queues[i], opcode)) 2259 if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
2025 break; 2260 break;
2026 2261 nr_queues--;
2027 while (sent--) { 2262 sent++;
2028 timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
2029 if (timeout == 0)
2030 return;
2031 if (i)
2032 goto retry;
2033 }
2034 opcode = nvme_admin_delete_cq;
2035 } 2263 }
2264 while (sent) {
2265 struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
2266
2267 timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
2268 timeout);
2269 if (timeout == 0)
2270 return false;
2271
2272 /* handle any remaining CQEs */
2273 if (opcode == nvme_admin_delete_cq &&
2274 !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
2275 nvme_poll_irqdisable(nvmeq, -1);
2276
2277 sent--;
2278 if (nr_queues)
2279 goto retry;
2280 }
2281 return true;
2036} 2282}
2037 2283
2038/* 2284/*
@@ -2045,6 +2291,10 @@ static int nvme_dev_add(struct nvme_dev *dev)
2045 if (!dev->ctrl.tagset) { 2291 if (!dev->ctrl.tagset) {
2046 dev->tagset.ops = &nvme_mq_ops; 2292 dev->tagset.ops = &nvme_mq_ops;
2047 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2293 dev->tagset.nr_hw_queues = dev->online_queues - 1;
2294 dev->tagset.nr_maps = 2; /* default + read */
2295 if (dev->io_queues[HCTX_TYPE_POLL])
2296 dev->tagset.nr_maps++;
2297 dev->tagset.nr_maps = HCTX_MAX_TYPES;
2048 dev->tagset.timeout = NVME_IO_TIMEOUT; 2298 dev->tagset.timeout = NVME_IO_TIMEOUT;
2049 dev->tagset.numa_node = dev_to_node(dev->dev); 2299 dev->tagset.numa_node = dev_to_node(dev->dev);
2050 dev->tagset.queue_depth = 2300 dev->tagset.queue_depth =
@@ -2187,7 +2437,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
2187 nvme_stop_queues(&dev->ctrl); 2437 nvme_stop_queues(&dev->ctrl);
2188 2438
2189 if (!dead && dev->ctrl.queue_count > 0) { 2439 if (!dead && dev->ctrl.queue_count > 0) {
2190 nvme_disable_io_queues(dev); 2440 if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
2441 nvme_disable_io_queues(dev, nvme_admin_delete_cq);
2191 nvme_disable_admin_queue(dev, shutdown); 2442 nvme_disable_admin_queue(dev, shutdown);
2192 } 2443 }
2193 for (i = dev->ctrl.queue_count - 1; i >= 0; i--) 2444 for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
@@ -2491,8 +2742,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2491 if (!dev) 2742 if (!dev)
2492 return -ENOMEM; 2743 return -ENOMEM;
2493 2744
2494 dev->queues = kcalloc_node(num_possible_cpus() + 1, 2745 dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
2495 sizeof(struct nvme_queue), GFP_KERNEL, node); 2746 GFP_KERNEL, node);
2496 if (!dev->queues) 2747 if (!dev->queues)
2497 goto free; 2748 goto free;
2498 2749
@@ -2506,7 +2757,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
2506 INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); 2757 INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
2507 INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); 2758 INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
2508 mutex_init(&dev->shutdown_lock); 2759 mutex_init(&dev->shutdown_lock);
2509 init_completion(&dev->ioq_wait);
2510 2760
2511 result = nvme_setup_prp_pools(dev); 2761 result = nvme_setup_prp_pools(dev);
2512 if (result) 2762 if (result)
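
The probe path above now sizes the queue array and the doorbell buffers from max_queue_count() rather than from the possible CPU count alone. A quick standalone check of that arithmetic; the 16 CPUs, 4 write queues and 2 poll queues below are arbitrary example values, not probed from hardware:

#include <stdio.h>

static unsigned max_io_queues(unsigned possible_cpus,
			      unsigned write_queues, unsigned poll_queues)
{
	return possible_cpus + write_queues + poll_queues;
}

static unsigned max_queue_count(unsigned possible_cpus,
				unsigned write_queues, unsigned poll_queues)
{
	/* IO queues + admin queue */
	return 1 + max_io_queues(possible_cpus, write_queues, poll_queues);
}

static unsigned dbbuf_size(unsigned possible_cpus, unsigned write_queues,
			   unsigned poll_queues, unsigned db_stride)
{
	/* 8 bytes per queue: a 4-byte SQ slot plus a 4-byte CQ slot, per stride unit */
	return max_queue_count(possible_cpus, write_queues, poll_queues)
		* 8 * db_stride;
}

int main(void)
{
	unsigned cpus = 16, write_queues = 4, poll_queues = 2;

	printf("io queues: %u, total queues: %u, dbbuf bytes: %u\n",
	       max_io_queues(cpus, write_queues, poll_queues),
	       max_queue_count(cpus, write_queues, poll_queues),
	       dbbuf_size(cpus, write_queues, poll_queues, 1));
	return 0;
}
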
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ab6ec7295bf9..0a2fd2949ad7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -162,6 +162,13 @@ static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
162 return queue - queue->ctrl->queues; 162 return queue - queue->ctrl->queues;
163} 163}
164 164
165static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
166{
167 return nvme_rdma_queue_idx(queue) >
168 queue->ctrl->ctrl.opts->nr_io_queues +
169 queue->ctrl->ctrl.opts->nr_write_queues;
170}
171
165static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue) 172static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
166{ 173{
167 return queue->cmnd_capsule_len - sizeof(struct nvme_command); 174 return queue->cmnd_capsule_len - sizeof(struct nvme_command);
@@ -440,6 +447,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
440 const int send_wr_factor = 3; /* MR, SEND, INV */ 447 const int send_wr_factor = 3; /* MR, SEND, INV */
441 const int cq_factor = send_wr_factor + 1; /* + RECV */ 448 const int cq_factor = send_wr_factor + 1; /* + RECV */
442 int comp_vector, idx = nvme_rdma_queue_idx(queue); 449 int comp_vector, idx = nvme_rdma_queue_idx(queue);
450 enum ib_poll_context poll_ctx;
443 int ret; 451 int ret;
444 452
445 queue->device = nvme_rdma_find_get_device(queue->cm_id); 453 queue->device = nvme_rdma_find_get_device(queue->cm_id);
@@ -456,10 +464,16 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
456 */ 464 */
457 comp_vector = idx == 0 ? idx : idx - 1; 465 comp_vector = idx == 0 ? idx : idx - 1;
458 466
467 /* Polling queues need direct cq polling context */
468 if (nvme_rdma_poll_queue(queue))
469 poll_ctx = IB_POLL_DIRECT;
470 else
471 poll_ctx = IB_POLL_SOFTIRQ;
472
459 /* +1 for ib_stop_cq */ 473 /* +1 for ib_stop_cq */
460 queue->ib_cq = ib_alloc_cq(ibdev, queue, 474 queue->ib_cq = ib_alloc_cq(ibdev, queue,
461 cq_factor * queue->queue_size + 1, 475 cq_factor * queue->queue_size + 1,
462 comp_vector, IB_POLL_SOFTIRQ); 476 comp_vector, poll_ctx);
463 if (IS_ERR(queue->ib_cq)) { 477 if (IS_ERR(queue->ib_cq)) {
464 ret = PTR_ERR(queue->ib_cq); 478 ret = PTR_ERR(queue->ib_cq);
465 goto out_put_dev; 479 goto out_put_dev;
@@ -595,15 +609,17 @@ static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
595 609
596static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) 610static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
597{ 611{
612 struct nvme_rdma_queue *queue = &ctrl->queues[idx];
613 bool poll = nvme_rdma_poll_queue(queue);
598 int ret; 614 int ret;
599 615
600 if (idx) 616 if (idx)
601 ret = nvmf_connect_io_queue(&ctrl->ctrl, idx); 617 ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
602 else 618 else
603 ret = nvmf_connect_admin_queue(&ctrl->ctrl); 619 ret = nvmf_connect_admin_queue(&ctrl->ctrl);
604 620
605 if (!ret) 621 if (!ret)
606 set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags); 622 set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
607 else 623 else
608 dev_info(ctrl->ctrl.device, 624 dev_info(ctrl->ctrl.device,
609 "failed to connect queue: %d ret=%d\n", idx, ret); 625 "failed to connect queue: %d ret=%d\n", idx, ret);
@@ -645,6 +661,9 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
645 nr_io_queues = min_t(unsigned int, nr_io_queues, 661 nr_io_queues = min_t(unsigned int, nr_io_queues,
646 ibdev->num_comp_vectors); 662 ibdev->num_comp_vectors);
647 663
664 nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
665 nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
666
648 ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); 667 ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
649 if (ret) 668 if (ret)
650 return ret; 669 return ret;
@@ -694,7 +713,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
694 set->ops = &nvme_rdma_admin_mq_ops; 713 set->ops = &nvme_rdma_admin_mq_ops;
695 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; 714 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
696 set->reserved_tags = 2; /* connect + keep-alive */ 715 set->reserved_tags = 2; /* connect + keep-alive */
697 set->numa_node = NUMA_NO_NODE; 716 set->numa_node = nctrl->numa_node;
698 set->cmd_size = sizeof(struct nvme_rdma_request) + 717 set->cmd_size = sizeof(struct nvme_rdma_request) +
699 SG_CHUNK_SIZE * sizeof(struct scatterlist); 718 SG_CHUNK_SIZE * sizeof(struct scatterlist);
700 set->driver_data = ctrl; 719 set->driver_data = ctrl;
@@ -707,13 +726,14 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
707 set->ops = &nvme_rdma_mq_ops; 726 set->ops = &nvme_rdma_mq_ops;
708 set->queue_depth = nctrl->sqsize + 1; 727 set->queue_depth = nctrl->sqsize + 1;
709 set->reserved_tags = 1; /* fabric connect */ 728 set->reserved_tags = 1; /* fabric connect */
710 set->numa_node = NUMA_NO_NODE; 729 set->numa_node = nctrl->numa_node;
711 set->flags = BLK_MQ_F_SHOULD_MERGE; 730 set->flags = BLK_MQ_F_SHOULD_MERGE;
712 set->cmd_size = sizeof(struct nvme_rdma_request) + 731 set->cmd_size = sizeof(struct nvme_rdma_request) +
713 SG_CHUNK_SIZE * sizeof(struct scatterlist); 732 SG_CHUNK_SIZE * sizeof(struct scatterlist);
714 set->driver_data = ctrl; 733 set->driver_data = ctrl;
715 set->nr_hw_queues = nctrl->queue_count - 1; 734 set->nr_hw_queues = nctrl->queue_count - 1;
716 set->timeout = NVME_IO_TIMEOUT; 735 set->timeout = NVME_IO_TIMEOUT;
736 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
717 } 737 }
718 738
719 ret = blk_mq_alloc_tag_set(set); 739 ret = blk_mq_alloc_tag_set(set);
@@ -763,6 +783,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
763 return error; 783 return error;
764 784
765 ctrl->device = ctrl->queues[0].device; 785 ctrl->device = ctrl->queues[0].device;
786 ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
766 787
767 ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); 788 ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
768 789
@@ -1411,12 +1432,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1411 WARN_ON_ONCE(ret); 1432 WARN_ON_ONCE(ret);
1412} 1433}
1413 1434
1414static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, 1435static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1415 struct nvme_completion *cqe, struct ib_wc *wc, int tag) 1436 struct nvme_completion *cqe, struct ib_wc *wc)
1416{ 1437{
1417 struct request *rq; 1438 struct request *rq;
1418 struct nvme_rdma_request *req; 1439 struct nvme_rdma_request *req;
1419 int ret = 0;
1420 1440
1421 rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); 1441 rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1422 if (!rq) { 1442 if (!rq) {
@@ -1424,7 +1444,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1424 "tag 0x%x on QP %#x not found\n", 1444 "tag 0x%x on QP %#x not found\n",
1425 cqe->command_id, queue->qp->qp_num); 1445 cqe->command_id, queue->qp->qp_num);
1426 nvme_rdma_error_recovery(queue->ctrl); 1446 nvme_rdma_error_recovery(queue->ctrl);
1427 return ret; 1447 return;
1428 } 1448 }
1429 req = blk_mq_rq_to_pdu(rq); 1449 req = blk_mq_rq_to_pdu(rq);
1430 1450
@@ -1439,6 +1459,8 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1439 nvme_rdma_error_recovery(queue->ctrl); 1459 nvme_rdma_error_recovery(queue->ctrl);
1440 } 1460 }
1441 } else if (req->mr) { 1461 } else if (req->mr) {
1462 int ret;
1463
1442 ret = nvme_rdma_inv_rkey(queue, req); 1464 ret = nvme_rdma_inv_rkey(queue, req);
1443 if (unlikely(ret < 0)) { 1465 if (unlikely(ret < 0)) {
1444 dev_err(queue->ctrl->ctrl.device, 1466 dev_err(queue->ctrl->ctrl.device,
@@ -1447,19 +1469,14 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1447 nvme_rdma_error_recovery(queue->ctrl); 1469 nvme_rdma_error_recovery(queue->ctrl);
1448 } 1470 }
1449 /* the local invalidation completion will end the request */ 1471 /* the local invalidation completion will end the request */
1450 return 0; 1472 return;
1451 } 1473 }
1452 1474
1453 if (refcount_dec_and_test(&req->ref)) { 1475 if (refcount_dec_and_test(&req->ref))
1454 if (rq->tag == tag)
1455 ret = 1;
1456 nvme_end_request(rq, req->status, req->result); 1476 nvme_end_request(rq, req->status, req->result);
1457 }
1458
1459 return ret;
1460} 1477}
1461 1478
1462static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) 1479static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1463{ 1480{
1464 struct nvme_rdma_qe *qe = 1481 struct nvme_rdma_qe *qe =
1465 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe); 1482 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
@@ -1467,11 +1484,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1467 struct ib_device *ibdev = queue->device->dev; 1484 struct ib_device *ibdev = queue->device->dev;
1468 struct nvme_completion *cqe = qe->data; 1485 struct nvme_completion *cqe = qe->data;
1469 const size_t len = sizeof(struct nvme_completion); 1486 const size_t len = sizeof(struct nvme_completion);
1470 int ret = 0;
1471 1487
1472 if (unlikely(wc->status != IB_WC_SUCCESS)) { 1488 if (unlikely(wc->status != IB_WC_SUCCESS)) {
1473 nvme_rdma_wr_error(cq, wc, "RECV"); 1489 nvme_rdma_wr_error(cq, wc, "RECV");
1474 return 0; 1490 return;
1475 } 1491 }
1476 1492
1477 ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); 1493 ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
@@ -1486,16 +1502,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1486 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, 1502 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
1487 &cqe->result); 1503 &cqe->result);
1488 else 1504 else
1489 ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag); 1505 nvme_rdma_process_nvme_rsp(queue, cqe, wc);
1490 ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE); 1506 ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1491 1507
1492 nvme_rdma_post_recv(queue, qe); 1508 nvme_rdma_post_recv(queue, qe);
1493 return ret;
1494}
1495
1496static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1497{
1498 __nvme_rdma_recv_done(cq, wc, -1);
1499} 1509}
1500 1510
1501static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue) 1511static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
@@ -1749,25 +1759,11 @@ err:
1749 return BLK_STS_IOERR; 1759 return BLK_STS_IOERR;
1750} 1760}
1751 1761
1752static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) 1762static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
1753{ 1763{
1754 struct nvme_rdma_queue *queue = hctx->driver_data; 1764 struct nvme_rdma_queue *queue = hctx->driver_data;
1755 struct ib_cq *cq = queue->ib_cq;
1756 struct ib_wc wc;
1757 int found = 0;
1758
1759 while (ib_poll_cq(cq, 1, &wc) > 0) {
1760 struct ib_cqe *cqe = wc.wr_cqe;
1761
1762 if (cqe) {
1763 if (cqe->done == nvme_rdma_recv_done)
1764 found |= __nvme_rdma_recv_done(cq, &wc, tag);
1765 else
1766 cqe->done(cq, &wc);
1767 }
1768 }
1769 1765
1770 return found; 1766 return ib_process_cq_direct(queue->ib_cq, -1);
1771} 1767}
1772 1768
1773static void nvme_rdma_complete_rq(struct request *rq) 1769static void nvme_rdma_complete_rq(struct request *rq)
@@ -1782,7 +1778,36 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
1782{ 1778{
1783 struct nvme_rdma_ctrl *ctrl = set->driver_data; 1779 struct nvme_rdma_ctrl *ctrl = set->driver_data;
1784 1780
1785 return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0); 1781 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
1782 set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
1783 if (ctrl->ctrl.opts->nr_write_queues) {
1784 /* separate read/write queues */
1785 set->map[HCTX_TYPE_DEFAULT].nr_queues =
1786 ctrl->ctrl.opts->nr_write_queues;
1787 set->map[HCTX_TYPE_READ].queue_offset =
1788 ctrl->ctrl.opts->nr_write_queues;
1789 } else {
1790 /* mixed read/write queues */
1791 set->map[HCTX_TYPE_DEFAULT].nr_queues =
1792 ctrl->ctrl.opts->nr_io_queues;
1793 set->map[HCTX_TYPE_READ].queue_offset = 0;
1794 }
1795 blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
1796 ctrl->device->dev, 0);
1797 blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
1798 ctrl->device->dev, 0);
1799
1800 if (ctrl->ctrl.opts->nr_poll_queues) {
1801 set->map[HCTX_TYPE_POLL].nr_queues =
1802 ctrl->ctrl.opts->nr_poll_queues;
1803 set->map[HCTX_TYPE_POLL].queue_offset =
1804 ctrl->ctrl.opts->nr_io_queues;
1805 if (ctrl->ctrl.opts->nr_write_queues)
1806 set->map[HCTX_TYPE_POLL].queue_offset +=
1807 ctrl->ctrl.opts->nr_write_queues;
1808 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
1809 }
1810 return 0;
1786} 1811}
1787 1812
1788static const struct blk_mq_ops nvme_rdma_mq_ops = { 1813static const struct blk_mq_ops nvme_rdma_mq_ops = {
@@ -1791,9 +1816,9 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = {
1791 .init_request = nvme_rdma_init_request, 1816 .init_request = nvme_rdma_init_request,
1792 .exit_request = nvme_rdma_exit_request, 1817 .exit_request = nvme_rdma_exit_request,
1793 .init_hctx = nvme_rdma_init_hctx, 1818 .init_hctx = nvme_rdma_init_hctx,
1794 .poll = nvme_rdma_poll,
1795 .timeout = nvme_rdma_timeout, 1819 .timeout = nvme_rdma_timeout,
1796 .map_queues = nvme_rdma_map_queues, 1820 .map_queues = nvme_rdma_map_queues,
1821 .poll = nvme_rdma_poll,
1797}; 1822};
1798 1823
1799static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { 1824static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
@@ -1938,7 +1963,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1938 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); 1963 INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
1939 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); 1964 INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
1940 1965
1941 ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ 1966 ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
1967 opts->nr_poll_queues + 1;
1942 ctrl->ctrl.sqsize = opts->queue_size - 1; 1968 ctrl->ctrl.sqsize = opts->queue_size - 1;
1943 ctrl->ctrl.kato = opts->kato; 1969 ctrl->ctrl.kato = opts->kato;
1944 1970
@@ -1989,7 +2015,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
1989 .module = THIS_MODULE, 2015 .module = THIS_MODULE,
1990 .required_opts = NVMF_OPT_TRADDR, 2016 .required_opts = NVMF_OPT_TRADDR,
1991 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | 2017 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
1992 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO, 2018 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2019 NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES,
1993 .create_ctrl = nvme_rdma_create_ctrl, 2020 .create_ctrl = nvme_rdma_create_ctrl,
1994}; 2021};
1995 2022
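
The reworked nvme_rdma_map_queues() above splits the tag set's hardware queues into a default (write) range, a read range, and an optional poll range, with each map's queue_offset marking where its range begins. As a rough standalone illustration of that offset arithmetic (the queue counts below are invented for the example, not taken from the patch):

/* qmap.c - illustrative sketch of the DEFAULT/READ/POLL layout above.
 * Build with:  cc -o qmap qmap.c
 */
#include <stdio.h>

int main(void)
{
	/* invented example values for nr_io_queues/nr_write_queues/nr_poll_queues */
	unsigned int nr_io = 4, nr_write = 2, nr_poll = 2;
	unsigned int def_nr, read_nr = nr_io, poll_nr = nr_poll;
	unsigned int def_off = 0, read_off, poll_off;

	if (nr_write) {
		/* separate read/write queues */
		def_nr = nr_write;
		read_off = nr_write;
	} else {
		/* mixed read/write queues */
		def_nr = nr_io;
		read_off = 0;
	}
	/* poll queues sit after the io queues (and the write queues, if any) */
	poll_off = nr_io + (nr_write ? nr_write : 0);

	printf("DEFAULT: %u queues at offset %u\n", def_nr, def_off);
	printf("READ:    %u queues at offset %u\n", read_nr, read_off);
	printf("POLL:    %u queues at offset %u\n", poll_nr, poll_off);
	return 0;
}
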
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
new file mode 100644
index 000000000000..de174912445e
--- /dev/null
+++ b/drivers/nvme/host/tcp.c
@@ -0,0 +1,2278 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * NVMe over Fabrics TCP host.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/nvme-tcp.h>
12#include <net/sock.h>
13#include <net/tcp.h>
14#include <linux/blk-mq.h>
15#include <crypto/hash.h>
16
17#include "nvme.h"
18#include "fabrics.h"
19
20struct nvme_tcp_queue;
21
22enum nvme_tcp_send_state {
23 NVME_TCP_SEND_CMD_PDU = 0,
24 NVME_TCP_SEND_H2C_PDU,
25 NVME_TCP_SEND_DATA,
26 NVME_TCP_SEND_DDGST,
27};
28
29struct nvme_tcp_request {
30 struct nvme_request req;
31 void *pdu;
32 struct nvme_tcp_queue *queue;
33 u32 data_len;
34 u32 pdu_len;
35 u32 pdu_sent;
36 u16 ttag;
37 struct list_head entry;
38 __le32 ddgst;
39
40 struct bio *curr_bio;
41 struct iov_iter iter;
42
43 /* send state */
44 size_t offset;
45 size_t data_sent;
46 enum nvme_tcp_send_state state;
47};
48
49enum nvme_tcp_queue_flags {
50 NVME_TCP_Q_ALLOCATED = 0,
51 NVME_TCP_Q_LIVE = 1,
52};
53
54enum nvme_tcp_recv_state {
55 NVME_TCP_RECV_PDU = 0,
56 NVME_TCP_RECV_DATA,
57 NVME_TCP_RECV_DDGST,
58};
59
60struct nvme_tcp_ctrl;
61struct nvme_tcp_queue {
62 struct socket *sock;
63 struct work_struct io_work;
64 int io_cpu;
65
66 spinlock_t lock;
67 struct list_head send_list;
68
69 /* recv state */
70 void *pdu;
71 int pdu_remaining;
72 int pdu_offset;
73 size_t data_remaining;
74 size_t ddgst_remaining;
75
76 /* send state */
77 struct nvme_tcp_request *request;
78
79 int queue_size;
80 size_t cmnd_capsule_len;
81 struct nvme_tcp_ctrl *ctrl;
82 unsigned long flags;
83 bool rd_enabled;
84
85 bool hdr_digest;
86 bool data_digest;
87 struct ahash_request *rcv_hash;
88 struct ahash_request *snd_hash;
89 __le32 exp_ddgst;
90 __le32 recv_ddgst;
91
92 struct page_frag_cache pf_cache;
93
94 void (*state_change)(struct sock *);
95 void (*data_ready)(struct sock *);
96 void (*write_space)(struct sock *);
97};
98
99struct nvme_tcp_ctrl {
100 /* read only in the hot path */
101 struct nvme_tcp_queue *queues;
102 struct blk_mq_tag_set tag_set;
103
104 /* other member variables */
105 struct list_head list;
106 struct blk_mq_tag_set admin_tag_set;
107 struct sockaddr_storage addr;
108 struct sockaddr_storage src_addr;
109 struct nvme_ctrl ctrl;
110
111 struct work_struct err_work;
112 struct delayed_work connect_work;
113 struct nvme_tcp_request async_req;
114};
115
116static LIST_HEAD(nvme_tcp_ctrl_list);
117static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
118static struct workqueue_struct *nvme_tcp_wq;
119static struct blk_mq_ops nvme_tcp_mq_ops;
120static struct blk_mq_ops nvme_tcp_admin_mq_ops;
121
122static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
123{
124 return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
125}
126
127static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
128{
129 return queue - queue->ctrl->queues;
130}
131
132static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
133{
134 u32 queue_idx = nvme_tcp_queue_id(queue);
135
136 if (queue_idx == 0)
137 return queue->ctrl->admin_tag_set.tags[queue_idx];
138 return queue->ctrl->tag_set.tags[queue_idx - 1];
139}
140
141static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
142{
143 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
144}
145
146static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
147{
148 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
149}
150
151static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
152{
153 return queue->cmnd_capsule_len - sizeof(struct nvme_command);
154}
155
156static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
157{
158 return req == &req->queue->ctrl->async_req;
159}
160
161static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
162{
163 struct request *rq;
164 unsigned int bytes;
165
166 if (unlikely(nvme_tcp_async_req(req)))
167 return false; /* async events don't have a request */
168
169 rq = blk_mq_rq_from_pdu(req);
170 bytes = blk_rq_payload_bytes(rq);
171
172 return rq_data_dir(rq) == WRITE && bytes &&
173 bytes <= nvme_tcp_inline_data_size(req->queue);
174}
175
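/*
 * Inline-data rule in brief: a write is sent inline with its command PDU
 * only when the payload fits in the capsule's in-capsule data area, i.e.
 * cmnd_capsule_len minus the 64-byte nvme_command.  A standalone userspace
 * sketch of that threshold (illustrative values only, not part of the patch):
 */
#include <stdio.h>

int main(void)
{
	unsigned int ioccsz = 260;                  /* hypothetical I/O command capsule size, 16-byte units */
	size_t cmnd_capsule_len = ioccsz * 16;      /* as set up in nvme_tcp_alloc_queue() for I/O queues   */
	size_t inline_size = cmnd_capsule_len - 64; /* 64 == sizeof(struct nvme_command)                    */

	printf("inline data threshold: %zu bytes\n", inline_size);
	return 0;
}
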
176static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
177{
178 return req->iter.bvec->bv_page;
179}
180
181static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
182{
183 return req->iter.bvec->bv_offset + req->iter.iov_offset;
184}
185
186static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
187{
188 return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
189 req->pdu_len - req->pdu_sent);
190}
191
192static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
193{
194 return req->iter.iov_offset;
195}
196
197static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
198{
199 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
200 req->pdu_len - req->pdu_sent : 0;
201}
202
203static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
204 int len)
205{
206 return nvme_tcp_pdu_data_left(req) <= len;
207}
208
209static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
210 unsigned int dir)
211{
212 struct request *rq = blk_mq_rq_from_pdu(req);
213 struct bio_vec *vec;
214 unsigned int size;
215 int nsegs;
216 size_t offset;
217
218 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
219 vec = &rq->special_vec;
220 nsegs = 1;
221 size = blk_rq_payload_bytes(rq);
222 offset = 0;
223 } else {
224 struct bio *bio = req->curr_bio;
225
226 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
227 nsegs = bio_segments(bio);
228 size = bio->bi_iter.bi_size;
229 offset = bio->bi_iter.bi_bvec_done;
230 }
231
232 iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
233 req->iter.iov_offset = offset;
234}
235
236static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
237 int len)
238{
239 req->data_sent += len;
240 req->pdu_sent += len;
241 iov_iter_advance(&req->iter, len);
242 if (!iov_iter_count(&req->iter) &&
243 req->data_sent < req->data_len) {
244 req->curr_bio = req->curr_bio->bi_next;
245 nvme_tcp_init_iter(req, WRITE);
246 }
247}
248
249static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
250{
251 struct nvme_tcp_queue *queue = req->queue;
252
253 spin_lock(&queue->lock);
254 list_add_tail(&req->entry, &queue->send_list);
255 spin_unlock(&queue->lock);
256
257 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
258}
259
260static inline struct nvme_tcp_request *
261nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
262{
263 struct nvme_tcp_request *req;
264
265 spin_lock(&queue->lock);
266 req = list_first_entry_or_null(&queue->send_list,
267 struct nvme_tcp_request, entry);
268 if (req)
269 list_del(&req->entry);
270 spin_unlock(&queue->lock);
271
272 return req;
273}
274
275static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
276 __le32 *dgst)
277{
278 ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
279 crypto_ahash_final(hash);
280}
281
282static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
283 struct page *page, off_t off, size_t len)
284{
285 struct scatterlist sg;
286
287 sg_init_marker(&sg, 1);
288 sg_set_page(&sg, page, len, off);
289 ahash_request_set_crypt(hash, &sg, NULL, len);
290 crypto_ahash_update(hash);
291}
292
293static inline void nvme_tcp_hdgst(struct ahash_request *hash,
294 void *pdu, size_t len)
295{
296 struct scatterlist sg;
297
298 sg_init_one(&sg, pdu, len);
299 ahash_request_set_crypt(hash, &sg, pdu + len, len);
300 crypto_ahash_digest(hash);
301}
302
303static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
304 void *pdu, size_t pdu_len)
305{
306 struct nvme_tcp_hdr *hdr = pdu;
307 __le32 recv_digest;
308 __le32 exp_digest;
309
310 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
311 dev_err(queue->ctrl->ctrl.device,
312 "queue %d: header digest flag is cleared\n",
313 nvme_tcp_queue_id(queue));
314 return -EPROTO;
315 }
316
317 recv_digest = *(__le32 *)(pdu + hdr->hlen);
318 nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
319 exp_digest = *(__le32 *)(pdu + hdr->hlen);
320 if (recv_digest != exp_digest) {
321 dev_err(queue->ctrl->ctrl.device,
322 "header digest error: recv %#x expected %#x\n",
323 le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
324 return -EIO;
325 }
326
327 return 0;
328}
329
330static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
331{
332 struct nvme_tcp_hdr *hdr = pdu;
333 u8 digest_len = nvme_tcp_hdgst_len(queue);
334 u32 len;
335
336 len = le32_to_cpu(hdr->plen) - hdr->hlen -
337 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
338
339 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
340 dev_err(queue->ctrl->ctrl.device,
341 "queue %d: data digest flag is cleared\n",
342 nvme_tcp_queue_id(queue));
343 return -EPROTO;
344 }
345 crypto_ahash_init(queue->rcv_hash);
346
347 return 0;
348}
349
350static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
351 struct request *rq, unsigned int hctx_idx)
352{
353 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
354
355 page_frag_free(req->pdu);
356}
357
358static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
359 struct request *rq, unsigned int hctx_idx,
360 unsigned int numa_node)
361{
362 struct nvme_tcp_ctrl *ctrl = set->driver_data;
363 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
364 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
365 struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
366 u8 hdgst = nvme_tcp_hdgst_len(queue);
367
368 req->pdu = page_frag_alloc(&queue->pf_cache,
369 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
370 GFP_KERNEL | __GFP_ZERO);
371 if (!req->pdu)
372 return -ENOMEM;
373
374 req->queue = queue;
375 nvme_req(rq)->ctrl = &ctrl->ctrl;
376
377 return 0;
378}
379
380static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
381 unsigned int hctx_idx)
382{
383 struct nvme_tcp_ctrl *ctrl = data;
384 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
385
386 hctx->driver_data = queue;
387 return 0;
388}
389
390static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
391 unsigned int hctx_idx)
392{
393 struct nvme_tcp_ctrl *ctrl = data;
394 struct nvme_tcp_queue *queue = &ctrl->queues[0];
395
396 hctx->driver_data = queue;
397 return 0;
398}
399
400static enum nvme_tcp_recv_state
401nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
402{
403 return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
404 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
405 NVME_TCP_RECV_DATA;
406}
407
408static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
409{
410 queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
411 nvme_tcp_hdgst_len(queue);
412 queue->pdu_offset = 0;
413 queue->data_remaining = -1;
414 queue->ddgst_remaining = 0;
415}
416
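/*
 * Receive-side overview: nvme_tcp_recv_state() derives the current state
 * from the residual counters above -- a non-zero pdu_remaining means a PDU
 * header is still being accumulated, a non-zero ddgst_remaining means the
 * trailing data digest is being drained, and otherwise incoming bytes are
 * C2H data copied into the request's bio iterator.
 * nvme_tcp_init_recv_ctx() re-arms the counters to expect the next
 * response PDU header (plus the header digest when negotiated).
 */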
417static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
418{
419 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
420 return;
421
422 queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
423}
424
425static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
426 struct nvme_completion *cqe)
427{
428 struct request *rq;
429
430 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
431 if (!rq) {
432 dev_err(queue->ctrl->ctrl.device,
433 "queue %d tag 0x%x not found\n",
434 nvme_tcp_queue_id(queue), cqe->command_id);
435 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
436 return -EINVAL;
437 }
438
439 nvme_end_request(rq, cqe->status, cqe->result);
440
441 return 0;
442}
443
444static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
445 struct nvme_tcp_data_pdu *pdu)
446{
447 struct request *rq;
448
449 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
450 if (!rq) {
451 dev_err(queue->ctrl->ctrl.device,
452 "queue %d tag %#x not found\n",
453 nvme_tcp_queue_id(queue), pdu->command_id);
454 return -ENOENT;
455 }
456
457 if (!blk_rq_payload_bytes(rq)) {
458 dev_err(queue->ctrl->ctrl.device,
459 "queue %d tag %#x unexpected data\n",
460 nvme_tcp_queue_id(queue), rq->tag);
461 return -EIO;
462 }
463
464 queue->data_remaining = le32_to_cpu(pdu->data_length);
465
466 return 0;
467
468}
469
470static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
471 struct nvme_tcp_rsp_pdu *pdu)
472{
473 struct nvme_completion *cqe = &pdu->cqe;
474 int ret = 0;
475
476 /*
477 * AEN requests are special as they don't time out and can
478 * survive any kind of queue freeze and often don't respond to
479 * aborts. We don't even bother to allocate a struct request
480 * for them but rather special case them here.
481 */
482 if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
483 cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
484 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
485 &cqe->result);
486 else
487 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
488
489 return ret;
490}
491
492static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
493 struct nvme_tcp_r2t_pdu *pdu)
494{
495 struct nvme_tcp_data_pdu *data = req->pdu;
496 struct nvme_tcp_queue *queue = req->queue;
497 struct request *rq = blk_mq_rq_from_pdu(req);
498 u8 hdgst = nvme_tcp_hdgst_len(queue);
499 u8 ddgst = nvme_tcp_ddgst_len(queue);
500
501 req->pdu_len = le32_to_cpu(pdu->r2t_length);
502 req->pdu_sent = 0;
503
504 if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
505 dev_err(queue->ctrl->ctrl.device,
506 "req %d r2t len %u exceeded data len %u (%zu sent)\n",
507 rq->tag, req->pdu_len, req->data_len,
508 req->data_sent);
509 return -EPROTO;
510 }
511
512 if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
513 dev_err(queue->ctrl->ctrl.device,
514 "req %d unexpected r2t offset %u (expected %zu)\n",
515 rq->tag, le32_to_cpu(pdu->r2t_offset),
516 req->data_sent);
517 return -EPROTO;
518 }
519
520 memset(data, 0, sizeof(*data));
521 data->hdr.type = nvme_tcp_h2c_data;
522 data->hdr.flags = NVME_TCP_F_DATA_LAST;
523 if (queue->hdr_digest)
524 data->hdr.flags |= NVME_TCP_F_HDGST;
525 if (queue->data_digest)
526 data->hdr.flags |= NVME_TCP_F_DDGST;
527 data->hdr.hlen = sizeof(*data);
528 data->hdr.pdo = data->hdr.hlen + hdgst;
529 data->hdr.plen =
530 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
531 data->ttag = pdu->ttag;
532 data->command_id = rq->tag;
533 data->data_offset = cpu_to_le32(req->data_sent);
534 data->data_length = cpu_to_le32(req->pdu_len);
535 return 0;
536}
537
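/*
 * PDU length arithmetic in brief: plen always covers the PDU header, the
 * optional header digest, the payload carried by this PDU and the optional
 * data digest, while pdo points at where the payload starts.  A standalone
 * userspace sketch with illustrative sizes (not part of the patch):
 */
#include <stdio.h>

int main(void)
{
	unsigned int hlen = 24;        /* e.g. an H2C data PDU header           */
	unsigned int hdgst = 4;        /* header digest length, when negotiated */
	unsigned int ddgst = 4;        /* data digest length, when negotiated   */
	unsigned int payload = 8192;   /* r2t_length granted by the controller  */

	unsigned int pdo = hlen + hdgst;                    /* payload data offset */
	unsigned int plen = hlen + hdgst + payload + ddgst; /* total PDU length    */

	printf("pdo=%u plen=%u\n", pdo, plen);
	return 0;
}
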
538static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
539 struct nvme_tcp_r2t_pdu *pdu)
540{
541 struct nvme_tcp_request *req;
542 struct request *rq;
543 int ret;
544
545 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
546 if (!rq) {
547 dev_err(queue->ctrl->ctrl.device,
548 "queue %d tag %#x not found\n",
549 nvme_tcp_queue_id(queue), pdu->command_id);
550 return -ENOENT;
551 }
552 req = blk_mq_rq_to_pdu(rq);
553
554 ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
555 if (unlikely(ret))
556 return ret;
557
558 req->state = NVME_TCP_SEND_H2C_PDU;
559 req->offset = 0;
560
561 nvme_tcp_queue_request(req);
562
563 return 0;
564}
565
566static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
567 unsigned int *offset, size_t *len)
568{
569 struct nvme_tcp_hdr *hdr;
570 char *pdu = queue->pdu;
571 size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
572 int ret;
573
574 ret = skb_copy_bits(skb, *offset,
575 &pdu[queue->pdu_offset], rcv_len);
576 if (unlikely(ret))
577 return ret;
578
579 queue->pdu_remaining -= rcv_len;
580 queue->pdu_offset += rcv_len;
581 *offset += rcv_len;
582 *len -= rcv_len;
583 if (queue->pdu_remaining)
584 return 0;
585
586 hdr = queue->pdu;
587 if (queue->hdr_digest) {
588 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
589 if (unlikely(ret))
590 return ret;
591 }
592
593
594 if (queue->data_digest) {
595 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
596 if (unlikely(ret))
597 return ret;
598 }
599
600 switch (hdr->type) {
601 case nvme_tcp_c2h_data:
602 ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
603 break;
604 case nvme_tcp_rsp:
605 nvme_tcp_init_recv_ctx(queue);
606 ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
607 break;
608 case nvme_tcp_r2t:
609 nvme_tcp_init_recv_ctx(queue);
610 ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
611 break;
612 default:
613 dev_err(queue->ctrl->ctrl.device,
614 "unsupported pdu type (%d)\n", hdr->type);
615 return -EINVAL;
616 }
617
618 return ret;
619}
620
621static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
622 unsigned int *offset, size_t *len)
623{
624 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
625 struct nvme_tcp_request *req;
626 struct request *rq;
627
628 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
629 if (!rq) {
630 dev_err(queue->ctrl->ctrl.device,
631 "queue %d tag %#x not found\n",
632 nvme_tcp_queue_id(queue), pdu->command_id);
633 return -ENOENT;
634 }
635 req = blk_mq_rq_to_pdu(rq);
636
637 while (true) {
638 int recv_len, ret;
639
640 recv_len = min_t(size_t, *len, queue->data_remaining);
641 if (!recv_len)
642 break;
643
644 if (!iov_iter_count(&req->iter)) {
645 req->curr_bio = req->curr_bio->bi_next;
646
647 /*
 648			 * If we don't have any bios it means that the controller
 649			 * sent more data than we requested, hence error
650 */
651 if (!req->curr_bio) {
652 dev_err(queue->ctrl->ctrl.device,
653 "queue %d no space in request %#x",
654 nvme_tcp_queue_id(queue), rq->tag);
655 nvme_tcp_init_recv_ctx(queue);
656 return -EIO;
657 }
658 nvme_tcp_init_iter(req, READ);
659 }
660
661 /* we can read only from what is left in this bio */
662 recv_len = min_t(size_t, recv_len,
663 iov_iter_count(&req->iter));
664
665 if (queue->data_digest)
666 ret = skb_copy_and_hash_datagram_iter(skb, *offset,
667 &req->iter, recv_len, queue->rcv_hash);
668 else
669 ret = skb_copy_datagram_iter(skb, *offset,
670 &req->iter, recv_len);
671 if (ret) {
672 dev_err(queue->ctrl->ctrl.device,
673 "queue %d failed to copy request %#x data",
674 nvme_tcp_queue_id(queue), rq->tag);
675 return ret;
676 }
677
678 *len -= recv_len;
679 *offset += recv_len;
680 queue->data_remaining -= recv_len;
681 }
682
683 if (!queue->data_remaining) {
684 if (queue->data_digest) {
685 nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
686 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
687 } else {
688 nvme_tcp_init_recv_ctx(queue);
689 }
690 }
691
692 return 0;
693}
694
695static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
696 struct sk_buff *skb, unsigned int *offset, size_t *len)
697{
698 char *ddgst = (char *)&queue->recv_ddgst;
699 size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
700 off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
701 int ret;
702
703 ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
704 if (unlikely(ret))
705 return ret;
706
707 queue->ddgst_remaining -= recv_len;
708 *offset += recv_len;
709 *len -= recv_len;
710 if (queue->ddgst_remaining)
711 return 0;
712
713 if (queue->recv_ddgst != queue->exp_ddgst) {
714 dev_err(queue->ctrl->ctrl.device,
715 "data digest error: recv %#x expected %#x\n",
716 le32_to_cpu(queue->recv_ddgst),
717 le32_to_cpu(queue->exp_ddgst));
718 return -EIO;
719 }
720
721 nvme_tcp_init_recv_ctx(queue);
722 return 0;
723}
724
725static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
726 unsigned int offset, size_t len)
727{
728 struct nvme_tcp_queue *queue = desc->arg.data;
729 size_t consumed = len;
730 int result;
731
732 while (len) {
733 switch (nvme_tcp_recv_state(queue)) {
734 case NVME_TCP_RECV_PDU:
735 result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
736 break;
737 case NVME_TCP_RECV_DATA:
738 result = nvme_tcp_recv_data(queue, skb, &offset, &len);
739 break;
740 case NVME_TCP_RECV_DDGST:
741 result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
742 break;
743 default:
744 result = -EFAULT;
745 }
746 if (result) {
747 dev_err(queue->ctrl->ctrl.device,
748 "receive failed: %d\n", result);
749 queue->rd_enabled = false;
750 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
751 return result;
752 }
753 }
754
755 return consumed;
756}
757
758static void nvme_tcp_data_ready(struct sock *sk)
759{
760 struct nvme_tcp_queue *queue;
761
762 read_lock(&sk->sk_callback_lock);
763 queue = sk->sk_user_data;
764 if (likely(queue && queue->rd_enabled))
765 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
766 read_unlock(&sk->sk_callback_lock);
767}
768
769static void nvme_tcp_write_space(struct sock *sk)
770{
771 struct nvme_tcp_queue *queue;
772
773 read_lock_bh(&sk->sk_callback_lock);
774 queue = sk->sk_user_data;
775 if (likely(queue && sk_stream_is_writeable(sk))) {
776 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
777 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
778 }
779 read_unlock_bh(&sk->sk_callback_lock);
780}
781
782static void nvme_tcp_state_change(struct sock *sk)
783{
784 struct nvme_tcp_queue *queue;
785
786 read_lock(&sk->sk_callback_lock);
787 queue = sk->sk_user_data;
788 if (!queue)
789 goto done;
790
791 switch (sk->sk_state) {
792 case TCP_CLOSE:
793 case TCP_CLOSE_WAIT:
794 case TCP_LAST_ACK:
795 case TCP_FIN_WAIT1:
796 case TCP_FIN_WAIT2:
797 /* fallthrough */
798 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
799 break;
800 default:
801 dev_info(queue->ctrl->ctrl.device,
802 "queue %d socket state %d\n",
803 nvme_tcp_queue_id(queue), sk->sk_state);
804 }
805
806 queue->state_change(sk);
807done:
808 read_unlock(&sk->sk_callback_lock);
809}
810
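/*
 * Socket-callback note: the three hooks above replace the socket's
 * data_ready/write_space/state_change callbacks while the queue is live
 * (the originals are saved in nvme_tcp_alloc_queue() and restored by
 * nvme_tcp_restore_sock_calls()).  data_ready and write_space simply kick
 * io_work on the queue's chosen CPU; unexpected TCP state transitions
 * trigger controller error recovery.
 */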
811static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
812{
813 queue->request = NULL;
814}
815
816static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
817{
818 union nvme_result res = {};
819
820 nvme_end_request(blk_mq_rq_from_pdu(req),
821 cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res);
822}
823
824static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
825{
826 struct nvme_tcp_queue *queue = req->queue;
827
828 while (true) {
829 struct page *page = nvme_tcp_req_cur_page(req);
830 size_t offset = nvme_tcp_req_cur_offset(req);
831 size_t len = nvme_tcp_req_cur_length(req);
832 bool last = nvme_tcp_pdu_last_send(req, len);
833 int ret, flags = MSG_DONTWAIT;
834
835 if (last && !queue->data_digest)
836 flags |= MSG_EOR;
837 else
838 flags |= MSG_MORE;
839
840 ret = kernel_sendpage(queue->sock, page, offset, len, flags);
841 if (ret <= 0)
842 return ret;
843
844 nvme_tcp_advance_req(req, ret);
845 if (queue->data_digest)
846 nvme_tcp_ddgst_update(queue->snd_hash, page,
847 offset, ret);
848
 849		/* fully successful last write */
850 if (last && ret == len) {
851 if (queue->data_digest) {
852 nvme_tcp_ddgst_final(queue->snd_hash,
853 &req->ddgst);
854 req->state = NVME_TCP_SEND_DDGST;
855 req->offset = 0;
856 } else {
857 nvme_tcp_done_send_req(queue);
858 }
859 return 1;
860 }
861 }
862 return -EAGAIN;
863}
864
865static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
866{
867 struct nvme_tcp_queue *queue = req->queue;
868 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
869 bool inline_data = nvme_tcp_has_inline_data(req);
870 int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
871 u8 hdgst = nvme_tcp_hdgst_len(queue);
872 int len = sizeof(*pdu) + hdgst - req->offset;
873 int ret;
874
875 if (queue->hdr_digest && !req->offset)
876 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
877
878 ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
879 offset_in_page(pdu) + req->offset, len, flags);
880 if (unlikely(ret <= 0))
881 return ret;
882
883 len -= ret;
884 if (!len) {
885 if (inline_data) {
886 req->state = NVME_TCP_SEND_DATA;
887 if (queue->data_digest)
888 crypto_ahash_init(queue->snd_hash);
889 nvme_tcp_init_iter(req, WRITE);
890 } else {
891 nvme_tcp_done_send_req(queue);
892 }
893 return 1;
894 }
895 req->offset += ret;
896
897 return -EAGAIN;
898}
899
900static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
901{
902 struct nvme_tcp_queue *queue = req->queue;
903 struct nvme_tcp_data_pdu *pdu = req->pdu;
904 u8 hdgst = nvme_tcp_hdgst_len(queue);
905 int len = sizeof(*pdu) - req->offset + hdgst;
906 int ret;
907
908 if (queue->hdr_digest && !req->offset)
909 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
910
911 ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
912 offset_in_page(pdu) + req->offset, len,
913 MSG_DONTWAIT | MSG_MORE);
914 if (unlikely(ret <= 0))
915 return ret;
916
917 len -= ret;
918 if (!len) {
919 req->state = NVME_TCP_SEND_DATA;
920 if (queue->data_digest)
921 crypto_ahash_init(queue->snd_hash);
922 if (!req->data_sent)
923 nvme_tcp_init_iter(req, WRITE);
924 return 1;
925 }
926 req->offset += ret;
927
928 return -EAGAIN;
929}
930
931static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
932{
933 struct nvme_tcp_queue *queue = req->queue;
934 int ret;
935 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
936 struct kvec iov = {
937 .iov_base = &req->ddgst + req->offset,
938 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
939 };
940
941 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
942 if (unlikely(ret <= 0))
943 return ret;
944
945 if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
946 nvme_tcp_done_send_req(queue);
947 return 1;
948 }
949
950 req->offset += ret;
951 return -EAGAIN;
952}
953
954static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
955{
956 struct nvme_tcp_request *req;
957 int ret = 1;
958
959 if (!queue->request) {
960 queue->request = nvme_tcp_fetch_request(queue);
961 if (!queue->request)
962 return 0;
963 }
964 req = queue->request;
965
966 if (req->state == NVME_TCP_SEND_CMD_PDU) {
967 ret = nvme_tcp_try_send_cmd_pdu(req);
968 if (ret <= 0)
969 goto done;
970 if (!nvme_tcp_has_inline_data(req))
971 return ret;
972 }
973
974 if (req->state == NVME_TCP_SEND_H2C_PDU) {
975 ret = nvme_tcp_try_send_data_pdu(req);
976 if (ret <= 0)
977 goto done;
978 }
979
980 if (req->state == NVME_TCP_SEND_DATA) {
981 ret = nvme_tcp_try_send_data(req);
982 if (ret <= 0)
983 goto done;
984 }
985
986 if (req->state == NVME_TCP_SEND_DDGST)
987 ret = nvme_tcp_try_send_ddgst(req);
988done:
989 if (ret == -EAGAIN)
990 ret = 0;
991 return ret;
992}
993
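/*
 * Send-side overview: nvme_tcp_try_send() walks a request through the
 * NVME_TCP_SEND_* states in order -- the command PDU first, then (for
 * R2T-driven writes) the H2C data PDU, then the data pages, and finally
 * the data digest when enabled.  A return of 1 means progress was made,
 * 0 means there is nothing (more) to send right now, and a negative value
 * is treated as a send error by nvme_tcp_io_work().
 */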
994static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
995{
996 struct sock *sk = queue->sock->sk;
997 read_descriptor_t rd_desc;
998 int consumed;
999
1000 rd_desc.arg.data = queue;
1001 rd_desc.count = 1;
1002 lock_sock(sk);
1003 consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1004 release_sock(sk);
1005 return consumed;
1006}
1007
1008static void nvme_tcp_io_work(struct work_struct *w)
1009{
1010 struct nvme_tcp_queue *queue =
1011 container_of(w, struct nvme_tcp_queue, io_work);
1012 unsigned long start = jiffies + msecs_to_jiffies(1);
1013
1014 do {
1015 bool pending = false;
1016 int result;
1017
1018 result = nvme_tcp_try_send(queue);
1019 if (result > 0) {
1020 pending = true;
1021 } else if (unlikely(result < 0)) {
1022 dev_err(queue->ctrl->ctrl.device,
1023 "failed to send request %d\n", result);
1024 if (result != -EPIPE)
1025 nvme_tcp_fail_request(queue->request);
1026 nvme_tcp_done_send_req(queue);
1027 return;
1028 }
1029
1030 result = nvme_tcp_try_recv(queue);
1031 if (result > 0)
1032 pending = true;
1033
1034 if (!pending)
1035 return;
1036
1037 } while (time_after(jiffies, start)); /* quota is exhausted */
1038
1039 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1040}
1041
1042static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1043{
1044 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1045
1046 ahash_request_free(queue->rcv_hash);
1047 ahash_request_free(queue->snd_hash);
1048 crypto_free_ahash(tfm);
1049}
1050
1051static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1052{
1053 struct crypto_ahash *tfm;
1054
1055 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1056 if (IS_ERR(tfm))
1057 return PTR_ERR(tfm);
1058
1059 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1060 if (!queue->snd_hash)
1061 goto free_tfm;
1062 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1063
1064 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1065 if (!queue->rcv_hash)
1066 goto free_snd_hash;
1067 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1068
1069 return 0;
1070free_snd_hash:
1071 ahash_request_free(queue->snd_hash);
1072free_tfm:
1073 crypto_free_ahash(tfm);
1074 return -ENOMEM;
1075}
1076
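/*
 * Digest note: NVMe/TCP header and data digests are CRC32C, so a single
 * crc32c ahash transform is allocated per queue, with separate request
 * handles for the receive and send directions since each side keeps its
 * own running digest state.
 */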
1077static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1078{
1079 struct nvme_tcp_request *async = &ctrl->async_req;
1080
1081 page_frag_free(async->pdu);
1082}
1083
1084static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1085{
1086 struct nvme_tcp_queue *queue = &ctrl->queues[0];
1087 struct nvme_tcp_request *async = &ctrl->async_req;
1088 u8 hdgst = nvme_tcp_hdgst_len(queue);
1089
1090 async->pdu = page_frag_alloc(&queue->pf_cache,
1091 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1092 GFP_KERNEL | __GFP_ZERO);
1093 if (!async->pdu)
1094 return -ENOMEM;
1095
1096 async->queue = &ctrl->queues[0];
1097 return 0;
1098}
1099
1100static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1101{
1102 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1103 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1104
1105 if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1106 return;
1107
1108 if (queue->hdr_digest || queue->data_digest)
1109 nvme_tcp_free_crypto(queue);
1110
1111 sock_release(queue->sock);
1112 kfree(queue->pdu);
1113}
1114
1115static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1116{
1117 struct nvme_tcp_icreq_pdu *icreq;
1118 struct nvme_tcp_icresp_pdu *icresp;
1119 struct msghdr msg = {};
1120 struct kvec iov;
1121 bool ctrl_hdgst, ctrl_ddgst;
1122 int ret;
1123
1124 icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1125 if (!icreq)
1126 return -ENOMEM;
1127
1128 icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1129 if (!icresp) {
1130 ret = -ENOMEM;
1131 goto free_icreq;
1132 }
1133
1134 icreq->hdr.type = nvme_tcp_icreq;
1135 icreq->hdr.hlen = sizeof(*icreq);
1136 icreq->hdr.pdo = 0;
1137 icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1138 icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1139 icreq->maxr2t = 0; /* single inflight r2t supported */
1140 icreq->hpda = 0; /* no alignment constraint */
1141 if (queue->hdr_digest)
1142 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1143 if (queue->data_digest)
1144 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1145
1146 iov.iov_base = icreq;
1147 iov.iov_len = sizeof(*icreq);
1148 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1149 if (ret < 0)
1150 goto free_icresp;
1151
1152 memset(&msg, 0, sizeof(msg));
1153 iov.iov_base = icresp;
1154 iov.iov_len = sizeof(*icresp);
1155 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1156 iov.iov_len, msg.msg_flags);
1157 if (ret < 0)
1158 goto free_icresp;
1159
1160 ret = -EINVAL;
1161 if (icresp->hdr.type != nvme_tcp_icresp) {
1162 pr_err("queue %d: bad type returned %d\n",
1163 nvme_tcp_queue_id(queue), icresp->hdr.type);
1164 goto free_icresp;
1165 }
1166
1167 if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1168 pr_err("queue %d: bad pdu length returned %d\n",
1169 nvme_tcp_queue_id(queue), icresp->hdr.plen);
1170 goto free_icresp;
1171 }
1172
1173 if (icresp->pfv != NVME_TCP_PFV_1_0) {
1174 pr_err("queue %d: bad pfv returned %d\n",
1175 nvme_tcp_queue_id(queue), icresp->pfv);
1176 goto free_icresp;
1177 }
1178
1179 ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1180 if ((queue->data_digest && !ctrl_ddgst) ||
1181 (!queue->data_digest && ctrl_ddgst)) {
1182 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1183 nvme_tcp_queue_id(queue),
1184 queue->data_digest ? "enabled" : "disabled",
1185 ctrl_ddgst ? "enabled" : "disabled");
1186 goto free_icresp;
1187 }
1188
1189 ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1190 if ((queue->hdr_digest && !ctrl_hdgst) ||
1191 (!queue->hdr_digest && ctrl_hdgst)) {
1192 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1193 nvme_tcp_queue_id(queue),
1194 queue->hdr_digest ? "enabled" : "disabled",
1195 ctrl_hdgst ? "enabled" : "disabled");
1196 goto free_icresp;
1197 }
1198
1199 if (icresp->cpda != 0) {
1200 pr_err("queue %d: unsupported cpda returned %d\n",
1201 nvme_tcp_queue_id(queue), icresp->cpda);
1202 goto free_icresp;
1203 }
1204
1205 ret = 0;
1206free_icresp:
1207 kfree(icresp);
1208free_icreq:
1209 kfree(icreq);
1210 return ret;
1211}
1212
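/*
 * Connection setup note: after the TCP connect the host sends an ICReq
 * advertising PFV 1.0, no PDU alignment (hpda = 0), a single in-flight
 * R2T and its digest preferences.  The controller's ICResp must carry a
 * matching PFV, a cpda of zero and digest settings that agree with what
 * the host requested; any mismatch tears the queue down with -EINVAL.
 */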
1213static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1214 int qid, size_t queue_size)
1215{
1216 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1217 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1218 struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1219 int ret, opt, rcv_pdu_size, n;
1220
1221 queue->ctrl = ctrl;
1222 INIT_LIST_HEAD(&queue->send_list);
1223 spin_lock_init(&queue->lock);
1224 INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1225 queue->queue_size = queue_size;
1226
1227 if (qid > 0)
1228 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
1229 else
1230 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1231 NVME_TCP_ADMIN_CCSZ;
1232
1233 ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1234 IPPROTO_TCP, &queue->sock);
1235 if (ret) {
1236 dev_err(ctrl->ctrl.device,
1237 "failed to create socket: %d\n", ret);
1238 return ret;
1239 }
1240
1241 /* Single syn retry */
1242 opt = 1;
1243 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
1244 (char *)&opt, sizeof(opt));
1245 if (ret) {
1246 dev_err(ctrl->ctrl.device,
1247 "failed to set TCP_SYNCNT sock opt %d\n", ret);
1248 goto err_sock;
1249 }
1250
1251 /* Set TCP no delay */
1252 opt = 1;
1253 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
1254 TCP_NODELAY, (char *)&opt, sizeof(opt));
1255 if (ret) {
1256 dev_err(ctrl->ctrl.device,
1257 "failed to set TCP_NODELAY sock opt %d\n", ret);
1258 goto err_sock;
1259 }
1260
1261 /*
 1262	 * Clean up whatever is sitting in the TCP transmit queue on socket
1263 * close. This is done to prevent stale data from being sent should
1264 * the network connection be restored before TCP times out.
1265 */
1266 ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
1267 (char *)&sol, sizeof(sol));
1268 if (ret) {
1269 dev_err(ctrl->ctrl.device,
1270 "failed to set SO_LINGER sock opt %d\n", ret);
1271 goto err_sock;
1272 }
1273
1274 queue->sock->sk->sk_allocation = GFP_ATOMIC;
1275 if (!qid)
1276 n = 0;
1277 else
1278 n = (qid - 1) % num_online_cpus();
1279 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1280 queue->request = NULL;
1281 queue->data_remaining = 0;
1282 queue->ddgst_remaining = 0;
1283 queue->pdu_remaining = 0;
1284 queue->pdu_offset = 0;
1285 sk_set_memalloc(queue->sock->sk);
1286
1287 if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
1288 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1289 sizeof(ctrl->src_addr));
1290 if (ret) {
1291 dev_err(ctrl->ctrl.device,
1292 "failed to bind queue %d socket %d\n",
1293 qid, ret);
1294 goto err_sock;
1295 }
1296 }
1297
1298 queue->hdr_digest = nctrl->opts->hdr_digest;
1299 queue->data_digest = nctrl->opts->data_digest;
1300 if (queue->hdr_digest || queue->data_digest) {
1301 ret = nvme_tcp_alloc_crypto(queue);
1302 if (ret) {
1303 dev_err(ctrl->ctrl.device,
1304 "failed to allocate queue %d crypto\n", qid);
1305 goto err_sock;
1306 }
1307 }
1308
1309 rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1310 nvme_tcp_hdgst_len(queue);
1311 queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1312 if (!queue->pdu) {
1313 ret = -ENOMEM;
1314 goto err_crypto;
1315 }
1316
1317 dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
1318 nvme_tcp_queue_id(queue));
1319
1320 ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1321 sizeof(ctrl->addr), 0);
1322 if (ret) {
1323 dev_err(ctrl->ctrl.device,
1324 "failed to connect socket: %d\n", ret);
1325 goto err_rcv_pdu;
1326 }
1327
1328 ret = nvme_tcp_init_connection(queue);
1329 if (ret)
1330 goto err_init_connect;
1331
1332 queue->rd_enabled = true;
1333 set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1334 nvme_tcp_init_recv_ctx(queue);
1335
1336 write_lock_bh(&queue->sock->sk->sk_callback_lock);
1337 queue->sock->sk->sk_user_data = queue;
1338 queue->state_change = queue->sock->sk->sk_state_change;
1339 queue->data_ready = queue->sock->sk->sk_data_ready;
1340 queue->write_space = queue->sock->sk->sk_write_space;
1341 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1342 queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1343 queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1344 write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1345
1346 return 0;
1347
1348err_init_connect:
1349 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1350err_rcv_pdu:
1351 kfree(queue->pdu);
1352err_crypto:
1353 if (queue->hdr_digest || queue->data_digest)
1354 nvme_tcp_free_crypto(queue);
1355err_sock:
1356 sock_release(queue->sock);
1357 queue->sock = NULL;
1358 return ret;
1359}
1360
1361static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1362{
1363 struct socket *sock = queue->sock;
1364
1365 write_lock_bh(&sock->sk->sk_callback_lock);
1366 sock->sk->sk_user_data = NULL;
1367 sock->sk->sk_data_ready = queue->data_ready;
1368 sock->sk->sk_state_change = queue->state_change;
1369 sock->sk->sk_write_space = queue->write_space;
1370 write_unlock_bh(&sock->sk->sk_callback_lock);
1371}
1372
1373static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1374{
1375 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1376 nvme_tcp_restore_sock_calls(queue);
1377 cancel_work_sync(&queue->io_work);
1378}
1379
1380static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1381{
1382 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1383 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1384
1385 if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1386 return;
1387
1388 __nvme_tcp_stop_queue(queue);
1389}
1390
1391static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1392{
1393 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1394 int ret;
1395
1396 if (idx)
1397 ret = nvmf_connect_io_queue(nctrl, idx, false);
1398 else
1399 ret = nvmf_connect_admin_queue(nctrl);
1400
1401 if (!ret) {
1402 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1403 } else {
1404 __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1405 dev_err(nctrl->device,
1406 "failed to connect queue: %d ret=%d\n", idx, ret);
1407 }
1408 return ret;
1409}
1410
1411static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1412 bool admin)
1413{
1414 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1415 struct blk_mq_tag_set *set;
1416 int ret;
1417
1418 if (admin) {
1419 set = &ctrl->admin_tag_set;
1420 memset(set, 0, sizeof(*set));
1421 set->ops = &nvme_tcp_admin_mq_ops;
1422 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1423 set->reserved_tags = 2; /* connect + keep-alive */
1424 set->numa_node = NUMA_NO_NODE;
1425 set->cmd_size = sizeof(struct nvme_tcp_request);
1426 set->driver_data = ctrl;
1427 set->nr_hw_queues = 1;
1428 set->timeout = ADMIN_TIMEOUT;
1429 } else {
1430 set = &ctrl->tag_set;
1431 memset(set, 0, sizeof(*set));
1432 set->ops = &nvme_tcp_mq_ops;
1433 set->queue_depth = nctrl->sqsize + 1;
1434 set->reserved_tags = 1; /* fabric connect */
1435 set->numa_node = NUMA_NO_NODE;
1436 set->flags = BLK_MQ_F_SHOULD_MERGE;
1437 set->cmd_size = sizeof(struct nvme_tcp_request);
1438 set->driver_data = ctrl;
1439 set->nr_hw_queues = nctrl->queue_count - 1;
1440 set->timeout = NVME_IO_TIMEOUT;
1441 set->nr_maps = 2 /* default + read */;
1442 }
1443
1444 ret = blk_mq_alloc_tag_set(set);
1445 if (ret)
1446 return ERR_PTR(ret);
1447
1448 return set;
1449}
1450
1451static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1452{
1453 if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1454 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1455 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1456 }
1457
1458 nvme_tcp_free_queue(ctrl, 0);
1459}
1460
1461static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1462{
1463 int i;
1464
1465 for (i = 1; i < ctrl->queue_count; i++)
1466 nvme_tcp_free_queue(ctrl, i);
1467}
1468
1469static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1470{
1471 int i;
1472
1473 for (i = 1; i < ctrl->queue_count; i++)
1474 nvme_tcp_stop_queue(ctrl, i);
1475}
1476
1477static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1478{
1479 int i, ret = 0;
1480
1481 for (i = 1; i < ctrl->queue_count; i++) {
1482 ret = nvme_tcp_start_queue(ctrl, i);
1483 if (ret)
1484 goto out_stop_queues;
1485 }
1486
1487 return 0;
1488
1489out_stop_queues:
1490 for (i--; i >= 1; i--)
1491 nvme_tcp_stop_queue(ctrl, i);
1492 return ret;
1493}
1494
1495static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1496{
1497 int ret;
1498
1499 ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1500 if (ret)
1501 return ret;
1502
1503 ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1504 if (ret)
1505 goto out_free_queue;
1506
1507 return 0;
1508
1509out_free_queue:
1510 nvme_tcp_free_queue(ctrl, 0);
1511 return ret;
1512}
1513
1514static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1515{
1516 int i, ret;
1517
1518 for (i = 1; i < ctrl->queue_count; i++) {
1519 ret = nvme_tcp_alloc_queue(ctrl, i,
1520 ctrl->sqsize + 1);
1521 if (ret)
1522 goto out_free_queues;
1523 }
1524
1525 return 0;
1526
1527out_free_queues:
1528 for (i--; i >= 1; i--)
1529 nvme_tcp_free_queue(ctrl, i);
1530
1531 return ret;
1532}
1533
1534static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1535{
1536 unsigned int nr_io_queues;
1537
1538 nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1539 nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1540
1541 return nr_io_queues;
1542}
1543
1544static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
1545{
1546 unsigned int nr_io_queues;
1547 int ret;
1548
1549 nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1550 ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1551 if (ret)
1552 return ret;
1553
1554 ctrl->queue_count = nr_io_queues + 1;
1555 if (ctrl->queue_count < 2)
1556 return 0;
1557
1558 dev_info(ctrl->device,
1559 "creating %d I/O queues.\n", nr_io_queues);
1560
1561 return nvme_tcp_alloc_io_queues(ctrl);
1562}
1563
1564static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1565{
1566 nvme_tcp_stop_io_queues(ctrl);
1567 if (remove) {
1568 if (ctrl->ops->flags & NVME_F_FABRICS)
1569 blk_cleanup_queue(ctrl->connect_q);
1570 blk_mq_free_tag_set(ctrl->tagset);
1571 }
1572 nvme_tcp_free_io_queues(ctrl);
1573}
1574
1575static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1576{
1577 int ret;
1578
1579 ret = nvme_alloc_io_queues(ctrl);
1580 if (ret)
1581 return ret;
1582
1583 if (new) {
1584 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1585 if (IS_ERR(ctrl->tagset)) {
1586 ret = PTR_ERR(ctrl->tagset);
1587 goto out_free_io_queues;
1588 }
1589
1590 if (ctrl->ops->flags & NVME_F_FABRICS) {
1591 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1592 if (IS_ERR(ctrl->connect_q)) {
1593 ret = PTR_ERR(ctrl->connect_q);
1594 goto out_free_tag_set;
1595 }
1596 }
1597 } else {
1598 blk_mq_update_nr_hw_queues(ctrl->tagset,
1599 ctrl->queue_count - 1);
1600 }
1601
1602 ret = nvme_tcp_start_io_queues(ctrl);
1603 if (ret)
1604 goto out_cleanup_connect_q;
1605
1606 return 0;
1607
1608out_cleanup_connect_q:
1609 if (new && (ctrl->ops->flags & NVME_F_FABRICS))
1610 blk_cleanup_queue(ctrl->connect_q);
1611out_free_tag_set:
1612 if (new)
1613 blk_mq_free_tag_set(ctrl->tagset);
1614out_free_io_queues:
1615 nvme_tcp_free_io_queues(ctrl);
1616 return ret;
1617}
1618
1619static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1620{
1621 nvme_tcp_stop_queue(ctrl, 0);
1622 if (remove) {
1623 free_opal_dev(ctrl->opal_dev);
1624 blk_cleanup_queue(ctrl->admin_q);
1625 blk_mq_free_tag_set(ctrl->admin_tagset);
1626 }
1627 nvme_tcp_free_admin_queue(ctrl);
1628}
1629
1630static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1631{
1632 int error;
1633
1634 error = nvme_tcp_alloc_admin_queue(ctrl);
1635 if (error)
1636 return error;
1637
1638 if (new) {
1639 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1640 if (IS_ERR(ctrl->admin_tagset)) {
1641 error = PTR_ERR(ctrl->admin_tagset);
1642 goto out_free_queue;
1643 }
1644
1645 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1646 if (IS_ERR(ctrl->admin_q)) {
1647 error = PTR_ERR(ctrl->admin_q);
1648 goto out_free_tagset;
1649 }
1650 }
1651
1652 error = nvme_tcp_start_queue(ctrl, 0);
1653 if (error)
1654 goto out_cleanup_queue;
1655
1656 error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
1657 if (error) {
1658 dev_err(ctrl->device,
1659 "prop_get NVME_REG_CAP failed\n");
1660 goto out_stop_queue;
1661 }
1662
1663 ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
1664
1665 error = nvme_enable_ctrl(ctrl, ctrl->cap);
1666 if (error)
1667 goto out_stop_queue;
1668
1669 error = nvme_init_identify(ctrl);
1670 if (error)
1671 goto out_stop_queue;
1672
1673 return 0;
1674
1675out_stop_queue:
1676 nvme_tcp_stop_queue(ctrl, 0);
1677out_cleanup_queue:
1678 if (new)
1679 blk_cleanup_queue(ctrl->admin_q);
1680out_free_tagset:
1681 if (new)
1682 blk_mq_free_tag_set(ctrl->admin_tagset);
1683out_free_queue:
1684 nvme_tcp_free_admin_queue(ctrl);
1685 return error;
1686}
1687
1688static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1689 bool remove)
1690{
1691 blk_mq_quiesce_queue(ctrl->admin_q);
1692 nvme_tcp_stop_queue(ctrl, 0);
1693 blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
1694 blk_mq_unquiesce_queue(ctrl->admin_q);
1695 nvme_tcp_destroy_admin_queue(ctrl, remove);
1696}
1697
1698static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1699 bool remove)
1700{
1701 if (ctrl->queue_count <= 1)
1702 return;
1703 nvme_stop_queues(ctrl);
1704 nvme_tcp_stop_io_queues(ctrl);
1705 blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
1706 if (remove)
1707 nvme_start_queues(ctrl);
1708 nvme_tcp_destroy_io_queues(ctrl, remove);
1709}
1710
1711static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1712{
1713 /* If we are resetting/deleting then do nothing */
1714 if (ctrl->state != NVME_CTRL_CONNECTING) {
1715 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1716 ctrl->state == NVME_CTRL_LIVE);
1717 return;
1718 }
1719
1720 if (nvmf_should_reconnect(ctrl)) {
1721 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1722 ctrl->opts->reconnect_delay);
1723 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1724 ctrl->opts->reconnect_delay * HZ);
1725 } else {
1726 dev_info(ctrl->device, "Removing controller...\n");
1727 nvme_delete_ctrl(ctrl);
1728 }
1729}
1730
1731static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1732{
1733 struct nvmf_ctrl_options *opts = ctrl->opts;
1734 int ret = -EINVAL;
1735
1736 ret = nvme_tcp_configure_admin_queue(ctrl, new);
1737 if (ret)
1738 return ret;
1739
1740 if (ctrl->icdoff) {
1741 dev_err(ctrl->device, "icdoff is not supported!\n");
1742 goto destroy_admin;
1743 }
1744
1745 if (opts->queue_size > ctrl->sqsize + 1)
1746 dev_warn(ctrl->device,
1747 "queue_size %zu > ctrl sqsize %u, clamping down\n",
1748 opts->queue_size, ctrl->sqsize + 1);
1749
1750 if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1751 dev_warn(ctrl->device,
1752 "sqsize %u > ctrl maxcmd %u, clamping down\n",
1753 ctrl->sqsize + 1, ctrl->maxcmd);
1754 ctrl->sqsize = ctrl->maxcmd - 1;
1755 }
1756
1757 if (ctrl->queue_count > 1) {
1758 ret = nvme_tcp_configure_io_queues(ctrl, new);
1759 if (ret)
1760 goto destroy_admin;
1761 }
1762
1763 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1764 /* state change failure is ok if we're in DELETING state */
1765 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1766 ret = -EINVAL;
1767 goto destroy_io;
1768 }
1769
1770 nvme_start_ctrl(ctrl);
1771 return 0;
1772
1773destroy_io:
1774 if (ctrl->queue_count > 1)
1775 nvme_tcp_destroy_io_queues(ctrl, new);
1776destroy_admin:
1777 nvme_tcp_stop_queue(ctrl, 0);
1778 nvme_tcp_destroy_admin_queue(ctrl, new);
1779 return ret;
1780}
1781
1782static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1783{
1784 struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1785 struct nvme_tcp_ctrl, connect_work);
1786 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1787
1788 ++ctrl->nr_reconnects;
1789
1790 if (nvme_tcp_setup_ctrl(ctrl, false))
1791 goto requeue;
1792
1793 dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
1794 ctrl->nr_reconnects);
1795
1796 ctrl->nr_reconnects = 0;
1797
1798 return;
1799
1800requeue:
1801 dev_info(ctrl->device, "Failed reconnect attempt %d\n",
1802 ctrl->nr_reconnects);
1803 nvme_tcp_reconnect_or_remove(ctrl);
1804}
1805
1806static void nvme_tcp_error_recovery_work(struct work_struct *work)
1807{
1808 struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
1809 struct nvme_tcp_ctrl, err_work);
1810 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1811
1812 nvme_stop_keep_alive(ctrl);
1813 nvme_tcp_teardown_io_queues(ctrl, false);
 1814	/* unquiesce so that pending requests fail fast */
1815 nvme_start_queues(ctrl);
1816 nvme_tcp_teardown_admin_queue(ctrl, false);
1817
1818 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1819 /* state change failure is ok if we're in DELETING state */
1820 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1821 return;
1822 }
1823
1824 nvme_tcp_reconnect_or_remove(ctrl);
1825}
1826
1827static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
1828{
1829 nvme_tcp_teardown_io_queues(ctrl, shutdown);
1830 if (shutdown)
1831 nvme_shutdown_ctrl(ctrl);
1832 else
1833 nvme_disable_ctrl(ctrl, ctrl->cap);
1834 nvme_tcp_teardown_admin_queue(ctrl, shutdown);
1835}
1836
1837static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
1838{
1839 nvme_tcp_teardown_ctrl(ctrl, true);
1840}
1841
1842static void nvme_reset_ctrl_work(struct work_struct *work)
1843{
1844 struct nvme_ctrl *ctrl =
1845 container_of(work, struct nvme_ctrl, reset_work);
1846
1847 nvme_stop_ctrl(ctrl);
1848 nvme_tcp_teardown_ctrl(ctrl, false);
1849
1850 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1851 /* state change failure is ok if we're in DELETING state */
1852 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1853 return;
1854 }
1855
1856 if (nvme_tcp_setup_ctrl(ctrl, false))
1857 goto out_fail;
1858
1859 return;
1860
1861out_fail:
1862 ++ctrl->nr_reconnects;
1863 nvme_tcp_reconnect_or_remove(ctrl);
1864}
1865
1866static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
1867{
1868 cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
1869 cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
1870}
1871
1872static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
1873{
1874 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1875
1876 if (list_empty(&ctrl->list))
1877 goto free_ctrl;
1878
1879 mutex_lock(&nvme_tcp_ctrl_mutex);
1880 list_del(&ctrl->list);
1881 mutex_unlock(&nvme_tcp_ctrl_mutex);
1882
1883 nvmf_free_options(nctrl->opts);
1884free_ctrl:
1885 kfree(ctrl->queues);
1886 kfree(ctrl);
1887}
1888
1889static void nvme_tcp_set_sg_null(struct nvme_command *c)
1890{
1891 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1892
1893 sg->addr = 0;
1894 sg->length = 0;
1895 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1896 NVME_SGL_FMT_TRANSPORT_A;
1897}
1898
1899static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
1900 struct nvme_command *c, u32 data_len)
1901{
1902 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1903
1904 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1905 sg->length = cpu_to_le32(data_len);
1906 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1907}
1908
1909static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
1910 u32 data_len)
1911{
1912 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1913
1914 sg->addr = 0;
1915 sg->length = cpu_to_le32(data_len);
1916 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
1917 NVME_SGL_FMT_TRANSPORT_A;
1918}
1919
1920static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
1921{
1922 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
1923 struct nvme_tcp_queue *queue = &ctrl->queues[0];
1924 struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
1925 struct nvme_command *cmd = &pdu->cmd;
1926 u8 hdgst = nvme_tcp_hdgst_len(queue);
1927
1928 memset(pdu, 0, sizeof(*pdu));
1929 pdu->hdr.type = nvme_tcp_cmd;
1930 if (queue->hdr_digest)
1931 pdu->hdr.flags |= NVME_TCP_F_HDGST;
1932 pdu->hdr.hlen = sizeof(*pdu);
1933 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
1934
1935 cmd->common.opcode = nvme_admin_async_event;
1936 cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1937 cmd->common.flags |= NVME_CMD_SGL_METABUF;
1938 nvme_tcp_set_sg_null(cmd);
1939
1940 ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
1941 ctrl->async_req.offset = 0;
1942 ctrl->async_req.curr_bio = NULL;
1943 ctrl->async_req.data_len = 0;
1944
1945 nvme_tcp_queue_request(&ctrl->async_req);
1946}
1947
1948static enum blk_eh_timer_return
1949nvme_tcp_timeout(struct request *rq, bool reserved)
1950{
1951 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1952 struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
1953 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1954
1955 dev_dbg(ctrl->ctrl.device,
1956 "queue %d: timeout request %#x type %d\n",
1957 nvme_tcp_queue_id(req->queue), rq->tag,
1958 pdu->hdr.type);
1959
1960 if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
1961 union nvme_result res = {};
1962
1963 nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
1964 nvme_end_request(rq, cpu_to_le16(NVME_SC_ABORT_REQ), res);
1965 return BLK_EH_DONE;
1966 }
1967
1968 /* queue error recovery */
1969 nvme_tcp_error_recovery(&ctrl->ctrl);
1970
1971 return BLK_EH_RESET_TIMER;
1972}
1973
1974static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
1975 struct request *rq)
1976{
1977 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1978 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1979 struct nvme_command *c = &pdu->cmd;
1980
1981 c->common.flags |= NVME_CMD_SGL_METABUF;
1982
1983 if (rq_data_dir(rq) == WRITE && req->data_len &&
1984 req->data_len <= nvme_tcp_inline_data_size(queue))
1985 nvme_tcp_set_sg_inline(queue, c, req->data_len);
1986 else
1987 nvme_tcp_set_sg_host_data(c, req->data_len);
1988
1989 return 0;
1990}
1991
1992static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
1993 struct request *rq)
1994{
1995 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
1996 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1997 struct nvme_tcp_queue *queue = req->queue;
1998 u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
1999 blk_status_t ret;
2000
2001 ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
2002 if (ret)
2003 return ret;
2004
2005 req->state = NVME_TCP_SEND_CMD_PDU;
2006 req->offset = 0;
2007 req->data_sent = 0;
2008 req->pdu_len = 0;
2009 req->pdu_sent = 0;
2010 req->data_len = blk_rq_payload_bytes(rq);
2011 req->curr_bio = rq->bio;
2012
2013 if (rq_data_dir(rq) == WRITE &&
2014 req->data_len <= nvme_tcp_inline_data_size(queue))
2015 req->pdu_len = req->data_len;
2016 else if (req->curr_bio)
2017 nvme_tcp_init_iter(req, READ);
2018
2019 pdu->hdr.type = nvme_tcp_cmd;
2020 pdu->hdr.flags = 0;
2021 if (queue->hdr_digest)
2022 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2023 if (queue->data_digest && req->pdu_len) {
2024 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2025 ddgst = nvme_tcp_ddgst_len(queue);
2026 }
2027 pdu->hdr.hlen = sizeof(*pdu);
2028 pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2029 pdu->hdr.plen =
2030 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2031
2032 ret = nvme_tcp_map_data(queue, rq);
2033 if (unlikely(ret)) {
2034 dev_err(queue->ctrl->ctrl.device,
2035 "Failed to map data (%d)\n", ret);
2036 return ret;
2037 }
2038
2039 return 0;
2040}
2041
2042static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2043 const struct blk_mq_queue_data *bd)
2044{
2045 struct nvme_ns *ns = hctx->queue->queuedata;
2046 struct nvme_tcp_queue *queue = hctx->driver_data;
2047 struct request *rq = bd->rq;
2048 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2049 bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2050 blk_status_t ret;
2051
2052 if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2053 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2054
2055 ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2056 if (unlikely(ret))
2057 return ret;
2058
2059 blk_mq_start_request(rq);
2060
2061 nvme_tcp_queue_request(req);
2062
2063 return BLK_STS_OK;
2064}
2065
2066static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2067{
2068 struct nvme_tcp_ctrl *ctrl = set->driver_data;
2069
2070 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2071 set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
2072 if (ctrl->ctrl.opts->nr_write_queues) {
2073 /* separate read/write queues */
2074 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2075 ctrl->ctrl.opts->nr_write_queues;
2076 set->map[HCTX_TYPE_READ].queue_offset =
2077 ctrl->ctrl.opts->nr_write_queues;
2078 } else {
2079 /* mixed read/write queues */
2080 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2081 ctrl->ctrl.opts->nr_io_queues;
2082 set->map[HCTX_TYPE_READ].queue_offset = 0;
2083 }
2084 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2085 blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2086 return 0;
2087}
2088
2089static struct blk_mq_ops nvme_tcp_mq_ops = {
2090 .queue_rq = nvme_tcp_queue_rq,
2091 .complete = nvme_complete_rq,
2092 .init_request = nvme_tcp_init_request,
2093 .exit_request = nvme_tcp_exit_request,
2094 .init_hctx = nvme_tcp_init_hctx,
2095 .timeout = nvme_tcp_timeout,
2096 .map_queues = nvme_tcp_map_queues,
2097};
2098
2099static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2100 .queue_rq = nvme_tcp_queue_rq,
2101 .complete = nvme_complete_rq,
2102 .init_request = nvme_tcp_init_request,
2103 .exit_request = nvme_tcp_exit_request,
2104 .init_hctx = nvme_tcp_init_admin_hctx,
2105 .timeout = nvme_tcp_timeout,
2106};
2107
2108static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2109 .name = "tcp",
2110 .module = THIS_MODULE,
2111 .flags = NVME_F_FABRICS,
2112 .reg_read32 = nvmf_reg_read32,
2113 .reg_read64 = nvmf_reg_read64,
2114 .reg_write32 = nvmf_reg_write32,
2115 .free_ctrl = nvme_tcp_free_ctrl,
2116 .submit_async_event = nvme_tcp_submit_async_event,
2117 .delete_ctrl = nvme_tcp_delete_ctrl,
2118 .get_address = nvmf_get_address,
2119 .stop_ctrl = nvme_tcp_stop_ctrl,
2120};
2121
2122static bool
2123nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2124{
2125 struct nvme_tcp_ctrl *ctrl;
2126 bool found = false;
2127
2128 mutex_lock(&nvme_tcp_ctrl_mutex);
2129 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2130 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2131 if (found)
2132 break;
2133 }
2134 mutex_unlock(&nvme_tcp_ctrl_mutex);
2135
2136 return found;
2137}
2138
2139static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2140 struct nvmf_ctrl_options *opts)
2141{
2142 struct nvme_tcp_ctrl *ctrl;
2143 int ret;
2144
2145 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2146 if (!ctrl)
2147 return ERR_PTR(-ENOMEM);
2148
2149 INIT_LIST_HEAD(&ctrl->list);
2150 ctrl->ctrl.opts = opts;
2151 ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
2152 ctrl->ctrl.sqsize = opts->queue_size - 1;
2153 ctrl->ctrl.kato = opts->kato;
2154
2155 INIT_DELAYED_WORK(&ctrl->connect_work,
2156 nvme_tcp_reconnect_ctrl_work);
2157 INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2158 INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2159
2160 if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2161 opts->trsvcid =
2162 kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2163 if (!opts->trsvcid) {
2164 ret = -ENOMEM;
2165 goto out_free_ctrl;
2166 }
2167 opts->mask |= NVMF_OPT_TRSVCID;
2168 }
2169
2170 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2171 opts->traddr, opts->trsvcid, &ctrl->addr);
2172 if (ret) {
2173 pr_err("malformed address passed: %s:%s\n",
2174 opts->traddr, opts->trsvcid);
2175 goto out_free_ctrl;
2176 }
2177
2178 if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2179 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2180 opts->host_traddr, NULL, &ctrl->src_addr);
2181 if (ret) {
2182 pr_err("malformed src address passed: %s\n",
2183 opts->host_traddr);
2184 goto out_free_ctrl;
2185 }
2186 }
2187
2188 if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2189 ret = -EALREADY;
2190 goto out_free_ctrl;
2191 }
2192
2193 ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2194 GFP_KERNEL);
2195 if (!ctrl->queues) {
2196 ret = -ENOMEM;
2197 goto out_free_ctrl;
2198 }
2199
2200 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2201 if (ret)
2202 goto out_kfree_queues;
2203
2204 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2205 WARN_ON_ONCE(1);
2206 ret = -EINTR;
2207 goto out_uninit_ctrl;
2208 }
2209
2210 ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2211 if (ret)
2212 goto out_uninit_ctrl;
2213
2214 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2215 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2216
2217 nvme_get_ctrl(&ctrl->ctrl);
2218
2219 mutex_lock(&nvme_tcp_ctrl_mutex);
2220 list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2221 mutex_unlock(&nvme_tcp_ctrl_mutex);
2222
2223 return &ctrl->ctrl;
2224
2225out_uninit_ctrl:
2226 nvme_uninit_ctrl(&ctrl->ctrl);
2227 nvme_put_ctrl(&ctrl->ctrl);
2228 if (ret > 0)
2229 ret = -EIO;
2230 return ERR_PTR(ret);
2231out_kfree_queues:
2232 kfree(ctrl->queues);
2233out_free_ctrl:
2234 kfree(ctrl);
2235 return ERR_PTR(ret);
2236}
2237
2238static struct nvmf_transport_ops nvme_tcp_transport = {
2239 .name = "tcp",
2240 .module = THIS_MODULE,
2241 .required_opts = NVMF_OPT_TRADDR,
2242 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2243 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2244 NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2245 NVMF_OPT_NR_WRITE_QUEUES,
2246 .create_ctrl = nvme_tcp_create_ctrl,
2247};
2248
2249static int __init nvme_tcp_init_module(void)
2250{
2251 nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2252 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2253 if (!nvme_tcp_wq)
2254 return -ENOMEM;
2255
2256 nvmf_register_transport(&nvme_tcp_transport);
2257 return 0;
2258}
2259
2260static void __exit nvme_tcp_cleanup_module(void)
2261{
2262 struct nvme_tcp_ctrl *ctrl;
2263
2264 nvmf_unregister_transport(&nvme_tcp_transport);
2265
2266 mutex_lock(&nvme_tcp_ctrl_mutex);
2267 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2268 nvme_delete_ctrl(&ctrl->ctrl);
2269 mutex_unlock(&nvme_tcp_ctrl_mutex);
2270 flush_workqueue(nvme_delete_wq);
2271
2272 destroy_workqueue(nvme_tcp_wq);
2273}
2274
2275module_init(nvme_tcp_init_module);
2276module_exit(nvme_tcp_cleanup_module);
2277
2278MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 25b0e310f4a8..5566dda3237a 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -139,3 +139,6 @@ const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
139 139
140 return ret; 140 return ret;
141} 141}
142EXPORT_SYMBOL_GPL(nvme_trace_disk_name);
143
144EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 196d5bd56718..3564120aa7b3 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -115,8 +115,8 @@ TRACE_EVENT(nvme_setup_cmd,
115 __entry->nsid = le32_to_cpu(cmd->common.nsid); 115 __entry->nsid = le32_to_cpu(cmd->common.nsid);
116 __entry->metadata = le64_to_cpu(cmd->common.metadata); 116 __entry->metadata = le64_to_cpu(cmd->common.metadata);
117 __assign_disk_name(__entry->disk, req->rq_disk); 117 __assign_disk_name(__entry->disk, req->rq_disk);
118 memcpy(__entry->cdw10, cmd->common.cdw10, 118 memcpy(__entry->cdw10, &cmd->common.cdw10,
119 sizeof(__entry->cdw10)); 119 6 * sizeof(__entry->cdw10));
120 ), 120 ),
121 TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", 121 TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
122 __entry->ctrl_id, __print_disk_name(__entry->disk), 122 __entry->ctrl_id, __print_disk_name(__entry->disk),
@@ -184,6 +184,29 @@ TRACE_EVENT(nvme_async_event,
184 184
185#undef aer_name 185#undef aer_name
186 186
187TRACE_EVENT(nvme_sq,
188 TP_PROTO(struct request *req, __le16 sq_head, int sq_tail),
189 TP_ARGS(req, sq_head, sq_tail),
190 TP_STRUCT__entry(
191 __field(int, ctrl_id)
192 __array(char, disk, DISK_NAME_LEN)
193 __field(int, qid)
194 __field(u16, sq_head)
195 __field(u16, sq_tail)
196 ),
197 TP_fast_assign(
198 __entry->ctrl_id = nvme_req(req)->ctrl->instance;
199 __assign_disk_name(__entry->disk, req->rq_disk);
200 __entry->qid = nvme_req_qid(req);
201 __entry->sq_head = le16_to_cpu(sq_head);
202 __entry->sq_tail = sq_tail;
203 ),
204 TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u",
205 __entry->ctrl_id, __print_disk_name(__entry->disk),
206 __entry->qid, __entry->sq_head, __entry->sq_tail
207 )
208);
209
187#endif /* _TRACE_NVME_H */ 210#endif /* _TRACE_NVME_H */
188 211
189#undef TRACE_INCLUDE_PATH 212#undef TRACE_INCLUDE_PATH
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 3c7b61ddb0d1..d94f25cde019 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -60,3 +60,13 @@ config NVME_TARGET_FCLOOP
60 to test NVMe-FC transport interfaces. 60 to test NVMe-FC transport interfaces.
61 61
62 If unsure, say N. 62 If unsure, say N.
63
64config NVME_TARGET_TCP
65 tristate "NVMe over Fabrics TCP target support"
66 depends on INET
67 depends on NVME_TARGET
68 help
69 This enables the NVMe TCP target support, which allows exporting NVMe
70 devices over TCP.
71
72 If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 8118c93391c6..8c3ad0fb6860 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o
5obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o 5obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o
6obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o 6obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o
7obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o 7obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o
8obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
8 9
9nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ 10nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
10 discovery.o io-cmd-file.o io-cmd-bdev.o 11 discovery.o io-cmd-file.o io-cmd-bdev.o
@@ -12,3 +13,4 @@ nvme-loop-y += loop.o
12nvmet-rdma-y += rdma.o 13nvmet-rdma-y += rdma.o
13nvmet-fc-y += fc.o 14nvmet-fc-y += fc.o
14nvme-fcloop-y += fcloop.o 15nvme-fcloop-y += fcloop.o
16nvmet-tcp-y += tcp.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 1179f6314323..11baeb14c388 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -19,19 +19,6 @@
19#include <asm/unaligned.h> 19#include <asm/unaligned.h>
20#include "nvmet.h" 20#include "nvmet.h"
21 21
22/*
23 * This helper allows us to clear the AEN based on the RAE bit.
24 * Please use this helper when processing the log pages which are
25 * associated with the AEN.
26 */
27static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
28{
29 int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
30
31 if (!rae)
32 clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
33}
34
35u32 nvmet_get_log_page_len(struct nvme_command *cmd) 22u32 nvmet_get_log_page_len(struct nvme_command *cmd)
36{ 23{
37 u32 len = le16_to_cpu(cmd->get_log_page.numdu); 24 u32 len = le16_to_cpu(cmd->get_log_page.numdu);
@@ -50,6 +37,34 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
50 nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len)); 37 nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len));
51} 38}
52 39
40static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
41{
42 struct nvmet_ctrl *ctrl = req->sq->ctrl;
43 u16 status = NVME_SC_SUCCESS;
44 unsigned long flags;
45 off_t offset = 0;
46 u64 slot;
47 u64 i;
48
49 spin_lock_irqsave(&ctrl->error_lock, flags);
50 slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS;
51
52 for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) {
53 status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
54 sizeof(struct nvme_error_slot));
55 if (status)
56 break;
57
58 if (slot == 0)
59 slot = NVMET_ERROR_LOG_SLOTS - 1;
60 else
61 slot--;
62 offset += sizeof(struct nvme_error_slot);
63 }
64 spin_unlock_irqrestore(&ctrl->error_lock, flags);
65 nvmet_req_complete(req, status);
66}
67
53static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, 68static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
54 struct nvme_smart_log *slog) 69 struct nvme_smart_log *slog)
55{ 70{
@@ -60,6 +75,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
60 if (!ns) { 75 if (!ns) {
61 pr_err("Could not find namespace id : %d\n", 76 pr_err("Could not find namespace id : %d\n",
62 le32_to_cpu(req->cmd->get_log_page.nsid)); 77 le32_to_cpu(req->cmd->get_log_page.nsid));
78 req->error_loc = offsetof(struct nvme_rw_command, nsid);
63 return NVME_SC_INVALID_NS; 79 return NVME_SC_INVALID_NS;
64 } 80 }
65 81
@@ -119,6 +135,7 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
119{ 135{
120 struct nvme_smart_log *log; 136 struct nvme_smart_log *log;
121 u16 status = NVME_SC_INTERNAL; 137 u16 status = NVME_SC_INTERNAL;
138 unsigned long flags;
122 139
123 if (req->data_len != sizeof(*log)) 140 if (req->data_len != sizeof(*log))
124 goto out; 141 goto out;
@@ -134,6 +151,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
134 if (status) 151 if (status)
135 goto out_free_log; 152 goto out_free_log;
136 153
154 spin_lock_irqsave(&req->sq->ctrl->error_lock, flags);
155 put_unaligned_le64(req->sq->ctrl->err_counter,
156 &log->num_err_log_entries);
157 spin_unlock_irqrestore(&req->sq->ctrl->error_lock, flags);
158
137 status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); 159 status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
138out_free_log: 160out_free_log:
139 kfree(log); 161 kfree(log);
@@ -189,7 +211,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
189 if (!status) 211 if (!status)
190 status = nvmet_zero_sgl(req, len, req->data_len - len); 212 status = nvmet_zero_sgl(req, len, req->data_len - len);
191 ctrl->nr_changed_ns = 0; 213 ctrl->nr_changed_ns = 0;
192 nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR); 214 nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR);
193 mutex_unlock(&ctrl->lock); 215 mutex_unlock(&ctrl->lock);
194out: 216out:
195 nvmet_req_complete(req, status); 217 nvmet_req_complete(req, status);
@@ -252,7 +274,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
252 274
253 hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); 275 hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
254 hdr.ngrps = cpu_to_le16(ngrps); 276 hdr.ngrps = cpu_to_le16(ngrps);
255 nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE); 277 nvmet_clear_aen_bit(req, NVME_AEN_BIT_ANA_CHANGE);
256 up_read(&nvmet_ana_sem); 278 up_read(&nvmet_ana_sem);
257 279
258 kfree(desc); 280 kfree(desc);
@@ -304,7 +326,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
304 326
305 /* XXX: figure out what to do about RTD3R/RTD3 */ 327 /* XXX: figure out what to do about RTD3R/RTD3 */
306 id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); 328 id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
307 id->ctratt = cpu_to_le32(1 << 0); 329 id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT |
330 NVME_CTRL_ATTR_TBKAS);
308 331
309 id->oacs = 0; 332 id->oacs = 0;
310 333
@@ -392,6 +415,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
392 u16 status = 0; 415 u16 status = 0;
393 416
394 if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { 417 if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
418 req->error_loc = offsetof(struct nvme_identify, nsid);
395 status = NVME_SC_INVALID_NS | NVME_SC_DNR; 419 status = NVME_SC_INVALID_NS | NVME_SC_DNR;
396 goto out; 420 goto out;
397 } 421 }
@@ -512,6 +536,7 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req)
512 536
513 ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); 537 ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
514 if (!ns) { 538 if (!ns) {
539 req->error_loc = offsetof(struct nvme_identify, nsid);
515 status = NVME_SC_INVALID_NS | NVME_SC_DNR; 540 status = NVME_SC_INVALID_NS | NVME_SC_DNR;
516 goto out; 541 goto out;
517 } 542 }
@@ -569,13 +594,15 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req)
569 594
570static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) 595static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
571{ 596{
572 u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]); 597 u32 write_protect = le32_to_cpu(req->cmd->common.cdw11);
573 struct nvmet_subsys *subsys = req->sq->ctrl->subsys; 598 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
574 u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; 599 u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE;
575 600
576 req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid); 601 req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
577 if (unlikely(!req->ns)) 602 if (unlikely(!req->ns)) {
603 req->error_loc = offsetof(struct nvme_common_command, nsid);
578 return status; 604 return status;
605 }
579 606
580 mutex_lock(&subsys->lock); 607 mutex_lock(&subsys->lock);
581 switch (write_protect) { 608 switch (write_protect) {
@@ -599,11 +626,36 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
599 return status; 626 return status;
600} 627}
601 628
629u16 nvmet_set_feat_kato(struct nvmet_req *req)
630{
631 u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
632
633 req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
634
635 nvmet_set_result(req, req->sq->ctrl->kato);
636
637 return 0;
638}
639
640u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
641{
642 u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
643
644 if (val32 & ~mask) {
645 req->error_loc = offsetof(struct nvme_common_command, cdw11);
646 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
647 }
648
649 WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
650 nvmet_set_result(req, val32);
651
652 return 0;
653}
654
602static void nvmet_execute_set_features(struct nvmet_req *req) 655static void nvmet_execute_set_features(struct nvmet_req *req)
603{ 656{
604 struct nvmet_subsys *subsys = req->sq->ctrl->subsys; 657 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
605 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); 658 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
606 u32 val32;
607 u16 status = 0; 659 u16 status = 0;
608 660
609 switch (cdw10 & 0xff) { 661 switch (cdw10 & 0xff) {
@@ -612,19 +664,10 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
612 (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); 664 (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16));
613 break; 665 break;
614 case NVME_FEAT_KATO: 666 case NVME_FEAT_KATO:
615 val32 = le32_to_cpu(req->cmd->common.cdw10[1]); 667 status = nvmet_set_feat_kato(req);
616 req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
617 nvmet_set_result(req, req->sq->ctrl->kato);
618 break; 668 break;
619 case NVME_FEAT_ASYNC_EVENT: 669 case NVME_FEAT_ASYNC_EVENT:
620 val32 = le32_to_cpu(req->cmd->common.cdw10[1]); 670 status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL);
621 if (val32 & ~NVMET_AEN_CFG_ALL) {
622 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
623 break;
624 }
625
626 WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
627 nvmet_set_result(req, val32);
628 break; 671 break;
629 case NVME_FEAT_HOST_ID: 672 case NVME_FEAT_HOST_ID:
630 status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; 673 status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
@@ -633,6 +676,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
633 status = nvmet_set_feat_write_protect(req); 676 status = nvmet_set_feat_write_protect(req);
634 break; 677 break;
635 default: 678 default:
679 req->error_loc = offsetof(struct nvme_common_command, cdw10);
636 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 680 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
637 break; 681 break;
638 } 682 }
@@ -646,9 +690,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
646 u32 result; 690 u32 result;
647 691
648 req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid); 692 req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid);
649 if (!req->ns) 693 if (!req->ns) {
694 req->error_loc = offsetof(struct nvme_common_command, nsid);
650 return NVME_SC_INVALID_NS | NVME_SC_DNR; 695 return NVME_SC_INVALID_NS | NVME_SC_DNR;
651 696 }
652 mutex_lock(&subsys->lock); 697 mutex_lock(&subsys->lock);
653 if (req->ns->readonly == true) 698 if (req->ns->readonly == true)
654 result = NVME_NS_WRITE_PROTECT; 699 result = NVME_NS_WRITE_PROTECT;
@@ -660,10 +705,20 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
660 return 0; 705 return 0;
661} 706}
662 707
708void nvmet_get_feat_kato(struct nvmet_req *req)
709{
710 nvmet_set_result(req, req->sq->ctrl->kato * 1000);
711}
712
713void nvmet_get_feat_async_event(struct nvmet_req *req)
714{
715 nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
716}
717
663static void nvmet_execute_get_features(struct nvmet_req *req) 718static void nvmet_execute_get_features(struct nvmet_req *req)
664{ 719{
665 struct nvmet_subsys *subsys = req->sq->ctrl->subsys; 720 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
666 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); 721 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
667 u16 status = 0; 722 u16 status = 0;
668 723
669 switch (cdw10 & 0xff) { 724 switch (cdw10 & 0xff) {
@@ -689,7 +744,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
689 break; 744 break;
690#endif 745#endif
691 case NVME_FEAT_ASYNC_EVENT: 746 case NVME_FEAT_ASYNC_EVENT:
692 nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); 747 nvmet_get_feat_async_event(req);
693 break; 748 break;
694 case NVME_FEAT_VOLATILE_WC: 749 case NVME_FEAT_VOLATILE_WC:
695 nvmet_set_result(req, 1); 750 nvmet_set_result(req, 1);
@@ -699,11 +754,13 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
699 (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); 754 (subsys->max_qid-1) | ((subsys->max_qid-1) << 16));
700 break; 755 break;
701 case NVME_FEAT_KATO: 756 case NVME_FEAT_KATO:
702 nvmet_set_result(req, req->sq->ctrl->kato * 1000); 757 nvmet_get_feat_kato(req);
703 break; 758 break;
704 case NVME_FEAT_HOST_ID: 759 case NVME_FEAT_HOST_ID:
705 /* need 128-bit host identifier flag */ 760 /* need 128-bit host identifier flag */
706 if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) { 761 if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) {
762 req->error_loc =
763 offsetof(struct nvme_common_command, cdw11);
707 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 764 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
708 break; 765 break;
709 } 766 }
@@ -715,6 +772,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
715 status = nvmet_get_feat_write_protect(req); 772 status = nvmet_get_feat_write_protect(req);
716 break; 773 break;
717 default: 774 default:
775 req->error_loc =
776 offsetof(struct nvme_common_command, cdw10);
718 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 777 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
719 break; 778 break;
720 } 779 }
@@ -722,7 +781,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
722 nvmet_req_complete(req, status); 781 nvmet_req_complete(req, status);
723} 782}
724 783
725static void nvmet_execute_async_event(struct nvmet_req *req) 784void nvmet_execute_async_event(struct nvmet_req *req)
726{ 785{
727 struct nvmet_ctrl *ctrl = req->sq->ctrl; 786 struct nvmet_ctrl *ctrl = req->sq->ctrl;
728 787
@@ -738,7 +797,7 @@ static void nvmet_execute_async_event(struct nvmet_req *req)
738 schedule_work(&ctrl->async_event_work); 797 schedule_work(&ctrl->async_event_work);
739} 798}
740 799
741static void nvmet_execute_keep_alive(struct nvmet_req *req) 800void nvmet_execute_keep_alive(struct nvmet_req *req)
742{ 801{
743 struct nvmet_ctrl *ctrl = req->sq->ctrl; 802 struct nvmet_ctrl *ctrl = req->sq->ctrl;
744 803
@@ -764,13 +823,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
764 823
765 switch (cmd->get_log_page.lid) { 824 switch (cmd->get_log_page.lid) {
766 case NVME_LOG_ERROR: 825 case NVME_LOG_ERROR:
767 /* 826 req->execute = nvmet_execute_get_log_page_error;
768 * We currently never set the More bit in the status
769 * field, so all error log entries are invalid and can
770 * be zeroed out. This is called a minimum viable
771 * implementation (TM) of this mandatory log page.
772 */
773 req->execute = nvmet_execute_get_log_page_noop;
774 return 0; 827 return 0;
775 case NVME_LOG_SMART: 828 case NVME_LOG_SMART:
776 req->execute = nvmet_execute_get_log_page_smart; 829 req->execute = nvmet_execute_get_log_page_smart;
@@ -836,5 +889,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
836 889
837 pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, 890 pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
838 req->sq->qid); 891 req->sq->qid);
892 req->error_loc = offsetof(struct nvme_common_command, opcode);
839 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 893 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
840} 894}
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index d895579b6c5d..618bbd006544 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -25,12 +25,16 @@
25static const struct config_item_type nvmet_host_type; 25static const struct config_item_type nvmet_host_type;
26static const struct config_item_type nvmet_subsys_type; 26static const struct config_item_type nvmet_subsys_type;
27 27
28static LIST_HEAD(nvmet_ports_list);
29struct list_head *nvmet_ports = &nvmet_ports_list;
30
28static const struct nvmet_transport_name { 31static const struct nvmet_transport_name {
29 u8 type; 32 u8 type;
30 const char *name; 33 const char *name;
31} nvmet_transport_names[] = { 34} nvmet_transport_names[] = {
32 { NVMF_TRTYPE_RDMA, "rdma" }, 35 { NVMF_TRTYPE_RDMA, "rdma" },
33 { NVMF_TRTYPE_FC, "fc" }, 36 { NVMF_TRTYPE_FC, "fc" },
37 { NVMF_TRTYPE_TCP, "tcp" },
34 { NVMF_TRTYPE_LOOP, "loop" }, 38 { NVMF_TRTYPE_LOOP, "loop" },
35}; 39};
36 40
@@ -150,7 +154,8 @@ CONFIGFS_ATTR(nvmet_, addr_traddr);
150static ssize_t nvmet_addr_treq_show(struct config_item *item, 154static ssize_t nvmet_addr_treq_show(struct config_item *item,
151 char *page) 155 char *page)
152{ 156{
153 switch (to_nvmet_port(item)->disc_addr.treq) { 157 switch (to_nvmet_port(item)->disc_addr.treq &
158 NVME_TREQ_SECURE_CHANNEL_MASK) {
154 case NVMF_TREQ_NOT_SPECIFIED: 159 case NVMF_TREQ_NOT_SPECIFIED:
155 return sprintf(page, "not specified\n"); 160 return sprintf(page, "not specified\n");
156 case NVMF_TREQ_REQUIRED: 161 case NVMF_TREQ_REQUIRED:
@@ -166,6 +171,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
166 const char *page, size_t count) 171 const char *page, size_t count)
167{ 172{
168 struct nvmet_port *port = to_nvmet_port(item); 173 struct nvmet_port *port = to_nvmet_port(item);
174 u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK;
169 175
170 if (port->enabled) { 176 if (port->enabled) {
171 pr_err("Cannot modify address while enabled\n"); 177 pr_err("Cannot modify address while enabled\n");
@@ -174,15 +180,16 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
174 } 180 }
175 181
176 if (sysfs_streq(page, "not specified")) { 182 if (sysfs_streq(page, "not specified")) {
177 port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; 183 treq |= NVMF_TREQ_NOT_SPECIFIED;
178 } else if (sysfs_streq(page, "required")) { 184 } else if (sysfs_streq(page, "required")) {
179 port->disc_addr.treq = NVMF_TREQ_REQUIRED; 185 treq |= NVMF_TREQ_REQUIRED;
180 } else if (sysfs_streq(page, "not required")) { 186 } else if (sysfs_streq(page, "not required")) {
181 port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; 187 treq |= NVMF_TREQ_NOT_REQUIRED;
182 } else { 188 } else {
183 pr_err("Invalid value '%s' for treq\n", page); 189 pr_err("Invalid value '%s' for treq\n", page);
184 return -EINVAL; 190 return -EINVAL;
185 } 191 }
192 port->disc_addr.treq = treq;
186 193
187 return count; 194 return count;
188} 195}
@@ -646,7 +653,8 @@ static int nvmet_port_subsys_allow_link(struct config_item *parent,
646 } 653 }
647 654
648 list_add_tail(&link->entry, &port->subsystems); 655 list_add_tail(&link->entry, &port->subsystems);
649 nvmet_genctr++; 656 nvmet_port_disc_changed(port, subsys);
657
650 up_write(&nvmet_config_sem); 658 up_write(&nvmet_config_sem);
651 return 0; 659 return 0;
652 660
@@ -673,7 +681,8 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent,
673 681
674found: 682found:
675 list_del(&p->entry); 683 list_del(&p->entry);
676 nvmet_genctr++; 684 nvmet_port_disc_changed(port, subsys);
685
677 if (list_empty(&port->subsystems)) 686 if (list_empty(&port->subsystems))
678 nvmet_disable_port(port); 687 nvmet_disable_port(port);
679 up_write(&nvmet_config_sem); 688 up_write(&nvmet_config_sem);
@@ -722,7 +731,8 @@ static int nvmet_allowed_hosts_allow_link(struct config_item *parent,
722 goto out_free_link; 731 goto out_free_link;
723 } 732 }
724 list_add_tail(&link->entry, &subsys->hosts); 733 list_add_tail(&link->entry, &subsys->hosts);
725 nvmet_genctr++; 734 nvmet_subsys_disc_changed(subsys, host);
735
726 up_write(&nvmet_config_sem); 736 up_write(&nvmet_config_sem);
727 return 0; 737 return 0;
728out_free_link: 738out_free_link:
@@ -748,7 +758,8 @@ static void nvmet_allowed_hosts_drop_link(struct config_item *parent,
748 758
749found: 759found:
750 list_del(&p->entry); 760 list_del(&p->entry);
751 nvmet_genctr++; 761 nvmet_subsys_disc_changed(subsys, host);
762
752 up_write(&nvmet_config_sem); 763 up_write(&nvmet_config_sem);
753 kfree(p); 764 kfree(p);
754} 765}
@@ -787,7 +798,11 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item,
787 goto out_unlock; 798 goto out_unlock;
788 } 799 }
789 800
790 subsys->allow_any_host = allow_any_host; 801 if (subsys->allow_any_host != allow_any_host) {
802 subsys->allow_any_host = allow_any_host;
803 nvmet_subsys_disc_changed(subsys, NULL);
804 }
805
791out_unlock: 806out_unlock:
792 up_write(&nvmet_config_sem); 807 up_write(&nvmet_config_sem);
793 return ret ? ret : count; 808 return ret ? ret : count;
@@ -936,7 +951,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item,
936 if (enable) 951 if (enable)
937 nvmet_referral_enable(parent, port); 952 nvmet_referral_enable(parent, port);
938 else 953 else
939 nvmet_referral_disable(port); 954 nvmet_referral_disable(parent, port);
940 955
941 return count; 956 return count;
942inval: 957inval:
@@ -962,9 +977,10 @@ static struct configfs_attribute *nvmet_referral_attrs[] = {
962 977
963static void nvmet_referral_release(struct config_item *item) 978static void nvmet_referral_release(struct config_item *item)
964{ 979{
980 struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent);
965 struct nvmet_port *port = to_nvmet_port(item); 981 struct nvmet_port *port = to_nvmet_port(item);
966 982
967 nvmet_referral_disable(port); 983 nvmet_referral_disable(parent, port);
968 kfree(port); 984 kfree(port);
969} 985}
970 986
@@ -1137,6 +1153,8 @@ static void nvmet_port_release(struct config_item *item)
1137{ 1153{
1138 struct nvmet_port *port = to_nvmet_port(item); 1154 struct nvmet_port *port = to_nvmet_port(item);
1139 1155
1156 list_del(&port->global_entry);
1157
1140 kfree(port->ana_state); 1158 kfree(port->ana_state);
1141 kfree(port); 1159 kfree(port);
1142} 1160}
@@ -1189,12 +1207,15 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
1189 port->ana_state[i] = NVME_ANA_INACCESSIBLE; 1207 port->ana_state[i] = NVME_ANA_INACCESSIBLE;
1190 } 1208 }
1191 1209
1210 list_add(&port->global_entry, &nvmet_ports_list);
1211
1192 INIT_LIST_HEAD(&port->entry); 1212 INIT_LIST_HEAD(&port->entry);
1193 INIT_LIST_HEAD(&port->subsystems); 1213 INIT_LIST_HEAD(&port->subsystems);
1194 INIT_LIST_HEAD(&port->referrals); 1214 INIT_LIST_HEAD(&port->referrals);
1195 port->inline_data_size = -1; /* < 0 == let the transport choose */ 1215 port->inline_data_size = -1; /* < 0 == let the transport choose */
1196 1216
1197 port->disc_addr.portid = cpu_to_le16(portid); 1217 port->disc_addr.portid = cpu_to_le16(portid);
1218 port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW;
1198 config_group_init_type_name(&port->group, name, &nvmet_port_type); 1219 config_group_init_type_name(&port->group, name, &nvmet_port_type);
1199 1220
1200 config_group_init_type_name(&port->subsys_group, 1221 config_group_init_type_name(&port->subsys_group,
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index a5f9bbce863f..88d260f31835 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -45,28 +45,72 @@ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
45u64 nvmet_ana_chgcnt; 45u64 nvmet_ana_chgcnt;
46DECLARE_RWSEM(nvmet_ana_sem); 46DECLARE_RWSEM(nvmet_ana_sem);
47 47
48inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
49{
50 u16 status;
51
52 switch (errno) {
53 case -ENOSPC:
54 req->error_loc = offsetof(struct nvme_rw_command, length);
55 status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
56 break;
57 case -EREMOTEIO:
58 req->error_loc = offsetof(struct nvme_rw_command, slba);
59 status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
60 break;
61 case -EOPNOTSUPP:
62 req->error_loc = offsetof(struct nvme_common_command, opcode);
63 switch (req->cmd->common.opcode) {
64 case nvme_cmd_dsm:
65 case nvme_cmd_write_zeroes:
66 status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
67 break;
68 default:
69 status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
70 }
71 break;
72 case -ENODATA:
73 req->error_loc = offsetof(struct nvme_rw_command, nsid);
74 status = NVME_SC_ACCESS_DENIED;
75 break;
76 case -EIO:
77 /* FALLTHRU */
78 default:
79 req->error_loc = offsetof(struct nvme_common_command, opcode);
80 status = NVME_SC_INTERNAL | NVME_SC_DNR;
81 }
82
83 return status;
84}
85
48static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, 86static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
49 const char *subsysnqn); 87 const char *subsysnqn);
50 88
51u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, 89u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
52 size_t len) 90 size_t len)
53{ 91{
54 if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) 92 if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
93 req->error_loc = offsetof(struct nvme_common_command, dptr);
55 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; 94 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
95 }
56 return 0; 96 return 0;
57} 97}
58 98
59u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) 99u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
60{ 100{
61 if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) 101 if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
102 req->error_loc = offsetof(struct nvme_common_command, dptr);
62 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; 103 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
104 }
63 return 0; 105 return 0;
64} 106}
65 107
66u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) 108u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
67{ 109{
68 if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) 110 if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
111 req->error_loc = offsetof(struct nvme_common_command, dptr);
69 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; 112 return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
113 }
70 return 0; 114 return 0;
71} 115}
72 116
@@ -130,7 +174,7 @@ static void nvmet_async_event_work(struct work_struct *work)
130 } 174 }
131} 175}
132 176
133static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, 177void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
134 u8 event_info, u8 log_page) 178 u8 event_info, u8 log_page)
135{ 179{
136 struct nvmet_async_event *aen; 180 struct nvmet_async_event *aen;
@@ -150,13 +194,6 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
150 schedule_work(&ctrl->async_event_work); 194 schedule_work(&ctrl->async_event_work);
151} 195}
152 196
153static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen)
154{
155 if (!(READ_ONCE(ctrl->aen_enabled) & aen))
156 return true;
157 return test_and_set_bit(aen, &ctrl->aen_masked);
158}
159
160static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid) 197static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
161{ 198{
162 u32 i; 199 u32 i;
@@ -187,7 +224,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
187 224
188 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { 225 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
189 nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); 226 nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
190 if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR)) 227 if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
191 continue; 228 continue;
192 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 229 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
193 NVME_AER_NOTICE_NS_CHANGED, 230 NVME_AER_NOTICE_NS_CHANGED,
@@ -204,7 +241,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
204 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { 241 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
205 if (port && ctrl->port != port) 242 if (port && ctrl->port != port)
206 continue; 243 continue;
207 if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) 244 if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
208 continue; 245 continue;
209 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 246 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
210 NVME_AER_NOTICE_ANA, NVME_LOG_ANA); 247 NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
@@ -299,6 +336,15 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
299{ 336{
300 struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), 337 struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
301 struct nvmet_ctrl, ka_work); 338 struct nvmet_ctrl, ka_work);
339 bool cmd_seen = ctrl->cmd_seen;
340
341 ctrl->cmd_seen = false;
342 if (cmd_seen) {
343 pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
344 ctrl->cntlid);
345 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
346 return;
347 }
302 348
303 pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", 349 pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
304 ctrl->cntlid, ctrl->kato); 350 ctrl->cntlid, ctrl->kato);
@@ -595,26 +641,58 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
595 return ns; 641 return ns;
596} 642}
597 643
598static void __nvmet_req_complete(struct nvmet_req *req, u16 status) 644static void nvmet_update_sq_head(struct nvmet_req *req)
599{ 645{
600 u32 old_sqhd, new_sqhd;
601 u16 sqhd;
602
603 if (status)
604 nvmet_set_status(req, status);
605
606 if (req->sq->size) { 646 if (req->sq->size) {
647 u32 old_sqhd, new_sqhd;
648
607 do { 649 do {
608 old_sqhd = req->sq->sqhd; 650 old_sqhd = req->sq->sqhd;
609 new_sqhd = (old_sqhd + 1) % req->sq->size; 651 new_sqhd = (old_sqhd + 1) % req->sq->size;
610 } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != 652 } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
611 old_sqhd); 653 old_sqhd);
612 } 654 }
613 sqhd = req->sq->sqhd & 0x0000FFFF; 655 req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
614 req->rsp->sq_head = cpu_to_le16(sqhd); 656}
657
658static void nvmet_set_error(struct nvmet_req *req, u16 status)
659{
660 struct nvmet_ctrl *ctrl = req->sq->ctrl;
661 struct nvme_error_slot *new_error_slot;
662 unsigned long flags;
663
664 req->rsp->status = cpu_to_le16(status << 1);
665
666 if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
667 return;
668
669 spin_lock_irqsave(&ctrl->error_lock, flags);
670 ctrl->err_counter++;
671 new_error_slot =
672 &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
673
674 new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
675 new_error_slot->sqid = cpu_to_le16(req->sq->qid);
676 new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
677 new_error_slot->status_field = cpu_to_le16(status << 1);
678 new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
679 new_error_slot->lba = cpu_to_le64(req->error_slba);
680 new_error_slot->nsid = req->cmd->common.nsid;
681 spin_unlock_irqrestore(&ctrl->error_lock, flags);
682
683 /* set the more bit for this request */
684 req->rsp->status |= cpu_to_le16(1 << 14);
685}
686
687static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
688{
689 if (!req->sq->sqhd_disabled)
690 nvmet_update_sq_head(req);
615 req->rsp->sq_id = cpu_to_le16(req->sq->qid); 691 req->rsp->sq_id = cpu_to_le16(req->sq->qid);
616 req->rsp->command_id = req->cmd->common.command_id; 692 req->rsp->command_id = req->cmd->common.command_id;
617 693
694 if (unlikely(status))
695 nvmet_set_error(req, status);
618 if (req->ns) 696 if (req->ns)
619 nvmet_put_namespace(req->ns); 697 nvmet_put_namespace(req->ns);
620 req->ops->queue_response(req); 698 req->ops->queue_response(req);
@@ -735,14 +813,20 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
735 return ret; 813 return ret;
736 814
737 req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); 815 req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
738 if (unlikely(!req->ns)) 816 if (unlikely(!req->ns)) {
817 req->error_loc = offsetof(struct nvme_common_command, nsid);
739 return NVME_SC_INVALID_NS | NVME_SC_DNR; 818 return NVME_SC_INVALID_NS | NVME_SC_DNR;
819 }
740 ret = nvmet_check_ana_state(req->port, req->ns); 820 ret = nvmet_check_ana_state(req->port, req->ns);
741 if (unlikely(ret)) 821 if (unlikely(ret)) {
822 req->error_loc = offsetof(struct nvme_common_command, nsid);
742 return ret; 823 return ret;
824 }
743 ret = nvmet_io_cmd_check_access(req); 825 ret = nvmet_io_cmd_check_access(req);
744 if (unlikely(ret)) 826 if (unlikely(ret)) {
827 req->error_loc = offsetof(struct nvme_common_command, nsid);
745 return ret; 828 return ret;
829 }
746 830
747 if (req->ns->file) 831 if (req->ns->file)
748 return nvmet_file_parse_io_cmd(req); 832 return nvmet_file_parse_io_cmd(req);
@@ -763,10 +847,14 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
763 req->sg_cnt = 0; 847 req->sg_cnt = 0;
764 req->transfer_len = 0; 848 req->transfer_len = 0;
765 req->rsp->status = 0; 849 req->rsp->status = 0;
850 req->rsp->sq_head = 0;
766 req->ns = NULL; 851 req->ns = NULL;
852 req->error_loc = NVMET_NO_ERROR_LOC;
853 req->error_slba = 0;
767 854
768 /* no support for fused commands yet */ 855 /* no support for fused commands yet */
769 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { 856 if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
857 req->error_loc = offsetof(struct nvme_common_command, flags);
770 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 858 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
771 goto fail; 859 goto fail;
772 } 860 }
@@ -777,6 +865,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
777 * byte aligned. 865 * byte aligned.
778 */ 866 */
779 if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { 867 if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
868 req->error_loc = offsetof(struct nvme_common_command, flags);
780 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 869 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
781 goto fail; 870 goto fail;
782 } 871 }
@@ -801,6 +890,9 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
801 goto fail; 890 goto fail;
802 } 891 }
803 892
893 if (sq->ctrl)
894 sq->ctrl->cmd_seen = true;
895
804 return true; 896 return true;
805 897
806fail: 898fail:
@@ -819,9 +911,10 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit);
819 911
820void nvmet_req_execute(struct nvmet_req *req) 912void nvmet_req_execute(struct nvmet_req *req)
821{ 913{
822 if (unlikely(req->data_len != req->transfer_len)) 914 if (unlikely(req->data_len != req->transfer_len)) {
915 req->error_loc = offsetof(struct nvme_common_command, dptr);
823 nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); 916 nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
824 else 917 } else
825 req->execute(req); 918 req->execute(req);
826} 919}
827EXPORT_SYMBOL_GPL(nvmet_req_execute); 920EXPORT_SYMBOL_GPL(nvmet_req_execute);
@@ -1027,14 +1120,18 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
1027 return 0; 1120 return 0;
1028} 1121}
1029 1122
1030static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, 1123bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
1031 const char *hostnqn)
1032{ 1124{
1033 struct nvmet_host_link *p; 1125 struct nvmet_host_link *p;
1034 1126
1127 lockdep_assert_held(&nvmet_config_sem);
1128
1035 if (subsys->allow_any_host) 1129 if (subsys->allow_any_host)
1036 return true; 1130 return true;
1037 1131
1132 if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
1133 return true;
1134
1038 list_for_each_entry(p, &subsys->hosts, entry) { 1135 list_for_each_entry(p, &subsys->hosts, entry) {
1039 if (!strcmp(nvmet_host_name(p->host), hostnqn)) 1136 if (!strcmp(nvmet_host_name(p->host), hostnqn))
1040 return true; 1137 return true;
@@ -1043,30 +1140,6 @@ static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
1043 return false; 1140 return false;
1044} 1141}
1045 1142
1046static bool nvmet_host_discovery_allowed(struct nvmet_req *req,
1047 const char *hostnqn)
1048{
1049 struct nvmet_subsys_link *s;
1050
1051 list_for_each_entry(s, &req->port->subsystems, entry) {
1052 if (__nvmet_host_allowed(s->subsys, hostnqn))
1053 return true;
1054 }
1055
1056 return false;
1057}
1058
1059bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
1060 const char *hostnqn)
1061{
1062 lockdep_assert_held(&nvmet_config_sem);
1063
1064 if (subsys->type == NVME_NQN_DISC)
1065 return nvmet_host_discovery_allowed(req, hostnqn);
1066 else
1067 return __nvmet_host_allowed(subsys, hostnqn);
1068}
1069
1070/* 1143/*
1071 * Note: ctrl->subsys->lock should be held when calling this function 1144 * Note: ctrl->subsys->lock should be held when calling this function
1072 */ 1145 */
@@ -1117,7 +1190,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1117 1190
1118 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; 1191 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1119 down_read(&nvmet_config_sem); 1192 down_read(&nvmet_config_sem);
1120 if (!nvmet_host_allowed(req, subsys, hostnqn)) { 1193 if (!nvmet_host_allowed(subsys, hostnqn)) {
1121 pr_info("connect by host %s for subsystem %s not allowed\n", 1194 pr_info("connect by host %s for subsystem %s not allowed\n",
1122 hostnqn, subsysnqn); 1195 hostnqn, subsysnqn);
1123 req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); 1196 req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
@@ -1175,31 +1248,20 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1175 ctrl->cntlid = ret; 1248 ctrl->cntlid = ret;
1176 1249
1177 ctrl->ops = req->ops; 1250 ctrl->ops = req->ops;
1178 if (ctrl->subsys->type == NVME_NQN_DISC) {
1179 /* Don't accept keep-alive timeout for discovery controllers */
1180 if (kato) {
1181 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
1182 goto out_remove_ida;
1183 }
1184 1251
1185 /* 1252 /*
1186 * Discovery controllers use some arbitrary high value in order 1253 * Discovery controllers may use some arbitrary high value
1187 * to cleanup stale discovery sessions 1254 * in order to cleanup stale discovery sessions
1188 * 1255 */
1189 * From the latest base diff RC: 1256 if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
1190 * "The Keep Alive command is not supported by 1257 kato = NVMET_DISC_KATO_MS;
1191 * Discovery controllers. A transport may specify a 1258
1192 * fixed Discovery controller activity timeout value 1259 /* keep-alive timeout in seconds */
1193 * (e.g., 2 minutes). If no commands are received 1260 ctrl->kato = DIV_ROUND_UP(kato, 1000);
1194 * by a Discovery controller within that time 1261
1195 * period, the controller may perform the 1262 ctrl->err_counter = 0;
1196 * actions for Keep Alive Timer expiration". 1263 spin_lock_init(&ctrl->error_lock);
1197 */ 1264
1198 ctrl->kato = NVMET_DISC_KATO;
1199 } else {
1200 /* keep-alive timeout in seconds */
1201 ctrl->kato = DIV_ROUND_UP(kato, 1000);
1202 }
1203 nvmet_start_keep_alive_timer(ctrl); 1265 nvmet_start_keep_alive_timer(ctrl);
1204 1266
1205 mutex_lock(&subsys->lock); 1267 mutex_lock(&subsys->lock);
@@ -1210,8 +1272,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1210 *ctrlp = ctrl; 1272 *ctrlp = ctrl;
1211 return 0; 1273 return 0;
1212 1274
1213out_remove_ida:
1214 ida_simple_remove(&cntlid_ida, ctrl->cntlid);
1215out_free_sqs: 1275out_free_sqs:
1216 kfree(ctrl->sqs); 1276 kfree(ctrl->sqs);
1217out_free_cqs: 1277out_free_cqs:
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index bc0aa0bf1543..d2cb71a0b419 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -18,7 +18,65 @@
18 18
19struct nvmet_subsys *nvmet_disc_subsys; 19struct nvmet_subsys *nvmet_disc_subsys;
20 20
21u64 nvmet_genctr; 21static u64 nvmet_genctr;
22
23static void __nvmet_disc_changed(struct nvmet_port *port,
24 struct nvmet_ctrl *ctrl)
25{
26 if (ctrl->port != port)
27 return;
28
29 if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE))
30 return;
31
32 nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
33 NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC);
34}
35
36void nvmet_port_disc_changed(struct nvmet_port *port,
37 struct nvmet_subsys *subsys)
38{
39 struct nvmet_ctrl *ctrl;
40
41 nvmet_genctr++;
42
43 list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
44 if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn))
45 continue;
46
47 __nvmet_disc_changed(port, ctrl);
48 }
49}
50
51static void __nvmet_subsys_disc_changed(struct nvmet_port *port,
52 struct nvmet_subsys *subsys,
53 struct nvmet_host *host)
54{
55 struct nvmet_ctrl *ctrl;
56
57 list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
58 if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn))
59 continue;
60
61 __nvmet_disc_changed(port, ctrl);
62 }
63}
64
65void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
66 struct nvmet_host *host)
67{
68 struct nvmet_port *port;
69 struct nvmet_subsys_link *s;
70
71 nvmet_genctr++;
72
73 list_for_each_entry(port, nvmet_ports, global_entry)
74 list_for_each_entry(s, &port->subsystems, entry) {
75 if (s->subsys != subsys)
76 continue;
77 __nvmet_subsys_disc_changed(port, subsys, host);
78 }
79}
22 80
23void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) 81void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
24{ 82{
@@ -26,18 +84,18 @@ void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
26 if (list_empty(&port->entry)) { 84 if (list_empty(&port->entry)) {
27 list_add_tail(&port->entry, &parent->referrals); 85 list_add_tail(&port->entry, &parent->referrals);
28 port->enabled = true; 86 port->enabled = true;
29 nvmet_genctr++; 87 nvmet_port_disc_changed(parent, NULL);
30 } 88 }
31 up_write(&nvmet_config_sem); 89 up_write(&nvmet_config_sem);
32} 90}
33 91
34void nvmet_referral_disable(struct nvmet_port *port) 92void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port)
35{ 93{
36 down_write(&nvmet_config_sem); 94 down_write(&nvmet_config_sem);
37 if (!list_empty(&port->entry)) { 95 if (!list_empty(&port->entry)) {
38 port->enabled = false; 96 port->enabled = false;
39 list_del_init(&port->entry); 97 list_del_init(&port->entry);
40 nvmet_genctr++; 98 nvmet_port_disc_changed(parent, NULL);
41 } 99 }
42 up_write(&nvmet_config_sem); 100 up_write(&nvmet_config_sem);
43} 101}
@@ -107,7 +165,7 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
107 165
108 down_read(&nvmet_config_sem); 166 down_read(&nvmet_config_sem);
109 list_for_each_entry(p, &req->port->subsystems, entry) { 167 list_for_each_entry(p, &req->port->subsystems, entry) {
110 if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) 168 if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
111 continue; 169 continue;
112 if (residual_len >= entry_size) { 170 if (residual_len >= entry_size) {
113 char traddr[NVMF_TRADDR_SIZE]; 171 char traddr[NVMF_TRADDR_SIZE];
@@ -136,6 +194,8 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
136 hdr->numrec = cpu_to_le64(numrec); 194 hdr->numrec = cpu_to_le64(numrec);
137 hdr->recfmt = cpu_to_le16(0); 195 hdr->recfmt = cpu_to_le16(0);
138 196
197 nvmet_clear_aen_bit(req, NVME_AEN_BIT_DISC_CHANGE);
198
139 up_read(&nvmet_config_sem); 199 up_read(&nvmet_config_sem);
140 200
141 status = nvmet_copy_to_sgl(req, 0, hdr, data_len); 201 status = nvmet_copy_to_sgl(req, 0, hdr, data_len);
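The nvmet_clear_aen_bit() call added above re-arms the discovery-change notice only when the host left the Retain Asynchronous Event (RAE) bit clear in the Get Log Page command. For illustration only, a standalone C sketch of unpacking the relevant CDW10 fields (per the NVMe Get Log Page layout: LID in bits 7:0, RAE in bit 15, NUMDL in bits 31:16; the struct name and example value below are mine, not kernel definitions):

#include <stdint.h>
#include <stdio.h>

/* Get Log Page CDW10 (illustrative field names): LID bits 7:0,
 * RAE bit 15, NUMDL bits 31:16 */
struct get_log_page_fields {
        unsigned int lid;
        unsigned int rae;
        unsigned int numdl;
};

static struct get_log_page_fields unpack_cdw10(uint32_t cdw10)
{
        struct get_log_page_fields f = {
                .lid   = cdw10 & 0xff,
                .rae   = (cdw10 >> 15) & 1,
                .numdl = cdw10 >> 16,
        };
        return f;
}

int main(void)
{
        /* example: lid 0x70 (discovery log), RAE set, NUMDL 3 */
        struct get_log_page_fields f = unpack_cdw10(0x00038070);

        printf("lid=%#x rae=%u numdl=%u\n", f.lid, f.rae, f.numdl);
        return 0;
}

With RAE set, the masked bit stays set and no further notice of that type is sent until a later read of the log page with RAE clear re-arms it.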
@@ -174,6 +234,8 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
174 if (req->port->inline_data_size) 234 if (req->port->inline_data_size)
175 id->sgls |= cpu_to_le32(1 << 20); 235 id->sgls |= cpu_to_le32(1 << 20);
176 236
237 id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL);
238
177 strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); 239 strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn));
178 240
179 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); 241 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
@@ -183,6 +245,51 @@ out:
183 nvmet_req_complete(req, status); 245 nvmet_req_complete(req, status);
184} 246}
185 247
248static void nvmet_execute_disc_set_features(struct nvmet_req *req)
249{
250 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
251 u16 stat;
252
253 switch (cdw10 & 0xff) {
254 case NVME_FEAT_KATO:
255 stat = nvmet_set_feat_kato(req);
256 break;
257 case NVME_FEAT_ASYNC_EVENT:
258 stat = nvmet_set_feat_async_event(req,
259 NVMET_DISC_AEN_CFG_OPTIONAL);
260 break;
261 default:
262 req->error_loc =
263 offsetof(struct nvme_common_command, cdw10);
264 stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
265 break;
266 }
267
268 nvmet_req_complete(req, stat);
269}
270
271static void nvmet_execute_disc_get_features(struct nvmet_req *req)
272{
273 u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
274 u16 stat = 0;
275
276 switch (cdw10 & 0xff) {
277 case NVME_FEAT_KATO:
278 nvmet_get_feat_kato(req);
279 break;
280 case NVME_FEAT_ASYNC_EVENT:
281 nvmet_get_feat_async_event(req);
282 break;
283 default:
284 req->error_loc =
285 offsetof(struct nvme_common_command, cdw10);
286 stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
287 break;
288 }
289
290 nvmet_req_complete(req, stat);
291}
292
186u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) 293u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
187{ 294{
188 struct nvme_command *cmd = req->cmd; 295 struct nvme_command *cmd = req->cmd;
@@ -190,10 +297,28 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
190 if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { 297 if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
191 pr_err("got cmd %d while not ready\n", 298 pr_err("got cmd %d while not ready\n",
192 cmd->common.opcode); 299 cmd->common.opcode);
300 req->error_loc =
301 offsetof(struct nvme_common_command, opcode);
193 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 302 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
194 } 303 }
195 304
196 switch (cmd->common.opcode) { 305 switch (cmd->common.opcode) {
306 case nvme_admin_set_features:
307 req->execute = nvmet_execute_disc_set_features;
308 req->data_len = 0;
309 return 0;
310 case nvme_admin_get_features:
311 req->execute = nvmet_execute_disc_get_features;
312 req->data_len = 0;
313 return 0;
314 case nvme_admin_async_event:
315 req->execute = nvmet_execute_async_event;
316 req->data_len = 0;
317 return 0;
318 case nvme_admin_keep_alive:
319 req->execute = nvmet_execute_keep_alive;
320 req->data_len = 0;
321 return 0;
197 case nvme_admin_get_log_page: 322 case nvme_admin_get_log_page:
198 req->data_len = nvmet_get_log_page_len(cmd); 323 req->data_len = nvmet_get_log_page_len(cmd);
199 324
@@ -204,6 +329,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
204 default: 329 default:
205 pr_err("unsupported get_log_page lid %d\n", 330 pr_err("unsupported get_log_page lid %d\n",
206 cmd->get_log_page.lid); 331 cmd->get_log_page.lid);
332 req->error_loc =
333 offsetof(struct nvme_get_log_page_command, lid);
207 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 334 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
208 } 335 }
209 case nvme_admin_identify: 336 case nvme_admin_identify:
@@ -216,10 +343,12 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
216 default: 343 default:
217 pr_err("unsupported identify cns %d\n", 344 pr_err("unsupported identify cns %d\n",
218 cmd->identify.cns); 345 cmd->identify.cns);
346 req->error_loc = offsetof(struct nvme_identify, cns);
219 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 347 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
220 } 348 }
221 default: 349 default:
222 pr_err("unhandled cmd %d\n", cmd->common.opcode); 350 pr_err("unhandled cmd %d\n", cmd->common.opcode);
351 req->error_loc = offsetof(struct nvme_common_command, opcode);
223 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 352 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
224 } 353 }
225 354
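Most of the req->error_loc assignments introduced in this file follow one pattern: store the byte offset of the offending field within the submitted command so it can later be surfaced in the Error Information log (error_loc and error_slba are added to struct nvmet_req further down in this series). A minimal userspace sketch of the idea, using a simplified stand-in for the 64-byte submission queue entry rather than the real nvme_common_command:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* simplified stand-in for a 64-byte submission queue entry */
struct fake_common_command {
        uint8_t  opcode;        /* byte 0 */
        uint8_t  flags;         /* byte 1 */
        uint16_t command_id;    /* bytes 2-3 */
        uint32_t nsid;          /* bytes 4-7 */
        uint8_t  rsvd[32];      /* bytes 8-39 */
        uint32_t cdw10;         /* bytes 40-43 */
        uint32_t cdw11;
        uint32_t cdw12;
        uint32_t cdw13;
        uint32_t cdw14;
        uint32_t cdw15;
};

int main(void)
{
        /* an unsupported opcode would be reported with error_loc = 0,
         * a bad feature identifier with error_loc = offset of cdw10 */
        printf("opcode at byte %zu, cdw10 at byte %zu\n",
               offsetof(struct fake_common_command, opcode),
               offsetof(struct fake_common_command, cdw10));
        return 0;
}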
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index d84ae004cb85..6cf1fd9eb32e 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -17,23 +17,26 @@
17 17
18static void nvmet_execute_prop_set(struct nvmet_req *req) 18static void nvmet_execute_prop_set(struct nvmet_req *req)
19{ 19{
20 u64 val = le64_to_cpu(req->cmd->prop_set.value);
20 u16 status = 0; 21 u16 status = 0;
21 22
22 if (!(req->cmd->prop_set.attrib & 1)) { 23 if (req->cmd->prop_set.attrib & 1) {
23 u64 val = le64_to_cpu(req->cmd->prop_set.value); 24 req->error_loc =
24 25 offsetof(struct nvmf_property_set_command, attrib);
25 switch (le32_to_cpu(req->cmd->prop_set.offset)) {
26 case NVME_REG_CC:
27 nvmet_update_cc(req->sq->ctrl, val);
28 break;
29 default:
30 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
31 break;
32 }
33 } else {
34 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 26 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
27 goto out;
35 } 28 }
36 29
30 switch (le32_to_cpu(req->cmd->prop_set.offset)) {
31 case NVME_REG_CC:
32 nvmet_update_cc(req->sq->ctrl, val);
33 break;
34 default:
35 req->error_loc =
36 offsetof(struct nvmf_property_set_command, offset);
37 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
38 }
39out:
37 nvmet_req_complete(req, status); 40 nvmet_req_complete(req, status);
38} 41}
39 42
@@ -69,6 +72,14 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
69 } 72 }
70 } 73 }
71 74
75 if (status && req->cmd->prop_get.attrib & 1) {
76 req->error_loc =
77 offsetof(struct nvmf_property_get_command, offset);
78 } else {
79 req->error_loc =
80 offsetof(struct nvmf_property_get_command, attrib);
81 }
82
72 req->rsp->result.u64 = cpu_to_le64(val); 83 req->rsp->result.u64 = cpu_to_le64(val);
73 nvmet_req_complete(req, status); 84 nvmet_req_complete(req, status);
74} 85}
@@ -89,6 +100,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
89 default: 100 default:
90 pr_err("received unknown capsule type 0x%x\n", 101 pr_err("received unknown capsule type 0x%x\n",
91 cmd->fabrics.fctype); 102 cmd->fabrics.fctype);
103 req->error_loc = offsetof(struct nvmf_common_command, fctype);
92 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 104 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
93 } 105 }
94 106
@@ -105,16 +117,34 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
105 old = cmpxchg(&req->sq->ctrl, NULL, ctrl); 117 old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
106 if (old) { 118 if (old) {
107 pr_warn("queue already connected!\n"); 119 pr_warn("queue already connected!\n");
120 req->error_loc = offsetof(struct nvmf_connect_command, opcode);
108 return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; 121 return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
109 } 122 }
110 if (!sqsize) { 123 if (!sqsize) {
111 pr_warn("queue size zero!\n"); 124 pr_warn("queue size zero!\n");
125 req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
112 return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; 126 return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
113 } 127 }
114 128
115 /* note: convert queue size from 0's-based value to 1's-based value */ 129 /* note: convert queue size from 0's-based value to 1's-based value */
116 nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); 130 nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
117 nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); 131 nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
132
133 if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) {
134 req->sq->sqhd_disabled = true;
135 req->rsp->sq_head = cpu_to_le16(0xffff);
136 }
137
138 if (ctrl->ops->install_queue) {
139 u16 ret = ctrl->ops->install_queue(req->sq);
140
141 if (ret) {
142 pr_err("failed to install queue %d cntlid %d ret %x\n",
 143 				qid, ctrl->cntlid, ret);
144 return ret;
145 }
146 }
147
118 return 0; 148 return 0;
119} 149}
120 150
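Two details in the updated nvmet_install_queue() are easy to miss: the sqsize carried in the connect capsule is a 0's-based value, so the queues are sized sqsize + 1, and when the host sets NVME_CONNECT_DISABLE_SQFLOW the target stops tracking a real SQ head and reports the 0xffff sentinel in completions instead. A small sketch of both conversions (the struct and macro names below are placeholders, not the kernel definitions):

#include <stdint.h>
#include <stdio.h>

#define SQ_FLOW_DISABLED_SENTINEL 0xffff        /* placeholder name */

struct fake_cqe {
        uint16_t sq_head;
        /* other completion fields elided */
};

int main(void)
{
        uint16_t sqsize_0based = 127;           /* value from the connect capsule */
        uint16_t depth = sqsize_0based + 1;     /* 0's-based -> actual entries */
        int sq_flow_disabled = 1;               /* NVME_CONNECT_DISABLE_SQFLOW set */
        struct fake_cqe cqe;

        cqe.sq_head = sq_flow_disabled ? SQ_FLOW_DISABLED_SENTINEL : 0;
        printf("queue depth %u, reported sq_head %#x\n", depth, cqe.sq_head);
        return 0;
}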
@@ -141,6 +171,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
141 if (c->recfmt != 0) { 171 if (c->recfmt != 0) {
142 pr_warn("invalid connect version (%d).\n", 172 pr_warn("invalid connect version (%d).\n",
143 le16_to_cpu(c->recfmt)); 173 le16_to_cpu(c->recfmt));
174 req->error_loc = offsetof(struct nvmf_connect_command, recfmt);
144 status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; 175 status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR;
145 goto out; 176 goto out;
146 } 177 }
@@ -155,8 +186,13 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
155 186
156 status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, 187 status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
157 le32_to_cpu(c->kato), &ctrl); 188 le32_to_cpu(c->kato), &ctrl);
158 if (status) 189 if (status) {
190 if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR))
191 req->error_loc =
192 offsetof(struct nvme_common_command, opcode);
159 goto out; 193 goto out;
194 }
195
160 uuid_copy(&ctrl->hostid, &d->hostid); 196 uuid_copy(&ctrl->hostid, &d->hostid);
161 197
162 status = nvmet_install_queue(ctrl, req); 198 status = nvmet_install_queue(ctrl, req);
@@ -243,11 +279,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
243 if (cmd->common.opcode != nvme_fabrics_command) { 279 if (cmd->common.opcode != nvme_fabrics_command) {
244 pr_err("invalid command 0x%x on unconnected queue.\n", 280 pr_err("invalid command 0x%x on unconnected queue.\n",
245 cmd->fabrics.opcode); 281 cmd->fabrics.opcode);
282 req->error_loc = offsetof(struct nvme_common_command, opcode);
246 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 283 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
247 } 284 }
248 if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { 285 if (cmd->fabrics.fctype != nvme_fabrics_type_connect) {
249 pr_err("invalid capsule type 0x%x on unconnected queue.\n", 286 pr_err("invalid capsule type 0x%x on unconnected queue.\n",
250 cmd->fabrics.fctype); 287 cmd->fabrics.fctype);
288 req->error_loc = offsetof(struct nvmf_common_command, fctype);
251 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 289 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
252 } 290 }
253 291
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 409081a03b24..f98f5c5bea26 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -86,8 +86,6 @@ struct nvmet_fc_fcp_iod {
86 spinlock_t flock; 86 spinlock_t flock;
87 87
88 struct nvmet_req req; 88 struct nvmet_req req;
89 struct work_struct work;
90 struct work_struct done_work;
91 struct work_struct defer_work; 89 struct work_struct defer_work;
92 90
93 struct nvmet_fc_tgtport *tgtport; 91 struct nvmet_fc_tgtport *tgtport;
@@ -134,7 +132,6 @@ struct nvmet_fc_tgt_queue {
134 u16 sqsize; 132 u16 sqsize;
135 u16 ersp_ratio; 133 u16 ersp_ratio;
136 __le16 sqhd; 134 __le16 sqhd;
137 int cpu;
138 atomic_t connected; 135 atomic_t connected;
139 atomic_t sqtail; 136 atomic_t sqtail;
140 atomic_t zrspcnt; 137 atomic_t zrspcnt;
@@ -232,8 +229,6 @@ static LIST_HEAD(nvmet_fc_portentry_list);
232 229
233 230
234static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work); 231static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
235static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
236static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work);
237static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work); 232static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work);
238static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc); 233static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
239static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); 234static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
@@ -438,8 +433,6 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
438 int i; 433 int i;
439 434
440 for (i = 0; i < queue->sqsize; fod++, i++) { 435 for (i = 0; i < queue->sqsize; fod++, i++) {
441 INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
442 INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work);
443 INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work); 436 INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work);
444 fod->tgtport = tgtport; 437 fod->tgtport = tgtport;
445 fod->queue = queue; 438 fod->queue = queue;
@@ -517,10 +510,7 @@ nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport,
517 fcpreq->hwqid = queue->qid ? 510 fcpreq->hwqid = queue->qid ?
518 ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; 511 ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
519 512
520 if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) 513 nvmet_fc_handle_fcp_rqst(tgtport, fod);
521 queue_work_on(queue->cpu, queue->work_q, &fod->work);
522 else
523 nvmet_fc_handle_fcp_rqst(tgtport, fod);
524} 514}
525 515
526static void 516static void
@@ -599,30 +589,6 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
599 queue_work(queue->work_q, &fod->defer_work); 589 queue_work(queue->work_q, &fod->defer_work);
600} 590}
601 591
602static int
603nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid)
604{
605 int cpu, idx, cnt;
606
607 if (tgtport->ops->max_hw_queues == 1)
608 return WORK_CPU_UNBOUND;
609
610 /* Simple cpu selection based on qid modulo active cpu count */
611 idx = !qid ? 0 : (qid - 1) % num_active_cpus();
612
613 /* find the n'th active cpu */
614 for (cpu = 0, cnt = 0; ; ) {
615 if (cpu_active(cpu)) {
616 if (cnt == idx)
617 break;
618 cnt++;
619 }
620 cpu = (cpu + 1) % num_possible_cpus();
621 }
622
623 return cpu;
624}
625
626static struct nvmet_fc_tgt_queue * 592static struct nvmet_fc_tgt_queue *
627nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, 593nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
628 u16 qid, u16 sqsize) 594 u16 qid, u16 sqsize)
@@ -653,7 +619,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
653 queue->qid = qid; 619 queue->qid = qid;
654 queue->sqsize = sqsize; 620 queue->sqsize = sqsize;
655 queue->assoc = assoc; 621 queue->assoc = assoc;
656 queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid);
657 INIT_LIST_HEAD(&queue->fod_list); 622 INIT_LIST_HEAD(&queue->fod_list);
658 INIT_LIST_HEAD(&queue->avail_defer_list); 623 INIT_LIST_HEAD(&queue->avail_defer_list);
659 INIT_LIST_HEAD(&queue->pending_cmd_list); 624 INIT_LIST_HEAD(&queue->pending_cmd_list);
@@ -2146,25 +2111,11 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
2146} 2111}
2147 2112
2148static void 2113static void
2149nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work)
2150{
2151 struct nvmet_fc_fcp_iod *fod =
2152 container_of(work, struct nvmet_fc_fcp_iod, done_work);
2153
2154 nvmet_fc_fod_op_done(fod);
2155}
2156
2157static void
2158nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) 2114nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
2159{ 2115{
2160 struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; 2116 struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
2161 struct nvmet_fc_tgt_queue *queue = fod->queue;
2162 2117
2163 if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR) 2118 nvmet_fc_fod_op_done(fod);
2164 /* context switch so completion is not in ISR context */
2165 queue_work_on(queue->cpu, queue->work_q, &fod->done_work);
2166 else
2167 nvmet_fc_fod_op_done(fod);
2168} 2119}
2169 2120
2170/* 2121/*
@@ -2332,19 +2283,6 @@ transport_error:
2332 nvmet_fc_abort_op(tgtport, fod); 2283 nvmet_fc_abort_op(tgtport, fod);
2333} 2284}
2334 2285
2335/*
2336 * Actual processing routine for received FC-NVME LS Requests from the LLD
2337 */
2338static void
2339nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
2340{
2341 struct nvmet_fc_fcp_iod *fod =
2342 container_of(work, struct nvmet_fc_fcp_iod, work);
2343 struct nvmet_fc_tgtport *tgtport = fod->tgtport;
2344
2345 nvmet_fc_handle_fcp_rqst(tgtport, fod);
2346}
2347
2348/** 2286/**
2349 * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD 2287 * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD
2350 * upon the reception of a NVME FCP CMD IU. 2288 * upon the reception of a NVME FCP CMD IU.
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index c1ec3475a140..b6d030d3259f 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -44,13 +44,69 @@ void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
44 } 44 }
45} 45}
46 46
47static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
48{
49 u16 status = NVME_SC_SUCCESS;
50
51 if (likely(blk_sts == BLK_STS_OK))
52 return status;
53 /*
 54 * Right now there exists an M : 1 mapping from block layer errors
 55 * to NVMe status codes (see nvme_error_status()). For consistency,
 56 * when we reverse map we use the most appropriate NVMe status code
 57 * from the group of NVMe status codes used in nvme_error_status().
58 */
59 switch (blk_sts) {
60 case BLK_STS_NOSPC:
61 status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
62 req->error_loc = offsetof(struct nvme_rw_command, length);
63 break;
64 case BLK_STS_TARGET:
65 status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
66 req->error_loc = offsetof(struct nvme_rw_command, slba);
67 break;
68 case BLK_STS_NOTSUPP:
69 req->error_loc = offsetof(struct nvme_common_command, opcode);
70 switch (req->cmd->common.opcode) {
71 case nvme_cmd_dsm:
72 case nvme_cmd_write_zeroes:
73 status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
74 break;
75 default:
76 status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
77 }
78 break;
79 case BLK_STS_MEDIUM:
80 status = NVME_SC_ACCESS_DENIED;
81 req->error_loc = offsetof(struct nvme_rw_command, nsid);
82 break;
83 case BLK_STS_IOERR:
84 /* fallthru */
85 default:
86 status = NVME_SC_INTERNAL | NVME_SC_DNR;
87 req->error_loc = offsetof(struct nvme_common_command, opcode);
88 }
89
90 switch (req->cmd->common.opcode) {
91 case nvme_cmd_read:
92 case nvme_cmd_write:
93 req->error_slba = le64_to_cpu(req->cmd->rw.slba);
94 break;
95 case nvme_cmd_write_zeroes:
96 req->error_slba =
97 le64_to_cpu(req->cmd->write_zeroes.slba);
98 break;
99 default:
100 req->error_slba = 0;
101 }
102 return status;
103}
104
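blk_to_nvme_status() is the reverse of the host-side nvme_error_status(): several block layer statuses collapse onto one NVMe code, so the target picks the most specific status per blk_status_t and an error_loc matching the field most likely at fault. A userspace sketch of the same shape that deliberately sticks to symbolic names (the enum and strings are illustrative; the real constants live in the NVMe and block headers):

#include <stdio.h>

/* illustrative stand-ins for blk_status_t values */
enum fake_blk_status { BLK_OK, BLK_NOSPC, BLK_TARGET, BLK_NOTSUPP, BLK_MEDIUM, BLK_IOERR };

static const char *fake_blk_to_nvme(enum fake_blk_status sts, int is_dsm_or_write_zeroes)
{
        switch (sts) {
        case BLK_OK:      return "SUCCESS";
        case BLK_NOSPC:   return "CAPACITY EXCEEDED | DNR";
        case BLK_TARGET:  return "LBA OUT OF RANGE | DNR";
        case BLK_NOTSUPP: return is_dsm_or_write_zeroes ?
                                 "ONCS NOT SUPPORTED | DNR" : "INVALID OPCODE | DNR";
        case BLK_MEDIUM:  return "ACCESS DENIED";
        default:          return "INTERNAL | DNR";      /* BLK_IOERR and anything else */
        }
}

int main(void)
{
        printf("discard on a device without discard -> %s\n",
               fake_blk_to_nvme(BLK_NOTSUPP, 1));
        return 0;
}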
47static void nvmet_bio_done(struct bio *bio) 105static void nvmet_bio_done(struct bio *bio)
48{ 106{
49 struct nvmet_req *req = bio->bi_private; 107 struct nvmet_req *req = bio->bi_private;
50 108
51 nvmet_req_complete(req, 109 nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
52 bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
53
54 if (bio != &req->b.inline_bio) 110 if (bio != &req->b.inline_bio)
55 bio_put(bio); 111 bio_put(bio);
56} 112}
@@ -61,7 +117,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
61 struct bio *bio; 117 struct bio *bio;
62 struct scatterlist *sg; 118 struct scatterlist *sg;
63 sector_t sector; 119 sector_t sector;
64 blk_qc_t cookie;
65 int op, op_flags = 0, i; 120 int op, op_flags = 0, i;
66 121
67 if (!req->sg_cnt) { 122 if (!req->sg_cnt) {
@@ -114,9 +169,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
114 sg_cnt--; 169 sg_cnt--;
115 } 170 }
116 171
117 cookie = submit_bio(bio); 172 submit_bio(bio);
118
119 blk_poll(bdev_get_queue(req->ns->bdev), cookie);
120} 173}
121 174
122static void nvmet_bdev_execute_flush(struct nvmet_req *req) 175static void nvmet_bdev_execute_flush(struct nvmet_req *req)
@@ -139,18 +192,21 @@ u16 nvmet_bdev_flush(struct nvmet_req *req)
139 return 0; 192 return 0;
140} 193}
141 194
142static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, 195static u16 nvmet_bdev_discard_range(struct nvmet_req *req,
143 struct nvme_dsm_range *range, struct bio **bio) 196 struct nvme_dsm_range *range, struct bio **bio)
144{ 197{
198 struct nvmet_ns *ns = req->ns;
145 int ret; 199 int ret;
146 200
147 ret = __blkdev_issue_discard(ns->bdev, 201 ret = __blkdev_issue_discard(ns->bdev,
148 le64_to_cpu(range->slba) << (ns->blksize_shift - 9), 202 le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
149 le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), 203 le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
150 GFP_KERNEL, 0, bio); 204 GFP_KERNEL, 0, bio);
151 if (ret && ret != -EOPNOTSUPP) 205
152 return NVME_SC_INTERNAL | NVME_SC_DNR; 206 if (ret)
153 return 0; 207 req->error_slba = le64_to_cpu(range->slba);
208
209 return blk_to_nvme_status(req, errno_to_blk_status(ret));
154} 210}
155 211
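The discard path, like the read/write and write-zeroes paths above, converts namespace LBAs to 512-byte sectors with << (blksize_shift - 9), because the block layer always counts in 512-byte units regardless of the namespace block size. A standalone check of that arithmetic with arbitrary example values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int blksize_shift = 12;        /* 4096-byte namespace blocks */
        uint64_t slba = 256;                    /* starting LBA from the DSM range */
        uint32_t nlb  = 16;                     /* number of logical blocks */

        uint64_t sector     = slba << (blksize_shift - 9);          /* 256 * 8 = 2048 */
        uint64_t nr_sectors = (uint64_t)nlb << (blksize_shift - 9); /* 16 * 8 = 128 */

        printf("LBA %llu/%u blocks -> sector %llu, %llu sectors\n",
               (unsigned long long)slba, nlb,
               (unsigned long long)sector, (unsigned long long)nr_sectors);
        return 0;
}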
156static void nvmet_bdev_execute_discard(struct nvmet_req *req) 212static void nvmet_bdev_execute_discard(struct nvmet_req *req)
@@ -166,7 +222,7 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req)
166 if (status) 222 if (status)
167 break; 223 break;
168 224
169 status = nvmet_bdev_discard_range(req->ns, &range, &bio); 225 status = nvmet_bdev_discard_range(req, &range, &bio);
170 if (status) 226 if (status)
171 break; 227 break;
172 } 228 }
@@ -207,16 +263,16 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
207 u16 status = NVME_SC_SUCCESS; 263 u16 status = NVME_SC_SUCCESS;
208 sector_t sector; 264 sector_t sector;
209 sector_t nr_sector; 265 sector_t nr_sector;
266 int ret;
210 267
211 sector = le64_to_cpu(write_zeroes->slba) << 268 sector = le64_to_cpu(write_zeroes->slba) <<
212 (req->ns->blksize_shift - 9); 269 (req->ns->blksize_shift - 9);
213 nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << 270 nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
214 (req->ns->blksize_shift - 9)); 271 (req->ns->blksize_shift - 9));
215 272
216 if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, 273 ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
217 GFP_KERNEL, &bio, 0)) 274 GFP_KERNEL, &bio, 0);
218 status = NVME_SC_INTERNAL | NVME_SC_DNR; 275 status = blk_to_nvme_status(req, errno_to_blk_status(ret));
219
220 if (bio) { 276 if (bio) {
221 bio->bi_private = req; 277 bio->bi_private = req;
222 bio->bi_end_io = nvmet_bio_done; 278 bio->bi_end_io = nvmet_bio_done;
@@ -251,6 +307,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
251 default: 307 default:
252 pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, 308 pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
253 req->sq->qid); 309 req->sq->qid);
310 req->error_loc = offsetof(struct nvme_common_command, opcode);
254 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 311 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
255 } 312 }
256} 313}
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 01feebec29ea..517522305e5c 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -83,17 +83,16 @@ static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter)
83} 83}
84 84
85static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, 85static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
86 unsigned long nr_segs, size_t count) 86 unsigned long nr_segs, size_t count, int ki_flags)
87{ 87{
88 struct kiocb *iocb = &req->f.iocb; 88 struct kiocb *iocb = &req->f.iocb;
89 ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter); 89 ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter);
90 struct iov_iter iter; 90 struct iov_iter iter;
91 int ki_flags = 0, rw; 91 int rw;
92 ssize_t ret;
93 92
94 if (req->cmd->rw.opcode == nvme_cmd_write) { 93 if (req->cmd->rw.opcode == nvme_cmd_write) {
95 if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) 94 if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
96 ki_flags = IOCB_DSYNC; 95 ki_flags |= IOCB_DSYNC;
97 call_iter = req->ns->file->f_op->write_iter; 96 call_iter = req->ns->file->f_op->write_iter;
98 rw = WRITE; 97 rw = WRITE;
99 } else { 98 } else {
@@ -107,17 +106,13 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
107 iocb->ki_filp = req->ns->file; 106 iocb->ki_filp = req->ns->file;
108 iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); 107 iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
109 108
110 ret = call_iter(iocb, &iter); 109 return call_iter(iocb, &iter);
111
112 if (ret != -EIOCBQUEUED && iocb->ki_complete)
113 iocb->ki_complete(iocb, ret, 0);
114
115 return ret;
116} 110}
117 111
118static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) 112static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
119{ 113{
120 struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); 114 struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
115 u16 status = NVME_SC_SUCCESS;
121 116
122 if (req->f.bvec != req->inline_bvec) { 117 if (req->f.bvec != req->inline_bvec) {
123 if (likely(req->f.mpool_alloc == false)) 118 if (likely(req->f.mpool_alloc == false))
@@ -126,11 +121,12 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
126 mempool_free(req->f.bvec, req->ns->bvec_pool); 121 mempool_free(req->f.bvec, req->ns->bvec_pool);
127 } 122 }
128 123
129 nvmet_req_complete(req, ret != req->data_len ? 124 if (unlikely(ret != req->data_len))
130 NVME_SC_INTERNAL | NVME_SC_DNR : 0); 125 status = errno_to_nvme_status(req, ret);
126 nvmet_req_complete(req, status);
131} 127}
132 128
133static void nvmet_file_execute_rw(struct nvmet_req *req) 129static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
134{ 130{
135 ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); 131 ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
136 struct sg_page_iter sg_pg_iter; 132 struct sg_page_iter sg_pg_iter;
@@ -140,30 +136,14 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
140 ssize_t ret = 0; 136 ssize_t ret = 0;
141 loff_t pos; 137 loff_t pos;
142 138
143 if (!req->sg_cnt || !nr_bvec) { 139
144 nvmet_req_complete(req, 0); 140 if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC)
145 return; 141 is_sync = true;
146 }
147 142
148 pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; 143 pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
149 if (unlikely(pos + req->data_len > req->ns->size)) { 144 if (unlikely(pos + req->data_len > req->ns->size)) {
150 nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); 145 nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC));
151 return; 146 return true;
152 }
153
154 if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
155 req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
156 GFP_KERNEL);
157 else
158 req->f.bvec = req->inline_bvec;
159
160 req->f.mpool_alloc = false;
161 if (unlikely(!req->f.bvec)) {
162 /* fallback under memory pressure */
163 req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
164 req->f.mpool_alloc = true;
165 if (nr_bvec > NVMET_MAX_MPOOL_BVEC)
166 is_sync = true;
167 } 147 }
168 148
169 memset(&req->f.iocb, 0, sizeof(struct kiocb)); 149 memset(&req->f.iocb, 0, sizeof(struct kiocb));
@@ -177,9 +157,10 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
177 157
178 if (unlikely(is_sync) && 158 if (unlikely(is_sync) &&
179 (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) { 159 (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) {
180 ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len); 160 ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len, 0);
181 if (ret < 0) 161 if (ret < 0)
182 goto out; 162 goto complete;
163
183 pos += len; 164 pos += len;
184 bv_cnt = 0; 165 bv_cnt = 0;
185 len = 0; 166 len = 0;
@@ -187,35 +168,95 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
187 nr_bvec--; 168 nr_bvec--;
188 } 169 }
189 170
190 if (WARN_ON_ONCE(total_len != req->data_len)) 171 if (WARN_ON_ONCE(total_len != req->data_len)) {
191 ret = -EIO; 172 ret = -EIO;
192out: 173 goto complete;
193 if (unlikely(is_sync || ret)) { 174 }
194 nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0); 175
195 return; 176 if (unlikely(is_sync)) {
177 ret = total_len;
178 goto complete;
196 } 179 }
197 req->f.iocb.ki_complete = nvmet_file_io_done; 180
198 nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); 181 /*
182 * A NULL ki_complete ask for synchronous execution, which we want
183 * for the IOCB_NOWAIT case.
184 */
185 if (!(ki_flags & IOCB_NOWAIT))
186 req->f.iocb.ki_complete = nvmet_file_io_done;
187
188 ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags);
189
190 switch (ret) {
191 case -EIOCBQUEUED:
192 return true;
193 case -EAGAIN:
194 if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT)))
195 goto complete;
196 return false;
197 case -EOPNOTSUPP:
198 /*
 199 * Some file systems return -EOPNOTSUPP for IOCB_NOWAIT; handle
 200 * that case separately and retry the request without
 201 * IOCB_NOWAIT.
202 */
203 if ((ki_flags & IOCB_NOWAIT))
204 return false;
205 break;
206 }
207
208complete:
209 nvmet_file_io_done(&req->f.iocb, ret, 0);
210 return true;
199} 211}
200 212
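nvmet_file_execute_io() now tries the request with IOCB_NOWAIT first: if the page cache can satisfy it, the request completes inline, and only -EAGAIN (or -EOPNOTSUPP from file systems without nowait support) falls back to the buffered_io_wq worker. Roughly the same two-step pattern can be reproduced from userspace with preadv2() and RWF_NOWAIT (Linux 4.14+ / glibc 2.26+; a sketch only, error handling trimmed):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t read_nowait_then_block(int fd, void *buf, size_t len, off_t off)
{
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        ssize_t ret = preadv2(fd, &iov, 1, off, RWF_NOWAIT);

        if (ret >= 0)
                return ret;                     /* fast path: data was cached */
        if (errno != EAGAIN && errno != EOPNOTSUPP)
                return -1;                      /* a real error, give up */
        /* slow path: would block or nowait unsupported, retry blocking */
        return preadv2(fd, &iov, 1, off, 0);
}

int main(void)
{
        char buf[4096];
        int fd = open("/etc/hostname", O_RDONLY);   /* any readable file works */

        if (fd < 0)
                return 1;
        printf("read %zd bytes\n", read_nowait_then_block(fd, buf, sizeof(buf), 0));
        close(fd);
        return 0;
}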
201static void nvmet_file_buffered_io_work(struct work_struct *w) 213static void nvmet_file_buffered_io_work(struct work_struct *w)
202{ 214{
203 struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); 215 struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
204 216
205 nvmet_file_execute_rw(req); 217 nvmet_file_execute_io(req, 0);
206} 218}
207 219
208static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) 220static void nvmet_file_submit_buffered_io(struct nvmet_req *req)
209{ 221{
210 INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); 222 INIT_WORK(&req->f.work, nvmet_file_buffered_io_work);
211 queue_work(buffered_io_wq, &req->f.work); 223 queue_work(buffered_io_wq, &req->f.work);
212} 224}
213 225
226static void nvmet_file_execute_rw(struct nvmet_req *req)
227{
228 ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
229
230 if (!req->sg_cnt || !nr_bvec) {
231 nvmet_req_complete(req, 0);
232 return;
233 }
234
235 if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
236 req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
237 GFP_KERNEL);
238 else
239 req->f.bvec = req->inline_bvec;
240
241 if (unlikely(!req->f.bvec)) {
242 /* fallback under memory pressure */
243 req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
244 req->f.mpool_alloc = true;
245 } else
246 req->f.mpool_alloc = false;
247
248 if (req->ns->buffered_io) {
249 if (likely(!req->f.mpool_alloc) &&
250 nvmet_file_execute_io(req, IOCB_NOWAIT))
251 return;
252 nvmet_file_submit_buffered_io(req);
253 } else
254 nvmet_file_execute_io(req, 0);
255}
256
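The bvec setup in the new nvmet_file_execute_rw() is the usual try-then-fall-back allocation: kmalloc_array() first, and under memory pressure a mempool whose minimum size is NVMET_MAX_MPOOL_BVEC, with req->f.mpool_alloc recording which free path to use later (and, in nvmet_file_execute_io(), forcing synchronous submission when the request needs more bvecs than the reserve guarantees). A userspace sketch of that shape, with a hypothetical fixed reserve standing in for the mempool (unlike a real mempool it never blocks or waits):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#define RESERVE_SLOTS 4                 /* stand-in for the mempool's minimum size */

static char reserve[RESERVE_SLOTS][4096];
static bool reserve_used[RESERVE_SLOTS];

struct buffer {
        void *ptr;
        bool from_reserve;              /* mirrors req->f.mpool_alloc */
        int slot;
};

static struct buffer buf_alloc(size_t len)
{
        struct buffer b = { .ptr = malloc(len) };

        if (b.ptr)
                return b;               /* normal path */
        for (int i = 0; i < RESERVE_SLOTS; i++) {
                if (!reserve_used[i] && len <= sizeof(reserve[i])) {
                        reserve_used[i] = true;
                        b = (struct buffer){ reserve[i], true, i };
                        break;
                }
        }
        return b;                       /* ptr stays NULL if the reserve is exhausted */
}

static void buf_free(struct buffer *b)
{
        if (b->from_reserve)
                reserve_used[b->slot] = false;
        else
                free(b->ptr);
}

int main(void)
{
        struct buffer b = buf_alloc(4096);

        if (b.ptr)
                memset(b.ptr, 0, 4096);
        buf_free(&b);
        return 0;
}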
214u16 nvmet_file_flush(struct nvmet_req *req) 257u16 nvmet_file_flush(struct nvmet_req *req)
215{ 258{
216 if (vfs_fsync(req->ns->file, 1) < 0) 259 return errno_to_nvme_status(req, vfs_fsync(req->ns->file, 1));
217 return NVME_SC_INTERNAL | NVME_SC_DNR;
218 return 0;
219} 260}
220 261
221static void nvmet_file_flush_work(struct work_struct *w) 262static void nvmet_file_flush_work(struct work_struct *w)
@@ -236,30 +277,34 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
236 int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; 277 int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
237 struct nvme_dsm_range range; 278 struct nvme_dsm_range range;
238 loff_t offset, len; 279 loff_t offset, len;
239 u16 ret; 280 u16 status = 0;
281 int ret;
240 int i; 282 int i;
241 283
242 for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { 284 for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
243 ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range, 285 status = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
244 sizeof(range)); 286 sizeof(range));
245 if (ret) 287 if (status)
246 break; 288 break;
247 289
248 offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; 290 offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
249 len = le32_to_cpu(range.nlb); 291 len = le32_to_cpu(range.nlb);
250 len <<= req->ns->blksize_shift; 292 len <<= req->ns->blksize_shift;
251 if (offset + len > req->ns->size) { 293 if (offset + len > req->ns->size) {
252 ret = NVME_SC_LBA_RANGE | NVME_SC_DNR; 294 req->error_slba = le64_to_cpu(range.slba);
295 status = errno_to_nvme_status(req, -ENOSPC);
253 break; 296 break;
254 } 297 }
255 298
256 if (vfs_fallocate(req->ns->file, mode, offset, len)) { 299 ret = vfs_fallocate(req->ns->file, mode, offset, len);
257 ret = NVME_SC_INTERNAL | NVME_SC_DNR; 300 if (ret) {
301 req->error_slba = le64_to_cpu(range.slba);
302 status = errno_to_nvme_status(req, ret);
258 break; 303 break;
259 } 304 }
260 } 305 }
261 306
262 nvmet_req_complete(req, ret); 307 nvmet_req_complete(req, status);
263} 308}
264 309
265static void nvmet_file_dsm_work(struct work_struct *w) 310static void nvmet_file_dsm_work(struct work_struct *w)
@@ -299,12 +344,12 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w)
299 req->ns->blksize_shift); 344 req->ns->blksize_shift);
300 345
301 if (unlikely(offset + len > req->ns->size)) { 346 if (unlikely(offset + len > req->ns->size)) {
302 nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); 347 nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC));
303 return; 348 return;
304 } 349 }
305 350
306 ret = vfs_fallocate(req->ns->file, mode, offset, len); 351 ret = vfs_fallocate(req->ns->file, mode, offset, len);
307 nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); 352 nvmet_req_complete(req, ret < 0 ? errno_to_nvme_status(req, ret) : 0);
308} 353}
309 354
310static void nvmet_file_execute_write_zeroes(struct nvmet_req *req) 355static void nvmet_file_execute_write_zeroes(struct nvmet_req *req)
@@ -320,10 +365,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
320 switch (cmd->common.opcode) { 365 switch (cmd->common.opcode) {
321 case nvme_cmd_read: 366 case nvme_cmd_read:
322 case nvme_cmd_write: 367 case nvme_cmd_write:
323 if (req->ns->buffered_io) 368 req->execute = nvmet_file_execute_rw;
324 req->execute = nvmet_file_execute_rw_buffered_io;
325 else
326 req->execute = nvmet_file_execute_rw;
327 req->data_len = nvmet_rw_len(req); 369 req->data_len = nvmet_rw_len(req);
328 return 0; 370 return 0;
329 case nvme_cmd_flush: 371 case nvme_cmd_flush:
@@ -342,6 +384,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
342 default: 384 default:
343 pr_err("unhandled cmd for file ns %d on qid %d\n", 385 pr_err("unhandled cmd for file ns %d on qid %d\n",
344 cmd->common.opcode, req->sq->qid); 386 cmd->common.opcode, req->sq->qid);
387 req->error_loc = offsetof(struct nvme_common_command, opcode);
345 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; 388 return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
346 } 389 }
347} 390}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 9908082b32c4..4aac1b4a8112 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -345,7 +345,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl)
345 int i, ret; 345 int i, ret;
346 346
347 for (i = 1; i < ctrl->ctrl.queue_count; i++) { 347 for (i = 1; i < ctrl->ctrl.queue_count; i++) {
348 ret = nvmf_connect_io_queue(&ctrl->ctrl, i); 348 ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false);
349 if (ret) 349 if (ret)
350 return ret; 350 return ret;
351 set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); 351 set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c2b4d9ee6391..3e4719fdba85 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -30,12 +30,15 @@
30 30
31#define NVMET_ASYNC_EVENTS 4 31#define NVMET_ASYNC_EVENTS 4
32#define NVMET_ERROR_LOG_SLOTS 128 32#define NVMET_ERROR_LOG_SLOTS 128
33#define NVMET_NO_ERROR_LOC ((u16)-1)
33 34
34/* 35/*
35 * Supported optional AENs: 36 * Supported optional AENs:
36 */ 37 */
37#define NVMET_AEN_CFG_OPTIONAL \ 38#define NVMET_AEN_CFG_OPTIONAL \
38 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) 39 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE)
40#define NVMET_DISC_AEN_CFG_OPTIONAL \
41 (NVME_AEN_CFG_DISC_CHANGE)
39 42
40/* 43/*
41 * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): 44 * Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
@@ -104,6 +107,7 @@ struct nvmet_sq {
104 u16 qid; 107 u16 qid;
105 u16 size; 108 u16 size;
106 u32 sqhd; 109 u32 sqhd;
110 bool sqhd_disabled;
107 struct completion free_done; 111 struct completion free_done;
108 struct completion confirm_done; 112 struct completion confirm_done;
109}; 113};
@@ -137,6 +141,7 @@ struct nvmet_port {
137 struct list_head subsystems; 141 struct list_head subsystems;
138 struct config_group referrals_group; 142 struct config_group referrals_group;
139 struct list_head referrals; 143 struct list_head referrals;
144 struct list_head global_entry;
140 struct config_group ana_groups_group; 145 struct config_group ana_groups_group;
141 struct nvmet_ana_group ana_default_group; 146 struct nvmet_ana_group ana_default_group;
142 enum nvme_ana_state *ana_state; 147 enum nvme_ana_state *ana_state;
@@ -163,6 +168,8 @@ struct nvmet_ctrl {
163 struct nvmet_cq **cqs; 168 struct nvmet_cq **cqs;
164 struct nvmet_sq **sqs; 169 struct nvmet_sq **sqs;
165 170
171 bool cmd_seen;
172
166 struct mutex lock; 173 struct mutex lock;
167 u64 cap; 174 u64 cap;
168 u32 cc; 175 u32 cc;
@@ -194,8 +201,12 @@ struct nvmet_ctrl {
194 char subsysnqn[NVMF_NQN_FIELD_LEN]; 201 char subsysnqn[NVMF_NQN_FIELD_LEN];
195 char hostnqn[NVMF_NQN_FIELD_LEN]; 202 char hostnqn[NVMF_NQN_FIELD_LEN];
196 203
197 struct device *p2p_client; 204 struct device *p2p_client;
198 struct radix_tree_root p2p_ns_map; 205 struct radix_tree_root p2p_ns_map;
206
207 spinlock_t error_lock;
208 u64 err_counter;
209 struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS];
199}; 210};
200 211
201struct nvmet_subsys { 212struct nvmet_subsys {
@@ -273,6 +284,7 @@ struct nvmet_fabrics_ops {
273 void (*delete_ctrl)(struct nvmet_ctrl *ctrl); 284 void (*delete_ctrl)(struct nvmet_ctrl *ctrl);
274 void (*disc_traddr)(struct nvmet_req *req, 285 void (*disc_traddr)(struct nvmet_req *req,
275 struct nvmet_port *port, char *traddr); 286 struct nvmet_port *port, char *traddr);
287 u16 (*install_queue)(struct nvmet_sq *nvme_sq);
276}; 288};
277 289
278#define NVMET_MAX_INLINE_BIOVEC 8 290#define NVMET_MAX_INLINE_BIOVEC 8
@@ -308,17 +320,14 @@ struct nvmet_req {
308 void (*execute)(struct nvmet_req *req); 320 void (*execute)(struct nvmet_req *req);
309 const struct nvmet_fabrics_ops *ops; 321 const struct nvmet_fabrics_ops *ops;
310 322
311 struct pci_dev *p2p_dev; 323 struct pci_dev *p2p_dev;
312 struct device *p2p_client; 324 struct device *p2p_client;
325 u16 error_loc;
326 u64 error_slba;
313}; 327};
314 328
315extern struct workqueue_struct *buffered_io_wq; 329extern struct workqueue_struct *buffered_io_wq;
316 330
317static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
318{
319 req->rsp->status = cpu_to_le16(status << 1);
320}
321
322static inline void nvmet_set_result(struct nvmet_req *req, u32 result) 331static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
323{ 332{
324 req->rsp->result.u32 = cpu_to_le32(result); 333 req->rsp->result.u32 = cpu_to_le32(result);
@@ -340,6 +349,27 @@ struct nvmet_async_event {
340 u8 log_page; 349 u8 log_page;
341}; 350};
342 351
352static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn)
353{
354 int rae = le32_to_cpu(req->cmd->common.cdw10) & 1 << 15;
355
356 if (!rae)
357 clear_bit(bn, &req->sq->ctrl->aen_masked);
358}
359
360static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn)
361{
362 if (!(READ_ONCE(ctrl->aen_enabled) & (1 << bn)))
363 return true;
364 return test_and_set_bit(bn, &ctrl->aen_masked);
365}
366
367void nvmet_get_feat_kato(struct nvmet_req *req);
368void nvmet_get_feat_async_event(struct nvmet_req *req);
369u16 nvmet_set_feat_kato(struct nvmet_req *req);
370u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask);
371void nvmet_execute_async_event(struct nvmet_req *req);
372
343u16 nvmet_parse_connect_cmd(struct nvmet_req *req); 373u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
344u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); 374u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
345u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); 375u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
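The two inline helpers above give each AEN type edge-triggered behaviour: nvmet_aen_bit_disabled() uses test_and_set_bit() so at most one notice per type is outstanding, and nvmet_clear_aen_bit() re-arms the type when the host reads the matching log page with RAE clear. Roughly the same logic in portable C11, assuming a single 32-bit mask word instead of the kernel's unsigned long bitmaps (bit 31 is used here only as an example):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t aen_enabled = 1u << 31;         /* one enabled event type */
static _Atomic uint32_t aen_masked;

/* returns true if no AEN should be sent for bit bn */
static bool aen_bit_disabled(unsigned int bn)
{
        if (!(aen_enabled & (1u << bn)))
                return true;                    /* host never asked for it */
        /* already pending? stay quiet until the log page is read */
        return atomic_fetch_or(&aen_masked, 1u << bn) & (1u << bn);
}

static void clear_aen_bit(unsigned int bn, bool rae)
{
        if (!rae)                               /* Retain Async Event not set */
                atomic_fetch_and(&aen_masked, ~(1u << bn));
}

int main(void)
{
        printf("first event suppressed? %d\n", aen_bit_disabled(31));   /* 0: send it */
        printf("second event suppressed? %d\n", aen_bit_disabled(31));  /* 1: masked */
        clear_aen_bit(31, false);                       /* host read the log, RAE=0 */
        printf("third event suppressed? %d\n", aen_bit_disabled(31));   /* 0 again */
        return 0;
}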
@@ -355,6 +385,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
355int nvmet_req_alloc_sgl(struct nvmet_req *req); 385int nvmet_req_alloc_sgl(struct nvmet_req *req);
356void nvmet_req_free_sgl(struct nvmet_req *req); 386void nvmet_req_free_sgl(struct nvmet_req *req);
357 387
388void nvmet_execute_keep_alive(struct nvmet_req *req);
389
358void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, 390void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
359 u16 size); 391 u16 size);
360void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, 392void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
@@ -395,7 +427,7 @@ int nvmet_enable_port(struct nvmet_port *port);
395void nvmet_disable_port(struct nvmet_port *port); 427void nvmet_disable_port(struct nvmet_port *port);
396 428
397void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); 429void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port);
398void nvmet_referral_disable(struct nvmet_port *port); 430void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port);
399 431
400u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, 432u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
401 size_t len); 433 size_t len);
@@ -405,6 +437,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len);
405 437
406u32 nvmet_get_log_page_len(struct nvme_command *cmd); 438u32 nvmet_get_log_page_len(struct nvme_command *cmd);
407 439
440extern struct list_head *nvmet_ports;
441void nvmet_port_disc_changed(struct nvmet_port *port,
442 struct nvmet_subsys *subsys);
443void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
444 struct nvmet_host *host);
445void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
446 u8 event_info, u8 log_page);
447
408#define NVMET_QUEUE_SIZE 1024 448#define NVMET_QUEUE_SIZE 1024
409#define NVMET_NR_QUEUES 128 449#define NVMET_NR_QUEUES 128
410#define NVMET_MAX_CMD NVMET_QUEUE_SIZE 450#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
@@ -425,7 +465,7 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd);
425#define NVMET_DEFAULT_ANA_GRPID 1 465#define NVMET_DEFAULT_ANA_GRPID 1
426 466
427#define NVMET_KAS 10 467#define NVMET_KAS 10
428#define NVMET_DISC_KATO 120 468#define NVMET_DISC_KATO_MS 120000
429 469
430int __init nvmet_init_configfs(void); 470int __init nvmet_init_configfs(void);
431void __exit nvmet_exit_configfs(void); 471void __exit nvmet_exit_configfs(void);
@@ -434,15 +474,13 @@ int __init nvmet_init_discovery(void);
434void nvmet_exit_discovery(void); 474void nvmet_exit_discovery(void);
435 475
436extern struct nvmet_subsys *nvmet_disc_subsys; 476extern struct nvmet_subsys *nvmet_disc_subsys;
437extern u64 nvmet_genctr;
438extern struct rw_semaphore nvmet_config_sem; 477extern struct rw_semaphore nvmet_config_sem;
439 478
440extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; 479extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
441extern u64 nvmet_ana_chgcnt; 480extern u64 nvmet_ana_chgcnt;
442extern struct rw_semaphore nvmet_ana_sem; 481extern struct rw_semaphore nvmet_ana_sem;
443 482
444bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, 483bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn);
445 const char *hostnqn);
446 484
447int nvmet_bdev_ns_enable(struct nvmet_ns *ns); 485int nvmet_bdev_ns_enable(struct nvmet_ns *ns);
448int nvmet_file_ns_enable(struct nvmet_ns *ns); 486int nvmet_file_ns_enable(struct nvmet_ns *ns);
@@ -457,4 +495,6 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req)
457 return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << 495 return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
458 req->ns->blksize_shift; 496 req->ns->blksize_shift;
459} 497}
498
499u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
460#endif /* _NVMET_H */ 500#endif /* _NVMET_H */
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 583086dd9cb9..a8d23eb80192 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -196,7 +196,7 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
196{ 196{
197 unsigned long flags; 197 unsigned long flags;
198 198
199 if (rsp->allocated) { 199 if (unlikely(rsp->allocated)) {
200 kfree(rsp); 200 kfree(rsp);
201 return; 201 return;
202 } 202 }
@@ -630,8 +630,11 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
630 u64 off = le64_to_cpu(sgl->addr); 630 u64 off = le64_to_cpu(sgl->addr);
631 u32 len = le32_to_cpu(sgl->length); 631 u32 len = le32_to_cpu(sgl->length);
632 632
633 if (!nvme_is_write(rsp->req.cmd)) 633 if (!nvme_is_write(rsp->req.cmd)) {
634 rsp->req.error_loc =
635 offsetof(struct nvme_common_command, opcode);
634 return NVME_SC_INVALID_FIELD | NVME_SC_DNR; 636 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
637 }
635 638
636 if (off + len > rsp->queue->dev->inline_data_size) { 639 if (off + len > rsp->queue->dev->inline_data_size) {
637 pr_err("invalid inline data offset!\n"); 640 pr_err("invalid inline data offset!\n");
@@ -696,6 +699,8 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
696 return nvmet_rdma_map_sgl_inline(rsp); 699 return nvmet_rdma_map_sgl_inline(rsp);
697 default: 700 default:
698 pr_err("invalid SGL subtype: %#x\n", sgl->type); 701 pr_err("invalid SGL subtype: %#x\n", sgl->type);
702 rsp->req.error_loc =
703 offsetof(struct nvme_common_command, dptr);
699 return NVME_SC_INVALID_FIELD | NVME_SC_DNR; 704 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
700 } 705 }
701 case NVME_KEY_SGL_FMT_DATA_DESC: 706 case NVME_KEY_SGL_FMT_DATA_DESC:
@@ -706,10 +711,13 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
706 return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); 711 return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
707 default: 712 default:
708 pr_err("invalid SGL subtype: %#x\n", sgl->type); 713 pr_err("invalid SGL subtype: %#x\n", sgl->type);
714 rsp->req.error_loc =
715 offsetof(struct nvme_common_command, dptr);
709 return NVME_SC_INVALID_FIELD | NVME_SC_DNR; 716 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
710 } 717 }
711 default: 718 default:
712 pr_err("invalid SGL type: %#x\n", sgl->type); 719 pr_err("invalid SGL type: %#x\n", sgl->type);
720 rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
713 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; 721 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
714 } 722 }
715} 723}
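The SGL parsing above branches on the descriptor type byte of the command's data pointer: the high nibble is the descriptor type and the low nibble the subtype, and any descriptor the target cannot handle now records error_loc at the dptr field. The bit slicing, shown standalone with an arbitrary example value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint8_t type_byte = 0x50;               /* example value from sgl->type */
        uint8_t desc_type = type_byte >> 4;     /* descriptor type nibble */
        uint8_t subtype   = type_byte & 0xf;    /* subtype nibble */

        printf("descriptor type %#x, subtype %#x\n", desc_type, subtype);
        return 0;
}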
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
new file mode 100644
index 000000000000..44b37b202e39
--- /dev/null
+++ b/drivers/nvme/target/tcp.c
@@ -0,0 +1,1737 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * NVMe over Fabrics TCP target.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/nvme-tcp.h>
12#include <net/sock.h>
13#include <net/tcp.h>
14#include <linux/inet.h>
15#include <linux/llist.h>
16#include <crypto/hash.h>
17
18#include "nvmet.h"
19
20#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE)
21
22#define NVMET_TCP_RECV_BUDGET 8
23#define NVMET_TCP_SEND_BUDGET 8
24#define NVMET_TCP_IO_WORK_BUDGET 64
25
26enum nvmet_tcp_send_state {
27 NVMET_TCP_SEND_DATA_PDU,
28 NVMET_TCP_SEND_DATA,
29 NVMET_TCP_SEND_R2T,
30 NVMET_TCP_SEND_DDGST,
31 NVMET_TCP_SEND_RESPONSE
32};
33
34enum nvmet_tcp_recv_state {
35 NVMET_TCP_RECV_PDU,
36 NVMET_TCP_RECV_DATA,
37 NVMET_TCP_RECV_DDGST,
38 NVMET_TCP_RECV_ERR,
39};
40
41enum {
42 NVMET_TCP_F_INIT_FAILED = (1 << 0),
43};
44
45struct nvmet_tcp_cmd {
46 struct nvmet_tcp_queue *queue;
47 struct nvmet_req req;
48
49 struct nvme_tcp_cmd_pdu *cmd_pdu;
50 struct nvme_tcp_rsp_pdu *rsp_pdu;
51 struct nvme_tcp_data_pdu *data_pdu;
52 struct nvme_tcp_r2t_pdu *r2t_pdu;
53
54 u32 rbytes_done;
55 u32 wbytes_done;
56
57 u32 pdu_len;
58 u32 pdu_recv;
59 int sg_idx;
60 int nr_mapped;
61 struct msghdr recv_msg;
62 struct kvec *iov;
63 u32 flags;
64
65 struct list_head entry;
66 struct llist_node lentry;
67
68 /* send state */
69 u32 offset;
70 struct scatterlist *cur_sg;
71 enum nvmet_tcp_send_state state;
72
73 __le32 exp_ddgst;
74 __le32 recv_ddgst;
75};
76
77enum nvmet_tcp_queue_state {
78 NVMET_TCP_Q_CONNECTING,
79 NVMET_TCP_Q_LIVE,
80 NVMET_TCP_Q_DISCONNECTING,
81};
82
83struct nvmet_tcp_queue {
84 struct socket *sock;
85 struct nvmet_tcp_port *port;
86 struct work_struct io_work;
87 int cpu;
88 struct nvmet_cq nvme_cq;
89 struct nvmet_sq nvme_sq;
90
91 /* send state */
92 struct nvmet_tcp_cmd *cmds;
93 unsigned int nr_cmds;
94 struct list_head free_list;
95 struct llist_head resp_list;
96 struct list_head resp_send_list;
97 int send_list_len;
98 struct nvmet_tcp_cmd *snd_cmd;
99
100 /* recv state */
101 int offset;
102 int left;
103 enum nvmet_tcp_recv_state rcv_state;
104 struct nvmet_tcp_cmd *cmd;
105 union nvme_tcp_pdu pdu;
106
107 /* digest state */
108 bool hdr_digest;
109 bool data_digest;
110 struct ahash_request *snd_hash;
111 struct ahash_request *rcv_hash;
112
113 spinlock_t state_lock;
114 enum nvmet_tcp_queue_state state;
115
116 struct sockaddr_storage sockaddr;
117 struct sockaddr_storage sockaddr_peer;
118 struct work_struct release_work;
119
120 int idx;
121 struct list_head queue_list;
122
123 struct nvmet_tcp_cmd connect;
124
125 struct page_frag_cache pf_cache;
126
127 void (*data_ready)(struct sock *);
128 void (*state_change)(struct sock *);
129 void (*write_space)(struct sock *);
130};
131
132struct nvmet_tcp_port {
133 struct socket *sock;
134 struct work_struct accept_work;
135 struct nvmet_port *nport;
136 struct sockaddr_storage addr;
137 int last_cpu;
138 void (*data_ready)(struct sock *);
139};
140
141static DEFINE_IDA(nvmet_tcp_queue_ida);
142static LIST_HEAD(nvmet_tcp_queue_list);
143static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
144
145static struct workqueue_struct *nvmet_tcp_wq;
146static struct nvmet_fabrics_ops nvmet_tcp_ops;
147static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
148static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
149
150static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
151 struct nvmet_tcp_cmd *cmd)
152{
153 return cmd - queue->cmds;
154}
155
156static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
157{
158 return nvme_is_write(cmd->req.cmd) &&
159 cmd->rbytes_done < cmd->req.transfer_len;
160}
161
162static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
163{
164 return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status;
165}
166
167static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
168{
169 return !nvme_is_write(cmd->req.cmd) &&
170 cmd->req.transfer_len > 0 &&
171 !cmd->req.rsp->status;
172}
173
174static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
175{
176 return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
177 !cmd->rbytes_done;
178}
179
180static inline struct nvmet_tcp_cmd *
181nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
182{
183 struct nvmet_tcp_cmd *cmd;
184
185 cmd = list_first_entry_or_null(&queue->free_list,
186 struct nvmet_tcp_cmd, entry);
187 if (!cmd)
188 return NULL;
189 list_del_init(&cmd->entry);
190
191 cmd->rbytes_done = cmd->wbytes_done = 0;
192 cmd->pdu_len = 0;
193 cmd->pdu_recv = 0;
194 cmd->iov = NULL;
195 cmd->flags = 0;
196 return cmd;
197}
198
199static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
200{
201 if (unlikely(cmd == &cmd->queue->connect))
202 return;
203
204 list_add_tail(&cmd->entry, &cmd->queue->free_list);
205}
206
207static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
208{
209 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
210}
211
212static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
213{
214 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
215}
216
217static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
218 void *pdu, size_t len)
219{
220 struct scatterlist sg;
221
222 sg_init_one(&sg, pdu, len);
223 ahash_request_set_crypt(hash, &sg, pdu + len, len);
224 crypto_ahash_digest(hash);
225}
226
227static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
228 void *pdu, size_t len)
229{
230 struct nvme_tcp_hdr *hdr = pdu;
231 __le32 recv_digest;
232 __le32 exp_digest;
233
234 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
235 pr_err("queue %d: header digest enabled but no header digest\n",
236 queue->idx);
237 return -EPROTO;
238 }
239
240 recv_digest = *(__le32 *)(pdu + hdr->hlen);
241 nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
242 exp_digest = *(__le32 *)(pdu + hdr->hlen);
243 if (recv_digest != exp_digest) {
244 pr_err("queue %d: header digest error: recv %#x expected %#x\n",
245 queue->idx, le32_to_cpu(recv_digest),
246 le32_to_cpu(exp_digest));
247 return -EPROTO;
248 }
249
250 return 0;
251}
252
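With header digests negotiated, each PDU header is followed by a 4-byte digest that nvmet_tcp_verify_hdgst() recomputes over the first hlen bytes and compares against the received value. NVMe/TCP uses CRC32C for both header and data digests (the target goes through the crypto API's ahash); a minimal bit-at-a-time CRC32C is enough to check a captured header by hand:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* CRC32C (Castagnoli), bit-reflected, bit-at-a-time: slow but dependency-free */
static uint32_t crc32c(const void *data, size_t len)
{
        const uint8_t *p = data;
        uint32_t crc = 0xffffffff;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
        }
        return ~crc;
}

int main(void)
{
        /* standard check value: CRC32C("123456789") == 0xe3069283 */
        printf("%#x\n", crc32c("123456789", 9));
        return 0;
}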
253static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
254{
255 struct nvme_tcp_hdr *hdr = pdu;
256 u8 digest_len = nvmet_tcp_hdgst_len(queue);
257 u32 len;
258
259 len = le32_to_cpu(hdr->plen) - hdr->hlen -
260 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
261
262 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
263 pr_err("queue %d: data digest flag is cleared\n", queue->idx);
264 return -EPROTO;
265 }
266
267 return 0;
268}
269
270static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
271{
272 struct scatterlist *sg;
273 int i;
274
275 sg = &cmd->req.sg[cmd->sg_idx];
276
277 for (i = 0; i < cmd->nr_mapped; i++)
278 kunmap(sg_page(&sg[i]));
279}
280
281static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
282{
283 struct kvec *iov = cmd->iov;
284 struct scatterlist *sg;
285 u32 length, offset, sg_offset;
286
287 length = cmd->pdu_len;
288 cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
289 offset = cmd->rbytes_done;
290 cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
291 sg_offset = offset % PAGE_SIZE;
292 sg = &cmd->req.sg[cmd->sg_idx];
293
294 while (length) {
295 u32 iov_len = min_t(u32, length, sg->length - sg_offset);
296
297 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
298 iov->iov_len = iov_len;
299
300 length -= iov_len;
301 sg = sg_next(sg);
302 iov++;
303 }
304
305 iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
306 cmd->nr_mapped, cmd->pdu_len);
307}
308
309static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
310{
311 queue->rcv_state = NVMET_TCP_RECV_ERR;
312 if (queue->nvme_sq.ctrl)
313 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
314 else
315 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
316}
317
318static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
319{
320 struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
321 u32 len = le32_to_cpu(sgl->length);
322
323 if (!cmd->req.data_len)
324 return 0;
325
326 if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
327 NVME_SGL_FMT_OFFSET)) {
328 if (!nvme_is_write(cmd->req.cmd))
329 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
330
331 if (len > cmd->req.port->inline_data_size)
332 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
333 cmd->pdu_len = len;
334 }
335 cmd->req.transfer_len += len;
336
337 cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
338 if (!cmd->req.sg)
339 return NVME_SC_INTERNAL;
340 cmd->cur_sg = cmd->req.sg;
341
342 if (nvmet_tcp_has_data_in(cmd)) {
343 cmd->iov = kmalloc_array(cmd->req.sg_cnt,
344 sizeof(*cmd->iov), GFP_KERNEL);
345 if (!cmd->iov)
346 goto err;
347 }
348
349 return 0;
350err:
351 sgl_free(cmd->req.sg);
352 return NVME_SC_INTERNAL;
353}
354
355static void nvmet_tcp_ddgst(struct ahash_request *hash,
356 struct nvmet_tcp_cmd *cmd)
357{
358 ahash_request_set_crypt(hash, cmd->req.sg,
359 (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
360 crypto_ahash_digest(hash);
361}
362
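/*
 * Build the C2H_DATA PDU header for a read response: plen covers the
 * header, optional header digest, the payload and an optional data
 * digest, and the data digest itself is precomputed over the request
 * scatterlist before transmission starts.
 */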
363static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
364{
365 struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
366 struct nvmet_tcp_queue *queue = cmd->queue;
367 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
368 u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
369
370 cmd->offset = 0;
371 cmd->state = NVMET_TCP_SEND_DATA_PDU;
372
373 pdu->hdr.type = nvme_tcp_c2h_data;
374 pdu->hdr.flags = NVME_TCP_F_DATA_LAST;
375 pdu->hdr.hlen = sizeof(*pdu);
376 pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
377 pdu->hdr.plen =
378 cpu_to_le32(pdu->hdr.hlen + hdgst +
379 cmd->req.transfer_len + ddgst);
380 pdu->command_id = cmd->req.rsp->command_id;
381 pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
382 pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
383
384 if (queue->data_digest) {
385 pdu->hdr.flags |= NVME_TCP_F_DDGST;
386 nvmet_tcp_ddgst(queue->snd_hash, cmd);
387 }
388
389 if (cmd->queue->hdr_digest) {
390 pdu->hdr.flags |= NVME_TCP_F_HDGST;
391 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
392 }
393}
394
395static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
396{
397 struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
398 struct nvmet_tcp_queue *queue = cmd->queue;
399 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
400
401 cmd->offset = 0;
402 cmd->state = NVMET_TCP_SEND_R2T;
403
404 pdu->hdr.type = nvme_tcp_r2t;
405 pdu->hdr.flags = 0;
406 pdu->hdr.hlen = sizeof(*pdu);
407 pdu->hdr.pdo = 0;
408 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
409
410 pdu->command_id = cmd->req.cmd->common.command_id;
411 pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
412 pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
413 pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
414 if (cmd->queue->hdr_digest) {
415 pdu->hdr.flags |= NVME_TCP_F_HDGST;
416 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
417 }
418}
419
420static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
421{
422 struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
423 struct nvmet_tcp_queue *queue = cmd->queue;
424 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
425
426 cmd->offset = 0;
427 cmd->state = NVMET_TCP_SEND_RESPONSE;
428
429 pdu->hdr.type = nvme_tcp_rsp;
430 pdu->hdr.flags = 0;
431 pdu->hdr.hlen = sizeof(*pdu);
432 pdu->hdr.pdo = 0;
433 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
434 if (cmd->queue->hdr_digest) {
435 pdu->hdr.flags |= NVME_TCP_F_HDGST;
436 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
437 }
438}
439
440static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
441{
442 struct llist_node *node;
443
444 node = llist_del_all(&queue->resp_list);
445 if (!node)
446 return;
447
448 while (node) {
449 struct nvmet_tcp_cmd *cmd = llist_entry(node,
450 struct nvmet_tcp_cmd, lentry);
451
452 list_add(&cmd->entry, &queue->resp_send_list);
453 node = node->next;
454 queue->send_list_len++;
455 }
456}
457
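/*
 * Pick the next command to transmit: take the head of resp_send_list,
 * refilling it from the lockless resp_list when it runs empty, and set up
 * the appropriate PDU (C2H_DATA, R2T or response capsule) based on the
 * command's data direction.
 */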
458static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
459{
460 queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
461 struct nvmet_tcp_cmd, entry);
462 if (!queue->snd_cmd) {
463 nvmet_tcp_process_resp_list(queue);
464 queue->snd_cmd =
465 list_first_entry_or_null(&queue->resp_send_list,
466 struct nvmet_tcp_cmd, entry);
467 if (unlikely(!queue->snd_cmd))
468 return NULL;
469 }
470
471 list_del_init(&queue->snd_cmd->entry);
472 queue->send_list_len--;
473
474 if (nvmet_tcp_need_data_out(queue->snd_cmd))
475 nvmet_setup_c2h_data_pdu(queue->snd_cmd);
476 else if (nvmet_tcp_need_data_in(queue->snd_cmd))
477 nvmet_setup_r2t_pdu(queue->snd_cmd);
478 else
479 nvmet_setup_response_pdu(queue->snd_cmd);
480
481 return queue->snd_cmd;
482}
483
484static void nvmet_tcp_queue_response(struct nvmet_req *req)
485{
486 struct nvmet_tcp_cmd *cmd =
487 container_of(req, struct nvmet_tcp_cmd, req);
488 struct nvmet_tcp_queue *queue = cmd->queue;
489
490 llist_add(&cmd->lentry, &queue->resp_list);
491 queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
492}
493
494static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
495{
496 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
497 int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
498 int ret;
499
500 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
501 offset_in_page(cmd->data_pdu) + cmd->offset,
502 left, MSG_DONTWAIT | MSG_MORE);
503 if (ret <= 0)
504 return ret;
505
506 cmd->offset += ret;
507 left -= ret;
508
509 if (left)
510 return -EAGAIN;
511
512 cmd->state = NVMET_TCP_SEND_DATA;
513 cmd->offset = 0;
514 return 1;
515}
516
517static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
518{
519 struct nvmet_tcp_queue *queue = cmd->queue;
520 int ret;
521
522 while (cmd->cur_sg) {
523 struct page *page = sg_page(cmd->cur_sg);
524 u32 left = cmd->cur_sg->length - cmd->offset;
525
526 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
527 left, MSG_DONTWAIT | MSG_MORE);
528 if (ret <= 0)
529 return ret;
530
531 cmd->offset += ret;
532 cmd->wbytes_done += ret;
533
534 /* Done with sg? */
535 if (cmd->offset == cmd->cur_sg->length) {
536 cmd->cur_sg = sg_next(cmd->cur_sg);
537 cmd->offset = 0;
538 }
539 }
540
541 if (queue->data_digest) {
542 cmd->state = NVMET_TCP_SEND_DDGST;
543 cmd->offset = 0;
544 } else {
545 nvmet_setup_response_pdu(cmd);
546 }
547 return 1;
548
549}
550
551static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
552 bool last_in_batch)
553{
554 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
555 int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
556 int flags = MSG_DONTWAIT;
557 int ret;
558
559 if (!last_in_batch && cmd->queue->send_list_len)
560 flags |= MSG_MORE;
561 else
562 flags |= MSG_EOR;
563
564 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
565 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
566 if (ret <= 0)
567 return ret;
568 cmd->offset += ret;
569 left -= ret;
570
571 if (left)
572 return -EAGAIN;
573
574 kfree(cmd->iov);
575 sgl_free(cmd->req.sg);
576 cmd->queue->snd_cmd = NULL;
577 nvmet_tcp_put_cmd(cmd);
578 return 1;
579}
580
581static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
582{
583 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
584 int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
585 int flags = MSG_DONTWAIT;
586 int ret;
587
588 if (!last_in_batch && cmd->queue->send_list_len)
589 flags |= MSG_MORE;
590 else
591 flags |= MSG_EOR;
592
593 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
594 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
595 if (ret <= 0)
596 return ret;
597 cmd->offset += ret;
598 left -= ret;
599
600 if (left)
601 return -EAGAIN;
602
603 cmd->queue->snd_cmd = NULL;
604 return 1;
605}
606
607static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
608{
609 struct nvmet_tcp_queue *queue = cmd->queue;
610 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
611 struct kvec iov = {
612 .iov_base = &cmd->exp_ddgst + cmd->offset,
613 .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
614 };
615 int ret;
616
617 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
618 if (unlikely(ret <= 0))
619 return ret;
620
621 cmd->offset += ret;
622 nvmet_setup_response_pdu(cmd);
623 return 1;
624}
625
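/*
 * Send-side state machine: advance the current command through
 * SEND_DATA_PDU -> SEND_DATA -> SEND_DDGST and finally SEND_R2T or
 * SEND_RESPONSE. Returns 1 if progress was made, 0 if there is nothing to
 * send (or the send would block), and a negative error on a hard failure.
 */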
626static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
627 bool last_in_batch)
628{
629 struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
630 int ret = 0;
631
632 if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
633 cmd = nvmet_tcp_fetch_cmd(queue);
634 if (unlikely(!cmd))
635 return 0;
636 }
637
638 if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
639 ret = nvmet_try_send_data_pdu(cmd);
640 if (ret <= 0)
641 goto done_send;
642 }
643
644 if (cmd->state == NVMET_TCP_SEND_DATA) {
645 ret = nvmet_try_send_data(cmd);
646 if (ret <= 0)
647 goto done_send;
648 }
649
650 if (cmd->state == NVMET_TCP_SEND_DDGST) {
651 ret = nvmet_try_send_ddgst(cmd);
652 if (ret <= 0)
653 goto done_send;
654 }
655
656 if (cmd->state == NVMET_TCP_SEND_R2T) {
657 ret = nvmet_try_send_r2t(cmd, last_in_batch);
658 if (ret <= 0)
659 goto done_send;
660 }
661
662 if (cmd->state == NVMET_TCP_SEND_RESPONSE)
663 ret = nvmet_try_send_response(cmd, last_in_batch);
664
665done_send:
666 if (ret < 0) {
667 if (ret == -EAGAIN)
668 return 0;
669 return ret;
670 }
671
672 return 1;
673}
674
675static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
676 int budget, int *sends)
677{
678 int i, ret = 0;
679
680 for (i = 0; i < budget; i++) {
681 ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
682 if (ret <= 0)
683 break;
684 (*sends)++;
685 }
686
687 return ret;
688}
689
690static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
691{
692 queue->offset = 0;
693 queue->left = sizeof(struct nvme_tcp_hdr);
694 queue->cmd = NULL;
695 queue->rcv_state = NVMET_TCP_RECV_PDU;
696}
697
698static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
699{
700 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
701
702 ahash_request_free(queue->rcv_hash);
703 ahash_request_free(queue->snd_hash);
704 crypto_free_ahash(tfm);
705}
706
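/*
 * Allocate the crc32c ahash transform plus one request each for the send
 * and receive directions, used for the NVMe/TCP header and data digests.
 */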
707static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
708{
709 struct crypto_ahash *tfm;
710
711 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
712 if (IS_ERR(tfm))
713 return PTR_ERR(tfm);
714
715 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
716 if (!queue->snd_hash)
717 goto free_tfm;
718 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
719
720 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
721 if (!queue->rcv_hash)
722 goto free_snd_hash;
723 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
724
725 return 0;
726free_snd_hash:
727 ahash_request_free(queue->snd_hash);
728free_tfm:
729 crypto_free_ahash(tfm);
730 return -ENOMEM;
731}
732
733
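/*
 * Handle the Initialize Connection Request: validate pfv, hpda and maxr2t,
 * latch the negotiated header/data digest settings (allocating the crc32c
 * context if needed), send the ICResp and move the queue to LIVE so that
 * command capsules can be processed.
 */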
734static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
735{
736 struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
737 struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
738 struct msghdr msg = {};
739 struct kvec iov;
740 int ret;
741
742 if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
743 pr_err("bad nvme-tcp pdu length (%d)\n",
744 le32_to_cpu(icreq->hdr.plen));
745 nvmet_tcp_fatal_error(queue);
746 }
747
748 if (icreq->pfv != NVME_TCP_PFV_1_0) {
749 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
750 return -EPROTO;
751 }
752
753 if (icreq->hpda != 0) {
754 pr_err("queue %d: unsupported hpda %d\n", queue->idx,
755 icreq->hpda);
756 return -EPROTO;
757 }
758
759 if (icreq->maxr2t != 0) {
760 pr_err("queue %d: unsupported maxr2t %d\n", queue->idx,
761 le32_to_cpu(icreq->maxr2t) + 1);
762 return -EPROTO;
763 }
764
765 queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
766 queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
767 if (queue->hdr_digest || queue->data_digest) {
768 ret = nvmet_tcp_alloc_crypto(queue);
769 if (ret)
770 return ret;
771 }
772
773 memset(icresp, 0, sizeof(*icresp));
774 icresp->hdr.type = nvme_tcp_icresp;
775 icresp->hdr.hlen = sizeof(*icresp);
776 icresp->hdr.pdo = 0;
777 icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
778 icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
779 icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
780 icresp->cpda = 0;
781 if (queue->hdr_digest)
782 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
783 if (queue->data_digest)
784 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
785
786 iov.iov_base = icresp;
787 iov.iov_len = sizeof(*icresp);
788 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
789 if (ret < 0)
790 goto free_crypto;
791
792 queue->state = NVMET_TCP_Q_LIVE;
793 nvmet_prepare_receive_pdu(queue);
794 return 0;
795free_crypto:
796 if (queue->hdr_digest || queue->data_digest)
797 nvmet_tcp_free_crypto(queue);
798 return ret;
799}
800
801static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
802 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
803{
804 int ret;
805
806 /* recover the expected data transfer length */
807 req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
808
809 if (!nvme_is_write(cmd->req.cmd) ||
810 req->data_len > cmd->req.port->inline_data_size) {
811 nvmet_prepare_receive_pdu(queue);
812 return;
813 }
814
815 ret = nvmet_tcp_map_data(cmd);
816 if (unlikely(ret)) {
817 pr_err("queue %d: failed to map data\n", queue->idx);
818 nvmet_tcp_fatal_error(queue);
819 return;
820 }
821
822 queue->rcv_state = NVMET_TCP_RECV_DATA;
823 nvmet_tcp_map_pdu_iovec(cmd);
824 cmd->flags |= NVMET_TCP_F_INIT_FAILED;
825}
826
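/*
 * Handle an H2C_DATA PDU (data arriving in response to an R2T): look the
 * command up by its transfer tag, check that the data offset matches what
 * has been received so far, and arm the receive path to pull the payload
 * into the command's scatterlist.
 */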
827static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
828{
829 struct nvme_tcp_data_pdu *data = &queue->pdu.data;
830 struct nvmet_tcp_cmd *cmd;
831
832 cmd = &queue->cmds[data->ttag];
833
834 if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
835 pr_err("ttag %u unexpected data offset %u (expected %u)\n",
836 data->ttag, le32_to_cpu(data->data_offset),
837 cmd->rbytes_done);
838 /* FIXME: use path and transport errors */
839 nvmet_req_complete(&cmd->req,
840 NVME_SC_INVALID_FIELD | NVME_SC_DNR);
841 return -EPROTO;
842 }
843
844 cmd->pdu_len = le32_to_cpu(data->data_length);
845 cmd->pdu_recv = 0;
846 nvmet_tcp_map_pdu_iovec(cmd);
847 queue->cmd = cmd;
848 queue->rcv_state = NVMET_TCP_RECV_DATA;
849
850 return 0;
851}
852
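/*
 * Called once a complete PDU header has been read: handle icreq while the
 * queue is still connecting, hand H2C data PDUs to the data path, or start
 * a new command capsule - receiving inline data, queueing an R2T, or
 * executing the request, as the command requires.
 */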
853static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
854{
855 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
856 struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
857 struct nvmet_req *req;
858 int ret;
859
860 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
861 if (hdr->type != nvme_tcp_icreq) {
862 pr_err("unexpected pdu type (%d) before icreq\n",
863 hdr->type);
864 nvmet_tcp_fatal_error(queue);
865 return -EPROTO;
866 }
867 return nvmet_tcp_handle_icreq(queue);
868 }
869
870 if (hdr->type == nvme_tcp_h2c_data) {
871 ret = nvmet_tcp_handle_h2c_data_pdu(queue);
872 if (unlikely(ret))
873 return ret;
874 return 0;
875 }
876
877 queue->cmd = nvmet_tcp_get_cmd(queue);
878 if (unlikely(!queue->cmd)) {
879 /* This should never happen */
880 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
881 queue->idx, queue->nr_cmds, queue->send_list_len,
882 nvme_cmd->common.opcode);
883 nvmet_tcp_fatal_error(queue);
884 return -ENOMEM;
885 }
886
887 req = &queue->cmd->req;
888 memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
889
890 if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
891 &queue->nvme_sq, &nvmet_tcp_ops))) {
892 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
893 req->cmd, req->cmd->common.command_id,
894 req->cmd->common.opcode,
895 le32_to_cpu(req->cmd->common.dptr.sgl.length));
896
897 nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
898 return -EAGAIN;
899 }
900
901 ret = nvmet_tcp_map_data(queue->cmd);
902 if (unlikely(ret)) {
903 pr_err("queue %d: failed to map data\n", queue->idx);
904 if (nvmet_tcp_has_inline_data(queue->cmd))
905 nvmet_tcp_fatal_error(queue);
906 else
907 nvmet_req_complete(req, ret);
908 ret = -EAGAIN;
909 goto out;
910 }
911
912 if (nvmet_tcp_need_data_in(queue->cmd)) {
913 if (nvmet_tcp_has_inline_data(queue->cmd)) {
914 queue->rcv_state = NVMET_TCP_RECV_DATA;
915 nvmet_tcp_map_pdu_iovec(queue->cmd);
916 return 0;
917 }
918 /* send back R2T */
919 nvmet_tcp_queue_response(&queue->cmd->req);
920 goto out;
921 }
922
923 nvmet_req_execute(&queue->cmd->req);
924out:
925 nvmet_prepare_receive_pdu(queue);
926 return ret;
927}
928
929static const u8 nvme_tcp_pdu_sizes[] = {
930 [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu),
931 [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu),
932 [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu),
933};
934
935static inline u8 nvmet_tcp_pdu_size(u8 type)
936{
937 size_t idx = type;
938
939 return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
940 nvme_tcp_pdu_sizes[idx]) ?
941 nvme_tcp_pdu_sizes[idx] : 0;
942}
943
944static inline bool nvmet_tcp_pdu_valid(u8 type)
945{
946 switch (type) {
947 case nvme_tcp_icreq:
948 case nvme_tcp_cmd:
949 case nvme_tcp_h2c_data:
950 /* fallthru */
951 return true;
952 }
953
954 return false;
955}
956
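/*
 * Receive a PDU header in two steps: first the common nvme_tcp_hdr, then
 * (once the type and hlen have been validated) the remainder of the header
 * including any header digest, which is verified before the PDU is
 * dispatched.
 */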
957static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
958{
959 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
960 int len;
961 struct kvec iov;
962 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
963
964recv:
965 iov.iov_base = (void *)&queue->pdu + queue->offset;
966 iov.iov_len = queue->left;
967 len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
968 iov.iov_len, msg.msg_flags);
969 if (unlikely(len < 0))
970 return len;
971
972 queue->offset += len;
973 queue->left -= len;
974 if (queue->left)
975 return -EAGAIN;
976
977 if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
978 u8 hdgst = nvmet_tcp_hdgst_len(queue);
979
980 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
981 pr_err("unexpected pdu type %d\n", hdr->type);
982 nvmet_tcp_fatal_error(queue);
983 return -EIO;
984 }
985
986 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
987 pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
988 return -EIO;
989 }
990
991 queue->left = hdr->hlen - queue->offset + hdgst;
992 goto recv;
993 }
994
995 if (queue->hdr_digest &&
996 nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
997 nvmet_tcp_fatal_error(queue); /* fatal */
998 return -EPROTO;
999 }
1000
1001 if (queue->data_digest &&
1002 nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1003 nvmet_tcp_fatal_error(queue); /* fatal */
1004 return -EPROTO;
1005 }
1006
1007 return nvmet_tcp_done_recv_pdu(queue);
1008}
1009
1010static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1011{
1012 struct nvmet_tcp_queue *queue = cmd->queue;
1013
1014 nvmet_tcp_ddgst(queue->rcv_hash, cmd);
1015 queue->offset = 0;
1016 queue->left = NVME_TCP_DIGEST_LENGTH;
1017 queue->rcv_state = NVMET_TCP_RECV_DDGST;
1018}
1019
1020static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1021{
1022 struct nvmet_tcp_cmd *cmd = queue->cmd;
1023 int ret;
1024
1025 while (msg_data_left(&cmd->recv_msg)) {
1026 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1027 cmd->recv_msg.msg_flags);
1028 if (ret <= 0)
1029 return ret;
1030
1031 cmd->pdu_recv += ret;
1032 cmd->rbytes_done += ret;
1033 }
1034
1035 nvmet_tcp_unmap_pdu_iovec(cmd);
1036
1037 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1038 cmd->rbytes_done == cmd->req.transfer_len) {
1039 if (queue->data_digest) {
1040 nvmet_tcp_prep_recv_ddgst(cmd);
1041 return 0;
1042 }
1043 nvmet_req_execute(&cmd->req);
1044 }
1045
1046 nvmet_prepare_receive_pdu(queue);
1047 return 0;
1048}
1049
1050static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1051{
1052 struct nvmet_tcp_cmd *cmd = queue->cmd;
1053 int ret;
1054 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1055 struct kvec iov = {
1056 .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1057 .iov_len = queue->left
1058 };
1059
1060 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1061 iov.iov_len, msg.msg_flags);
1062 if (unlikely(ret < 0))
1063 return ret;
1064
1065 queue->offset += ret;
1066 queue->left -= ret;
1067 if (queue->left)
1068 return -EAGAIN;
1069
1070 if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1071 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1072 queue->idx, cmd->req.cmd->common.command_id,
1073 queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1074 le32_to_cpu(cmd->exp_ddgst));
1075 nvmet_tcp_finish_cmd(cmd);
1076 nvmet_tcp_fatal_error(queue);
1077 ret = -EPROTO;
1078 goto out;
1079 }
1080
1081 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1082 cmd->rbytes_done == cmd->req.transfer_len)
1083 nvmet_req_execute(&cmd->req);
1084 ret = 0;
1085out:
1086 nvmet_prepare_receive_pdu(queue);
1087 return ret;
1088}
1089
1090static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1091{
1092 int result;
1093
1094 if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1095 return 0;
1096
1097 if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1098 result = nvmet_tcp_try_recv_pdu(queue);
1099 if (result != 0)
1100 goto done_recv;
1101 }
1102
1103 if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1104 result = nvmet_tcp_try_recv_data(queue);
1105 if (result != 0)
1106 goto done_recv;
1107 }
1108
1109 if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1110 result = nvmet_tcp_try_recv_ddgst(queue);
1111 if (result != 0)
1112 goto done_recv;
1113 }
1114
1115done_recv:
1116 if (result < 0) {
1117 if (result == -EAGAIN)
1118 return 0;
1119 return result;
1120 }
1121 return 1;
1122}
1123
1124static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1125 int budget, int *recvs)
1126{
1127 int i, ret = 0;
1128
1129 for (i = 0; i < budget; i++) {
1130 ret = nvmet_tcp_try_recv_one(queue);
1131 if (ret <= 0)
1132 break;
1133 (*recvs)++;
1134 }
1135
1136 return ret;
1137}
1138
1139static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1140{
1141 spin_lock(&queue->state_lock);
1142 if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1143 queue->state = NVMET_TCP_Q_DISCONNECTING;
1144 schedule_work(&queue->release_work);
1145 }
1146 spin_unlock(&queue->state_lock);
1147}
1148
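/*
 * Per-queue I/O worker: alternate between bounded receive and send passes
 * until nothing is pending or the overall budget is consumed, requeueing
 * itself in the latter case. Socket-level errors (EPIPE/ECONNRESET) shut
 * the socket down; anything else is treated as a fatal queue error.
 */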
1149static void nvmet_tcp_io_work(struct work_struct *w)
1150{
1151 struct nvmet_tcp_queue *queue =
1152 container_of(w, struct nvmet_tcp_queue, io_work);
1153 bool pending;
1154 int ret, ops = 0;
1155
1156 do {
1157 pending = false;
1158
1159 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1160 if (ret > 0) {
1161 pending = true;
1162 } else if (ret < 0) {
1163 if (ret == -EPIPE || ret == -ECONNRESET)
1164 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1165 else
1166 nvmet_tcp_fatal_error(queue);
1167 return;
1168 }
1169
1170 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1171 if (ret > 0) {
1172 /* transmitted message/data */
1173 pending = true;
1174 } else if (ret < 0) {
1175 if (ret == -EPIPE || ret == -ECONNRESET)
1176 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1177 else
1178 nvmet_tcp_fatal_error(queue);
1179 return;
1180 }
1181
1182 } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1183
1184 /*
1185 * We exhausted our budget, requeue ourselves
1186 */
1187 if (pending)
1188 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1189}
1190
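/*
 * Allocate the per-command PDU buffers (command, response, C2H data and
 * R2T) from the queue's page-frag cache, each sized with room for a
 * trailing header digest, and put the command on the queue's free list.
 */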
1191static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1192 struct nvmet_tcp_cmd *c)
1193{
1194 u8 hdgst = nvmet_tcp_hdgst_len(queue);
1195
1196 c->queue = queue;
1197 c->req.port = queue->port->nport;
1198
1199 c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1200 sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1201 if (!c->cmd_pdu)
1202 return -ENOMEM;
1203 c->req.cmd = &c->cmd_pdu->cmd;
1204
1205 c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1206 sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1207 if (!c->rsp_pdu)
1208 goto out_free_cmd;
1209 c->req.rsp = &c->rsp_pdu->cqe;
1210
1211 c->data_pdu = page_frag_alloc(&queue->pf_cache,
1212 sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1213 if (!c->data_pdu)
1214 goto out_free_rsp;
1215
1216 c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1217 sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1218 if (!c->r2t_pdu)
1219 goto out_free_data;
1220
1221 c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1222
1223 list_add_tail(&c->entry, &queue->free_list);
1224
1225 return 0;
1226out_free_data:
1227 page_frag_free(c->data_pdu);
1228out_free_rsp:
1229 page_frag_free(c->rsp_pdu);
1230out_free_cmd:
1231 page_frag_free(c->cmd_pdu);
1232 return -ENOMEM;
1233}
1234
1235static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1236{
1237 page_frag_free(c->r2t_pdu);
1238 page_frag_free(c->data_pdu);
1239 page_frag_free(c->rsp_pdu);
1240 page_frag_free(c->cmd_pdu);
1241}
1242
1243static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1244{
1245 struct nvmet_tcp_cmd *cmds;
1246 int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1247
1248 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1249 if (!cmds)
1250 goto out;
1251
1252 for (i = 0; i < nr_cmds; i++) {
1253 ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1254 if (ret)
1255 goto out_free;
1256 }
1257
1258 queue->cmds = cmds;
1259
1260 return 0;
1261out_free:
1262 while (--i >= 0)
1263 nvmet_tcp_free_cmd(cmds + i);
1264 kfree(cmds);
1265out:
1266 return ret;
1267}
1268
1269static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1270{
1271 struct nvmet_tcp_cmd *cmds = queue->cmds;
1272 int i;
1273
1274 for (i = 0; i < queue->nr_cmds; i++)
1275 nvmet_tcp_free_cmd(cmds + i);
1276
1277 nvmet_tcp_free_cmd(&queue->connect);
1278 kfree(cmds);
1279}
1280
1281static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1282{
1283 struct socket *sock = queue->sock;
1284
1285 write_lock_bh(&sock->sk->sk_callback_lock);
1286 sock->sk->sk_data_ready = queue->data_ready;
1287 sock->sk->sk_state_change = queue->state_change;
1288 sock->sk->sk_write_space = queue->write_space;
1289 sock->sk->sk_user_data = NULL;
1290 write_unlock_bh(&sock->sk->sk_callback_lock);
1291}
1292
1293static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1294{
1295 nvmet_req_uninit(&cmd->req);
1296 nvmet_tcp_unmap_pdu_iovec(cmd);
1297 sgl_free(cmd->req.sg);
1298}
1299
1300static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1301{
1302 struct nvmet_tcp_cmd *cmd = queue->cmds;
1303 int i;
1304
1305 for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1306 if (nvmet_tcp_need_data_in(cmd))
1307 nvmet_tcp_finish_cmd(cmd);
1308 }
1309
1310 if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1311 /* failed in connect */
1312 nvmet_tcp_finish_cmd(&queue->connect);
1313 }
1314}
1315
1316static void nvmet_tcp_release_queue_work(struct work_struct *w)
1317{
1318 struct nvmet_tcp_queue *queue =
1319 container_of(w, struct nvmet_tcp_queue, release_work);
1320
1321 mutex_lock(&nvmet_tcp_queue_mutex);
1322 list_del_init(&queue->queue_list);
1323 mutex_unlock(&nvmet_tcp_queue_mutex);
1324
1325 nvmet_tcp_restore_socket_callbacks(queue);
1326 flush_work(&queue->io_work);
1327
1328 nvmet_tcp_uninit_data_in_cmds(queue);
1329 nvmet_sq_destroy(&queue->nvme_sq);
1330 cancel_work_sync(&queue->io_work);
1331 sock_release(queue->sock);
1332 nvmet_tcp_free_cmds(queue);
1333 if (queue->hdr_digest || queue->data_digest)
1334 nvmet_tcp_free_crypto(queue);
1335 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1336
1337 kfree(queue);
1338}
1339
1340static void nvmet_tcp_data_ready(struct sock *sk)
1341{
1342 struct nvmet_tcp_queue *queue;
1343
1344 read_lock_bh(&sk->sk_callback_lock);
1345 queue = sk->sk_user_data;
1346 if (likely(queue))
1347 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1348 read_unlock_bh(&sk->sk_callback_lock);
1349}
1350
1351static void nvmet_tcp_write_space(struct sock *sk)
1352{
1353 struct nvmet_tcp_queue *queue;
1354
1355 read_lock_bh(&sk->sk_callback_lock);
1356 queue = sk->sk_user_data;
1357 if (unlikely(!queue))
1358 goto out;
1359
1360 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1361 queue->write_space(sk);
1362 goto out;
1363 }
1364
1365 if (sk_stream_is_writeable(sk)) {
1366 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1367 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1368 }
1369out:
1370 read_unlock_bh(&sk->sk_callback_lock);
1371}
1372
1373static void nvmet_tcp_state_change(struct sock *sk)
1374{
1375 struct nvmet_tcp_queue *queue;
1376
1377 write_lock_bh(&sk->sk_callback_lock);
1378 queue = sk->sk_user_data;
1379 if (!queue)
1380 goto done;
1381
1382 switch (sk->sk_state) {
1383 case TCP_FIN_WAIT1:
1384 case TCP_CLOSE_WAIT:
1385 case TCP_CLOSE:
1386 /* FALLTHRU */
1387 sk->sk_user_data = NULL;
1388 nvmet_tcp_schedule_release_queue(queue);
1389 break;
1390 default:
1391 pr_warn("queue %d unhandled state %d\n",
1392 queue->idx, sk->sk_state);
1393 }
1394done:
1395 write_unlock_bh(&sk->sk_callback_lock);
1396}
1397
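/*
 * Finish socket setup for an accepted queue: record the local and peer
 * addresses, enable SO_LINGER with a zero timeout (see the comment below),
 * and install the nvmet-tcp data_ready/state_change/write_space callbacks
 * while saving the originals for restoration at teardown.
 */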
1398static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1399{
1400 struct socket *sock = queue->sock;
1401 struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1402 int ret;
1403
1404 ret = kernel_getsockname(sock,
1405 (struct sockaddr *)&queue->sockaddr);
1406 if (ret < 0)
1407 return ret;
1408
1409 ret = kernel_getpeername(sock,
1410 (struct sockaddr *)&queue->sockaddr_peer);
1411 if (ret < 0)
1412 return ret;
1413
1414 /*
1415 * Cleanup whatever is sitting in the TCP transmit queue on socket
1416 * close. This is done to prevent stale data from being sent should
1417 * the network connection be restored before TCP times out.
1418 */
1419 ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
1420 (char *)&sol, sizeof(sol));
1421 if (ret)
1422 return ret;
1423
1424 write_lock_bh(&sock->sk->sk_callback_lock);
1425 sock->sk->sk_user_data = queue;
1426 queue->data_ready = sock->sk->sk_data_ready;
1427 sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1428 queue->state_change = sock->sk->sk_state_change;
1429 sock->sk->sk_state_change = nvmet_tcp_state_change;
1430 queue->write_space = sock->sk->sk_write_space;
1431 sock->sk->sk_write_space = nvmet_tcp_write_space;
1432 write_unlock_bh(&sock->sk->sk_callback_lock);
1433
1434 return 0;
1435}
1436
1437static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1438 struct socket *newsock)
1439{
1440 struct nvmet_tcp_queue *queue;
1441 int ret;
1442
1443 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1444 if (!queue)
1445 return -ENOMEM;
1446
1447 INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1448 INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1449 queue->sock = newsock;
1450 queue->port = port;
1451 queue->nr_cmds = 0;
1452 spin_lock_init(&queue->state_lock);
1453 queue->state = NVMET_TCP_Q_CONNECTING;
1454 INIT_LIST_HEAD(&queue->free_list);
1455 init_llist_head(&queue->resp_list);
1456 INIT_LIST_HEAD(&queue->resp_send_list);
1457
1458 queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1459 if (queue->idx < 0) {
1460 ret = queue->idx;
1461 goto out_free_queue;
1462 }
1463
1464 ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1465 if (ret)
1466 goto out_ida_remove;
1467
1468 ret = nvmet_sq_init(&queue->nvme_sq);
1469 if (ret)
1470 goto out_free_connect;
1471
1472 port->last_cpu = cpumask_next_wrap(port->last_cpu,
1473 cpu_online_mask, -1, false);
1474 queue->cpu = port->last_cpu;
1475 nvmet_prepare_receive_pdu(queue);
1476
1477 mutex_lock(&nvmet_tcp_queue_mutex);
1478 list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1479 mutex_unlock(&nvmet_tcp_queue_mutex);
1480
1481 ret = nvmet_tcp_set_queue_sock(queue);
1482 if (ret)
1483 goto out_destroy_sq;
1484
1485 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1486
1487 return 0;
1488out_destroy_sq:
1489 mutex_lock(&nvmet_tcp_queue_mutex);
1490 list_del_init(&queue->queue_list);
1491 mutex_unlock(&nvmet_tcp_queue_mutex);
1492 nvmet_sq_destroy(&queue->nvme_sq);
1493out_free_connect:
1494 nvmet_tcp_free_cmd(&queue->connect);
1495out_ida_remove:
1496 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1497out_free_queue:
1498 kfree(queue);
1499 return ret;
1500}
1501
1502static void nvmet_tcp_accept_work(struct work_struct *w)
1503{
1504 struct nvmet_tcp_port *port =
1505 container_of(w, struct nvmet_tcp_port, accept_work);
1506 struct socket *newsock;
1507 int ret;
1508
1509 while (true) {
1510 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1511 if (ret < 0) {
1512 if (ret != -EAGAIN)
1513 pr_warn("failed to accept err=%d\n", ret);
1514 return;
1515 }
1516 ret = nvmet_tcp_alloc_queue(port, newsock);
1517 if (ret) {
1518 pr_err("failed to allocate queue\n");
1519 sock_release(newsock);
1520 }
1521 }
1522}
1523
1524static void nvmet_tcp_listen_data_ready(struct sock *sk)
1525{
1526 struct nvmet_tcp_port *port;
1527
1528 read_lock_bh(&sk->sk_callback_lock);
1529 port = sk->sk_user_data;
1530 if (!port)
1531 goto out;
1532
1533 if (sk->sk_state == TCP_LISTEN)
1534 schedule_work(&port->accept_work);
1535out:
1536 read_unlock_bh(&sk->sk_callback_lock);
1537}
1538
1539static int nvmet_tcp_add_port(struct nvmet_port *nport)
1540{
1541 struct nvmet_tcp_port *port;
1542 __kernel_sa_family_t af;
1543 int opt, ret;
1544
1545 port = kzalloc(sizeof(*port), GFP_KERNEL);
1546 if (!port)
1547 return -ENOMEM;
1548
1549 switch (nport->disc_addr.adrfam) {
1550 case NVMF_ADDR_FAMILY_IP4:
1551 af = AF_INET;
1552 break;
1553 case NVMF_ADDR_FAMILY_IP6:
1554 af = AF_INET6;
1555 break;
1556 default:
1557 pr_err("address family %d not supported\n",
1558 nport->disc_addr.adrfam);
1559 ret = -EINVAL;
1560 goto err_port;
1561 }
1562
1563 ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1564 nport->disc_addr.trsvcid, &port->addr);
1565 if (ret) {
1566 pr_err("malformed ip/port passed: %s:%s\n",
1567 nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1568 goto err_port;
1569 }
1570
1571 port->nport = nport;
1572 port->last_cpu = -1;
1573 INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1574 if (port->nport->inline_data_size < 0)
1575 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1576
1577 ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1578 IPPROTO_TCP, &port->sock);
1579 if (ret) {
1580 pr_err("failed to create a socket\n");
1581 goto err_port;
1582 }
1583
1584 port->sock->sk->sk_user_data = port;
1585 port->data_ready = port->sock->sk->sk_data_ready;
1586 port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1587
1588 opt = 1;
1589 ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
1590 TCP_NODELAY, (char *)&opt, sizeof(opt));
1591 if (ret) {
1592 pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
1593 goto err_sock;
1594 }
1595
1596 ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
1597 (char *)&opt, sizeof(opt));
1598 if (ret) {
1599 pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
1600 goto err_sock;
1601 }
1602
1603 ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1604 sizeof(port->addr));
1605 if (ret) {
1606 pr_err("failed to bind port socket %d\n", ret);
1607 goto err_sock;
1608 }
1609
1610 ret = kernel_listen(port->sock, 128);
1611 if (ret) {
1612 pr_err("failed to listen %d on port sock\n", ret);
1613 goto err_sock;
1614 }
1615
1616 nport->priv = port;
1617 pr_info("enabling port %d (%pISpc)\n",
1618 le16_to_cpu(nport->disc_addr.portid), &port->addr);
1619
1620 return 0;
1621
1622err_sock:
1623 sock_release(port->sock);
1624err_port:
1625 kfree(port);
1626 return ret;
1627}
1628
1629static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1630{
1631 struct nvmet_tcp_port *port = nport->priv;
1632
1633 write_lock_bh(&port->sock->sk->sk_callback_lock);
1634 port->sock->sk->sk_data_ready = port->data_ready;
1635 port->sock->sk->sk_user_data = NULL;
1636 write_unlock_bh(&port->sock->sk->sk_callback_lock);
1637 cancel_work_sync(&port->accept_work);
1638
1639 sock_release(port->sock);
1640 kfree(port);
1641}
1642
1643static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1644{
1645 struct nvmet_tcp_queue *queue;
1646
1647 mutex_lock(&nvmet_tcp_queue_mutex);
1648 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1649 if (queue->nvme_sq.ctrl == ctrl)
1650 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1651 mutex_unlock(&nvmet_tcp_queue_mutex);
1652}
1653
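/*
 * The fabrics connect command has established the queue size: allocate the
 * command pool at twice the SQ size, presumably to leave headroom for
 * responses still in flight, after letting any pending controller teardown
 * finish when this is the admin queue.
 */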
1654static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1655{
1656 struct nvmet_tcp_queue *queue =
1657 container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1658
1659 if (sq->qid == 0) {
1660 /* Let inflight controller teardown complete */
1661 flush_scheduled_work();
1662 }
1663
1664 queue->nr_cmds = sq->size * 2;
1665 if (nvmet_tcp_alloc_cmds(queue))
1666 return NVME_SC_INTERNAL;
1667 return 0;
1668}
1669
1670static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1671 struct nvmet_port *nport, char *traddr)
1672{
1673 struct nvmet_tcp_port *port = nport->priv;
1674
1675 if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1676 struct nvmet_tcp_cmd *cmd =
1677 container_of(req, struct nvmet_tcp_cmd, req);
1678 struct nvmet_tcp_queue *queue = cmd->queue;
1679
1680 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1681 } else {
1682 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1683 }
1684}
1685
1686static struct nvmet_fabrics_ops nvmet_tcp_ops = {
1687 .owner = THIS_MODULE,
1688 .type = NVMF_TRTYPE_TCP,
1689 .msdbd = 1,
1690 .has_keyed_sgls = 0,
1691 .add_port = nvmet_tcp_add_port,
1692 .remove_port = nvmet_tcp_remove_port,
1693 .queue_response = nvmet_tcp_queue_response,
1694 .delete_ctrl = nvmet_tcp_delete_ctrl,
1695 .install_queue = nvmet_tcp_install_queue,
1696 .disc_traddr = nvmet_tcp_disc_port_addr,
1697};
1698
1699static int __init nvmet_tcp_init(void)
1700{
1701 int ret;
1702
1703 nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1704 if (!nvmet_tcp_wq)
1705 return -ENOMEM;
1706
1707 ret = nvmet_register_transport(&nvmet_tcp_ops);
1708 if (ret)
1709 goto err;
1710
1711 return 0;
1712err:
1713 destroy_workqueue(nvmet_tcp_wq);
1714 return ret;
1715}
1716
1717static void __exit nvmet_tcp_exit(void)
1718{
1719 struct nvmet_tcp_queue *queue;
1720
1721 nvmet_unregister_transport(&nvmet_tcp_ops);
1722
1723 flush_scheduled_work();
1724 mutex_lock(&nvmet_tcp_queue_mutex);
1725 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1726 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1727 mutex_unlock(&nvmet_tcp_queue_mutex);
1728 flush_scheduled_work();
1729
1730 destroy_workqueue(nvmet_tcp_wq);
1731}
1732
1733module_init(nvmet_tcp_init);
1734module_exit(nvmet_tcp_exit);
1735
1736MODULE_LICENSE("GPL v2");
1737MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 2016e0ed5865..8e26001dc11c 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -412,6 +412,7 @@ static int dasd_ioctl_information(struct dasd_block *block,
 	struct ccw_dev_id dev_id;
 	struct dasd_device *base;
 	struct ccw_device *cdev;
+	struct list_head *l;
 	unsigned long flags;
 	int rc;
 
@@ -462,23 +463,10 @@ static int dasd_ioctl_information(struct dasd_block *block,
 
 	memcpy(dasd_info->type, base->discipline->name, 4);
 
-	if (block->request_queue->request_fn) {
-		struct list_head *l;
-#ifdef DASD_EXTENDED_PROFILING
-		{
-			struct list_head *l;
-			spin_lock_irqsave(&block->lock, flags);
-			list_for_each(l, &block->request_queue->queue_head)
-				dasd_info->req_queue_len++;
-			spin_unlock_irqrestore(&block->lock, flags);
-		}
-#endif /* DASD_EXTENDED_PROFILING */
-		spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags);
-		list_for_each(l, &base->ccw_queue)
-			dasd_info->chanq_len++;
-		spin_unlock_irqrestore(get_ccwdev_lock(base->cdev),
-				       flags);
-	}
+	spin_lock_irqsave(&block->queue_lock, flags);
+	list_for_each(l, &base->ccw_queue)
+		dasd_info->chanq_len++;
+	spin_unlock_irqrestore(&block->queue_lock, flags);
 
 	rc = 0;
 	if (copy_to_user(argp, dasd_info,
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 640cd1b31a18..f38882f6f37d 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -50,18 +50,6 @@ config SCSI_NETLINK
 	default n
 	depends on NET
 
-config SCSI_MQ_DEFAULT
-	bool "SCSI: use blk-mq I/O path by default"
-	default y
-	depends on SCSI
-	---help---
-	  This option enables the blk-mq based I/O path for SCSI devices by
-	  default. With this option the scsi_mod.use_blk_mq module/boot
-	  option defaults to Y, without it to N, but it can still be
-	  overridden either way.
-
-	  If unsure say Y.
-
 config SCSI_PROC_FS
 	bool "legacy /proc/scsi/ support"
 	depends on SCSI && PROC_FS
diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c
index e9e669a6c2bc..6bad2689edd4 100644
--- a/drivers/scsi/bnx2i/bnx2i_hwi.c
+++ b/drivers/scsi/bnx2i/bnx2i_hwi.c
@@ -1906,7 +1906,6 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session,
 	struct iscsi_task *task;
 	struct scsi_cmnd *sc;
 	int rc = 0;
-	int cpu;
 
 	spin_lock(&session->back_lock);
 	task = iscsi_itt_to_task(bnx2i_conn->cls_conn->dd_data,
@@ -1917,14 +1916,9 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session,
 	}
 	sc = task->sc;
 
-	if (!blk_rq_cpu_valid(sc->request))
-		cpu = smp_processor_id();
-	else
-		cpu = sc->request->cpu;
-
 	spin_unlock(&session->back_lock);
 
-	p = &per_cpu(bnx2i_percpu, cpu);
+	p = &per_cpu(bnx2i_percpu, blk_mq_rq_cpu(sc->request));
 	spin_lock(&p->p_work_lock);
 	if (unlikely(!p->iothread)) {
 		rc = -EINVAL;
diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c
index 8c15b7acb4b7..a95debbea0e4 100644
--- a/drivers/scsi/csiostor/csio_scsi.c
+++ b/drivers/scsi/csiostor/csio_scsi.c
@@ -1780,16 +1780,10 @@ csio_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmnd)
 	int nsge = 0;
 	int rv = SCSI_MLQUEUE_HOST_BUSY, nr;
 	int retval;
-	int cpu;
 	struct csio_scsi_qset *sqset;
 	struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
 
-	if (!blk_rq_cpu_valid(cmnd->request))
-		cpu = smp_processor_id();
-	else
-		cpu = cmnd->request->cpu;
-
-	sqset = &hw->sqset[ln->portid][cpu];
+	sqset = &hw->sqset[ln->portid][blk_mq_rq_cpu(cmnd->request)];
 
 	nr = fc_remote_port_chkready(rport);
 	if (nr) {
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index 6637116529aa..abdc9eac4173 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -3088,12 +3088,6 @@ static ssize_t hwq_mode_store(struct device *dev,
 		return -EINVAL;
 	}
 
-	if ((mode == HWQ_MODE_TAG) && !shost_use_blk_mq(shost)) {
-		dev_info(cfgdev, "SCSI-MQ is not enabled, use a different "
-			 "HWQ steering mode.\n");
-		return -EINVAL;
-	}
-
 	afu->hwq_mode = mode;
 
 	return count;
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index 12dc7100bb4c..d7ac498ba35a 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -1071,28 +1071,29 @@ static void alua_check(struct scsi_device *sdev, bool force)
  * Fail I/O to all paths not in state
  * active/optimized or active/non-optimized.
  */
-static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t alua_prep_fn(struct scsi_device *sdev, struct request *req)
 {
 	struct alua_dh_data *h = sdev->handler_data;
 	struct alua_port_group *pg;
 	unsigned char state = SCSI_ACCESS_STATE_OPTIMAL;
-	int ret = BLKPREP_OK;
 
 	rcu_read_lock();
 	pg = rcu_dereference(h->pg);
 	if (pg)
 		state = pg->state;
 	rcu_read_unlock();
-	if (state == SCSI_ACCESS_STATE_TRANSITIONING)
-		ret = BLKPREP_DEFER;
-	else if (state != SCSI_ACCESS_STATE_OPTIMAL &&
-		 state != SCSI_ACCESS_STATE_ACTIVE &&
-		 state != SCSI_ACCESS_STATE_LBA) {
-		ret = BLKPREP_KILL;
+
+	switch (state) {
+	case SCSI_ACCESS_STATE_OPTIMAL:
+	case SCSI_ACCESS_STATE_ACTIVE:
+	case SCSI_ACCESS_STATE_LBA:
+		return BLK_STS_OK;
+	case SCSI_ACCESS_STATE_TRANSITIONING:
+		return BLK_STS_RESOURCE;
+	default:
 		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
 	}
-	return ret;
-
 }
 
 static void alua_rescan(struct scsi_device *sdev)
diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c
index 95c47909a58f..bea8e13febb6 100644
--- a/drivers/scsi/device_handler/scsi_dh_emc.c
+++ b/drivers/scsi/device_handler/scsi_dh_emc.c
@@ -341,17 +341,17 @@ static int clariion_check_sense(struct scsi_device *sdev,
 	return SCSI_RETURN_NOT_HANDLED;
 }
 
-static int clariion_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t clariion_prep_fn(struct scsi_device *sdev,
+		struct request *req)
 {
 	struct clariion_dh_data *h = sdev->handler_data;
-	int ret = BLKPREP_OK;
 
 	if (h->lun_state != CLARIION_LUN_OWNED) {
-		ret = BLKPREP_KILL;
 		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
 	}
-	return ret;
 
+	return BLK_STS_OK;
 }
 
 static int clariion_std_inquiry(struct scsi_device *sdev,
diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
index e65a0ebb4b54..80129b033855 100644
--- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c
+++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
@@ -172,17 +172,16 @@ retry:
 	return rc;
 }
 
-static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
 {
 	struct hp_sw_dh_data *h = sdev->handler_data;
-	int ret = BLKPREP_OK;
 
 	if (h->path_state != HP_SW_PATH_ACTIVE) {
-		ret = BLKPREP_KILL;
 		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
 	}
-	return ret;
 
+	return BLK_STS_OK;
 }
 
 /*
diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c
index d27fabae8ddd..65f1fe343c64 100644
--- a/drivers/scsi/device_handler/scsi_dh_rdac.c
+++ b/drivers/scsi/device_handler/scsi_dh_rdac.c
@@ -642,17 +642,16 @@ done:
 	return 0;
 }
 
-static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t rdac_prep_fn(struct scsi_device *sdev, struct request *req)
 {
 	struct rdac_dh_data *h = sdev->handler_data;
-	int ret = BLKPREP_OK;
 
 	if (h->state != RDAC_STATE_ACTIVE) {
-		ret = BLKPREP_KILL;
 		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
 	}
-	return ret;
 
+	return BLK_STS_OK;
 }
 
 static int rdac_check_sense(struct scsi_device *sdev,
diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 96acfcecd540..cafbcfb85bfa 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -2274,7 +2274,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 		return SCSI_NO_TAG;
 
 	sc->tag = sc->request->tag = dummy->tag;
-	sc->request->special = sc;
+	sc->host_scribble = (unsigned char *)dummy;
 
 	return dummy->tag;
 }
@@ -2286,7 +2286,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 static inline void
 fnic_scsi_host_end_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 {
-	struct request *dummy = sc->request->special;
+	struct request *dummy = (struct request *)sc->host_scribble;
 
 	blk_mq_free_request(dummy);
 }
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index ea4b0bb0c1cd..cc71136ba300 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -222,18 +222,9 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 	if (error)
 		goto fail;
 
-	if (shost_use_blk_mq(shost)) {
-		error = scsi_mq_setup_tags(shost);
-		if (error)
-			goto fail;
-	} else {
-		shost->bqt = blk_init_tags(shost->can_queue,
-				shost->hostt->tag_alloc_policy);
-		if (!shost->bqt) {
-			error = -ENOMEM;
-			goto fail;
-		}
-	}
+	error = scsi_mq_setup_tags(shost);
+	if (error)
+		goto fail;
 
 	if (!shost->shost_gendev.parent)
 		shost->shost_gendev.parent = dev ? dev : &platform_bus;
@@ -309,8 +300,7 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 	pm_runtime_disable(&shost->shost_gendev);
 	pm_runtime_set_suspended(&shost->shost_gendev);
 	pm_runtime_put_noidle(&shost->shost_gendev);
-	if (shost_use_blk_mq(shost))
-		scsi_mq_destroy_tags(shost);
+	scsi_mq_destroy_tags(shost);
  fail:
 	return error;
 }
@@ -344,13 +334,8 @@ static void scsi_host_dev_release(struct device *dev)
 		kfree(dev_name(&shost->shost_dev));
 	}
 
-	if (shost_use_blk_mq(shost)) {
-		if (shost->tag_set.tags)
-			scsi_mq_destroy_tags(shost);
-	} else {
-		if (shost->bqt)
-			blk_free_tags(shost->bqt);
-	}
+	if (shost->tag_set.tags)
+		scsi_mq_destroy_tags(shost);
 
 	kfree(shost->shost_data);
 
@@ -472,8 +457,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	else
 		shost->dma_boundary = 0xffffffff;
 
-	shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq;
-
 	device_initialize(&shost->shost_gendev);
 	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
 	shost->shost_gendev.bus = &scsi_bus_type;
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 4f6cdf53e913..c90b278cc28c 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -601,12 +601,7 @@ void sas_ata_task_abort(struct sas_task *task)
 
 	/* Bounce SCSI-initiated commands to the SCSI EH */
 	if (qc->scsicmd) {
-		struct request_queue *q = qc->scsicmd->device->request_queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
 		blk_abort_request(qc->scsicmd->request);
-		spin_unlock_irqrestore(q->queue_lock, flags);
 		return;
 	}
 
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 33229348dcb6..af085432c5fe 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -930,16 +930,10 @@ void sas_task_abort(struct sas_task *task)
 		return;
 	}
 
-	if (dev_is_sata(task->dev)) {
+	if (dev_is_sata(task->dev))
 		sas_ata_task_abort(task);
-	} else {
-		struct request_queue *q = sc->device->request_queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
+	else
 		blk_abort_request(sc->request);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
 }
 
 void sas_target_destroy(struct scsi_target *starget)
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 4fa6703a9ec9..baed2b891efb 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -3914,7 +3914,7 @@ int lpfc_sli4_scmd_to_wqidx_distr(struct lpfc_hba *phba,
 	uint32_t tag;
 	uint16_t hwq;
 
-	if (cmnd && shost_use_blk_mq(cmnd->device->host)) {
+	if (cmnd) {
 		tag = blk_mq_unique_tag(cmnd->request);
 		hwq = blk_mq_unique_tag_to_hwq(tag);
 
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index e19fa883376f..60cf7c5eb880 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -506,11 +506,11 @@ static void osd_request_async_done(struct request *req, blk_status_t error)
 
 	_set_error_resid(or, req, error);
 	if (req->next_rq) {
-		__blk_put_request(req->q, req->next_rq);
+		blk_put_request(req->next_rq);
 		req->next_rq = NULL;
 	}
 
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 	or->request = NULL;
 	or->in.req = NULL;
 	or->out.req = NULL;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 7a1a1edde35d..664c1238a87f 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -341,7 +341,7 @@ static void osst_end_async(struct request *req, blk_status_t status)
 		blk_rq_unmap_user(SRpnt->bio);
 	}
 
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 }
 
 /* osst_request memory management */
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index 105b0e4d7818..311eb22068e1 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -644,8 +644,7 @@ static struct qedi_ctx *qedi_host_alloc(struct pci_dev *pdev)
 	qedi->max_active_conns = ISCSI_MAX_SESS_PER_HBA;
 	qedi->max_sqes = QEDI_SQ_SIZE;
 
-	if (shost_use_blk_mq(shost))
-		shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi);
+	shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi);
 
 	pci_set_drvdata(pdev, qedi);
 
diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
index 7e78e7eff783..fccc733145fc 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -272,17 +272,6 @@ static void qla_nvme_fcp_abort(struct nvme_fc_local_port *lport,
 	schedule_work(&priv->abort_work);
 }
 
-static void qla_nvme_poll(struct nvme_fc_local_port *lport, void *hw_queue_handle)
-{
-	struct qla_qpair *qpair = hw_queue_handle;
-	unsigned long flags;
-	struct scsi_qla_host *vha = lport->private;
-
-	spin_lock_irqsave(&qpair->qp_lock, flags);
-	qla24xx_process_response_queue(vha, qpair->rsp);
-	spin_unlock_irqrestore(&qpair->qp_lock, flags);
-}
-
 static inline int qla2x00_start_nvme_mq(srb_t *sp)
 {
 	unsigned long flags;
@@ -578,7 +567,6 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = {
 	.ls_abort = qla_nvme_ls_abort,
 	.fcp_io = qla_nvme_post_cmd,
 	.fcp_abort = qla_nvme_fcp_abort,
-	.poll_queue = qla_nvme_poll,
 	.max_hw_queues = 8,
 	.max_sgl_segments = 128,
 	.max_dif_sgl_segments = 64,
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index d0ecc729a90a..f92196ec5489 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -857,13 +857,9 @@ qla2xxx_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 	}
 
 	if (ha->mqenable) {
-		if (shost_use_blk_mq(vha->host)) {
-			tag = blk_mq_unique_tag(cmd->request);
-			hwq = blk_mq_unique_tag_to_hwq(tag);
-			qpair = ha->queue_pair_map[hwq];
-		} else if (vha->vp_idx && vha->qpair) {
-			qpair = vha->qpair;
-		}
+		tag = blk_mq_unique_tag(cmd->request);
+		hwq = blk_mq_unique_tag_to_hwq(tag);
+		qpair = ha->queue_pair_map[hwq];
 
 		if (qpair)
 			return qla2xxx_mqueuecommand(host, cmd, qpair);
@@ -1464,7 +1460,7 @@ __qla2xxx_eh_generic_reset(char *name, enum nexus_wait_type type,
 		goto eh_reset_failed;
 	}
 	err = 2;
-	if (do_reset(fcport, cmd->device->lun, cmd->request->cpu + 1)
+	if (do_reset(fcport, cmd->device->lun, blk_mq_rq_cpu(cmd->request) + 1)
 	    != QLA_SUCCESS) {
 		ql_log(ql_log_warn, vha, 0x800c,
 		    "do_reset failed for cmd=%p.\n", cmd);
@@ -3159,7 +3155,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto probe_failed;
 	}
 
-	if (ha->mqenable && shost_use_blk_mq(host)) {
+	if (ha->mqenable) {
 		/* number of hardware queues supported by blk/scsi-mq*/
 		host->nr_hw_queues = ha->max_qpairs;
 
@@ -3271,25 +3267,17 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	    base_vha->mgmt_svr_loop_id, host->sg_tablesize);
 
 	if (ha->mqenable) {
-		bool mq = false;
 		bool startit = false;
 
-		if (QLA_TGT_MODE_ENABLED()) {
-			mq = true;
+		if (QLA_TGT_MODE_ENABLED())
 			startit = false;
-		}
 
-		if ((ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED) &&
-		    shost_use_blk_mq(host)) {
-			mq = true;
+		if (ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED)
 			startit = true;
-		}
 
-		if (mq) {
-			/* Create start of day qpairs for Block MQ */
-			for (i = 0; i < ha->max_qpairs; i++)
-				qla2xxx_create_qpair(base_vha, 5, 0, startit);
-		}
+		/* Create start of day qpairs for Block MQ */
+		for (i = 0; i < ha->max_qpairs; i++)
+			qla2xxx_create_qpair(base_vha, 5, 0, startit);
 	}
 
 	if (ha->flags.running_gold_fw)
@@ -6952,11 +6940,12 @@ static int qla2xxx_map_queues(struct Scsi_Host *shost)
 {
 	int rc;
 	scsi_qla_host_t *vha = (scsi_qla_host_t *)shost->hostdata;
+	struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];
 
 	if (USER_CTRL_IRQ(vha->hw))
-		rc = blk_mq_map_queues(&shost->tag_set);
+		rc = blk_mq_map_queues(qmap);
 	else
-		rc = blk_mq_pci_map_queues(&shost->tag_set, vha->hw->pdev, 0);
+		rc = blk_mq_pci_map_queues(qmap, vha->hw->pdev, 0);
 	return rc;
 }
 
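
The qla2xxx_map_queues() change reflects the new multi-map API: map_queues callbacks now receive and fill a specific struct blk_mq_queue_map rather than the whole tag set, and the earlier hunk swaps the removed request->cpu field for blk_mq_rq_cpu(), which reports the CPU of the request's software queue. A hedged sketch of the same mapping pattern for a hypothetical PCI-based host driver; all example_* names are illustrative, and the int return matches the prototypes of this era:

    #include <linux/blk-mq.h>
    #include <linux/blk-mq-pci.h>
    #include <scsi/scsi_host.h>

    /* Illustrative map_queues(): spread hctxs either by CPU or by IRQ affinity. */
    static int example_map_queues(struct Scsi_Host *shost, struct pci_dev *pdev,
                                  bool user_controlled_irqs)
    {
        struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];

        if (user_controlled_irqs)
            return blk_mq_map_queues(qmap);          /* simple CPU spread */
        return blk_mq_pci_map_queues(qmap, pdev, 0); /* follow MSI-X affinity */
    }
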
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index fc1356d101b0..7675ff0ca2ea 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -780,11 +780,8 @@ MODULE_LICENSE("GPL");
 module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels");
 
-#ifdef CONFIG_SCSI_MQ_DEFAULT
+/* This should go away in the future, it doesn't do anything anymore */
 bool scsi_use_blk_mq = true;
-#else
-bool scsi_use_blk_mq = false;
-#endif
 module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO);
 
 static int __init init_scsi(void)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 60bcc6df97a9..4740f1e9dd17 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -5881,8 +5881,7 @@ static int sdebug_driver_probe(struct device *dev)
 	}
 	/* Decide whether to tell scsi subsystem that we want mq */
 	/* Following should give the same answer for each host */
-	if (shost_use_blk_mq(hpnt))
-		hpnt->nr_hw_queues = submit_queues;
+	hpnt->nr_hw_queues = submit_queues;
 
 	sdbg_host->shost = hpnt;
 	*((struct sdebug_host_info **)hpnt->hostdata) = sdbg_host;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c736d61b1648..16eef068e9e9 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -297,19 +297,19 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
 
 	if (rtn == BLK_EH_DONE) {
 		/*
-		 * For blk-mq, we must set the request state to complete now
-		 * before sending the request to the scsi error handler. This
-		 * will prevent a use-after-free in the event the LLD manages
-		 * to complete the request before the error handler finishes
-		 * processing this timed out request.
+		 * Set the command to complete first in order to prevent a real
+		 * completion from releasing the command while error handling
+		 * is using it. If the command was already completed, then the
+		 * lower level driver beat the timeout handler, and it is safe
+		 * to return without escalating error recovery.
 		 *
-		 * If the request was already completed, then the LLD beat the
-		 * time out handler from transferring the request to the scsi
-		 * error handler. In that case we can return immediately as no
-		 * further action is required.
+		 * If timeout handling lost the race to a real completion, the
+		 * block layer may ignore that due to a fake timeout injection,
+		 * so return RESET_TIMER to allow error handling another shot
+		 * at this command.
 		 */
-		if (req->q->mq_ops && !blk_mq_mark_complete(req))
-			return rtn;
+		if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state))
+			return BLK_EH_RESET_TIMER;
 		if (scsi_abort_command(scmd) != SUCCESS) {
 			set_host_byte(scmd, DID_TIME_OUT);
 			scsi_eh_scmd_add(scmd);
@@ -1932,7 +1932,7 @@ maybe_retry:
 
 static void eh_lock_door_done(struct request *req, blk_status_t status)
 {
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 }
 
 /**
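
The rewritten scsi_times_out() comment above describes a simple ownership handshake: both the normal completion path and the timeout handler race to atomically set a per-command "complete" bit, and whoever sets it first owns the command. A self-contained sketch of that idiom, with generic names standing in for SCMD_STATE_COMPLETE rather than the real scsi_cmnd implementation:

    #include <linux/bitops.h>
    #include <linux/types.h>

    /* Bit number standing in for SCMD_STATE_COMPLETE. */
    #define EXAMPLE_STATE_COMPLETE 0

    struct example_cmd {
        unsigned long state;
    };

    /*
     * Called from both the completion path and the timeout handler.
     * test_and_set_bit() is atomic, so exactly one caller sees "true"
     * and may go on to release or escalate the command.
     */
    static bool example_claim(struct example_cmd *cmd)
    {
        return !test_and_set_bit(EXAMPLE_STATE_COMPLETE, &cmd->state);
    }

In the hunk above, losing the race in the timeout handler means the LLD already completed the command, so BLK_EH_RESET_TIMER is returned instead of starting error recovery; scsi_mq_done() further down in this patch is the other half of the handshake.
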
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index fa6e0c3b3aa6..0dbf25512778 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -168,8 +168,6 @@ static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd)
 static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
 {
 	struct scsi_device *device = cmd->device;
-	struct request_queue *q = device->request_queue;
-	unsigned long flags;
 
 	SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd,
 		"Inserting command %p into mlqueue\n", cmd));
@@ -190,26 +188,20 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
 	 * before blk_cleanup_queue() finishes.
 	 */
 	cmd->result = 0;
-	if (q->mq_ops) {
-		/*
-		 * Before a SCSI command is dispatched,
-		 * get_device(&sdev->sdev_gendev) is called and the host,
-		 * target and device busy counters are increased. Since
-		 * requeuing a request causes these actions to be repeated and
-		 * since scsi_device_unbusy() has already been called,
-		 * put_device(&device->sdev_gendev) must still be called. Call
-		 * put_device() after blk_mq_requeue_request() to avoid that
-		 * removal of the SCSI device can start before requeueing has
-		 * happened.
-		 */
-		blk_mq_requeue_request(cmd->request, true);
-		put_device(&device->sdev_gendev);
-		return;
-	}
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_requeue_request(q, cmd->request);
-	kblockd_schedule_work(&device->requeue_work);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/*
+	 * Before a SCSI command is dispatched,
+	 * get_device(&sdev->sdev_gendev) is called and the host,
+	 * target and device busy counters are increased. Since
+	 * requeuing a request causes these actions to be repeated and
+	 * since scsi_device_unbusy() has already been called,
+	 * put_device(&device->sdev_gendev) must still be called. Call
+	 * put_device() after blk_mq_requeue_request() to avoid that
+	 * removal of the SCSI device can start before requeueing has
+	 * happened.
+	 */
+	blk_mq_requeue_request(cmd->request, true);
+	put_device(&device->sdev_gendev);
 }
 
 /*
@@ -370,10 +362,7 @@ void scsi_device_unbusy(struct scsi_device *sdev)
 
 static void scsi_kick_queue(struct request_queue *q)
 {
-	if (q->mq_ops)
-		blk_mq_run_hw_queues(q, false);
-	else
-		blk_run_queue(q);
+	blk_mq_run_hw_queues(q, false);
 }
 
 /*
@@ -534,10 +523,7 @@ static void scsi_run_queue(struct request_queue *q)
 	if (!list_empty(&sdev->host->starved_list))
 		scsi_starved_list_run(sdev->host);
 
-	if (q->mq_ops)
-		blk_mq_run_hw_queues(q, false);
-	else
-		blk_run_queue(q);
+	blk_mq_run_hw_queues(q, false);
 }
 
 void scsi_requeue_run_queue(struct work_struct *work)
@@ -550,42 +536,6 @@ void scsi_requeue_run_queue(struct work_struct *work)
550 scsi_run_queue(q); 536 scsi_run_queue(q);
551} 537}
552 538
553/*
554 * Function: scsi_requeue_command()
555 *
556 * Purpose: Handle post-processing of completed commands.
557 *
558 * Arguments: q - queue to operate on
559 * cmd - command that may need to be requeued.
560 *
561 * Returns: Nothing
562 *
563 * Notes: After command completion, there may be blocks left
564 * over which weren't finished by the previous command
565 * this can be for a number of reasons - the main one is
566 * I/O errors in the middle of the request, in which case
567 * we need to request the blocks that come after the bad
568 * sector.
569 * Notes: Upon return, cmd is a stale pointer.
570 */
571static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd)
572{
573 struct scsi_device *sdev = cmd->device;
574 struct request *req = cmd->request;
575 unsigned long flags;
576
577 spin_lock_irqsave(q->queue_lock, flags);
578 blk_unprep_request(req);
579 req->special = NULL;
580 scsi_put_command(cmd);
581 blk_requeue_request(q, req);
582 spin_unlock_irqrestore(q->queue_lock, flags);
583
584 scsi_run_queue(q);
585
586 put_device(&sdev->sdev_gendev);
587}
588
589void scsi_run_host_queues(struct Scsi_Host *shost) 539void scsi_run_host_queues(struct Scsi_Host *shost)
590{ 540{
591 struct scsi_device *sdev; 541 struct scsi_device *sdev;
@@ -626,42 +576,6 @@ static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
626 scsi_del_cmd_from_list(cmd); 576 scsi_del_cmd_from_list(cmd);
627} 577}
628 578
629/*
630 * Function: scsi_release_buffers()
631 *
632 * Purpose: Free resources allocate for a scsi_command.
633 *
634 * Arguments: cmd - command that we are bailing.
635 *
636 * Lock status: Assumed that no lock is held upon entry.
637 *
638 * Returns: Nothing
639 *
640 * Notes: In the event that an upper level driver rejects a
641 * command, we must release resources allocated during
642 * the __init_io() function. Primarily this would involve
643 * the scatter-gather table.
644 */
645static void scsi_release_buffers(struct scsi_cmnd *cmd)
646{
647 if (cmd->sdb.table.nents)
648 sg_free_table_chained(&cmd->sdb.table, false);
649
650 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
651
652 if (scsi_prot_sg_count(cmd))
653 sg_free_table_chained(&cmd->prot_sdb->table, false);
654}
655
656static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd)
657{
658 struct scsi_data_buffer *bidi_sdb = cmd->request->next_rq->special;
659
660 sg_free_table_chained(&bidi_sdb->table, false);
661 kmem_cache_free(scsi_sdb_cache, bidi_sdb);
662 cmd->request->next_rq->special = NULL;
663}
664
665/* Returns false when no more bytes to process, true if there are more */ 579/* Returns false when no more bytes to process, true if there are more */
666static bool scsi_end_request(struct request *req, blk_status_t error, 580static bool scsi_end_request(struct request *req, blk_status_t error,
667 unsigned int bytes, unsigned int bidi_bytes) 581 unsigned int bytes, unsigned int bidi_bytes)
@@ -687,46 +601,30 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
 		destroy_rcu_head(&cmd->rcu);
 	}
 
-	if (req->mq_ctx) {
-		/*
-		 * In the MQ case the command gets freed by __blk_mq_end_request,
-		 * so we have to do all cleanup that depends on it earlier.
-		 *
-		 * We also can't kick the queues from irq context, so we
-		 * will have to defer it to a workqueue.
-		 */
-		scsi_mq_uninit_cmd(cmd);
-
-		/*
-		 * queue is still alive, so grab the ref for preventing it
-		 * from being cleaned up during running queue.
-		 */
-		percpu_ref_get(&q->q_usage_counter);
-
-		__blk_mq_end_request(req, error);
-
-		if (scsi_target(sdev)->single_lun ||
-		    !list_empty(&sdev->host->starved_list))
-			kblockd_schedule_work(&sdev->requeue_work);
-		else
-			blk_mq_run_hw_queues(q, true);
-
-		percpu_ref_put(&q->q_usage_counter);
-	} else {
-		unsigned long flags;
-
-		if (bidi_bytes)
-			scsi_release_bidi_buffers(cmd);
-		scsi_release_buffers(cmd);
-		scsi_put_command(cmd);
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_finish_request(req, error);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
-		scsi_run_queue(q);
-	}
+	/*
+	 * In the MQ case the command gets freed by __blk_mq_end_request,
+	 * so we have to do all cleanup that depends on it earlier.
+	 *
+	 * We also can't kick the queues from irq context, so we
+	 * will have to defer it to a workqueue.
+	 */
+	scsi_mq_uninit_cmd(cmd);
+
+	/*
+	 * queue is still alive, so grab the ref for preventing it
+	 * from being cleaned up during running queue.
+	 */
+	percpu_ref_get(&q->q_usage_counter);
+
+	__blk_mq_end_request(req, error);
+
+	if (scsi_target(sdev)->single_lun ||
+	    !list_empty(&sdev->host->starved_list))
+		kblockd_schedule_work(&sdev->requeue_work);
+	else
+		blk_mq_run_hw_queues(q, true);
 
+	percpu_ref_put(&q->q_usage_counter);
 	put_device(&sdev->sdev_gendev);
 	return false;
 }
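
After the un-indentation above, scsi_end_request() is blk-mq only: the command is torn down before __blk_mq_end_request() frees the request, and the queue is pinned via its usage counter so that kicking the hardware queues afterwards cannot race with blk_cleanup_queue(). A reduced sketch of that guard, with an illustrative helper name and BLK_STS_OK standing in for the real status; it assumes the caller has already done the blk_update_request()-style accounting, as scsi_end_request() does:

    #include <linux/blk-mq.h>
    #include <linux/percpu-refcount.h>

    /* Illustrative: complete a request, then safely poke the queue afterwards. */
    static void example_end_and_rerun(struct request_queue *q, struct request *req)
    {
        percpu_ref_get(&q->q_usage_counter);    /* pin the queue */

        __blk_mq_end_request(req, BLK_STS_OK);  /* req must not be touched after this */
        blk_mq_run_hw_queues(q, true);          /* safe: q is still pinned */

        percpu_ref_put(&q->q_usage_counter);    /* drop the pin */
    }
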
@@ -774,13 +672,7 @@ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd,
 					  struct request_queue *q)
 {
 	/* A new command will be prepared and issued. */
-	if (q->mq_ops) {
-		scsi_mq_requeue_cmd(cmd);
-	} else {
-		/* Unprep request and put it back at head of the queue. */
-		scsi_release_buffers(cmd);
-		scsi_requeue_command(q, cmd);
-	}
+	scsi_mq_requeue_cmd(cmd);
 }
 
 /* Helper for scsi_io_completion() when special action required. */
@@ -1120,7 +1012,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		scsi_io_completion_action(cmd, result);
 }
 
-static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
+static blk_status_t scsi_init_sgtable(struct request *req,
+		struct scsi_data_buffer *sdb)
 {
 	int count;
 
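
From here on, the prep and init helpers return blk_status_t instead of the legacy BLKPREP_* codes, which is what lets scsi_queue_rq() drop its prep_to_mq() translation later in this patch. A small sketch of the mapping that translation used to perform; the enum is hypothetical, standing in for the old constants:

    #include <linux/blk_types.h>

    /* Hypothetical stand-ins for the legacy BLKPREP_* return values. */
    enum example_prep_ret { EXAMPLE_PREP_OK, EXAMPLE_PREP_DEFER, EXAMPLE_PREP_KILL };

    /* Mirrors the removed prep_to_mq() helper shown further down in this diff. */
    static blk_status_t example_prep_to_status(enum example_prep_ret ret)
    {
        switch (ret) {
        case EXAMPLE_PREP_OK:
            return BLK_STS_OK;
        case EXAMPLE_PREP_DEFER:
            return BLK_STS_RESOURCE;    /* retryable: resource shortage */
        default:
            return BLK_STS_IOERR;       /* fatal: fail the request */
        }
    }
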
@@ -1129,7 +1022,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
 	 */
 	if (unlikely(sg_alloc_table_chained(&sdb->table,
 			blk_rq_nr_phys_segments(req), sdb->table.sgl)))
-		return BLKPREP_DEFER;
+		return BLK_STS_RESOURCE;
 
 	/*
 	 * Next, walk the list, and fill in the addresses and sizes of
@@ -1139,7 +1032,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
 	BUG_ON(count > sdb->table.nents);
 	sdb->table.nents = count;
 	sdb->length = blk_rq_payload_bytes(req);
-	return BLKPREP_OK;
+	return BLK_STS_OK;
 }
 
 /*
@@ -1149,62 +1042,48 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
1149 * 1042 *
1150 * Arguments: cmd - Command descriptor we wish to initialize 1043 * Arguments: cmd - Command descriptor we wish to initialize
1151 * 1044 *
1152 * Returns: 0 on success 1045 * Returns: BLK_STS_OK on success
1153 * BLKPREP_DEFER if the failure is retryable 1046 * BLK_STS_RESOURCE if the failure is retryable
1154 * BLKPREP_KILL if the failure is fatal 1047 * BLK_STS_IOERR if the failure is fatal
1155 */ 1048 */
1156int scsi_init_io(struct scsi_cmnd *cmd) 1049blk_status_t scsi_init_io(struct scsi_cmnd *cmd)
1157{ 1050{
1158 struct scsi_device *sdev = cmd->device;
1159 struct request *rq = cmd->request; 1051 struct request *rq = cmd->request;
1160 bool is_mq = (rq->mq_ctx != NULL); 1052 blk_status_t ret;
1161 int error = BLKPREP_KILL;
1162 1053
1163 if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) 1054 if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
1164 goto err_exit; 1055 return BLK_STS_IOERR;
1165 1056
1166 error = scsi_init_sgtable(rq, &cmd->sdb); 1057 ret = scsi_init_sgtable(rq, &cmd->sdb);
1167 if (error) 1058 if (ret)
1168 goto err_exit; 1059 return ret;
1169 1060
1170 if (blk_bidi_rq(rq)) { 1061 if (blk_bidi_rq(rq)) {
1171 if (!rq->q->mq_ops) { 1062 ret = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
1172 struct scsi_data_buffer *bidi_sdb = 1063 if (ret)
1173 kmem_cache_zalloc(scsi_sdb_cache, GFP_ATOMIC); 1064 goto out_free_sgtables;
1174 if (!bidi_sdb) {
1175 error = BLKPREP_DEFER;
1176 goto err_exit;
1177 }
1178
1179 rq->next_rq->special = bidi_sdb;
1180 }
1181
1182 error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
1183 if (error)
1184 goto err_exit;
1185 } 1065 }
1186 1066
1187 if (blk_integrity_rq(rq)) { 1067 if (blk_integrity_rq(rq)) {
1188 struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; 1068 struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
1189 int ivecs, count; 1069 int ivecs, count;
1190 1070
1191 if (prot_sdb == NULL) { 1071 if (WARN_ON_ONCE(!prot_sdb)) {
1192 /* 1072 /*
1193 * This can happen if someone (e.g. multipath) 1073 * This can happen if someone (e.g. multipath)
1194 * queues a command to a device on an adapter 1074 * queues a command to a device on an adapter
1195 * that does not support DIX. 1075 * that does not support DIX.
1196 */ 1076 */
1197 WARN_ON_ONCE(1); 1077 ret = BLK_STS_IOERR;
1198 error = BLKPREP_KILL; 1078 goto out_free_sgtables;
1199 goto err_exit;
1200 } 1079 }
1201 1080
1202 ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); 1081 ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
1203 1082
1204 if (sg_alloc_table_chained(&prot_sdb->table, ivecs, 1083 if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
1205 prot_sdb->table.sgl)) { 1084 prot_sdb->table.sgl)) {
1206 error = BLKPREP_DEFER; 1085 ret = BLK_STS_RESOURCE;
1207 goto err_exit; 1086 goto out_free_sgtables;
1208 } 1087 }
1209 1088
1210 count = blk_rq_map_integrity_sg(rq->q, rq->bio, 1089 count = blk_rq_map_integrity_sg(rq->q, rq->bio,
@@ -1216,17 +1095,10 @@ int scsi_init_io(struct scsi_cmnd *cmd)
1216 cmd->prot_sdb->table.nents = count; 1095 cmd->prot_sdb->table.nents = count;
1217 } 1096 }
1218 1097
1219 return BLKPREP_OK; 1098 return BLK_STS_OK;
1220err_exit: 1099out_free_sgtables:
1221 if (is_mq) { 1100 scsi_mq_free_sgtables(cmd);
1222 scsi_mq_free_sgtables(cmd); 1101 return ret;
1223 } else {
1224 scsi_release_buffers(cmd);
1225 cmd->request->special = NULL;
1226 scsi_put_command(cmd);
1227 put_device(&sdev->sdev_gendev);
1228 }
1229 return error;
1230} 1102}
1231EXPORT_SYMBOL(scsi_init_io); 1103EXPORT_SYMBOL(scsi_init_io);
1232 1104
@@ -1312,7 +1184,8 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
1312 scsi_add_cmd_to_list(cmd); 1184 scsi_add_cmd_to_list(cmd);
1313} 1185}
1314 1186
1315static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) 1187static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
1188 struct request *req)
1316{ 1189{
1317 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); 1190 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
1318 1191
@@ -1323,8 +1196,8 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
1323 * submit a request without an attached bio. 1196 * submit a request without an attached bio.
1324 */ 1197 */
1325 if (req->bio) { 1198 if (req->bio) {
1326 int ret = scsi_init_io(cmd); 1199 blk_status_t ret = scsi_init_io(cmd);
1327 if (unlikely(ret)) 1200 if (unlikely(ret != BLK_STS_OK))
1328 return ret; 1201 return ret;
1329 } else { 1202 } else {
1330 BUG_ON(blk_rq_bytes(req)); 1203 BUG_ON(blk_rq_bytes(req));
@@ -1336,20 +1209,21 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
1336 cmd->cmnd = scsi_req(req)->cmd; 1209 cmd->cmnd = scsi_req(req)->cmd;
1337 cmd->transfersize = blk_rq_bytes(req); 1210 cmd->transfersize = blk_rq_bytes(req);
1338 cmd->allowed = scsi_req(req)->retries; 1211 cmd->allowed = scsi_req(req)->retries;
1339 return BLKPREP_OK; 1212 return BLK_STS_OK;
1340} 1213}
1341 1214
1342/* 1215/*
1343 * Setup a normal block command. These are simple request from filesystems 1216 * Setup a normal block command. These are simple request from filesystems
1344 * that still need to be translated to SCSI CDBs from the ULD. 1217 * that still need to be translated to SCSI CDBs from the ULD.
1345 */ 1218 */
1346static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) 1219static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev,
1220 struct request *req)
1347{ 1221{
1348 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); 1222 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
1349 1223
1350 if (unlikely(sdev->handler && sdev->handler->prep_fn)) { 1224 if (unlikely(sdev->handler && sdev->handler->prep_fn)) {
1351 int ret = sdev->handler->prep_fn(sdev, req); 1225 blk_status_t ret = sdev->handler->prep_fn(sdev, req);
1352 if (ret != BLKPREP_OK) 1226 if (ret != BLK_STS_OK)
1353 return ret; 1227 return ret;
1354 } 1228 }
1355 1229
@@ -1358,7 +1232,8 @@ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
1358 return scsi_cmd_to_driver(cmd)->init_command(cmd); 1232 return scsi_cmd_to_driver(cmd)->init_command(cmd);
1359} 1233}
1360 1234
1361static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req) 1235static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev,
1236 struct request *req)
1362{ 1237{
1363 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); 1238 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
1364 1239
@@ -1375,129 +1250,48 @@ static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req)
1375 return scsi_setup_fs_cmnd(sdev, req); 1250 return scsi_setup_fs_cmnd(sdev, req);
1376} 1251}
1377 1252
1378static int 1253static blk_status_t
1379scsi_prep_state_check(struct scsi_device *sdev, struct request *req) 1254scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
1380{ 1255{
1381 int ret = BLKPREP_OK; 1256 switch (sdev->sdev_state) {
1382 1257 case SDEV_OFFLINE:
1383 /* 1258 case SDEV_TRANSPORT_OFFLINE:
1384 * If the device is not in running state we will reject some
1385 * or all commands.
1386 */
1387 if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
1388 switch (sdev->sdev_state) {
1389 case SDEV_OFFLINE:
1390 case SDEV_TRANSPORT_OFFLINE:
1391 /*
1392 * If the device is offline we refuse to process any
1393 * commands. The device must be brought online
1394 * before trying any recovery commands.
1395 */
1396 sdev_printk(KERN_ERR, sdev,
1397 "rejecting I/O to offline device\n");
1398 ret = BLKPREP_KILL;
1399 break;
1400 case SDEV_DEL:
1401 /*
1402 * If the device is fully deleted, we refuse to
1403 * process any commands as well.
1404 */
1405 sdev_printk(KERN_ERR, sdev,
1406 "rejecting I/O to dead device\n");
1407 ret = BLKPREP_KILL;
1408 break;
1409 case SDEV_BLOCK:
1410 case SDEV_CREATED_BLOCK:
1411 ret = BLKPREP_DEFER;
1412 break;
1413 case SDEV_QUIESCE:
1414 /*
1415 * If the devices is blocked we defer normal commands.
1416 */
1417 if (req && !(req->rq_flags & RQF_PREEMPT))
1418 ret = BLKPREP_DEFER;
1419 break;
1420 default:
1421 /*
1422 * For any other not fully online state we only allow
1423 * special commands. In particular any user initiated
1424 * command is not allowed.
1425 */
1426 if (req && !(req->rq_flags & RQF_PREEMPT))
1427 ret = BLKPREP_KILL;
1428 break;
1429 }
1430 }
1431 return ret;
1432}
1433
1434static int
1435scsi_prep_return(struct request_queue *q, struct request *req, int ret)
1436{
1437 struct scsi_device *sdev = q->queuedata;
1438
1439 switch (ret) {
1440 case BLKPREP_KILL:
1441 case BLKPREP_INVALID:
1442 scsi_req(req)->result = DID_NO_CONNECT << 16;
1443 /* release the command and kill it */
1444 if (req->special) {
1445 struct scsi_cmnd *cmd = req->special;
1446 scsi_release_buffers(cmd);
1447 scsi_put_command(cmd);
1448 put_device(&sdev->sdev_gendev);
1449 req->special = NULL;
1450 }
1451 break;
1452 case BLKPREP_DEFER:
1453 /* 1259 /*
1454 * If we defer, the blk_peek_request() returns NULL, but the 1260 * If the device is offline we refuse to process any
1455 * queue must be restarted, so we schedule a callback to happen 1261 * commands. The device must be brought online
1456 * shortly. 1262 * before trying any recovery commands.
1457 */ 1263 */
1458 if (atomic_read(&sdev->device_busy) == 0) 1264 sdev_printk(KERN_ERR, sdev,
1459 blk_delay_queue(q, SCSI_QUEUE_DELAY); 1265 "rejecting I/O to offline device\n");
1460 break; 1266 return BLK_STS_IOERR;
1267 case SDEV_DEL:
1268 /*
1269 * If the device is fully deleted, we refuse to
1270 * process any commands as well.
1271 */
1272 sdev_printk(KERN_ERR, sdev,
1273 "rejecting I/O to dead device\n");
1274 return BLK_STS_IOERR;
1275 case SDEV_BLOCK:
1276 case SDEV_CREATED_BLOCK:
1277 return BLK_STS_RESOURCE;
1278 case SDEV_QUIESCE:
1279 /*
1280 * If the devices is blocked we defer normal commands.
1281 */
1282 if (req && !(req->rq_flags & RQF_PREEMPT))
1283 return BLK_STS_RESOURCE;
1284 return BLK_STS_OK;
1461 default: 1285 default:
1462 req->rq_flags |= RQF_DONTPREP; 1286 /*
1463 } 1287 * For any other not fully online state we only allow
1464 1288 * special commands. In particular any user initiated
1465 return ret; 1289 * command is not allowed.
1466} 1290 */
1467 1291 if (req && !(req->rq_flags & RQF_PREEMPT))
1468static int scsi_prep_fn(struct request_queue *q, struct request *req) 1292 return BLK_STS_IOERR;
1469{ 1293 return BLK_STS_OK;
1470 struct scsi_device *sdev = q->queuedata;
1471 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
1472 int ret;
1473
1474 ret = scsi_prep_state_check(sdev, req);
1475 if (ret != BLKPREP_OK)
1476 goto out;
1477
1478 if (!req->special) {
1479 /* Bail if we can't get a reference to the device */
1480 if (unlikely(!get_device(&sdev->sdev_gendev))) {
1481 ret = BLKPREP_DEFER;
1482 goto out;
1483 }
1484
1485 scsi_init_command(sdev, cmd);
1486 req->special = cmd;
1487 } 1294 }
1488
1489 cmd->tag = req->tag;
1490 cmd->request = req;
1491 cmd->prot_op = SCSI_PROT_NORMAL;
1492
1493 ret = scsi_setup_cmnd(sdev, req);
1494out:
1495 return scsi_prep_return(q, req, ret);
1496}
1497
1498static void scsi_unprep_fn(struct request_queue *q, struct request *req)
1499{
1500 scsi_uninit_cmd(blk_mq_rq_to_pdu(req));
1501} 1295}
1502 1296
1503/* 1297/*
@@ -1519,14 +1313,8 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
1519 /* 1313 /*
1520 * unblock after device_blocked iterates to zero 1314 * unblock after device_blocked iterates to zero
1521 */ 1315 */
1522 if (atomic_dec_return(&sdev->device_blocked) > 0) { 1316 if (atomic_dec_return(&sdev->device_blocked) > 0)
1523 /*
1524 * For the MQ case we take care of this in the caller.
1525 */
1526 if (!q->mq_ops)
1527 blk_delay_queue(q, SCSI_QUEUE_DELAY);
1528 goto out_dec; 1317 goto out_dec;
1529 }
1530 SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev, 1318 SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
1531 "unblocking device at zero depth\n")); 1319 "unblocking device at zero depth\n"));
1532 } 1320 }
@@ -1661,13 +1449,13 @@ out_dec:
1661 * needs to return 'not busy'. Otherwise, request stacking drivers 1449 * needs to return 'not busy'. Otherwise, request stacking drivers
1662 * may hold requests forever. 1450 * may hold requests forever.
1663 */ 1451 */
1664static int scsi_lld_busy(struct request_queue *q) 1452static bool scsi_mq_lld_busy(struct request_queue *q)
1665{ 1453{
1666 struct scsi_device *sdev = q->queuedata; 1454 struct scsi_device *sdev = q->queuedata;
1667 struct Scsi_Host *shost; 1455 struct Scsi_Host *shost;
1668 1456
1669 if (blk_queue_dying(q)) 1457 if (blk_queue_dying(q))
1670 return 0; 1458 return false;
1671 1459
1672 shost = sdev->host; 1460 shost = sdev->host;
1673 1461
@@ -1678,43 +1466,9 @@ static int scsi_lld_busy(struct request_queue *q)
1678 * in SCSI layer. 1466 * in SCSI layer.
1679 */ 1467 */
1680 if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev)) 1468 if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev))
1681 return 1; 1469 return true;
1682
1683 return 0;
1684}
1685
1686/*
1687 * Kill a request for a dead device
1688 */
1689static void scsi_kill_request(struct request *req, struct request_queue *q)
1690{
1691 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
1692 struct scsi_device *sdev;
1693 struct scsi_target *starget;
1694 struct Scsi_Host *shost;
1695
1696 blk_start_request(req);
1697
1698 scmd_printk(KERN_INFO, cmd, "killing request\n");
1699
1700 sdev = cmd->device;
1701 starget = scsi_target(sdev);
1702 shost = sdev->host;
1703 scsi_init_cmd_errh(cmd);
1704 cmd->result = DID_NO_CONNECT << 16;
1705 atomic_inc(&cmd->device->iorequest_cnt);
1706
1707 /*
1708 * SCSI request completion path will do scsi_device_unbusy(),
1709 * bump busy counts. To bump the counters, we need to dance
1710 * with the locks as normal issue path does.
1711 */
1712 atomic_inc(&sdev->device_busy);
1713 atomic_inc(&shost->host_busy);
1714 if (starget->can_queue > 0)
1715 atomic_inc(&starget->target_busy);
1716 1470
1717 blk_complete_request(req); 1471 return false;
1718} 1472}
1719 1473
1720static void scsi_softirq_done(struct request *rq) 1474static void scsi_softirq_done(struct request *rq)
@@ -1837,170 +1591,6 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
1837 return 0; 1591 return 0;
1838} 1592}
1839 1593
1840/**
1841 * scsi_done - Invoke completion on finished SCSI command.
1842 * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
1843 * ownership back to SCSI Core -- i.e. the LLDD has finished with it.
1844 *
1845 * Description: This function is the mid-level's (SCSI Core) interrupt routine,
1846 * which regains ownership of the SCSI command (de facto) from a LLDD, and
1847 * calls blk_complete_request() for further processing.
1848 *
1849 * This function is interrupt context safe.
1850 */
1851static void scsi_done(struct scsi_cmnd *cmd)
1852{
1853 trace_scsi_dispatch_cmd_done(cmd);
1854 blk_complete_request(cmd->request);
1855}
1856
1857/*
1858 * Function: scsi_request_fn()
1859 *
1860 * Purpose: Main strategy routine for SCSI.
1861 *
1862 * Arguments: q - Pointer to actual queue.
1863 *
1864 * Returns: Nothing
1865 *
1866 * Lock status: request queue lock assumed to be held when called.
1867 *
1868 * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order
1869 * protection for ZBC disks.
1870 */
1871static void scsi_request_fn(struct request_queue *q)
1872 __releases(q->queue_lock)
1873 __acquires(q->queue_lock)
1874{
1875 struct scsi_device *sdev = q->queuedata;
1876 struct Scsi_Host *shost;
1877 struct scsi_cmnd *cmd;
1878 struct request *req;
1879
1880 /*
1881 * To start with, we keep looping until the queue is empty, or until
1882 * the host is no longer able to accept any more requests.
1883 */
1884 shost = sdev->host;
1885 for (;;) {
1886 int rtn;
1887 /*
1888 * get next queueable request. We do this early to make sure
1889 * that the request is fully prepared even if we cannot
1890 * accept it.
1891 */
1892 req = blk_peek_request(q);
1893 if (!req)
1894 break;
1895
1896 if (unlikely(!scsi_device_online(sdev))) {
1897 sdev_printk(KERN_ERR, sdev,
1898 "rejecting I/O to offline device\n");
1899 scsi_kill_request(req, q);
1900 continue;
1901 }
1902
1903 if (!scsi_dev_queue_ready(q, sdev))
1904 break;
1905
1906 /*
1907 * Remove the request from the request list.
1908 */
1909 if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
1910 blk_start_request(req);
1911
1912 spin_unlock_irq(q->queue_lock);
1913 cmd = blk_mq_rq_to_pdu(req);
1914 if (cmd != req->special) {
1915 printk(KERN_CRIT "impossible request in %s.\n"
1916 "please mail a stack trace to "
1917 "linux-scsi@vger.kernel.org\n",
1918 __func__);
1919 blk_dump_rq_flags(req, "foo");
1920 BUG();
1921 }
1922
1923 /*
1924 * We hit this when the driver is using a host wide
1925 * tag map. For device level tag maps the queue_depth check
1926 * in the device ready fn would prevent us from trying
1927 * to allocate a tag. Since the map is a shared host resource
1928 * we add the dev to the starved list so it eventually gets
1929 * a run when a tag is freed.
1930 */
1931 if (blk_queue_tagged(q) && !(req->rq_flags & RQF_QUEUED)) {
1932 spin_lock_irq(shost->host_lock);
1933 if (list_empty(&sdev->starved_entry))
1934 list_add_tail(&sdev->starved_entry,
1935 &shost->starved_list);
1936 spin_unlock_irq(shost->host_lock);
1937 goto not_ready;
1938 }
1939
1940 if (!scsi_target_queue_ready(shost, sdev))
1941 goto not_ready;
1942
1943 if (!scsi_host_queue_ready(q, shost, sdev))
1944 goto host_not_ready;
1945
1946 if (sdev->simple_tags)
1947 cmd->flags |= SCMD_TAGGED;
1948 else
1949 cmd->flags &= ~SCMD_TAGGED;
1950
1951 /*
1952 * Finally, initialize any error handling parameters, and set up
1953 * the timers for timeouts.
1954 */
1955 scsi_init_cmd_errh(cmd);
1956
1957 /*
1958 * Dispatch the command to the low-level driver.
1959 */
1960 cmd->scsi_done = scsi_done;
1961 rtn = scsi_dispatch_cmd(cmd);
1962 if (rtn) {
1963 scsi_queue_insert(cmd, rtn);
1964 spin_lock_irq(q->queue_lock);
1965 goto out_delay;
1966 }
1967 spin_lock_irq(q->queue_lock);
1968 }
1969
1970 return;
1971
1972 host_not_ready:
1973 if (scsi_target(sdev)->can_queue > 0)
1974 atomic_dec(&scsi_target(sdev)->target_busy);
1975 not_ready:
1976 /*
1977 * lock q, handle tag, requeue req, and decrement device_busy. We
1978 * must return with queue_lock held.
1979 *
1980 * Decrementing device_busy without checking it is OK, as all such
1981 * cases (host limits or settings) should run the queue at some
1982 * later time.
1983 */
1984 spin_lock_irq(q->queue_lock);
1985 blk_requeue_request(q, req);
1986 atomic_dec(&sdev->device_busy);
1987out_delay:
1988 if (!atomic_read(&sdev->device_busy) && !scsi_device_blocked(sdev))
1989 blk_delay_queue(q, SCSI_QUEUE_DELAY);
1990}
1991
1992static inline blk_status_t prep_to_mq(int ret)
1993{
1994 switch (ret) {
1995 case BLKPREP_OK:
1996 return BLK_STS_OK;
1997 case BLKPREP_DEFER:
1998 return BLK_STS_RESOURCE;
1999 default:
2000 return BLK_STS_IOERR;
2001 }
2002}
2003
2004/* Size in bytes of the sg-list stored in the scsi-mq command-private data. */ 1594/* Size in bytes of the sg-list stored in the scsi-mq command-private data. */
2005static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost) 1595static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost)
2006{ 1596{
@@ -2008,7 +1598,7 @@ static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost)
 		sizeof(struct scatterlist);
 }
 
-static int scsi_mq_prep_fn(struct request *req)
+static blk_status_t scsi_mq_prep_fn(struct request *req)
 {
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 	struct scsi_device *sdev = req->q->queuedata;
@@ -2052,8 +1642,18 @@ static int scsi_mq_prep_fn(struct request *req)
 
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
+	if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
+		return;
 	trace_scsi_dispatch_cmd_done(cmd);
-	blk_mq_complete_request(cmd->request);
+
+	/*
+	 * If the block layer didn't complete the request due to a timeout
+	 * injection, scsi must clear its internal completed state so that the
+	 * timeout handler will see it needs to escalate its own error
+	 * recovery.
+	 */
+	if (unlikely(!blk_mq_complete_request(cmd->request)))
+		clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
 }
 
 static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
@@ -2096,9 +1696,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_status_t ret;
 	int reason;
 
-	ret = prep_to_mq(scsi_prep_state_check(sdev, req));
-	if (ret != BLK_STS_OK)
-		goto out_put_budget;
+	/*
+	 * If the device is not in running state we will reject some or all
+	 * commands.
+	 */
+	if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
+		ret = scsi_prep_state_check(sdev, req);
+		if (ret != BLK_STS_OK)
+			goto out_put_budget;
+	}
 
 	ret = BLK_STS_RESOURCE;
 	if (!scsi_target_queue_ready(shost, sdev))
@@ -2106,8 +1712,9 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!scsi_host_queue_ready(q, shost, sdev))
 		goto out_dec_target_busy;
 
+	clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
 	if (!(req->rq_flags & RQF_DONTPREP)) {
-		ret = prep_to_mq(scsi_mq_prep_fn(req));
+		ret = scsi_mq_prep_fn(req);
 		if (ret != BLK_STS_OK)
 			goto out_dec_host_busy;
 		req->rq_flags |= RQF_DONTPREP;
@@ -2208,7 +1815,7 @@ static int scsi_map_queues(struct blk_mq_tag_set *set)
 
 	if (shost->hostt->map_queues)
 		return shost->hostt->map_queues(shost);
-	return blk_mq_map_queues(set);
+	return blk_mq_map_queues(&set->map[0]);
 }
 
 void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
@@ -2251,77 +1858,6 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
2251} 1858}
2252EXPORT_SYMBOL_GPL(__scsi_init_queue); 1859EXPORT_SYMBOL_GPL(__scsi_init_queue);
2253 1860
2254static int scsi_old_init_rq(struct request_queue *q, struct request *rq,
2255 gfp_t gfp)
2256{
2257 struct Scsi_Host *shost = q->rq_alloc_data;
2258 const bool unchecked_isa_dma = shost->unchecked_isa_dma;
2259 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
2260
2261 memset(cmd, 0, sizeof(*cmd));
2262
2263 if (unchecked_isa_dma)
2264 cmd->flags |= SCMD_UNCHECKED_ISA_DMA;
2265 cmd->sense_buffer = scsi_alloc_sense_buffer(unchecked_isa_dma, gfp,
2266 NUMA_NO_NODE);
2267 if (!cmd->sense_buffer)
2268 goto fail;
2269 cmd->req.sense = cmd->sense_buffer;
2270
2271 if (scsi_host_get_prot(shost) >= SHOST_DIX_TYPE0_PROTECTION) {
2272 cmd->prot_sdb = kmem_cache_zalloc(scsi_sdb_cache, gfp);
2273 if (!cmd->prot_sdb)
2274 goto fail_free_sense;
2275 }
2276
2277 return 0;
2278
2279fail_free_sense:
2280 scsi_free_sense_buffer(unchecked_isa_dma, cmd->sense_buffer);
2281fail:
2282 return -ENOMEM;
2283}
2284
2285static void scsi_old_exit_rq(struct request_queue *q, struct request *rq)
2286{
2287 struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
2288
2289 if (cmd->prot_sdb)
2290 kmem_cache_free(scsi_sdb_cache, cmd->prot_sdb);
2291 scsi_free_sense_buffer(cmd->flags & SCMD_UNCHECKED_ISA_DMA,
2292 cmd->sense_buffer);
2293}
2294
2295struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev)
2296{
2297 struct Scsi_Host *shost = sdev->host;
2298 struct request_queue *q;
2299
2300 q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
2301 if (!q)
2302 return NULL;
2303 q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
2304 q->rq_alloc_data = shost;
2305 q->request_fn = scsi_request_fn;
2306 q->init_rq_fn = scsi_old_init_rq;
2307 q->exit_rq_fn = scsi_old_exit_rq;
2308 q->initialize_rq_fn = scsi_initialize_rq;
2309
2310 if (blk_init_allocated_queue(q) < 0) {
2311 blk_cleanup_queue(q);
2312 return NULL;
2313 }
2314
2315 __scsi_init_queue(shost, q);
2316 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
2317 blk_queue_prep_rq(q, scsi_prep_fn);
2318 blk_queue_unprep_rq(q, scsi_unprep_fn);
2319 blk_queue_softirq_done(q, scsi_softirq_done);
2320 blk_queue_rq_timed_out(q, scsi_times_out);
2321 blk_queue_lld_busy(q, scsi_lld_busy);
2322 return q;
2323}
2324
2325static const struct blk_mq_ops scsi_mq_ops = { 1861static const struct blk_mq_ops scsi_mq_ops = {
2326 .get_budget = scsi_mq_get_budget, 1862 .get_budget = scsi_mq_get_budget,
2327 .put_budget = scsi_mq_put_budget, 1863 .put_budget = scsi_mq_put_budget,
@@ -2334,6 +1870,7 @@ static const struct blk_mq_ops scsi_mq_ops = {
 	.init_request = scsi_mq_init_request,
 	.exit_request = scsi_mq_exit_request,
 	.initialize_rq_fn = scsi_initialize_rq,
+	.busy = scsi_mq_lld_busy,
 	.map_queues = scsi_map_queues,
 };
 
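
The new .busy entry replaces the legacy blk_queue_lld_busy() hook: request stacking drivers such as dm-multipath can ask blk-mq whether the underlying LLD is congested. A hedged sketch of wiring such a callback into a driver's blk_mq_ops; everything named example_* is hypothetical, and the queue_rq stub behaves like a null device only so the ops table is complete:

    #include <linux/blk-mq.h>

    struct example_dev {
        bool in_recovery;               /* hypothetical driver state */
    };

    static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
                                         const struct blk_mq_queue_data *bd)
    {
        blk_mq_start_request(bd->rq);
        blk_mq_end_request(bd->rq, BLK_STS_OK); /* stub: complete immediately */
        return BLK_STS_OK;
    }

    /* .busy: report congestion so stacked drivers can pick another path. */
    static bool example_busy(struct request_queue *q)
    {
        struct example_dev *dev = q->queuedata;

        return READ_ONCE(dev->in_recovery);
    }

    static const struct blk_mq_ops example_mq_ops = {
        .queue_rq = example_queue_rq,
        .busy     = example_busy,
    };
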
@@ -2388,10 +1925,7 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q)
 {
 	struct scsi_device *sdev = NULL;
 
-	if (q->mq_ops) {
-		if (q->mq_ops == &scsi_mq_ops)
-			sdev = q->queuedata;
-	} else if (q->request_fn == scsi_request_fn)
+	if (q->mq_ops == &scsi_mq_ops)
 		sdev = q->queuedata;
 	if (!sdev || !get_device(&sdev->sdev_gendev))
 		sdev = NULL;
@@ -2995,39 +2529,6 @@ void sdev_evt_send_simple(struct scsi_device *sdev,
2995EXPORT_SYMBOL_GPL(sdev_evt_send_simple); 2529EXPORT_SYMBOL_GPL(sdev_evt_send_simple);
2996 2530
2997/** 2531/**
2998 * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn()
2999 * @sdev: SCSI device to count the number of scsi_request_fn() callers for.
3000 */
3001static int scsi_request_fn_active(struct scsi_device *sdev)
3002{
3003 struct request_queue *q = sdev->request_queue;
3004 int request_fn_active;
3005
3006 WARN_ON_ONCE(sdev->host->use_blk_mq);
3007
3008 spin_lock_irq(q->queue_lock);
3009 request_fn_active = q->request_fn_active;
3010 spin_unlock_irq(q->queue_lock);
3011
3012 return request_fn_active;
3013}
3014
3015/**
3016 * scsi_wait_for_queuecommand() - wait for ongoing queuecommand() calls
3017 * @sdev: SCSI device pointer.
3018 *
3019 * Wait until the ongoing shost->hostt->queuecommand() calls that are
3020 * invoked from scsi_request_fn() have finished.
3021 */
3022static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
3023{
3024 WARN_ON_ONCE(sdev->host->use_blk_mq);
3025
3026 while (scsi_request_fn_active(sdev))
3027 msleep(20);
3028}
3029
3030/**
3031 * scsi_device_quiesce - Block user issued commands. 2532 * scsi_device_quiesce - Block user issued commands.
3032 * @sdev: scsi device to quiesce. 2533 * @sdev: scsi device to quiesce.
3033 * 2534 *
@@ -3150,7 +2651,6 @@ EXPORT_SYMBOL(scsi_target_resume);
 int scsi_internal_device_block_nowait(struct scsi_device *sdev)
 {
 	struct request_queue *q = sdev->request_queue;
-	unsigned long flags;
 	int err = 0;
 
 	err = scsi_device_set_state(sdev, SDEV_BLOCK);
@@ -3166,14 +2666,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev)
 	 * block layer from calling the midlayer with this device's
 	 * request queue.
 	 */
-	if (q->mq_ops) {
-		blk_mq_quiesce_queue_nowait(q);
-	} else {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_stop_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
-
+	blk_mq_quiesce_queue_nowait(q);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
@@ -3204,12 +2697,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev)
 
 	mutex_lock(&sdev->state_mutex);
 	err = scsi_internal_device_block_nowait(sdev);
-	if (err == 0) {
-		if (q->mq_ops)
-			blk_mq_quiesce_queue(q);
-		else
-			scsi_wait_for_queuecommand(sdev);
-	}
+	if (err == 0)
+		blk_mq_quiesce_queue(q);
 	mutex_unlock(&sdev->state_mutex);
 
 	return err;
@@ -3218,15 +2707,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev)
 void scsi_start_queue(struct scsi_device *sdev)
 {
 	struct request_queue *q = sdev->request_queue;
-	unsigned long flags;
 
-	if (q->mq_ops) {
-		blk_mq_unquiesce_queue(q);
-	} else {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_start_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
+	blk_mq_unquiesce_queue(q);
 }
 
 /**
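
With the legacy branches gone, blocking and resuming a SCSI device reduces to the blk-mq quiesce API. A condensed sketch of the pattern the three functions above now share (illustrative helper names):

    #include <linux/blk-mq.h>

    /* Stop blk-mq from invoking ->queue_rq() for this queue. */
    static void example_block_queue(struct request_queue *q, bool wait)
    {
        if (wait)
            blk_mq_quiesce_queue(q);        /* also waits for running dispatches */
        else
            blk_mq_quiesce_queue_nowait(q); /* just mark the queue quiesced */
    }

    /* Allow dispatching again; blk-mq re-runs the hardware queues. */
    static void example_unblock_queue(struct request_queue *q)
    {
        blk_mq_unquiesce_queue(q);
    }
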
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 99f1db5e467e..5f21547b2ad2 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -92,7 +92,6 @@ extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason);
 extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
 extern void scsi_run_host_queues(struct Scsi_Host *shost);
 extern void scsi_requeue_run_queue(struct work_struct *work);
-extern struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev);
 extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
 extern void scsi_start_queue(struct scsi_device *sdev);
 extern int scsi_mq_setup_tags(struct Scsi_Host *shost);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 78ca63dfba4a..dd0d516f65e2 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -266,10 +266,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	 */
 	sdev->borken = 1;
 
-	if (shost_use_blk_mq(shost))
-		sdev->request_queue = scsi_mq_alloc_queue(sdev);
-	else
-		sdev->request_queue = scsi_old_alloc_queue(sdev);
+	sdev->request_queue = scsi_mq_alloc_queue(sdev);
 	if (!sdev->request_queue) {
 		/* release fn is set up in scsi_sysfs_device_initialise, so
 		 * have to free and put manually here */
@@ -280,11 +277,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
 	sdev->request_queue->queuedata = sdev;
 
-	if (!shost_use_blk_mq(sdev->host)) {
-		blk_queue_init_tags(sdev->request_queue,
-				    sdev->host->cmd_per_lun, shost->bqt,
-				    shost->hostt->tag_alloc_policy);
-	}
 	scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun ?
 				sdev->host->cmd_per_lun : 1);
 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 3aee9464a7bf..6a9040faed00 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -367,7 +367,6 @@ store_shost_eh_deadline(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline);
 
-shost_rd_attr(use_blk_mq, "%d\n");
 shost_rd_attr(unique_id, "%u\n");
 shost_rd_attr(cmd_per_lun, "%hd\n");
 shost_rd_attr(can_queue, "%hd\n");
@@ -386,6 +385,13 @@ show_host_busy(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL);
 
+static ssize_t
+show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "1\n");
+}
+static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL);
+
 static struct attribute *scsi_sysfs_shost_attrs[] = {
 	&dev_attr_use_blk_mq.attr,
 	&dev_attr_unique_id.attr,
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 381668fa135d..d7035270d274 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req)
 
 	/* the blk_end_sync_io() doesn't check the error */
 	if (inflight)
-		__blk_complete_request(req);
+		blk_mq_end_request(req, BLK_STS_IOERR);
 	return BLK_EH_DONE;
 }
 
@@ -3684,14 +3684,9 @@ static void
 fc_bsg_goose_queue(struct fc_rport *rport)
 {
 	struct request_queue *q = rport->rqst_q;
-	unsigned long flags;
-
-	if (!q)
-		return;
 
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_run_queue_async(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	if (q)
+		blk_mq_run_hw_queues(q, true);
 }
 
 /**
@@ -3759,6 +3754,37 @@ static int fc_bsg_dispatch(struct bsg_job *job)
3759 return fc_bsg_host_dispatch(shost, job); 3754 return fc_bsg_host_dispatch(shost, job);
3760} 3755}
3761 3756
3757static blk_status_t fc_bsg_rport_prep(struct fc_rport *rport)
3758{
3759 if (rport->port_state == FC_PORTSTATE_BLOCKED &&
3760 !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
3761 return BLK_STS_RESOURCE;
3762
3763 if (rport->port_state != FC_PORTSTATE_ONLINE)
3764 return BLK_STS_IOERR;
3765
3766 return BLK_STS_OK;
3767}
3768
3769
3770static int fc_bsg_dispatch_prep(struct bsg_job *job)
3771{
3772 struct fc_rport *rport = fc_bsg_to_rport(job);
3773 blk_status_t ret;
3774
3775 ret = fc_bsg_rport_prep(rport);
3776 switch (ret) {
3777 case BLK_STS_OK:
3778 break;
3779 case BLK_STS_RESOURCE:
3780 return -EAGAIN;
3781 default:
3782 return -EIO;
3783 }
3784
3785 return fc_bsg_dispatch(job);
3786}
3787
3762/** 3788/**
3763 * fc_bsg_hostadd - Create and add the bsg hooks so we can receive requests 3789 * fc_bsg_hostadd - Create and add the bsg hooks so we can receive requests
3764 * @shost: shost for fc_host 3790 * @shost: shost for fc_host
@@ -3780,7 +3806,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	snprintf(bsg_name, sizeof(bsg_name),
 		 "fc_host%d", shost->host_no);
 
-	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size);
+	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, fc_bsg_job_timeout,
+			    i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev,
 		    "fc_host%d: bsg interface failed to initialize - setup queue\n",
@@ -3788,26 +3815,11 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
3788 return PTR_ERR(q); 3815 return PTR_ERR(q);
3789 } 3816 }
3790 __scsi_init_queue(shost, q); 3817 __scsi_init_queue(shost, q);
3791 blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
3792 blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT); 3818 blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT);
3793 fc_host->rqst_q = q; 3819 fc_host->rqst_q = q;
3794 return 0; 3820 return 0;
3795} 3821}
3796 3822
3797static int fc_bsg_rport_prep(struct request_queue *q, struct request *req)
3798{
3799 struct fc_rport *rport = dev_to_rport(q->queuedata);
3800
3801 if (rport->port_state == FC_PORTSTATE_BLOCKED &&
3802 !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
3803 return BLKPREP_DEFER;
3804
3805 if (rport->port_state != FC_PORTSTATE_ONLINE)
3806 return BLKPREP_KILL;
3807
3808 return BLKPREP_OK;
3809}
3810
3811/** 3823/**
3812 * fc_bsg_rportadd - Create and add the bsg hooks so we can receive requests 3824 * fc_bsg_rportadd - Create and add the bsg hooks so we can receive requests
3813 * @shost: shost that rport is attached to 3825 * @shost: shost that rport is attached to
@@ -3825,15 +3837,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 	if (!i->f->bsg_request)
 		return -ENOTSUPP;
 
-	q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch,
-			i->f->dd_bsg_size);
+	q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch_prep,
+			fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev, "failed to setup bsg queue\n");
 		return PTR_ERR(q);
 	}
 	__scsi_init_queue(shost, q);
-	blk_queue_prep_rq(q, fc_bsg_rport_prep);
-	blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 	rport->rqst_q = q;
 	return 0;
@@ -3852,10 +3862,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
3852static void 3862static void
3853fc_bsg_remove(struct request_queue *q) 3863fc_bsg_remove(struct request_queue *q)
3854{ 3864{
3855 if (q) { 3865 bsg_remove_queue(q);
3856 bsg_unregister_queue(q);
3857 blk_cleanup_queue(q);
3858 }
3859} 3866}
3860 3867
3861 3868
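The scsi_transport_fc.c hunks above fold the request-timeout handler into bsg_setup_queue() itself (the separate blk_queue_rq_timed_out() call goes away) and convert the rport prep hook from BLKPREP_* to blk_status_t, with fc_bsg_dispatch_prep() translating the status back to an errno. A minimal caller-side sketch of the new setup signature, for a hypothetical transport; my_bsg_dispatch, my_bsg_add and struct my_priv are placeholders, not kernel symbols:

#include <linux/blkdev.h>
#include <linux/bsg-lib.h>

struct my_priv {
        struct request_queue *bsg_q;
};

/* Placeholder dispatch callback; a real one hands the job to the
 * hardware and later completes it with bsg_job_done(). */
static int my_bsg_dispatch(struct bsg_job *job)
{
        return 0;
}

static int my_bsg_add(struct device *dev, struct my_priv *priv)
{
        struct request_queue *q;

        /* the timeout handler is now the fourth argument; NULL keeps
         * the block layer default, fc passes fc_bsg_job_timeout here */
        q = bsg_setup_queue(dev, dev_name(dev), my_bsg_dispatch, NULL, 0);
        if (IS_ERR(q))
                return PTR_ERR(q);

        blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
        priv->bsg_q = q;
        return 0;
}

The BLK_STS_RESOURCE/-EAGAIN and BLK_STS_IOERR/-EIO mapping in fc_bsg_dispatch_prep() above is what preserves the old defer/kill semantics of the rport state check under the new return type.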
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 6fd2fe210fc3..ff123023e5a5 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
1542 return -ENOTSUPP; 1542 return -ENOTSUPP;
1543 1543
1544 snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no); 1544 snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
1545 q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0); 1545 q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, NULL, 0);
1546 if (IS_ERR(q)) { 1546 if (IS_ERR(q)) {
1547 shost_printk(KERN_ERR, shost, "bsg interface failed to " 1547 shost_printk(KERN_ERR, shost, "bsg interface failed to "
1548 "initialize - no request queue\n"); 1548 "initialize - no request queue\n");
@@ -1576,10 +1576,7 @@ static int iscsi_remove_host(struct transport_container *tc,
1576 struct Scsi_Host *shost = dev_to_shost(dev); 1576 struct Scsi_Host *shost = dev_to_shost(dev);
1577 struct iscsi_cls_host *ihost = shost->shost_data; 1577 struct iscsi_cls_host *ihost = shost->shost_data;
1578 1578
1579 if (ihost->bsg_q) { 1579 bsg_remove_queue(ihost->bsg_q);
1580 bsg_unregister_queue(ihost->bsg_q);
1581 blk_cleanup_queue(ihost->bsg_q);
1582 }
1583 return 0; 1580 return 0;
1584} 1581}
1585 1582
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 0a165b2b3e81..692b46937e52 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -198,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
198 198
199 if (rphy) { 199 if (rphy) {
200 q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), 200 q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev),
201 sas_smp_dispatch, 0); 201 sas_smp_dispatch, NULL, 0);
202 if (IS_ERR(q)) 202 if (IS_ERR(q))
203 return PTR_ERR(q); 203 return PTR_ERR(q);
204 rphy->q = q; 204 rphy->q = q;
@@ -207,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
207 207
208 snprintf(name, sizeof(name), "sas_host%d", shost->host_no); 208 snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
209 q = bsg_setup_queue(&shost->shost_gendev, name, 209 q = bsg_setup_queue(&shost->shost_gendev, name,
210 sas_smp_dispatch, 0); 210 sas_smp_dispatch, NULL, 0);
211 if (IS_ERR(q)) 211 if (IS_ERR(q))
212 return PTR_ERR(q); 212 return PTR_ERR(q);
213 to_sas_host_attrs(shost)->q = q; 213 to_sas_host_attrs(shost)->q = q;
@@ -246,11 +246,7 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev,
246 struct Scsi_Host *shost = dev_to_shost(dev); 246 struct Scsi_Host *shost = dev_to_shost(dev);
247 struct request_queue *q = to_sas_host_attrs(shost)->q; 247 struct request_queue *q = to_sas_host_attrs(shost)->q;
248 248
249 if (q) { 249 bsg_remove_queue(q);
250 bsg_unregister_queue(q);
251 blk_cleanup_queue(q);
252 }
253
254 return 0; 250 return 0;
255} 251}
256 252
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index bd0a5c694a97..a1a44f52e0e8 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -114,7 +114,7 @@ static int sd_suspend_system(struct device *);
114static int sd_suspend_runtime(struct device *); 114static int sd_suspend_runtime(struct device *);
115static int sd_resume(struct device *); 115static int sd_resume(struct device *);
116static void sd_rescan(struct device *); 116static void sd_rescan(struct device *);
117static int sd_init_command(struct scsi_cmnd *SCpnt); 117static blk_status_t sd_init_command(struct scsi_cmnd *SCpnt);
118static void sd_uninit_command(struct scsi_cmnd *SCpnt); 118static void sd_uninit_command(struct scsi_cmnd *SCpnt);
119static int sd_done(struct scsi_cmnd *); 119static int sd_done(struct scsi_cmnd *);
120static void sd_eh_reset(struct scsi_cmnd *); 120static void sd_eh_reset(struct scsi_cmnd *);
@@ -751,7 +751,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
751 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 751 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
752} 752}
753 753
754static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) 754static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
755{ 755{
756 struct scsi_device *sdp = cmd->device; 756 struct scsi_device *sdp = cmd->device;
757 struct request *rq = cmd->request; 757 struct request *rq = cmd->request;
@@ -762,7 +762,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
762 762
763 rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); 763 rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
764 if (!rq->special_vec.bv_page) 764 if (!rq->special_vec.bv_page)
765 return BLKPREP_DEFER; 765 return BLK_STS_RESOURCE;
766 clear_highpage(rq->special_vec.bv_page); 766 clear_highpage(rq->special_vec.bv_page);
767 rq->special_vec.bv_offset = 0; 767 rq->special_vec.bv_offset = 0;
768 rq->special_vec.bv_len = data_len; 768 rq->special_vec.bv_len = data_len;
@@ -786,7 +786,8 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
786 return scsi_init_io(cmd); 786 return scsi_init_io(cmd);
787} 787}
788 788
789static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) 789static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd,
790 bool unmap)
790{ 791{
791 struct scsi_device *sdp = cmd->device; 792 struct scsi_device *sdp = cmd->device;
792 struct request *rq = cmd->request; 793 struct request *rq = cmd->request;
@@ -796,7 +797,7 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
796 797
797 rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); 798 rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
798 if (!rq->special_vec.bv_page) 799 if (!rq->special_vec.bv_page)
799 return BLKPREP_DEFER; 800 return BLK_STS_RESOURCE;
800 clear_highpage(rq->special_vec.bv_page); 801 clear_highpage(rq->special_vec.bv_page);
801 rq->special_vec.bv_offset = 0; 802 rq->special_vec.bv_offset = 0;
802 rq->special_vec.bv_len = data_len; 803 rq->special_vec.bv_len = data_len;
@@ -817,7 +818,8 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
817 return scsi_init_io(cmd); 818 return scsi_init_io(cmd);
818} 819}
819 820
820static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) 821static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd,
822 bool unmap)
821{ 823{
822 struct scsi_device *sdp = cmd->device; 824 struct scsi_device *sdp = cmd->device;
823 struct request *rq = cmd->request; 825 struct request *rq = cmd->request;
@@ -827,7 +829,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
827 829
828 rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); 830 rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
829 if (!rq->special_vec.bv_page) 831 if (!rq->special_vec.bv_page)
830 return BLKPREP_DEFER; 832 return BLK_STS_RESOURCE;
831 clear_highpage(rq->special_vec.bv_page); 833 clear_highpage(rq->special_vec.bv_page);
832 rq->special_vec.bv_offset = 0; 834 rq->special_vec.bv_offset = 0;
833 rq->special_vec.bv_len = data_len; 835 rq->special_vec.bv_len = data_len;
@@ -848,7 +850,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
848 return scsi_init_io(cmd); 850 return scsi_init_io(cmd);
849} 851}
850 852
851static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) 853static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
852{ 854{
853 struct request *rq = cmd->request; 855 struct request *rq = cmd->request;
854 struct scsi_device *sdp = cmd->device; 856 struct scsi_device *sdp = cmd->device;
@@ -866,7 +868,7 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
866 } 868 }
867 869
868 if (sdp->no_write_same) 870 if (sdp->no_write_same)
869 return BLKPREP_INVALID; 871 return BLK_STS_TARGET;
870 872
871 if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) 873 if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
872 return sd_setup_write_same16_cmnd(cmd, false); 874 return sd_setup_write_same16_cmnd(cmd, false);
@@ -943,7 +945,7 @@ out:
943 * Will set up either WRITE SAME(10) or WRITE SAME(16) depending on 945 * Will set up either WRITE SAME(10) or WRITE SAME(16) depending on
944 * the preference indicated by the target device. 946 * the preference indicated by the target device.
945 **/ 947 **/
946static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) 948static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
947{ 949{
948 struct request *rq = cmd->request; 950 struct request *rq = cmd->request;
949 struct scsi_device *sdp = cmd->device; 951 struct scsi_device *sdp = cmd->device;
@@ -952,10 +954,10 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
952 sector_t sector = blk_rq_pos(rq); 954 sector_t sector = blk_rq_pos(rq);
953 unsigned int nr_sectors = blk_rq_sectors(rq); 955 unsigned int nr_sectors = blk_rq_sectors(rq);
954 unsigned int nr_bytes = blk_rq_bytes(rq); 956 unsigned int nr_bytes = blk_rq_bytes(rq);
955 int ret; 957 blk_status_t ret;
956 958
957 if (sdkp->device->no_write_same) 959 if (sdkp->device->no_write_same)
958 return BLKPREP_INVALID; 960 return BLK_STS_TARGET;
959 961
960 BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size); 962 BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
961 963
@@ -996,7 +998,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
996 return ret; 998 return ret;
997} 999}
998 1000
999static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd) 1001static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
1000{ 1002{
1001 struct request *rq = cmd->request; 1003 struct request *rq = cmd->request;
1002 1004
@@ -1009,10 +1011,10 @@ static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
1009 cmd->allowed = SD_MAX_RETRIES; 1011 cmd->allowed = SD_MAX_RETRIES;
1010 1012
1011 rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; 1013 rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER;
1012 return BLKPREP_OK; 1014 return BLK_STS_OK;
1013} 1015}
1014 1016
1015static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) 1017static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1016{ 1018{
1017 struct request *rq = SCpnt->request; 1019 struct request *rq = SCpnt->request;
1018 struct scsi_device *sdp = SCpnt->device; 1020 struct scsi_device *sdp = SCpnt->device;
@@ -1022,18 +1024,14 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1022 sector_t threshold; 1024 sector_t threshold;
1023 unsigned int this_count = blk_rq_sectors(rq); 1025 unsigned int this_count = blk_rq_sectors(rq);
1024 unsigned int dif, dix; 1026 unsigned int dif, dix;
1025 int ret;
1026 unsigned char protect; 1027 unsigned char protect;
1028 blk_status_t ret;
1027 1029
1028 ret = scsi_init_io(SCpnt); 1030 ret = scsi_init_io(SCpnt);
1029 if (ret != BLKPREP_OK) 1031 if (ret != BLK_STS_OK)
1030 return ret; 1032 return ret;
1031 WARN_ON_ONCE(SCpnt != rq->special); 1033 WARN_ON_ONCE(SCpnt != rq->special);
1032 1034
1033 /* from here on until we're complete, any goto out
1034 * is used for a killable error condition */
1035 ret = BLKPREP_KILL;
1036
1037 SCSI_LOG_HLQUEUE(1, 1035 SCSI_LOG_HLQUEUE(1,
1038 scmd_printk(KERN_INFO, SCpnt, 1036 scmd_printk(KERN_INFO, SCpnt,
1039 "%s: block=%llu, count=%d\n", 1037 "%s: block=%llu, count=%d\n",
@@ -1046,7 +1044,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1046 blk_rq_sectors(rq))); 1044 blk_rq_sectors(rq)));
1047 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, 1045 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
1048 "Retry with 0x%p\n", SCpnt)); 1046 "Retry with 0x%p\n", SCpnt));
1049 goto out; 1047 return BLK_STS_IOERR;
1050 } 1048 }
1051 1049
1052 if (sdp->changed) { 1050 if (sdp->changed) {
@@ -1055,7 +1053,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1055 * the changed bit has been reset 1053 * the changed bit has been reset
1056 */ 1054 */
1057 /* printk("SCSI disk has been changed or is not present. Prohibiting further I/O.\n"); */ 1055 /* printk("SCSI disk has been changed or is not present. Prohibiting further I/O.\n"); */
1058 goto out; 1056 return BLK_STS_IOERR;
1059 } 1057 }
1060 1058
1061 /* 1059 /*
@@ -1093,31 +1091,28 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1093 if ((block & 1) || (blk_rq_sectors(rq) & 1)) { 1091 if ((block & 1) || (blk_rq_sectors(rq) & 1)) {
1094 scmd_printk(KERN_ERR, SCpnt, 1092 scmd_printk(KERN_ERR, SCpnt,
1095 "Bad block number requested\n"); 1093 "Bad block number requested\n");
1096 goto out; 1094 return BLK_STS_IOERR;
1097 } else {
1098 block = block >> 1;
1099 this_count = this_count >> 1;
1100 } 1095 }
1096 block = block >> 1;
1097 this_count = this_count >> 1;
1101 } 1098 }
1102 if (sdp->sector_size == 2048) { 1099 if (sdp->sector_size == 2048) {
1103 if ((block & 3) || (blk_rq_sectors(rq) & 3)) { 1100 if ((block & 3) || (blk_rq_sectors(rq) & 3)) {
1104 scmd_printk(KERN_ERR, SCpnt, 1101 scmd_printk(KERN_ERR, SCpnt,
1105 "Bad block number requested\n"); 1102 "Bad block number requested\n");
1106 goto out; 1103 return BLK_STS_IOERR;
1107 } else {
1108 block = block >> 2;
1109 this_count = this_count >> 2;
1110 } 1104 }
1105 block = block >> 2;
1106 this_count = this_count >> 2;
1111 } 1107 }
1112 if (sdp->sector_size == 4096) { 1108 if (sdp->sector_size == 4096) {
1113 if ((block & 7) || (blk_rq_sectors(rq) & 7)) { 1109 if ((block & 7) || (blk_rq_sectors(rq) & 7)) {
1114 scmd_printk(KERN_ERR, SCpnt, 1110 scmd_printk(KERN_ERR, SCpnt,
1115 "Bad block number requested\n"); 1111 "Bad block number requested\n");
1116 goto out; 1112 return BLK_STS_IOERR;
1117 } else {
1118 block = block >> 3;
1119 this_count = this_count >> 3;
1120 } 1113 }
1114 block = block >> 3;
1115 this_count = this_count >> 3;
1121 } 1116 }
1122 if (rq_data_dir(rq) == WRITE) { 1117 if (rq_data_dir(rq) == WRITE) {
1123 SCpnt->cmnd[0] = WRITE_6; 1118 SCpnt->cmnd[0] = WRITE_6;
@@ -1129,7 +1124,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1129 SCpnt->cmnd[0] = READ_6; 1124 SCpnt->cmnd[0] = READ_6;
1130 } else { 1125 } else {
1131 scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq)); 1126 scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq));
1132 goto out; 1127 return BLK_STS_IOERR;
1133 } 1128 }
1134 1129
1135 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, 1130 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
@@ -1149,10 +1144,8 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1149 if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { 1144 if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) {
1150 SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC); 1145 SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC);
1151 1146
1152 if (unlikely(SCpnt->cmnd == NULL)) { 1147 if (unlikely(!SCpnt->cmnd))
1153 ret = BLKPREP_DEFER; 1148 return BLK_STS_RESOURCE;
1154 goto out;
1155 }
1156 1149
1157 SCpnt->cmd_len = SD_EXT_CDB_SIZE; 1150 SCpnt->cmd_len = SD_EXT_CDB_SIZE;
1158 memset(SCpnt->cmnd, 0, SCpnt->cmd_len); 1151 memset(SCpnt->cmnd, 0, SCpnt->cmd_len);
@@ -1220,7 +1213,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1220 */ 1213 */
1221 scmd_printk(KERN_ERR, SCpnt, 1214 scmd_printk(KERN_ERR, SCpnt,
1222 "FUA write on READ/WRITE(6) drive\n"); 1215 "FUA write on READ/WRITE(6) drive\n");
1223 goto out; 1216 return BLK_STS_IOERR;
1224 } 1217 }
1225 1218
1226 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f); 1219 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
@@ -1244,12 +1237,10 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
1244 * This indicates that the command is ready from our end to be 1237 * This indicates that the command is ready from our end to be
1245 * queued. 1238 * queued.
1246 */ 1239 */
1247 ret = BLKPREP_OK; 1240 return BLK_STS_OK;
1248 out:
1249 return ret;
1250} 1241}
1251 1242
1252static int sd_init_command(struct scsi_cmnd *cmd) 1243static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
1253{ 1244{
1254 struct request *rq = cmd->request; 1245 struct request *rq = cmd->request;
1255 1246
@@ -1265,7 +1256,7 @@ static int sd_init_command(struct scsi_cmnd *cmd)
1265 case SD_LBP_ZERO: 1256 case SD_LBP_ZERO:
1266 return sd_setup_write_same10_cmnd(cmd, false); 1257 return sd_setup_write_same10_cmnd(cmd, false);
1267 default: 1258 default:
1268 return BLKPREP_INVALID; 1259 return BLK_STS_TARGET;
1269 } 1260 }
1270 case REQ_OP_WRITE_ZEROES: 1261 case REQ_OP_WRITE_ZEROES:
1271 return sd_setup_write_zeroes_cmnd(cmd); 1262 return sd_setup_write_zeroes_cmnd(cmd);
@@ -1280,7 +1271,7 @@ static int sd_init_command(struct scsi_cmnd *cmd)
1280 return sd_zbc_setup_reset_cmnd(cmd); 1271 return sd_zbc_setup_reset_cmnd(cmd);
1281 default: 1272 default:
1282 WARN_ON_ONCE(1); 1273 WARN_ON_ONCE(1);
1283 return BLKPREP_KILL; 1274 return BLK_STS_NOTSUPP;
1284 } 1275 }
1285} 1276}
1286 1277
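The sd.c conversion above switches every command-setup helper from the old BLKPREP_* codes to blk_status_t. As these hunks read, BLKPREP_OK becomes BLK_STS_OK, BLKPREP_DEFER becomes BLK_STS_RESOURCE, BLKPREP_KILL becomes BLK_STS_IOERR, BLKPREP_INVALID becomes BLK_STS_TARGET, and the unknown-op WARN in sd_init_command() now returns BLK_STS_NOTSUPP. A condensed sketch of the resulting shape of a setup helper, not a complete CDB builder; sd_setup_sketch is an illustrative name and sd_page_pool is the sd.c mempool seen in the hunks:

static blk_status_t sd_setup_sketch(struct scsi_cmnd *cmd)
{
        struct request *rq = cmd->request;

        if (cmd->device->no_write_same)
                return BLK_STS_TARGET;          /* was BLKPREP_INVALID */

        rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC);
        if (!rq->special_vec.bv_page)
                return BLK_STS_RESOURCE;        /* was BLKPREP_DEFER: retry */

        /* ... fill in the CDB as the real helpers do ... */
        return BLK_STS_OK;                      /* was BLKPREP_OK: ready */
}

Errors that used to jump to a shared "ret = BLKPREP_KILL" label now return BLK_STS_IOERR at the point of failure, which is what removes the goto-out pattern from sd_setup_read_write_cmnd().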
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 1d63f3a23ffb..7f43e6839bce 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -271,7 +271,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
271 271
272extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); 272extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
273extern void sd_zbc_print_zones(struct scsi_disk *sdkp); 273extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
274extern int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); 274extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
275extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, 275extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
276 struct scsi_sense_hdr *sshdr); 276 struct scsi_sense_hdr *sshdr);
277extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, 277extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
@@ -288,9 +288,9 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
288 288
289static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {} 289static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {}
290 290
291static inline int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) 291static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
292{ 292{
293 return BLKPREP_INVALID; 293 return BLK_STS_TARGET;
294} 294}
295 295
296static inline void sd_zbc_complete(struct scsi_cmnd *cmd, 296static inline void sd_zbc_complete(struct scsi_cmnd *cmd,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index e06c48c866e4..83365b29a4d8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -185,7 +185,7 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
185 * 185 *
186 * Called from sd_init_command() for a REQ_OP_ZONE_RESET request. 186 * Called from sd_init_command() for a REQ_OP_ZONE_RESET request.
187 */ 187 */
188int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd) 188blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
189{ 189{
190 struct request *rq = cmd->request; 190 struct request *rq = cmd->request;
191 struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); 191 struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
@@ -194,14 +194,14 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
194 194
195 if (!sd_is_zoned(sdkp)) 195 if (!sd_is_zoned(sdkp))
196 /* Not a zoned device */ 196 /* Not a zoned device */
197 return BLKPREP_KILL; 197 return BLK_STS_IOERR;
198 198
199 if (sdkp->device->changed) 199 if (sdkp->device->changed)
200 return BLKPREP_KILL; 200 return BLK_STS_IOERR;
201 201
202 if (sector & (sd_zbc_zone_sectors(sdkp) - 1)) 202 if (sector & (sd_zbc_zone_sectors(sdkp) - 1))
203 /* Unaligned request */ 203 /* Unaligned request */
204 return BLKPREP_KILL; 204 return BLK_STS_IOERR;
205 205
206 cmd->cmd_len = 16; 206 cmd->cmd_len = 16;
207 memset(cmd->cmnd, 0, cmd->cmd_len); 207 memset(cmd->cmnd, 0, cmd->cmd_len);
@@ -214,7 +214,7 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
214 cmd->transfersize = 0; 214 cmd->transfersize = 0;
215 cmd->allowed = 0; 215 cmd->allowed = 0;
216 216
217 return BLKPREP_OK; 217 return BLK_STS_OK;
218} 218}
219 219
220/** 220/**
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index c6ad00703c5b..4e27460ec926 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1390,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status)
1390 */ 1390 */
1391 srp->rq = NULL; 1391 srp->rq = NULL;
1392 scsi_req_free_cmd(scsi_req(rq)); 1392 scsi_req_free_cmd(scsi_req(rq));
1393 __blk_put_request(rq->q, rq); 1393 blk_put_request(rq);
1394 1394
1395 write_lock_irqsave(&sfp->rq_list_lock, iflags); 1395 write_lock_irqsave(&sfp->rq_list_lock, iflags);
1396 if (unlikely(srp->orphan)) { 1396 if (unlikely(srp->orphan)) {
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index a25a07a0b7f0..bac084260d80 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -5319,7 +5319,8 @@ static int pqi_map_queues(struct Scsi_Host *shost)
5319{ 5319{
5320 struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); 5320 struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost);
5321 5321
5322 return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0); 5322 return blk_mq_pci_map_queues(&shost->tag_set.map[0],
5323 ctrl_info->pci_dev, 0);
5323} 5324}
5324 5325
5325static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info, 5326static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info,
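pqi_map_queues() now passes a single queue map (&shost->tag_set.map[0]) instead of the whole tag set, matching the multiple-queue-map support in this series. A hedged sketch of what a SCSI driver's ->map_queues callback looks like after this change; my_map_queues and struct my_ctrl are placeholders:

#include <linux/blk-mq-pci.h>
#include <scsi/scsi_host.h>

struct my_ctrl {
        struct pci_dev *pdev;
};

static int my_map_queues(struct Scsi_Host *shost)
{
        struct my_ctrl *ctrl = shost_priv(shost);

        /* map only the first (default) queue type, starting at vector 0 */
        return blk_mq_pci_map_queues(&shost->tag_set.map[0],
                                     ctrl->pdev, 0);
}

A driver with separate read or poll queue types would presumably call the helper once per populated entry in tag_set.map[].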
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 54dd70ae9731..38ddbbfe5f3c 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -80,7 +80,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_WORM);
80static DEFINE_MUTEX(sr_mutex); 80static DEFINE_MUTEX(sr_mutex);
81static int sr_probe(struct device *); 81static int sr_probe(struct device *);
82static int sr_remove(struct device *); 82static int sr_remove(struct device *);
83static int sr_init_command(struct scsi_cmnd *SCpnt); 83static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt);
84static int sr_done(struct scsi_cmnd *); 84static int sr_done(struct scsi_cmnd *);
85static int sr_runtime_suspend(struct device *dev); 85static int sr_runtime_suspend(struct device *dev);
86 86
@@ -384,22 +384,22 @@ static int sr_done(struct scsi_cmnd *SCpnt)
384 return good_bytes; 384 return good_bytes;
385} 385}
386 386
387static int sr_init_command(struct scsi_cmnd *SCpnt) 387static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt)
388{ 388{
389 int block = 0, this_count, s_size; 389 int block = 0, this_count, s_size;
390 struct scsi_cd *cd; 390 struct scsi_cd *cd;
391 struct request *rq = SCpnt->request; 391 struct request *rq = SCpnt->request;
392 int ret; 392 blk_status_t ret;
393 393
394 ret = scsi_init_io(SCpnt); 394 ret = scsi_init_io(SCpnt);
395 if (ret != BLKPREP_OK) 395 if (ret != BLK_STS_OK)
396 goto out; 396 goto out;
397 WARN_ON_ONCE(SCpnt != rq->special); 397 WARN_ON_ONCE(SCpnt != rq->special);
398 cd = scsi_cd(rq->rq_disk); 398 cd = scsi_cd(rq->rq_disk);
399 399
400 /* from here on until we're complete, any goto out 400 /* from here on until we're complete, any goto out
401 * is used for a killable error condition */ 401 * is used for a killable error condition */
402 ret = BLKPREP_KILL; 402 ret = BLK_STS_IOERR;
403 403
404 SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, 404 SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt,
405 "Doing sr request, block = %d\n", block)); 405 "Doing sr request, block = %d\n", block));
@@ -516,7 +516,7 @@ static int sr_init_command(struct scsi_cmnd *SCpnt)
516 * This indicates that the command is ready from our end to be 516 * This indicates that the command is ready from our end to be
517 * queued. 517 * queued.
518 */ 518 */
519 ret = BLKPREP_OK; 519 ret = BLK_STS_OK;
520 out: 520 out:
521 return ret; 521 return ret;
522} 522}
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 307df2fa39a3..7ff22d3f03e3 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -530,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status)
530 complete(SRpnt->waiting); 530 complete(SRpnt->waiting);
531 531
532 blk_rq_unmap_user(tmp); 532 blk_rq_unmap_user(tmp);
533 __blk_put_request(req->q, req); 533 blk_put_request(req);
534} 534}
535 535
536static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, 536static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c
index e5f8e54bf644..775bb4e5e36e 100644
--- a/drivers/scsi/ufs/ufs_bsg.c
+++ b/drivers/scsi/ufs/ufs_bsg.c
@@ -157,7 +157,7 @@ void ufs_bsg_remove(struct ufs_hba *hba)
157 if (!hba->bsg_queue) 157 if (!hba->bsg_queue)
158 return; 158 return;
159 159
160 bsg_unregister_queue(hba->bsg_queue); 160 bsg_remove_queue(hba->bsg_queue);
161 161
162 device_del(bsg_dev); 162 device_del(bsg_dev);
163 put_device(bsg_dev); 163 put_device(bsg_dev);
@@ -193,7 +193,7 @@ int ufs_bsg_probe(struct ufs_hba *hba)
193 if (ret) 193 if (ret)
194 goto out; 194 goto out;
195 195
196 q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, 0); 196 q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, NULL, 0);
197 if (IS_ERR(q)) { 197 if (IS_ERR(q)) {
198 ret = PTR_ERR(q); 198 ret = PTR_ERR(q);
199 goto out; 199 goto out;
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 1c72db94270e..c3c95b314286 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -719,8 +719,9 @@ static void virtscsi_target_destroy(struct scsi_target *starget)
719static int virtscsi_map_queues(struct Scsi_Host *shost) 719static int virtscsi_map_queues(struct Scsi_Host *shost)
720{ 720{
721 struct virtio_scsi *vscsi = shost_priv(shost); 721 struct virtio_scsi *vscsi = shost_priv(shost);
722 struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];
722 723
723 return blk_mq_virtio_map_queues(&shost->tag_set, vscsi->vdev, 2); 724 return blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2);
724} 725}
725 726
726/* 727/*
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 36b742932c72..86987da86dd6 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
150static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup) 150static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
151{ 151{
152 int tag = -1; 152 int tag = -1;
153 DEFINE_WAIT(wait); 153 DEFINE_SBQ_WAIT(wait);
154 struct sbq_wait_state *ws; 154 struct sbq_wait_state *ws;
155 struct sbitmap_queue *sbq;
155 156
156 if (state == TASK_RUNNING) 157 if (state == TASK_RUNNING)
157 return tag; 158 return tag;
158 159
159 ws = &se_sess->sess_tag_pool.ws[0]; 160 sbq = &se_sess->sess_tag_pool;
161 ws = &sbq->ws[0];
160 for (;;) { 162 for (;;) {
161 prepare_to_wait_exclusive(&ws->wait, &wait, state); 163 sbitmap_prepare_to_wait(sbq, ws, &wait, state);
162 if (signal_pending_state(state, current)) 164 if (signal_pending_state(state, current))
163 break; 165 break;
164 tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup); 166 tag = sbitmap_queue_get(sbq, cpup);
165 if (tag >= 0) 167 if (tag >= 0)
166 break; 168 break;
167 schedule(); 169 schedule();
168 } 170 }
169 171
170 finish_wait(&ws->wait, &wait); 172 sbitmap_finish_wait(sbq, ws, &wait);
171 return tag; 173 return tag;
172} 174}
173 175
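The iscsit_wait_for_tag() hunk above switches from open-coded prepare_to_wait_exclusive() to the new sbitmap wait helpers added earlier in this series (DEFINE_SBQ_WAIT, sbitmap_prepare_to_wait, sbitmap_finish_wait). Consolidated from that hunk with generic names, the blocking tag-get loop reads as:

#include <linux/sbitmap.h>
#include <linux/sched/signal.h>

/* Block until a tag is free in @sbq; returns -1 if a signal arrives
 * first (state is e.g. TASK_INTERRUPTIBLE).  Mirrors the hunk above. */
static int get_tag_blocking(struct sbitmap_queue *sbq, int state, int *cpu)
{
        struct sbq_wait_state *ws = &sbq->ws[0];
        DEFINE_SBQ_WAIT(wait);
        int tag = -1;

        for (;;) {
                sbitmap_prepare_to_wait(sbq, ws, &wait, state);
                if (signal_pending_state(state, current))
                        break;
                tag = sbitmap_queue_get(sbq, cpu);
                if (tag >= 0)
                        break;
                schedule();
        }
        sbitmap_finish_wait(sbq, ws, &wait);
        return tag;
}

The helpers also keep a count of waiters on the sbitmap queue, which, as the rest of the series suggests, lets the wakeup path skip work when nobody is waiting.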
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 47d76c862014..c062d363dce3 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1094,7 +1094,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status)
1094 break; 1094 break;
1095 } 1095 }
1096 1096
1097 __blk_put_request(req->q, req); 1097 blk_put_request(req);
1098 kfree(pt); 1098 kfree(pt);
1099} 1099}
1100 1100
diff --git a/fs/aio.c b/fs/aio.c
index 76f72509f8c5..0f99cad35ffe 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1438,17 +1438,22 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
1438 ret = ioprio_check_cap(iocb->aio_reqprio); 1438 ret = ioprio_check_cap(iocb->aio_reqprio);
1439 if (ret) { 1439 if (ret) {
1440 pr_debug("aio ioprio check cap error: %d\n", ret); 1440 pr_debug("aio ioprio check cap error: %d\n", ret);
1441 fput(req->ki_filp); 1441 goto out_fput;
1442 return ret;
1443 } 1442 }
1444 1443
1445 req->ki_ioprio = iocb->aio_reqprio; 1444 req->ki_ioprio = iocb->aio_reqprio;
1446 } else 1445 } else
1447 req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); 1446 req->ki_ioprio = get_current_ioprio();
1448 1447
1449 ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); 1448 ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
1450 if (unlikely(ret)) 1449 if (unlikely(ret))
1451 fput(req->ki_filp); 1450 goto out_fput;
1451
1452 req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
1453 return 0;
1454
1455out_fput:
1456 fput(req->ki_filp);
1452 return ret; 1457 return ret;
1453} 1458}
1454 1459
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a80b4f0ee7c4..e1886cc7048f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
181 struct task_struct *waiter = bio->bi_private; 181 struct task_struct *waiter = bio->bi_private;
182 182
183 WRITE_ONCE(bio->bi_private, NULL); 183 WRITE_ONCE(bio->bi_private, NULL);
184 wake_up_process(waiter); 184 blk_wake_io_task(waiter);
185} 185}
186 186
187static ssize_t 187static ssize_t
@@ -232,14 +232,18 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
232 bio.bi_opf = dio_bio_write_op(iocb); 232 bio.bi_opf = dio_bio_write_op(iocb);
233 task_io_account_write(ret); 233 task_io_account_write(ret);
234 } 234 }
235 if (iocb->ki_flags & IOCB_HIPRI)
236 bio.bi_opf |= REQ_HIPRI;
235 237
236 qc = submit_bio(&bio); 238 qc = submit_bio(&bio);
237 for (;;) { 239 for (;;) {
238 set_current_state(TASK_UNINTERRUPTIBLE); 240 __set_current_state(TASK_UNINTERRUPTIBLE);
241
239 if (!READ_ONCE(bio.bi_private)) 242 if (!READ_ONCE(bio.bi_private))
240 break; 243 break;
244
241 if (!(iocb->ki_flags & IOCB_HIPRI) || 245 if (!(iocb->ki_flags & IOCB_HIPRI) ||
242 !blk_poll(bdev_get_queue(bdev), qc)) 246 !blk_poll(bdev_get_queue(bdev), qc, true))
243 io_schedule(); 247 io_schedule();
244 } 248 }
245 __set_current_state(TASK_RUNNING); 249 __set_current_state(TASK_RUNNING);
@@ -298,12 +302,13 @@ static void blkdev_bio_end_io(struct bio *bio)
298 } 302 }
299 303
300 dio->iocb->ki_complete(iocb, ret, 0); 304 dio->iocb->ki_complete(iocb, ret, 0);
301 bio_put(&dio->bio); 305 if (dio->multi_bio)
306 bio_put(&dio->bio);
302 } else { 307 } else {
303 struct task_struct *waiter = dio->waiter; 308 struct task_struct *waiter = dio->waiter;
304 309
305 WRITE_ONCE(dio->waiter, NULL); 310 WRITE_ONCE(dio->waiter, NULL);
306 wake_up_process(waiter); 311 blk_wake_io_task(waiter);
307 } 312 }
308 } 313 }
309 314
@@ -328,6 +333,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
328 struct blk_plug plug; 333 struct blk_plug plug;
329 struct blkdev_dio *dio; 334 struct blkdev_dio *dio;
330 struct bio *bio; 335 struct bio *bio;
336 bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
331 bool is_read = (iov_iter_rw(iter) == READ), is_sync; 337 bool is_read = (iov_iter_rw(iter) == READ), is_sync;
332 loff_t pos = iocb->ki_pos; 338 loff_t pos = iocb->ki_pos;
333 blk_qc_t qc = BLK_QC_T_NONE; 339 blk_qc_t qc = BLK_QC_T_NONE;
@@ -338,20 +344,27 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
338 return -EINVAL; 344 return -EINVAL;
339 345
340 bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); 346 bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
341 bio_get(bio); /* extra ref for the completion handler */
342 347
343 dio = container_of(bio, struct blkdev_dio, bio); 348 dio = container_of(bio, struct blkdev_dio, bio);
344 dio->is_sync = is_sync = is_sync_kiocb(iocb); 349 dio->is_sync = is_sync = is_sync_kiocb(iocb);
345 if (dio->is_sync) 350 if (dio->is_sync) {
346 dio->waiter = current; 351 dio->waiter = current;
347 else 352 bio_get(bio);
353 } else {
348 dio->iocb = iocb; 354 dio->iocb = iocb;
355 }
349 356
350 dio->size = 0; 357 dio->size = 0;
351 dio->multi_bio = false; 358 dio->multi_bio = false;
352 dio->should_dirty = is_read && iter_is_iovec(iter); 359 dio->should_dirty = is_read && iter_is_iovec(iter);
353 360
354 blk_start_plug(&plug); 361 /*
362 * Don't plug for HIPRI/polled IO, as those should go straight
363 * to issue
364 */
365 if (!is_poll)
366 blk_start_plug(&plug);
367
355 for (;;) { 368 for (;;) {
356 bio_set_dev(bio, bdev); 369 bio_set_dev(bio, bdev);
357 bio->bi_iter.bi_sector = pos >> 9; 370 bio->bi_iter.bi_sector = pos >> 9;
@@ -381,11 +394,21 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
381 394
382 nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); 395 nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
383 if (!nr_pages) { 396 if (!nr_pages) {
397 if (iocb->ki_flags & IOCB_HIPRI)
398 bio->bi_opf |= REQ_HIPRI;
399
384 qc = submit_bio(bio); 400 qc = submit_bio(bio);
385 break; 401 break;
386 } 402 }
387 403
388 if (!dio->multi_bio) { 404 if (!dio->multi_bio) {
405 /*
406 * AIO needs an extra reference to ensure the dio
407 * structure which is embedded into the first bio
408 * stays around.
409 */
410 if (!is_sync)
411 bio_get(bio);
389 dio->multi_bio = true; 412 dio->multi_bio = true;
390 atomic_set(&dio->ref, 2); 413 atomic_set(&dio->ref, 2);
391 } else { 414 } else {
@@ -395,18 +418,21 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
395 submit_bio(bio); 418 submit_bio(bio);
396 bio = bio_alloc(GFP_KERNEL, nr_pages); 419 bio = bio_alloc(GFP_KERNEL, nr_pages);
397 } 420 }
398 blk_finish_plug(&plug); 421
422 if (!is_poll)
423 blk_finish_plug(&plug);
399 424
400 if (!is_sync) 425 if (!is_sync)
401 return -EIOCBQUEUED; 426 return -EIOCBQUEUED;
402 427
403 for (;;) { 428 for (;;) {
404 set_current_state(TASK_UNINTERRUPTIBLE); 429 __set_current_state(TASK_UNINTERRUPTIBLE);
430
405 if (!READ_ONCE(dio->waiter)) 431 if (!READ_ONCE(dio->waiter))
406 break; 432 break;
407 433
408 if (!(iocb->ki_flags & IOCB_HIPRI) || 434 if (!(iocb->ki_flags & IOCB_HIPRI) ||
409 !blk_poll(bdev_get_queue(bdev), qc)) 435 !blk_poll(bdev_get_queue(bdev), qc, true))
410 io_schedule(); 436 io_schedule();
411 } 437 }
412 __set_current_state(TASK_RUNNING); 438 __set_current_state(TASK_RUNNING);
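The __blkdev_direct_IO*() hunks above adjust the synchronous completion wait for the new polling model: the task state is set with __set_current_state(), completions go through blk_wake_io_task(), REQ_HIPRI is set on the bio when the iocb asked for polled IO, and blk_poll() gains a third argument which, as this series reads, selects spinning until completion rather than a single poll attempt. An annotated copy of the wait loop from the simple path above (qc, bio and iocb as in that code):

        qc = submit_bio(&bio);
        for (;;) {
                __set_current_state(TASK_UNINTERRUPTIBLE);

                /* the completion handler clears bi_private and wakes us */
                if (!READ_ONCE(bio.bi_private))
                        break;

                /* for HIPRI IO, spin in blk_poll(); otherwise sleep until
                 * blk_wake_io_task() runs from the completion path */
                if (!(iocb->ki_flags & IOCB_HIPRI) ||
                    !blk_poll(bdev_get_queue(bdev), qc, true))
                        io_schedule();
        }
        __set_current_state(TASK_RUNNING);

The same pattern appears in fs/direct-io.c and fs/iomap.c further down.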
diff --git a/fs/buffer.c b/fs/buffer.c
index 1286c2b95498..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3060 */ 3060 */
3061 bio = bio_alloc(GFP_NOIO, 1); 3061 bio = bio_alloc(GFP_NOIO, 1);
3062 3062
3063 if (wbc) {
3064 wbc_init_bio(wbc, bio);
3065 wbc_account_io(wbc, bh->b_page, bh->b_size);
3066 }
3067
3068 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); 3063 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3069 bio_set_dev(bio, bh->b_bdev); 3064 bio_set_dev(bio, bh->b_bdev);
3070 bio->bi_write_hint = write_hint; 3065 bio->bi_write_hint = write_hint;
@@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3084 op_flags |= REQ_PRIO; 3079 op_flags |= REQ_PRIO;
3085 bio_set_op_attrs(bio, op, op_flags); 3080 bio_set_op_attrs(bio, op, op_flags);
3086 3081
3082 if (wbc) {
3083 wbc_init_bio(wbc, bio);
3084 wbc_account_io(wbc, bh->b_page, bh->b_size);
3085 }
3086
3087 submit_bio(bio); 3087 submit_bio(bio);
3088 return 0; 3088 return 0;
3089} 3089}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 41a0e97252ae..dbc1a1f080ce 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio)
518 dio->waiter = current; 518 dio->waiter = current;
519 spin_unlock_irqrestore(&dio->bio_lock, flags); 519 spin_unlock_irqrestore(&dio->bio_lock, flags);
520 if (!(dio->iocb->ki_flags & IOCB_HIPRI) || 520 if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
521 !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) 521 !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
522 io_schedule(); 522 io_schedule();
523 /* wake up sets us TASK_RUNNING */ 523 /* wake up sets us TASK_RUNNING */
524 spin_lock_irqsave(&dio->bio_lock, flags); 524 spin_lock_irqsave(&dio->bio_lock, flags);
@@ -1265,6 +1265,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
1265 } else { 1265 } else {
1266 dio->op = REQ_OP_READ; 1266 dio->op = REQ_OP_READ;
1267 } 1267 }
1268 if (iocb->ki_flags & IOCB_HIPRI)
1269 dio->op_flags |= REQ_HIPRI;
1268 1270
1269 /* 1271 /*
1270 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue 1272 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index db7590178dfc..2aa62d58d8dd 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
374 bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); 374 bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
375 if (!bio) 375 if (!bio)
376 return -ENOMEM; 376 return -ENOMEM;
377 wbc_init_bio(io->io_wbc, bio);
378 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); 377 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
379 bio_set_dev(bio, bh->b_bdev); 378 bio_set_dev(bio, bh->b_bdev);
380 bio->bi_end_io = ext4_end_bio; 379 bio->bi_end_io = ext4_end_bio;
381 bio->bi_private = ext4_get_io_end(io->io_end); 380 bio->bi_private = ext4_get_io_end(io->io_end);
382 io->io_bio = bio; 381 io->io_bio = bio;
383 io->io_next_block = bh->b_blocknr; 382 io->io_next_block = bh->b_blocknr;
383 wbc_init_bio(io->io_wbc, bio);
384 return 0; 384 return 0;
385} 385}
386 386
diff --git a/fs/iomap.c b/fs/iomap.c
index ce837d962d47..e87c288cd5ef 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1543,7 +1543,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
1543 if (dio->wait_for_completion) { 1543 if (dio->wait_for_completion) {
1544 struct task_struct *waiter = dio->submit.waiter; 1544 struct task_struct *waiter = dio->submit.waiter;
1545 WRITE_ONCE(dio->submit.waiter, NULL); 1545 WRITE_ONCE(dio->submit.waiter, NULL);
1546 wake_up_process(waiter); 1546 blk_wake_io_task(waiter);
1547 } else if (dio->flags & IOMAP_DIO_WRITE) { 1547 } else if (dio->flags & IOMAP_DIO_WRITE) {
1548 struct inode *inode = file_inode(dio->iocb->ki_filp); 1548 struct inode *inode = file_inode(dio->iocb->ki_filp);
1549 1549
@@ -1571,6 +1571,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
1571 unsigned len) 1571 unsigned len)
1572{ 1572{
1573 struct page *page = ZERO_PAGE(0); 1573 struct page *page = ZERO_PAGE(0);
1574 int flags = REQ_SYNC | REQ_IDLE;
1574 struct bio *bio; 1575 struct bio *bio;
1575 1576
1576 bio = bio_alloc(GFP_KERNEL, 1); 1577 bio = bio_alloc(GFP_KERNEL, 1);
@@ -1579,9 +1580,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
1579 bio->bi_private = dio; 1580 bio->bi_private = dio;
1580 bio->bi_end_io = iomap_dio_bio_end_io; 1581 bio->bi_end_io = iomap_dio_bio_end_io;
1581 1582
1583 if (dio->iocb->ki_flags & IOCB_HIPRI)
1584 flags |= REQ_HIPRI;
1585
1582 get_page(page); 1586 get_page(page);
1583 __bio_add_page(bio, page, len, 0); 1587 __bio_add_page(bio, page, len, 0);
1584 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); 1588 bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
1585 1589
1586 atomic_inc(&dio->ref); 1590 atomic_inc(&dio->ref);
1587 return submit_bio(bio); 1591 return submit_bio(bio);
@@ -1687,6 +1691,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
1687 bio_set_pages_dirty(bio); 1691 bio_set_pages_dirty(bio);
1688 } 1692 }
1689 1693
1694 if (dio->iocb->ki_flags & IOCB_HIPRI)
1695 bio->bi_opf |= REQ_HIPRI;
1696
1690 iov_iter_advance(dio->submit.iter, n); 1697 iov_iter_advance(dio->submit.iter, n);
1691 1698
1692 dio->size += n; 1699 dio->size += n;
@@ -1914,14 +1921,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1914 return -EIOCBQUEUED; 1921 return -EIOCBQUEUED;
1915 1922
1916 for (;;) { 1923 for (;;) {
1917 set_current_state(TASK_UNINTERRUPTIBLE); 1924 __set_current_state(TASK_UNINTERRUPTIBLE);
1925
1918 if (!READ_ONCE(dio->submit.waiter)) 1926 if (!READ_ONCE(dio->submit.waiter))
1919 break; 1927 break;
1920 1928
1921 if (!(iocb->ki_flags & IOCB_HIPRI) || 1929 if (!(iocb->ki_flags & IOCB_HIPRI) ||
1922 !dio->submit.last_queue || 1930 !dio->submit.last_queue ||
1923 !blk_poll(dio->submit.last_queue, 1931 !blk_poll(dio->submit.last_queue,
1924 dio->submit.cookie)) 1932 dio->submit.cookie, true))
1925 io_schedule(); 1933 io_schedule();
1926 } 1934 }
1927 __set_current_state(TASK_RUNNING); 1935 __set_current_state(TASK_RUNNING);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 056fb627edb3..7380b094dcca 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -491,35 +491,40 @@ do { \
491 bio_clear_flag(bio, BIO_THROTTLED);\ 491 bio_clear_flag(bio, BIO_THROTTLED);\
492 (bio)->bi_disk = (bdev)->bd_disk; \ 492 (bio)->bi_disk = (bdev)->bd_disk; \
493 (bio)->bi_partno = (bdev)->bd_partno; \ 493 (bio)->bi_partno = (bdev)->bd_partno; \
494 bio_associate_blkg(bio); \
494} while (0) 495} while (0)
495 496
496#define bio_copy_dev(dst, src) \ 497#define bio_copy_dev(dst, src) \
497do { \ 498do { \
498 (dst)->bi_disk = (src)->bi_disk; \ 499 (dst)->bi_disk = (src)->bi_disk; \
499 (dst)->bi_partno = (src)->bi_partno; \ 500 (dst)->bi_partno = (src)->bi_partno; \
501 bio_clone_blkg_association(dst, src); \
500} while (0) 502} while (0)
501 503
502#define bio_dev(bio) \ 504#define bio_dev(bio) \
503 disk_devt((bio)->bi_disk) 505 disk_devt((bio)->bi_disk)
504 506
505#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) 507#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
506int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); 508void bio_associate_blkg_from_page(struct bio *bio, struct page *page);
507#else 509#else
508static inline int bio_associate_blkcg_from_page(struct bio *bio, 510static inline void bio_associate_blkg_from_page(struct bio *bio,
509 struct page *page) { return 0; } 511 struct page *page) { }
510#endif 512#endif
511 513
512#ifdef CONFIG_BLK_CGROUP 514#ifdef CONFIG_BLK_CGROUP
513int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); 515void bio_disassociate_blkg(struct bio *bio);
514int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); 516void bio_associate_blkg(struct bio *bio);
515void bio_disassociate_task(struct bio *bio); 517void bio_associate_blkg_from_css(struct bio *bio,
516void bio_clone_blkcg_association(struct bio *dst, struct bio *src); 518 struct cgroup_subsys_state *css);
519void bio_clone_blkg_association(struct bio *dst, struct bio *src);
517#else /* CONFIG_BLK_CGROUP */ 520#else /* CONFIG_BLK_CGROUP */
518static inline int bio_associate_blkcg(struct bio *bio, 521static inline void bio_disassociate_blkg(struct bio *bio) { }
519 struct cgroup_subsys_state *blkcg_css) { return 0; } 522static inline void bio_associate_blkg(struct bio *bio) { }
520static inline void bio_disassociate_task(struct bio *bio) { } 523static inline void bio_associate_blkg_from_css(struct bio *bio,
521static inline void bio_clone_blkcg_association(struct bio *dst, 524 struct cgroup_subsys_state *css)
522 struct bio *src) { } 525{ }
526static inline void bio_clone_blkg_association(struct bio *dst,
527 struct bio *src) { }
523#endif /* CONFIG_BLK_CGROUP */ 528#endif /* CONFIG_BLK_CGROUP */
524 529
525#ifdef CONFIG_HIGHMEM 530#ifdef CONFIG_HIGHMEM
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 6d766a19f2bb..f025fd1e22e6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -21,6 +21,7 @@
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/fs.h>
24 25
25/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ 26/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
26#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) 27#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
@@ -122,11 +123,8 @@ struct blkcg_gq {
122 /* all non-root blkcg_gq's are guaranteed to have access to parent */ 123 /* all non-root blkcg_gq's are guaranteed to have access to parent */
123 struct blkcg_gq *parent; 124 struct blkcg_gq *parent;
124 125
125 /* request allocation list for this blkcg-q pair */
126 struct request_list rl;
127
128 /* reference count */ 126 /* reference count */
129 atomic_t refcnt; 127 struct percpu_ref refcnt;
130 128
131 /* is this blkg online? protected by both blkcg and q locks */ 129 /* is this blkg online? protected by both blkcg and q locks */
132 bool online; 130 bool online;
@@ -184,6 +182,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css;
184 182
185struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, 183struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
186 struct request_queue *q, bool update_hint); 184 struct request_queue *q, bool update_hint);
185struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
186 struct request_queue *q);
187struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 187struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
188 struct request_queue *q); 188 struct request_queue *q);
189int blkcg_init_queue(struct request_queue *q); 189int blkcg_init_queue(struct request_queue *q);
@@ -230,22 +230,62 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
230 char *input, struct blkg_conf_ctx *ctx); 230 char *input, struct blkg_conf_ctx *ctx);
231void blkg_conf_finish(struct blkg_conf_ctx *ctx); 231void blkg_conf_finish(struct blkg_conf_ctx *ctx);
232 232
233/**
234 * blkcg_css - find the current css
235 *
236 * Find the css associated with either the kthread or the current task.
237 * This may return a dying css, so it is up to the caller to use tryget logic
238 * to confirm it is alive and well.
239 */
240static inline struct cgroup_subsys_state *blkcg_css(void)
241{
242 struct cgroup_subsys_state *css;
243
244 css = kthread_blkcg();
245 if (css)
246 return css;
247 return task_css(current, io_cgrp_id);
248}
233 249
234static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) 250static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
235{ 251{
236 return css ? container_of(css, struct blkcg, css) : NULL; 252 return css ? container_of(css, struct blkcg, css) : NULL;
237} 253}
238 254
239static inline struct blkcg *bio_blkcg(struct bio *bio) 255/**
256 * __bio_blkcg - internal, inconsistent version to get blkcg
257 *
258 * DO NOT USE.
259 * This function is inconsistent and consequently is dangerous to use. The
260 * first part of the function returns a blkcg where a reference is owned by the
261 * bio. This means it does not need to be rcu protected as it cannot go away
262 * with the bio owning a reference to it. However, the latter potentially gets
263 * it from task_css(). This can race against task migration and the cgroup
264 * dying. It is also semantically different as it must be called rcu protected
265 * and is susceptible to failure when trying to get a reference to it.
266 * Therefore, it is not ok to assume that *_get() will always succeed on the
267 * blkcg returned here.
268 */
269static inline struct blkcg *__bio_blkcg(struct bio *bio)
240{ 270{
241 struct cgroup_subsys_state *css; 271 if (bio && bio->bi_blkg)
272 return bio->bi_blkg->blkcg;
273 return css_to_blkcg(blkcg_css());
274}
242 275
243 if (bio && bio->bi_css) 276/**
244 return css_to_blkcg(bio->bi_css); 277 * bio_blkcg - grab the blkcg associated with a bio
245 css = kthread_blkcg(); 278 * @bio: target bio
246 if (css) 279 *
247 return css_to_blkcg(css); 280 * This returns the blkcg associated with a bio, %NULL if not associated.
248 return css_to_blkcg(task_css(current, io_cgrp_id)); 281 * Callers are expected to either handle %NULL or know association has been
282 * done prior to calling this.
283 */
284static inline struct blkcg *bio_blkcg(struct bio *bio)
285{
286 if (bio && bio->bi_blkg)
287 return bio->bi_blkg->blkcg;
288 return NULL;
249} 289}
250 290
251static inline bool blk_cgroup_congested(void) 291static inline bool blk_cgroup_congested(void)
@@ -328,16 +368,12 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
328 * @q: request_queue of interest 368 * @q: request_queue of interest
329 * 369 *
330 * Lookup blkg for the @blkcg - @q pair. This function should be called 370 * Lookup blkg for the @blkcg - @q pair. This function should be called
331 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing 371 * under RCU read lock.
332 * - see blk_queue_bypass_start() for details.
333 */ 372 */
334static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, 373static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
335 struct request_queue *q) 374 struct request_queue *q)
336{ 375{
337 WARN_ON_ONCE(!rcu_read_lock_held()); 376 WARN_ON_ONCE(!rcu_read_lock_held());
338
339 if (unlikely(blk_queue_bypass(q)))
340 return NULL;
341 return __blkg_lookup(blkcg, q, false); 377 return __blkg_lookup(blkcg, q, false);
342} 378}
343 379
@@ -451,26 +487,35 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
451 */ 487 */
452static inline void blkg_get(struct blkcg_gq *blkg) 488static inline void blkg_get(struct blkcg_gq *blkg)
453{ 489{
454 WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); 490 percpu_ref_get(&blkg->refcnt);
455 atomic_inc(&blkg->refcnt);
456} 491}
457 492
458/** 493/**
459 * blkg_try_get - try and get a blkg reference 494 * blkg_tryget - try and get a blkg reference
460 * @blkg: blkg to get 495 * @blkg: blkg to get
461 * 496 *
462 * This is for use when doing an RCU lookup of the blkg. We may be in the midst 497 * This is for use when doing an RCU lookup of the blkg. We may be in the midst
463 * of freeing this blkg, so we can only use it if the refcnt is not zero. 498 * of freeing this blkg, so we can only use it if the refcnt is not zero.
464 */ 499 */
465static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg) 500static inline bool blkg_tryget(struct blkcg_gq *blkg)
466{ 501{
467 if (atomic_inc_not_zero(&blkg->refcnt)) 502 return percpu_ref_tryget(&blkg->refcnt);
468 return blkg;
469 return NULL;
470} 503}
471 504
505/**
506 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
507 * @blkg: blkg to get
508 *
509 * This walks up the blkg tree to find the closest non-dying blkg and returns
510 * the blkg that it did association with as it may not be the passed in blkg.
511 */
512static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg)
513{
514 while (blkg && !percpu_ref_tryget(&blkg->refcnt))
515 blkg = blkg->parent;
472 516
473void __blkg_release_rcu(struct rcu_head *rcu); 517 return blkg;
518}
474 519
475/** 520/**
476 * blkg_put - put a blkg reference 521 * blkg_put - put a blkg reference
@@ -478,9 +523,7 @@ void __blkg_release_rcu(struct rcu_head *rcu);
478 */ 523 */
479static inline void blkg_put(struct blkcg_gq *blkg) 524static inline void blkg_put(struct blkcg_gq *blkg)
480{ 525{
481 WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); 526 percpu_ref_put(&blkg->refcnt);
482 if (atomic_dec_and_test(&blkg->refcnt))
483 call_rcu(&blkg->rcu_head, __blkg_release_rcu);
484} 527}
485 528
486/** 529/**
@@ -515,94 +558,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
515 if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ 558 if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
516 (p_blkg)->q, false))) 559 (p_blkg)->q, false)))
517 560
518/**
519 * blk_get_rl - get request_list to use
520 * @q: request_queue of interest
521 * @bio: bio which will be attached to the allocated request (may be %NULL)
522 *
523 * The caller wants to allocate a request from @q to use for @bio. Find
524 * the request_list to use and obtain a reference on it. Should be called
525 * under queue_lock. This function is guaranteed to return non-%NULL
526 * request_list.
527 */
528static inline struct request_list *blk_get_rl(struct request_queue *q,
529 struct bio *bio)
530{
531 struct blkcg *blkcg;
532 struct blkcg_gq *blkg;
533
534 rcu_read_lock();
535
536 blkcg = bio_blkcg(bio);
537
538 /* bypass blkg lookup and use @q->root_rl directly for root */
539 if (blkcg == &blkcg_root)
540 goto root_rl;
541
542 /*
543 * Try to use blkg->rl. blkg lookup may fail under memory pressure
544 * or if either the blkcg or queue is going away. Fall back to
545 * root_rl in such cases.
546 */
547 blkg = blkg_lookup(blkcg, q);
548 if (unlikely(!blkg))
549 goto root_rl;
550
551 blkg_get(blkg);
552 rcu_read_unlock();
553 return &blkg->rl;
554root_rl:
555 rcu_read_unlock();
556 return &q->root_rl;
557}
558
559/**
560 * blk_put_rl - put request_list
561 * @rl: request_list to put
562 *
563 * Put the reference acquired by blk_get_rl(). Should be called under
564 * queue_lock.
565 */
566static inline void blk_put_rl(struct request_list *rl)
567{
568 if (rl->blkg->blkcg != &blkcg_root)
569 blkg_put(rl->blkg);
570}
571
572/**
573 * blk_rq_set_rl - associate a request with a request_list
574 * @rq: request of interest
575 * @rl: target request_list
576 *
577 * Associate @rq with @rl so that accounting and freeing can know the
578 * request_list @rq came from.
579 */
580static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
581{
582 rq->rl = rl;
583}
584
585/**
586 * blk_rq_rl - return the request_list a request came from
587 * @rq: request of interest
588 *
589 * Return the request_list @rq is allocated from.
590 */
591static inline struct request_list *blk_rq_rl(struct request *rq)
592{
593 return rq->rl;
594}
595
596struct request_list *__blk_queue_next_rl(struct request_list *rl,
597 struct request_queue *q);
598/**
599 * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
600 *
601 * Should be used under queue_lock.
602 */
603#define blk_queue_for_each_rl(rl, q) \
604 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
605
606static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) 561static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
607{ 562{
608 int ret; 563 int ret;
@@ -797,32 +752,34 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
797 struct bio *bio) { return false; } 752 struct bio *bio) { return false; }
798#endif 753#endif
799 754
755
756static inline void blkcg_bio_issue_init(struct bio *bio)
757{
758 bio_issue_init(&bio->bi_issue, bio_sectors(bio));
759}
760
800static inline bool blkcg_bio_issue_check(struct request_queue *q, 761static inline bool blkcg_bio_issue_check(struct request_queue *q,
801 struct bio *bio) 762 struct bio *bio)
802{ 763{
803 struct blkcg *blkcg;
804 struct blkcg_gq *blkg; 764 struct blkcg_gq *blkg;
805 bool throtl = false; 765 bool throtl = false;
806 766
807 rcu_read_lock(); 767 rcu_read_lock();
808 blkcg = bio_blkcg(bio); 768
809 769 if (!bio->bi_blkg) {
810 /* associate blkcg if bio hasn't attached one */ 770 char b[BDEVNAME_SIZE];
811 bio_associate_blkcg(bio, &blkcg->css); 771
812 772 WARN_ONCE(1,
813 blkg = blkg_lookup(blkcg, q); 773 "no blkg associated for bio on block-device: %s\n",
814 if (unlikely(!blkg)) { 774 bio_devname(bio, b));
815 spin_lock_irq(q->queue_lock); 775 bio_associate_blkg(bio);
816 blkg = blkg_lookup_create(blkcg, q);
817 if (IS_ERR(blkg))
818 blkg = NULL;
819 spin_unlock_irq(q->queue_lock);
820 } 776 }
821 777
778 blkg = bio->bi_blkg;
779
822 throtl = blk_throtl_bio(q, blkg, bio); 780 throtl = blk_throtl_bio(q, blkg, bio);
823 781
824 if (!throtl) { 782 if (!throtl) {
825 blkg = blkg ?: q->root_blkg;
826 /* 783 /*
827 * If the bio is flagged with BIO_QUEUE_ENTERED it means this 784 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
828 * is a split bio and we would have already accounted for the 785 * is a split bio and we would have already accounted for the
@@ -834,6 +791,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
834 blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); 791 blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
835 } 792 }
836 793
794 blkcg_bio_issue_init(bio);
795
837 rcu_read_unlock(); 796 rcu_read_unlock();
838 return !throtl; 797 return !throtl;
839} 798}
@@ -930,6 +889,7 @@ static inline int blkcg_activate_policy(struct request_queue *q,
930static inline void blkcg_deactivate_policy(struct request_queue *q, 889static inline void blkcg_deactivate_policy(struct request_queue *q,
931 const struct blkcg_policy *pol) { } 890 const struct blkcg_policy *pol) { }
932 891
892static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
933static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } 893static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
934 894
935static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 895static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
@@ -939,12 +899,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
939static inline void blkg_get(struct blkcg_gq *blkg) { } 899static inline void blkg_get(struct blkcg_gq *blkg) { }
940static inline void blkg_put(struct blkcg_gq *blkg) { } 900static inline void blkg_put(struct blkcg_gq *blkg) { }
941 901
942static inline struct request_list *blk_get_rl(struct request_queue *q, 902static inline void blkcg_bio_issue_init(struct bio *bio) { }
943 struct bio *bio) { return &q->root_rl; }
944static inline void blk_put_rl(struct request_list *rl) { }
945static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
946static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
947
948static inline bool blkcg_bio_issue_check(struct request_queue *q, 903static inline bool blkcg_bio_issue_check(struct request_queue *q,
949 struct bio *bio) { return true; } 904 struct bio *bio) { return true; }
950 905
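
To make the reworked blkcg_bio_issue_check() above concrete, here is a minimal, hedged sketch of the submitter-side contract it implies: bios are expected to arrive with bio->bi_blkg already associated, and the WARN_ONCE()/bio_associate_blkg() branch is only a fallback. Only bio_associate_blkg(), blkcg_bio_issue_check() and bio->bi_blkg come from this diff; the helper name, the include list and the surrounding flow are illustrative.

#include <linux/bio.h>
#include <linux/blk-cgroup.h>

/* Hypothetical submission helper; @q is the bio's request_queue. */
static bool example_issue_bio(struct request_queue *q, struct bio *bio)
{
	/*
	 * Associate a blkg up front so the check below never has to take
	 * the warning/fallback path.
	 */
	if (!bio->bi_blkg)
		bio_associate_blkg(bio);

	/* Returns false when the bio was throttled and held back. */
	if (!blkcg_bio_issue_check(q, bio))
		return false;

	/* ... hand the bio on to the normal make_request path ... */
	return true;
}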
diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h
index 9f4c17f0d2d8..0b1f45c62623 100644
--- a/include/linux/blk-mq-pci.h
+++ b/include/linux/blk-mq-pci.h
@@ -2,10 +2,10 @@
2#ifndef _LINUX_BLK_MQ_PCI_H 2#ifndef _LINUX_BLK_MQ_PCI_H
3#define _LINUX_BLK_MQ_PCI_H 3#define _LINUX_BLK_MQ_PCI_H
4 4
5struct blk_mq_tag_set; 5struct blk_mq_queue_map;
6struct pci_dev; 6struct pci_dev;
7 7
8int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, 8int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
9 int offset); 9 int offset);
10 10
11#endif /* _LINUX_BLK_MQ_PCI_H */ 11#endif /* _LINUX_BLK_MQ_PCI_H */
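
A sketch, under assumptions, of how a PCI driver's ->map_queues callback adapts to the per-map signature above: it now passes a single struct blk_mq_queue_map (here the HCTX_TYPE_DEFAULT map, see the blk-mq.h changes further down) instead of the whole tag set. The driver structure, its fields and the vector offset of 0 are hypothetical.

#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>

/* Hypothetical driver state; only the pci_dev matters for the mapping. */
struct exdrv {
	struct pci_dev *pdev;
	struct blk_mq_tag_set tag_set;
};

static int exdrv_map_queues(struct blk_mq_tag_set *set)
{
	struct exdrv *drv = container_of(set, struct exdrv, tag_set);

	/*
	 * Spread the default hardware queues over the device's MSI-X
	 * vectors; 0 is assumed to be the offset of the first I/O vector.
	 */
	return blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT],
				     drv->pdev, 0);
}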
diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h
index b4ade198007d..7b6ecf9ac4c3 100644
--- a/include/linux/blk-mq-rdma.h
+++ b/include/linux/blk-mq-rdma.h
@@ -4,7 +4,7 @@
4struct blk_mq_tag_set; 4struct blk_mq_tag_set;
5struct ib_device; 5struct ib_device;
6 6
7int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, 7int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
8 struct ib_device *dev, int first_vec); 8 struct ib_device *dev, int first_vec);
9 9
10#endif /* _LINUX_BLK_MQ_RDMA_H */ 10#endif /* _LINUX_BLK_MQ_RDMA_H */
diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h
index 69b4da262c45..687ae287e1dc 100644
--- a/include/linux/blk-mq-virtio.h
+++ b/include/linux/blk-mq-virtio.h
@@ -2,10 +2,10 @@
2#ifndef _LINUX_BLK_MQ_VIRTIO_H 2#ifndef _LINUX_BLK_MQ_VIRTIO_H
3#define _LINUX_BLK_MQ_VIRTIO_H 3#define _LINUX_BLK_MQ_VIRTIO_H
4 4
5struct blk_mq_tag_set; 5struct blk_mq_queue_map;
6struct virtio_device; 6struct virtio_device;
7 7
8int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, 8int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
9 struct virtio_device *vdev, int first_vec); 9 struct virtio_device *vdev, int first_vec);
10 10
11#endif /* _LINUX_BLK_MQ_VIRTIO_H */ 11#endif /* _LINUX_BLK_MQ_VIRTIO_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2286dc12c6bc..0e030f5f76b6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -37,7 +37,8 @@ struct blk_mq_hw_ctx {
37 struct blk_mq_ctx *dispatch_from; 37 struct blk_mq_ctx *dispatch_from;
38 unsigned int dispatch_busy; 38 unsigned int dispatch_busy;
39 39
40 unsigned int nr_ctx; 40 unsigned short type;
41 unsigned short nr_ctx;
41 struct blk_mq_ctx **ctxs; 42 struct blk_mq_ctx **ctxs;
42 43
43 spinlock_t dispatch_wait_lock; 44 spinlock_t dispatch_wait_lock;
@@ -74,10 +75,31 @@ struct blk_mq_hw_ctx {
74 struct srcu_struct srcu[0]; 75 struct srcu_struct srcu[0];
75}; 76};
76 77
78struct blk_mq_queue_map {
79 unsigned int *mq_map;
80 unsigned int nr_queues;
81 unsigned int queue_offset;
82};
83
84enum hctx_type {
85 HCTX_TYPE_DEFAULT, /* all I/O not otherwise accounted for */
86 HCTX_TYPE_READ, /* just for READ I/O */
87 HCTX_TYPE_POLL, /* polled I/O of any kind */
88
89 HCTX_MAX_TYPES,
90};
91
77struct blk_mq_tag_set { 92struct blk_mq_tag_set {
78 unsigned int *mq_map; 93 /*
94 * map[] holds ctx -> hctx mappings, one map exists for each type
95 * that the driver wishes to support. There are no restrictions
96 * on maps being of the same size, and it's perfectly legal to
97 * share maps between types.
98 */
99 struct blk_mq_queue_map map[HCTX_MAX_TYPES];
100 unsigned int nr_maps; /* nr entries in map[] */
79 const struct blk_mq_ops *ops; 101 const struct blk_mq_ops *ops;
80 unsigned int nr_hw_queues; 102 unsigned int nr_hw_queues; /* nr hw queues across maps */
81 unsigned int queue_depth; /* max hw supported */ 103 unsigned int queue_depth; /* max hw supported */
82 unsigned int reserved_tags; 104 unsigned int reserved_tags;
83 unsigned int cmd_size; /* per-request extra data */ 105 unsigned int cmd_size; /* per-request extra data */
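
To illustrate the map[]/nr_maps layout introduced above, here is a hedged sketch of a ->map_queues callback that carves six hardware queues into a default map, a read map that shares it (explicitly allowed by the comment above), and an interrupt-less poll map. The queue counts, the sharing choice and the use of the generic blk_mq_map_queues() spreader are illustrative, not taken from this diff.

static int example_map_queues(struct blk_mq_tag_set *set)
{
	struct blk_mq_queue_map *def = &set->map[HCTX_TYPE_DEFAULT];
	struct blk_mq_queue_map *read = &set->map[HCTX_TYPE_READ];
	struct blk_mq_queue_map *poll = &set->map[HCTX_TYPE_POLL];

	/*
	 * Assumes set->nr_maps was set to HCTX_MAX_TYPES and
	 * set->nr_hw_queues to 6 before blk_mq_alloc_tag_set().
	 */
	def->nr_queues = 4;			/* hw queues 0..3 take IRQs */
	def->queue_offset = 0;

	read->nr_queues = def->nr_queues;	/* reads share the default map */
	read->queue_offset = def->queue_offset;

	poll->nr_queues = 2;			/* hw queues 4..5, polled only */
	poll->queue_offset = 4;

	blk_mq_map_queues(def);
	blk_mq_map_queues(read);
	blk_mq_map_queues(poll);
	return 0;
}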
@@ -99,6 +121,7 @@ struct blk_mq_queue_data {
99 121
100typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, 122typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
101 const struct blk_mq_queue_data *); 123 const struct blk_mq_queue_data *);
124typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
102typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); 125typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
103typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); 126typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
104typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); 127typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
@@ -109,11 +132,13 @@ typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *,
109typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, 132typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
110 unsigned int); 133 unsigned int);
111 134
112typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, 135typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
113 bool); 136 bool);
114typedef void (busy_tag_iter_fn)(struct request *, void *, bool); 137typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
115typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int); 138typedef int (poll_fn)(struct blk_mq_hw_ctx *);
116typedef int (map_queues_fn)(struct blk_mq_tag_set *set); 139typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
140typedef bool (busy_fn)(struct request_queue *);
141typedef void (complete_fn)(struct request *);
117 142
118 143
119struct blk_mq_ops { 144struct blk_mq_ops {
@@ -123,6 +148,15 @@ struct blk_mq_ops {
123 queue_rq_fn *queue_rq; 148 queue_rq_fn *queue_rq;
124 149
125 /* 150 /*
151 * If a driver uses bd->last to judge when to submit requests to
152 * hardware, it must define this function. In case of errors that
153 * make us stop issuing further requests, this hook serves the
154 * purpose of kicking the hardware (which the last request otherwise
155 * would have done).
156 */
157 commit_rqs_fn *commit_rqs;
158
159 /*
126 * Reserve budget before queue request, once .queue_rq is 160 * Reserve budget before queue request, once .queue_rq is
127 * run, it is driver's responsibility to release the 161 * run, it is driver's responsibility to release the
128 * reserved budget. Also we have to handle failure case 162 * reserved budget. Also we have to handle failure case
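
The comment above describes the pairing between bd->last and the new ->commit_rqs hook; a small hedged sketch of what that looks like in a driver follows. struct exdev, exdev_write_sqe() and exdev_ring_doorbell() are placeholders for whatever mechanism the driver uses to post a command and notify the hardware.

#include <linux/blk-mq.h>

struct exdev { void __iomem *doorbell; };

/* Placeholder helpers standing in for real submission-queue handling. */
static bool exdev_write_sqe(struct exdev *dev, struct request *rq) { return true; }
static void exdev_ring_doorbell(struct exdev *dev) { }

static blk_status_t exdev_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct exdev *dev = hctx->driver_data;

	/* Stopping the batch here is why ->commit_rqs exists. */
	if (!exdev_write_sqe(dev, bd->rq))
		return BLK_STS_RESOURCE;

	/* Only kick the hardware for the last request of the batch. */
	if (bd->last)
		exdev_ring_doorbell(dev);
	return BLK_STS_OK;
}

static void exdev_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	/* Flush what was already queued when the bd->last kick never came. */
	exdev_ring_doorbell(hctx->driver_data);
}

static const struct blk_mq_ops exdev_mq_ops = {
	.queue_rq	= exdev_queue_rq,
	.commit_rqs	= exdev_commit_rqs,
};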
@@ -141,7 +175,7 @@ struct blk_mq_ops {
141 */ 175 */
142 poll_fn *poll; 176 poll_fn *poll;
143 177
144 softirq_done_fn *complete; 178 complete_fn *complete;
145 179
146 /* 180 /*
147 * Called when the block layer side of a hardware queue has been 181 * Called when the block layer side of a hardware queue has been
@@ -165,6 +199,11 @@ struct blk_mq_ops {
165 /* Called from inside blk_get_request() */ 199 /* Called from inside blk_get_request() */
166 void (*initialize_rq_fn)(struct request *rq); 200 void (*initialize_rq_fn)(struct request *rq);
167 201
202 /*
203 * If set, returns whether or not this queue currently is busy
204 */
205 busy_fn *busy;
206
168 map_queues_fn *map_queues; 207 map_queues_fn *map_queues;
169 208
170#ifdef CONFIG_BLK_DEBUG_FS 209#ifdef CONFIG_BLK_DEBUG_FS
@@ -218,6 +257,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
218void blk_mq_free_request(struct request *rq); 257void blk_mq_free_request(struct request *rq);
219bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 258bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
220 259
260bool blk_mq_queue_inflight(struct request_queue *q);
261
221enum { 262enum {
222 /* return when out of requests */ 263 /* return when out of requests */
223 BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), 264 BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0),
@@ -264,7 +305,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
264 bool kick_requeue_list); 305 bool kick_requeue_list);
265void blk_mq_kick_requeue_list(struct request_queue *q); 306void blk_mq_kick_requeue_list(struct request_queue *q);
266void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); 307void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
267void blk_mq_complete_request(struct request *rq); 308bool blk_mq_complete_request(struct request *rq);
268bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, 309bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
269 struct bio *bio); 310 struct bio *bio);
270bool blk_mq_queue_stopped(struct request_queue *q); 311bool blk_mq_queue_stopped(struct request_queue *q);
@@ -288,24 +329,12 @@ void blk_mq_freeze_queue_wait(struct request_queue *q);
288int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 329int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
289 unsigned long timeout); 330 unsigned long timeout);
290 331
291int blk_mq_map_queues(struct blk_mq_tag_set *set); 332int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
292void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); 333void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
293 334
294void blk_mq_quiesce_queue_nowait(struct request_queue *q); 335void blk_mq_quiesce_queue_nowait(struct request_queue *q);
295 336
296/** 337unsigned int blk_mq_rq_cpu(struct request *rq);
297 * blk_mq_mark_complete() - Set request state to complete
298 * @rq: request to set to complete state
299 *
300 * Returns true if request state was successfully set to complete. If
301 * successful, the caller is responsible for seeing this request is ended, as
302 * blk_mq_complete_request will not work again.
303 */
304static inline bool blk_mq_mark_complete(struct request *rq)
305{
306 return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) ==
307 MQ_RQ_IN_FLIGHT;
308}
309 338
310/* 339/*
311 * Driver command data is immediately after the request. So subtract request 340 * Driver command data is immediately after the request. So subtract request
@@ -328,4 +357,14 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
328 for ((i) = 0; (i) < (hctx)->nr_ctx && \ 357 for ((i) = 0; (i) < (hctx)->nr_ctx && \
329 ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) 358 ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
330 359
360static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
361 struct request *rq)
362{
363 if (rq->tag != -1)
364 return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
365
366 return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
367 BLK_QC_T_INTERNAL;
368}
369
331#endif 370#endif
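
A tiny, purely illustrative round trip of the poll cookie now that request_to_qc_t() is public: it packs the tag and hardware queue number into a blk_qc_t, and blk_qc_t_to_queue_num() (see the blk_types.h hunk below) recovers the queue for polling. The wrapper name is hypothetical.

static blk_qc_t example_issue_cookie(struct blk_mq_hw_ctx *hctx,
				     struct request *rq)
{
	blk_qc_t cookie = request_to_qc_t(hctx, rq);

	/* The encoded cookie decodes back to this hardware queue. */
	WARN_ON_ONCE(blk_qc_t_valid(cookie) &&
		     blk_qc_t_to_queue_num(cookie) != hctx->queue_num);
	return cookie;
}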
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1dcf652ba0aa..5c7e7f859a24 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -174,11 +174,11 @@ struct bio {
174 void *bi_private; 174 void *bi_private;
175#ifdef CONFIG_BLK_CGROUP 175#ifdef CONFIG_BLK_CGROUP
176 /* 176 /*
177 * Optional ioc and css associated with this bio. Put on bio 177 * Represents the association of the css and request_queue for the bio.
178 * release. Read comment on top of bio_associate_current(). 178 * If a bio goes direct to device, it will not have a blkg as it will
179 * not have a request_queue associated with it. The reference is put
180 * on release of the bio.
179 */ 181 */
180 struct io_context *bi_ioc;
181 struct cgroup_subsys_state *bi_css;
182 struct blkcg_gq *bi_blkg; 182 struct blkcg_gq *bi_blkg;
183 struct bio_issue bi_issue; 183 struct bio_issue bi_issue;
184#endif 184#endif
@@ -228,6 +228,7 @@ struct bio {
228#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion 228#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion
229 * of this bio. */ 229 * of this bio. */
230#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */ 230#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */
231#define BIO_TRACKED 12 /* set if bio goes through the rq_qos path */
231 232
232/* See BVEC_POOL_OFFSET below before adding new flags */ 233/* See BVEC_POOL_OFFSET below before adding new flags */
233 234
@@ -323,6 +324,8 @@ enum req_flag_bits {
323 /* command specific flags for REQ_OP_WRITE_ZEROES: */ 324 /* command specific flags for REQ_OP_WRITE_ZEROES: */
324 __REQ_NOUNMAP, /* do not free blocks when zeroing */ 325 __REQ_NOUNMAP, /* do not free blocks when zeroing */
325 326
327 __REQ_HIPRI,
328
326 /* for driver use */ 329 /* for driver use */
327 __REQ_DRV, 330 __REQ_DRV,
328 __REQ_SWAP, /* swapping request. */ 331 __REQ_SWAP, /* swapping request. */
@@ -343,8 +346,8 @@ enum req_flag_bits {
343#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) 346#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
344#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) 347#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
345#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) 348#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
346
347#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) 349#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
350#define REQ_HIPRI (1ULL << __REQ_HIPRI)
348 351
349#define REQ_DRV (1ULL << __REQ_DRV) 352#define REQ_DRV (1ULL << __REQ_DRV)
350#define REQ_SWAP (1ULL << __REQ_SWAP) 353#define REQ_SWAP (1ULL << __REQ_SWAP)
@@ -422,17 +425,6 @@ static inline bool blk_qc_t_valid(blk_qc_t cookie)
422 return cookie != BLK_QC_T_NONE; 425 return cookie != BLK_QC_T_NONE;
423} 426}
424 427
425static inline blk_qc_t blk_tag_to_qc_t(unsigned int tag, unsigned int queue_num,
426 bool internal)
427{
428 blk_qc_t ret = tag | (queue_num << BLK_QC_T_SHIFT);
429
430 if (internal)
431 ret |= BLK_QC_T_INTERNAL;
432
433 return ret;
434}
435
436static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) 428static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
437{ 429{
438 return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; 430 return (cookie & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4293dc1cd160..45552e6eae1e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -58,25 +58,6 @@ struct blk_stat_callback;
58 58
59typedef void (rq_end_io_fn)(struct request *, blk_status_t); 59typedef void (rq_end_io_fn)(struct request *, blk_status_t);
60 60
61#define BLK_RL_SYNCFULL (1U << 0)
62#define BLK_RL_ASYNCFULL (1U << 1)
63
64struct request_list {
65 struct request_queue *q; /* the queue this rl belongs to */
66#ifdef CONFIG_BLK_CGROUP
67 struct blkcg_gq *blkg; /* blkg this request pool belongs to */
68#endif
69 /*
70 * count[], starved[], and wait[] are indexed by
71 * BLK_RW_SYNC/BLK_RW_ASYNC
72 */
73 int count[2];
74 int starved[2];
75 mempool_t *rq_pool;
76 wait_queue_head_t wait[2];
77 unsigned int flags;
78};
79
80/* 61/*
81 * request flags */ 62 * request flags */
82typedef __u32 __bitwise req_flags_t; 63typedef __u32 __bitwise req_flags_t;
@@ -85,8 +66,6 @@ typedef __u32 __bitwise req_flags_t;
85#define RQF_SORTED ((__force req_flags_t)(1 << 0)) 66#define RQF_SORTED ((__force req_flags_t)(1 << 0))
86/* drive already may have started this one */ 67/* drive already may have started this one */
87#define RQF_STARTED ((__force req_flags_t)(1 << 1)) 68#define RQF_STARTED ((__force req_flags_t)(1 << 1))
88/* uses tagged queueing */
89#define RQF_QUEUED ((__force req_flags_t)(1 << 2))
90/* may not be passed by ioscheduler */ 69/* may not be passed by ioscheduler */
91#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) 70#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3))
92/* request for flush sequence */ 71/* request for flush sequence */
@@ -150,8 +129,8 @@ enum mq_rq_state {
150struct request { 129struct request {
151 struct request_queue *q; 130 struct request_queue *q;
152 struct blk_mq_ctx *mq_ctx; 131 struct blk_mq_ctx *mq_ctx;
132 struct blk_mq_hw_ctx *mq_hctx;
153 133
154 int cpu;
155 unsigned int cmd_flags; /* op and common flags */ 134 unsigned int cmd_flags; /* op and common flags */
156 req_flags_t rq_flags; 135 req_flags_t rq_flags;
157 136
@@ -245,11 +224,7 @@ struct request {
245 refcount_t ref; 224 refcount_t ref;
246 225
247 unsigned int timeout; 226 unsigned int timeout;
248 227 unsigned long deadline;
249 /* access through blk_rq_set_deadline, blk_rq_deadline */
250 unsigned long __deadline;
251
252 struct list_head timeout_list;
253 228
254 union { 229 union {
255 struct __call_single_data csd; 230 struct __call_single_data csd;
@@ -264,10 +239,6 @@ struct request {
264 239
265 /* for bidi */ 240 /* for bidi */
266 struct request *next_rq; 241 struct request *next_rq;
267
268#ifdef CONFIG_BLK_CGROUP
269 struct request_list *rl; /* rl this rq is alloced from */
270#endif
271}; 242};
272 243
273static inline bool blk_op_is_scsi(unsigned int op) 244static inline bool blk_op_is_scsi(unsigned int op)
@@ -311,41 +282,21 @@ static inline unsigned short req_get_ioprio(struct request *req)
311 282
312struct blk_queue_ctx; 283struct blk_queue_ctx;
313 284
314typedef void (request_fn_proc) (struct request_queue *q);
315typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); 285typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
316typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
317typedef int (prep_rq_fn) (struct request_queue *, struct request *);
318typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
319 286
320struct bio_vec; 287struct bio_vec;
321typedef void (softirq_done_fn)(struct request *);
322typedef int (dma_drain_needed_fn)(struct request *); 288typedef int (dma_drain_needed_fn)(struct request *);
323typedef int (lld_busy_fn) (struct request_queue *q);
324typedef int (bsg_job_fn) (struct bsg_job *);
325typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
326typedef void (exit_rq_fn)(struct request_queue *, struct request *);
327 289
328enum blk_eh_timer_return { 290enum blk_eh_timer_return {
329 BLK_EH_DONE, /* driver has completed the command */ 291 BLK_EH_DONE, /* driver has completed the command */
330 BLK_EH_RESET_TIMER, /* reset timer and try again */ 292 BLK_EH_RESET_TIMER, /* reset timer and try again */
331}; 293};
332 294
333typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
334
335enum blk_queue_state { 295enum blk_queue_state {
336 Queue_down, 296 Queue_down,
337 Queue_up, 297 Queue_up,
338}; 298};
339 299
340struct blk_queue_tag {
341 struct request **tag_index; /* map of busy tags */
342 unsigned long *tag_map; /* bit map of free/busy tags */
343 int max_depth; /* what we will send to device */
344 int real_max_depth; /* what the array can hold */
345 atomic_t refcnt; /* map can be shared */
346 int alloc_policy; /* tag allocation policy */
347 int next_tag; /* next tag */
348};
349#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ 300#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
350#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ 301#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
351 302
@@ -444,40 +395,15 @@ struct request_queue {
444 struct list_head queue_head; 395 struct list_head queue_head;
445 struct request *last_merge; 396 struct request *last_merge;
446 struct elevator_queue *elevator; 397 struct elevator_queue *elevator;
447 int nr_rqs[2]; /* # allocated [a]sync rqs */
448 int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
449 398
450 struct blk_queue_stats *stats; 399 struct blk_queue_stats *stats;
451 struct rq_qos *rq_qos; 400 struct rq_qos *rq_qos;
452 401
453 /*
454 * If blkcg is not used, @q->root_rl serves all requests. If blkcg
455 * is used, root blkg allocates from @q->root_rl and all other
456 * blkgs from their own blkg->rl. Which one to use should be
457 * determined using bio_request_list().
458 */
459 struct request_list root_rl;
460
461 request_fn_proc *request_fn;
462 make_request_fn *make_request_fn; 402 make_request_fn *make_request_fn;
463 poll_q_fn *poll_fn;
464 prep_rq_fn *prep_rq_fn;
465 unprep_rq_fn *unprep_rq_fn;
466 softirq_done_fn *softirq_done_fn;
467 rq_timed_out_fn *rq_timed_out_fn;
468 dma_drain_needed_fn *dma_drain_needed; 403 dma_drain_needed_fn *dma_drain_needed;
469 lld_busy_fn *lld_busy_fn;
470 /* Called just after a request is allocated */
471 init_rq_fn *init_rq_fn;
472 /* Called just before a request is freed */
473 exit_rq_fn *exit_rq_fn;
474 /* Called from inside blk_get_request() */
475 void (*initialize_rq_fn)(struct request *rq);
476 404
477 const struct blk_mq_ops *mq_ops; 405 const struct blk_mq_ops *mq_ops;
478 406
479 unsigned int *mq_map;
480
481 /* sw queues */ 407 /* sw queues */
482 struct blk_mq_ctx __percpu *queue_ctx; 408 struct blk_mq_ctx __percpu *queue_ctx;
483 unsigned int nr_queues; 409 unsigned int nr_queues;
@@ -488,17 +414,6 @@ struct request_queue {
488 struct blk_mq_hw_ctx **queue_hw_ctx; 414 struct blk_mq_hw_ctx **queue_hw_ctx;
489 unsigned int nr_hw_queues; 415 unsigned int nr_hw_queues;
490 416
491 /*
492 * Dispatch queue sorting
493 */
494 sector_t end_sector;
495 struct request *boundary_rq;
496
497 /*
498 * Delayed queue handling
499 */
500 struct delayed_work delay_work;
501
502 struct backing_dev_info *backing_dev_info; 417 struct backing_dev_info *backing_dev_info;
503 418
504 /* 419 /*
@@ -529,13 +444,7 @@ struct request_queue {
529 */ 444 */
530 gfp_t bounce_gfp; 445 gfp_t bounce_gfp;
531 446
532 /* 447 spinlock_t queue_lock;
533 * protects queue structures from reentrancy. ->__queue_lock should
534 * _never_ be used directly, it is queue private. always use
535 * ->queue_lock.
536 */
537 spinlock_t __queue_lock;
538 spinlock_t *queue_lock;
539 448
540 /* 449 /*
541 * queue kobject 450 * queue kobject
@@ -545,7 +454,7 @@ struct request_queue {
545 /* 454 /*
546 * mq queue kobject 455 * mq queue kobject
547 */ 456 */
548 struct kobject mq_kobj; 457 struct kobject *mq_kobj;
549 458
550#ifdef CONFIG_BLK_DEV_INTEGRITY 459#ifdef CONFIG_BLK_DEV_INTEGRITY
551 struct blk_integrity integrity; 460 struct blk_integrity integrity;
@@ -561,27 +470,12 @@ struct request_queue {
561 * queue settings 470 * queue settings
562 */ 471 */
563 unsigned long nr_requests; /* Max # of requests */ 472 unsigned long nr_requests; /* Max # of requests */
564 unsigned int nr_congestion_on;
565 unsigned int nr_congestion_off;
566 unsigned int nr_batching;
567 473
568 unsigned int dma_drain_size; 474 unsigned int dma_drain_size;
569 void *dma_drain_buffer; 475 void *dma_drain_buffer;
570 unsigned int dma_pad_mask; 476 unsigned int dma_pad_mask;
571 unsigned int dma_alignment; 477 unsigned int dma_alignment;
572 478
573 struct blk_queue_tag *queue_tags;
574
575 unsigned int nr_sorted;
576 unsigned int in_flight[2];
577
578 /*
579 * Number of active block driver functions for which blk_drain_queue()
580 * must wait. Must be incremented around functions that unlock the
581 * queue_lock internally, e.g. scsi_request_fn().
582 */
583 unsigned int request_fn_active;
584
585 unsigned int rq_timeout; 479 unsigned int rq_timeout;
586 int poll_nsec; 480 int poll_nsec;
587 481
@@ -590,7 +484,6 @@ struct request_queue {
590 484
591 struct timer_list timeout; 485 struct timer_list timeout;
592 struct work_struct timeout_work; 486 struct work_struct timeout_work;
593 struct list_head timeout_list;
594 487
595 struct list_head icq_list; 488 struct list_head icq_list;
596#ifdef CONFIG_BLK_CGROUP 489#ifdef CONFIG_BLK_CGROUP
@@ -645,11 +538,9 @@ struct request_queue {
645 538
646 struct mutex sysfs_lock; 539 struct mutex sysfs_lock;
647 540
648 int bypass_depth;
649 atomic_t mq_freeze_depth; 541 atomic_t mq_freeze_depth;
650 542
651#if defined(CONFIG_BLK_DEV_BSG) 543#if defined(CONFIG_BLK_DEV_BSG)
652 bsg_job_fn *bsg_job_fn;
653 struct bsg_class_device bsg_dev; 544 struct bsg_class_device bsg_dev;
654#endif 545#endif
655 546
@@ -669,12 +560,12 @@ struct request_queue {
669#ifdef CONFIG_BLK_DEBUG_FS 560#ifdef CONFIG_BLK_DEBUG_FS
670 struct dentry *debugfs_dir; 561 struct dentry *debugfs_dir;
671 struct dentry *sched_debugfs_dir; 562 struct dentry *sched_debugfs_dir;
563 struct dentry *rqos_debugfs_dir;
672#endif 564#endif
673 565
674 bool mq_sysfs_init_done; 566 bool mq_sysfs_init_done;
675 567
676 size_t cmd_size; 568 size_t cmd_size;
677 void *rq_alloc_data;
678 569
679 struct work_struct release_work; 570 struct work_struct release_work;
680 571
@@ -682,10 +573,8 @@ struct request_queue {
682 u64 write_hints[BLK_MAX_WRITE_HINTS]; 573 u64 write_hints[BLK_MAX_WRITE_HINTS];
683}; 574};
684 575
685#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */
686#define QUEUE_FLAG_STOPPED 1 /* queue is stopped */ 576#define QUEUE_FLAG_STOPPED 1 /* queue is stopped */
687#define QUEUE_FLAG_DYING 2 /* queue being torn down */ 577#define QUEUE_FLAG_DYING 2 /* queue being torn down */
688#define QUEUE_FLAG_BYPASS 3 /* act as dumb FIFO queue */
689#define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */ 578#define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */
690#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ 579#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */
691#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ 580#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */
@@ -718,19 +607,15 @@ struct request_queue {
718 (1 << QUEUE_FLAG_ADD_RANDOM)) 607 (1 << QUEUE_FLAG_ADD_RANDOM))
719 608
720#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 609#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
721 (1 << QUEUE_FLAG_SAME_COMP) | \ 610 (1 << QUEUE_FLAG_SAME_COMP))
722 (1 << QUEUE_FLAG_POLL))
723 611
724void blk_queue_flag_set(unsigned int flag, struct request_queue *q); 612void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
725void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); 613void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
726bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); 614bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
727bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
728 615
729#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
730#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 616#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
731#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) 617#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
732#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) 618#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
733#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
734#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) 619#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
735#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 620#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
736#define blk_queue_noxmerges(q) \ 621#define blk_queue_noxmerges(q) \
@@ -757,32 +642,20 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
757extern void blk_set_pm_only(struct request_queue *q); 642extern void blk_set_pm_only(struct request_queue *q);
758extern void blk_clear_pm_only(struct request_queue *q); 643extern void blk_clear_pm_only(struct request_queue *q);
759 644
760static inline int queue_in_flight(struct request_queue *q)
761{
762 return q->in_flight[0] + q->in_flight[1];
763}
764
765static inline bool blk_account_rq(struct request *rq) 645static inline bool blk_account_rq(struct request *rq)
766{ 646{
767 return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); 647 return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
768} 648}
769 649
770#define blk_rq_cpu_valid(rq) ((rq)->cpu != -1)
771#define blk_bidi_rq(rq) ((rq)->next_rq != NULL) 650#define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
772/* rq->queuelist of dequeued request must be list_empty() */
773#define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist))
774 651
775#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 652#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
776 653
777#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) 654#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ)
778 655
779/* 656static inline bool queue_is_mq(struct request_queue *q)
780 * Driver can handle struct request, if it either has an old style
781 * request_fn defined, or is blk-mq based.
782 */
783static inline bool queue_is_rq_based(struct request_queue *q)
784{ 657{
785 return q->request_fn || q->mq_ops; 658 return q->mq_ops;
786} 659}
787 660
788static inline unsigned int blk_queue_cluster(struct request_queue *q) 661static inline unsigned int blk_queue_cluster(struct request_queue *q)
@@ -845,27 +718,6 @@ static inline bool rq_is_sync(struct request *rq)
845 return op_is_sync(rq->cmd_flags); 718 return op_is_sync(rq->cmd_flags);
846} 719}
847 720
848static inline bool blk_rl_full(struct request_list *rl, bool sync)
849{
850 unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
851
852 return rl->flags & flag;
853}
854
855static inline void blk_set_rl_full(struct request_list *rl, bool sync)
856{
857 unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
858
859 rl->flags |= flag;
860}
861
862static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
863{
864 unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
865
866 rl->flags &= ~flag;
867}
868
869static inline bool rq_mergeable(struct request *rq) 721static inline bool rq_mergeable(struct request *rq)
870{ 722{
871 if (blk_rq_is_passthrough(rq)) 723 if (blk_rq_is_passthrough(rq))
@@ -902,16 +754,6 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
902 return q->nr_requests; 754 return q->nr_requests;
903} 755}
904 756
905/*
906 * q->prep_rq_fn return values
907 */
908enum {
909 BLKPREP_OK, /* serve it */
910 BLKPREP_KILL, /* fatal error, kill, return -EIO */
911 BLKPREP_DEFER, /* leave on queue */
912 BLKPREP_INVALID, /* invalid command, kill, return -EREMOTEIO */
913};
914
915extern unsigned long blk_max_low_pfn, blk_max_pfn; 757extern unsigned long blk_max_low_pfn, blk_max_pfn;
916 758
917/* 759/*
@@ -983,10 +825,8 @@ extern blk_qc_t direct_make_request(struct bio *bio);
983extern void blk_rq_init(struct request_queue *q, struct request *rq); 825extern void blk_rq_init(struct request_queue *q, struct request *rq);
984extern void blk_init_request_from_bio(struct request *req, struct bio *bio); 826extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
985extern void blk_put_request(struct request *); 827extern void blk_put_request(struct request *);
986extern void __blk_put_request(struct request_queue *, struct request *);
987extern struct request *blk_get_request(struct request_queue *, unsigned int op, 828extern struct request *blk_get_request(struct request_queue *, unsigned int op,
988 blk_mq_req_flags_t flags); 829 blk_mq_req_flags_t flags);
989extern void blk_requeue_request(struct request_queue *, struct request *);
990extern int blk_lld_busy(struct request_queue *q); 830extern int blk_lld_busy(struct request_queue *q);
991extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, 831extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
992 struct bio_set *bs, gfp_t gfp_mask, 832 struct bio_set *bs, gfp_t gfp_mask,
@@ -996,7 +836,6 @@ extern void blk_rq_unprep_clone(struct request *rq);
996extern blk_status_t blk_insert_cloned_request(struct request_queue *q, 836extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
997 struct request *rq); 837 struct request *rq);
998extern int blk_rq_append_bio(struct request *rq, struct bio **bio); 838extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
999extern void blk_delay_queue(struct request_queue *, unsigned long);
1000extern void blk_queue_split(struct request_queue *, struct bio **); 839extern void blk_queue_split(struct request_queue *, struct bio **);
1001extern void blk_recount_segments(struct request_queue *, struct bio *); 840extern void blk_recount_segments(struct request_queue *, struct bio *);
1002extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); 841extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
@@ -1009,15 +848,7 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
1009 848
1010extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); 849extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
1011extern void blk_queue_exit(struct request_queue *q); 850extern void blk_queue_exit(struct request_queue *q);
1012extern void blk_start_queue(struct request_queue *q);
1013extern void blk_start_queue_async(struct request_queue *q);
1014extern void blk_stop_queue(struct request_queue *q);
1015extern void blk_sync_queue(struct request_queue *q); 851extern void blk_sync_queue(struct request_queue *q);
1016extern void __blk_stop_queue(struct request_queue *q);
1017extern void __blk_run_queue(struct request_queue *q);
1018extern void __blk_run_queue_uncond(struct request_queue *q);
1019extern void blk_run_queue(struct request_queue *);
1020extern void blk_run_queue_async(struct request_queue *q);
1021extern int blk_rq_map_user(struct request_queue *, struct request *, 852extern int blk_rq_map_user(struct request_queue *, struct request *,
1022 struct rq_map_data *, void __user *, unsigned long, 853 struct rq_map_data *, void __user *, unsigned long,
1023 gfp_t); 854 gfp_t);
@@ -1034,7 +865,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
1034int blk_status_to_errno(blk_status_t status); 865int blk_status_to_errno(blk_status_t status);
1035blk_status_t errno_to_blk_status(int errno); 866blk_status_t errno_to_blk_status(int errno);
1036 867
1037bool blk_poll(struct request_queue *q, blk_qc_t cookie); 868int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
1038 869
1039static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 870static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
1040{ 871{
@@ -1172,13 +1003,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq)
1172 return nr_bios; 1003 return nr_bios;
1173} 1004}
1174 1005
1175/*
1176 * Request issue related functions.
1177 */
1178extern struct request *blk_peek_request(struct request_queue *q);
1179extern void blk_start_request(struct request *rq);
1180extern struct request *blk_fetch_request(struct request_queue *q);
1181
1182void blk_steal_bios(struct bio_list *list, struct request *rq); 1006void blk_steal_bios(struct bio_list *list, struct request *rq);
1183 1007
1184/* 1008/*
@@ -1196,27 +1020,18 @@ void blk_steal_bios(struct bio_list *list, struct request *rq);
1196 */ 1020 */
1197extern bool blk_update_request(struct request *rq, blk_status_t error, 1021extern bool blk_update_request(struct request *rq, blk_status_t error,
1198 unsigned int nr_bytes); 1022 unsigned int nr_bytes);
1199extern void blk_finish_request(struct request *rq, blk_status_t error);
1200extern bool blk_end_request(struct request *rq, blk_status_t error,
1201 unsigned int nr_bytes);
1202extern void blk_end_request_all(struct request *rq, blk_status_t error); 1023extern void blk_end_request_all(struct request *rq, blk_status_t error);
1203extern bool __blk_end_request(struct request *rq, blk_status_t error, 1024extern bool __blk_end_request(struct request *rq, blk_status_t error,
1204 unsigned int nr_bytes); 1025 unsigned int nr_bytes);
1205extern void __blk_end_request_all(struct request *rq, blk_status_t error); 1026extern void __blk_end_request_all(struct request *rq, blk_status_t error);
1206extern bool __blk_end_request_cur(struct request *rq, blk_status_t error); 1027extern bool __blk_end_request_cur(struct request *rq, blk_status_t error);
1207 1028
1208extern void blk_complete_request(struct request *);
1209extern void __blk_complete_request(struct request *); 1029extern void __blk_complete_request(struct request *);
1210extern void blk_abort_request(struct request *); 1030extern void blk_abort_request(struct request *);
1211extern void blk_unprep_request(struct request *);
1212 1031
1213/* 1032/*
1214 * Access functions for manipulating queue properties 1033 * Access functions for manipulating queue properties
1215 */ 1034 */
1216extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
1217 spinlock_t *lock, int node_id);
1218extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
1219extern int blk_init_allocated_queue(struct request_queue *);
1220extern void blk_cleanup_queue(struct request_queue *); 1035extern void blk_cleanup_queue(struct request_queue *);
1221extern void blk_queue_make_request(struct request_queue *, make_request_fn *); 1036extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
1222extern void blk_queue_bounce_limit(struct request_queue *, u64); 1037extern void blk_queue_bounce_limit(struct request_queue *, u64);
@@ -1255,15 +1070,10 @@ extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
1255extern int blk_queue_dma_drain(struct request_queue *q, 1070extern int blk_queue_dma_drain(struct request_queue *q,
1256 dma_drain_needed_fn *dma_drain_needed, 1071 dma_drain_needed_fn *dma_drain_needed,
1257 void *buf, unsigned int size); 1072 void *buf, unsigned int size);
1258extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
1259extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); 1073extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
1260extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); 1074extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
1261extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
1262extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
1263extern void blk_queue_dma_alignment(struct request_queue *, int); 1075extern void blk_queue_dma_alignment(struct request_queue *, int);
1264extern void blk_queue_update_dma_alignment(struct request_queue *, int); 1076extern void blk_queue_update_dma_alignment(struct request_queue *, int);
1265extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
1266extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
1267extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); 1077extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
1268extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); 1078extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
1269extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); 1079extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
@@ -1299,8 +1109,7 @@ extern long nr_blockdev_pages(void);
1299 1109
1300bool __must_check blk_get_queue(struct request_queue *); 1110bool __must_check blk_get_queue(struct request_queue *);
1301struct request_queue *blk_alloc_queue(gfp_t); 1111struct request_queue *blk_alloc_queue(gfp_t);
1302struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, 1112struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id);
1303 spinlock_t *lock);
1304extern void blk_put_queue(struct request_queue *); 1113extern void blk_put_queue(struct request_queue *);
1305extern void blk_set_queue_dying(struct request_queue *); 1114extern void blk_set_queue_dying(struct request_queue *);
1306 1115
@@ -1317,9 +1126,10 @@ extern void blk_set_queue_dying(struct request_queue *);
1317 * schedule() where blk_schedule_flush_plug() is called. 1126 * schedule() where blk_schedule_flush_plug() is called.
1318 */ 1127 */
1319struct blk_plug { 1128struct blk_plug {
1320 struct list_head list; /* requests */
1321 struct list_head mq_list; /* blk-mq requests */ 1129 struct list_head mq_list; /* blk-mq requests */
1322 struct list_head cb_list; /* md requires an unplug callback */ 1130 struct list_head cb_list; /* md requires an unplug callback */
1131 unsigned short rq_count;
1132 bool multiple_queues;
1323}; 1133};
1324#define BLK_MAX_REQUEST_COUNT 16 1134#define BLK_MAX_REQUEST_COUNT 16
1325#define BLK_PLUG_FLUSH_SIZE (128 * 1024) 1135#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
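
With the legacy list gone, struct blk_plug now carries only blk-mq requests plus the rq_count/multiple_queues bookkeeping the core maintains. Callers still just bracket their submissions; a minimal sketch, with the batch helper and its arguments made up for illustration:

#include <linux/bio.h>
#include <linux/blkdev.h>

static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);		/* requests accumulate on plug.mq_list */
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);		/* flush the plugged requests out */
}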
@@ -1358,31 +1168,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
1358 struct blk_plug *plug = tsk->plug; 1168 struct blk_plug *plug = tsk->plug;
1359 1169
1360 return plug && 1170 return plug &&
1361 (!list_empty(&plug->list) || 1171 (!list_empty(&plug->mq_list) ||
1362 !list_empty(&plug->mq_list) ||
1363 !list_empty(&plug->cb_list)); 1172 !list_empty(&plug->cb_list));
1364} 1173}
1365 1174
1366/*
1367 * tag stuff
1368 */
1369extern int blk_queue_start_tag(struct request_queue *, struct request *);
1370extern struct request *blk_queue_find_tag(struct request_queue *, int);
1371extern void blk_queue_end_tag(struct request_queue *, struct request *);
1372extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int);
1373extern void blk_queue_free_tags(struct request_queue *);
1374extern int blk_queue_resize_tags(struct request_queue *, int);
1375extern struct blk_queue_tag *blk_init_tags(int, int);
1376extern void blk_free_tags(struct blk_queue_tag *);
1377
1378static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
1379 int tag)
1380{
1381 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
1382 return NULL;
1383 return bqt->tag_index[tag];
1384}
1385
1386extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); 1175extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
1387extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, 1176extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
1388 sector_t nr_sects, gfp_t gfp_mask, struct page *page); 1177 sector_t nr_sects, gfp_t gfp_mask, struct page *page);
@@ -1982,4 +1771,17 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
1982 1771
1983#endif /* CONFIG_BLOCK */ 1772#endif /* CONFIG_BLOCK */
1984 1773
1774static inline void blk_wake_io_task(struct task_struct *waiter)
1775{
1776 /*
1777 * If we're polling, the task itself is doing the completions. For
1778 * that case, we don't need to signal a wakeup, it's enough to just
1779 * mark us as RUNNING.
1780 */
1781 if (waiter == current)
1782 __set_current_state(TASK_RUNNING);
1783 else
1784 wake_up_process(waiter);
1785}
1786
1985#endif 1787#endif
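
Tying together the pieces changed in this header, REQ_HIPRI, the three-argument blk_poll() and blk_wake_io_task(), here is a hedged sketch of a synchronous polled read, loosely modelled on the direct-I/O fast path of this era. Bio setup, error handling and bio_put() are elided, and every name outside the APIs visible in this diff is illustrative.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/sched.h>

/* Completion side: wake (or just mark runnable) the submitting task. */
static void example_polled_end_io(struct bio *bio)
{
	struct task_struct *waiter = bio->bi_private;

	WRITE_ONCE(bio->bi_private, NULL);
	blk_wake_io_task(waiter);
}

/* Submission side: issue a high-priority bio and poll for its completion. */
static void example_polled_wait(struct request_queue *q, struct bio *bio)
{
	blk_qc_t qc;

	bio->bi_opf |= REQ_HIPRI;
	bio->bi_private = current;
	bio->bi_end_io = example_polled_end_io;

	qc = submit_bio(bio);

	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio->bi_private))
			break;
		/* spin == true: keep polling instead of waiting for an IRQ */
		if (blk_poll(q, qc, true) <= 0)
			io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}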
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 6aeaf6472665..b356e0006731 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -31,6 +31,9 @@ struct device;
31struct scatterlist; 31struct scatterlist;
32struct request_queue; 32struct request_queue;
33 33
34typedef int (bsg_job_fn) (struct bsg_job *);
35typedef enum blk_eh_timer_return (bsg_timeout_fn)(struct request *);
36
34struct bsg_buffer { 37struct bsg_buffer {
35 unsigned int payload_len; 38 unsigned int payload_len;
36 int sg_cnt; 39 int sg_cnt;
@@ -72,7 +75,8 @@ struct bsg_job {
72void bsg_job_done(struct bsg_job *job, int result, 75void bsg_job_done(struct bsg_job *job, int result,
73 unsigned int reply_payload_rcv_len); 76 unsigned int reply_payload_rcv_len);
74struct request_queue *bsg_setup_queue(struct device *dev, const char *name, 77struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
75 bsg_job_fn *job_fn, int dd_job_size); 78 bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size);
79void bsg_remove_queue(struct request_queue *q);
76void bsg_job_put(struct bsg_job *job); 80void bsg_job_put(struct bsg_job *job);
77int __must_check bsg_job_get(struct bsg_job *job); 81int __must_check bsg_job_get(struct bsg_job *job);
78 82
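
A brief sketch of how a transport driver uses the reworked bsg-lib interface above, now that the timeout handler is passed to bsg_setup_queue() directly and teardown goes through bsg_remove_queue(). The handler bodies and the zero dd_job_size are placeholders.

#include <linux/bsg-lib.h>
#include <linux/device.h>

static int example_bsg_job(struct bsg_job *job)
{
	/* ... hand the job to the transport, then complete it ... */
	bsg_job_done(job, 0, 0);
	return 0;
}

static enum blk_eh_timer_return example_bsg_timeout(struct request *rq)
{
	/* ... abort the outstanding transport command ... */
	return BLK_EH_DONE;
}

static struct request_queue *example_bsg_attach(struct device *dev)
{
	/* dd_job_size 0: no extra per-job payload in this sketch. */
	return bsg_setup_queue(dev, dev_name(dev), example_bsg_job,
			       example_bsg_timeout, 0);
}

/* ... and on teardown: bsg_remove_queue(q); */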
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9d12757a65b0..9968332cceed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -93,6 +93,8 @@ extern struct css_set init_css_set;
93 93
94bool css_has_online_children(struct cgroup_subsys_state *css); 94bool css_has_online_children(struct cgroup_subsys_state *css);
95struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); 95struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
96struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
97 struct cgroup_subsys *ss);
96struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, 98struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
97 struct cgroup_subsys *ss); 99 struct cgroup_subsys *ss);
98struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, 100struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 015bb59c0331..2e9e2763bf47 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -23,74 +23,6 @@ enum elv_merge {
23 ELEVATOR_DISCARD_MERGE = 3, 23 ELEVATOR_DISCARD_MERGE = 3,
24}; 24};
25 25
26typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **,
27 struct bio *);
28
29typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, struct request *);
30
31typedef void (elevator_merged_fn) (struct request_queue *, struct request *, enum elv_merge);
32
33typedef int (elevator_allow_bio_merge_fn) (struct request_queue *,
34 struct request *, struct bio *);
35
36typedef int (elevator_allow_rq_merge_fn) (struct request_queue *,
37 struct request *, struct request *);
38
39typedef void (elevator_bio_merged_fn) (struct request_queue *,
40 struct request *, struct bio *);
41
42typedef int (elevator_dispatch_fn) (struct request_queue *, int);
43
44typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
45typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
46typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
47typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int);
48
49typedef void (elevator_init_icq_fn) (struct io_cq *);
50typedef void (elevator_exit_icq_fn) (struct io_cq *);
51typedef int (elevator_set_req_fn) (struct request_queue *, struct request *,
52 struct bio *, gfp_t);
53typedef void (elevator_put_req_fn) (struct request *);
54typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
55typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
56
57typedef int (elevator_init_fn) (struct request_queue *,
58 struct elevator_type *e);
59typedef void (elevator_exit_fn) (struct elevator_queue *);
60typedef void (elevator_registered_fn) (struct request_queue *);
61
62struct elevator_ops
63{
64 elevator_merge_fn *elevator_merge_fn;
65 elevator_merged_fn *elevator_merged_fn;
66 elevator_merge_req_fn *elevator_merge_req_fn;
67 elevator_allow_bio_merge_fn *elevator_allow_bio_merge_fn;
68 elevator_allow_rq_merge_fn *elevator_allow_rq_merge_fn;
69 elevator_bio_merged_fn *elevator_bio_merged_fn;
70
71 elevator_dispatch_fn *elevator_dispatch_fn;
72 elevator_add_req_fn *elevator_add_req_fn;
73 elevator_activate_req_fn *elevator_activate_req_fn;
74 elevator_deactivate_req_fn *elevator_deactivate_req_fn;
75
76 elevator_completed_req_fn *elevator_completed_req_fn;
77
78 elevator_request_list_fn *elevator_former_req_fn;
79 elevator_request_list_fn *elevator_latter_req_fn;
80
81 elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
82 elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
83
84 elevator_set_req_fn *elevator_set_req_fn;
85 elevator_put_req_fn *elevator_put_req_fn;
86
87 elevator_may_queue_fn *elevator_may_queue_fn;
88
89 elevator_init_fn *elevator_init_fn;
90 elevator_exit_fn *elevator_exit_fn;
91 elevator_registered_fn *elevator_registered_fn;
92};
93
94struct blk_mq_alloc_data; 26struct blk_mq_alloc_data;
95struct blk_mq_hw_ctx; 27struct blk_mq_hw_ctx;
96 28
@@ -137,17 +69,14 @@ struct elevator_type
137 struct kmem_cache *icq_cache; 69 struct kmem_cache *icq_cache;
138 70
139 /* fields provided by elevator implementation */ 71 /* fields provided by elevator implementation */
140 union { 72 struct elevator_mq_ops ops;
141 struct elevator_ops sq; 73
142 struct elevator_mq_ops mq;
143 } ops;
144 size_t icq_size; /* see iocontext.h */ 74 size_t icq_size; /* see iocontext.h */
145 size_t icq_align; /* ditto */ 75 size_t icq_align; /* ditto */
146 struct elv_fs_entry *elevator_attrs; 76 struct elv_fs_entry *elevator_attrs;
147 char elevator_name[ELV_NAME_MAX]; 77 char elevator_name[ELV_NAME_MAX];
148 const char *elevator_alias; 78 const char *elevator_alias;
149 struct module *elevator_owner; 79 struct module *elevator_owner;
150 bool uses_mq;
151#ifdef CONFIG_BLK_DEBUG_FS 80#ifdef CONFIG_BLK_DEBUG_FS
152 const struct blk_mq_debugfs_attr *queue_debugfs_attrs; 81 const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
153 const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; 82 const struct blk_mq_debugfs_attr *hctx_debugfs_attrs;
@@ -175,40 +104,25 @@ struct elevator_queue
175 struct kobject kobj; 104 struct kobject kobj;
176 struct mutex sysfs_lock; 105 struct mutex sysfs_lock;
177 unsigned int registered:1; 106 unsigned int registered:1;
178 unsigned int uses_mq:1;
179 DECLARE_HASHTABLE(hash, ELV_HASH_BITS); 107 DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
180}; 108};
181 109
182/* 110/*
183 * block elevator interface 111 * block elevator interface
184 */ 112 */
185extern void elv_dispatch_sort(struct request_queue *, struct request *);
186extern void elv_dispatch_add_tail(struct request_queue *, struct request *);
187extern void elv_add_request(struct request_queue *, struct request *, int);
188extern void __elv_add_request(struct request_queue *, struct request *, int);
189extern enum elv_merge elv_merge(struct request_queue *, struct request **, 113extern enum elv_merge elv_merge(struct request_queue *, struct request **,
190 struct bio *); 114 struct bio *);
191extern void elv_merge_requests(struct request_queue *, struct request *, 115extern void elv_merge_requests(struct request_queue *, struct request *,
192 struct request *); 116 struct request *);
193extern void elv_merged_request(struct request_queue *, struct request *, 117extern void elv_merged_request(struct request_queue *, struct request *,
194 enum elv_merge); 118 enum elv_merge);
195extern void elv_bio_merged(struct request_queue *q, struct request *,
196 struct bio *);
197extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); 119extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
198extern void elv_requeue_request(struct request_queue *, struct request *);
199extern struct request *elv_former_request(struct request_queue *, struct request *); 120extern struct request *elv_former_request(struct request_queue *, struct request *);
200extern struct request *elv_latter_request(struct request_queue *, struct request *); 121extern struct request *elv_latter_request(struct request_queue *, struct request *);
201extern int elv_may_queue(struct request_queue *, unsigned int);
202extern void elv_completed_request(struct request_queue *, struct request *);
203extern int elv_set_request(struct request_queue *q, struct request *rq,
204 struct bio *bio, gfp_t gfp_mask);
205extern void elv_put_request(struct request_queue *, struct request *);
206extern void elv_drain_elevator(struct request_queue *);
207 122
208/* 123/*
209 * io scheduler registration 124 * io scheduler registration
210 */ 125 */
211extern void __init load_default_elevator_module(void);
212extern int elv_register(struct elevator_type *); 126extern int elv_register(struct elevator_type *);
213extern void elv_unregister(struct elevator_type *); 127extern void elv_unregister(struct elevator_type *);
214 128
@@ -260,9 +174,5 @@ enum {
260#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) 174#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
261#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) 175#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
262 176
263#else /* CONFIG_BLOCK */
264
265static inline void load_default_elevator_module(void) { }
266
267#endif /* CONFIG_BLOCK */ 177#endif /* CONFIG_BLOCK */
268#endif 178#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 26a8607b3c3c..6d52ce6af4ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2026,7 +2026,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
2026 .ki_filp = filp, 2026 .ki_filp = filp,
2027 .ki_flags = iocb_flags(filp), 2027 .ki_flags = iocb_flags(filp),
2028 .ki_hint = ki_hint_validate(file_write_hint(filp)), 2028 .ki_hint = ki_hint_validate(file_write_hint(filp)),
2029 .ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0), 2029 .ki_ioprio = get_current_ioprio(),
2030 }; 2030 };
2031} 2031}
2032 2032
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 70fc838e6773..06c0fd594097 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -17,6 +17,7 @@
17#include <linux/percpu-refcount.h> 17#include <linux/percpu-refcount.h>
18#include <linux/uuid.h> 18#include <linux/uuid.h>
19#include <linux/blk_types.h> 19#include <linux/blk_types.h>
20#include <asm/local.h>
20 21
21#ifdef CONFIG_BLOCK 22#ifdef CONFIG_BLOCK
22 23
@@ -89,6 +90,7 @@ struct disk_stats {
89 unsigned long merges[NR_STAT_GROUPS]; 90 unsigned long merges[NR_STAT_GROUPS];
90 unsigned long io_ticks; 91 unsigned long io_ticks;
91 unsigned long time_in_queue; 92 unsigned long time_in_queue;
93 local_t in_flight[2];
92}; 94};
93 95
94#define PARTITION_META_INFO_VOLNAMELTH 64 96#define PARTITION_META_INFO_VOLNAMELTH 64
@@ -122,14 +124,13 @@ struct hd_struct {
122 int make_it_fail; 124 int make_it_fail;
123#endif 125#endif
124 unsigned long stamp; 126 unsigned long stamp;
125 atomic_t in_flight[2];
126#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
127 struct disk_stats __percpu *dkstats; 128 struct disk_stats __percpu *dkstats;
128#else 129#else
129 struct disk_stats dkstats; 130 struct disk_stats dkstats;
130#endif 131#endif
131 struct percpu_ref ref; 132 struct percpu_ref ref;
132 struct rcu_head rcu_head; 133 struct rcu_work rcu_work;
133}; 134};
134 135
135#define GENHD_FL_REMOVABLE 1 136#define GENHD_FL_REMOVABLE 1
@@ -295,8 +296,11 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
295#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) 296#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); })
296#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) 297#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0)
297 298
298#define __part_stat_add(cpu, part, field, addnd) \ 299#define part_stat_get_cpu(part, field, cpu) \
299 (per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd)) 300 (per_cpu_ptr((part)->dkstats, (cpu))->field)
301
302#define part_stat_get(part, field) \
303 part_stat_get_cpu(part, field, smp_processor_id())
300 304
301#define part_stat_read(part, field) \ 305#define part_stat_read(part, field) \
302({ \ 306({ \
@@ -333,10 +337,9 @@ static inline void free_part_stats(struct hd_struct *part)
333#define part_stat_lock() ({ rcu_read_lock(); 0; }) 337#define part_stat_lock() ({ rcu_read_lock(); 0; })
334#define part_stat_unlock() rcu_read_unlock() 338#define part_stat_unlock() rcu_read_unlock()
335 339
336#define __part_stat_add(cpu, part, field, addnd) \ 340#define part_stat_get(part, field) ((part)->dkstats.field)
337 ((part)->dkstats.field += addnd) 341#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field)
338 342#define part_stat_read(part, field) part_stat_get(part, field)
339#define part_stat_read(part, field) ((part)->dkstats.field)
340 343
341static inline void part_stat_set_all(struct hd_struct *part, int value) 344static inline void part_stat_set_all(struct hd_struct *part, int value)
342{ 345{
@@ -362,22 +365,33 @@ static inline void free_part_stats(struct hd_struct *part)
362 part_stat_read(part, field[STAT_WRITE]) + \ 365 part_stat_read(part, field[STAT_WRITE]) + \
363 part_stat_read(part, field[STAT_DISCARD])) 366 part_stat_read(part, field[STAT_DISCARD]))
364 367
365#define part_stat_add(cpu, part, field, addnd) do { \ 368#define __part_stat_add(part, field, addnd) \
366 __part_stat_add((cpu), (part), field, addnd); \ 369 (part_stat_get(part, field) += (addnd))
370
371#define part_stat_add(part, field, addnd) do { \
372 __part_stat_add((part), field, addnd); \
367 if ((part)->partno) \ 373 if ((part)->partno) \
368 __part_stat_add((cpu), &part_to_disk((part))->part0, \ 374 __part_stat_add(&part_to_disk((part))->part0, \
369 field, addnd); \ 375 field, addnd); \
370} while (0) 376} while (0)
371 377
372#define part_stat_dec(cpu, gendiskp, field) \ 378#define part_stat_dec(gendiskp, field) \
373 part_stat_add(cpu, gendiskp, field, -1) 379 part_stat_add(gendiskp, field, -1)
374#define part_stat_inc(cpu, gendiskp, field) \ 380#define part_stat_inc(gendiskp, field) \
375 part_stat_add(cpu, gendiskp, field, 1) 381 part_stat_add(gendiskp, field, 1)
376#define part_stat_sub(cpu, gendiskp, field, subnd) \ 382#define part_stat_sub(gendiskp, field, subnd) \
377 part_stat_add(cpu, gendiskp, field, -subnd) 383 part_stat_add(gendiskp, field, -subnd)
378 384
379void part_in_flight(struct request_queue *q, struct hd_struct *part, 385#define part_stat_local_dec(gendiskp, field) \
380 unsigned int inflight[2]); 386 local_dec(&(part_stat_get(gendiskp, field)))
387#define part_stat_local_inc(gendiskp, field) \
388 local_inc(&(part_stat_get(gendiskp, field)))
389#define part_stat_local_read(gendiskp, field) \
390 local_read(&(part_stat_get(gendiskp, field)))
391#define part_stat_local_read_cpu(gendiskp, field, cpu) \
392 local_read(&(part_stat_get_cpu(gendiskp, field, cpu)))
393
394unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part);
381void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, 395void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
382 unsigned int inflight[2]); 396 unsigned int inflight[2]);
383void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, 397void part_dec_in_flight(struct request_queue *q, struct hd_struct *part,
@@ -398,8 +412,7 @@ static inline void free_part_info(struct hd_struct *part)
398 kfree(part->info); 412 kfree(part->info);
399} 413}
400 414
401/* block/blk-core.c */ 415void update_io_ticks(struct hd_struct *part, unsigned long now);
402extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part);
403 416
404/* block/genhd.c */ 417/* block/genhd.c */
405extern void device_add_disk(struct device *parent, struct gendisk *disk, 418extern void device_add_disk(struct device *parent, struct gendisk *disk,
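
The genhd.h hunks above drop the per-cpu argument from the part_stat_* accounting macros and turn the in_flight counters into local_t members of the per-cpu disk_stats. As a rough sketch only (the helper name is hypothetical, and the ios[]/sectors[] fields are assumed from the existing disk_stats layout, which this hunk does not show), a caller of the new interface looks roughly like this:

#include <linux/genhd.h>

/* Hypothetical completion-side accounting with the cpu-less macros. */
static void example_account_done(struct hd_struct *part, int sgrp,
				 unsigned int sectors, bool is_write)
{
	part_stat_lock();
	part_stat_inc(part, ios[sgrp]);			/* no explicit cpu argument anymore */
	part_stat_add(part, sectors[sgrp], sectors);
	part_stat_local_dec(part, in_flight[is_write]);	/* local_t-based in-flight count */
	part_stat_unlock();
}
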
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c74b0321922a..e7d29ae633cd 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -10,7 +10,7 @@
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/ioport.h> 11#include <linux/ioport.h>
12#include <linux/ata.h> 12#include <linux/ata.h>
13#include <linux/blkdev.h> 13#include <linux/blk-mq.h>
14#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/bitops.h> 16#include <linux/bitops.h>
@@ -50,6 +50,7 @@ struct ide_request {
50 struct scsi_request sreq; 50 struct scsi_request sreq;
51 u8 sense[SCSI_SENSE_BUFFERSIZE]; 51 u8 sense[SCSI_SENSE_BUFFERSIZE];
52 u8 type; 52 u8 type;
53 void *special;
53}; 54};
54 55
55static inline struct ide_request *ide_req(struct request *rq) 56static inline struct ide_request *ide_req(struct request *rq)
@@ -529,6 +530,10 @@ struct ide_drive_s {
529 530
530 struct request_queue *queue; /* request queue */ 531 struct request_queue *queue; /* request queue */
531 532
533 bool (*prep_rq)(struct ide_drive_s *, struct request *);
534
535 struct blk_mq_tag_set tag_set;
536
532 struct request *rq; /* current request */ 537 struct request *rq; /* current request */
533 void *driver_data; /* extra driver data */ 538 void *driver_data; /* extra driver data */
534 u16 *id; /* identification info */ 539 u16 *id; /* identification info */
@@ -612,6 +617,10 @@ struct ide_drive_s {
612 bool sense_rq_armed; 617 bool sense_rq_armed;
613 struct request *sense_rq; 618 struct request *sense_rq;
614 struct request_sense sense_data; 619 struct request_sense sense_data;
620
621 /* async sense insertion */
622 struct work_struct rq_work;
623 struct list_head rq_list;
615}; 624};
616 625
617typedef struct ide_drive_s ide_drive_t; 626typedef struct ide_drive_s ide_drive_t;
@@ -1089,6 +1098,7 @@ extern int ide_pci_clk;
1089 1098
1090int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int); 1099int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int);
1091void ide_kill_rq(ide_drive_t *, struct request *); 1100void ide_kill_rq(ide_drive_t *, struct request *);
1101void ide_insert_request_head(ide_drive_t *, struct request *);
1092 1102
1093void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); 1103void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
1094void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int); 1104void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
@@ -1208,7 +1218,7 @@ extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout);
1208 1218
1209extern void ide_timer_expiry(struct timer_list *t); 1219extern void ide_timer_expiry(struct timer_list *t);
1210extern irqreturn_t ide_intr(int irq, void *dev_id); 1220extern irqreturn_t ide_intr(int irq, void *dev_id);
1211extern void do_ide_request(struct request_queue *); 1221extern blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
1212extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq); 1222extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq);
1213 1223
1214void ide_init_disk(struct gendisk *, ide_drive_t *); 1224void ide_init_disk(struct gendisk *, ide_drive_t *);
diff --git a/include/linux/init.h b/include/linux/init.h
index 9c2aba1dbabf..5255069f5a9f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -146,7 +146,6 @@ extern unsigned int reset_devices;
146/* used by init/main.c */ 146/* used by init/main.c */
147void setup_arch(char **); 147void setup_arch(char **);
148void prepare_namespace(void); 148void prepare_namespace(void);
149void __init load_default_modules(void);
150int __init init_rootfs(void); 149int __init init_rootfs(void);
151 150
152#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) 151#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 9e30ed6443db..e9bfe6972aed 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -71,6 +71,19 @@ static inline int task_nice_ioclass(struct task_struct *task)
71} 71}
72 72
73/* 73/*
74 * If the calling process has set an I/O priority, use that. Otherwise, return
75 * the default I/O priority.
76 */
77static inline int get_current_ioprio(void)
78{
79 struct io_context *ioc = current->io_context;
80
81 if (ioc)
82 return ioc->ioprio;
83 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
84}
85
86/*
74 * For inheritance, return the highest of the two given priorities 87 * For inheritance, return the highest of the two given priorities
75 */ 88 */
76extern int ioprio_best(unsigned short aprio, unsigned short bprio); 89extern int ioprio_best(unsigned short aprio, unsigned short bprio);
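
get_current_ioprio() above centralizes the "task io_context priority if set, else the default" fallback that the fs.h hunk now uses for init_sync_kiocb(). A hedged sketch of a submission path preferring an explicit per-iocb priority over the task default (the helper is illustrative, not a kernel function):

#include <linux/fs.h>
#include <linux/ioprio.h>

static int example_effective_ioprio(const struct kiocb *iocb)
{
	/* honor a priority the caller set on the kiocb, otherwise fall back */
	if (IOPRIO_PRIO_CLASS(iocb->ki_ioprio) != IOPRIO_CLASS_NONE)
		return iocb->ki_ioprio;
	return get_current_ioprio();
}
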
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2fdeac1a420d..5d865a5d5cdc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -90,7 +90,7 @@ typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int,
90 struct nvm_chk_meta *); 90 struct nvm_chk_meta *);
91typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); 91typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
92typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); 92typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *);
93typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); 93typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int);
94typedef void (nvm_destroy_dma_pool_fn)(void *); 94typedef void (nvm_destroy_dma_pool_fn)(void *);
95typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, 95typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
96 dma_addr_t *); 96 dma_addr_t *);
@@ -357,6 +357,7 @@ struct nvm_geo {
357 u32 clba; /* sectors per chunk */ 357 u32 clba; /* sectors per chunk */
358 u16 csecs; /* sector size */ 358 u16 csecs; /* sector size */
359 u16 sos; /* out-of-band area size */ 359 u16 sos; /* out-of-band area size */
360 bool ext; /* metadata in extended data buffer */
360 361
361 /* device write constraints */ 362 /* device write constraints */
362 u32 ws_min; /* minimum write size */ 363 u32 ws_min; /* minimum write size */
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 496ff759f84c..91745cc3704c 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -403,7 +403,6 @@ struct nvme_fc_port_template {
403 void **handle); 403 void **handle);
404 void (*delete_queue)(struct nvme_fc_local_port *, 404 void (*delete_queue)(struct nvme_fc_local_port *,
405 unsigned int qidx, void *handle); 405 unsigned int qidx, void *handle);
406 void (*poll_queue)(struct nvme_fc_local_port *, void *handle);
407 int (*ls_req)(struct nvme_fc_local_port *, 406 int (*ls_req)(struct nvme_fc_local_port *,
408 struct nvme_fc_remote_port *, 407 struct nvme_fc_remote_port *,
409 struct nvmefc_ls_req *); 408 struct nvmefc_ls_req *);
@@ -649,22 +648,6 @@ enum {
649 * sequence in one LLDD operation. Errors during Data 648 * sequence in one LLDD operation. Errors during Data
650 * sequence transmit must not allow RSP sequence to be sent. 649 * sequence transmit must not allow RSP sequence to be sent.
651 */ 650 */
652 NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 1),
653 /* Bit 2: When 0, the LLDD is calling the cmd rcv handler
654 * in a non-isr context, allowing the transport to finish
655 * op completion in the calling context. When 1, the LLDD
656 * is calling the cmd rcv handler in an ISR context,
657 * requiring the transport to transition to a workqueue
658 * for op completion.
659 */
660 NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 2),
661 /* Bit 3: When 0, the LLDD is calling the op done handler
662 * in a non-isr context, allowing the transport to finish
663 * op completion in the calling context. When 1, the LLDD
664 * is calling the op done handler in an ISR context,
665 * requiring the transport to transition to a workqueue
666 * for op completion.
667 */
668}; 651};
669 652
670 653
diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
new file mode 100644
index 000000000000..03d87c0550a9
--- /dev/null
+++ b/include/linux/nvme-tcp.h
@@ -0,0 +1,189 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * NVMe over Fabrics TCP protocol header.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5 */
6
7#ifndef _LINUX_NVME_TCP_H
8#define _LINUX_NVME_TCP_H
9
10#include <linux/nvme.h>
11
12#define NVME_TCP_DISC_PORT 8009
13#define NVME_TCP_ADMIN_CCSZ SZ_8K
14#define NVME_TCP_DIGEST_LENGTH 4
15
16enum nvme_tcp_pfv {
17 NVME_TCP_PFV_1_0 = 0x0,
18};
19
20enum nvme_tcp_fatal_error_status {
21 NVME_TCP_FES_INVALID_PDU_HDR = 0x01,
22 NVME_TCP_FES_PDU_SEQ_ERR = 0x02,
23 NVME_TCP_FES_HDR_DIGEST_ERR = 0x03,
24 NVME_TCP_FES_DATA_OUT_OF_RANGE = 0x04,
25 NVME_TCP_FES_R2T_LIMIT_EXCEEDED = 0x05,
26 NVME_TCP_FES_DATA_LIMIT_EXCEEDED = 0x05,
27 NVME_TCP_FES_UNSUPPORTED_PARAM = 0x06,
28};
29
30enum nvme_tcp_digest_option {
31 NVME_TCP_HDR_DIGEST_ENABLE = (1 << 0),
32 NVME_TCP_DATA_DIGEST_ENABLE = (1 << 1),
33};
34
35enum nvme_tcp_pdu_type {
36 nvme_tcp_icreq = 0x0,
37 nvme_tcp_icresp = 0x1,
38 nvme_tcp_h2c_term = 0x2,
39 nvme_tcp_c2h_term = 0x3,
40 nvme_tcp_cmd = 0x4,
41 nvme_tcp_rsp = 0x5,
42 nvme_tcp_h2c_data = 0x6,
43 nvme_tcp_c2h_data = 0x7,
44 nvme_tcp_r2t = 0x9,
45};
46
47enum nvme_tcp_pdu_flags {
48 NVME_TCP_F_HDGST = (1 << 0),
49 NVME_TCP_F_DDGST = (1 << 1),
50 NVME_TCP_F_DATA_LAST = (1 << 2),
51 NVME_TCP_F_DATA_SUCCESS = (1 << 3),
52};
53
54/**
55 * struct nvme_tcp_hdr - nvme tcp pdu common header
56 *
57 * @type: pdu type
58 * @flags: pdu specific flags
59 * @hlen: pdu header length
60 * @pdo: pdu data offset
61 * @plen: pdu wire byte length
62 */
63struct nvme_tcp_hdr {
64 __u8 type;
65 __u8 flags;
66 __u8 hlen;
67 __u8 pdo;
68 __le32 plen;
69};
70
71/**
72 * struct nvme_tcp_icreq_pdu - nvme tcp initialize connection request pdu
73 *
74 * @hdr: pdu generic header
75 * @pfv: pdu version format
76 * @hpda: host pdu data alignment (dwords, 0's based)
77 * @digest: digest types enabled
78 * @maxr2t: maximum r2ts per request supported
79 */
80struct nvme_tcp_icreq_pdu {
81 struct nvme_tcp_hdr hdr;
82 __le16 pfv;
83 __u8 hpda;
84 __u8 digest;
85 __le32 maxr2t;
86 __u8 rsvd2[112];
87};
88
89/**
90 * struct nvme_tcp_icresp_pdu - nvme tcp initialize connection response pdu
91 *
92 * @hdr: pdu common header
93 * @pfv: pdu version format
 94 * @cpda: controller pdu data alignment (dwords, 0's based)
95 * @digest: digest types enabled
96 * @maxdata: maximum data capsules per r2t supported
97 */
98struct nvme_tcp_icresp_pdu {
99 struct nvme_tcp_hdr hdr;
100 __le16 pfv;
101 __u8 cpda;
102 __u8 digest;
103 __le32 maxdata;
104 __u8 rsvd[112];
105};
106
107/**
108 * struct nvme_tcp_term_pdu - nvme tcp terminate connection pdu
109 *
110 * @hdr: pdu common header
111 * @fes: fatal error status
112 * @fei: fatal error information
113 */
114struct nvme_tcp_term_pdu {
115 struct nvme_tcp_hdr hdr;
116 __le16 fes;
117 __le32 fei;
118 __u8 rsvd[8];
119};
120
121/**
122 * struct nvme_tcp_cmd_pdu - nvme tcp command capsule pdu
123 *
124 * @hdr: pdu common header
125 * @cmd: nvme command
126 */
127struct nvme_tcp_cmd_pdu {
128 struct nvme_tcp_hdr hdr;
129 struct nvme_command cmd;
130};
131
132/**
133 * struct nvme_tcp_rsp_pdu - nvme tcp response capsule pdu
134 *
135 * @hdr: pdu common header
136 * @hdr: nvme-tcp generic header
137 * @cqe: nvme completion queue entry
138 */
139struct nvme_tcp_rsp_pdu {
140 struct nvme_tcp_hdr hdr;
141 struct nvme_completion cqe;
142};
143
144/**
145 * struct nvme_tcp_r2t_pdu - nvme tcp ready-to-transfer pdu
146 *
147 * @hdr: pdu common header
148 * @command_id: nvme command identifier which this relates to
149 * @ttag: transfer tag (controller generated)
150 * @r2t_offset: offset from the start of the command data
151 * @r2t_length: length the host is allowed to send
152 */
153struct nvme_tcp_r2t_pdu {
154 struct nvme_tcp_hdr hdr;
155 __u16 command_id;
156 __u16 ttag;
157 __le32 r2t_offset;
158 __le32 r2t_length;
159 __u8 rsvd[4];
160};
161
162/**
163 * struct nvme_tcp_data_pdu - nvme tcp data pdu
164 *
165 * @hdr: pdu common header
166 * @command_id: nvme command identifier which this relates to
167 * @ttag: transfer tag (controller generated)
168 * @data_offset: offset from the start of the command data
169 * @data_length: length of the data stream
170 */
171struct nvme_tcp_data_pdu {
172 struct nvme_tcp_hdr hdr;
173 __u16 command_id;
174 __u16 ttag;
175 __le32 data_offset;
176 __le32 data_length;
177 __u8 rsvd[4];
178};
179
180union nvme_tcp_pdu {
181 struct nvme_tcp_icreq_pdu icreq;
182 struct nvme_tcp_icresp_pdu icresp;
183 struct nvme_tcp_cmd_pdu cmd;
184 struct nvme_tcp_rsp_pdu rsp;
185 struct nvme_tcp_r2t_pdu r2t;
186 struct nvme_tcp_data_pdu data;
187};
188
189#endif /* _LINUX_NVME_TCP_H */
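
The new <linux/nvme-tcp.h> above only carries wire-format definitions shared by the host and target TCP transports. Purely as an illustration of those structures (this is not code from the nvme-tcp drivers, and the field usage is inferred from the kernel-doc above), an initialize-connection request could be filled in like so:

#include <linux/kernel.h>
#include <linux/nvme-tcp.h>
#include <linux/string.h>

static void example_fill_icreq(struct nvme_tcp_icreq_pdu *icreq)
{
	memset(icreq, 0, sizeof(*icreq));
	icreq->hdr.type = nvme_tcp_icreq;
	icreq->hdr.hlen = sizeof(*icreq);
	icreq->hdr.plen = cpu_to_le32(sizeof(*icreq));
	icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
	icreq->hpda = 0;				/* no extra data alignment */
	icreq->digest = NVME_TCP_HDR_DIGEST_ENABLE;	/* header digest only */
	icreq->maxr2t = cpu_to_le32(0);			/* 0's based: one outstanding r2t */
}
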
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 818dbe9331be..bbcc83886899 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -52,15 +52,20 @@ enum {
52enum { 52enum {
53 NVMF_TRTYPE_RDMA = 1, /* RDMA */ 53 NVMF_TRTYPE_RDMA = 1, /* RDMA */
54 NVMF_TRTYPE_FC = 2, /* Fibre Channel */ 54 NVMF_TRTYPE_FC = 2, /* Fibre Channel */
55 NVMF_TRTYPE_TCP = 3, /* TCP/IP */
55 NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ 56 NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */
56 NVMF_TRTYPE_MAX, 57 NVMF_TRTYPE_MAX,
57}; 58};
58 59
59/* Transport Requirements codes for Discovery Log Page entry TREQ field */ 60/* Transport Requirements codes for Discovery Log Page entry TREQ field */
60enum { 61enum {
61 NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ 62 NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */
62 NVMF_TREQ_REQUIRED = 1, /* Required */ 63 NVMF_TREQ_REQUIRED = 1, /* Required */
63 NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ 64 NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */
65#define NVME_TREQ_SECURE_CHANNEL_MASK \
66 (NVMF_TREQ_REQUIRED | NVMF_TREQ_NOT_REQUIRED)
67
68 NVMF_TREQ_DISABLE_SQFLOW = (1 << 2), /* Supports SQ flow control disable */
64}; 69};
65 70
66/* RDMA QP Service Type codes for Discovery Log Page entry TSAS 71/* RDMA QP Service Type codes for Discovery Log Page entry TSAS
@@ -198,6 +203,11 @@ enum {
198 NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, 203 NVME_PS_FLAGS_NON_OP_STATE = 1 << 1,
199}; 204};
200 205
206enum nvme_ctrl_attr {
207 NVME_CTRL_ATTR_HID_128_BIT = (1 << 0),
208 NVME_CTRL_ATTR_TBKAS = (1 << 6),
209};
210
201struct nvme_id_ctrl { 211struct nvme_id_ctrl {
202 __le16 vid; 212 __le16 vid;
203 __le16 ssvid; 213 __le16 ssvid;
@@ -214,7 +224,11 @@ struct nvme_id_ctrl {
214 __le32 rtd3e; 224 __le32 rtd3e;
215 __le32 oaes; 225 __le32 oaes;
216 __le32 ctratt; 226 __le32 ctratt;
217 __u8 rsvd100[156]; 227 __u8 rsvd100[28];
228 __le16 crdt1;
229 __le16 crdt2;
230 __le16 crdt3;
231 __u8 rsvd134[122];
218 __le16 oacs; 232 __le16 oacs;
219 __u8 acl; 233 __u8 acl;
220 __u8 aerl; 234 __u8 aerl;
@@ -481,12 +495,21 @@ enum {
481 NVME_AER_NOTICE_NS_CHANGED = 0x00, 495 NVME_AER_NOTICE_NS_CHANGED = 0x00,
482 NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, 496 NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
483 NVME_AER_NOTICE_ANA = 0x03, 497 NVME_AER_NOTICE_ANA = 0x03,
498 NVME_AER_NOTICE_DISC_CHANGED = 0xf0,
484}; 499};
485 500
486enum { 501enum {
487 NVME_AEN_CFG_NS_ATTR = 1 << 8, 502 NVME_AEN_BIT_NS_ATTR = 8,
488 NVME_AEN_CFG_FW_ACT = 1 << 9, 503 NVME_AEN_BIT_FW_ACT = 9,
489 NVME_AEN_CFG_ANA_CHANGE = 1 << 11, 504 NVME_AEN_BIT_ANA_CHANGE = 11,
505 NVME_AEN_BIT_DISC_CHANGE = 31,
506};
507
508enum {
509 NVME_AEN_CFG_NS_ATTR = 1 << NVME_AEN_BIT_NS_ATTR,
510 NVME_AEN_CFG_FW_ACT = 1 << NVME_AEN_BIT_FW_ACT,
511 NVME_AEN_CFG_ANA_CHANGE = 1 << NVME_AEN_BIT_ANA_CHANGE,
512 NVME_AEN_CFG_DISC_CHANGE = 1 << NVME_AEN_BIT_DISC_CHANGE,
490}; 513};
491 514
492struct nvme_lba_range_type { 515struct nvme_lba_range_type {
@@ -639,7 +662,12 @@ struct nvme_common_command {
639 __le32 cdw2[2]; 662 __le32 cdw2[2];
640 __le64 metadata; 663 __le64 metadata;
641 union nvme_data_ptr dptr; 664 union nvme_data_ptr dptr;
642 __le32 cdw10[6]; 665 __le32 cdw10;
666 __le32 cdw11;
667 __le32 cdw12;
668 __le32 cdw13;
669 __le32 cdw14;
670 __le32 cdw15;
643}; 671};
644 672
645struct nvme_rw_command { 673struct nvme_rw_command {
@@ -738,6 +766,15 @@ enum {
738 NVME_HOST_MEM_RETURN = (1 << 1), 766 NVME_HOST_MEM_RETURN = (1 << 1),
739}; 767};
740 768
769struct nvme_feat_host_behavior {
770 __u8 acre;
771 __u8 resv1[511];
772};
773
774enum {
775 NVME_ENABLE_ACRE = 1,
776};
777
741/* Admin commands */ 778/* Admin commands */
742 779
743enum nvme_admin_opcode { 780enum nvme_admin_opcode {
@@ -792,6 +829,7 @@ enum {
792 NVME_FEAT_RRL = 0x12, 829 NVME_FEAT_RRL = 0x12,
793 NVME_FEAT_PLM_CONFIG = 0x13, 830 NVME_FEAT_PLM_CONFIG = 0x13,
794 NVME_FEAT_PLM_WINDOW = 0x14, 831 NVME_FEAT_PLM_WINDOW = 0x14,
832 NVME_FEAT_HOST_BEHAVIOR = 0x16,
795 NVME_FEAT_SW_PROGRESS = 0x80, 833 NVME_FEAT_SW_PROGRESS = 0x80,
796 NVME_FEAT_HOST_ID = 0x81, 834 NVME_FEAT_HOST_ID = 0x81,
797 NVME_FEAT_RESV_MASK = 0x82, 835 NVME_FEAT_RESV_MASK = 0x82,
@@ -1030,6 +1068,10 @@ struct nvmf_disc_rsp_page_hdr {
1030 struct nvmf_disc_rsp_page_entry entries[0]; 1068 struct nvmf_disc_rsp_page_entry entries[0];
1031}; 1069};
1032 1070
1071enum {
1072 NVME_CONNECT_DISABLE_SQFLOW = (1 << 2),
1073};
1074
1033struct nvmf_connect_command { 1075struct nvmf_connect_command {
1034 __u8 opcode; 1076 __u8 opcode;
1035 __u8 resv1; 1077 __u8 resv1;
@@ -1126,6 +1168,20 @@ struct nvme_command {
1126 }; 1168 };
1127}; 1169};
1128 1170
1171struct nvme_error_slot {
1172 __le64 error_count;
1173 __le16 sqid;
1174 __le16 cmdid;
1175 __le16 status_field;
1176 __le16 param_error_location;
1177 __le64 lba;
1178 __le32 nsid;
1179 __u8 vs;
1180 __u8 resv[3];
1181 __le64 cs;
1182 __u8 resv2[24];
1183};
1184
1129static inline bool nvme_is_write(struct nvme_command *cmd) 1185static inline bool nvme_is_write(struct nvme_command *cmd)
1130{ 1186{
1131 /* 1187 /*
@@ -1243,6 +1299,7 @@ enum {
1243 NVME_SC_ANA_TRANSITION = 0x303, 1299 NVME_SC_ANA_TRANSITION = 0x303,
1244 NVME_SC_HOST_PATH_ERROR = 0x370, 1300 NVME_SC_HOST_PATH_ERROR = 0x370,
1245 1301
1302 NVME_SC_CRD = 0x1800,
1246 NVME_SC_DNR = 0x4000, 1303 NVME_SC_DNR = 0x4000,
1247}; 1304};
1248 1305
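
The nvme.h changes above re-express the AEN configuration as bit numbers with masks derived from them, and add the discovery log change notice. A small hedged sketch of how a hypothetical host could build its supported-events mask and test a bit (names prefixed example_/EXAMPLE_ are not from the nvme core):

#include <linux/nvme.h>

#define EXAMPLE_AEN_SUPPORTED	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
				 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)

static inline bool example_aen_enabled(u32 aen_config, unsigned int aen_bit)
{
	/* aen_bit is one of the NVME_AEN_BIT_* values, e.g. NVME_AEN_BIT_DISC_CHANGE */
	return aen_config & (1U << aen_bit);
}
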
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..14d558146aea 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,24 @@ struct seq_file;
30 */ 30 */
31struct sbitmap_word { 31struct sbitmap_word {
32 /** 32 /**
33 * @word: The bitmap word itself. 33 * @depth: Number of bits being used in @word/@cleared
34 */ 34 */
35 unsigned long word; 35 unsigned long depth;
36 36
37 /** 37 /**
38 * @depth: Number of bits being used in @word. 38 * @word: word holding free bits
39 */ 39 */
40 unsigned long depth; 40 unsigned long word ____cacheline_aligned_in_smp;
41
42 /**
43 * @cleared: word holding cleared bits
44 */
45 unsigned long cleared ____cacheline_aligned_in_smp;
46
47 /**
48 * @swap_lock: Held while swapping word <-> cleared
49 */
50 spinlock_t swap_lock;
41} ____cacheline_aligned_in_smp; 51} ____cacheline_aligned_in_smp;
42 52
43/** 53/**
@@ -125,6 +135,11 @@ struct sbitmap_queue {
125 */ 135 */
126 struct sbq_wait_state *ws; 136 struct sbq_wait_state *ws;
127 137
138 /*
139 * @ws_active: count of currently active ws waitqueues
140 */
141 atomic_t ws_active;
142
128 /** 143 /**
129 * @round_robin: Allocate bits in strict round-robin order. 144 * @round_robin: Allocate bits in strict round-robin order.
130 */ 145 */
@@ -250,12 +265,14 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
250 nr = SB_NR_TO_BIT(sb, start); 265 nr = SB_NR_TO_BIT(sb, start);
251 266
252 while (scanned < sb->depth) { 267 while (scanned < sb->depth) {
253 struct sbitmap_word *word = &sb->map[index]; 268 unsigned long word;
254 unsigned int depth = min_t(unsigned int, word->depth - nr, 269 unsigned int depth = min_t(unsigned int,
270 sb->map[index].depth - nr,
255 sb->depth - scanned); 271 sb->depth - scanned);
256 272
257 scanned += depth; 273 scanned += depth;
258 if (!word->word) 274 word = sb->map[index].word & ~sb->map[index].cleared;
275 if (!word)
259 goto next; 276 goto next;
260 277
261 /* 278 /*
@@ -265,7 +282,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
265 */ 282 */
266 depth += nr; 283 depth += nr;
267 while (1) { 284 while (1) {
268 nr = find_next_bit(&word->word, depth, nr); 285 nr = find_next_bit(&word, depth, nr);
269 if (nr >= depth) 286 if (nr >= depth)
270 break; 287 break;
271 if (!fn(sb, (index << sb->shift) + nr, data)) 288 if (!fn(sb, (index << sb->shift) + nr, data))
@@ -310,6 +327,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
310 clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); 327 clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
311} 328}
312 329
330/*
331 * This one is special, since it doesn't actually clear the bit, rather it
332 * sets the corresponding bit in the ->cleared mask instead. Paired with
333 * the caller doing sbitmap_batch_clear() if a given index is full, which
334 * will clear the previously freed entries in the corresponding ->word.
335 */
336static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
337{
338 unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
339
340 set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
341}
342
313static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb, 343static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
314 unsigned int bitnr) 344 unsigned int bitnr)
315{ 345{
@@ -321,8 +351,6 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
321 return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); 351 return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
322} 352}
323 353
324unsigned int sbitmap_weight(const struct sbitmap *sb);
325
326/** 354/**
327 * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. 355 * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
328 * @sb: Bitmap to show. 356 * @sb: Bitmap to show.
@@ -531,4 +559,45 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
531 */ 559 */
532void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); 560void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
533 561
562struct sbq_wait {
563 struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */
564 struct wait_queue_entry wait;
565};
566
567#define DEFINE_SBQ_WAIT(name) \
568 struct sbq_wait name = { \
569 .sbq = NULL, \
570 .wait = { \
571 .private = current, \
572 .func = autoremove_wake_function, \
573 .entry = LIST_HEAD_INIT((name).wait.entry), \
574 } \
575 }
576
577/*
578 * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
579 * internal state.
580 */
581void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
582 struct sbq_wait_state *ws,
583 struct sbq_wait *sbq_wait, int state);
584
585/*
586 * Must be paired with sbitmap_prepare_to_wait().
587 */
588void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
589 struct sbq_wait *sbq_wait);
590
591/*
592 * Wrapper around add_wait_queue(), which maintains some extra internal state
593 */
594void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
595 struct sbq_wait_state *ws,
596 struct sbq_wait *sbq_wait);
597
598/*
599 * Must be paired with sbitmap_add_wait_queue()
600 */
601void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait);
602
534#endif /* __LINUX_SCALE_BITMAP_H */ 603#endif /* __LINUX_SCALE_BITMAP_H */
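
The sbitmap.h diff adds a per-word deferred ->cleared mask plus the sbq_wait wrappers that keep ->ws_active accurate. A minimal, hypothetical wait loop using the new helpers is sketched below; it follows the general shape of tag waiting but is simplified and not copied from blk-mq:

#include <linux/sbitmap.h>
#include <linux/sched.h>

static atomic_t example_wait_index = ATOMIC_INIT(0);

/* Block until a bit is available in @sbq, with accounted waitqueue handling. */
static int example_get_bit(struct sbitmap_queue *sbq)
{
	struct sbq_wait_state *ws = sbq_wait_ptr(sbq, &example_wait_index);
	DEFINE_SBQ_WAIT(wait);
	int nr;

	do {
		nr = __sbitmap_queue_get(sbq);
		if (nr >= 0)
			break;
		sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_UNINTERRUPTIBLE);
		nr = __sbitmap_queue_get(sbq);
		if (nr >= 0)
			break;
		io_schedule();
	} while (1);

	sbitmap_finish_wait(sbq, ws, &wait);	/* safe even if we never slept */
	return nr;
}
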
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2a57a365c711..93f56fddd92a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3339,6 +3339,9 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
3339} 3339}
3340int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, 3340int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
3341 struct msghdr *msg); 3341 struct msghdr *msg);
3342int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
3343 struct iov_iter *to, int len,
3344 struct ahash_request *hash);
3342int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, 3345int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
3343 struct iov_iter *from, int len); 3346 struct iov_iter *from, int len);
3344int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); 3347int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 55ce99ddb912..ecf584f6b82d 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -11,6 +11,7 @@
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/thread_info.h> 13#include <linux/thread_info.h>
14#include <crypto/hash.h>
14#include <uapi/linux/uio.h> 15#include <uapi/linux/uio.h>
15 16
16struct page; 17struct page;
@@ -266,9 +267,11 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
266{ 267{
267 i->count = count; 268 i->count = count;
268} 269}
269size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); 270size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i);
270size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); 271size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
271bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); 272bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
273size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
274 struct iov_iter *i);
272 275
273int import_iovec(int type, const struct iovec __user * uvector, 276int import_iovec(int type, const struct iovec __user * uvector,
274 unsigned nr_segs, unsigned fast_segs, 277 unsigned nr_segs, unsigned fast_segs,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fdfd04e348f6..738a0c24874f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -246,7 +246,8 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
246 * 246 *
247 * @bio is a part of the writeback in progress controlled by @wbc. Perform 247 * @bio is a part of the writeback in progress controlled by @wbc. Perform
248 * writeback specific initialization. This is used to apply the cgroup 248 * writeback specific initialization. This is used to apply the cgroup
249 * writeback context. 249 * writeback context. Must be called after the bio has been associated with
250 * a device.
250 */ 251 */
251static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) 252static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
252{ 253{
@@ -257,7 +258,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
257 * regular writeback instead of writing things out itself. 258 * regular writeback instead of writing things out itself.
258 */ 259 */
259 if (wbc->wb) 260 if (wbc->wb)
260 bio_associate_blkcg(bio, wbc->wb->blkcg_css); 261 bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
261} 262}
262 263
263#else /* CONFIG_CGROUP_WRITEBACK */ 264#else /* CONFIG_CGROUP_WRITEBACK */
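
The comment update above tightens the wbc_init_bio() contract: since the bio is now associated with a blkg rather than only a blkcg css, the bio has to know its device first. A purely illustrative sketch of the resulting call order in a writeback submission path:

#include <linux/bio.h>
#include <linux/writeback.h>

static struct bio *example_alloc_wb_bio(struct writeback_control *wbc,
					struct block_device *bdev,
					sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);

	bio_set_dev(bio, bdev);			/* must come first ...               */
	bio->bi_iter.bi_sector = sector;
	wbc_init_bio(wbc, bio);			/* ... so the blkg association works */
	return bio;
}
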
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index c891ada3c5c2..d85e6befa26b 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -61,6 +61,9 @@ struct scsi_pointer {
61/* flags preserved across unprep / reprep */ 61/* flags preserved across unprep / reprep */
62#define SCMD_PRESERVED_FLAGS (SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED) 62#define SCMD_PRESERVED_FLAGS (SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED)
63 63
64/* for scmd->state */
65#define SCMD_STATE_COMPLETE 0
66
64struct scsi_cmnd { 67struct scsi_cmnd {
65 struct scsi_request req; 68 struct scsi_request req;
66 struct scsi_device *device; 69 struct scsi_device *device;
@@ -145,6 +148,7 @@ struct scsi_cmnd {
145 148
146 int result; /* Status code from lower level driver */ 149 int result; /* Status code from lower level driver */
147 int flags; /* Command flags */ 150 int flags; /* Command flags */
151 unsigned long state; /* Command completion state */
148 152
149 unsigned char tag; /* SCSI-II queued command tag */ 153 unsigned char tag; /* SCSI-II queued command tag */
150}; 154};
@@ -171,7 +175,7 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
171 size_t *offset, size_t *len); 175 size_t *offset, size_t *len);
172extern void scsi_kunmap_atomic_sg(void *virt); 176extern void scsi_kunmap_atomic_sg(void *virt);
173 177
174extern int scsi_init_io(struct scsi_cmnd *cmd); 178extern blk_status_t scsi_init_io(struct scsi_cmnd *cmd);
175 179
176#ifdef CONFIG_SCSI_DMA 180#ifdef CONFIG_SCSI_DMA
177extern int scsi_dma_map(struct scsi_cmnd *cmd); 181extern int scsi_dma_map(struct scsi_cmnd *cmd);
diff --git a/include/scsi/scsi_dh.h b/include/scsi/scsi_dh.h
index c7bba2b24849..a862dc23c68d 100644
--- a/include/scsi/scsi_dh.h
+++ b/include/scsi/scsi_dh.h
@@ -69,7 +69,7 @@ struct scsi_device_handler {
69 int (*attach)(struct scsi_device *); 69 int (*attach)(struct scsi_device *);
70 void (*detach)(struct scsi_device *); 70 void (*detach)(struct scsi_device *);
71 int (*activate)(struct scsi_device *, activate_complete, void *); 71 int (*activate)(struct scsi_device *, activate_complete, void *);
72 int (*prep_fn)(struct scsi_device *, struct request *); 72 blk_status_t (*prep_fn)(struct scsi_device *, struct request *);
73 int (*set_params)(struct scsi_device *, const char *); 73 int (*set_params)(struct scsi_device *, const char *);
74 void (*rescan)(struct scsi_device *); 74 void (*rescan)(struct scsi_device *);
75}; 75};
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index fae8b465233e..6dffa8555a39 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -2,6 +2,7 @@
2#ifndef _SCSI_SCSI_DRIVER_H 2#ifndef _SCSI_SCSI_DRIVER_H
3#define _SCSI_SCSI_DRIVER_H 3#define _SCSI_SCSI_DRIVER_H
4 4
5#include <linux/blk_types.h>
5#include <linux/device.h> 6#include <linux/device.h>
6 7
7struct module; 8struct module;
@@ -13,7 +14,7 @@ struct scsi_driver {
13 struct device_driver gendrv; 14 struct device_driver gendrv;
14 15
15 void (*rescan)(struct device *); 16 void (*rescan)(struct device *);
16 int (*init_command)(struct scsi_cmnd *); 17 blk_status_t (*init_command)(struct scsi_cmnd *);
17 void (*uninit_command)(struct scsi_cmnd *); 18 void (*uninit_command)(struct scsi_cmnd *);
18 int (*done)(struct scsi_cmnd *); 19 int (*done)(struct scsi_cmnd *);
19 int (*eh_action)(struct scsi_cmnd *, int); 20 int (*eh_action)(struct scsi_cmnd *, int);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 5ea06d310a25..aa760df8c6b3 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -11,7 +11,6 @@
11#include <linux/blk-mq.h> 11#include <linux/blk-mq.h>
12#include <scsi/scsi.h> 12#include <scsi/scsi.h>
13 13
14struct request_queue;
15struct block_device; 14struct block_device;
16struct completion; 15struct completion;
17struct module; 16struct module;
@@ -22,7 +21,6 @@ struct scsi_target;
22struct Scsi_Host; 21struct Scsi_Host;
23struct scsi_host_cmd_pool; 22struct scsi_host_cmd_pool;
24struct scsi_transport_template; 23struct scsi_transport_template;
25struct blk_queue_tags;
26 24
27 25
28/* 26/*
@@ -547,14 +545,8 @@ struct Scsi_Host {
547 struct scsi_host_template *hostt; 545 struct scsi_host_template *hostt;
548 struct scsi_transport_template *transportt; 546 struct scsi_transport_template *transportt;
549 547
550 /* 548 /* Area to keep a shared tag map */
551 * Area to keep a shared tag map (if needed, will be 549 struct blk_mq_tag_set tag_set;
552 * NULL if not).
553 */
554 union {
555 struct blk_queue_tag *bqt;
556 struct blk_mq_tag_set tag_set;
557 };
558 550
559 atomic_t host_busy; /* commands actually active on low-level */ 551 atomic_t host_busy; /* commands actually active on low-level */
560 atomic_t host_blocked; 552 atomic_t host_blocked;
@@ -648,7 +640,6 @@ struct Scsi_Host {
648 /* The controller does not support WRITE SAME */ 640 /* The controller does not support WRITE SAME */
649 unsigned no_write_same:1; 641 unsigned no_write_same:1;
650 642
651 unsigned use_blk_mq:1;
652 unsigned use_cmd_list:1; 643 unsigned use_cmd_list:1;
653 644
654 /* Host responded with short (<36 bytes) INQUIRY result */ 645 /* Host responded with short (<36 bytes) INQUIRY result */
@@ -742,11 +733,6 @@ static inline int scsi_host_in_recovery(struct Scsi_Host *shost)
742 shost->tmf_in_progress; 733 shost->tmf_in_progress;
743} 734}
744 735
745static inline bool shost_use_blk_mq(struct Scsi_Host *shost)
746{
747 return shost->use_blk_mq;
748}
749
750extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); 736extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
751extern void scsi_flush_work(struct Scsi_Host *); 737extern void scsi_flush_work(struct Scsi_Host *);
752 738
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index e192a0caa850..6053d46e794e 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -23,19 +23,15 @@ static inline struct scsi_cmnd *scsi_host_find_tag(struct Scsi_Host *shost,
23 int tag) 23 int tag)
24{ 24{
25 struct request *req = NULL; 25 struct request *req = NULL;
26 u16 hwq;
26 27
27 if (tag == SCSI_NO_TAG) 28 if (tag == SCSI_NO_TAG)
28 return NULL; 29 return NULL;
29 30
30 if (shost_use_blk_mq(shost)) { 31 hwq = blk_mq_unique_tag_to_hwq(tag);
31 u16 hwq = blk_mq_unique_tag_to_hwq(tag); 32 if (hwq < shost->tag_set.nr_hw_queues) {
32 33 req = blk_mq_tag_to_rq(shost->tag_set.tags[hwq],
33 if (hwq < shost->tag_set.nr_hw_queues) { 34 blk_mq_unique_tag_to_tag(tag));
34 req = blk_mq_tag_to_rq(shost->tag_set.tags[hwq],
35 blk_mq_unique_tag_to_tag(tag));
36 }
37 } else {
38 req = blk_map_queue_find_tag(shost->bqt, tag);
39 } 35 }
40 36
41 if (!req) 37 if (!req)
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 2cbd6e42ad83..e4526f85c19d 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -221,9 +221,30 @@ DEFINE_EVENT(cache_set, bcache_journal_entry_full,
221 TP_ARGS(c) 221 TP_ARGS(c)
222); 222);
223 223
224DEFINE_EVENT(bcache_bio, bcache_journal_write, 224TRACE_EVENT(bcache_journal_write,
225 TP_PROTO(struct bio *bio), 225 TP_PROTO(struct bio *bio, u32 keys),
226 TP_ARGS(bio) 226 TP_ARGS(bio, keys),
227
228 TP_STRUCT__entry(
229 __field(dev_t, dev )
230 __field(sector_t, sector )
231 __field(unsigned int, nr_sector )
232 __array(char, rwbs, 6 )
233 __field(u32, nr_keys )
234 ),
235
236 TP_fast_assign(
237 __entry->dev = bio_dev(bio);
238 __entry->sector = bio->bi_iter.bi_sector;
239 __entry->nr_sector = bio->bi_iter.bi_size >> 9;
240 __entry->nr_keys = keys;
241 blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
242 ),
243
244 TP_printk("%d,%d %s %llu + %u keys %u",
245 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
246 (unsigned long long)__entry->sector, __entry->nr_sector,
247 __entry->nr_keys)
227); 248);
228 249
229/* Btree */ 250/* Btree */
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index ce43d340f010..8387e0af0f76 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -50,6 +50,8 @@ enum {
50 * 50 *
51 * IOCB_FLAG_RESFD - Set if the "aio_resfd" member of the "struct iocb" 51 * IOCB_FLAG_RESFD - Set if the "aio_resfd" member of the "struct iocb"
52 * is valid. 52 * is valid.
53 * IOCB_FLAG_IOPRIO - Set if the "aio_reqprio" member of the "struct iocb"
54 * is valid.
53 */ 55 */
54#define IOCB_FLAG_RESFD (1 << 0) 56#define IOCB_FLAG_RESFD (1 << 0)
55#define IOCB_FLAG_IOPRIO (1 << 1) 57#define IOCB_FLAG_IOPRIO (1 << 1)
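
The uapi comment above documents IOCB_FLAG_IOPRIO next to IOCB_FLAG_RESFD. A userspace-side sketch (hypothetical helper, error handling omitted) of an io_submit() caller marking aio_reqprio as valid; the priority value is expected in the kernel's IOPRIO_PRIO_VALUE() encoding:

#include <linux/aio_abi.h>
#include <string.h>

static void example_prep_iocb(struct iocb *cb, int fd, void *buf,
			      size_t len, int ioprio)
{
	memset(cb, 0, sizeof(*cb));
	cb->aio_lio_opcode = IOCB_CMD_PREAD;
	cb->aio_fildes = fd;
	cb->aio_buf = (unsigned long)buf;
	cb->aio_nbytes = len;
	cb->aio_flags = IOCB_FLAG_IOPRIO;	/* aio_reqprio below is valid */
	cb->aio_reqprio = ioprio;		/* IOPRIO_PRIO_VALUE()-style encoding */
}
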
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index d1a5d885ce13..73e02ea5d5d1 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -53,9 +53,6 @@ static void __init handle_initrd(void)
53 ksys_mkdir("/old", 0700); 53 ksys_mkdir("/old", 0700);
54 ksys_chdir("/old"); 54 ksys_chdir("/old");
55 55
56 /* try loading default modules from initrd */
57 load_default_modules();
58
59 /* 56 /*
60 * In case that a resume from disk is carried out by linuxrc or one of 57 * In case that a resume from disk is carried out by linuxrc or one of
61 * its children, we need to tell the freezer not to wait for us. 58 * its children, we need to tell the freezer not to wait for us.
diff --git a/init/initramfs.c b/init/initramfs.c
index f6f4a1e4cd54..fca899622937 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -646,12 +646,6 @@ static int __init populate_rootfs(void)
646#endif 646#endif
647 } 647 }
648 flush_delayed_fput(); 648 flush_delayed_fput();
649 /*
650 * Try loading default modules from initramfs. This gives
651 * us a chance to load before device_initcalls.
652 */
653 load_default_modules();
654
655 return 0; 649 return 0;
656} 650}
657rootfs_initcall(populate_rootfs); 651rootfs_initcall(populate_rootfs);
diff --git a/init/main.c b/init/main.c
index 954d9b6c62c6..0f8cc626e634 100644
--- a/init/main.c
+++ b/init/main.c
@@ -992,17 +992,6 @@ static void __init do_pre_smp_initcalls(void)
992 do_one_initcall(initcall_from_entry(fn)); 992 do_one_initcall(initcall_from_entry(fn));
993} 993}
994 994
995/*
996 * This function requests modules which should be loaded by default and is
997 * called twice right after initrd is mounted and right before init is
998 * exec'd. If such modules are on either initrd or rootfs, they will be
999 * loaded before control is passed to userland.
1000 */
1001void __init load_default_modules(void)
1002{
1003 load_default_elevator_module();
1004}
1005
1006static int run_init_process(const char *init_filename) 995static int run_init_process(const char *init_filename)
1007{ 996{
1008 argv_init[0] = init_filename; 997 argv_init[0] = init_filename;
@@ -1176,5 +1165,4 @@ static noinline void __init kernel_init_freeable(void)
1176 */ 1165 */
1177 1166
1178 integrity_load_keys(); 1167 integrity_load_keys();
1179 load_default_modules();
1180} 1168}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7a8429f8e280..39eb36ba36ad 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -493,7 +493,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
493} 493}
494 494
495/** 495/**
496 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem 496 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
497 * @cgrp: the cgroup of interest 497 * @cgrp: the cgroup of interest
498 * @ss: the subsystem of interest (%NULL returns @cgrp->self) 498 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
499 * 499 *
@@ -502,8 +502,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
502 * enabled. If @ss is associated with the hierarchy @cgrp is on, this 502 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
503 * function is guaranteed to return non-NULL css. 503 * function is guaranteed to return non-NULL css.
504 */ 504 */
505static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, 505static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
506 struct cgroup_subsys *ss) 506 struct cgroup_subsys *ss)
507{ 507{
508 lockdep_assert_held(&cgroup_mutex); 508 lockdep_assert_held(&cgroup_mutex);
509 509
@@ -524,6 +524,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
524} 524}
525 525
526/** 526/**
527 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
528 * @cgrp: the cgroup of interest
529 * @ss: the subsystem of interest
530 *
531 * Find and get the effective css of @cgrp for @ss. The effective css is
532 * defined as the matching css of the nearest ancestor including self which
533 * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
534 * the root css is returned, so this function always returns a valid css.
535 *
536 * The returned css is not guaranteed to be online, and therefore it is the
 537 * caller's responsibility to tryget a reference for it.
538 */
539struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
540 struct cgroup_subsys *ss)
541{
542 struct cgroup_subsys_state *css;
543
544 do {
545 css = cgroup_css(cgrp, ss);
546
547 if (css)
548 return css;
549 cgrp = cgroup_parent(cgrp);
550 } while (cgrp);
551
552 return init_css_set.subsys[ss->id];
553}
554
555/**
527 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem 556 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
528 * @cgrp: the cgroup of interest 557 * @cgrp: the cgroup of interest
529 * @ss: the subsystem of interest 558 * @ss: the subsystem of interest
@@ -605,10 +634,11 @@ EXPORT_SYMBOL_GPL(of_css);
605 * 634 *
606 * Should be called under cgroup_[tree_]mutex. 635 * Should be called under cgroup_[tree_]mutex.
607 */ 636 */
608#define for_each_e_css(css, ssid, cgrp) \ 637#define for_each_e_css(css, ssid, cgrp) \
609 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 638 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
610 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ 639 if (!((css) = cgroup_e_css_by_mask(cgrp, \
611 ; \ 640 cgroup_subsys[(ssid)]))) \
641 ; \
612 else 642 else
613 643
614/** 644/**
@@ -1007,7 +1037,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
1007 * @ss is in this hierarchy, so we want the 1037 * @ss is in this hierarchy, so we want the
1008 * effective css from @cgrp. 1038 * effective css from @cgrp.
1009 */ 1039 */
1010 template[i] = cgroup_e_css(cgrp, ss); 1040 template[i] = cgroup_e_css_by_mask(cgrp, ss);
1011 } else { 1041 } else {
1012 /* 1042 /*
1013 * @ss is not in this hierarchy, so we don't want 1043 * @ss is not in this hierarchy, so we don't want
@@ -3024,7 +3054,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
3024 return ret; 3054 return ret;
3025 3055
3026 /* 3056 /*
3027 * At this point, cgroup_e_css() results reflect the new csses 3057 * At this point, cgroup_e_css_by_mask() results reflect the new csses
3028 * making the following cgroup_update_dfl_csses() properly update 3058 * making the following cgroup_update_dfl_csses() properly update
3029 * css associations of all tasks in the subtree. 3059 * css associations of all tasks in the subtree.
3030 */ 3060 */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2868d85f1fb1..fac0ddf8a8e2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
764 if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) 764 if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
765 return NULL; 765 return NULL;
766 766
767 if (!bio->bi_css) 767 if (!bio->bi_blkg)
768 return NULL; 768 return NULL;
769 return cgroup_get_kernfs_id(bio->bi_css->cgroup); 769 return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup);
770} 770}
771#else 771#else
772static union kernfs_node_id * 772static union kernfs_node_id *
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 54c248526b55..1928009f506e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -6,6 +6,7 @@
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
7#include <linux/splice.h> 7#include <linux/splice.h>
8#include <net/checksum.h> 8#include <net/checksum.h>
9#include <linux/scatterlist.h>
9 10
10#define PIPE_PARANOIA /* for now */ 11#define PIPE_PARANOIA /* for now */
11 12
@@ -1464,10 +1465,11 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
1464} 1465}
1465EXPORT_SYMBOL(csum_and_copy_from_iter_full); 1466EXPORT_SYMBOL(csum_and_copy_from_iter_full);
1466 1467
1467size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, 1468size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
1468 struct iov_iter *i) 1469 struct iov_iter *i)
1469{ 1470{
1470 const char *from = addr; 1471 const char *from = addr;
1472 __wsum *csum = csump;
1471 __wsum sum, next; 1473 __wsum sum, next;
1472 size_t off = 0; 1474 size_t off = 0;
1473 1475
@@ -1510,6 +1512,21 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
1510} 1512}
1511EXPORT_SYMBOL(csum_and_copy_to_iter); 1513EXPORT_SYMBOL(csum_and_copy_to_iter);
1512 1514
1515size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1516 struct iov_iter *i)
1517{
1518 struct ahash_request *hash = hashp;
1519 struct scatterlist sg;
1520 size_t copied;
1521
1522 copied = copy_to_iter(addr, bytes, i);
1523 sg_init_one(&sg, addr, copied);
1524 ahash_request_set_crypt(hash, &sg, NULL, copied);
1525 crypto_ahash_update(hash);
1526 return copied;
1527}
1528EXPORT_SYMBOL(hash_and_copy_to_iter);
1529
1513int iov_iter_npages(const struct iov_iter *i, int maxpages) 1530int iov_iter_npages(const struct iov_iter *i, int maxpages)
1514{ 1531{
1515 size_t size = i->count; 1532 size_t size = i->count;
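
hash_and_copy_to_iter() above copies to the iterator and folds the copied bytes into an ahash request in one pass, which is what an NVMe/TCP-style data digest needs. A hedged usage sketch, assuming the caller has already allocated a synchronous (shash-backed) ahash transform and request, for example crc32c, elsewhere:

#include <crypto/hash.h>
#include <linux/uio.h>

static size_t example_copy_with_digest(struct ahash_request *req,
				       const void *buf, size_t len,
				       struct iov_iter *to)
{
	size_t copied;

	crypto_ahash_init(req);				/* start a fresh digest */
	copied = hash_and_copy_to_iter(buf, len, req, to);
	/* caller runs crypto_ahash_final() once all payload has been copied */
	return copied;
}
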
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index fdd1b8aa8ac6..65c2d06250a6 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -20,6 +20,47 @@
20#include <linux/sbitmap.h> 20#include <linux/sbitmap.h>
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22 22
23/*
24 * See if we have deferred clears that we can batch move
25 */
26static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
27{
28 unsigned long mask, val;
29 unsigned long __maybe_unused flags;
30 bool ret = false;
31
32 /* Silence bogus lockdep warning */
33#if defined(CONFIG_LOCKDEP)
34 local_irq_save(flags);
35#endif
36 spin_lock(&sb->map[index].swap_lock);
37
38 if (!sb->map[index].cleared)
39 goto out_unlock;
40
41 /*
42 * First get a stable cleared mask, setting the old mask to 0.
43 */
44 do {
45 mask = sb->map[index].cleared;
46 } while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
47
48 /*
49 * Now clear the masked bits in our free word
50 */
51 do {
52 val = sb->map[index].word;
53 } while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
54
55 ret = true;
56out_unlock:
57 spin_unlock(&sb->map[index].swap_lock);
58#if defined(CONFIG_LOCKDEP)
59 local_irq_restore(flags);
60#endif
61 return ret;
62}
63
23int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, 64int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
24 gfp_t flags, int node) 65 gfp_t flags, int node)
25{ 66{
@@ -59,6 +100,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
59 for (i = 0; i < sb->map_nr; i++) { 100 for (i = 0; i < sb->map_nr; i++) {
60 sb->map[i].depth = min(depth, bits_per_word); 101 sb->map[i].depth = min(depth, bits_per_word);
61 depth -= sb->map[i].depth; 102 depth -= sb->map[i].depth;
103 spin_lock_init(&sb->map[i].swap_lock);
62 } 104 }
63 return 0; 105 return 0;
64} 106}
@@ -69,6 +111,9 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
69 unsigned int bits_per_word = 1U << sb->shift; 111 unsigned int bits_per_word = 1U << sb->shift;
70 unsigned int i; 112 unsigned int i;
71 113
114 for (i = 0; i < sb->map_nr; i++)
115 sbitmap_deferred_clear(sb, i);
116
72 sb->depth = depth; 117 sb->depth = depth;
73 sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); 118 sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
74 119
@@ -111,6 +156,24 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
111 return nr; 156 return nr;
112} 157}
113 158
159static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
160 unsigned int alloc_hint, bool round_robin)
161{
162 int nr;
163
164 do {
165 nr = __sbitmap_get_word(&sb->map[index].word,
166 sb->map[index].depth, alloc_hint,
167 !round_robin);
168 if (nr != -1)
169 break;
170 if (!sbitmap_deferred_clear(sb, index))
171 break;
172 } while (1);
173
174 return nr;
175}
176
114int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) 177int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
115{ 178{
116 unsigned int i, index; 179 unsigned int i, index;
@@ -118,24 +181,28 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
118 181
119 index = SB_NR_TO_INDEX(sb, alloc_hint); 182 index = SB_NR_TO_INDEX(sb, alloc_hint);
120 183
184 /*
185 * Unless we're doing round robin tag allocation, just use the
186 * alloc_hint to find the right word index. No point in looping
187 * twice in find_next_zero_bit() for that case.
188 */
189 if (round_robin)
190 alloc_hint = SB_NR_TO_BIT(sb, alloc_hint);
191 else
192 alloc_hint = 0;
193
121 for (i = 0; i < sb->map_nr; i++) { 194 for (i = 0; i < sb->map_nr; i++) {
122 nr = __sbitmap_get_word(&sb->map[index].word, 195 nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
123 sb->map[index].depth, 196 round_robin);
124 SB_NR_TO_BIT(sb, alloc_hint),
125 !round_robin);
126 if (nr != -1) { 197 if (nr != -1) {
127 nr += index << sb->shift; 198 nr += index << sb->shift;
128 break; 199 break;
129 } 200 }
130 201
131 /* Jump to next index. */ 202 /* Jump to next index. */
132 index++; 203 alloc_hint = 0;
133 alloc_hint = index << sb->shift; 204 if (++index >= sb->map_nr)
134
135 if (index >= sb->map_nr) {
136 index = 0; 205 index = 0;
137 alloc_hint = 0;
138 }
139 } 206 }
140 207
141 return nr; 208 return nr;
@@ -151,6 +218,7 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
151 index = SB_NR_TO_INDEX(sb, alloc_hint); 218 index = SB_NR_TO_INDEX(sb, alloc_hint);
152 219
153 for (i = 0; i < sb->map_nr; i++) { 220 for (i = 0; i < sb->map_nr; i++) {
221again:
154 nr = __sbitmap_get_word(&sb->map[index].word, 222 nr = __sbitmap_get_word(&sb->map[index].word,
155 min(sb->map[index].depth, shallow_depth), 223 min(sb->map[index].depth, shallow_depth),
156 SB_NR_TO_BIT(sb, alloc_hint), true); 224 SB_NR_TO_BIT(sb, alloc_hint), true);
@@ -159,6 +227,9 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
159 break; 227 break;
160 } 228 }
161 229
230 if (sbitmap_deferred_clear(sb, index))
231 goto again;
232
162 /* Jump to next index. */ 233 /* Jump to next index. */
163 index++; 234 index++;
164 alloc_hint = index << sb->shift; 235 alloc_hint = index << sb->shift;
@@ -178,7 +249,7 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb)
178 unsigned int i; 249 unsigned int i;
179 250
180 for (i = 0; i < sb->map_nr; i++) { 251 for (i = 0; i < sb->map_nr; i++) {
181 if (sb->map[i].word) 252 if (sb->map[i].word & ~sb->map[i].cleared)
182 return true; 253 return true;
183 } 254 }
184 return false; 255 return false;
@@ -191,9 +262,10 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
191 262
192 for (i = 0; i < sb->map_nr; i++) { 263 for (i = 0; i < sb->map_nr; i++) {
193 const struct sbitmap_word *word = &sb->map[i]; 264 const struct sbitmap_word *word = &sb->map[i];
265 unsigned long mask = word->word & ~word->cleared;
194 unsigned long ret; 266 unsigned long ret;
195 267
196 ret = find_first_zero_bit(&word->word, word->depth); 268 ret = find_first_zero_bit(&mask, word->depth);
197 if (ret < word->depth) 269 if (ret < word->depth)
198 return true; 270 return true;
199 } 271 }
@@ -201,23 +273,36 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
201} 273}
202EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); 274EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
203 275
204unsigned int sbitmap_weight(const struct sbitmap *sb) 276static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
205{ 277{
206 unsigned int i, weight = 0; 278 unsigned int i, weight = 0;
207 279
208 for (i = 0; i < sb->map_nr; i++) { 280 for (i = 0; i < sb->map_nr; i++) {
209 const struct sbitmap_word *word = &sb->map[i]; 281 const struct sbitmap_word *word = &sb->map[i];
210 282
211 weight += bitmap_weight(&word->word, word->depth); 283 if (set)
284 weight += bitmap_weight(&word->word, word->depth);
285 else
286 weight += bitmap_weight(&word->cleared, word->depth);
212 } 287 }
213 return weight; 288 return weight;
214} 289}
215EXPORT_SYMBOL_GPL(sbitmap_weight); 290
291static unsigned int sbitmap_weight(const struct sbitmap *sb)
292{
293 return __sbitmap_weight(sb, true);
294}
295
296static unsigned int sbitmap_cleared(const struct sbitmap *sb)
297{
298 return __sbitmap_weight(sb, false);
299}
216 300
217void sbitmap_show(struct sbitmap *sb, struct seq_file *m) 301void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
218{ 302{
219 seq_printf(m, "depth=%u\n", sb->depth); 303 seq_printf(m, "depth=%u\n", sb->depth);
220 seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); 304 seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
305 seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
221 seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); 306 seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
222 seq_printf(m, "map_nr=%u\n", sb->map_nr); 307 seq_printf(m, "map_nr=%u\n", sb->map_nr);
223} 308}
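The hunks above stop trusting ->word on its own: sbitmap_any_bit_set() and sbitmap_any_bit_clear() now mask out ->cleared, and sbitmap_show() reports busy as weight(word) minus weight(cleared). A minimal sketch of that "live bits" accounting for a single word, using only the fields visible in this diff (the helper name is made up for illustration):

/*
 * Illustration only: bits set in ->word but already recorded in ->cleared
 * are effectively free again, they just have not been folded back yet.
 * This mirrors the word & ~cleared masking in sbitmap_any_bit_set() and
 * the busy = weight(word) - weight(cleared) arithmetic in sbitmap_show().
 */
static unsigned int sbitmap_word_live_weight(const struct sbitmap_word *map)
{
        unsigned long live = map->word & ~map->cleared;

        return bitmap_weight(&live, map->depth);
}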
@@ -325,6 +410,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
325 sbq->min_shallow_depth = UINT_MAX; 410 sbq->min_shallow_depth = UINT_MAX;
326 sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); 411 sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
327 atomic_set(&sbq->wake_index, 0); 412 atomic_set(&sbq->wake_index, 0);
413 atomic_set(&sbq->ws_active, 0);
328 414
329 sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); 415 sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
330 if (!sbq->ws) { 416 if (!sbq->ws) {
@@ -440,6 +526,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
440{ 526{
441 int i, wake_index; 527 int i, wake_index;
442 528
529 if (!atomic_read(&sbq->ws_active))
530 return NULL;
531
443 wake_index = atomic_read(&sbq->wake_index); 532 wake_index = atomic_read(&sbq->wake_index);
444 for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 533 for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
445 struct sbq_wait_state *ws = &sbq->ws[wake_index]; 534 struct sbq_wait_state *ws = &sbq->ws[wake_index];
@@ -509,7 +598,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
509void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, 598void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
510 unsigned int cpu) 599 unsigned int cpu)
511{ 600{
512 sbitmap_clear_bit_unlock(&sbq->sb, nr); 601 sbitmap_deferred_clear_bit(&sbq->sb, nr);
602
513 /* 603 /*
514 * Pairs with the memory barrier in set_current_state() to ensure the 604 * Pairs with the memory barrier in set_current_state() to ensure the
515 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker 605 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
@@ -564,6 +654,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
564 654
565 seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); 655 seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
566 seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); 656 seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
657 seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));
567 658
568 seq_puts(m, "ws={\n"); 659 seq_puts(m, "ws={\n");
569 for (i = 0; i < SBQ_WAIT_QUEUES; i++) { 660 for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
@@ -579,3 +670,48 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
579 seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); 670 seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
580} 671}
581EXPORT_SYMBOL_GPL(sbitmap_queue_show); 672EXPORT_SYMBOL_GPL(sbitmap_queue_show);
673
674void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
675 struct sbq_wait_state *ws,
676 struct sbq_wait *sbq_wait)
677{
678 if (!sbq_wait->sbq) {
679 sbq_wait->sbq = sbq;
680 atomic_inc(&sbq->ws_active);
681 }
682 add_wait_queue(&ws->wait, &sbq_wait->wait);
683}
684EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue);
685
686void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait)
687{
688 list_del_init(&sbq_wait->wait.entry);
689 if (sbq_wait->sbq) {
690 atomic_dec(&sbq_wait->sbq->ws_active);
691 sbq_wait->sbq = NULL;
692 }
693}
694EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue);
695
696void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
697 struct sbq_wait_state *ws,
698 struct sbq_wait *sbq_wait, int state)
699{
700 if (!sbq_wait->sbq) {
701 atomic_inc(&sbq->ws_active);
702 sbq_wait->sbq = sbq;
703 }
704 prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
705}
706EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);
707
708void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
709 struct sbq_wait *sbq_wait)
710{
711 finish_wait(&ws->wait, &sbq_wait->wait);
712 if (sbq_wait->sbq) {
713 atomic_dec(&sbq->ws_active);
714 sbq_wait->sbq = NULL;
715 }
716}
717EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
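The new wait helpers keep sbq->ws_active in step with the number of parked waiters, which is what lets sbq_wake_ptr() bail out early above. A rough sketch of how a consumer might use them, assuming the DEFINE_SBQ_WAIT() initializer and __sbitmap_queue_get() from the sbitmap header; the my_get_bit() wrapper and the fixed choice of wait queue are illustrative only (in-tree users spread waiters across the SBQ_WAIT_QUEUES entries):

/* Hypothetical caller: sleep until a bit in the sbitmap_queue frees up. */
static int my_get_bit(struct sbitmap_queue *sbq)
{
        DEFINE_SBQ_WAIT(wait);
        struct sbq_wait_state *ws = &sbq->ws[0];  /* sketch: real callers pick a per-waiter queue */
        int nr;

        for (;;) {
                /* Bumps ws_active (once per waiter) so sbq_wake_ptr() scans the wait queues. */
                sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_UNINTERRUPTIBLE);
                nr = __sbitmap_queue_get(sbq);
                if (nr != -1)
                        break;
                io_schedule();
        }
        /* Drops ws_active again and clears wait.sbq. */
        sbitmap_finish_wait(sbq, ws, &wait);

        return nr;
}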
diff --git a/mm/page_io.c b/mm/page_io.c
index d4d1c89bcddd..3475733b1926 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -140,7 +140,7 @@ out:
140 unlock_page(page); 140 unlock_page(page);
141 WRITE_ONCE(bio->bi_private, NULL); 141 WRITE_ONCE(bio->bi_private, NULL);
142 bio_put(bio); 142 bio_put(bio);
143 wake_up_process(waiter); 143 blk_wake_io_task(waiter);
144 put_task_struct(waiter); 144 put_task_struct(waiter);
145} 145}
146 146
@@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
339 goto out; 339 goto out;
340 } 340 }
341 bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); 341 bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
342 bio_associate_blkcg_from_page(bio, page); 342 bio_associate_blkg_from_page(bio, page);
343 count_swpout_vm_event(page); 343 count_swpout_vm_event(page);
344 set_page_writeback(page); 344 set_page_writeback(page);
345 unlock_page(page); 345 unlock_page(page);
@@ -405,11 +405,12 @@ int swap_readpage(struct page *page, bool synchronous)
405 bio_get(bio); 405 bio_get(bio);
406 qc = submit_bio(bio); 406 qc = submit_bio(bio);
407 while (synchronous) { 407 while (synchronous) {
408 set_current_state(TASK_UNINTERRUPTIBLE); 408 __set_current_state(TASK_UNINTERRUPTIBLE);
409
409 if (!READ_ONCE(bio->bi_private)) 410 if (!READ_ONCE(bio->bi_private))
410 break; 411 break;
411 412
412 if (!blk_poll(disk->queue, qc)) 413 if (!blk_poll(disk->queue, qc, true))
413 break; 414 break;
414 } 415 }
415 __set_current_state(TASK_RUNNING); 416 __set_current_state(TASK_RUNNING);
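swap_readpage() now arms its wait with __set_current_state() and spins in blk_poll(..., true), while the swap read completion path calls blk_wake_io_task() instead of wake_up_process(). A minimal sketch of the behaviour such a wakeup helper has to provide for polled waiters; this is an illustration of the idea under an assumed name, not the in-tree definition (which lives in the block headers touched elsewhere in this merge):

/*
 * Illustration: a task polling for its own IO is already running, so the
 * completion side only needs to put its state back to TASK_RUNNING;
 * any other waiter still gets a real wakeup.
 */
static inline void wake_polled_io_task(struct task_struct *waiter)  /* assumed name */
{
        if (waiter == current)
                __set_current_state(TASK_RUNNING);
        else
                wake_up_process(waiter);
}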
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 4bf62b1afa3b..b2651bb6d2a3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -408,27 +408,20 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
408} 408}
409EXPORT_SYMBOL(skb_kill_datagram); 409EXPORT_SYMBOL(skb_kill_datagram);
410 410
411/** 411int __skb_datagram_iter(const struct sk_buff *skb, int offset,
412 * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. 412 struct iov_iter *to, int len, bool fault_short,
413 * @skb: buffer to copy 413 size_t (*cb)(const void *, size_t, void *, struct iov_iter *),
414 * @offset: offset in the buffer to start copying from 414 void *data)
415 * @to: iovec iterator to copy to
416 * @len: amount of data to copy from buffer to iovec
417 */
418int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
419 struct iov_iter *to, int len)
420{ 415{
421 int start = skb_headlen(skb); 416 int start = skb_headlen(skb);
422 int i, copy = start - offset, start_off = offset, n; 417 int i, copy = start - offset, start_off = offset, n;
423 struct sk_buff *frag_iter; 418 struct sk_buff *frag_iter;
424 419
425 trace_skb_copy_datagram_iovec(skb, len);
426
427 /* Copy header. */ 420 /* Copy header. */
428 if (copy > 0) { 421 if (copy > 0) {
429 if (copy > len) 422 if (copy > len)
430 copy = len; 423 copy = len;
431 n = copy_to_iter(skb->data + offset, copy, to); 424 n = cb(skb->data + offset, copy, data, to);
432 offset += n; 425 offset += n;
433 if (n != copy) 426 if (n != copy)
434 goto short_copy; 427 goto short_copy;
@@ -445,11 +438,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
445 438
446 end = start + skb_frag_size(frag); 439 end = start + skb_frag_size(frag);
447 if ((copy = end - offset) > 0) { 440 if ((copy = end - offset) > 0) {
441 struct page *page = skb_frag_page(frag);
442 u8 *vaddr = kmap(page);
443
448 if (copy > len) 444 if (copy > len)
449 copy = len; 445 copy = len;
450 n = copy_page_to_iter(skb_frag_page(frag), 446 n = cb(vaddr + frag->page_offset +
451 frag->page_offset + offset - 447 offset - start, copy, data, to);
452 start, copy, to); 448 kunmap(page);
453 offset += n; 449 offset += n;
454 if (n != copy) 450 if (n != copy)
455 goto short_copy; 451 goto short_copy;
@@ -468,8 +464,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
468 if ((copy = end - offset) > 0) { 464 if ((copy = end - offset) > 0) {
469 if (copy > len) 465 if (copy > len)
470 copy = len; 466 copy = len;
471 if (skb_copy_datagram_iter(frag_iter, offset - start, 467 if (__skb_datagram_iter(frag_iter, offset - start,
472 to, copy)) 468 to, copy, fault_short, cb, data))
473 goto fault; 469 goto fault;
474 if ((len -= copy) == 0) 470 if ((len -= copy) == 0)
475 return 0; 471 return 0;
@@ -490,11 +486,50 @@ fault:
490 return -EFAULT; 486 return -EFAULT;
491 487
492short_copy: 488short_copy:
493 if (iov_iter_count(to)) 489 if (fault_short || iov_iter_count(to))
494 goto fault; 490 goto fault;
495 491
496 return 0; 492 return 0;
497} 493}
494
495/**
496 * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
497 * and update a hash.
498 * @skb: buffer to copy
499 * @offset: offset in the buffer to start copying from
500 * @to: iovec iterator to copy to
501 * @len: amount of data to copy from buffer to iovec
502 * @hash: hash request to update
503 */
504int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
505 struct iov_iter *to, int len,
506 struct ahash_request *hash)
507{
508 return __skb_datagram_iter(skb, offset, to, len, true,
509 hash_and_copy_to_iter, hash);
510}
511EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
512
513static size_t simple_copy_to_iter(const void *addr, size_t bytes,
514 void *data __always_unused, struct iov_iter *i)
515{
516 return copy_to_iter(addr, bytes, i);
517}
518
519/**
520 * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
521 * @skb: buffer to copy
522 * @offset: offset in the buffer to start copying from
523 * @to: iovec iterator to copy to
524 * @len: amount of data to copy from buffer to iovec
525 */
526int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
527 struct iov_iter *to, int len)
528{
529 trace_skb_copy_datagram_iovec(skb, len);
530 return __skb_datagram_iter(skb, offset, to, len, false,
531 simple_copy_to_iter, NULL);
532}
498EXPORT_SYMBOL(skb_copy_datagram_iter); 533EXPORT_SYMBOL(skb_copy_datagram_iter);
499 534
500/** 535/**
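With the datagram copy loops folded into __skb_datagram_iter(), the plain copy, the checksumming copy, and the new skb_copy_and_hash_datagram_iter() differ only in the per-chunk callback they pass. A rough sketch of how a caller might drive the hash variant to copy data while folding it into a crc32c digest; the transform setup and the my_recv_and_digest() name are assumptions for illustration (modelled on an NVMe/TCP-style data digest), not taken from this diff:

#include <crypto/hash.h>
#include <linux/skbuff.h>
#include <linux/uio.h>

/* Hypothetical caller: copy 'len' bytes of 'skb' into 'to' and accumulate
 * a crc32c digest of the copied data via the helper added above. */
static int my_recv_and_digest(const struct sk_buff *skb, int offset,
                              struct iov_iter *to, int len, __le32 *digest)
{
        struct crypto_ahash *tfm;
        struct ahash_request *req;
        int ret;

        tfm = crypto_alloc_ahash("crc32c", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        req = ahash_request_alloc(tfm, GFP_KERNEL);
        if (!req) {
                crypto_free_ahash(tfm);
                return -ENOMEM;
        }
        ahash_request_set_callback(req, 0, NULL, NULL);

        ret = crypto_ahash_init(req);
        if (!ret)
                ret = skb_copy_and_hash_datagram_iter(skb, offset, to, len, req);
        if (!ret) {
                ahash_request_set_crypt(req, NULL, (u8 *)digest, 0);
                ret = crypto_ahash_final(req);
        }

        ahash_request_free(req);
        crypto_free_ahash(tfm);
        return ret;
}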
@@ -645,87 +680,21 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
645} 680}
646EXPORT_SYMBOL(zerocopy_sg_from_iter); 681EXPORT_SYMBOL(zerocopy_sg_from_iter);
647 682
683/**
684 * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator
685 * and update a checksum.
686 * @skb: buffer to copy
687 * @offset: offset in the buffer to start copying from
688 * @to: iovec iterator to copy to
689 * @len: amount of data to copy from buffer to iovec
690 * @csump: checksum pointer
691 */
648static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, 692static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
649 struct iov_iter *to, int len, 693 struct iov_iter *to, int len,
650 __wsum *csump) 694 __wsum *csump)
651{ 695{
652 int start = skb_headlen(skb); 696 return __skb_datagram_iter(skb, offset, to, len, true,
653 int i, copy = start - offset, start_off = offset; 697 csum_and_copy_to_iter, csump);
654 struct sk_buff *frag_iter;
655 int pos = 0;
656 int n;
657
658 /* Copy header. */
659 if (copy > 0) {
660 if (copy > len)
661 copy = len;
662 n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
663 offset += n;
664 if (n != copy)
665 goto fault;
666 if ((len -= copy) == 0)
667 return 0;
668 pos = copy;
669 }
670
671 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
672 int end;
673 const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
674
675 WARN_ON(start > offset + len);
676
677 end = start + skb_frag_size(frag);
678 if ((copy = end - offset) > 0) {
679 __wsum csum2 = 0;
680 struct page *page = skb_frag_page(frag);
681 u8 *vaddr = kmap(page);
682
683 if (copy > len)
684 copy = len;
685 n = csum_and_copy_to_iter(vaddr + frag->page_offset +
686 offset - start, copy,
687 &csum2, to);
688 kunmap(page);
689 offset += n;
690 if (n != copy)
691 goto fault;
692 *csump = csum_block_add(*csump, csum2, pos);
693 if (!(len -= copy))
694 return 0;
695 pos += copy;
696 }
697 start = end;
698 }
699
700 skb_walk_frags(skb, frag_iter) {
701 int end;
702
703 WARN_ON(start > offset + len);
704
705 end = start + frag_iter->len;
706 if ((copy = end - offset) > 0) {
707 __wsum csum2 = 0;
708 if (copy > len)
709 copy = len;
710 if (skb_copy_and_csum_datagram(frag_iter,
711 offset - start,
712 to, copy,
713 &csum2))
714 goto fault;
715 *csump = csum_block_add(*csump, csum2, pos);
716 if ((len -= copy) == 0)
717 return 0;
718 offset += copy;
719 pos += copy;
720 }
721 start = end;
722 }
723 if (!len)
724 return 0;
725
726fault:
727 iov_iter_revert(to, offset - start_off);
728 return -EFAULT;
729} 698}
730 699
731/** 700/**