-rw-r--r--  Documentation/ABI/testing/procfs-diskstats | 10
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 92
-rw-r--r--  Documentation/block/null_blk.txt | 7
-rw-r--r--  Documentation/block/stat.txt | 28
-rw-r--r--  Documentation/iostats.txt | 15
-rw-r--r--  block/Kconfig | 16
-rw-r--r--  block/Makefile | 4
-rw-r--r--  block/bfq-iosched.c | 131
-rw-r--r--  block/bfq-iosched.h | 7
-rw-r--r--  block/bfq-wf2q.c | 30
-rw-r--r--  block/bio-integrity.c | 22
-rw-r--r--  block/bio.c | 208
-rw-r--r--  block/blk-cgroup.c | 284
-rw-r--r--  block/blk-core.c | 106
-rw-r--r--  block/blk-ioc.c | 2
-rw-r--r--  block/blk-iolatency.c | 955
-rw-r--r--  block/blk-lib.c | 10
-rw-r--r--  block/blk-mq-debugfs-zoned.c | 24
-rw-r--r--  block/blk-mq-debugfs.c | 24
-rw-r--r--  block/blk-mq-debugfs.h | 9
-rw-r--r--  block/blk-mq-pci.c | 5
-rw-r--r--  block/blk-mq-sched.c | 112
-rw-r--r--  block/blk-mq-tag.c | 11
-rw-r--r--  block/blk-mq.c | 173
-rw-r--r--  block/blk-mq.h | 13
-rw-r--r--  block/blk-rq-qos.c | 194
-rw-r--r--  block/blk-rq-qos.h | 109
-rw-r--r--  block/blk-settings.c | 6
-rw-r--r--  block/blk-stat.c | 16
-rw-r--r--  block/blk-stat.h | 4
-rw-r--r--  block/blk-sysfs.c | 37
-rw-r--r--  block/blk-throttle.c | 32
-rw-r--r--  block/blk-wbt.c | 425
-rw-r--r--  block/blk-wbt.h | 68
-rw-r--r--  block/blk-zoned.c | 2
-rw-r--r--  block/blk.h | 7
-rw-r--r--  block/bounce.c | 69
-rw-r--r--  block/bsg-lib.c | 5
-rw-r--r--  block/bsg.c | 460
-rw-r--r--  block/cfq-iosched.c | 23
-rw-r--r--  block/genhd.c | 29
-rw-r--r--  block/partition-generic.c | 25
-rw-r--r--  block/partitions/aix.c | 13
-rw-r--r--  block/partitions/ldm.c | 3
-rw-r--r--  block/t10-pi.c | 110
-rw-r--r--  drivers/Makefile | 2
-rw-r--r--  drivers/ata/libata-scsi.c | 18
-rw-r--r--  drivers/block/DAC960.c | 9
-rw-r--r--  drivers/block/Kconfig | 2
-rw-r--r--  drivers/block/Makefile | 5
-rw-r--r--  drivers/block/aoe/aoecmd.c | 1
-rw-r--r--  drivers/block/aoe/aoedev.c | 4
-rw-r--r--  drivers/block/brd.c | 14
-rw-r--r--  drivers/block/drbd/drbd_int.h | 2
-rw-r--r--  drivers/block/drbd/drbd_main.c | 12
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 6
-rw-r--r--  drivers/block/drbd/drbd_req.c | 4
-rw-r--r--  drivers/block/drbd/drbd_worker.c | 4
-rw-r--r--  drivers/block/floppy.c | 3
-rw-r--r--  drivers/block/loop.c | 3
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 3
-rw-r--r--  drivers/block/null_blk.h | 108
-rw-r--r--  drivers/block/null_blk_main.c (renamed from drivers/block/null_blk.c) | 129
-rw-r--r--  drivers/block/null_blk_zoned.c | 149
-rw-r--r--  drivers/block/paride/bpck.c | 3
-rw-r--r--  drivers/block/paride/pd.c | 2
-rw-r--r--  drivers/block/pktcdvd.c | 109
-rw-r--r--  drivers/block/rsxx/dev.c | 6
-rw-r--r--  drivers/block/skd_main.c | 16
-rw-r--r--  drivers/block/xen-blkfront.c | 9
-rw-r--r--  drivers/block/zram/zram_drv.c | 19
-rw-r--r--  drivers/cdrom/cdrom.c | 30
-rw-r--r--  drivers/ide/ide-cd.c | 58
-rw-r--r--  drivers/ide/ide-cd.h | 6
-rw-r--r--  drivers/ide/ide-cd_ioctl.c | 62
-rw-r--r--  drivers/infiniband/ulp/iser/iser_memory.c | 2
-rw-r--r--  drivers/lightnvm/Kconfig | 30
-rw-r--r--  drivers/lightnvm/pblk-cache.c | 9
-rw-r--r--  drivers/lightnvm/pblk-core.c | 78
-rw-r--r--  drivers/lightnvm/pblk-gc.c | 34
-rw-r--r--  drivers/lightnvm/pblk-init.c | 98
-rw-r--r--  drivers/lightnvm/pblk-rb.c | 24
-rw-r--r--  drivers/lightnvm/pblk-read.c | 247
-rw-r--r--  drivers/lightnvm/pblk-recovery.c | 47
-rw-r--r--  drivers/lightnvm/pblk-sysfs.c | 13
-rw-r--r--  drivers/lightnvm/pblk-write.c | 35
-rw-r--r--  drivers/lightnvm/pblk.h | 48
-rw-r--r--  drivers/md/bcache/bcache.h | 24
-rw-r--r--  drivers/md/bcache/bset.c | 63
-rw-r--r--  drivers/md/bcache/btree.c | 63
-rw-r--r--  drivers/md/bcache/btree.h | 2
-rw-r--r--  drivers/md/bcache/closure.c | 13
-rw-r--r--  drivers/md/bcache/closure.h | 4
-rw-r--r--  drivers/md/bcache/debug.c | 17
-rw-r--r--  drivers/md/bcache/journal.c | 1
-rw-r--r--  drivers/md/bcache/request.c | 75
-rw-r--r--  drivers/md/bcache/super.c | 59
-rw-r--r--  drivers/md/bcache/sysfs.c | 48
-rw-r--r--  drivers/md/bcache/util.c | 2
-rw-r--r--  drivers/md/bcache/util.h | 2
-rw-r--r--  drivers/md/bcache/writeback.c | 125
-rw-r--r--  drivers/md/bcache/writeback.h | 19
-rw-r--r--  drivers/md/dm.c | 6
-rw-r--r--  drivers/md/md.c | 12
-rw-r--r--  drivers/nvdimm/btt.c | 12
-rw-r--r--  drivers/nvdimm/nd.h | 7
-rw-r--r--  drivers/nvdimm/pmem.c | 13
-rw-r--r--  drivers/nvme/host/core.c | 108
-rw-r--r--  drivers/nvme/host/fabrics.c | 2
-rw-r--r--  drivers/nvme/host/fc.c | 1
-rw-r--r--  drivers/nvme/host/lightnvm.c | 27
-rw-r--r--  drivers/nvme/host/multipath.c | 349
-rw-r--r--  drivers/nvme/host/nvme.h | 78
-rw-r--r--  drivers/nvme/host/pci.c | 77
-rw-r--r--  drivers/nvme/host/rdma.c | 234
-rw-r--r--  drivers/nvme/host/trace.c | 11
-rw-r--r--  drivers/nvme/host/trace.h | 142
-rw-r--r--  drivers/nvme/target/admin-cmd.c | 221
-rw-r--r--  drivers/nvme/target/configfs.c | 250
-rw-r--r--  drivers/nvme/target/core.c | 104
-rw-r--r--  drivers/nvme/target/discovery.c | 2
-rw-r--r--  drivers/nvme/target/io-cmd-bdev.c | 7
-rw-r--r--  drivers/nvme/target/io-cmd-file.c | 80
-rw-r--r--  drivers/nvme/target/loop.c | 1
-rw-r--r--  drivers/nvme/target/nvmet.h | 62
-rw-r--r--  drivers/nvme/target/rdma.c | 197
-rw-r--r--  drivers/scsi/Makefile | 2
-rw-r--r--  drivers/scsi/cxlflash/superpipe.c | 8
-rw-r--r--  drivers/scsi/cxlflash/vlun.c | 7
-rw-r--r--  drivers/scsi/mpt3sas/mpt3sas_scsih.c | 2
-rw-r--r--  drivers/scsi/scsi_lib.c | 6
-rw-r--r--  drivers/scsi/sd.c | 8
-rw-r--r--  drivers/scsi/sd.h | 9
-rw-r--r--  drivers/scsi/sd_dif.c | 113
-rw-r--r--  drivers/scsi/sr_ioctl.c | 22
-rw-r--r--  drivers/scsi/virtio_scsi.c | 8
-rw-r--r--  drivers/target/Kconfig | 5
-rw-r--r--  drivers/target/loopback/Kconfig | 1
-rw-r--r--  fs/block_dev.c | 6
-rw-r--r--  fs/exofs/ore.c | 4
-rw-r--r--  fs/ext4/super.c | 5
-rw-r--r--  fs/ext4/sysfs.c | 6
-rw-r--r--  fs/f2fs/f2fs.h | 2
-rw-r--r--  fs/f2fs/super.c | 3
-rw-r--r--  fs/mpage.c | 4
-rw-r--r--  include/linux/bio.h | 19
-rw-r--r--  include/linux/blk-cgroup.h | 146
-rw-r--r--  include/linux/blk-mq.h | 4
-rw-r--r--  include/linux/blk_types.h | 27
-rw-r--r--  include/linux/blkdev.h | 66
-rw-r--r--  include/linux/cdrom.h | 3
-rw-r--r--  include/linux/cgroup-defs.h | 3
-rw-r--r--  include/linux/genhd.h | 14
-rw-r--r--  include/linux/memcontrol.h | 13
-rw-r--r--  include/linux/nvme.h | 72
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/linux/swap.h | 11
-rw-r--r--  include/linux/t10-pi.h | 24
-rw-r--r--  include/linux/tracehook.h | 2
-rw-r--r--  include/scsi/scsi_cmnd.h | 13
-rw-r--r--  include/scsi/scsi_device.h | 14
-rw-r--r--  include/uapi/linux/bcache.h | 4
-rw-r--r--  include/uapi/linux/blkzoned.h | 2
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/trace/blktrace.c | 6
-rw-r--r--  mm/huge_memory.c | 6
-rw-r--r--  mm/memcontrol.c | 13
-rw-r--r--  mm/memory.c | 11
-rw-r--r--  mm/page_io.c | 3
-rw-r--r--  mm/readahead.c | 19
-rw-r--r--  mm/shmem.c | 10
-rw-r--r--  mm/swapfile.c | 31
172 files changed, 6029 insertions(+), 2659 deletions(-)
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index f91a973a37fe..abac31d216de 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -5,6 +5,7 @@ Description:
 		The /proc/diskstats file displays the I/O statistics
 		of block devices. Each line contains the following 14
 		fields:
+
 		 1 - major number
 		 2 - minor mumber
 		 3 - device name
@@ -19,4 +20,13 @@ Description:
 		12 - I/Os currently in progress
 		13 - time spent doing I/Os (ms)
 		14 - weighted time spent doing I/Os (ms)
+
+		Kernel 4.18+ appends four more fields for discard
+		tracking putting the total at 18:
+
+		15 - discards completed successfully
+		16 - discards merged
+		17 - sectors discarded
+		18 - time spent discarding
+
 		For more details refer to Documentation/iostats.txt
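
As a worked example (not part of the patch), a minimal user-space C sketch that consumes the 18-field format documented above; field names follow the documentation and error handling is intentionally thin:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/diskstats", "r");
	char line[512], name[32];
	unsigned int major, minor;
	unsigned long long rd_ios, rd_merges, rd_sec, rd_ticks;
	unsigned long long wr_ios, wr_merges, wr_sec, wr_ticks;
	unsigned long long in_flight, io_ticks, time_in_queue;
	unsigned long long dc_ios, dc_merges, dc_sec, dc_ticks;

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		int n = sscanf(line,
			"%u %u %31s %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
			&major, &minor, name,
			&rd_ios, &rd_merges, &rd_sec, &rd_ticks,
			&wr_ios, &wr_merges, &wr_sec, &wr_ticks,
			&in_flight, &io_ticks, &time_in_queue,
			&dc_ios, &dc_merges, &dc_sec, &dc_ticks);
		if (n == 18)	/* 4.18+: discard fields present */
			printf("%s: %llu discards, %llu sectors discarded\n",
			       name, dc_ios, dc_sec);
	}
	fclose(f);
	return 0;
}

On a pre-4.18 kernel sscanf() matches only 14 fields, so the discard line is simply never printed.
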
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8a2c52d5c53b..1746131bc9cb 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -51,6 +51,9 @@ v1 is available under Documentation/cgroup-v1/.
      5-3. IO
        5-3-1. IO Interface Files
        5-3-2. Writeback
+       5-3-3. IO Latency
+         5-3-3-1. How IO Latency Throttling Works
+         5-3-3-2. IO Latency Interface Files
      5-4. PID
        5-4-1. PID Interface Files
      5-5. Device
@@ -1314,17 +1317,19 @@ IO Interface Files
 	Lines are keyed by $MAJ:$MIN device numbers and not ordered.
 	The following nested keys are defined.
 
-	  ====== ===================
+	  ====== =====================
 	  rbytes Bytes read
 	  wbytes Bytes written
 	  rios   Number of read IOs
 	  wios   Number of write IOs
-	  ====== ===================
+	  dbytes Bytes discarded
+	  dios   Number of discard IOs
+	  ====== =====================
 
 	An example read output follows:
 
-	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
-	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
+	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
+	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021
 
   io.weight
 	A read-write flat-keyed file which exists on non-root cgroups.
@@ -1446,6 +1451,85 @@ writeback as follows.
 	vm.dirty[_background]_ratio.
 
 
+IO Latency
+~~~~~~~~~~
+
+This is a cgroup v2 controller for IO workload protection.  You provide a group
+with a latency target, and if the average latency exceeds that target the
+controller will throttle any peers that have a lower latency target than the
+protected workload.
+
+The limits are only applied at the peer level in the hierarchy.  This means that
+in the diagram below, only groups A, B, and C will influence each other, and
+groups D and F will influence each other.  Group G will influence nobody.
+
+			[root]
+		/	   |		\
+		A	   B		C
+	       /  \	   |
+	      D    F	   G
+
+
+So the ideal way to configure this is to set io.latency in groups A, B, and C.
+Generally you do not want to set a value lower than the latency your device
+supports.  Experiment to find the value that works best for your workload.
+Start at higher than the expected latency for your device and watch the
+avg_lat value in io.stat for your workload group to get an idea of the
+latency you see during normal operation.  Use the avg_lat value as a basis for
+your real setting, setting at 10-15% higher than the value in io.stat.
+
+How IO Latency Throttling Works
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+io.latency is work conserving; so as long as everybody is meeting their latency
+target the controller doesn't do anything.  Once a group starts missing its
+target it begins throttling any peer group that has a higher target than itself.
+This throttling takes 2 forms:
+
+- Queue depth throttling.  This is the number of outstanding IO's a group is
+  allowed to have.  We will clamp down relatively quickly, starting at no limit
+  and going all the way down to 1 IO at a time.
+
+- Artificial delay induction.  There are certain types of IO that cannot be
+  throttled without possibly adversely affecting higher priority groups.  This
+  includes swapping and metadata IO.  These types of IO are allowed to occur
+  normally, however they are "charged" to the originating group.  If the
+  originating group is being throttled you will see the use_delay and delay
+  fields in io.stat increase.  The delay value is how many microseconds that are
+  being added to any process that runs in this group.  Because this number can
+  grow quite large if there is a lot of swapping or metadata IO occurring we
+  limit the individual delay events to 1 second at a time.
+
+Once the victimized group starts meeting its latency target again it will start
+unthrottling any peer groups that were throttled previously.  If the victimized
+group simply stops doing IO the global counter will unthrottle appropriately.
+
+IO Latency Interface Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  io.latency
+	This takes a similar format as the other controllers.
+
+		"MAJOR:MINOR target=<target time in microseconds>"
+
+  io.stat
+	If the controller is enabled you will see extra stats in io.stat in
+	addition to the normal ones.
+
+	  depth
+		This is the current queue depth for the group.
+
+	  avg_lat
+		This is an exponential moving average with a decay rate of 1/exp
+		bound by the sampling interval.  The decay rate interval can be
+		calculated by multiplying the win value in io.stat by the
+		corresponding number of samples based on the win value.
+
+	  win
+		The sampling window size in milliseconds.  This is the minimum
+		duration of time between evaluation events.  Windows only elapse
+		with IO activity.  Idle periods extend the most recent window.
+
 PID
 ---
 
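
To make the interface concrete: a hedged sketch of configuring the controller from user space. It assumes a cgroup2 hierarchy mounted at /sys/fs/cgroup and an already-created group named "protected"; both the path and the 8:0 device number are illustrative, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed mount point and group name, purely for illustration */
	const char *path = "/sys/fs/cgroup/protected/io.latency";
	const char *cfg = "8:0 target=75000\n";	/* 75ms in microseconds */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cfg, strlen(cfg)) < 0)
		perror("write");
	close(fd);
	return 0;
}

The 75000 target is 75ms expressed in microseconds, matching the "MAJOR:MINOR target=<target time in microseconds>" format documented above.
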
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 07f147381f32..ea2dafe49ae8 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0
 0: Tag set is not shared.
 1: Tag set shared between devices for blk-mq. Only makes sense with
    nr_devices > 1, otherwise there's no tag set to share.
+
+zoned=[0/1]: Default: 0
+0: Block device is exposed as a random-access block device.
+1: Block device is exposed as a host-managed zoned block device.
+
+zone_size=[MB]: Default: 256
+Per zone size when exposed as a zoned block device. Must be a power of two.
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt
index 0dbc946de2ea..0aace9cc536c 100644
--- a/Documentation/block/stat.txt
+++ b/Documentation/block/stat.txt
@@ -31,28 +31,32 @@ write ticks	milliseconds	total wait time for write requests
 in_flight	requests	number of I/Os currently in flight
 io_ticks	milliseconds	total time this block device has been active
 time_in_queue	milliseconds	total wait time for all requests
+discard I/Os	requests	number of discard I/Os processed
+discard merges	requests	number of discard I/Os merged with in-queue I/O
+discard sectors	sectors		number of sectors discarded
+discard ticks	milliseconds	total wait time for discard requests
 
-read I/Os, write I/Os
-=====================
+read I/Os, write I/Os, discard I/Os
+===================================
 
 These values increment when an I/O request completes.
 
-read merges, write merges
-=========================
+read merges, write merges, discard merges
+=========================================
 
 These values increment when an I/O request is merged with an
 already-queued I/O request.
 
-read sectors, write sectors
-===========================
+read sectors, write sectors, discard sectors
+============================================
 
-These values count the number of sectors read from or written to this
-block device.  The "sectors" in question are the standard UNIX 512-byte
-sectors, not any device- or filesystem-specific block size.  The
-counters are incremented when the I/O completes.
+These values count the number of sectors read from, written to, or
+discarded from this block device.  The "sectors" in question are the
+standard UNIX 512-byte sectors, not any device- or filesystem-specific
+block size.  The counters are incremented when the I/O completes.
 
-read ticks, write ticks
-=======================
+read ticks, write ticks, discard ticks
+======================================
 
 These values count the number of milliseconds that I/O requests have
 waited on this block device.  If there are multiple I/O requests waiting,
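
A small sketch of the byte arithmetic the text above spells out: sector counts in the stat file are always in 512-byte units, independent of the device or filesystem block size. The device name sda and the 15-field layout (11 classic fields plus the 4 discard fields) are assumptions for illustration.

#include <stdio.h>

int main(void)
{
	unsigned long long f[15];	/* 11 classic fields + 4 discard fields */
	FILE *fp = fopen("/sys/block/sda/stat", "r");
	int n = 0;

	if (!fp)
		return 1;
	while (n < 15 && fscanf(fp, "%llu", &f[n]) == 1)
		n++;
	fclose(fp);
	if (n == 15)	/* kernel with discard accounting */
		printf("discarded: %llu sectors = %llu bytes\n",
		       f[13], f[13] * 512);	/* field 14: discard sectors */
	return 0;
}
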
diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt
index 04d394a2e06c..49df45f90e8a 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/iostats.txt
@@ -31,6 +31,9 @@ Here are examples of these different formats::
    3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160
    3    1   hda1 35486 38030 38030 38030
 
+   4.18+ diskstats:
+   3    0   hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0
+
 On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have
 a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``.
 
@@ -101,6 +104,18 @@ Field 11 -- weighted # of milliseconds spent doing I/Os
     last update of this field.  This can provide an easy measure of both
     I/O completion time and the backlog that may be accumulating.
 
+Field 12 -- # of discards completed
+    This is the total number of discards completed successfully.
+
+Field 13 -- # of discards merged
+    See the description of field 2
+
+Field 14 -- # of sectors discarded
+    This is the total number of sectors discarded successfully.
+
+Field 15 -- # of milliseconds spent discarding
+    This is the total number of milliseconds spent by all discards (as
+    measured from __make_request() to end_that_request_last()).
 
 To avoid introducing performance bottlenecks, no locks are held while
 modifying these counters.  This implies that minor inaccuracies may be
diff --git a/block/Kconfig b/block/Kconfig
index eb50fd4977c2..1f2469a0123c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -149,6 +149,18 @@ config BLK_WBT
 	dynamically on an algorithm loosely based on CoDel, factoring in
 	the realtime performance of the disk.
 
+config BLK_CGROUP_IOLATENCY
+	bool "Enable support for latency based cgroup IO protection"
+	depends on BLK_CGROUP=y
+	default n
+	---help---
+	Enabling this option enables the .latency interface for IO throttling.
+	The IO controller will attempt to maintain average IO latencies below
+	the configured latency target, throttling anybody with a higher latency
+	target than the victimized group.
+
+	Note, this is an experimental interface and could be changed someday.
+
 config BLK_WBT_SQ
 	bool "Single queue writeback throttling"
 	default n
@@ -177,6 +189,10 @@ config BLK_DEBUG_FS
 	  Unless you are building a kernel for a tiny system, you should
 	  say Y here.
 
+config BLK_DEBUG_FS_ZONED
+	bool
+	default BLK_DEBUG_FS && BLK_DEV_ZONED
+
 config BLK_SED_OPAL
 	bool "Logic for interfacing with Opal enabled SEDs"
 	---help---
diff --git a/block/Makefile b/block/Makefile
index 6a56303b9925..572b33f32c07 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o partition-generic.o ioprio.o \
-			badblocks.o partitions/
+			badblocks.o partitions/ blk-rq-qos.o
 
 obj-$(CONFIG_BOUNCE)		+= bounce.o
 obj-$(CONFIG_BLK_SCSI_REQUEST)	+= scsi_ioctl.o
@@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOLATENCY)	+= blk-iolatency.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
@@ -34,4 +35,5 @@ obj-$(CONFIG_BLK_MQ_RDMA)	+= blk-mq-rdma.o
 obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)	+= blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
+obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 495b9ddb3355..41d9036b1822 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
  * The following function returns true if every queue must receive the
  * same share of the throughput (this condition is used when deciding
  * whether idling may be disabled, see the comments in the function
- * bfq_bfqq_may_idle()).
+ * bfq_better_to_idle()).
  *
  * Such a scenario occurs when:
  * 1) all active queues have the same weight,
@@ -742,8 +742,9 @@ inc_counter:
  * See the comments to the function bfq_weights_tree_add() for considerations
  * about overhead.
  */
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
-			     struct rb_root *root)
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+			       struct bfq_entity *entity,
+			       struct rb_root *root)
 {
 	if (!entity->weight_counter)
 		return;
@@ -760,6 +761,43 @@ reset_entity_pointer:
 }
 
 /*
+ * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
+ * parent entities.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+			     struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = bfqq->entity.parent;
+
+	__bfq_weights_tree_remove(bfqd, &bfqq->entity,
+				  &bfqd->queue_weights_tree);
+
+	for_each_entity(entity) {
+		struct bfq_sched_data *sd = entity->my_sched_data;
+
+		if (sd->next_in_service || sd->in_service_entity) {
+			/*
+			 * entity is still active, because either
+			 * next_in_service or in_service_entity is not
+			 * NULL (see the comments on the definition of
+			 * next_in_service for details on why
+			 * in_service_entity must be checked too).
+			 *
+			 * As a consequence, the weight of entity is
+			 * not to be removed. In addition, if entity
+			 * is active, then its parent entities are
+			 * active as well, and thus their weights are
+			 * not to be removed either. In the end, this
+			 * loop must stop here.
+			 */
+			break;
+		}
+		__bfq_weights_tree_remove(bfqd, entity,
+					  &bfqd->group_weights_tree);
+	}
+}
+
+/*
  * Return expired entry, or NULL to just start from scratch in rbtree.
  */
 static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
@@ -1344,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 		 * remain unchanged after such an expiration, and the
 		 * following statement therefore assigns to
 		 * entity->budget the remaining budget on such an
-		 * expiration. For clarity, entity->service is not
-		 * updated on expiration in any case, and, in normal
-		 * operation, is reset only when bfqq is selected for
-		 * service (see bfq_get_next_queue).
+		 * expiration.
 		 */
 		entity->budget = min_t(unsigned long,
 				       bfq_bfqq_budget_left(bfqq),
 				       bfqq->max_budget);
 
+		/*
+		 * At this point, we have used entity->service to get
+		 * the budget left (needed for updating
+		 * entity->budget). Thus we finally can, and have to,
+		 * reset entity->service. The latter must be reset
+		 * because bfqq would otherwise be charged again for
+		 * the service it has received during its previous
+		 * service slot(s).
+		 */
+		entity->service = 0;
+
 		return true;
 	}
 
+	/*
+	 * We can finally complete expiration, by setting service to 0.
+	 */
+	entity->service = 0;
 	entity->budget = max_t(unsigned long, bfqq->max_budget,
 			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
 	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
@@ -3233,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 	ref = bfqq->ref;
 	__bfq_bfqq_expire(bfqd, bfqq);
 
+	if (ref == 1) /* bfqq is gone, no more actions on it */
+		return;
+
 	/* mark bfqq as waiting a request only if a bic still points to it */
-	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
+	if (!bfq_bfqq_busy(bfqq) &&
 	    reason != BFQQE_BUDGET_TIMEOUT &&
-	    reason != BFQQE_BUDGET_EXHAUSTED)
+	    reason != BFQQE_BUDGET_EXHAUSTED) {
 		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+		/*
+		 * Not setting service to 0, because, if the next rq
+		 * arrives in time, the queue will go on receiving
+		 * service with this same budget (as if it never expired)
+		 */
+	} else
+		entity->service = 0;
 }
 
 /*
@@ -3295,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
  * issues taken into account are not trivial. We discuss these issues
  * individually while introducing the variables.
  */
-static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
 	bool rot_without_queueing =
@@ -3528,19 +3588,19 @@
 }
 
 /*
- * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * If the in-service queue is empty but the function bfq_better_to_idle
  * returns true, then:
  * 1) the queue must remain in service and cannot be expired, and
  * 2) the device must be idled to wait for the possible arrival of a new
  *    request for the queue.
- * See the comments on the function bfq_bfqq_may_idle for the reasons
+ * See the comments on the function bfq_better_to_idle for the reasons
  * why performing device idling is the best choice to boost the throughput
- * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * and preserve service guarantees when bfq_better_to_idle itself
  * returns true.
  */
 static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
 {
-	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq);
+	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
 }
 
 /*
@@ -3559,8 +3619,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 
 	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
 
+	/*
+	 * Do not expire bfqq for budget timeout if bfqq may be about
+	 * to enjoy device idling. The reason why, in this case, we
+	 * prevent bfqq from expiring is the same as in the comments
+	 * on the case where bfq_bfqq_must_idle() returns true, in
+	 * bfq_completed_request().
+	 */
 	if (bfq_may_expire_for_budg_timeout(bfqq) &&
-	    !bfq_bfqq_wait_request(bfqq) &&
 	    !bfq_bfqq_must_idle(bfqq))
 		goto expire;
 
@@ -3620,7 +3686,7 @@ check_queue:
 	 * may idle after their completion, then keep it anyway.
 	 */
 	if (bfq_bfqq_wait_request(bfqq) ||
-	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+	    (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
 		bfqq = NULL;
 		goto keep_queue;
 	}
@@ -4582,8 +4648,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		 */
 		bfqq->budget_timeout = jiffies;
 
-		bfq_weights_tree_remove(bfqd, &bfqq->entity,
-					&bfqd->queue_weights_tree);
+		bfq_weights_tree_remove(bfqd, bfqq);
 	}
 
 	now_ns = ktime_get_ns();
@@ -4637,15 +4702,39 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 	 * or if we want to idle in case it has no pending requests.
 	 */
 	if (bfqd->in_service_queue == bfqq) {
-		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
-			bfq_arm_slice_timer(bfqd);
+		if (bfq_bfqq_must_idle(bfqq)) {
+			if (bfqq->dispatched == 0)
+				bfq_arm_slice_timer(bfqd);
+			/*
+			 * If we get here, we do not expire bfqq, even
+			 * if bfqq was in budget timeout or had no
+			 * more requests (as controlled in the next
+			 * conditional instructions). The reason for
+			 * not expiring bfqq is as follows.
+			 *
+			 * Here bfqq->dispatched > 0 holds, but
+			 * bfq_bfqq_must_idle() returned true. This
+			 * implies that, even if no request arrives
+			 * for bfqq before bfqq->dispatched reaches 0,
+			 * bfqq will, however, not be expired on the
+			 * completion event that causes bfqq->dispatch
+			 * to reach zero. In contrast, on this event,
+			 * bfqq will start enjoying device idling
+			 * (I/O-dispatch plugging).
+			 *
+			 * But, if we expired bfqq here, bfqq would
+			 * not have the chance to enjoy device idling
+			 * when bfqq->dispatched finally reaches
+			 * zero. This would expose bfqq to violation
+			 * of its reserved service guarantees.
+			 */
 			return;
 		} else if (bfq_may_expire_for_budg_timeout(bfqq))
 			bfq_bfqq_expire(bfqd, bfqq, false,
 					BFQQE_BUDGET_TIMEOUT);
 		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
 			 (bfqq->dispatched == 0 ||
-			  !bfq_bfqq_may_idle(bfqq)))
+			  !bfq_better_to_idle(bfqq)))
 			bfq_bfqq_expire(bfqd, bfqq, false,
 					BFQQE_NO_MORE_REQUESTS);
 	}
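
The service/budget hunks above move the entity->service reset from queue selection to expiration. A reduced sketch of the invariant they preserve, assuming bfq_bfqq_budget_left() computes entity->budget - entity->service (as in bfq-iosched.h); the struct and helpers below are illustrative stand-ins, not the kernel's types:

struct entity {
	int budget;	/* service the queue may receive in the current slot */
	int service;	/* service already received in the current slot */
};

/* mirrors bfq_bfqq_budget_left(): how much of the slot is unused */
static int budget_left(const struct entity *e)
{
	return e->budget - e->service;
}

/*
 * On the "budget remains" expiration path: first consume entity->service
 * via budget_left(), then reset it, so the queue is not charged twice for
 * service it already received.
 */
static void expire_keeping_budget(struct entity *e, int max_budget)
{
	int left = budget_left(e);

	e->budget = left < max_budget ? left : max_budget;
	e->service = 0;	/* only after budget_left() has been used */
}
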
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 0f712e03b035..a8a2e5aca4d4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
 			  struct rb_root *root);
-void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
-			     struct rb_root *root);
+void __bfq_weights_tree_remove(struct bfq_data *bfqd,
+			       struct bfq_entity *entity,
+			       struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd,
+			     struct bfq_queue *bfqq);
 void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		     bool compensate, enum bfqq_expiration reason);
 void bfq_put_queue(struct bfq_queue *bfqq);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4498c43245e2..dbc07b456059 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 	if (bfqq)
 		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else /* bfq_group */
-		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
-
 	if (bfqg != bfqd->root_group)
 		bfqg->active_entities++;
 #endif
@@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st,
 	if (bfqq)
 		list_del(&bfqq->bfqq_list);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-	else /* bfq_group */
-		bfq_weights_tree_remove(bfqd, entity,
-					&bfqd->group_weights_tree);
-
 	if (bfqg != bfqd->root_group)
 		bfqg->active_entities--;
 #endif
@@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 		if (prev_weight != new_weight) {
 			root = bfqq ? &bfqd->queue_weights_tree :
 				      &bfqd->group_weights_tree;
-			bfq_weights_tree_remove(bfqd, entity, root);
+			__bfq_weights_tree_remove(bfqd, entity, root);
 		}
 		entity->weight = new_weight;
 		/*
@@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
  * one of its children receives a new request.
  *
  * Basically, this function updates the timestamps of entity and
- * inserts entity into its active tree, ater possibly extracting it
+ * inserts entity into its active tree, after possibly extracting it
  * from its idle tree.
  */
 static void __bfq_activate_entity(struct bfq_entity *entity,
@@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
 		entity->on_st = true;
 	}
 
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
+	if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_weights_tree_add(bfqg->bfqd, entity,
+				     &bfqd->group_weights_tree);
+	}
+#endif
+
 	bfq_update_fin_time_enqueue(entity, st, backshifted);
 }
 
@@ -1542,12 +1545,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
 		sd->in_service_entity = entity;
 
 		/*
-		 * Reset the accumulator of the amount of service that
-		 * the entity is about to receive.
-		 */
-		entity->service = 0;
-
-		/*
 		 * If entity is no longer a candidate for next
 		 * service, then it must be extracted from its active
 		 * tree, so as to make sure that it won't be
@@ -1664,8 +1661,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqd->busy_queues--;
 
 	if (!bfqq->dispatched)
-		bfq_weights_tree_remove(bfqd, &bfqq->entity,
-					&bfqd->queue_weights_tree);
+		bfq_weights_tree_remove(bfqd, bfqq);
 
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues--;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index add7c7c85335..67b5fb861a51 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -160,28 +160,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 EXPORT_SYMBOL(bio_integrity_add_page);
 
 /**
- * bio_integrity_intervals - Return number of integrity intervals for a bio
- * @bi:		blk_integrity profile for device
- * @sectors:	Size of the bio in 512-byte sectors
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the data integrity
- * interval size of the storage device.  Convert the block layer sectors
- * to the appropriate number of integrity intervals.
- */
-static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
-						   unsigned int sectors)
-{
-	return sectors >> (bi->interval_exp - 9);
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-					       unsigned int sectors)
-{
-	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
-}
-
-/**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
  * @proc_iter:	iterator to process
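
The two helpers removed above (they are relocated elsewhere in this series) encode a simple conversion: the block layer counts 512-byte sectors, while integrity metadata is counted in device protection intervals. A standalone sketch of that arithmetic, with a 4096-byte interval (interval_exp = 12) as the assumed example:

#include <stdio.h>

/* sectors are 512 bytes (1 << 9); intervals are 1 << interval_exp bytes */
static unsigned int integrity_intervals(unsigned int interval_exp,
					unsigned int sectors)
{
	return sectors >> (interval_exp - 9);
}

int main(void)
{
	/* a 1 MiB bio spans 2048 sectors; at 4 KiB intervals that is 256 */
	printf("%u\n", integrity_intervals(12, 2048));
	return 0;
}
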
diff --git a/block/bio.c b/block/bio.c
index 047c5dca6d90..b12966e415d3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -28,9 +28,11 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
+#include "blk-rq-qos.h"
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -156,7 +158,7 @@ out:
 
 unsigned int bvec_nr_vecs(unsigned short idx)
 {
-	return bvec_slabs[idx].nr_vecs;
+	return bvec_slabs[--idx].nr_vecs;
 }
 
 void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
@@ -645,83 +647,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
 EXPORT_SYMBOL(bio_clone_fast);
 
 /**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-			     struct bio_set *bs)
-{
-	struct bvec_iter iter;
-	struct bio_vec bv;
-	struct bio *bio;
-
-	/*
-	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
-	 * bio_src->bi_io_vec to bio->bi_io_vec.
-	 *
-	 * We can't do that anymore, because:
-	 *
-	 *  - The point of cloning the biovec is to produce a bio with a biovec
-	 *    the caller can modify: bi_idx and bi_bvec_done should be 0.
-	 *
-	 *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
-	 *    we tried to clone the whole thing bio_alloc_bioset() would fail.
-	 *    But the clone should succeed as long as the number of biovecs we
-	 *    actually need to allocate is fewer than BIO_MAX_PAGES.
-	 *
-	 *  - Lastly, bi_vcnt should not be looked at or relied upon by code
-	 *    that does not own the bio - reason being drivers don't use it for
-	 *    iterating over the biovec anymore, so expecting it to be kept up
-	 *    to date (i.e. for clones that share the parent biovec) is just
-	 *    asking for trouble and would force extra work on
-	 *    __bio_clone_fast() anyways.
-	 */
-
-	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
-	if (!bio)
-		return NULL;
-	bio->bi_disk = bio_src->bi_disk;
-	bio->bi_opf = bio_src->bi_opf;
-	bio->bi_write_hint = bio_src->bi_write_hint;
-	bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
-	bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		break;
-	case REQ_OP_WRITE_SAME:
-		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-		break;
-	default:
-		bio_for_each_segment(bv, bio_src, iter)
-			bio->bi_io_vec[bio->bi_vcnt++] = bv;
-		break;
-	}
-
-	if (bio_integrity(bio_src)) {
-		int ret;
-
-		ret = bio_integrity_clone(bio, bio_src, gfp_mask);
-		if (ret < 0) {
-			bio_put(bio);
-			return NULL;
-		}
-	}
-
-	bio_clone_blkcg_association(bio, bio_src);
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
-/**
  * bio_add_pc_page	- attempt to add page to bio
  * @q: the target queue
  * @bio: destination bio
@@ -1661,10 +1586,8 @@ void bio_set_pages_dirty(struct bio *bio)
 	int i;
 
 	bio_for_each_segment_all(bvec, bio, i) {
-		struct page *page = bvec->bv_page;
-
-		if (page && !PageCompound(page))
-			set_page_dirty_lock(page);
+		if (!PageCompound(bvec->bv_page))
+			set_page_dirty_lock(bvec->bv_page);
 	}
 }
 EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@ -1674,19 +1597,15 @@
 	struct bio_vec *bvec;
 	int i;
 
-	bio_for_each_segment_all(bvec, bio, i) {
-		struct page *page = bvec->bv_page;
-
-		if (page)
-			put_page(page);
-	}
+	bio_for_each_segment_all(bvec, bio, i)
+		put_page(bvec->bv_page);
 }
 
 /*
  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
  * If they are, then fine. If, however, some pages are clean then they must
  * have been written out during the direct-IO read. So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
+ * the BIO and re-dirty the pages in process context.
  *
  * It is expected that bio_check_pages_dirty() will wholly own the BIO from
  * here on. It will run one put_page() against each page and will run one
@@ -1704,78 +1623,70 @@
  */
 static void bio_dirty_fn(struct work_struct *work)
 {
-	unsigned long flags;
-	struct bio *bio;
+	struct bio *bio, *next;
 
-	spin_lock_irqsave(&bio_dirty_lock, flags);
-	bio = bio_dirty_list;
+	spin_lock_irq(&bio_dirty_lock);
+	next = bio_dirty_list;
 	bio_dirty_list = NULL;
-	spin_unlock_irqrestore(&bio_dirty_lock, flags);
+	spin_unlock_irq(&bio_dirty_lock);
 
-	while (bio) {
-		struct bio *next = bio->bi_private;
+	while ((bio = next) != NULL) {
+		next = bio->bi_private;
 
 		bio_set_pages_dirty(bio);
 		bio_release_pages(bio);
 		bio_put(bio);
-		bio = next;
 	}
 }
 
 void bio_check_pages_dirty(struct bio *bio)
 {
 	struct bio_vec *bvec;
-	int nr_clean_pages = 0;
+	unsigned long flags;
 	int i;
 
 	bio_for_each_segment_all(bvec, bio, i) {
-		struct page *page = bvec->bv_page;
-
-		if (PageDirty(page) || PageCompound(page)) {
-			put_page(page);
-			bvec->bv_page = NULL;
-		} else {
-			nr_clean_pages++;
-		}
+		if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
			goto defer;
 	}
 
-	if (nr_clean_pages) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&bio_dirty_lock, flags);
-		bio->bi_private = bio_dirty_list;
-		bio_dirty_list = bio;
-		spin_unlock_irqrestore(&bio_dirty_lock, flags);
-		schedule_work(&bio_dirty_work);
-	} else {
-		bio_put(bio);
-	}
+	bio_release_pages(bio);
+	bio_put(bio);
+	return;
+defer:
+	spin_lock_irqsave(&bio_dirty_lock, flags);
+	bio->bi_private = bio_dirty_list;
+	bio_dirty_list = bio;
+	spin_unlock_irqrestore(&bio_dirty_lock, flags);
+	schedule_work(&bio_dirty_work);
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
 			   unsigned long sectors, struct hd_struct *part)
 {
+	const int sgrp = op_stat_group(op);
 	int cpu = part_stat_lock();
 
 	part_round_stats(q, cpu, part);
-	part_stat_inc(cpu, part, ios[rw]);
-	part_stat_add(cpu, part, sectors[rw], sectors);
-	part_inc_in_flight(q, part, rw);
+	part_stat_inc(cpu, part, ios[sgrp]);
+	part_stat_add(cpu, part, sectors[sgrp], sectors);
+	part_inc_in_flight(q, part, op_is_write(op));
 
 	part_stat_unlock();
 }
 EXPORT_SYMBOL(generic_start_io_acct);
 
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int req_op,
 			 struct hd_struct *part, unsigned long start_time)
 {
 	unsigned long duration = jiffies - start_time;
+	const int sgrp = op_stat_group(req_op);
 	int cpu = part_stat_lock();
 
-	part_stat_add(cpu, part, ticks[rw], duration);
+	part_stat_add(cpu, part, ticks[sgrp], duration);
 	part_round_stats(q, cpu, part);
-	part_dec_in_flight(q, part, rw);
+	part_dec_in_flight(q, part, op_is_write(req_op));
 
 	part_stat_unlock();
 }
@@ -1834,6 +1745,9 @@ again:
 	if (!bio_integrity_endio(bio))
 		return;
 
+	if (bio->bi_disk)
+		rq_qos_done_bio(bio->bi_disk->queue, bio);
+
 	/*
 	 * Need to have a real endio function for chained bios, otherwise
 	 * various corner cases will break (like stacking block devices that
@@ -2042,6 +1956,30 @@ EXPORT_SYMBOL(bioset_init_from_src);
 
 #ifdef CONFIG_BLK_CGROUP
 
+#ifdef CONFIG_MEMCG
+/**
+ * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkcg from @page's owning memcg. This works like
+ * every other associate function wrt references.
+ */
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+{
+	struct cgroup_subsys_state *blkcg_css;
+
+	if (unlikely(bio->bi_css))
+		return -EBUSY;
+	if (!page->mem_cgroup)
+		return 0;
+	blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
+				     &io_cgrp_subsys);
+	bio->bi_css = blkcg_css;
+	return 0;
+}
+#endif /* CONFIG_MEMCG */
+
 /**
  * bio_associate_blkcg - associate a bio with the specified blkcg
  * @bio: target bio
@@ -2065,6 +2003,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
 /**
+ * bio_associate_blkg - associate a bio with the specified blkg
+ * @bio: target bio
+ * @blkg: the blkg to associate
+ *
+ * Associate @bio with the blkg specified by @blkg.  This is the queue specific
+ * blkcg information associated with the @bio, a reference will be taken on the
+ * @blkg and will be freed when the bio is freed.
+ */
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+{
+	if (unlikely(bio->bi_blkg))
+		return -EBUSY;
+	blkg_get(blkg);
+	bio->bi_blkg = blkg;
+	return 0;
+}
+
+/**
  * bio_disassociate_task - undo bio_associate_current()
  * @bio: target bio
  */
@@ -2078,6 +2034,10 @@ void bio_disassociate_task(struct bio *bio)
 		css_put(bio->bi_css);
 		bio->bi_css = NULL;
 	}
+	if (bio->bi_blkg) {
+		blkg_put(bio->bi_blkg);
+		bio->bi_blkg = NULL;
+	}
 }
 
 /**
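
The generic_start_io_acct()/generic_end_io_acct() hunks above replace the old read/write index with a stat group derived from the request op, so discards get their own accounting row. A hedged sketch of that kind of mapping; the enum and the MY_* op stand-ins are illustrative, not the kernel's definitions:

enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, NR_STAT_GROUPS };

enum my_op { MY_READ, MY_WRITE, MY_DISCARD };	/* stand-ins for REQ_OP_* */

/* map an operation to the row its statistics land in */
static enum stat_group stat_group(enum my_op op)
{
	switch (op) {
	case MY_DISCARD:
		return STAT_DISCARD;
	case MY_WRITE:
		return STAT_WRITE;
	default:
		return STAT_READ;
	}
}

struct part_stats {
	unsigned long ios[NR_STAT_GROUPS];
	unsigned long sectors[NR_STAT_GROUPS];
	unsigned long ticks[NR_STAT_GROUPS];
};

/* accounting then indexes every per-partition array by the group */
static void account_io(struct part_stats *p, enum my_op op,
		       unsigned long sectors)
{
	enum stat_group sgrp = stat_group(op);

	p->ios[sgrp]++;
	p->sectors[sgrp] += sectors;
}
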
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index eb85cb87c40f..694595b29b8f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@
27#include <linux/atomic.h> 27#include <linux/atomic.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/blk-cgroup.h> 29#include <linux/blk-cgroup.h>
30#include <linux/tracehook.h>
30#include "blk.h" 31#include "blk.h"
31 32
32#define MAX_KEY_LEN 100 33#define MAX_KEY_LEN 100
@@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
50 51
51static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ 52static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
52 53
54static bool blkcg_debug_stats = false;
55
53static bool blkcg_policy_enabled(struct request_queue *q, 56static bool blkcg_policy_enabled(struct request_queue *q,
54 const struct blkcg_policy *pol) 57 const struct blkcg_policy *pol)
55{ 58{
@@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
564 [BLKG_RWSTAT_WRITE] = "Write", 567 [BLKG_RWSTAT_WRITE] = "Write",
565 [BLKG_RWSTAT_SYNC] = "Sync", 568 [BLKG_RWSTAT_SYNC] = "Sync",
566 [BLKG_RWSTAT_ASYNC] = "Async", 569 [BLKG_RWSTAT_ASYNC] = "Async",
570 [BLKG_RWSTAT_DISCARD] = "Discard",
567 }; 571 };
568 const char *dname = blkg_dev_name(pd->blkg); 572 const char *dname = blkg_dev_name(pd->blkg);
569 u64 v; 573 u64 v;
@@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
577 (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); 581 (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
578 582
579 v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + 583 v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
580 atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); 584 atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
585 atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
581 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); 586 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
582 return v; 587 return v;
583} 588}
@@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
954 959
955 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { 960 hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
956 const char *dname; 961 const char *dname;
962 char *buf;
957 struct blkg_rwstat rwstat; 963 struct blkg_rwstat rwstat;
958 u64 rbytes, wbytes, rios, wios; 964 u64 rbytes, wbytes, rios, wios, dbytes, dios;
965 size_t size = seq_get_buf(sf, &buf), off = 0;
966 int i;
967 bool has_stats = false;
959 968
960 dname = blkg_dev_name(blkg); 969 dname = blkg_dev_name(blkg);
961 if (!dname) 970 if (!dname)
962 continue; 971 continue;
963 972
973 /*
974 * Hooray string manipulation, count is the size written NOT
975 * INCLUDING THE \0, so size is now count+1 less than what we
976 * had before, but we want to start writing the next bit from
977 * the \0 so we only add count to buf.
978 */
979 off += scnprintf(buf+off, size-off, "%s ", dname);
980
964 spin_lock_irq(blkg->q->queue_lock); 981 spin_lock_irq(blkg->q->queue_lock);
965 982
966 rwstat = blkg_rwstat_recursive_sum(blkg, NULL, 983 rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
967 offsetof(struct blkcg_gq, stat_bytes)); 984 offsetof(struct blkcg_gq, stat_bytes));
968 rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); 985 rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
969 wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); 986 wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
987 dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
970 988
971 rwstat = blkg_rwstat_recursive_sum(blkg, NULL, 989 rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
972 offsetof(struct blkcg_gq, stat_ios)); 990 offsetof(struct blkcg_gq, stat_ios));
973 rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); 991 rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
974 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); 992 wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
993 dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
975 994
976 spin_unlock_irq(blkg->q->queue_lock); 995 spin_unlock_irq(blkg->q->queue_lock);
977 996
978 if (rbytes || wbytes || rios || wios) 997 if (rbytes || wbytes || rios || wios) {
979 seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", 998 has_stats = true;
980 dname, rbytes, wbytes, rios, wios); 999 off += scnprintf(buf+off, size-off,
1000 "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
1001 rbytes, wbytes, rios, wios,
1002 dbytes, dios);
1003 }
1004
1005 if (!blkcg_debug_stats)
1006 goto next;
1007
1008 if (atomic_read(&blkg->use_delay)) {
1009 has_stats = true;
1010 off += scnprintf(buf+off, size-off,
1011 " use_delay=%d delay_nsec=%llu",
1012 atomic_read(&blkg->use_delay),
1013 (unsigned long long)atomic64_read(&blkg->delay_nsec));
1014 }
1015
1016 for (i = 0; i < BLKCG_MAX_POLS; i++) {
1017 struct blkcg_policy *pol = blkcg_policy[i];
1018 size_t written;
1019
1020 if (!blkg->pd[i] || !pol->pd_stat_fn)
1021 continue;
1022
1023 written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
1024 if (written)
1025 has_stats = true;
1026 off += written;
1027 }
1028next:
1029 if (has_stats) {
1030 off += scnprintf(buf+off, size-off, "\n");
1031 seq_commit(sf, off);
1032 }
981 } 1033 }
982 1034
983 rcu_read_unlock(); 1035 rcu_read_unlock();
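
The seq_get_buf()/scnprintf() offset accumulation above is the pattern the whole stat line is built with. Here is a minimal userspace sketch of the same idiom, with my_scnprintf() standing in for the kernel's scnprintf(), which returns the bytes actually written, excluding the trailing \0 (plain snprintf() instead returns what would have been written):

#include <stdarg.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's scnprintf(). */
static size_t my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int ret;

	if (!size)
		return 0;
	va_start(args, fmt);
	ret = vsnprintf(buf, size, fmt, args);
	va_end(args);
	if (ret < 0)
		return 0;
	/* Clamp to what actually fit, excluding the '\0'. */
	return (size_t)ret >= size ? size - 1 : (size_t)ret;
}

int main(void)
{
	char buf[64];
	size_t size = sizeof(buf), off = 0;

	off += my_scnprintf(buf + off, size - off, "%s ", "8:16");
	off += my_scnprintf(buf + off, size - off,
			    "rbytes=%llu wios=%llu", 4096ULL, 2ULL);
	my_scnprintf(buf + off, size - off, "\n");
	fputs(buf, stdout);	/* prints "8:16 rbytes=4096 wios=2" */
	return 0;
}
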
@@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q)
1191 if (preloaded) 1243 if (preloaded)
1192 radix_tree_preload_end(); 1244 radix_tree_preload_end();
1193 1245
1246 ret = blk_iolatency_init(q);
1247 if (ret) {
1248 spin_lock_irq(q->queue_lock);
1249 blkg_destroy_all(q);
1250 spin_unlock_irq(q->queue_lock);
1251 return ret;
1252 }
1253
1194 ret = blk_throtl_init(q); 1254 ret = blk_throtl_init(q);
1195 if (ret) { 1255 if (ret) {
1196 spin_lock_irq(q->queue_lock); 1256 spin_lock_irq(q->queue_lock);
@@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
1288 mutex_unlock(&blkcg_pol_mutex); 1348 mutex_unlock(&blkcg_pol_mutex);
1289} 1349}
1290 1350
1351static void blkcg_exit(struct task_struct *tsk)
1352{
1353 if (tsk->throttle_queue)
1354 blk_put_queue(tsk->throttle_queue);
1355 tsk->throttle_queue = NULL;
1356}
1357
1291struct cgroup_subsys io_cgrp_subsys = { 1358struct cgroup_subsys io_cgrp_subsys = {
1292 .css_alloc = blkcg_css_alloc, 1359 .css_alloc = blkcg_css_alloc,
1293 .css_offline = blkcg_css_offline, 1360 .css_offline = blkcg_css_offline,
@@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = {
1297 .dfl_cftypes = blkcg_files, 1364 .dfl_cftypes = blkcg_files,
1298 .legacy_cftypes = blkcg_legacy_files, 1365 .legacy_cftypes = blkcg_legacy_files,
1299 .legacy_name = "blkio", 1366 .legacy_name = "blkio",
1367 .exit = blkcg_exit,
1300#ifdef CONFIG_MEMCG 1368#ifdef CONFIG_MEMCG
1301 /* 1369 /*
1302 * This ensures that, if available, memcg is automatically enabled 1370 * This ensures that, if available, memcg is automatically enabled
@@ -1547,3 +1615,209 @@ out_unlock:
1547 mutex_unlock(&blkcg_pol_register_mutex); 1615 mutex_unlock(&blkcg_pol_register_mutex);
1548} 1616}
1549EXPORT_SYMBOL_GPL(blkcg_policy_unregister); 1617EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1618
1619/*
1620 * Scale the accumulated delay based on how long it has been since we updated
1621 * the delay. We only call this when we are adding delay, in case it's been a
1622 * while since we added delay, and when we are checking to see if we need to
1623 * delay a task, to account for any delays that may have occurred.
1624 */
1625static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1626{
1627 u64 old = atomic64_read(&blkg->delay_start);
1628
1629 /*
1630 * We only want to scale down every second. The idea here is that we
1631 * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
1632 * time window. We only want to throttle tasks for recent delay that
1633 * has occurred, in 1 second time windows since that's the maximum
1634 * things can be throttled. We save the current delay window in
1635 * blkg->last_delay so we know what amount is still left to be charged
1636 * to the blkg from this point onward. blkg->last_use keeps track of
1637 * the use_delay counter. The idea is if we're unthrottling the blkg we
1638 * are ok with whatever is happening now, and we can take away more of
1639 * the accumulated delay as we've already throttled enough that
1640 * everybody is happy with their IO latencies.
1641 */
1642 if (time_before64(old + NSEC_PER_SEC, now) &&
1643 atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
1644 u64 cur = atomic64_read(&blkg->delay_nsec);
1645 u64 sub = min_t(u64, blkg->last_delay, now - old);
1646 int cur_use = atomic_read(&blkg->use_delay);
1647
1648 /*
1649 * We've been unthrottled, subtract a larger chunk of our
1650 * accumulated delay.
1651 */
1652 if (cur_use < blkg->last_use)
1653 sub = max_t(u64, sub, blkg->last_delay >> 1);
1654
1655 /*
1656 * This shouldn't happen, but handle it anyway. Our delay_nsec
1657 * should only ever be growing except here where we subtract out
1658 * min(last_delay, 1 second), but lord knows bugs happen and I'd
1659 * rather not end up with negative numbers.
1660 */
1661 if (unlikely(cur < sub)) {
1662 atomic64_set(&blkg->delay_nsec, 0);
1663 blkg->last_delay = 0;
1664 } else {
1665 atomic64_sub(sub, &blkg->delay_nsec);
1666 blkg->last_delay = cur - sub;
1667 }
1668 blkg->last_use = cur_use;
1669 }
1670}
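
A worked instance of the decay above, with illustrative numbers: say delay_nsec is 800ms, last_delay is 600ms, and more than a second has passed since delay_start, so the cmpxchg wins. Then sub = min(600ms, now - old) = 600ms, leaving delay_nsec at 200ms and last_delay at 200ms for the next window. Had use_delay also dropped below last_use, sub would additionally have been floored at last_delay >> 1 = 300ms, which changes nothing here since 600ms already exceeds it.
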
1671
1672/*
1673 * This is called when we want to actually walk up the hierarchy and check to
1674 * see if we need to throttle, and then actually throttle if there is some
1675 * accumulated delay. This should only be called upon return to user space so
1676 * we're not holding some lock that would induce a priority inversion.
1677 */
1678static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1679{
1680 u64 now = ktime_to_ns(ktime_get());
1681 u64 exp;
1682 u64 delay_nsec = 0;
1683 int tok;
1684
1685 while (blkg->parent) {
1686 if (atomic_read(&blkg->use_delay)) {
1687 blkcg_scale_delay(blkg, now);
1688 delay_nsec = max_t(u64, delay_nsec,
1689 atomic64_read(&blkg->delay_nsec));
1690 }
1691 blkg = blkg->parent;
1692 }
1693
1694 if (!delay_nsec)
1695 return;
1696
1697 /*
1698 * Let's not sleep for all eternity if we've amassed a huge delay.
1699	 * Swapping or metadata IO can accumulate tens of seconds worth of
1700	 * delay, and we want userspace to be able to do _something_, so cap
1701	 * a single sleep at 0.25 seconds to match the clamp below. With tens
1702	 * of seconds of delay built up, the task sleeps 0.25s per syscall.
1703 */
1704 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1705
1706 /*
1707 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
1708 * that hasn't landed upstream yet. Once that stuff is in place we need
1709 * to do a psi_memstall_enter/leave if memdelay is set.
1710 */
1711
1712 exp = ktime_add_ns(now, delay_nsec);
1713 tok = io_schedule_prepare();
1714 do {
1715 __set_current_state(TASK_KILLABLE);
1716 if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
1717 break;
1718 } while (!fatal_signal_pending(current));
1719 io_schedule_finish(tok);
1720}
1721
1722/**
1723 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
1724 *
1725 * This is only called if we've been marked with set_notify_resume(). Obviously
1726 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
1727 * check to see if current->throttle_queue is set and if not this doesn't do
1728 * anything. This should only ever be called by the resume code; it's not meant
1729 * to be called by people willy-nilly as it will actually do the work to
1730 * throttle the task if it is set up for throttling.
1731 */
1732void blkcg_maybe_throttle_current(void)
1733{
1734 struct request_queue *q = current->throttle_queue;
1735 struct cgroup_subsys_state *css;
1736 struct blkcg *blkcg;
1737 struct blkcg_gq *blkg;
1738 bool use_memdelay = current->use_memdelay;
1739
1740 if (!q)
1741 return;
1742
1743 current->throttle_queue = NULL;
1744 current->use_memdelay = false;
1745
1746 rcu_read_lock();
1747 css = kthread_blkcg();
1748 if (css)
1749 blkcg = css_to_blkcg(css);
1750 else
1751 blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
1752
1753 if (!blkcg)
1754 goto out;
1755 blkg = blkg_lookup(blkcg, q);
1756 if (!blkg)
1757 goto out;
1758 blkg = blkg_try_get(blkg);
1759 if (!blkg)
1760 goto out;
1761 rcu_read_unlock();
1762
1763 blkcg_maybe_throttle_blkg(blkg, use_memdelay);
1764 blkg_put(blkg);
1765 blk_put_queue(q);
1766 return;
1767out:
1768 rcu_read_unlock();
1769 blk_put_queue(q);
1770}
1771EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
1772
1773/**
1774 * blkcg_schedule_throttle - this task needs to check for throttling
1775 * @q - the request queue IO was submitted on
1776 * @use_memdelay - do we charge this to memory delay for PSI
1777 *
1778 * This is called by the IO controller when we know there's delay accumulated
1779 * for the blkg for this task. We do not pass the blkg because there are places
1780 * we call this that may not have that information; the swapping code, for
1781 * instance, only has a request_queue at that point. This sets the
1782 * notify_resume flag for the task to check whether it requires throttling before
1783 * returning to user space.
1784 *
1785 * We will only schedule once per syscall. You can call this over and over
1786 * again and it will only do the check once upon return to user space, and only
1787 * throttle once. If the task needs to be throttled again it'll need to be
1788 * re-set the next time we see the task.
1789 */
1790void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
1791{
1792 if (unlikely(current->flags & PF_KTHREAD))
1793 return;
1794
1795 if (!blk_get_queue(q))
1796 return;
1797
1798 if (current->throttle_queue)
1799 blk_put_queue(current->throttle_queue);
1800 current->throttle_queue = q;
1801 if (use_memdelay)
1802 current->use_memdelay = use_memdelay;
1803 set_notify_resume(current);
1804}
1805EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
1806
1807/**
1808 * blkcg_add_delay - add delay to this blkg
1809 * @now - the current time in nanoseconds
1810 * @delta - how many nanoseconds of delay to add
1811 *
1812 * Charge @delta to the blkg's current delay accumulation. This is used to
1813 * throttle tasks if an IO controller thinks we need more throttling.
1814 */
1815void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1816{
1817 blkcg_scale_delay(blkg, now);
1818 atomic64_add(delta, &blkg->delay_nsec);
1819}
1820EXPORT_SYMBOL_GPL(blkcg_add_delay);
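
A minimal sketch of how an IO controller might pair these two exports at completion time (my_controller_complete() and its target bookkeeping are hypothetical; the in-tree consumer is the blk-iolatency controller added below):

/*
 * If an IO issued on behalf of the root cgroup finished faster than the
 * group's latency target, charge the shortfall to the group as delay and
 * mark the task to throttle on its way back to userspace.
 */
static void my_controller_complete(struct request_queue *q,
				   struct blkcg_gq *blkg,
				   u64 target_nsec, u64 io_nsec)
{
	u64 now = ktime_to_ns(ktime_get());

	if (io_nsec < target_nsec)
		blkcg_add_delay(blkg, now, target_nsec - io_nsec);
	blkcg_schedule_throttle(q, false /* not a memory stall */);
}
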
1821
1822module_param(blkcg_debug_stats, bool, 0644);
1823MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index ee33590f54eb..12550340418d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -42,7 +42,7 @@
42#include "blk.h" 42#include "blk.h"
43#include "blk-mq.h" 43#include "blk-mq.h"
44#include "blk-mq-sched.h" 44#include "blk-mq-sched.h"
45#include "blk-wbt.h" 45#include "blk-rq-qos.h"
46 46
47#ifdef CONFIG_DEBUG_FS 47#ifdef CONFIG_DEBUG_FS
48struct dentry *blk_debugfs_root; 48struct dentry *blk_debugfs_root;
@@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q)
715} 715}
716EXPORT_SYMBOL_GPL(blk_set_queue_dying); 716EXPORT_SYMBOL_GPL(blk_set_queue_dying);
717 717
718/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
719void blk_exit_queue(struct request_queue *q)
720{
721 /*
722 * Since the I/O scheduler exit code may access cgroup information,
723 * perform I/O scheduler exit before disassociating from the block
724 * cgroup controller.
725 */
726 if (q->elevator) {
727 ioc_clear_queue(q);
728 elevator_exit(q, q->elevator);
729 q->elevator = NULL;
730 }
731
732 /*
733 * Remove all references to @q from the block cgroup controller before
734 * restoring @q->queue_lock to avoid that restoring this pointer causes
735 * e.g. blkcg_print_blkgs() to crash.
736 */
737 blkcg_exit_queue(q);
738
739 /*
740 * Since the cgroup code may dereference the @q->backing_dev_info
741 * pointer, only decrease its reference count after having removed the
742 * association with the block cgroup controller.
743 */
744 bdi_put(q->backing_dev_info);
745}
746
718/** 747/**
719 * blk_cleanup_queue - shutdown a request queue 748 * blk_cleanup_queue - shutdown a request queue
720 * @q: request queue to shutdown 749 * @q: request queue to shutdown
@@ -762,9 +791,13 @@ void blk_cleanup_queue(struct request_queue *q)
762 * make sure all in-progress dispatch are completed because 791 * make sure all in-progress dispatch are completed because
763 * blk_freeze_queue() can only complete all requests, and 792 * blk_freeze_queue() can only complete all requests, and
764 * dispatch may still be in-progress since we dispatch requests 793 * dispatch may still be in-progress since we dispatch requests
765 * from more than one contexts 794 * from more than one context.
795 *
796 * No need to quiesce queue if it isn't initialized yet since
797 * blk_freeze_queue() should be enough for cases of passthrough
798 * request.
766 */ 799 */
767 if (q->mq_ops) 800 if (q->mq_ops && blk_queue_init_done(q))
768 blk_mq_quiesce_queue(q); 801 blk_mq_quiesce_queue(q);
769 802
770 /* for synchronous bio-based driver finish in-flight integrity i/o */ 803 /* for synchronous bio-based driver finish in-flight integrity i/o */
@@ -780,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q)
780 */ 813 */
781 WARN_ON_ONCE(q->kobj.state_in_sysfs); 814 WARN_ON_ONCE(q->kobj.state_in_sysfs);
782 815
783 /* 816 blk_exit_queue(q);
784 * Since the I/O scheduler exit code may access cgroup information,
785 * perform I/O scheduler exit before disassociating from the block
786 * cgroup controller.
787 */
788 if (q->elevator) {
789 ioc_clear_queue(q);
790 elevator_exit(q, q->elevator);
791 q->elevator = NULL;
792 }
793
794 /*
795 * Remove all references to @q from the block cgroup controller before
796 * restoring @q->queue_lock to avoid that restoring this pointer causes
797 * e.g. blkcg_print_blkgs() to crash.
798 */
799 blkcg_exit_queue(q);
800
801 /*
802 * Since the cgroup code may dereference the @q->backing_dev_info
803 * pointer, only decrease its reference count after having removed the
804 * association with the block cgroup controller.
805 */
806 bdi_put(q->backing_dev_info);
807 817
808 if (q->mq_ops) 818 if (q->mq_ops)
809 blk_mq_free_queue(q); 819 blk_mq_free_queue(q);
@@ -1180,6 +1190,7 @@ out_exit_flush_rq:
1180 q->exit_rq_fn(q, q->fq->flush_rq); 1190 q->exit_rq_fn(q, q->fq->flush_rq);
1181out_free_flush_queue: 1191out_free_flush_queue:
1182 blk_free_flush_queue(q->fq); 1192 blk_free_flush_queue(q->fq);
1193 q->fq = NULL;
1183 return -ENOMEM; 1194 return -ENOMEM;
1184} 1195}
1185EXPORT_SYMBOL(blk_init_allocated_queue); 1196EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1641,7 +1652,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
1641 blk_delete_timer(rq); 1652 blk_delete_timer(rq);
1642 blk_clear_rq_complete(rq); 1653 blk_clear_rq_complete(rq);
1643 trace_block_rq_requeue(q, rq); 1654 trace_block_rq_requeue(q, rq);
1644 wbt_requeue(q->rq_wb, rq); 1655 rq_qos_requeue(q, rq);
1645 1656
1646 if (rq->rq_flags & RQF_QUEUED) 1657 if (rq->rq_flags & RQF_QUEUED)
1647 blk_queue_end_tag(q, rq); 1658 blk_queue_end_tag(q, rq);
@@ -1748,7 +1759,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1748 /* this is a bio leak */ 1759 /* this is a bio leak */
1749 WARN_ON(req->bio != NULL); 1760 WARN_ON(req->bio != NULL);
1750 1761
1751 wbt_done(q->rq_wb, req); 1762 rq_qos_done(q, req);
1752 1763
1753 /* 1764 /*
1754 * Request may not have originated from ll_rw_blk. if not, 1765 * Request may not have originated from ll_rw_blk. if not,
@@ -1982,7 +1993,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
1982 int where = ELEVATOR_INSERT_SORT; 1993 int where = ELEVATOR_INSERT_SORT;
1983 struct request *req, *free; 1994 struct request *req, *free;
1984 unsigned int request_count = 0; 1995 unsigned int request_count = 0;
1985 unsigned int wb_acct;
1986 1996
1987 /* 1997 /*
1988 * low level driver can indicate that it wants pages above a 1998 * low level driver can indicate that it wants pages above a
@@ -2040,7 +2050,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
2040 } 2050 }
2041 2051
2042get_rq: 2052get_rq:
2043 wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); 2053 rq_qos_throttle(q, bio, q->queue_lock);
2044 2054
2045 /* 2055 /*
2046 * Grab a free request. This might sleep but cannot fail. 2056 * Grab a free request. This might sleep but cannot fail.
@@ -2050,7 +2060,7 @@ get_rq:
2050 req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); 2060 req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
2051 if (IS_ERR(req)) { 2061 if (IS_ERR(req)) {
2052 blk_queue_exit(q); 2062 blk_queue_exit(q);
2053 __wbt_done(q->rq_wb, wb_acct); 2063 rq_qos_cleanup(q, bio);
2054 if (PTR_ERR(req) == -ENOMEM) 2064 if (PTR_ERR(req) == -ENOMEM)
2055 bio->bi_status = BLK_STS_RESOURCE; 2065 bio->bi_status = BLK_STS_RESOURCE;
2056 else 2066 else
@@ -2059,7 +2069,7 @@ get_rq:
2059 goto out_unlock; 2069 goto out_unlock;
2060 } 2070 }
2061 2071
2062 wbt_track(req, wb_acct); 2072 rq_qos_track(q, req, bio);
2063 2073
2064 /* 2074 /*
2065 * After dropping the lock and possibly sleeping here, our request 2075 * After dropping the lock and possibly sleeping here, our request
@@ -2700,13 +2710,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
2700void blk_account_io_completion(struct request *req, unsigned int bytes) 2710void blk_account_io_completion(struct request *req, unsigned int bytes)
2701{ 2711{
2702 if (blk_do_io_stat(req)) { 2712 if (blk_do_io_stat(req)) {
2703 const int rw = rq_data_dir(req); 2713 const int sgrp = op_stat_group(req_op(req));
2704 struct hd_struct *part; 2714 struct hd_struct *part;
2705 int cpu; 2715 int cpu;
2706 2716
2707 cpu = part_stat_lock(); 2717 cpu = part_stat_lock();
2708 part = req->part; 2718 part = req->part;
2709 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 2719 part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
2710 part_stat_unlock(); 2720 part_stat_unlock();
2711 } 2721 }
2712} 2722}
@@ -2720,7 +2730,7 @@ void blk_account_io_done(struct request *req, u64 now)
2720 */ 2730 */
2721 if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { 2731 if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
2722 unsigned long duration; 2732 unsigned long duration;
2723 const int rw = rq_data_dir(req); 2733 const int sgrp = op_stat_group(req_op(req));
2724 struct hd_struct *part; 2734 struct hd_struct *part;
2725 int cpu; 2735 int cpu;
2726 2736
@@ -2728,10 +2738,10 @@ void blk_account_io_done(struct request *req, u64 now)
2728 cpu = part_stat_lock(); 2738 cpu = part_stat_lock();
2729 part = req->part; 2739 part = req->part;
2730 2740
2731 part_stat_inc(cpu, part, ios[rw]); 2741 part_stat_inc(cpu, part, ios[sgrp]);
2732 part_stat_add(cpu, part, ticks[rw], duration); 2742 part_stat_add(cpu, part, ticks[sgrp], duration);
2733 part_round_stats(req->q, cpu, part); 2743 part_round_stats(req->q, cpu, part);
2734 part_dec_in_flight(req->q, part, rw); 2744 part_dec_in_flight(req->q, part, rq_data_dir(req));
2735 2745
2736 hd_struct_put(part); 2746 hd_struct_put(part);
2737 part_stat_unlock(); 2747 part_stat_unlock();
@@ -2751,9 +2761,9 @@ static bool blk_pm_allow_request(struct request *rq)
2751 return rq->rq_flags & RQF_PM; 2761 return rq->rq_flags & RQF_PM;
2752 case RPM_SUSPENDED: 2762 case RPM_SUSPENDED:
2753 return false; 2763 return false;
2764 default:
2765 return true;
2754 } 2766 }
2755
2756 return true;
2757} 2767}
2758#else 2768#else
2759static bool blk_pm_allow_request(struct request *rq) 2769static bool blk_pm_allow_request(struct request *rq)
@@ -2980,7 +2990,7 @@ void blk_start_request(struct request *req)
2980 req->throtl_size = blk_rq_sectors(req); 2990 req->throtl_size = blk_rq_sectors(req);
2981#endif 2991#endif
2982 req->rq_flags |= RQF_STATS; 2992 req->rq_flags |= RQF_STATS;
2983 wbt_issue(req->q->rq_wb, req); 2993 rq_qos_issue(req->q, req);
2984 } 2994 }
2985 2995
2986 BUG_ON(blk_rq_is_complete(req)); 2996 BUG_ON(blk_rq_is_complete(req));
@@ -3053,6 +3063,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
3053 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 3063 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
3054 * %false return from this function. 3064 * %false return from this function.
3055 * 3065 *
3066 * Note:
3067 * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
3068 * blk_rq_bytes() and in blk_update_request().
3069 *
3056 * Return: 3070 * Return:
3057 * %false - this request doesn't have any more data 3071 * %false - this request doesn't have any more data
3058 * %true - this request has more data 3072 * %true - this request has more data
@@ -3200,7 +3214,7 @@ void blk_finish_request(struct request *req, blk_status_t error)
3200 blk_account_io_done(req, now); 3214 blk_account_io_done(req, now);
3201 3215
3202 if (req->end_io) { 3216 if (req->end_io) {
3203 wbt_done(req->q->rq_wb, req); 3217 rq_qos_done(q, req);
3204 req->end_io(req, error); 3218 req->end_io(req, error);
3205 } else { 3219 } else {
3206 if (blk_bidi_rq(req)) 3220 if (blk_bidi_rq(req))
@@ -3763,9 +3777,11 @@ EXPORT_SYMBOL(blk_finish_plug);
3763 */ 3777 */
3764void blk_pm_runtime_init(struct request_queue *q, struct device *dev) 3778void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
3765{ 3779{
3766 /* not support for RQF_PM and ->rpm_status in blk-mq yet */ 3780 /* Don't enable runtime PM for blk-mq until it is ready */
3767 if (q->mq_ops) 3781 if (q->mq_ops) {
3782 pm_runtime_disable(dev);
3768 return; 3783 return;
3784 }
3769 3785
3770 q->dev = dev; 3786 q->dev = dev;
3771 q->rpm_status = RPM_ACTIVE; 3787 q->rpm_status = RPM_ACTIVE;
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index f23311e4b201..01580f88fcb3 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
278 atomic_set(&ioc->nr_tasks, 1); 278 atomic_set(&ioc->nr_tasks, 1);
279 atomic_set(&ioc->active_ref, 1); 279 atomic_set(&ioc->active_ref, 1);
280 spin_lock_init(&ioc->lock); 280 spin_lock_init(&ioc->lock);
281 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); 281 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
282 INIT_HLIST_HEAD(&ioc->icq_list); 282 INIT_HLIST_HEAD(&ioc->icq_list);
283 INIT_WORK(&ioc->release_work, ioc_release_fn); 283 INIT_WORK(&ioc->release_work, ioc_release_fn);
284 284
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
new file mode 100644
index 000000000000..19923f8a029d
--- /dev/null
+++ b/block/blk-iolatency.c
@@ -0,0 +1,955 @@
1/*
2 * Block rq-qos base io controller
3 *
4 * This works similarly to wbt, with a few exceptions:
5 *
6 * - It's bio based, so the latency covers the whole block layer in addition to
7 * the actual io.
8 * - We will throttle all IO that comes in here if we need to.
9 * - We use the mean latency over the 100ms window. This is because writes can
10 * be particularly fast, which could give us a false sense of the impact of
11 * other workloads on our protected workload.
12 * - By default there's no throttling: we set the queue_depth to UINT_MAX so
13 * that we can have as many outstanding bios as we're allowed to. Only at
14 * throttle time do we pay attention to the actual queue depth.
15 *
16 * The hierarchy works like the cpu controller does: we track the latency at
17 * every configured node, and each configured node has its own independent
18 * queue depth. This means that we only care about our latency targets at the
19 * peer level. Some group at the bottom of the hierarchy isn't going to affect
20 * a group at the end of some other path if we're only configured at leaf level.
21 *
22 * Consider the following
23 *
24 * root blkg
25 * / \
26 * fast (target=5ms) slow (target=10ms)
27 * / \ / \
28 * a b normal(15ms) unloved
29 *
30 * "a" and "b" have no target, but their combined io under "fast" cannot exceed
31 * an average latency of 5ms. If it does then we will throttle the "slow"
32 * group. In the case of "normal", if it exceeds its 15ms target, we will
33 * throttle "unloved", but nobody else.
34 *
35 * In this example "fast", "slow", and "normal" will be the only groups actually
36 * accounting their io latencies. We have to walk up the hierarchy to the root
37 * on every submit and complete so we can do the appropriate stat recording and
38 * adjust the queue depth of ourselves if needed.
39 *
40 * There are 2 ways we throttle IO.
41 *
42 * 1) Queue depth throttling. As we throttle down we will adjust the maximum
43 * number of IOs we're allowed to have in flight. This starts at UINT_MAX down
44 * to 1. If the group is only ever submitting IO for itself then this is the
45 * only way we throttle.
46 *
47 * 2) Induced delay throttling. This is for the case that a group is generating
48 * IO that has to be issued by the root cg to avoid priority inversion. So think
49 * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot
50 * of work done for us on behalf of the root cg and are being asked to scale
51 * down more, we induce a latency at userspace return. We accumulate the
52 * total amount of time we need to be punished by doing
53 *
54 * total_time += min_lat_nsec - actual_io_completion
55 *
56 * and then at throttle time will do
57 *
58 * throttle_time = min(total_time, NSEC_PER_SEC)
59 *
60 * This induced delay will throttle back the activity that is generating the
61 * root cg issued IOs, whether that's some metadata intensive operation or the
62 * group is using so much memory that it is pushing us into swap.
63 *
64 * Copyright (C) 2018 Josef Bacik
65 */
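
To make the induced-delay accounting above concrete, with illustrative numbers: say min_lat_nsec is 10ms and a root-issued io completes in 2ms while the group is pinned at qd == 1; total_time grows by 10 - 2 = 8ms. After 40 such IOs the group owes 320ms, so at the next return to userspace the task sleeps min(320ms, NSEC_PER_SEC) = 320ms (the generic helper in blk-cgroup.c additionally clamps any single sleep to 250ms).
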
66#include <linux/kernel.h>
67#include <linux/blk_types.h>
68#include <linux/backing-dev.h>
69#include <linux/module.h>
70#include <linux/timer.h>
71#include <linux/memcontrol.h>
72#include <linux/sched/loadavg.h>
73#include <linux/sched/signal.h>
74#include <trace/events/block.h>
75#include "blk-rq-qos.h"
76#include "blk-stat.h"
77
78#define DEFAULT_SCALE_COOKIE 1000000U
79
80static struct blkcg_policy blkcg_policy_iolatency;
81struct iolatency_grp;
82
83struct blk_iolatency {
84 struct rq_qos rqos;
85 struct timer_list timer;
86 atomic_t enabled;
87};
88
89static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
90{
91 return container_of(rqos, struct blk_iolatency, rqos);
92}
93
94static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
95{
96 return atomic_read(&blkiolat->enabled) > 0;
97}
98
99struct child_latency_info {
100 spinlock_t lock;
101
102 /* Last time we adjusted the scale of everybody. */
103 u64 last_scale_event;
104
105 /* The latency that we missed. */
106 u64 scale_lat;
107
108 /* Total io's from all of our children for the last summation. */
109 u64 nr_samples;
110
111 /* The guy who actually changed the latency numbers. */
112 struct iolatency_grp *scale_grp;
113
114 /* Cookie to tell if we need to scale up or down. */
115 atomic_t scale_cookie;
116};
117
118struct iolatency_grp {
119 struct blkg_policy_data pd;
120 struct blk_rq_stat __percpu *stats;
121 struct blk_iolatency *blkiolat;
122 struct rq_depth rq_depth;
123 struct rq_wait rq_wait;
124 atomic64_t window_start;
125 atomic_t scale_cookie;
126 u64 min_lat_nsec;
127 u64 cur_win_nsec;
128
129 /* total running average of our io latency. */
130 u64 lat_avg;
131
132 /* Our current number of IO's for the last summation. */
133 u64 nr_samples;
134
135 struct child_latency_info child_lat;
136};
137
138#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
139#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
140/*
141 * These are the constants used to fake the fixed-point moving average
142 * calculation just like load average. The call to CALC_LOAD folds
143 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
144 * window size is bucketed to try to approximately calculate average
145 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
146 * elapse immediately. Note, windows only elapse with IO activity. Idle
147 * periods extend the most recent window.
148 */
149#define BLKIOLATENCY_NR_EXP_FACTORS 5
150#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
151 (BLKIOLATENCY_NR_EXP_FACTORS - 1))
152static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
153 2045, // exp(1/600) - 600 samples
154 2039, // exp(1/240) - 240 samples
155 2031, // exp(1/120) - 120 samples
156 2023, // exp(1/80) - 80 samples
157 2014, // exp(1/60) - 60 samples
158};
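
These factors drive a fixed-point exponential moving average, the same scheme the load average uses. A standalone sketch of one step, assuming the FSHIFT = 11 / FIXED_1 = 2048 definitions from <linux/sched/loadavg.h>, written as a function instead of the kernel's in-place CALC_LOAD macro:

#include <stdint.h>
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1ULL << FSHIFT)	/* 2048 */

/* avg = (avg * exp + sample * (FIXED_1 - exp)) / FIXED_1 */
static uint64_t calc_load_step(uint64_t avg, uint64_t exp, uint64_t sample)
{
	avg *= exp;
	avg += sample * (FIXED_1 - exp);
	return avg >> FSHIFT;
}

int main(void)
{
	uint64_t lat_avg = 5000000;	/* 5ms running average, in ns */
	uint64_t sample  = 9000000;	/* 9ms mean for the last window */

	/* 2045 ~= 2048 * exp(-1/600): a 600-sample decay horizon. */
	lat_avg = calc_load_step(lat_avg, 2045, sample);
	printf("%llu\n", (unsigned long long)lat_avg);	/* 5005859 */
	return 0;
}
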
159
160static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
161{
162 return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
163}
164
165static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
166{
167 return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
168}
169
170static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
171{
172 return pd_to_blkg(&iolat->pd);
173}
174
175static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
176 wait_queue_entry_t *wait,
177 bool first_block)
178{
179 struct rq_wait *rqw = &iolat->rq_wait;
180
181 if (first_block && waitqueue_active(&rqw->wait) &&
182 rqw->wait.head.next != &wait->entry)
183 return false;
184 return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
185}
186
187static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
188 struct iolatency_grp *iolat,
189 spinlock_t *lock, bool issue_as_root,
190 bool use_memdelay)
191 __releases(lock)
192 __acquires(lock)
193{
194 struct rq_wait *rqw = &iolat->rq_wait;
195 unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
196 DEFINE_WAIT(wait);
197 bool first_block = true;
198
199 if (use_delay)
200 blkcg_schedule_throttle(rqos->q, use_memdelay);
201
202 /*
203 * To avoid priority inversions we want to just take a slot if we are
204 * issuing as root. If we're being killed off there's no point in
205 * delaying things: we may have been killed by the OOM killer, and
206 * throttling would make recovery take even longer, so just let the
207 * IOs through and let the task go away.
208 */
209 if (issue_as_root || fatal_signal_pending(current)) {
210 atomic_inc(&rqw->inflight);
211 return;
212 }
213
214 if (iolatency_may_queue(iolat, &wait, first_block))
215 return;
216
217 do {
218 prepare_to_wait_exclusive(&rqw->wait, &wait,
219 TASK_UNINTERRUPTIBLE);
220
221 if (iolatency_may_queue(iolat, &wait, first_block))
222 break;
223 first_block = false;
224
225 if (lock) {
226 spin_unlock_irq(lock);
227 io_schedule();
228 spin_lock_irq(lock);
229 } else {
230 io_schedule();
231 }
232 } while (1);
233
234 finish_wait(&rqw->wait, &wait);
235}
236
237#define SCALE_DOWN_FACTOR 2
238#define SCALE_UP_FACTOR 4
239
240static inline unsigned long scale_amount(unsigned long qd, bool up)
241{
242 return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
243}
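
For example, with blk_queue_depth() == 64 this returns max(64 >> 4, 1) = 4 for a scale-up step and max(64 >> 2, 1) = 16 for a scale-down step, so the cookie is pushed away from DEFAULT_SCALE_COOKIE four times faster than it is allowed to drift back.
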
244
245/*
246 * We scale the qd down faster than we scale up, so we need to use this helper
247 * to adjust the scale_cookie accordingly so we don't prematurely get
248 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
249 *
250 * Each group has their own local copy of the last scale cookie they saw, so if
251 * the global scale cookie goes up or down they know which way they need to go
252 * based on their last knowledge of it.
253 */
254static void scale_cookie_change(struct blk_iolatency *blkiolat,
255 struct child_latency_info *lat_info,
256 bool up)
257{
258 unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
259 unsigned long scale = scale_amount(qd, up);
260 unsigned long old = atomic_read(&lat_info->scale_cookie);
261 unsigned long max_scale = qd << 1;
262 unsigned long diff = 0;
263
264 if (old < DEFAULT_SCALE_COOKIE)
265 diff = DEFAULT_SCALE_COOKIE - old;
266
267 if (up) {
268 if (scale + old > DEFAULT_SCALE_COOKIE)
269 atomic_set(&lat_info->scale_cookie,
270 DEFAULT_SCALE_COOKIE);
271 else if (diff > qd)
272 atomic_inc(&lat_info->scale_cookie);
273 else
274 atomic_add(scale, &lat_info->scale_cookie);
275 } else {
276 /*
277 * We don't want to dig a hole so deep that it takes us hours to
278 * dig out of it. Just enough that we don't throttle/unthrottle
279 * with jagged workloads but can still unthrottle once pressure
280 * has sufficiently dissipated.
281 */
282 if (diff > qd) {
283 if (diff < max_scale)
284 atomic_dec(&lat_info->scale_cookie);
285 } else {
286 atomic_sub(scale, &lat_info->scale_cookie);
287 }
288 }
289}
290
291/*
292 * Change the queue depth of the iolatency_grp. We add 1/16th of the queue
293 * depth when scaling up, and halve it when scaling down, so we don't get
294 * wild swings and hopefully dial in to fairer overall queue depth sharing.
295 */
296static void scale_change(struct iolatency_grp *iolat, bool up)
297{
298 unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
299 unsigned long scale = scale_amount(qd, up);
300 unsigned long old = iolat->rq_depth.max_depth;
301 bool changed = false;
302
303 if (old > qd)
304 old = qd;
305
306 if (up) {
307 if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
308 return;
309
310 if (old < qd) {
311 changed = true;
312 old += scale;
313 old = min(old, qd);
314 iolat->rq_depth.max_depth = old;
315 wake_up_all(&iolat->rq_wait.wait);
316 }
317 } else if (old > 1) {
318 old >>= 1;
319 changed = true;
320 iolat->rq_depth.max_depth = max(old, 1UL);
321 }
322}
323
324/* Check our parent and see if the scale cookie has changed. */
325static void check_scale_change(struct iolatency_grp *iolat)
326{
327 struct iolatency_grp *parent;
328 struct child_latency_info *lat_info;
329 unsigned int cur_cookie;
330 unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
331 u64 scale_lat;
332 unsigned int old;
333 int direction = 0;
334
335 if (lat_to_blkg(iolat)->parent == NULL)
336 return;
337
338 parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
339 if (!parent)
340 return;
341
342 lat_info = &parent->child_lat;
343 cur_cookie = atomic_read(&lat_info->scale_cookie);
344 scale_lat = READ_ONCE(lat_info->scale_lat);
345
346 if (cur_cookie < our_cookie)
347 direction = -1;
348 else if (cur_cookie > our_cookie)
349 direction = 1;
350 else
351 return;
352
353 old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
354
355 /* Somebody beat us to the punch, just bail. */
356 if (old != our_cookie)
357 return;
358
359 if (direction < 0 && iolat->min_lat_nsec) {
360 u64 samples_thresh;
361
362 if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
363 return;
364
365 /*
366 * Sometimes high priority groups are their own worst enemy, so
367 * instead of taking it out on some poor other group that did 5%
368 * or less of the IOs for the last summation, just skip this
369 * scale down event.
370 */
371 samples_thresh = lat_info->nr_samples * 5;
372 samples_thresh = div64_u64(samples_thresh, 100);
373 if (iolat->nr_samples <= samples_thresh)
374 return;
375 }
376
377 /* We're as low as we can go. */
378 if (iolat->rq_depth.max_depth == 1 && direction < 0) {
379 blkcg_use_delay(lat_to_blkg(iolat));
380 return;
381 }
382
383 /* We're back to the default cookie, unthrottle all the things. */
384 if (cur_cookie == DEFAULT_SCALE_COOKIE) {
385 blkcg_clear_delay(lat_to_blkg(iolat));
386 iolat->rq_depth.max_depth = UINT_MAX;
387 wake_up_all(&iolat->rq_wait.wait);
388 return;
389 }
390
391 scale_change(iolat, direction > 0);
392}
393
394static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
395 spinlock_t *lock)
396{
397 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
398 struct blkcg *blkcg;
399 struct blkcg_gq *blkg;
400 struct request_queue *q = rqos->q;
401 bool issue_as_root = bio_issue_as_root_blkg(bio);
402
403 if (!blk_iolatency_enabled(blkiolat))
404 return;
405
406 rcu_read_lock();
407 blkcg = bio_blkcg(bio);
408 bio_associate_blkcg(bio, &blkcg->css);
409 blkg = blkg_lookup(blkcg, q);
410 if (unlikely(!blkg)) {
411 if (!lock)
412 spin_lock_irq(q->queue_lock);
413 blkg = blkg_lookup_create(blkcg, q);
414 if (IS_ERR(blkg))
415 blkg = NULL;
416 if (!lock)
417 spin_unlock_irq(q->queue_lock);
418 }
419 if (!blkg)
420 goto out;
421
422 bio_issue_init(&bio->bi_issue, bio_sectors(bio));
423 bio_associate_blkg(bio, blkg);
424out:
425 rcu_read_unlock();
426 while (blkg && blkg->parent) {
427 struct iolatency_grp *iolat = blkg_to_lat(blkg);
428 if (!iolat) {
429 blkg = blkg->parent;
430 continue;
431 }
432
433 check_scale_change(iolat);
434 __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
435 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
436 blkg = blkg->parent;
437 }
438 if (!timer_pending(&blkiolat->timer))
439 mod_timer(&blkiolat->timer, jiffies + HZ);
440}
441
442static void iolatency_record_time(struct iolatency_grp *iolat,
443 struct bio_issue *issue, u64 now,
444 bool issue_as_root)
445{
446 struct blk_rq_stat *rq_stat;
447 u64 start = bio_issue_time(issue);
448 u64 req_time;
449
450 /*
451 * Truncate 'now' to the same granularity that the issue time was
452 * truncated to, so the two timestamps are directly comparable.
453 */
454 now = __bio_issue_time(now);
455
456 if (now <= start)
457 return;
458
459 req_time = now - start;
460
461 /*
461 * We don't want to count issue_as_root bios in the cgroup's latency
462 * statistics, as they could skew the numbers downwards.
464 */
465 if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
466 u64 sub = iolat->min_lat_nsec;
467 if (req_time < sub)
468 blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
469 return;
470 }
471
472 rq_stat = get_cpu_ptr(iolat->stats);
473 blk_rq_stat_add(rq_stat, req_time);
474 put_cpu_ptr(rq_stat);
475}
476
477#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
478#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
479
480static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
481{
482 struct blkcg_gq *blkg = lat_to_blkg(iolat);
483 struct iolatency_grp *parent;
484 struct child_latency_info *lat_info;
485 struct blk_rq_stat stat;
486 unsigned long flags;
487 int cpu, exp_idx;
488
489 blk_rq_stat_init(&stat);
490 preempt_disable();
491 for_each_online_cpu(cpu) {
492 struct blk_rq_stat *s;
493 s = per_cpu_ptr(iolat->stats, cpu);
494 blk_rq_stat_sum(&stat, s);
495 blk_rq_stat_init(s);
496 }
497 preempt_enable();
498
499 parent = blkg_to_lat(blkg->parent);
500 if (!parent)
501 return;
502
503 lat_info = &parent->child_lat;
504
505 /*
506 * CALC_LOAD takes in a number stored in fixed point representation.
507 * Because we are using this for IO time in ns, the values stored
508 * are significantly larger than the FIXED_1 denominator (2048).
509 * Therefore, rounding errors in the calculation are negligible and
510 * can be ignored.
511 */
512 exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
513 div64_u64(iolat->cur_win_nsec,
514 BLKIOLATENCY_EXP_BUCKET_SIZE));
515 CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
516
517 /* Everything is ok and we don't need to adjust the scale. */
518 if (stat.mean <= iolat->min_lat_nsec &&
519 atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
520 return;
521
522 /* Somebody beat us to the punch, just bail. */
523 spin_lock_irqsave(&lat_info->lock, flags);
524 lat_info->nr_samples -= iolat->nr_samples;
525 lat_info->nr_samples += stat.nr_samples;
526 iolat->nr_samples = stat.nr_samples;
527
528 if ((lat_info->last_scale_event >= now ||
529 now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
530 lat_info->scale_lat <= iolat->min_lat_nsec)
531 goto out;
532
533 if (stat.mean <= iolat->min_lat_nsec &&
534 stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
535 if (lat_info->scale_grp == iolat) {
536 lat_info->last_scale_event = now;
537 scale_cookie_change(iolat->blkiolat, lat_info, true);
538 }
539 } else if (stat.mean > iolat->min_lat_nsec) {
540 lat_info->last_scale_event = now;
541 if (!lat_info->scale_grp ||
542 lat_info->scale_lat > iolat->min_lat_nsec) {
543 WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
544 lat_info->scale_grp = iolat;
545 }
546 scale_cookie_change(iolat->blkiolat, lat_info, false);
547 }
548out:
549 spin_unlock_irqrestore(&lat_info->lock, flags);
550}
551
552static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
553{
554 struct blkcg_gq *blkg;
555 struct rq_wait *rqw;
556 struct iolatency_grp *iolat;
557 u64 window_start;
558 u64 now = ktime_to_ns(ktime_get());
559 bool issue_as_root = bio_issue_as_root_blkg(bio);
560 bool enabled = false;
561
562 blkg = bio->bi_blkg;
563 if (!blkg)
564 return;
565
566 iolat = blkg_to_lat(bio->bi_blkg);
567 if (!iolat)
568 return;
569
570 enabled = blk_iolatency_enabled(iolat->blkiolat);
571 while (blkg && blkg->parent) {
572 iolat = blkg_to_lat(blkg);
573 if (!iolat) {
574 blkg = blkg->parent;
575 continue;
576 }
577 rqw = &iolat->rq_wait;
578
579 atomic_dec(&rqw->inflight);
580 if (!enabled || iolat->min_lat_nsec == 0)
581 goto next;
582 iolatency_record_time(iolat, &bio->bi_issue, now,
583 issue_as_root);
584 window_start = atomic64_read(&iolat->window_start);
585 if (now > window_start &&
586 (now - window_start) >= iolat->cur_win_nsec) {
587 if (atomic64_cmpxchg(&iolat->window_start,
588 window_start, now) == window_start)
589 iolatency_check_latencies(iolat, now);
590 }
591next:
592 wake_up(&rqw->wait);
593 blkg = blkg->parent;
594 }
595}
596
597static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
598{
599 struct blkcg_gq *blkg;
600
601 blkg = bio->bi_blkg;
602 while (blkg && blkg->parent) {
603 struct rq_wait *rqw;
604 struct iolatency_grp *iolat;
605
606 iolat = blkg_to_lat(blkg);
607 if (!iolat)
608 goto next;
609
610 rqw = &iolat->rq_wait;
611 atomic_dec(&rqw->inflight);
612 wake_up(&rqw->wait);
613next:
614 blkg = blkg->parent;
615 }
616}
617
618static void blkcg_iolatency_exit(struct rq_qos *rqos)
619{
620 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
621
622 del_timer_sync(&blkiolat->timer);
623 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
624 kfree(blkiolat);
625}
626
627static struct rq_qos_ops blkcg_iolatency_ops = {
628 .throttle = blkcg_iolatency_throttle,
629 .cleanup = blkcg_iolatency_cleanup,
630 .done_bio = blkcg_iolatency_done_bio,
631 .exit = blkcg_iolatency_exit,
632};
633
634static void blkiolatency_timer_fn(struct timer_list *t)
635{
636 struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
637 struct blkcg_gq *blkg;
638 struct cgroup_subsys_state *pos_css;
639 u64 now = ktime_to_ns(ktime_get());
640
641 rcu_read_lock();
642 blkg_for_each_descendant_pre(blkg, pos_css,
643 blkiolat->rqos.q->root_blkg) {
644 struct iolatency_grp *iolat;
645 struct child_latency_info *lat_info;
646 unsigned long flags;
647 u64 cookie;
648
649 /*
650 * We could be exiting, don't access the pd unless we have a
651 * ref on the blkg.
652 */
653 if (!blkg_try_get(blkg))
654 continue;
655
656 iolat = blkg_to_lat(blkg);
657 if (!iolat)
658 goto next;
659
660 lat_info = &iolat->child_lat;
661 cookie = atomic_read(&lat_info->scale_cookie);
662
663 if (cookie >= DEFAULT_SCALE_COOKIE)
664 goto next;
665
666 spin_lock_irqsave(&lat_info->lock, flags);
667 if (lat_info->last_scale_event >= now)
668 goto next_lock;
669
670 /*
671 * We scaled down but don't have a scale_grp, scale up and carry
672 * on.
673 */
674 if (lat_info->scale_grp == NULL) {
675 scale_cookie_change(iolat->blkiolat, lat_info, true);
676 goto next_lock;
677 }
678
679 /*
680 * It's been 5 seconds since our last scale event, clear the
681 * scale grp in case the group that needed the scale down isn't
682 * doing any IO currently.
683 */
684 if (now - lat_info->last_scale_event >=
685 ((u64)NSEC_PER_SEC * 5))
686 lat_info->scale_grp = NULL;
687next_lock:
688 spin_unlock_irqrestore(&lat_info->lock, flags);
689next:
690 blkg_put(blkg);
691 }
692 rcu_read_unlock();
693}
694
695int blk_iolatency_init(struct request_queue *q)
696{
697 struct blk_iolatency *blkiolat;
698 struct rq_qos *rqos;
699 int ret;
700
701 blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
702 if (!blkiolat)
703 return -ENOMEM;
704
705 rqos = &blkiolat->rqos;
706 rqos->id = RQ_QOS_CGROUP;
707 rqos->ops = &blkcg_iolatency_ops;
708 rqos->q = q;
709
710 rq_qos_add(q, rqos);
711
712 ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
713 if (ret) {
714 rq_qos_del(q, rqos);
715 kfree(blkiolat);
716 return ret;
717 }
718
719 timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
720
721 return 0;
722}
723
724static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
725{
726 struct iolatency_grp *iolat = blkg_to_lat(blkg);
727 struct blk_iolatency *blkiolat = iolat->blkiolat;
728 u64 oldval = iolat->min_lat_nsec;
729
730 iolat->min_lat_nsec = val;
731 iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
732 iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
733 BLKIOLATENCY_MAX_WIN_SIZE);
734
735 if (!oldval && val)
736 atomic_inc(&blkiolat->enabled);
737 if (oldval && !val)
738 atomic_dec(&blkiolat->enabled);
739}
740
741static void iolatency_clear_scaling(struct blkcg_gq *blkg)
742{
743 if (blkg->parent) {
744 struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
745 struct child_latency_info *lat_info;
746 if (!iolat)
747 return;
748
749 lat_info = &iolat->child_lat;
750 spin_lock(&lat_info->lock);
751 atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
752 lat_info->last_scale_event = 0;
753 lat_info->scale_grp = NULL;
754 lat_info->scale_lat = 0;
755 spin_unlock(&lat_info->lock);
756 }
757}
758
759static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
760 size_t nbytes, loff_t off)
761{
762 struct blkcg *blkcg = css_to_blkcg(of_css(of));
763 struct blkcg_gq *blkg;
764 struct blk_iolatency *blkiolat;
765 struct blkg_conf_ctx ctx;
766 struct iolatency_grp *iolat;
767 char *p, *tok;
768 u64 lat_val = 0;
769 u64 oldval;
770 int ret;
771
772 ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
773 if (ret)
774 return ret;
775
776 iolat = blkg_to_lat(ctx.blkg);
777 blkiolat = iolat->blkiolat;
778 p = ctx.body;
779
780 ret = -EINVAL;
781 while ((tok = strsep(&p, " "))) {
782 char key[16];
783 char val[21]; /* 18446744073709551616 */
784
785 if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
786 goto out;
787
788 if (!strcmp(key, "target")) {
789 u64 v;
790
791 if (!strcmp(val, "max"))
792 lat_val = 0;
793 else if (sscanf(val, "%llu", &v) == 1)
794 lat_val = v * NSEC_PER_USEC;
795 else
796 goto out;
797 } else {
798 goto out;
799 }
800 }
801
802 /* Update our target and reset any stale scaling state if it changed. */
803 blkg = ctx.blkg;
804 oldval = iolat->min_lat_nsec;
805
806 iolatency_set_min_lat_nsec(blkg, lat_val);
807 if (oldval != iolat->min_lat_nsec) {
808 iolatency_clear_scaling(blkg);
809 }
810
811 ret = 0;
812out:
813 blkg_conf_finish(&ctx);
814 return ret ?: nbytes;
815}
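
For reference, this parses writes to the cgroup-v2 io.latency file. Assuming a hypothetical device at 8:16, setting a 3ms target and then clearing it would look like:

	echo "8:16 target=3000" > io.latency
	echo "8:16 target=max" > io.latency

The value is taken in microseconds, per the NSEC_PER_USEC conversion above.
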
816
817static u64 iolatency_prfill_limit(struct seq_file *sf,
818 struct blkg_policy_data *pd, int off)
819{
820 struct iolatency_grp *iolat = pd_to_lat(pd);
821 const char *dname = blkg_dev_name(pd->blkg);
822
823 if (!dname || !iolat->min_lat_nsec)
824 return 0;
825 seq_printf(sf, "%s target=%llu\n",
826 dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
827 return 0;
828}
829
830static int iolatency_print_limit(struct seq_file *sf, void *v)
831{
832 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
833 iolatency_prfill_limit,
834 &blkcg_policy_iolatency, seq_cft(sf)->private, false);
835 return 0;
836}
837
838static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
839 size_t size)
840{
841 struct iolatency_grp *iolat = pd_to_lat(pd);
842 unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
843 unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
844
845 if (iolat->rq_depth.max_depth == UINT_MAX)
846 return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
847 avg_lat, cur_win);
848
849 return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
850 iolat->rq_depth.max_depth, avg_lat, cur_win);
851}
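
With the blkcg_debug_stats module parameter enabled, this appends fields like " depth=12 avg_lat=2400 win=100" (illustrative values) to the device's io.stat line: depth is the current queue-depth cap (or "max" when unthrottled), avg_lat is in microseconds, and win is in milliseconds, per the divisions above.
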
852
853
854static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
855{
856 struct iolatency_grp *iolat;
857
858 iolat = kzalloc_node(sizeof(*iolat), gfp, node);
859 if (!iolat)
860 return NULL;
861 iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
862 __alignof__(struct blk_rq_stat), gfp);
863 if (!iolat->stats) {
864 kfree(iolat);
865 return NULL;
866 }
867 return &iolat->pd;
868}
869
870static void iolatency_pd_init(struct blkg_policy_data *pd)
871{
872 struct iolatency_grp *iolat = pd_to_lat(pd);
873 struct blkcg_gq *blkg = lat_to_blkg(iolat);
874 struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
875 struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
876 u64 now = ktime_to_ns(ktime_get());
877 int cpu;
878
879 for_each_possible_cpu(cpu) {
880 struct blk_rq_stat *stat;
881 stat = per_cpu_ptr(iolat->stats, cpu);
882 blk_rq_stat_init(stat);
883 }
884
885 rq_wait_init(&iolat->rq_wait);
886 spin_lock_init(&iolat->child_lat.lock);
887 iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
888 iolat->rq_depth.max_depth = UINT_MAX;
889 iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
890 iolat->blkiolat = blkiolat;
891 iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
892 atomic64_set(&iolat->window_start, now);
893
894 /*
895 * We init things in list order, so the pd for the parent may not be
896 * initialized yet.
897 */
898 if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
899 struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
900 atomic_set(&iolat->scale_cookie,
901 atomic_read(&parent->child_lat.scale_cookie));
902 } else {
903 atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
904 }
905
906 atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
907}
908
909static void iolatency_pd_offline(struct blkg_policy_data *pd)
910{
911 struct iolatency_grp *iolat = pd_to_lat(pd);
912 struct blkcg_gq *blkg = lat_to_blkg(iolat);
913
914 iolatency_set_min_lat_nsec(blkg, 0);
915 iolatency_clear_scaling(blkg);
916}
917
918static void iolatency_pd_free(struct blkg_policy_data *pd)
919{
920 struct iolatency_grp *iolat = pd_to_lat(pd);
921 free_percpu(iolat->stats);
922 kfree(iolat);
923}
924
925static struct cftype iolatency_files[] = {
926 {
927 .name = "latency",
928 .flags = CFTYPE_NOT_ON_ROOT,
929 .seq_show = iolatency_print_limit,
930 .write = iolatency_set_limit,
931 },
932 {}
933};
934
935static struct blkcg_policy blkcg_policy_iolatency = {
936 .dfl_cftypes = iolatency_files,
937 .pd_alloc_fn = iolatency_pd_alloc,
938 .pd_init_fn = iolatency_pd_init,
939 .pd_offline_fn = iolatency_pd_offline,
940 .pd_free_fn = iolatency_pd_free,
941 .pd_stat_fn = iolatency_pd_stat,
942};
943
944static int __init iolatency_init(void)
945{
946 return blkcg_policy_register(&blkcg_policy_iolatency);
947}
948
949static void __exit iolatency_exit(void)
950{
951 return blkcg_policy_unregister(&blkcg_policy_iolatency);
952}
953
954module_init(iolatency_init);
955module_exit(iolatency_exit);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 8faa70f26fcd..d1b9dd03da25 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
68 */ 68 */
69 req_sects = min_t(sector_t, nr_sects, 69 req_sects = min_t(sector_t, nr_sects,
70 q->limits.max_discard_sectors); 70 q->limits.max_discard_sectors);
71 if (!req_sects)
72 goto fail;
71 if (req_sects > UINT_MAX >> 9) 73 if (req_sects > UINT_MAX >> 9)
72 req_sects = UINT_MAX >> 9; 74 req_sects = UINT_MAX >> 9;
73 75
@@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
105 107
106 *biop = bio; 108 *biop = bio;
107 return 0; 109 return 0;
110
111fail:
112 if (bio) {
113 submit_bio_wait(bio);
114 bio_put(bio);
115 }
116 *biop = NULL;
117 return -EOPNOTSUPP;
108} 118}
109EXPORT_SYMBOL(__blkdev_issue_discard); 119EXPORT_SYMBOL(__blkdev_issue_discard);
110 120
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
new file mode 100644
index 000000000000..fb2c82c351e4
--- /dev/null
+++ b/block/blk-mq-debugfs-zoned.c
@@ -0,0 +1,24 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/blkdev.h>
9#include "blk-mq-debugfs.h"
10
11int queue_zone_wlock_show(void *data, struct seq_file *m)
12{
13 struct request_queue *q = data;
14 unsigned int i;
15
16 if (!q->seq_zones_wlock)
17 return 0;
18
19 for (i = 0; i < q->nr_zones; i++)
20 if (test_bit(i, q->seq_zones_wlock))
21 seq_printf(m, "%u\n", i);
22
23 return 0;
24}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 1c4532e92938..cb1e6cf7ac48 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
206 return count; 206 return count;
207} 207}
208 208
209static int queue_zone_wlock_show(void *data, struct seq_file *m)
210{
211 struct request_queue *q = data;
212 unsigned int i;
213
214 if (!q->seq_zones_wlock)
215 return 0;
216
217 for (i = 0; i < blk_queue_nr_zones(q); i++)
218 if (test_bit(i, q->seq_zones_wlock))
219 seq_printf(m, "%u\n", i);
220
221 return 0;
222}
223
224static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { 209static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
225 { "poll_stat", 0400, queue_poll_stat_show }, 210 { "poll_stat", 0400, queue_poll_stat_show },
226 { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, 211 { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
@@ -637,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m)
637 return 0; 622 return 0;
638} 623}
639 624
625static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
626{
627 struct blk_mq_hw_ctx *hctx = data;
628
629 seq_printf(m, "%u\n", hctx->dispatch_busy);
630 return 0;
631}
632
640static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) 633static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
641 __acquires(&ctx->lock) 634 __acquires(&ctx->lock)
642{ 635{
@@ -798,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
798 {"queued", 0600, hctx_queued_show, hctx_queued_write}, 791 {"queued", 0600, hctx_queued_show, hctx_queued_write},
799 {"run", 0600, hctx_run_show, hctx_run_write}, 792 {"run", 0600, hctx_run_show, hctx_run_write},
800 {"active", 0400, hctx_active_show}, 793 {"active", 0400, hctx_active_show},
794 {"dispatch_busy", 0400, hctx_dispatch_busy_show},
801 {}, 795 {},
802}; 796};
803 797
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index b9d366e57097..a9160be12be0 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
80} 80}
81#endif 81#endif
82 82
83#ifdef CONFIG_BLK_DEBUG_FS_ZONED
84int queue_zone_wlock_show(void *data, struct seq_file *m);
85#else
86static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
87{
88 return 0;
89}
90#endif
91
83#endif 92#endif
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index e233996bb76f..db644ec624f5 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -17,6 +17,8 @@
17#include <linux/pci.h> 17#include <linux/pci.h>
18#include <linux/module.h> 18#include <linux/module.h>
19 19
20#include "blk-mq.h"
21
20/** 22/**
21 * blk_mq_pci_map_queues - provide a default queue mapping for PCI device 23 * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
22 * @set: tagset to provide the mapping for 24 * @set: tagset to provide the mapping for
@@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
48 50
49fallback: 51fallback:
50 WARN_ON_ONCE(set->nr_hw_queues > 1); 52 WARN_ON_ONCE(set->nr_hw_queues > 1);
51 for_each_possible_cpu(cpu) 53 blk_mq_clear_mq_map(set);
52 set->mq_map[cpu] = 0;
53 return 0; 54 return 0;
54} 55}
55EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); 56EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 56c493c6cd90..cf9c66c6d35a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
59 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 59 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
60 return; 60 return;
61 61
62 if (hctx->flags & BLK_MQ_F_TAG_SHARED) { 62 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
63 struct request_queue *q = hctx->queue;
64
65 if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
66 atomic_inc(&q->shared_hctx_restart);
67 } else
68 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
69} 63}
70 64
71static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) 65void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
72{ 66{
73 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 67 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
74 return false; 68 return;
75 69 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
76 if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
77 struct request_queue *q = hctx->queue;
78
79 if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
80 atomic_dec(&q->shared_hctx_restart);
81 } else
82 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
83 70
84 return blk_mq_run_hw_queue(hctx, true); 71 blk_mq_run_hw_queue(hctx, true);
85} 72}
86 73
87/* 74/*
@@ -219,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
219 } 206 }
220 } else if (has_sched_dispatch) { 207 } else if (has_sched_dispatch) {
221 blk_mq_do_dispatch_sched(hctx); 208 blk_mq_do_dispatch_sched(hctx);
222 } else if (q->mq_ops->get_budget) { 209 } else if (hctx->dispatch_busy) {
223 /* 210 /* dequeue request one by one from sw queue if queue is busy */
224 * If we need to get budget before queuing request, we
225 * dequeue request one by one from sw queue for avoiding
226 * to mess up I/O merge when dispatch runs out of resource.
227 *
228 * TODO: get more budgets, and dequeue more requests in
229 * one time.
230 */
231 blk_mq_do_dispatch_ctx(hctx); 211 blk_mq_do_dispatch_ctx(hctx);
232 } else { 212 } else {
233 blk_mq_flush_busy_ctxs(hctx, &rq_list); 213 blk_mq_flush_busy_ctxs(hctx, &rq_list);
@@ -339,7 +319,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
339 return e->type->ops.mq.bio_merge(hctx, bio); 319 return e->type->ops.mq.bio_merge(hctx, bio);
340 } 320 }
341 321
342 if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) { 322 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
323 !list_empty_careful(&ctx->rq_list)) {
343 /* default per sw-queue merge */ 324 /* default per sw-queue merge */
344 spin_lock(&ctx->lock); 325 spin_lock(&ctx->lock);
345 ret = blk_mq_attempt_merge(q, ctx, bio); 326 ret = blk_mq_attempt_merge(q, ctx, bio);
@@ -380,68 +361,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
380 return false; 361 return false;
381} 362}
382 363
383/**
384 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
385 * @pos: loop cursor.
386 * @skip: the list element that will not be examined. Iteration starts at
387 * @skip->next.
388 * @head: head of the list to examine. This list must have at least one
389 * element, namely @skip.
390 * @member: name of the list_head structure within typeof(*pos).
391 */
392#define list_for_each_entry_rcu_rr(pos, skip, head, member) \
393 for ((pos) = (skip); \
394 (pos = (pos)->member.next != (head) ? list_entry_rcu( \
395 (pos)->member.next, typeof(*pos), member) : \
396 list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
397 (pos) != (skip); )
398
399/*
400 * Called after a driver tag has been freed to check whether a hctx needs to
401 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
402 * queues in a round-robin fashion if the tag set of @hctx is shared with other
403 * hardware queues.
404 */
405void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
406{
407 struct blk_mq_tags *const tags = hctx->tags;
408 struct blk_mq_tag_set *const set = hctx->queue->tag_set;
409 struct request_queue *const queue = hctx->queue, *q;
410 struct blk_mq_hw_ctx *hctx2;
411 unsigned int i, j;
412
413 if (set->flags & BLK_MQ_F_TAG_SHARED) {
414 /*
415 * If this is 0, then we know that no hardware queues
416 * have RESTART marked. We're done.
417 */
418 if (!atomic_read(&queue->shared_hctx_restart))
419 return;
420
421 rcu_read_lock();
422 list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
423 tag_set_list) {
424 queue_for_each_hw_ctx(q, hctx2, i)
425 if (hctx2->tags == tags &&
426 blk_mq_sched_restart_hctx(hctx2))
427 goto done;
428 }
429 j = hctx->queue_num + 1;
430 for (i = 0; i < queue->nr_hw_queues; i++, j++) {
431 if (j == queue->nr_hw_queues)
432 j = 0;
433 hctx2 = queue->queue_hw_ctx[j];
434 if (hctx2->tags == tags &&
435 blk_mq_sched_restart_hctx(hctx2))
436 break;
437 }
438done:
439 rcu_read_unlock();
440 } else {
441 blk_mq_sched_restart_hctx(hctx);
442 }
443}
444
445void blk_mq_sched_insert_request(struct request *rq, bool at_head, 364void blk_mq_sched_insert_request(struct request *rq, bool at_head,
446 bool run_queue, bool async) 365 bool run_queue, bool async)
447{ 366{
@@ -486,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
486 405
487 if (e && e->type->ops.mq.insert_requests) 406 if (e && e->type->ops.mq.insert_requests)
488 e->type->ops.mq.insert_requests(hctx, list, false); 407 e->type->ops.mq.insert_requests(hctx, list, false);
489 else 408 else {
409 /*
410 * try to issue requests directly if the hw queue isn't
411 * busy in case of 'none' scheduler, and this way may save
412 * us one extra enqueue & dequeue to sw queue.
413 */
414 if (!hctx->dispatch_busy && !e && !run_queue_async) {
415 blk_mq_try_issue_list_directly(hctx, list);
416 if (list_empty(list))
417 return;
418 }
490 blk_mq_insert_requests(hctx, ctx, list); 419 blk_mq_insert_requests(hctx, ctx, list);
420 }
491 421
492 blk_mq_run_hw_queue(hctx, run_queue_async); 422 blk_mq_run_hw_queue(hctx, run_queue_async);
493} 423}
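The bio-merge change above adds an unlocked list_empty_careful() check so ctx->lock is only taken when the sw queue actually has requests to merge against. A hedged userspace sketch of this check-then-lock pattern (hypothetical names, not kernel code; a stale unlocked answer only costs a missed merge, never correctness):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct swq {
	pthread_mutex_t lock;
	atomic_int nr_items;	/* stand-in for ctx->rq_list */
};

static bool try_merge(struct swq *q)
{
	bool merged = false;

	/* Cheap unlocked peek, analogous to list_empty_careful(). */
	if (atomic_load_explicit(&q->nr_items, memory_order_relaxed) == 0)
		return false;

	pthread_mutex_lock(&q->lock);
	if (atomic_load(&q->nr_items) > 0) {	/* re-check under the lock */
		atomic_fetch_sub(&q->nr_items, 1);	/* pretend the bio merged */
		merged = true;
	}
	pthread_mutex_unlock(&q->lock);
	return merged;
}

int main(void)
{
	struct swq q = { PTHREAD_MUTEX_INITIALIZER, 2 };

	printf("%d %d %d\n", try_merge(&q), try_merge(&q), try_merge(&q));
	return 0;
}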
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 3de0836163c2..816923bf874d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -23,6 +23,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
23 23
24/* 24/*
25 * If a previously inactive queue goes active, bump the active user count. 25 * If a previously inactive queue goes active, bump the active user count.
26 * We need to do this before trying to allocate a driver tag, so that
27 * even if the first attempt to get a tag fails, the other shared-tag
28 * users can reserve budget for it.
26 */ 29 */
27bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) 30bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
28{ 31{
@@ -399,8 +402,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
399 if (tdepth <= tags->nr_reserved_tags) 402 if (tdepth <= tags->nr_reserved_tags)
400 return -EINVAL; 403 return -EINVAL;
401 404
402 tdepth -= tags->nr_reserved_tags;
403
404 /* 405 /*
405 * If we are allowed to grow beyond the original size, allocate 406 * If we are allowed to grow beyond the original size, allocate
406 * a new set of tags before freeing the old one. 407 * a new set of tags before freeing the old one.
@@ -420,7 +421,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
420 if (tdepth > 16 * BLKDEV_MAX_RQ) 421 if (tdepth > 16 * BLKDEV_MAX_RQ)
421 return -EINVAL; 422 return -EINVAL;
422 423
423 new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); 424 new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
425 tags->nr_reserved_tags);
424 if (!new) 426 if (!new)
425 return -ENOMEM; 427 return -ENOMEM;
426 ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); 428 ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
@@ -437,7 +439,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
437 * Don't need (or can't) update reserved tags here, they 439 * Don't need (or can't) update reserved tags here, they
438 * remain static and should never need resizing. 440 * remain static and should never need resizing.
439 */ 441 */
440 sbitmap_queue_resize(&tags->bitmap_tags, tdepth); 442 sbitmap_queue_resize(&tags->bitmap_tags,
443 tdepth - tags->nr_reserved_tags);
441 } 444 }
442 445
443 return 0; 446 return 0;
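With concrete numbers, the blk_mq_tag_update_depth() fix above works out as follows (a hedged sketch of the arithmetic only, not the kernel allocator): the request map must cover all of tdepth, reserved tags included, while bitmap_tags only tracks the normal tags.

#include <stdio.h>

/* Hypothetical stand-in mirroring the fixed arithmetic: rqs are
 * allocated for every tag; the normal-tag bitmap excludes reserved. */
static void show_resize(unsigned int tdepth, unsigned int nr_reserved)
{
	printf("tdepth=%u reserved=%u -> rq map %u entries, bitmap depth %u\n",
	       tdepth, nr_reserved, tdepth, tdepth - nr_reserved);
}

int main(void)
{
	show_resize(64, 4);	/* the old code sized the rq map at only 60 */
	show_resize(256, 0);	/* no reserved tags: both sizes coincide */
	return 0;
}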
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 654b0dc7e001..72a0033ccee9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -34,8 +34,8 @@
34#include "blk-mq-debugfs.h" 34#include "blk-mq-debugfs.h"
35#include "blk-mq-tag.h" 35#include "blk-mq-tag.h"
36#include "blk-stat.h" 36#include "blk-stat.h"
37#include "blk-wbt.h"
38#include "blk-mq-sched.h" 37#include "blk-mq-sched.h"
38#include "blk-rq-qos.h"
39 39
40static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); 40static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
41static void blk_mq_poll_stats_start(struct request_queue *q); 41static void blk_mq_poll_stats_start(struct request_queue *q);
@@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
285 rq->tag = -1; 285 rq->tag = -1;
286 rq->internal_tag = tag; 286 rq->internal_tag = tag;
287 } else { 287 } else {
288 if (blk_mq_tag_busy(data->hctx)) { 288 if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
289 rq_flags = RQF_MQ_INFLIGHT; 289 rq_flags = RQF_MQ_INFLIGHT;
290 atomic_inc(&data->hctx->nr_active); 290 atomic_inc(&data->hctx->nr_active);
291 } 291 }
@@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
367 if (!op_is_flush(op) && e->type->ops.mq.limit_depth && 367 if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
368 !(data->flags & BLK_MQ_REQ_RESERVED)) 368 !(data->flags & BLK_MQ_REQ_RESERVED))
369 e->type->ops.mq.limit_depth(op, data); 369 e->type->ops.mq.limit_depth(op, data);
370 } else {
371 blk_mq_tag_busy(data->hctx);
370 } 372 }
371 373
372 tag = blk_mq_get_tag(data); 374 tag = blk_mq_get_tag(data);
@@ -504,7 +506,7 @@ void blk_mq_free_request(struct request *rq)
504 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 506 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
505 laptop_io_completion(q->backing_dev_info); 507 laptop_io_completion(q->backing_dev_info);
506 508
507 wbt_done(q->rq_wb, rq); 509 rq_qos_done(q, rq);
508 510
509 if (blk_rq_rl(rq)) 511 if (blk_rq_rl(rq))
510 blk_put_rl(blk_rq_rl(rq)); 512 blk_put_rl(blk_rq_rl(rq));
@@ -527,7 +529,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
527 blk_account_io_done(rq, now); 529 blk_account_io_done(rq, now);
528 530
529 if (rq->end_io) { 531 if (rq->end_io) {
530 wbt_done(rq->q->rq_wb, rq); 532 rq_qos_done(rq->q, rq);
531 rq->end_io(rq, error); 533 rq->end_io(rq, error);
532 } else { 534 } else {
533 if (unlikely(blk_bidi_rq(rq))) 535 if (unlikely(blk_bidi_rq(rq)))
@@ -639,7 +641,7 @@ void blk_mq_start_request(struct request *rq)
639 rq->throtl_size = blk_rq_sectors(rq); 641 rq->throtl_size = blk_rq_sectors(rq);
640#endif 642#endif
641 rq->rq_flags |= RQF_STATS; 643 rq->rq_flags |= RQF_STATS;
642 wbt_issue(q->rq_wb, rq); 644 rq_qos_issue(q, rq);
643 } 645 }
644 646
645 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); 647 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
@@ -665,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq)
665 blk_mq_put_driver_tag(rq); 667 blk_mq_put_driver_tag(rq);
666 668
667 trace_block_rq_requeue(q, rq); 669 trace_block_rq_requeue(q, rq);
668 wbt_requeue(q->rq_wb, rq); 670 rq_qos_requeue(q, rq);
669 671
670 if (blk_mq_request_started(rq)) { 672 if (blk_mq_request_started(rq)) {
671 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 673 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
@@ -962,16 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued)
962 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 964 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
963} 965}
964 966
965bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, 967bool blk_mq_get_driver_tag(struct request *rq)
966 bool wait)
967{ 968{
968 struct blk_mq_alloc_data data = { 969 struct blk_mq_alloc_data data = {
969 .q = rq->q, 970 .q = rq->q,
970 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), 971 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
971 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, 972 .flags = BLK_MQ_REQ_NOWAIT,
972 }; 973 };
973 974 bool shared;
974 might_sleep_if(wait);
975 975
976 if (rq->tag != -1) 976 if (rq->tag != -1)
977 goto done; 977 goto done;
@@ -979,9 +979,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
979 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 979 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
980 data.flags |= BLK_MQ_REQ_RESERVED; 980 data.flags |= BLK_MQ_REQ_RESERVED;
981 981
982 shared = blk_mq_tag_busy(data.hctx);
982 rq->tag = blk_mq_get_tag(&data); 983 rq->tag = blk_mq_get_tag(&data);
983 if (rq->tag >= 0) { 984 if (rq->tag >= 0) {
984 if (blk_mq_tag_busy(data.hctx)) { 985 if (shared) {
985 rq->rq_flags |= RQF_MQ_INFLIGHT; 986 rq->rq_flags |= RQF_MQ_INFLIGHT;
986 atomic_inc(&data.hctx->nr_active); 987 atomic_inc(&data.hctx->nr_active);
987 } 988 }
@@ -989,8 +990,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
989 } 990 }
990 991
991done: 992done:
992 if (hctx)
993 *hctx = data.hctx;
994 return rq->tag != -1; 993 return rq->tag != -1;
995} 994}
996 995
@@ -1001,7 +1000,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1001 1000
1002 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1001 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1003 1002
1003 spin_lock(&hctx->dispatch_wait_lock);
1004 list_del_init(&wait->entry); 1004 list_del_init(&wait->entry);
1005 spin_unlock(&hctx->dispatch_wait_lock);
1006
1005 blk_mq_run_hw_queue(hctx, true); 1007 blk_mq_run_hw_queue(hctx, true);
1006 return 1; 1008 return 1;
1007} 1009}
@@ -1012,17 +1014,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1012 * restart. For both cases, take care to check the condition again after 1014 * restart. For both cases, take care to check the condition again after
1013 * marking us as waiting. 1015 * marking us as waiting.
1014 */ 1016 */
1015static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, 1017static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
1016 struct request *rq) 1018 struct request *rq)
1017{ 1019{
1018 struct blk_mq_hw_ctx *this_hctx = *hctx; 1020 struct wait_queue_head *wq;
1019 struct sbq_wait_state *ws;
1020 wait_queue_entry_t *wait; 1021 wait_queue_entry_t *wait;
1021 bool ret; 1022 bool ret;
1022 1023
1023 if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { 1024 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
1024 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) 1025 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
1025 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); 1026 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
1026 1027
1027 /* 1028 /*
1028 * It's possible that a tag was freed in the window between the 1029 * It's possible that a tag was freed in the window between the
@@ -1032,30 +1033,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
1032 * Don't clear RESTART here, someone else could have set it. 1033 * Don't clear RESTART here, someone else could have set it.
1033 * At most this will cost an extra queue run. 1034 * At most this will cost an extra queue run.
1034 */ 1035 */
1035 return blk_mq_get_driver_tag(rq, hctx, false); 1036 return blk_mq_get_driver_tag(rq);
1036 } 1037 }
1037 1038
1038 wait = &this_hctx->dispatch_wait; 1039 wait = &hctx->dispatch_wait;
1039 if (!list_empty_careful(&wait->entry)) 1040 if (!list_empty_careful(&wait->entry))
1040 return false; 1041 return false;
1041 1042
1042 spin_lock(&this_hctx->lock); 1043 wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait;
1044
1045 spin_lock_irq(&wq->lock);
1046 spin_lock(&hctx->dispatch_wait_lock);
1043 if (!list_empty(&wait->entry)) { 1047 if (!list_empty(&wait->entry)) {
1044 spin_unlock(&this_hctx->lock); 1048 spin_unlock(&hctx->dispatch_wait_lock);
1049 spin_unlock_irq(&wq->lock);
1045 return false; 1050 return false;
1046 } 1051 }
1047 1052
1048 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); 1053 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1049 add_wait_queue(&ws->wait, wait); 1054 __add_wait_queue(wq, wait);
1050 1055
1051 /* 1056 /*
1052 * It's possible that a tag was freed in the window between the 1057 * It's possible that a tag was freed in the window between the
1053 * allocation failure and adding the hardware queue to the wait 1058 * allocation failure and adding the hardware queue to the wait
1054 * queue. 1059 * queue.
1055 */ 1060 */
1056 ret = blk_mq_get_driver_tag(rq, hctx, false); 1061 ret = blk_mq_get_driver_tag(rq);
1057 if (!ret) { 1062 if (!ret) {
1058 spin_unlock(&this_hctx->lock); 1063 spin_unlock(&hctx->dispatch_wait_lock);
1064 spin_unlock_irq(&wq->lock);
1059 return false; 1065 return false;
1060 } 1066 }
1061 1067
@@ -1063,14 +1069,42 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
1063 * We got a tag, remove ourselves from the wait queue to ensure 1069 * We got a tag, remove ourselves from the wait queue to ensure
1064 * someone else gets the wakeup. 1070 * someone else gets the wakeup.
1065 */ 1071 */
1066 spin_lock_irq(&ws->wait.lock);
1067 list_del_init(&wait->entry); 1072 list_del_init(&wait->entry);
1068 spin_unlock_irq(&ws->wait.lock); 1073 spin_unlock(&hctx->dispatch_wait_lock);
1069 spin_unlock(&this_hctx->lock); 1074 spin_unlock_irq(&wq->lock);
1070 1075
1071 return true; 1076 return true;
1072} 1077}
1073 1078
1079#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
1080#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
1081/*
1082 * Update dispatch busy with the Exponential Weighted Moving Average (EWMA):
1083 * - EWMA is a simple way to compute a running average of a value
1084 * - weights of 7/8 and 1/8 are applied so that old samples decay exponentially
1085 * - a factor of 4 is applied to keep the result from rounding down to 0 too
1086 * early; its exact value doesn't matter since the EWMA decays exponentially
1087 */
1088static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1089{
1090 unsigned int ewma;
1091
1092 if (hctx->queue->elevator)
1093 return;
1094
1095 ewma = hctx->dispatch_busy;
1096
1097 if (!ewma && !busy)
1098 return;
1099
1100 ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1101 if (busy)
1102 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1103 ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1104
1105 hctx->dispatch_busy = ewma;
1106}
1107
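As a worked illustration of the update above (a standalone sketch reusing the same weight of 8 and factor of 4; everything else is hypothetical): each busy sample mixes 1 << 4 = 16 into the average before the divide, and each idle sample multiplies it by 7/8, so dispatch_busy climbs to 5 after a few failed dispatches and decays back to 0 within a handful of successful ones.

#include <stdio.h>

#define EWMA_WEIGHT	8
#define EWMA_FACTOR	4

static unsigned int ewma_update(unsigned int ewma, int busy)
{
	if (!ewma && !busy)
		return 0;
	ewma *= EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << EWMA_FACTOR;
	return ewma / EWMA_WEIGHT;
}

int main(void)
{
	unsigned int v = 0;
	int i;

	for (i = 0; i < 4; i++)		/* four busy dispatches: 2, 3, 4, 5 */
		printf("busy: %u\n", v = ewma_update(v, 1));
	for (i = 0; i < 6; i++)		/* queue drains: 4, 3, 2, 1, 0, 0 */
		printf("idle: %u\n", v = ewma_update(v, 0));
	return 0;
}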
1074#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ 1108#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
1075 1109
1076/* 1110/*
@@ -1103,7 +1137,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1103 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) 1137 if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
1104 break; 1138 break;
1105 1139
1106 if (!blk_mq_get_driver_tag(rq, NULL, false)) { 1140 if (!blk_mq_get_driver_tag(rq)) {
1107 /* 1141 /*
1108 * The initial allocation attempt failed, so we need to 1142 * The initial allocation attempt failed, so we need to
1109 * rerun the hardware queue when a tag is freed. The 1143 * rerun the hardware queue when a tag is freed. The
@@ -1111,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1111 * before we add this entry back on the dispatch list, 1145 * before we add this entry back on the dispatch list,
1112 * we'll re-run it below. 1146 * we'll re-run it below.
1113 */ 1147 */
1114 if (!blk_mq_mark_tag_wait(&hctx, rq)) { 1148 if (!blk_mq_mark_tag_wait(hctx, rq)) {
1115 blk_mq_put_dispatch_budget(hctx); 1149 blk_mq_put_dispatch_budget(hctx);
1116 /* 1150 /*
1117 * For non-shared tags, the RESTART check 1151 * For non-shared tags, the RESTART check
@@ -1135,7 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1135 bd.last = true; 1169 bd.last = true;
1136 else { 1170 else {
1137 nxt = list_first_entry(list, struct request, queuelist); 1171 nxt = list_first_entry(list, struct request, queuelist);
1138 bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); 1172 bd.last = !blk_mq_get_driver_tag(nxt);
1139 } 1173 }
1140 1174
1141 ret = q->mq_ops->queue_rq(hctx, &bd); 1175 ret = q->mq_ops->queue_rq(hctx, &bd);
@@ -1207,8 +1241,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1207 else if (needs_restart && (ret == BLK_STS_RESOURCE)) 1241 else if (needs_restart && (ret == BLK_STS_RESOURCE))
1208 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 1242 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
1209 1243
1244 blk_mq_update_dispatch_busy(hctx, true);
1210 return false; 1245 return false;
1211 } 1246 } else
1247 blk_mq_update_dispatch_busy(hctx, false);
1212 1248
1213 /* 1249 /*
1214 * If the host/device is unable to accept more work, inform the 1250 * If the host/device is unable to accept more work, inform the
@@ -1542,19 +1578,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1542 struct list_head *list) 1578 struct list_head *list)
1543 1579
1544{ 1580{
1581 struct request *rq;
1582
1545 /* 1583 /*
1546 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1584 * preemption doesn't flush plug list, so it's possible ctx->cpu is
1547 * offline now 1585 * offline now
1548 */ 1586 */
1549 spin_lock(&ctx->lock); 1587 list_for_each_entry(rq, list, queuelist) {
1550 while (!list_empty(list)) {
1551 struct request *rq;
1552
1553 rq = list_first_entry(list, struct request, queuelist);
1554 BUG_ON(rq->mq_ctx != ctx); 1588 BUG_ON(rq->mq_ctx != ctx);
1555 list_del_init(&rq->queuelist); 1589 trace_block_rq_insert(hctx->queue, rq);
1556 __blk_mq_insert_req_list(hctx, rq, false);
1557 } 1590 }
1591
1592 spin_lock(&ctx->lock);
1593 list_splice_tail_init(list, &ctx->rq_list);
1558 blk_mq_hctx_mark_pending(hctx, ctx); 1594 blk_mq_hctx_mark_pending(hctx, ctx);
1559 spin_unlock(&ctx->lock); 1595 spin_unlock(&ctx->lock);
1560} 1596}
@@ -1657,13 +1693,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
1657 ret = q->mq_ops->queue_rq(hctx, &bd); 1693 ret = q->mq_ops->queue_rq(hctx, &bd);
1658 switch (ret) { 1694 switch (ret) {
1659 case BLK_STS_OK: 1695 case BLK_STS_OK:
1696 blk_mq_update_dispatch_busy(hctx, false);
1660 *cookie = new_cookie; 1697 *cookie = new_cookie;
1661 break; 1698 break;
1662 case BLK_STS_RESOURCE: 1699 case BLK_STS_RESOURCE:
1663 case BLK_STS_DEV_RESOURCE: 1700 case BLK_STS_DEV_RESOURCE:
1701 blk_mq_update_dispatch_busy(hctx, true);
1664 __blk_mq_requeue_request(rq); 1702 __blk_mq_requeue_request(rq);
1665 break; 1703 break;
1666 default: 1704 default:
1705 blk_mq_update_dispatch_busy(hctx, false);
1667 *cookie = BLK_QC_T_NONE; 1706 *cookie = BLK_QC_T_NONE;
1668 break; 1707 break;
1669 } 1708 }
@@ -1698,7 +1737,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1698 if (!blk_mq_get_dispatch_budget(hctx)) 1737 if (!blk_mq_get_dispatch_budget(hctx))
1699 goto insert; 1738 goto insert;
1700 1739
1701 if (!blk_mq_get_driver_tag(rq, NULL, false)) { 1740 if (!blk_mq_get_driver_tag(rq)) {
1702 blk_mq_put_dispatch_budget(hctx); 1741 blk_mq_put_dispatch_budget(hctx);
1703 goto insert; 1742 goto insert;
1704 } 1743 }
@@ -1746,6 +1785,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
1746 return ret; 1785 return ret;
1747} 1786}
1748 1787
1788void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
1789 struct list_head *list)
1790{
1791 while (!list_empty(list)) {
1792 blk_status_t ret;
1793 struct request *rq = list_first_entry(list, struct request,
1794 queuelist);
1795
1796 list_del_init(&rq->queuelist);
1797 ret = blk_mq_request_issue_directly(rq);
1798 if (ret != BLK_STS_OK) {
1799 if (ret == BLK_STS_RESOURCE ||
1800 ret == BLK_STS_DEV_RESOURCE) {
1801 list_add(&rq->queuelist, list);
1802 break;
1803 }
1804 blk_mq_end_request(rq, ret);
1805 }
1806 }
1807}
1808
1749static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1809static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1750{ 1810{
1751 const int is_sync = op_is_sync(bio->bi_opf); 1811 const int is_sync = op_is_sync(bio->bi_opf);
@@ -1756,7 +1816,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1756 struct blk_plug *plug; 1816 struct blk_plug *plug;
1757 struct request *same_queue_rq = NULL; 1817 struct request *same_queue_rq = NULL;
1758 blk_qc_t cookie; 1818 blk_qc_t cookie;
1759 unsigned int wb_acct;
1760 1819
1761 blk_queue_bounce(q, &bio); 1820 blk_queue_bounce(q, &bio);
1762 1821
@@ -1772,19 +1831,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1772 if (blk_mq_sched_bio_merge(q, bio)) 1831 if (blk_mq_sched_bio_merge(q, bio))
1773 return BLK_QC_T_NONE; 1832 return BLK_QC_T_NONE;
1774 1833
1775 wb_acct = wbt_wait(q->rq_wb, bio, NULL); 1834 rq_qos_throttle(q, bio, NULL);
1776 1835
1777 trace_block_getrq(q, bio, bio->bi_opf); 1836 trace_block_getrq(q, bio, bio->bi_opf);
1778 1837
1779 rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); 1838 rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
1780 if (unlikely(!rq)) { 1839 if (unlikely(!rq)) {
1781 __wbt_done(q->rq_wb, wb_acct); 1840 rq_qos_cleanup(q, bio);
1782 if (bio->bi_opf & REQ_NOWAIT) 1841 if (bio->bi_opf & REQ_NOWAIT)
1783 bio_wouldblock_error(bio); 1842 bio_wouldblock_error(bio);
1784 return BLK_QC_T_NONE; 1843 return BLK_QC_T_NONE;
1785 } 1844 }
1786 1845
1787 wbt_track(rq, wb_acct); 1846 rq_qos_track(q, rq, bio);
1788 1847
1789 cookie = request_to_qc_t(data.hctx, rq); 1848 cookie = request_to_qc_t(data.hctx, rq);
1790 1849
@@ -1847,7 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1847 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 1906 blk_mq_try_issue_directly(data.hctx, same_queue_rq,
1848 &cookie); 1907 &cookie);
1849 } 1908 }
1850 } else if (q->nr_hw_queues > 1 && is_sync) { 1909 } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
1910 !data.hctx->dispatch_busy)) {
1851 blk_mq_put_ctx(data.ctx); 1911 blk_mq_put_ctx(data.ctx);
1852 blk_mq_bio_to_request(rq, bio); 1912 blk_mq_bio_to_request(rq, bio);
1853 blk_mq_try_issue_directly(data.hctx, rq, &cookie); 1913 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
@@ -2146,6 +2206,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
2146 2206
2147 hctx->nr_ctx = 0; 2207 hctx->nr_ctx = 0;
2148 2208
2209 spin_lock_init(&hctx->dispatch_wait_lock);
2149 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 2210 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2150 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 2211 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2151 2212
@@ -2331,15 +2392,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2331 int i; 2392 int i;
2332 2393
2333 queue_for_each_hw_ctx(q, hctx, i) { 2394 queue_for_each_hw_ctx(q, hctx, i) {
2334 if (shared) { 2395 if (shared)
2335 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2336 atomic_inc(&q->shared_hctx_restart);
2337 hctx->flags |= BLK_MQ_F_TAG_SHARED; 2396 hctx->flags |= BLK_MQ_F_TAG_SHARED;
2338 } else { 2397 else
2339 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2340 atomic_dec(&q->shared_hctx_restart);
2341 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 2398 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2342 }
2343 } 2399 }
2344} 2400}
2345 2401
@@ -2370,7 +2426,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
2370 blk_mq_update_tag_set_depth(set, false); 2426 blk_mq_update_tag_set_depth(set, false);
2371 } 2427 }
2372 mutex_unlock(&set->tag_list_lock); 2428 mutex_unlock(&set->tag_list_lock);
2373 synchronize_rcu();
2374 INIT_LIST_HEAD(&q->tag_set_list); 2429 INIT_LIST_HEAD(&q->tag_set_list);
2375} 2430}
2376 2431
@@ -2685,7 +2740,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2685static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2740static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2686{ 2741{
2687 if (set->ops->map_queues) { 2742 if (set->ops->map_queues) {
2688 int cpu;
2689 /* 2743 /*
2690 * transport .map_queues is usually done in the following 2744 * transport .map_queues is usually done in the following
2691 * way: 2745 * way:
@@ -2700,8 +2754,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2700 * killing stale mapping since one CPU may not be mapped 2754 * killing stale mapping since one CPU may not be mapped
2701 * to any hw queue. 2755 * to any hw queue.
2702 */ 2756 */
2703 for_each_possible_cpu(cpu) 2757 blk_mq_clear_mq_map(set);
2704 set->mq_map[cpu] = 0;
2705 2758
2706 return set->ops->map_queues(set); 2759 return set->ops->map_queues(set);
2707 } else 2760 } else
@@ -2711,7 +2764,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2711/* 2764/*
2712 * Alloc a tag set to be associated with one or more request queues. 2765 * Alloc a tag set to be associated with one or more request queues.
2713 * May fail with EINVAL for various error conditions. May adjust the 2766 * May fail with EINVAL for various error conditions. May adjust the
2714 * requested depth down, if if it too large. In that case, the set 2767 * requested depth down, if it's too large. In that case, the set
2715 * value will be stored in set->queue_depth. 2768 * value will be stored in set->queue_depth.
2716 */ 2769 */
2717int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2770int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 89231e439b2f..9497b47e2526 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
36void blk_mq_wake_waiters(struct request_queue *q); 36void blk_mq_wake_waiters(struct request_queue *q);
37bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); 37bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
38void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); 38void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
39bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, 39bool blk_mq_get_driver_tag(struct request *rq);
40 bool wait);
41struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 40struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
42 struct blk_mq_ctx *start); 41 struct blk_mq_ctx *start);
43 42
@@ -65,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
65 64
66/* Used by blk_insert_cloned_request() to issue request directly */ 65/* Used by blk_insert_cloned_request() to issue request directly */
67blk_status_t blk_mq_request_issue_directly(struct request *rq); 66blk_status_t blk_mq_request_issue_directly(struct request *rq);
67void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
68 struct list_head *list);
68 69
69/* 70/*
70 * CPU -> queue mappings 71 * CPU -> queue mappings
@@ -203,4 +204,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
203 __blk_mq_put_driver_tag(hctx, rq); 204 __blk_mq_put_driver_tag(hctx, rq);
204} 205}
205 206
207static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set)
208{
209 int cpu;
210
211 for_each_possible_cpu(cpu)
212 set->mq_map[cpu] = 0;
213}
214
206#endif 215#endif
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
new file mode 100644
index 000000000000..0005dfd568dd
--- /dev/null
+++ b/block/blk-rq-qos.c
@@ -0,0 +1,194 @@
1#include "blk-rq-qos.h"
2
3/*
4 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
5 * false if 'v' + 1 would be bigger than 'below'.
6 */
7static bool atomic_inc_below(atomic_t *v, unsigned int below)
8{
9 unsigned int cur = atomic_read(v);
10
11 for (;;) {
12 unsigned int old;
13
14 if (cur >= below)
15 return false;
16 old = atomic_cmpxchg(v, cur, cur + 1);
17 if (old == cur)
18 break;
19 cur = old;
20 }
21
22 return true;
23}
24
25bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
26{
27 return atomic_inc_below(&rq_wait->inflight, limit);
28}
29
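A userspace analogue of the cmpxchg loop above, using C11 atomics (hypothetical names; atomic_compare_exchange_weak() reloads the expected value on failure, so the retry mirrors the atomic_cmpxchg() loop):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Increment *v only while its current value is below 'below'. */
static bool inc_below(atomic_uint *v, unsigned int below)
{
	unsigned int cur = atomic_load(v);

	for (;;) {
		if (cur >= below)
			return false;
		/* On failure, cur is refreshed with the value that won. */
		if (atomic_compare_exchange_weak(v, &cur, cur + 1))
			return true;
	}
}

int main(void)
{
	atomic_uint inflight = 0;
	int i;

	for (i = 0; i < 5; i++)		/* only the first 3 attempts may pass */
		printf("grant=%d inflight=%u\n", inc_below(&inflight, 3),
		       (unsigned int)atomic_load(&inflight));
	return 0;
}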
30void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
31{
32 struct rq_qos *rqos;
33
34 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
35 if (rqos->ops->cleanup)
36 rqos->ops->cleanup(rqos, bio);
37 }
38}
39
40void rq_qos_done(struct request_queue *q, struct request *rq)
41{
42 struct rq_qos *rqos;
43
44 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
45 if (rqos->ops->done)
46 rqos->ops->done(rqos, rq);
47 }
48}
49
50void rq_qos_issue(struct request_queue *q, struct request *rq)
51{
52 struct rq_qos *rqos;
53
54 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
55 if (rqos->ops->issue)
56 rqos->ops->issue(rqos, rq);
57 }
58}
59
60void rq_qos_requeue(struct request_queue *q, struct request *rq)
61{
62 struct rq_qos *rqos;
63
64 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
65 if (rqos->ops->requeue)
66 rqos->ops->requeue(rqos, rq);
67 }
68}
69
70void rq_qos_throttle(struct request_queue *q, struct bio *bio,
71 spinlock_t *lock)
72{
73 struct rq_qos *rqos;
74
75 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
76 if (rqos->ops->throttle)
77 rqos->ops->throttle(rqos, bio, lock);
78 }
79}
80
81void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio)
82{
83 struct rq_qos *rqos;
84
85 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
86 if (rqos->ops->track)
87 rqos->ops->track(rqos, rq, bio);
88 }
89}
90
91void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
92{
93 struct rq_qos *rqos;
94
95 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
96 if (rqos->ops->done_bio)
97 rqos->ops->done_bio(rqos, bio);
98 }
99}
100
101/*
102 * Return true, if we can't increase the depth further by scaling
103 */
104bool rq_depth_calc_max_depth(struct rq_depth *rqd)
105{
106 unsigned int depth;
107 bool ret = false;
108
109 /*
110 * For QD=1 devices, this is a special case. It's important for those
111 * to have one request ready when one completes, so force a depth of
112 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
113 * since the device can't have more than that in flight. If we're
114 * scaling down, then keep a setting of 1/1/1.
115 */
116 if (rqd->queue_depth == 1) {
117 if (rqd->scale_step > 0)
118 rqd->max_depth = 1;
119 else {
120 rqd->max_depth = 2;
121 ret = true;
122 }
123 } else {
124 /*
125 * scale_step == 0 is our default state. If we have suffered
126 * latency spikes, step will be > 0, and we shrink the
127 * allowed write depths. If step is < 0, we're only doing
128 * writes, and we allow a temporarily higher depth to
129 * increase performance.
130 */
131 depth = min_t(unsigned int, rqd->default_depth,
132 rqd->queue_depth);
133 if (rqd->scale_step > 0)
134 depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
135 else if (rqd->scale_step < 0) {
136 unsigned int maxd = 3 * rqd->queue_depth / 4;
137
138 depth = 1 + ((depth - 1) << -rqd->scale_step);
139 if (depth > maxd) {
140 depth = maxd;
141 ret = true;
142 }
143 }
144
145 rqd->max_depth = depth;
146 }
147
148 return ret;
149}
150
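To make the scaling arithmetic above concrete (a hedged standalone sketch of the queue_depth > 1 branch only; names are hypothetical): positive steps roughly halve the depth per step, negative steps double it, capped at 3/4 of the device queue depth.

#include <stdio.h>

static unsigned int calc_depth(unsigned int qd, unsigned int def, int step)
{
	unsigned int depth = def < qd ? def : qd;

	if (step > 0)
		depth = 1 + ((depth - 1) >> (step < 31 ? step : 31));
	else if (step < 0) {
		unsigned int maxd = 3 * qd / 4;

		depth = 1 + ((depth - 1) << -step);
		if (depth > maxd)
			depth = maxd;
	}
	return depth;
}

int main(void)
{
	int step;

	/* qd=64, def=64: prints 48 48 64 32 16 8 for steps -2..3 */
	for (step = -2; step <= 3; step++)
		printf("scale_step=%2d -> max_depth=%u\n",
		       step, calc_depth(64, 64, step));
	return 0;
}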
151void rq_depth_scale_up(struct rq_depth *rqd)
152{
153 /*
154 * Hit max in previous round, stop here
155 */
156 if (rqd->scaled_max)
157 return;
158
159 rqd->scale_step--;
160
161 rqd->scaled_max = rq_depth_calc_max_depth(rqd);
162}
163
164/*
165 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
166 * had a latency violation.
167 */
168void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
169{
170 /*
171 * Stop scaling down when we've hit the limit. This also prevents
172 * ->scale_step from going to crazy values, if the device can't
173 * keep up.
174 */
175 if (rqd->max_depth == 1)
176 return;
177
178 if (rqd->scale_step < 0 && hard_throttle)
179 rqd->scale_step = 0;
180 else
181 rqd->scale_step++;
182
183 rqd->scaled_max = false;
184 rq_depth_calc_max_depth(rqd);
185}
186
187void rq_qos_exit(struct request_queue *q)
188{
189 while (q->rq_qos) {
190 struct rq_qos *rqos = q->rq_qos;
191 q->rq_qos = rqos->next;
192 rqos->ops->exit(rqos);
193 }
194}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
new file mode 100644
index 000000000000..32b02efbfa66
--- /dev/null
+++ b/block/blk-rq-qos.h
@@ -0,0 +1,109 @@
1#ifndef RQ_QOS_H
2#define RQ_QOS_H
3
4#include <linux/kernel.h>
5#include <linux/blkdev.h>
6#include <linux/blk_types.h>
7#include <linux/atomic.h>
8#include <linux/wait.h>
9
10enum rq_qos_id {
11 RQ_QOS_WBT,
12 RQ_QOS_CGROUP,
13};
14
15struct rq_wait {
16 wait_queue_head_t wait;
17 atomic_t inflight;
18};
19
20struct rq_qos {
21 struct rq_qos_ops *ops;
22 struct request_queue *q;
23 enum rq_qos_id id;
24 struct rq_qos *next;
25};
26
27struct rq_qos_ops {
28 void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *);
29 void (*track)(struct rq_qos *, struct request *, struct bio *);
30 void (*issue)(struct rq_qos *, struct request *);
31 void (*requeue)(struct rq_qos *, struct request *);
32 void (*done)(struct rq_qos *, struct request *);
33 void (*done_bio)(struct rq_qos *, struct bio *);
34 void (*cleanup)(struct rq_qos *, struct bio *);
35 void (*exit)(struct rq_qos *);
36};
37
38struct rq_depth {
39 unsigned int max_depth;
40
41 int scale_step;
42 bool scaled_max;
43
44 unsigned int queue_depth;
45 unsigned int default_depth;
46};
47
48static inline struct rq_qos *rq_qos_id(struct request_queue *q,
49 enum rq_qos_id id)
50{
51 struct rq_qos *rqos;
52 for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
53 if (rqos->id == id)
54 break;
55 }
56 return rqos;
57}
58
59static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
60{
61 return rq_qos_id(q, RQ_QOS_WBT);
62}
63
64static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
65{
66 return rq_qos_id(q, RQ_QOS_CGROUP);
67}
68
69static inline void rq_wait_init(struct rq_wait *rq_wait)
70{
71 atomic_set(&rq_wait->inflight, 0);
72 init_waitqueue_head(&rq_wait->wait);
73}
74
75static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
76{
77 rqos->next = q->rq_qos;
78 q->rq_qos = rqos;
79}
80
81static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
82{
83 struct rq_qos *cur, *prev = NULL;
84 for (cur = q->rq_qos; cur; cur = cur->next) {
85 if (cur == rqos) {
86 if (prev)
87 prev->next = rqos->next;
88 else
89 q->rq_qos = rqos->next;
90 break;
91 }
92 prev = cur;
93 }
94}
95
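As a sketch of how a policy hangs off this chain (hypothetical mini structures; real policies such as wbt embed struct rq_qos inside their own state and are walked by the rq_qos_*() dispatchers above):

#include <stdio.h>

struct mini_qos;
struct mini_qos_ops { void (*issue)(struct mini_qos *); };
struct mini_qos {
	const struct mini_qos_ops *ops;
	struct mini_qos *next;
};

static struct mini_qos *chain;

static void qos_add(struct mini_qos *q)		/* like rq_qos_add() */
{
	q->next = chain;
	chain = q;
}

static void qos_issue(void)			/* like rq_qos_issue() */
{
	struct mini_qos *q;

	for (q = chain; q; q = q->next)
		if (q->ops->issue)
			q->ops->issue(q);
}

static void wbt_hook(struct mini_qos *q)   { (void)q; printf("wbt issue\n"); }
static void iolat_hook(struct mini_qos *q) { (void)q; printf("iolat issue\n"); }

int main(void)
{
	static const struct mini_qos_ops wbt_ops = { wbt_hook };
	static const struct mini_qos_ops iolat_ops = { iolat_hook };
	struct mini_qos wbt = { &wbt_ops, NULL }, iolat = { &iolat_ops, NULL };

	qos_add(&wbt);
	qos_add(&iolat);	/* pushed on the front, so it runs first */
	qos_issue();
	return 0;
}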
96bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
97void rq_depth_scale_up(struct rq_depth *rqd);
98void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
99bool rq_depth_calc_max_depth(struct rq_depth *rqd);
100
101void rq_qos_cleanup(struct request_queue *, struct bio *);
102void rq_qos_done(struct request_queue *, struct request *);
103void rq_qos_issue(struct request_queue *, struct request *);
104void rq_qos_requeue(struct request_queue *, struct request *);
105void rq_qos_done_bio(struct request_queue *q, struct bio *bio);
106void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *);
107void rq_qos_track(struct request_queue *q, struct request *, struct bio *);
108void rq_qos_exit(struct request_queue *);
109#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d1de71124656..ffd459969689 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
128 128
129 /* Inherit limits from component devices */ 129 /* Inherit limits from component devices */
130 lim->max_segments = USHRT_MAX; 130 lim->max_segments = USHRT_MAX;
131 lim->max_discard_segments = 1; 131 lim->max_discard_segments = USHRT_MAX;
132 lim->max_hw_sectors = UINT_MAX; 132 lim->max_hw_sectors = UINT_MAX;
133 lim->max_segment_size = UINT_MAX; 133 lim->max_segment_size = UINT_MAX;
134 lim->max_sectors = UINT_MAX; 134 lim->max_sectors = UINT_MAX;
@@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
875void blk_set_queue_depth(struct request_queue *q, unsigned int depth) 875void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
876{ 876{
877 q->queue_depth = depth; 877 q->queue_depth = depth;
878 wbt_set_queue_depth(q->rq_wb, depth); 878 wbt_set_queue_depth(q, depth);
879} 879}
880EXPORT_SYMBOL(blk_set_queue_depth); 880EXPORT_SYMBOL(blk_set_queue_depth);
881 881
@@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
900 queue_flag_clear(QUEUE_FLAG_FUA, q); 900 queue_flag_clear(QUEUE_FLAG_FUA, q);
901 spin_unlock_irq(q->queue_lock); 901 spin_unlock_irq(q->queue_lock);
902 902
903 wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); 903 wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
904} 904}
905EXPORT_SYMBOL_GPL(blk_queue_write_cache); 905EXPORT_SYMBOL_GPL(blk_queue_write_cache);
906 906
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 175c143ac5b9..7587b1c3caaf 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -17,7 +17,7 @@ struct blk_queue_stats {
17 bool enable_accounting; 17 bool enable_accounting;
18}; 18};
19 19
20static void blk_stat_init(struct blk_rq_stat *stat) 20void blk_rq_stat_init(struct blk_rq_stat *stat)
21{ 21{
22 stat->min = -1ULL; 22 stat->min = -1ULL;
23 stat->max = stat->nr_samples = stat->mean = 0; 23 stat->max = stat->nr_samples = stat->mean = 0;
@@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat)
25} 25}
26 26
27/* src is a per-cpu stat, mean isn't initialized */ 27/* src is a per-cpu stat, mean isn't initialized */
28static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) 28void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
29{ 29{
30 if (!src->nr_samples) 30 if (!src->nr_samples)
31 return; 31 return;
@@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
39 dst->nr_samples += src->nr_samples; 39 dst->nr_samples += src->nr_samples;
40} 40}
41 41
42static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) 42void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
43{ 43{
44 stat->min = min(stat->min, value); 44 stat->min = min(stat->min, value);
45 stat->max = max(stat->max, value); 45 stat->max = max(stat->max, value);
@@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now)
69 continue; 69 continue;
70 70
71 stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; 71 stat = &get_cpu_ptr(cb->cpu_stat)[bucket];
72 __blk_stat_add(stat, value); 72 blk_rq_stat_add(stat, value);
73 put_cpu_ptr(cb->cpu_stat); 73 put_cpu_ptr(cb->cpu_stat);
74 } 74 }
75 rcu_read_unlock(); 75 rcu_read_unlock();
@@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t)
82 int cpu; 82 int cpu;
83 83
84 for (bucket = 0; bucket < cb->buckets; bucket++) 84 for (bucket = 0; bucket < cb->buckets; bucket++)
85 blk_stat_init(&cb->stat[bucket]); 85 blk_rq_stat_init(&cb->stat[bucket]);
86 86
87 for_each_online_cpu(cpu) { 87 for_each_online_cpu(cpu) {
88 struct blk_rq_stat *cpu_stat; 88 struct blk_rq_stat *cpu_stat;
89 89
90 cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); 90 cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
91 for (bucket = 0; bucket < cb->buckets; bucket++) { 91 for (bucket = 0; bucket < cb->buckets; bucket++) {
92 blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); 92 blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
93 blk_stat_init(&cpu_stat[bucket]); 93 blk_rq_stat_init(&cpu_stat[bucket]);
94 } 94 }
95 } 95 }
96 96
@@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q,
143 143
144 cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); 144 cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
145 for (bucket = 0; bucket < cb->buckets; bucket++) 145 for (bucket = 0; bucket < cb->buckets; bucket++)
146 blk_stat_init(&cpu_stat[bucket]); 146 blk_rq_stat_init(&cpu_stat[bucket]);
147 } 147 }
148 148
149 spin_lock(&q->stats->lock); 149 spin_lock(&q->stats->lock);
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 78399cdde9c9..f4a1568e81a4 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
159 mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); 159 mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
160} 160}
161 161
162void blk_rq_stat_add(struct blk_rq_stat *, u64);
163void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
164void blk_rq_stat_init(struct blk_rq_stat *);
165
162#endif 166#endif
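The renamed helpers keep the same per-cpu accumulate-then-merge shape; a hedged userspace sketch of that min/max bookkeeping (simplified: a plain running sum stands in for the kernel's batch/mean fields):

#include <stdint.h>
#include <stdio.h>

struct rq_stat { uint64_t min, max, sum, nr; };

static void stat_init(struct rq_stat *s)
{
	s->min = UINT64_MAX;
	s->max = s->sum = s->nr = 0;
}

static void stat_add(struct rq_stat *s, uint64_t v)
{
	if (v < s->min)
		s->min = v;
	if (v > s->max)
		s->max = v;
	s->sum += v;
	s->nr++;
}

/* Merge a per-cpu copy into dst, skipping empty sources the way
 * blk_rq_stat_sum() does. */
static void stat_sum(struct rq_stat *dst, const struct rq_stat *src)
{
	if (!src->nr)
		return;
	if (src->min < dst->min)
		dst->min = src->min;
	if (src->max > dst->max)
		dst->max = src->max;
	dst->sum += src->sum;
	dst->nr += src->nr;
}

int main(void)
{
	struct rq_stat cpu0, cpu1, total;

	stat_init(&cpu0);
	stat_init(&cpu1);
	stat_init(&total);
	stat_add(&cpu0, 120);
	stat_add(&cpu0, 80);
	stat_add(&cpu1, 300);
	stat_sum(&total, &cpu0);
	stat_sum(&total, &cpu1);
	printf("min=%llu max=%llu mean=%llu n=%llu\n",
	       (unsigned long long)total.min, (unsigned long long)total.max,
	       (unsigned long long)(total.sum / total.nr),
	       (unsigned long long)total.nr);
	return 0;
}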
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 94987b1f69e1..bb109bb0a055 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
422 422
423static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) 423static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
424{ 424{
425 if (!q->rq_wb) 425 if (!wbt_rq_qos(q))
426 return -EINVAL; 426 return -EINVAL;
427 427
428 return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); 428 return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
429} 429}
430 430
431static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, 431static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
432 size_t count) 432 size_t count)
433{ 433{
434 struct rq_wb *rwb; 434 struct rq_qos *rqos;
435 ssize_t ret; 435 ssize_t ret;
436 s64 val; 436 s64 val;
437 437
@@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
441 if (val < -1) 441 if (val < -1)
442 return -EINVAL; 442 return -EINVAL;
443 443
444 rwb = q->rq_wb; 444 rqos = wbt_rq_qos(q);
445 if (!rwb) { 445 if (!rqos) {
446 ret = wbt_init(q); 446 ret = wbt_init(q);
447 if (ret) 447 if (ret)
448 return ret; 448 return ret;
449 } 449 }
450 450
451 rwb = q->rq_wb;
452 if (val == -1) 451 if (val == -1)
453 rwb->min_lat_nsec = wbt_default_latency_nsec(q); 452 val = wbt_default_latency_nsec(q);
454 else if (val >= 0) 453 else if (val >= 0)
455 rwb->min_lat_nsec = val * 1000ULL; 454 val *= 1000ULL;
456 455
457 if (rwb->enable_state == WBT_STATE_ON_DEFAULT) 456 wbt_set_min_lat(q, val);
458 rwb->enable_state = WBT_STATE_ON_MANUAL;
459 457
460 wbt_update_limits(rwb); 458 wbt_update_limits(q);
461 return count; 459 return count;
462} 460}
463 461
@@ -804,6 +802,21 @@ static void __blk_release_queue(struct work_struct *work)
804 blk_stat_remove_callback(q, q->poll_cb); 802 blk_stat_remove_callback(q, q->poll_cb);
805 blk_stat_free_callback(q->poll_cb); 803 blk_stat_free_callback(q->poll_cb);
806 804
805 if (!blk_queue_dead(q)) {
806 /*
807 * Last reference was dropped without having called
808 * blk_cleanup_queue().
809 */
810 WARN_ONCE(blk_queue_init_done(q),
811 "request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n",
812 q);
813 blk_exit_queue(q);
814 }
815
816 WARN(blk_queue_root_blkg(q),
817 "request queue %p is being released but it has not yet been removed from the blkcg controller\n",
818 q);
819
807 blk_free_queue_stats(q->stats); 820 blk_free_queue_stats(q->stats);
808 821
809 blk_exit_rl(q, &q->root_rl); 822 blk_exit_rl(q, &q->root_rl);
@@ -964,7 +977,7 @@ void blk_unregister_queue(struct gendisk *disk)
964 kobject_del(&q->kobj); 977 kobject_del(&q->kobj);
965 blk_trace_remove_sysfs(disk_to_dev(disk)); 978 blk_trace_remove_sysfs(disk_to_dev(disk));
966 979
967 wbt_exit(q); 980 rq_qos_exit(q);
968 981
969 mutex_lock(&q->sysfs_lock); 982 mutex_lock(&q->sysfs_lock);
970 if (q->request_fn || (q->mq_ops && q->elevator)) 983 if (q->request_fn || (q->mq_ops && q->elevator))
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 82282e6fdcf8..a3eede00d302 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
579 struct throtl_grp *tg = blkg_to_tg(blkg); 579 struct throtl_grp *tg = blkg_to_tg(blkg);
580 580
581 if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || 581 if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
582 tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) 582 tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
583 low_valid = true; 583 low_valid = true;
584 break;
585 }
584 } 586 }
585 rcu_read_unlock(); 587 rcu_read_unlock();
586 588
@@ -920,12 +922,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
920 } 922 }
921 923
922 /* Calc approx time to dispatch */ 924 /* Calc approx time to dispatch */
923 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1; 925 jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
924
925 if (jiffy_wait > jiffy_elapsed)
926 jiffy_wait = jiffy_wait - jiffy_elapsed;
927 else
928 jiffy_wait = 1;
929 926
930 if (wait) 927 if (wait)
931 *wait = jiffy_wait; 928 *wait = jiffy_wait;
@@ -2132,12 +2129,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
2132static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) 2129static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
2133{ 2130{
2134#ifdef CONFIG_BLK_DEV_THROTTLING_LOW 2131#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
2135 if (bio->bi_css) { 2132 if (bio->bi_css)
2136 if (bio->bi_cg_private) 2133 bio_associate_blkg(bio, tg_to_blkg(tg));
2137 blkg_put(tg_to_blkg(bio->bi_cg_private));
2138 bio->bi_cg_private = tg;
2139 blkg_get(tg_to_blkg(tg));
2140 }
2141 bio_issue_init(&bio->bi_issue, bio_sectors(bio)); 2134 bio_issue_init(&bio->bi_issue, bio_sectors(bio));
2142#endif 2135#endif
2143} 2136}
@@ -2285,6 +2278,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
2285 2278
2286void blk_throtl_bio_endio(struct bio *bio) 2279void blk_throtl_bio_endio(struct bio *bio)
2287{ 2280{
2281 struct blkcg_gq *blkg;
2288 struct throtl_grp *tg; 2282 struct throtl_grp *tg;
2289 u64 finish_time_ns; 2283 u64 finish_time_ns;
2290 unsigned long finish_time; 2284 unsigned long finish_time;
@@ -2292,20 +2286,18 @@ void blk_throtl_bio_endio(struct bio *bio)
2292 unsigned long lat; 2286 unsigned long lat;
2293 int rw = bio_data_dir(bio); 2287 int rw = bio_data_dir(bio);
2294 2288
2295 tg = bio->bi_cg_private; 2289 blkg = bio->bi_blkg;
2296 if (!tg) 2290 if (!blkg)
2297 return; 2291 return;
2298 bio->bi_cg_private = NULL; 2292 tg = blkg_to_tg(blkg);
2299 2293
2300 finish_time_ns = ktime_get_ns(); 2294 finish_time_ns = ktime_get_ns();
2301 tg->last_finish_time = finish_time_ns >> 10; 2295 tg->last_finish_time = finish_time_ns >> 10;
2302 2296
2303 start_time = bio_issue_time(&bio->bi_issue) >> 10; 2297 start_time = bio_issue_time(&bio->bi_issue) >> 10;
2304 finish_time = __bio_issue_time(finish_time_ns) >> 10; 2298 finish_time = __bio_issue_time(finish_time_ns) >> 10;
2305 if (!start_time || finish_time <= start_time) { 2299 if (!start_time || finish_time <= start_time)
2306 blkg_put(tg_to_blkg(tg));
2307 return; 2300 return;
2308 }
2309 2301
2310 lat = finish_time - start_time; 2302 lat = finish_time - start_time;
2311 /* this is only for bio based driver */ 2303 /* this is only for bio based driver */
@@ -2334,8 +2326,6 @@ void blk_throtl_bio_endio(struct bio *bio)
2334 tg->bio_cnt /= 2; 2326 tg->bio_cnt /= 2;
2335 tg->bad_bio_cnt /= 2; 2327 tg->bad_bio_cnt /= 2;
2336 } 2328 }
2337
2338 blkg_put(tg_to_blkg(tg));
2339} 2329}
2340#endif 2330#endif
2341 2331
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4f89b28fa652..1d94a20374fc 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -25,6 +25,7 @@
25#include <linux/swap.h> 25#include <linux/swap.h>
26 26
27#include "blk-wbt.h" 27#include "blk-wbt.h"
28#include "blk-rq-qos.h"
28 29
29#define CREATE_TRACE_POINTS 30#define CREATE_TRACE_POINTS
30#include <trace/events/wbt.h> 31#include <trace/events/wbt.h>
@@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb)
78 return rwb && rwb->wb_normal != 0; 79 return rwb && rwb->wb_normal != 0;
79} 80}
80 81
81/*
82 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
83 * false if 'v' + 1 would be bigger than 'below'.
84 */
85static bool atomic_inc_below(atomic_t *v, int below)
86{
87 int cur = atomic_read(v);
88
89 for (;;) {
90 int old;
91
92 if (cur >= below)
93 return false;
94 old = atomic_cmpxchg(v, cur, cur + 1);
95 if (old == cur)
96 break;
97 cur = old;
98 }
99
100 return true;
101}
102
103static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) 82static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
104{ 83{
105 if (rwb_enabled(rwb)) { 84 if (rwb_enabled(rwb)) {
@@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
116 */ 95 */
117static bool wb_recent_wait(struct rq_wb *rwb) 96static bool wb_recent_wait(struct rq_wb *rwb)
118{ 97{
119 struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; 98 struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
120 99
121 return time_before(jiffies, wb->dirty_sleep + HZ); 100 return time_before(jiffies, wb->dirty_sleep + HZ);
122} 101}
@@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb)
144 } 123 }
145} 124}
146 125
147void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) 126static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
148{ 127{
128 struct rq_wb *rwb = RQWB(rqos);
149 struct rq_wait *rqw; 129 struct rq_wait *rqw;
150 int inflight, limit; 130 int inflight, limit;
151 131
@@ -186,7 +166,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
186 int diff = limit - inflight; 166 int diff = limit - inflight;
187 167
188 if (!inflight || diff >= rwb->wb_background / 2) 168 if (!inflight || diff >= rwb->wb_background / 2)
189 wake_up_all(&rqw->wait); 169 wake_up(&rqw->wait);
190 } 170 }
191} 171}
192 172
@@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
194 * Called on completion of a request. Note that it's also called when 174 * Called on completion of a request. Note that it's also called when
195 * a request is merged, when the request gets freed. 175 * a request is merged, when the request gets freed.
196 */ 176 */
197void wbt_done(struct rq_wb *rwb, struct request *rq) 177static void wbt_done(struct rq_qos *rqos, struct request *rq)
198{ 178{
199 if (!rwb) 179 struct rq_wb *rwb = RQWB(rqos);
200 return;
201 180
202 if (!wbt_is_tracked(rq)) { 181 if (!wbt_is_tracked(rq)) {
203 if (rwb->sync_cookie == rq) { 182 if (rwb->sync_cookie == rq) {
@@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq)
209 wb_timestamp(rwb, &rwb->last_comp); 188 wb_timestamp(rwb, &rwb->last_comp);
210 } else { 189 } else {
211 WARN_ON_ONCE(rq == rwb->sync_cookie); 190 WARN_ON_ONCE(rq == rwb->sync_cookie);
212 __wbt_done(rwb, wbt_flags(rq)); 191 __wbt_done(rqos, wbt_flags(rq));
213 } 192 }
214 wbt_clear_state(rq); 193 wbt_clear_state(rq);
215} 194}
216 195
217/*
218 * Return true, if we can't increase the depth further by scaling
219 */
220static bool calc_wb_limits(struct rq_wb *rwb)
221{
222 unsigned int depth;
223 bool ret = false;
224
225 if (!rwb->min_lat_nsec) {
226 rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
227 return false;
228 }
229
230 /*
231 * For QD=1 devices, this is a special case. It's important for those
232 * to have one request ready when one completes, so force a depth of
233 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
234 * since the device can't have more than that in flight. If we're
235 * scaling down, then keep a setting of 1/1/1.
236 */
237 if (rwb->queue_depth == 1) {
238 if (rwb->scale_step > 0)
239 rwb->wb_max = rwb->wb_normal = 1;
240 else {
241 rwb->wb_max = rwb->wb_normal = 2;
242 ret = true;
243 }
244 rwb->wb_background = 1;
245 } else {
246 /*
247 * scale_step == 0 is our default state. If we have suffered
248 * latency spikes, step will be > 0, and we shrink the
249 * allowed write depths. If step is < 0, we're only doing
250 * writes, and we allow a temporarily higher depth to
251 * increase performance.
252 */
253 depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
254 if (rwb->scale_step > 0)
255 depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
256 else if (rwb->scale_step < 0) {
257 unsigned int maxd = 3 * rwb->queue_depth / 4;
258
259 depth = 1 + ((depth - 1) << -rwb->scale_step);
260 if (depth > maxd) {
261 depth = maxd;
262 ret = true;
263 }
264 }
265
266 /*
267 * Set our max/normal/bg queue depths based on how far
268 * we have scaled down (->scale_step).
269 */
270 rwb->wb_max = depth;
271 rwb->wb_normal = (rwb->wb_max + 1) / 2;
272 rwb->wb_background = (rwb->wb_max + 3) / 4;
273 }
274
275 return ret;
276}
277
278static inline bool stat_sample_valid(struct blk_rq_stat *stat) 196static inline bool stat_sample_valid(struct blk_rq_stat *stat)
279{ 197{
280 /* 198 /*
@@ -307,7 +225,8 @@ enum {
307 225
308static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) 226static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
309{ 227{
310 struct backing_dev_info *bdi = rwb->queue->backing_dev_info; 228 struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
229 struct rq_depth *rqd = &rwb->rq_depth;
311 u64 thislat; 230 u64 thislat;
312 231
313 /* 232 /*
@@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
351 return LAT_EXCEEDED; 270 return LAT_EXCEEDED;
352 } 271 }
353 272
354 if (rwb->scale_step) 273 if (rqd->scale_step)
355 trace_wbt_stat(bdi, stat); 274 trace_wbt_stat(bdi, stat);
356 275
357 return LAT_OK; 276 return LAT_OK;
@@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
359 278
360static void rwb_trace_step(struct rq_wb *rwb, const char *msg) 279static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
361{ 280{
362 struct backing_dev_info *bdi = rwb->queue->backing_dev_info; 281 struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
282 struct rq_depth *rqd = &rwb->rq_depth;
363 283
364 trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, 284 trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
365 rwb->wb_background, rwb->wb_normal, rwb->wb_max); 285 rwb->wb_background, rwb->wb_normal, rqd->max_depth);
366} 286}
367 287
368static void scale_up(struct rq_wb *rwb) 288static void calc_wb_limits(struct rq_wb *rwb)
369{ 289{
370 /* 290 if (rwb->min_lat_nsec == 0) {
371 * Hit max in previous round, stop here 291 rwb->wb_normal = rwb->wb_background = 0;
372 */ 292 } else if (rwb->rq_depth.max_depth <= 2) {
373 if (rwb->scaled_max) 293 rwb->wb_normal = rwb->rq_depth.max_depth;
374 return; 294 rwb->wb_background = 1;
295 } else {
296 rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
297 rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
298 }
299}
375 300
376 rwb->scale_step--; 301static void scale_up(struct rq_wb *rwb)
302{
303 rq_depth_scale_up(&rwb->rq_depth);
304 calc_wb_limits(rwb);
377 rwb->unknown_cnt = 0; 305 rwb->unknown_cnt = 0;
378 306 rwb_trace_step(rwb, "scale up");
379 rwb->scaled_max = calc_wb_limits(rwb);
380
381 rwb_wake_all(rwb);
382
383 rwb_trace_step(rwb, "step up");
384} 307}
385 308
386/*
387 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
388 * had a latency violation.
389 */
390static void scale_down(struct rq_wb *rwb, bool hard_throttle) 309static void scale_down(struct rq_wb *rwb, bool hard_throttle)
391{ 310{
392 /* 311 rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
393 * Stop scaling down when we've hit the limit. This also prevents
394 * ->scale_step from going to crazy values, if the device can't
395 * keep up.
396 */
397 if (rwb->wb_max == 1)
398 return;
399
400 if (rwb->scale_step < 0 && hard_throttle)
401 rwb->scale_step = 0;
402 else
403 rwb->scale_step++;
404
405 rwb->scaled_max = false;
406 rwb->unknown_cnt = 0;
407 calc_wb_limits(rwb); 312 calc_wb_limits(rwb);
408 rwb_trace_step(rwb, "step down"); 313 rwb->unknown_cnt = 0;
314 rwb_wake_all(rwb);
315 rwb_trace_step(rwb, "scale down");
409} 316}
410 317
411static void rwb_arm_timer(struct rq_wb *rwb) 318static void rwb_arm_timer(struct rq_wb *rwb)
412{ 319{
413 if (rwb->scale_step > 0) { 320 struct rq_depth *rqd = &rwb->rq_depth;
321
322 if (rqd->scale_step > 0) {
414 /* 323 /*
415 * We should speed this up, using some variant of a fast 324 * We should speed this up, using some variant of a fast
416 * integer inverse square root calculation. Since we only do 325 * integer inverse square root calculation. Since we only do
@@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb)
418 * though. 327 * though.
419 */ 328 */
420 rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, 329 rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
421 int_sqrt((rwb->scale_step + 1) << 8)); 330 int_sqrt((rqd->scale_step + 1) << 8));
422 } else { 331 } else {
423 /* 332 /*
424 * For step < 0, we don't want to increase/decrease the 333 * For step < 0, we don't want to increase/decrease the
@@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb)
433static void wb_timer_fn(struct blk_stat_callback *cb) 342static void wb_timer_fn(struct blk_stat_callback *cb)
434{ 343{
435 struct rq_wb *rwb = cb->data; 344 struct rq_wb *rwb = cb->data;
345 struct rq_depth *rqd = &rwb->rq_depth;
436 unsigned int inflight = wbt_inflight(rwb); 346 unsigned int inflight = wbt_inflight(rwb);
437 int status; 347 int status;
438 348
439 status = latency_exceeded(rwb, cb->stat); 349 status = latency_exceeded(rwb, cb->stat);
440 350
441 trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, 351 trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
442 inflight); 352 inflight);
443 353
444 /* 354 /*
@@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
469 * currently don't have a valid read/write sample. For that 379 * currently don't have a valid read/write sample. For that
470 * case, slowly return to center state (step == 0). 380 * case, slowly return to center state (step == 0).
471 */ 381 */
472 if (rwb->scale_step > 0) 382 if (rqd->scale_step > 0)
473 scale_up(rwb); 383 scale_up(rwb);
474 else if (rwb->scale_step < 0) 384 else if (rqd->scale_step < 0)
475 scale_down(rwb, false); 385 scale_down(rwb, false);
476 break; 386 break;
477 default: 387 default:
@@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
481 /* 391 /*
482 * Re-arm timer, if we have IO in flight 392 * Re-arm timer, if we have IO in flight
483 */ 393 */
484 if (rwb->scale_step || inflight) 394 if (rqd->scale_step || inflight)
485 rwb_arm_timer(rwb); 395 rwb_arm_timer(rwb);
486} 396}
487 397
488void wbt_update_limits(struct rq_wb *rwb) 398static void __wbt_update_limits(struct rq_wb *rwb)
489{ 399{
490 rwb->scale_step = 0; 400 struct rq_depth *rqd = &rwb->rq_depth;
491 rwb->scaled_max = false; 401
402 rqd->scale_step = 0;
403 rqd->scaled_max = false;
404
405 rq_depth_calc_max_depth(rqd);
492 calc_wb_limits(rwb); 406 calc_wb_limits(rwb);
493 407
494 rwb_wake_all(rwb); 408 rwb_wake_all(rwb);
495} 409}
496 410
411void wbt_update_limits(struct request_queue *q)
412{
413 struct rq_qos *rqos = wbt_rq_qos(q);
414 if (!rqos)
415 return;
416 __wbt_update_limits(RQWB(rqos));
417}
418
419u64 wbt_get_min_lat(struct request_queue *q)
420{
421 struct rq_qos *rqos = wbt_rq_qos(q);
422 if (!rqos)
423 return 0;
424 return RQWB(rqos)->min_lat_nsec;
425}
426
427void wbt_set_min_lat(struct request_queue *q, u64 val)
428{
429 struct rq_qos *rqos = wbt_rq_qos(q);
430 if (!rqos)
431 return;
432 RQWB(rqos)->min_lat_nsec = val;
433 RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
434 __wbt_update_limits(RQWB(rqos));
435}
436
437
497static bool close_io(struct rq_wb *rwb) 438static bool close_io(struct rq_wb *rwb)
498{ 439{
499 const unsigned long now = jiffies; 440 const unsigned long now = jiffies;
@@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
520 * IO for a bit. 461 * IO for a bit.
521 */ 462 */
522 if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) 463 if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
523 limit = rwb->wb_max; 464 limit = rwb->rq_depth.max_depth;
524 else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { 465 else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
525 /* 466 /*
526 * If less than 100ms since we completed unrelated IO, 467 * If less than 100ms since we completed unrelated IO,
@@ -533,30 +474,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
533 return limit; 474 return limit;
534} 475}
535 476
536static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
537 wait_queue_entry_t *wait, unsigned long rw)
538{
539 /*
540 * inc it here even if disabled, since we'll dec it at completion.
541 * this only happens if the task was sleeping in __wbt_wait(),
542 * and someone turned it off at the same time.
543 */
544 if (!rwb_enabled(rwb)) {
545 atomic_inc(&rqw->inflight);
546 return true;
547 }
548
549 /*
550 * If the waitqueue is already active and we are not the next
551 * in line to be woken up, wait for our turn.
552 */
553 if (waitqueue_active(&rqw->wait) &&
554 rqw->wait.head.next != &wait->entry)
555 return false;
556
557 return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
558}
559
560/* 477/*
561 * Block if we will exceed our limit, or if we are currently waiting for 478 * Block if we will exceed our limit, or if we are currently waiting for
562 * the timer to kick off queuing again. 479 * the timer to kick off queuing again.
@@ -567,16 +484,32 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
567 __acquires(lock) 484 __acquires(lock)
568{ 485{
569 struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); 486 struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
570 DEFINE_WAIT(wait); 487 DECLARE_WAITQUEUE(wait, current);
571 488
572 if (may_queue(rwb, rqw, &wait, rw)) 489 /*
490 * inc it here even if disabled, since we'll dec it at completion.
491 * this only happens if the task was sleeping in __wbt_wait(),
492 * and someone turned it off at the same time.
493 */
494 if (!rwb_enabled(rwb)) {
495 atomic_inc(&rqw->inflight);
573 return; 496 return;
497 }
574 498
499 if (!waitqueue_active(&rqw->wait)
500 && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
501 return;
502
503 add_wait_queue_exclusive(&rqw->wait, &wait);
575 do { 504 do {
576 prepare_to_wait_exclusive(&rqw->wait, &wait, 505 set_current_state(TASK_UNINTERRUPTIBLE);
577 TASK_UNINTERRUPTIBLE);
578 506
579 if (may_queue(rwb, rqw, &wait, rw)) 507 if (!rwb_enabled(rwb)) {
508 atomic_inc(&rqw->inflight);
509 break;
510 }
511
512 if (rq_wait_inc_below(rqw, get_limit(rwb, rw)))
580 break; 513 break;
581 514
582 if (lock) { 515 if (lock) {
@@ -587,7 +520,8 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
587 io_schedule(); 520 io_schedule();
588 } while (1); 521 } while (1);
589 522
590 finish_wait(&rqw->wait, &wait); 523 __set_current_state(TASK_RUNNING);
524 remove_wait_queue(&rqw->wait, &wait);
591} 525}
592 526
593static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) 527static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
@@ -608,43 +542,72 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
608 } 542 }
609} 543}
610 544
545static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
546{
547 enum wbt_flags flags = 0;
548
549 if (bio_op(bio) == REQ_OP_READ) {
550 flags = WBT_READ;
551 } else if (wbt_should_throttle(rwb, bio)) {
552 if (current_is_kswapd())
553 flags |= WBT_KSWAPD;
554 if (bio_op(bio) == REQ_OP_DISCARD)
555 flags |= WBT_DISCARD;
556 flags |= WBT_TRACKED;
557 }
558 return flags;
559}
560
561static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
562{
563 struct rq_wb *rwb = RQWB(rqos);
564 enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
565 __wbt_done(rqos, flags);
566}
567
611/* 568/*
612 * Returns true if the IO request should be accounted, false if not. 569 * Returns true if the IO request should be accounted, false if not.
613 * May sleep, if we have exceeded the writeback limits. Caller can pass 570 * May sleep, if we have exceeded the writeback limits. Caller can pass
614 * in an irq held spinlock, if it holds one when calling this function. 571 * in an irq held spinlock, if it holds one when calling this function.
615 * If we do sleep, we'll release and re-grab it. 572 * If we do sleep, we'll release and re-grab it.
616 */ 573 */
617enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) 574static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
618{ 575{
619 enum wbt_flags ret = 0; 576 struct rq_wb *rwb = RQWB(rqos);
577 enum wbt_flags flags;
620 578
621 if (!rwb_enabled(rwb)) 579 if (!rwb_enabled(rwb))
622 return 0; 580 return;
623 581
624 if (bio_op(bio) == REQ_OP_READ) 582 flags = bio_to_wbt_flags(rwb, bio);
625 ret = WBT_READ;
626 583
627 if (!wbt_should_throttle(rwb, bio)) { 584 if (!wbt_should_throttle(rwb, bio)) {
628 if (ret & WBT_READ) 585 if (flags & WBT_READ)
629 wb_timestamp(rwb, &rwb->last_issue); 586 wb_timestamp(rwb, &rwb->last_issue);
630 return ret; 587 return;
631 } 588 }
632 589
633 if (current_is_kswapd()) 590 if (current_is_kswapd())
634 ret |= WBT_KSWAPD; 591 flags |= WBT_KSWAPD;
635 if (bio_op(bio) == REQ_OP_DISCARD) 592 if (bio_op(bio) == REQ_OP_DISCARD)
636 ret |= WBT_DISCARD; 593 flags |= WBT_DISCARD;
637 594
638 __wbt_wait(rwb, ret, bio->bi_opf, lock); 595 __wbt_wait(rwb, flags, bio->bi_opf, lock);
639 596
640 if (!blk_stat_is_active(rwb->cb)) 597 if (!blk_stat_is_active(rwb->cb))
641 rwb_arm_timer(rwb); 598 rwb_arm_timer(rwb);
599}
642 600
643 return ret | WBT_TRACKED; 601static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
602{
603 struct rq_wb *rwb = RQWB(rqos);
604 rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
644} 605}
645 606
646void wbt_issue(struct rq_wb *rwb, struct request *rq) 607void wbt_issue(struct rq_qos *rqos, struct request *rq)
647{ 608{
609 struct rq_wb *rwb = RQWB(rqos);
610
648 if (!rwb_enabled(rwb)) 611 if (!rwb_enabled(rwb))
649 return; 612 return;
650 613
@@ -661,8 +624,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq)
661 } 624 }
662} 625}
663 626
664void wbt_requeue(struct rq_wb *rwb, struct request *rq) 627void wbt_requeue(struct rq_qos *rqos, struct request *rq)
665{ 628{
629 struct rq_wb *rwb = RQWB(rqos);
666 if (!rwb_enabled(rwb)) 630 if (!rwb_enabled(rwb))
667 return; 631 return;
668 if (rq == rwb->sync_cookie) { 632 if (rq == rwb->sync_cookie) {
@@ -671,39 +635,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq)
671 } 635 }
672} 636}
673 637
674void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) 638void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
675{ 639{
676 if (rwb) { 640 struct rq_qos *rqos = wbt_rq_qos(q);
677 rwb->queue_depth = depth; 641 if (rqos) {
678 wbt_update_limits(rwb); 642 RQWB(rqos)->rq_depth.queue_depth = depth;
643 __wbt_update_limits(RQWB(rqos));
679 } 644 }
680} 645}
681 646
682void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) 647void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
683{ 648{
684 if (rwb) 649 struct rq_qos *rqos = wbt_rq_qos(q);
685 rwb->wc = write_cache_on; 650 if (rqos)
651 RQWB(rqos)->wc = write_cache_on;
686} 652}
687 653
688/* 654/*
689 * Disable wbt, if enabled by default.
690 */
691void wbt_disable_default(struct request_queue *q)
692{
693 struct rq_wb *rwb = q->rq_wb;
694
695 if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
696 wbt_exit(q);
697}
698EXPORT_SYMBOL_GPL(wbt_disable_default);
699
700/*
701 * Enable wbt if defaults are configured that way 655 * Enable wbt if defaults are configured that way
702 */ 656 */
703void wbt_enable_default(struct request_queue *q) 657void wbt_enable_default(struct request_queue *q)
704{ 658{
659 struct rq_qos *rqos = wbt_rq_qos(q);
705 /* Throttling already enabled? */ 660 /* Throttling already enabled? */
706 if (q->rq_wb) 661 if (rqos)
707 return; 662 return;
708 663
709 /* Queue not registered? Maybe shutting down... */ 664 /* Queue not registered? Maybe shutting down... */
@@ -741,6 +696,42 @@ static int wbt_data_dir(const struct request *rq)
741 return -1; 696 return -1;
742} 697}
743 698
699static void wbt_exit(struct rq_qos *rqos)
700{
701 struct rq_wb *rwb = RQWB(rqos);
702 struct request_queue *q = rqos->q;
703
704 blk_stat_remove_callback(q, rwb->cb);
705 blk_stat_free_callback(rwb->cb);
706 kfree(rwb);
707}
708
709/*
710 * Disable wbt, if enabled by default.
711 */
712void wbt_disable_default(struct request_queue *q)
713{
714 struct rq_qos *rqos = wbt_rq_qos(q);
715 struct rq_wb *rwb;
716 if (!rqos)
717 return;
718 rwb = RQWB(rqos);
719 if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
720 rwb->wb_normal = 0;
721}
722EXPORT_SYMBOL_GPL(wbt_disable_default);
723
724
725static struct rq_qos_ops wbt_rqos_ops = {
726 .throttle = wbt_wait,
727 .issue = wbt_issue,
728 .track = wbt_track,
729 .requeue = wbt_requeue,
730 .done = wbt_done,
731 .cleanup = wbt_cleanup,
732 .exit = wbt_exit,
733};
734
744int wbt_init(struct request_queue *q) 735int wbt_init(struct request_queue *q)
745{ 736{
746 struct rq_wb *rwb; 737 struct rq_wb *rwb;
@@ -756,39 +747,29 @@ int wbt_init(struct request_queue *q)
756 return -ENOMEM; 747 return -ENOMEM;
757 } 748 }
758 749
759 for (i = 0; i < WBT_NUM_RWQ; i++) { 750 for (i = 0; i < WBT_NUM_RWQ; i++)
760 atomic_set(&rwb->rq_wait[i].inflight, 0); 751 rq_wait_init(&rwb->rq_wait[i]);
761 init_waitqueue_head(&rwb->rq_wait[i].wait);
762 }
763 752
753 rwb->rqos.id = RQ_QOS_WBT;
754 rwb->rqos.ops = &wbt_rqos_ops;
755 rwb->rqos.q = q;
764 rwb->last_comp = rwb->last_issue = jiffies; 756 rwb->last_comp = rwb->last_issue = jiffies;
765 rwb->queue = q;
766 rwb->win_nsec = RWB_WINDOW_NSEC; 757 rwb->win_nsec = RWB_WINDOW_NSEC;
767 rwb->enable_state = WBT_STATE_ON_DEFAULT; 758 rwb->enable_state = WBT_STATE_ON_DEFAULT;
768 wbt_update_limits(rwb); 759 rwb->wc = 1;
760 rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
761 __wbt_update_limits(rwb);
769 762
770 /* 763 /*
771 * Assign rwb and add the stats callback. 764 * Assign rwb and add the stats callback.
772 */ 765 */
773 q->rq_wb = rwb; 766 rq_qos_add(q, &rwb->rqos);
774 blk_stat_add_callback(q, rwb->cb); 767 blk_stat_add_callback(q, rwb->cb);
775 768
776 rwb->min_lat_nsec = wbt_default_latency_nsec(q); 769 rwb->min_lat_nsec = wbt_default_latency_nsec(q);
777 770
778 wbt_set_queue_depth(rwb, blk_queue_depth(q)); 771 wbt_set_queue_depth(q, blk_queue_depth(q));
779 wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); 772 wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
780 773
781 return 0; 774 return 0;
782} 775}
783
784void wbt_exit(struct request_queue *q)
785{
786 struct rq_wb *rwb = q->rq_wb;
787
788 if (rwb) {
789 blk_stat_remove_callback(q, rwb->cb);
790 blk_stat_free_callback(rwb->cb);
791 q->rq_wb = NULL;
792 kfree(rwb);
793 }
794}
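Taken together, the blk-wbt.c changes are an instance of the new rq_qos pattern: a policy embeds struct rq_qos in its own state, registers a struct rq_qos_ops vtable with rq_qos_add(), and recovers its state in every callback via container_of(), which is exactly what RQWB() does. A condensed sketch of the same pattern with a hypothetical policy (names illustrative, error handling trimmed):

	struct demo_qos {
		struct rq_qos rqos;	/* must be embedded for container_of() */
		atomic_t tracked;
	};

	static inline struct demo_qos *DEMO(struct rq_qos *rqos)
	{
		return container_of(rqos, struct demo_qos, rqos);
	}

	static void demo_track(struct rq_qos *rqos, struct request *rq,
			       struct bio *bio)
	{
		atomic_inc(&DEMO(rqos)->tracked);
	}

	static void demo_done(struct rq_qos *rqos, struct request *rq)
	{
		atomic_dec(&DEMO(rqos)->tracked);
	}

	static void demo_exit(struct rq_qos *rqos)
	{
		kfree(DEMO(rqos));
	}

	static struct rq_qos_ops demo_ops = {
		.track	= demo_track,
		.done	= demo_done,
		.exit	= demo_exit,
	};

	static int demo_init(struct request_queue *q)
	{
		struct demo_qos *d = kzalloc(sizeof(*d), GFP_KERNEL);

		if (!d)
			return -ENOMEM;
		d->rqos.id = RQ_QOS_WBT;	/* a real policy would use its own id */
		d->rqos.ops = &demo_ops;
		d->rqos.q = q;
		rq_qos_add(q, &d->rqos);
		return 0;
	}
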
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 300df531d0a6..f47218d5b3b2 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -9,6 +9,7 @@
9#include <linux/ktime.h> 9#include <linux/ktime.h>
10 10
11#include "blk-stat.h" 11#include "blk-stat.h"
12#include "blk-rq-qos.h"
12 13
13enum wbt_flags { 14enum wbt_flags {
14 WBT_TRACKED = 1, /* write, tracked for throttling */ 15 WBT_TRACKED = 1, /* write, tracked for throttling */
@@ -35,20 +36,12 @@ enum {
35 WBT_STATE_ON_MANUAL = 2, 36 WBT_STATE_ON_MANUAL = 2,
36}; 37};
37 38
38struct rq_wait {
39 wait_queue_head_t wait;
40 atomic_t inflight;
41};
42
43struct rq_wb { 39struct rq_wb {
44 /* 40 /*
45 * Settings that govern how we throttle 41 * Settings that govern how we throttle
46 */ 42 */
47 unsigned int wb_background; /* background writeback */ 43 unsigned int wb_background; /* background writeback */
48 unsigned int wb_normal; /* normal writeback */ 44 unsigned int wb_normal; /* normal writeback */
49 unsigned int wb_max; /* max throughput writeback */
50 int scale_step;
51 bool scaled_max;
52 45
53 short enable_state; /* WBT_STATE_* */ 46 short enable_state; /* WBT_STATE_* */
54 47
@@ -67,15 +60,20 @@ struct rq_wb {
67 void *sync_cookie; 60 void *sync_cookie;
68 61
69 unsigned int wc; 62 unsigned int wc;
70 unsigned int queue_depth;
71 63
72 unsigned long last_issue; /* last non-throttled issue */ 64 unsigned long last_issue; /* last non-throttled issue */
73 unsigned long last_comp; /* last non-throttled comp */ 65 unsigned long last_comp; /* last non-throttled comp */
74 unsigned long min_lat_nsec; 66 unsigned long min_lat_nsec;
75 struct request_queue *queue; 67 struct rq_qos rqos;
76 struct rq_wait rq_wait[WBT_NUM_RWQ]; 68 struct rq_wait rq_wait[WBT_NUM_RWQ];
69 struct rq_depth rq_depth;
77}; 70};
78 71
72static inline struct rq_wb *RQWB(struct rq_qos *rqos)
73{
74 return container_of(rqos, struct rq_wb, rqos);
75}
76
79static inline unsigned int wbt_inflight(struct rq_wb *rwb) 77static inline unsigned int wbt_inflight(struct rq_wb *rwb)
80{ 78{
81 unsigned int i, ret = 0; 79 unsigned int i, ret = 0;
@@ -86,26 +84,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
86 return ret; 84 return ret;
87} 85}
88 86
89#ifdef CONFIG_BLK_WBT
90 87
91static inline void wbt_track(struct request *rq, enum wbt_flags flags) 88#ifdef CONFIG_BLK_WBT
92{
93 rq->wbt_flags |= flags;
94}
95 89
96void __wbt_done(struct rq_wb *, enum wbt_flags);
97void wbt_done(struct rq_wb *, struct request *);
98enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
99int wbt_init(struct request_queue *); 90int wbt_init(struct request_queue *);
100void wbt_exit(struct request_queue *); 91void wbt_update_limits(struct request_queue *);
101void wbt_update_limits(struct rq_wb *);
102void wbt_requeue(struct rq_wb *, struct request *);
103void wbt_issue(struct rq_wb *, struct request *);
104void wbt_disable_default(struct request_queue *); 92void wbt_disable_default(struct request_queue *);
105void wbt_enable_default(struct request_queue *); 93void wbt_enable_default(struct request_queue *);
106 94
107void wbt_set_queue_depth(struct rq_wb *, unsigned int); 95u64 wbt_get_min_lat(struct request_queue *q);
108void wbt_set_write_cache(struct rq_wb *, bool); 96void wbt_set_min_lat(struct request_queue *q, u64 val);
97
98void wbt_set_queue_depth(struct request_queue *, unsigned int);
99void wbt_set_write_cache(struct request_queue *, bool);
109 100
110u64 wbt_default_latency_nsec(struct request_queue *); 101u64 wbt_default_latency_nsec(struct request_queue *);
111 102
@@ -114,43 +105,30 @@ u64 wbt_default_latency_nsec(struct request_queue *);
114static inline void wbt_track(struct request *rq, enum wbt_flags flags) 105static inline void wbt_track(struct request *rq, enum wbt_flags flags)
115{ 106{
116} 107}
117static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
118{
119}
120static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
121{
122}
123static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
124 spinlock_t *lock)
125{
126 return 0;
127}
128static inline int wbt_init(struct request_queue *q) 108static inline int wbt_init(struct request_queue *q)
129{ 109{
130 return -EINVAL; 110 return -EINVAL;
131} 111}
132static inline void wbt_exit(struct request_queue *q) 112static inline void wbt_update_limits(struct request_queue *q)
133{
134}
135static inline void wbt_update_limits(struct rq_wb *rwb)
136{ 113{
137} 114}
138static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq) 115static inline void wbt_disable_default(struct request_queue *q)
139{ 116{
140} 117}
141static inline void wbt_issue(struct rq_wb *rwb, struct request *rq) 118static inline void wbt_enable_default(struct request_queue *q)
142{ 119{
143} 120}
144static inline void wbt_disable_default(struct request_queue *q) 121static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
145{ 122{
146} 123}
147static inline void wbt_enable_default(struct request_queue *q) 124static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
148{ 125{
149} 126}
150static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) 127static inline u64 wbt_get_min_lat(struct request_queue *q)
151{ 128{
129 return 0;
152} 130}
153static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) 131static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
154{ 132{
155} 133}
156static inline u64 wbt_default_latency_nsec(struct request_queue *q) 134static inline u64 wbt_default_latency_nsec(struct request_queue *q)
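Since callers no longer see struct rq_wb at all, everything outside blk-wbt.c goes through these request_queue-based entry points, which quietly do nothing when wbt is not attached. An illustrative sysfs-style store handler, not the actual blk-sysfs.c code:

	/* Sketch: set the wbt latency target from a decimal usec string. */
	static ssize_t demo_wb_lat_store(struct request_queue *q,
					 const char *page, size_t count)
	{
		u64 usec;
		int err;

		err = kstrtou64(page, 10, &usec);
		if (err < 0)
			return err;

		/* wbt_set_min_lat() is a no-op if no wbt rq_qos is attached */
		wbt_set_min_lat(q, usec * NSEC_PER_USEC);
		return count;
	}
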
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 51000914e23f..c461cf63f1f4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev,
200 /* Get header in the first page */ 200 /* Get header in the first page */
201 ofst = 0; 201 ofst = 0;
202 if (!nr_rep) { 202 if (!nr_rep) {
203 hdr = (struct blk_zone_report_hdr *) addr; 203 hdr = addr;
204 nr_rep = hdr->nr_zones; 204 nr_rep = hdr->nr_zones;
205 ofst = sizeof(struct blk_zone_report_hdr); 205 ofst = sizeof(struct blk_zone_report_hdr);
206 } 206 }
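The dropped cast works because C converts void * to any object pointer type implicitly; spelling the cast out only obscures a later type change. In isolation:

	void *addr = kmap_atomic(page);			/* sketch; any void * source */
	struct blk_zone_report_hdr *hdr = addr;		/* no cast required */
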
diff --git a/block/blk.h b/block/blk.h
index 8d23aea96ce9..d4d67e948920 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
130int blk_init_rl(struct request_list *rl, struct request_queue *q, 130int blk_init_rl(struct request_list *rl, struct request_queue *q,
131 gfp_t gfp_mask); 131 gfp_t gfp_mask);
132void blk_exit_rl(struct request_queue *q, struct request_list *rl); 132void blk_exit_rl(struct request_queue *q, struct request_list *rl);
133void blk_exit_queue(struct request_queue *q);
133void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 134void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
134 struct bio *bio); 135 struct bio *bio);
135void blk_queue_bypass_start(struct request_queue *q); 136void blk_queue_bypass_start(struct request_queue *q);
@@ -412,4 +413,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
412 413
413extern void blk_drain_queue(struct request_queue *q); 414extern void blk_drain_queue(struct request_queue *q);
414 415
416#ifdef CONFIG_BLK_CGROUP_IOLATENCY
417extern int blk_iolatency_init(struct request_queue *q);
418#else
419static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
420#endif
421
415#endif /* BLK_INTERNAL_H */ 422#endif /* BLK_INTERNAL_H */
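blk_iolatency_init() follows the usual stub pattern for optional features: a real prototype under CONFIG_BLK_CGROUP_IOLATENCY and a static inline that compiles to nothing otherwise, so call sites stay unconditional. The intended shape of a caller (a sketch; the real call site is in the blk-cgroup setup path):

	ret = blk_iolatency_init(q);
	if (ret)
		goto err_destroy;	/* the stub always returns 0 */
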
diff --git a/block/bounce.c b/block/bounce.c
index fd31347b7836..bc63b3a2d18c 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio)
195 __bounce_end_io_read(bio, &isa_page_pool); 195 __bounce_end_io_read(bio, &isa_page_pool);
196} 196}
197 197
198static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
199 struct bio_set *bs)
200{
201 struct bvec_iter iter;
202 struct bio_vec bv;
203 struct bio *bio;
204
205 /*
206 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
207 * bio_src->bi_io_vec to bio->bi_io_vec.
208 *
209 * We can't do that anymore, because:
210 *
211 * - The point of cloning the biovec is to produce a bio with a biovec
212 * the caller can modify: bi_idx and bi_bvec_done should be 0.
213 *
214 * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
215 * we tried to clone the whole thing bio_alloc_bioset() would fail.
216 * But the clone should succeed as long as the number of biovecs we
217 * actually need to allocate is fewer than BIO_MAX_PAGES.
218 *
219 * - Lastly, bi_vcnt should not be looked at or relied upon by code
220 * that does not own the bio - reason being drivers don't use it for
221 * iterating over the biovec anymore, so expecting it to be kept up
222 * to date (i.e. for clones that share the parent biovec) is just
223 * asking for trouble and would force extra work on
224 * __bio_clone_fast() anyways.
225 */
226
227 bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
228 if (!bio)
229 return NULL;
230 bio->bi_disk = bio_src->bi_disk;
231 bio->bi_opf = bio_src->bi_opf;
232 bio->bi_write_hint = bio_src->bi_write_hint;
233 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
234 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
235
236 switch (bio_op(bio)) {
237 case REQ_OP_DISCARD:
238 case REQ_OP_SECURE_ERASE:
239 case REQ_OP_WRITE_ZEROES:
240 break;
241 case REQ_OP_WRITE_SAME:
242 bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
243 break;
244 default:
245 bio_for_each_segment(bv, bio_src, iter)
246 bio->bi_io_vec[bio->bi_vcnt++] = bv;
247 break;
248 }
249
250 if (bio_integrity(bio_src)) {
251 int ret;
252
253 ret = bio_integrity_clone(bio, bio_src, gfp_mask);
254 if (ret < 0) {
255 bio_put(bio);
256 return NULL;
257 }
258 }
259
260 bio_clone_blkcg_association(bio, bio_src);
261
262 return bio;
263}
264
198static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 265static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
199 mempool_t *pool) 266 mempool_t *pool)
200{ 267{
@@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
222 generic_make_request(*bio_orig); 289 generic_make_request(*bio_orig);
223 *bio_orig = bio; 290 *bio_orig = bio;
224 } 291 }
225 bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : 292 bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
226 &bounce_bio_set); 293 &bounce_bio_set);
227 294
228 bio_for_each_segment_all(to, bio, i) { 295 bio_for_each_segment_all(to, bio, i) {
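bounce_clone_bio() is a private copy of the old bio_clone_bioset() logic, pulled in so the generic helper can go away with bounce as its last user. The interesting part is that cloning walks the source with the iterator instead of memcpying bi_io_vec, which keeps partially-advanced bios correct, roughly:

	/* Sketch: src/dst are illustrative bios, not names from the patch. */
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_segment(bv, src, iter)	/* honours src->bi_iter */
		dst->bi_io_vec[dst->bi_vcnt++] = bv;
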
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 9419def8c017..f3501cdaf1a6 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
48 48
49 job->request_len = hdr->request_len; 49 job->request_len = hdr->request_len;
50 job->request = memdup_user(uptr64(hdr->request), hdr->request_len); 50 job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
51 if (IS_ERR(job->request)) 51
52 return PTR_ERR(job->request); 52 return PTR_ERR_OR_ZERO(job->request);
53 return 0;
54} 53}
55 54
56static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) 55static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
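PTR_ERR_OR_ZERO() folds the two-branch tail into one expression; it is exactly equivalent to the removed lines:

	/* What PTR_ERR_OR_ZERO(job->request) expands to, conceptually: */
	if (IS_ERR(job->request))
		return PTR_ERR(job->request);
	return 0;
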
diff --git a/block/bsg.c b/block/bsg.c
index 3da540faf673..db588add6ba6 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -13,11 +13,9 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/poll.h>
17#include <linux/cdev.h> 16#include <linux/cdev.h>
18#include <linux/jiffies.h> 17#include <linux/jiffies.h>
19#include <linux/percpu.h> 18#include <linux/percpu.h>
20#include <linux/uio.h>
21#include <linux/idr.h> 19#include <linux/idr.h>
22#include <linux/bsg.h> 20#include <linux/bsg.h>
23#include <linux/slab.h> 21#include <linux/slab.h>
@@ -38,21 +36,10 @@
38struct bsg_device { 36struct bsg_device {
39 struct request_queue *queue; 37 struct request_queue *queue;
40 spinlock_t lock; 38 spinlock_t lock;
41 struct list_head busy_list;
42 struct list_head done_list;
43 struct hlist_node dev_list; 39 struct hlist_node dev_list;
44 atomic_t ref_count; 40 atomic_t ref_count;
45 int queued_cmds;
46 int done_cmds;
47 wait_queue_head_t wq_done;
48 wait_queue_head_t wq_free;
49 char name[20]; 41 char name[20];
50 int max_queue; 42 int max_queue;
51 unsigned long flags;
52};
53
54enum {
55 BSG_F_BLOCK = 1,
56}; 43};
57 44
58#define BSG_DEFAULT_CMDS 64 45#define BSG_DEFAULT_CMDS 64
@@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE];
67static struct class *bsg_class; 54static struct class *bsg_class;
68static int bsg_major; 55static int bsg_major;
69 56
70static struct kmem_cache *bsg_cmd_cachep;
71
72/*
73 * our internal command type
74 */
75struct bsg_command {
76 struct bsg_device *bd;
77 struct list_head list;
78 struct request *rq;
79 struct bio *bio;
80 struct bio *bidi_bio;
81 int err;
82 struct sg_io_v4 hdr;
83};
84
85static void bsg_free_command(struct bsg_command *bc)
86{
87 struct bsg_device *bd = bc->bd;
88 unsigned long flags;
89
90 kmem_cache_free(bsg_cmd_cachep, bc);
91
92 spin_lock_irqsave(&bd->lock, flags);
93 bd->queued_cmds--;
94 spin_unlock_irqrestore(&bd->lock, flags);
95
96 wake_up(&bd->wq_free);
97}
98
99static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
100{
101 struct bsg_command *bc = ERR_PTR(-EINVAL);
102
103 spin_lock_irq(&bd->lock);
104
105 if (bd->queued_cmds >= bd->max_queue)
106 goto out;
107
108 bd->queued_cmds++;
109 spin_unlock_irq(&bd->lock);
110
111 bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL);
112 if (unlikely(!bc)) {
113 spin_lock_irq(&bd->lock);
114 bd->queued_cmds--;
115 bc = ERR_PTR(-ENOMEM);
116 goto out;
117 }
118
119 bc->bd = bd;
120 INIT_LIST_HEAD(&bc->list);
121 bsg_dbg(bd, "returning free cmd %p\n", bc);
122 return bc;
123out:
124 spin_unlock_irq(&bd->lock);
125 return bc;
126}
127
128static inline struct hlist_head *bsg_dev_idx_hash(int index) 57static inline struct hlist_head *bsg_dev_idx_hash(int index)
129{ 58{
130 return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; 59 return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
@@ -285,101 +214,6 @@ out:
285 return ERR_PTR(ret); 214 return ERR_PTR(ret);
286} 215}
287 216
288/*
289 * async completion call-back from the block layer, when scsi/ide/whatever
290 * calls end_that_request_last() on a request
291 */
292static void bsg_rq_end_io(struct request *rq, blk_status_t status)
293{
294 struct bsg_command *bc = rq->end_io_data;
295 struct bsg_device *bd = bc->bd;
296 unsigned long flags;
297
298 bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
299 rq, bc, bc->bio);
300
301 bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
302
303 spin_lock_irqsave(&bd->lock, flags);
304 list_move_tail(&bc->list, &bd->done_list);
305 bd->done_cmds++;
306 spin_unlock_irqrestore(&bd->lock, flags);
307
308 wake_up(&bd->wq_done);
309}
310
311/*
312 * do final setup of a 'bc' and submit the matching 'rq' to the block
313 * layer for io
314 */
315static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
316 struct bsg_command *bc, struct request *rq)
317{
318 int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL));
319
320 /*
321 * add bc command to busy queue and submit rq for io
322 */
323 bc->rq = rq;
324 bc->bio = rq->bio;
325 if (rq->next_rq)
326 bc->bidi_bio = rq->next_rq->bio;
327 bc->hdr.duration = jiffies;
328 spin_lock_irq(&bd->lock);
329 list_add_tail(&bc->list, &bd->busy_list);
330 spin_unlock_irq(&bd->lock);
331
332 bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
333
334 rq->end_io_data = bc;
335 blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
336}
337
338static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd)
339{
340 struct bsg_command *bc = NULL;
341
342 spin_lock_irq(&bd->lock);
343 if (bd->done_cmds) {
344 bc = list_first_entry(&bd->done_list, struct bsg_command, list);
345 list_del(&bc->list);
346 bd->done_cmds--;
347 }
348 spin_unlock_irq(&bd->lock);
349
350 return bc;
351}
352
353/*
354 * Get a finished command from the done list
355 */
356static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
357{
358 struct bsg_command *bc;
359 int ret;
360
361 do {
362 bc = bsg_next_done_cmd(bd);
363 if (bc)
364 break;
365
366 if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
367 bc = ERR_PTR(-EAGAIN);
368 break;
369 }
370
371 ret = wait_event_interruptible(bd->wq_done, bd->done_cmds);
372 if (ret) {
373 bc = ERR_PTR(-ERESTARTSYS);
374 break;
375 }
376 } while (1);
377
378 bsg_dbg(bd, "returning done %p\n", bc);
379
380 return bc;
381}
382
383static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, 217static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
384 struct bio *bio, struct bio *bidi_bio) 218 struct bio *bio, struct bio *bidi_bio)
385{ 219{
@@ -398,234 +232,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
398 return ret; 232 return ret;
399} 233}
400 234
401static bool bsg_complete(struct bsg_device *bd)
402{
403 bool ret = false;
404 bool spin;
405
406 do {
407 spin_lock_irq(&bd->lock);
408
409 BUG_ON(bd->done_cmds > bd->queued_cmds);
410
411 /*
412 * All commands consumed.
413 */
414 if (bd->done_cmds == bd->queued_cmds)
415 ret = true;
416
417 spin = !test_bit(BSG_F_BLOCK, &bd->flags);
418
419 spin_unlock_irq(&bd->lock);
420 } while (!ret && spin);
421
422 return ret;
423}
424
425static int bsg_complete_all_commands(struct bsg_device *bd)
426{
427 struct bsg_command *bc;
428 int ret, tret;
429
430 bsg_dbg(bd, "entered\n");
431
432 /*
433 * wait for all commands to complete
434 */
435 io_wait_event(bd->wq_done, bsg_complete(bd));
436
437 /*
438 * discard done commands
439 */
440 ret = 0;
441 do {
442 spin_lock_irq(&bd->lock);
443 if (!bd->queued_cmds) {
444 spin_unlock_irq(&bd->lock);
445 break;
446 }
447 spin_unlock_irq(&bd->lock);
448
449 bc = bsg_get_done_cmd(bd);
450 if (IS_ERR(bc))
451 break;
452
453 tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
454 bc->bidi_bio);
455 if (!ret)
456 ret = tret;
457
458 bsg_free_command(bc);
459 } while (1);
460
461 return ret;
462}
463
464static int
465__bsg_read(char __user *buf, size_t count, struct bsg_device *bd,
466 const struct iovec *iov, ssize_t *bytes_read)
467{
468 struct bsg_command *bc;
469 int nr_commands, ret;
470
471 if (count % sizeof(struct sg_io_v4))
472 return -EINVAL;
473
474 ret = 0;
475 nr_commands = count / sizeof(struct sg_io_v4);
476 while (nr_commands) {
477 bc = bsg_get_done_cmd(bd);
478 if (IS_ERR(bc)) {
479 ret = PTR_ERR(bc);
480 break;
481 }
482
483 /*
484 * this is the only case where we need to copy data back
485 * after completing the request. so do that here,
486 * bsg_complete_work() cannot do that for us
487 */
488 ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio,
489 bc->bidi_bio);
490
491 if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr)))
492 ret = -EFAULT;
493
494 bsg_free_command(bc);
495
496 if (ret)
497 break;
498
499 buf += sizeof(struct sg_io_v4);
500 *bytes_read += sizeof(struct sg_io_v4);
501 nr_commands--;
502 }
503
504 return ret;
505}
506
507static inline void bsg_set_block(struct bsg_device *bd, struct file *file)
508{
509 if (file->f_flags & O_NONBLOCK)
510 clear_bit(BSG_F_BLOCK, &bd->flags);
511 else
512 set_bit(BSG_F_BLOCK, &bd->flags);
513}
514
515/*
516 * Check if the error is a "real" error that we should return.
517 */
518static inline int err_block_err(int ret)
519{
520 if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN)
521 return 1;
522
523 return 0;
524}
525
526static ssize_t
527bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
528{
529 struct bsg_device *bd = file->private_data;
530 int ret;
531 ssize_t bytes_read;
532
533 bsg_dbg(bd, "read %zd bytes\n", count);
534
535 bsg_set_block(bd, file);
536
537 bytes_read = 0;
538 ret = __bsg_read(buf, count, bd, NULL, &bytes_read);
539 *ppos = bytes_read;
540
541 if (!bytes_read || err_block_err(ret))
542 bytes_read = ret;
543
544 return bytes_read;
545}
546
547static int __bsg_write(struct bsg_device *bd, const char __user *buf,
548 size_t count, ssize_t *bytes_written, fmode_t mode)
549{
550 struct bsg_command *bc;
551 struct request *rq;
552 int ret, nr_commands;
553
554 if (count % sizeof(struct sg_io_v4))
555 return -EINVAL;
556
557 nr_commands = count / sizeof(struct sg_io_v4);
558 rq = NULL;
559 bc = NULL;
560 ret = 0;
561 while (nr_commands) {
562 struct request_queue *q = bd->queue;
563
564 bc = bsg_alloc_command(bd);
565 if (IS_ERR(bc)) {
566 ret = PTR_ERR(bc);
567 bc = NULL;
568 break;
569 }
570
571 if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) {
572 ret = -EFAULT;
573 break;
574 }
575
576 /*
577 * get a request, fill in the blanks, and add to request queue
578 */
579 rq = bsg_map_hdr(bd->queue, &bc->hdr, mode);
580 if (IS_ERR(rq)) {
581 ret = PTR_ERR(rq);
582 rq = NULL;
583 break;
584 }
585
586 bsg_add_command(bd, q, bc, rq);
587 bc = NULL;
588 rq = NULL;
589 nr_commands--;
590 buf += sizeof(struct sg_io_v4);
591 *bytes_written += sizeof(struct sg_io_v4);
592 }
593
594 if (bc)
595 bsg_free_command(bc);
596
597 return ret;
598}
599
600static ssize_t
601bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
602{
603 struct bsg_device *bd = file->private_data;
604 ssize_t bytes_written;
605 int ret;
606
607 bsg_dbg(bd, "write %zd bytes\n", count);
608
609 if (unlikely(uaccess_kernel()))
610 return -EINVAL;
611
612 bsg_set_block(bd, file);
613
614 bytes_written = 0;
615 ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode);
616
617 *ppos = bytes_written;
618
619 /*
620 * return bytes written on non-fatal errors
621 */
622 if (!bytes_written || err_block_err(ret))
623 bytes_written = ret;
624
625 bsg_dbg(bd, "returning %zd\n", bytes_written);
626 return bytes_written;
627}
628
629static struct bsg_device *bsg_alloc_device(void) 235static struct bsg_device *bsg_alloc_device(void)
630{ 236{
631 struct bsg_device *bd; 237 struct bsg_device *bd;
@@ -635,29 +241,20 @@ static struct bsg_device *bsg_alloc_device(void)
635 return NULL; 241 return NULL;
636 242
637 spin_lock_init(&bd->lock); 243 spin_lock_init(&bd->lock);
638
639 bd->max_queue = BSG_DEFAULT_CMDS; 244 bd->max_queue = BSG_DEFAULT_CMDS;
640
641 INIT_LIST_HEAD(&bd->busy_list);
642 INIT_LIST_HEAD(&bd->done_list);
643 INIT_HLIST_NODE(&bd->dev_list); 245 INIT_HLIST_NODE(&bd->dev_list);
644
645 init_waitqueue_head(&bd->wq_free);
646 init_waitqueue_head(&bd->wq_done);
647 return bd; 246 return bd;
648} 247}
649 248
650static int bsg_put_device(struct bsg_device *bd) 249static int bsg_put_device(struct bsg_device *bd)
651{ 250{
652 int ret = 0, do_free;
653 struct request_queue *q = bd->queue; 251 struct request_queue *q = bd->queue;
654 252
655 mutex_lock(&bsg_mutex); 253 mutex_lock(&bsg_mutex);
656 254
657 do_free = atomic_dec_and_test(&bd->ref_count); 255 if (!atomic_dec_and_test(&bd->ref_count)) {
658 if (!do_free) {
659 mutex_unlock(&bsg_mutex); 256 mutex_unlock(&bsg_mutex);
660 goto out; 257 return 0;
661 } 258 }
662 259
663 hlist_del(&bd->dev_list); 260 hlist_del(&bd->dev_list);
@@ -668,20 +265,9 @@ static int bsg_put_device(struct bsg_device *bd)
668 /* 265 /*
669 * close can always block 266 * close can always block
670 */ 267 */
671 set_bit(BSG_F_BLOCK, &bd->flags);
672
673 /*
674 * correct error detection baddies here again. it's the responsibility
675 * of the app to properly reap commands before close() if it wants
676 * fool-proof error detection
677 */
678 ret = bsg_complete_all_commands(bd);
679
680 kfree(bd); 268 kfree(bd);
681out: 269 blk_put_queue(q);
682 if (do_free) 270 return 0;
683 blk_put_queue(q);
684 return ret;
685} 271}
686 272
687static struct bsg_device *bsg_add_device(struct inode *inode, 273static struct bsg_device *bsg_add_device(struct inode *inode,
@@ -704,8 +290,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
704 290
705 bd->queue = rq; 291 bd->queue = rq;
706 292
707 bsg_set_block(bd, file);
708
709 atomic_set(&bd->ref_count, 1); 293 atomic_set(&bd->ref_count, 1);
710 hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); 294 hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
711 295
@@ -779,24 +363,6 @@ static int bsg_release(struct inode *inode, struct file *file)
779 return bsg_put_device(bd); 363 return bsg_put_device(bd);
780} 364}
781 365
782static __poll_t bsg_poll(struct file *file, poll_table *wait)
783{
784 struct bsg_device *bd = file->private_data;
785 __poll_t mask = 0;
786
787 poll_wait(file, &bd->wq_done, wait);
788 poll_wait(file, &bd->wq_free, wait);
789
790 spin_lock_irq(&bd->lock);
791 if (!list_empty(&bd->done_list))
792 mask |= EPOLLIN | EPOLLRDNORM;
793 if (bd->queued_cmds < bd->max_queue)
794 mask |= EPOLLOUT;
795 spin_unlock_irq(&bd->lock);
796
797 return mask;
798}
799
800static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 366static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
801{ 367{
802 struct bsg_device *bd = file->private_data; 368 struct bsg_device *bd = file->private_data;
@@ -870,9 +436,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
870} 436}
871 437
872static const struct file_operations bsg_fops = { 438static const struct file_operations bsg_fops = {
873 .read = bsg_read,
874 .write = bsg_write,
875 .poll = bsg_poll,
876 .open = bsg_open, 439 .open = bsg_open,
877 .release = bsg_release, 440 .release = bsg_release,
878 .unlocked_ioctl = bsg_ioctl, 441 .unlocked_ioctl = bsg_ioctl,
@@ -977,21 +540,12 @@ static int __init bsg_init(void)
977 int ret, i; 540 int ret, i;
978 dev_t devid; 541 dev_t devid;
979 542
980 bsg_cmd_cachep = kmem_cache_create("bsg_cmd",
981 sizeof(struct bsg_command), 0, 0, NULL);
982 if (!bsg_cmd_cachep) {
983 printk(KERN_ERR "bsg: failed creating slab cache\n");
984 return -ENOMEM;
985 }
986
987 for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) 543 for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++)
988 INIT_HLIST_HEAD(&bsg_device_list[i]); 544 INIT_HLIST_HEAD(&bsg_device_list[i]);
989 545
990 bsg_class = class_create(THIS_MODULE, "bsg"); 546 bsg_class = class_create(THIS_MODULE, "bsg");
991 if (IS_ERR(bsg_class)) { 547 if (IS_ERR(bsg_class))
992 ret = PTR_ERR(bsg_class); 548 return PTR_ERR(bsg_class);
993 goto destroy_kmemcache;
994 }
995 bsg_class->devnode = bsg_devnode; 549 bsg_class->devnode = bsg_devnode;
996 550
997 ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); 551 ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
@@ -1012,8 +566,6 @@ unregister_chrdev:
1012 unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS); 566 unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
1013destroy_bsg_class: 567destroy_bsg_class:
1014 class_destroy(bsg_class); 568 class_destroy(bsg_class);
1015destroy_kmemcache:
1016 kmem_cache_destroy(bsg_cmd_cachep);
1017 return ret; 569 return ret;
1018} 570}
1019 571
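With bsg_read()/bsg_write() and the async command lists gone, a bsg node is driven purely through the synchronous ioctl(SG_IO) path with a struct sg_io_v4. A minimal userspace sketch of issuing one command (illustrative; no data buffer or sense decoding):

	#include <fcntl.h>
	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <scsi/sg.h>
	#include <linux/bsg.h>

	static int bsg_send(const char *dev, unsigned char *cdb,
			    unsigned int cdb_len)
	{
		struct sg_io_v4 hdr;
		int fd, ret;

		fd = open(dev, O_RDWR);
		if (fd < 0)
			return -1;

		memset(&hdr, 0, sizeof(hdr));
		hdr.guard = 'Q';			/* sg_io_v4 magic */
		hdr.protocol = BSG_PROTOCOL_SCSI;
		hdr.subprotocol = BSG_SUB_PROTOCOL_SCSI_CMD;
		hdr.request = (uint64_t)(uintptr_t)cdb;
		hdr.request_len = cdb_len;

		ret = ioctl(fd, SG_IO, &hdr);		/* blocks until completion */
		close(fd);
		return ret;
	}
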
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 82b6c27b3245..2eb87444b157 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3666,6 +3666,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3666 switch (ioprio_class) { 3666 switch (ioprio_class) {
3667 default: 3667 default:
3668 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 3668 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
3669 /* fall through */
3669 case IOPRIO_CLASS_NONE: 3670 case IOPRIO_CLASS_NONE:
3670 /* 3671 /*
3671 * no prio set, inherit CPU scheduling settings 3672 * no prio set, inherit CPU scheduling settings
@@ -4735,12 +4736,13 @@ USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
4735static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 4736static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4736{ \ 4737{ \
4737 struct cfq_data *cfqd = e->elevator_data; \ 4738 struct cfq_data *cfqd = e->elevator_data; \
4738 unsigned int __data; \ 4739 unsigned int __data, __min = (MIN), __max = (MAX); \
4740 \
4739 cfq_var_store(&__data, (page)); \ 4741 cfq_var_store(&__data, (page)); \
4740 if (__data < (MIN)) \ 4742 if (__data < __min) \
4741 __data = (MIN); \ 4743 __data = __min; \
4742 else if (__data > (MAX)) \ 4744 else if (__data > __max) \
4743 __data = (MAX); \ 4745 __data = __max; \
4744 if (__CONV) \ 4746 if (__CONV) \
4745 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ 4747 *(__PTR) = (u64)__data * NSEC_PER_MSEC; \
4746 else \ 4748 else \
@@ -4769,12 +4771,13 @@ STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX,
4769static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 4771static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
4770{ \ 4772{ \
4771 struct cfq_data *cfqd = e->elevator_data; \ 4773 struct cfq_data *cfqd = e->elevator_data; \
4772 unsigned int __data; \ 4774 unsigned int __data, __min = (MIN), __max = (MAX); \
4775 \
4773 cfq_var_store(&__data, (page)); \ 4776 cfq_var_store(&__data, (page)); \
4774 if (__data < (MIN)) \ 4777 if (__data < __min) \
4775 __data = (MIN); \ 4778 __data = __min; \
4776 else if (__data > (MAX)) \ 4779 else if (__data > __max) \
4777 __data = (MAX); \ 4780 __data = __max; \
4778 *(__PTR) = (u64)__data * NSEC_PER_USEC; \ 4781 *(__PTR) = (u64)__data * NSEC_PER_USEC; \
4779 return count; \ 4782 return count; \
4780} 4783}
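Caching MIN and MAX in __min/__max locals matters because function-like macro parameters are re-expanded at every use, so the old body could evaluate the caller's expression twice. A generic illustration of the hazard and the fix, not the cfq macro itself:

	#define CLAMP_BAD(v, lo, hi) \
		((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

	/* CLAMP_BAD(x, 0, expensive_limit()) may call expensive_limit() twice. */

	#define CLAMP_OK(v, lo, hi) ({				\
		typeof(v) __v = (v), __lo = (lo), __hi = (hi);	\
		__v < __lo ? __lo : (__v > __hi ? __hi : __v);	\
	})	/* assumes v, lo, hi have compatible types */
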
diff --git a/block/genhd.c b/block/genhd.c
index f1543a45e73b..8cc719a37b32 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1333,21 +1333,28 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1333 part_round_stats(gp->queue, cpu, hd); 1333 part_round_stats(gp->queue, cpu, hd);
1334 part_stat_unlock(); 1334 part_stat_unlock();
1335 part_in_flight(gp->queue, hd, inflight); 1335 part_in_flight(gp->queue, hd, inflight);
1336 seq_printf(seqf, "%4d %7d %s %lu %lu %lu " 1336 seq_printf(seqf, "%4d %7d %s "
1337 "%u %lu %lu %lu %u %u %u %u\n", 1337 "%lu %lu %lu %u "
1338 "%lu %lu %lu %u "
1339 "%u %u %u "
1340 "%lu %lu %lu %u\n",
1338 MAJOR(part_devt(hd)), MINOR(part_devt(hd)), 1341 MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1339 disk_name(gp, hd->partno, buf), 1342 disk_name(gp, hd->partno, buf),
1340 part_stat_read(hd, ios[READ]), 1343 part_stat_read(hd, ios[STAT_READ]),
1341 part_stat_read(hd, merges[READ]), 1344 part_stat_read(hd, merges[STAT_READ]),
1342 part_stat_read(hd, sectors[READ]), 1345 part_stat_read(hd, sectors[STAT_READ]),
1343 jiffies_to_msecs(part_stat_read(hd, ticks[READ])), 1346 jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])),
1344 part_stat_read(hd, ios[WRITE]), 1347 part_stat_read(hd, ios[STAT_WRITE]),
1345 part_stat_read(hd, merges[WRITE]), 1348 part_stat_read(hd, merges[STAT_WRITE]),
1346 part_stat_read(hd, sectors[WRITE]), 1349 part_stat_read(hd, sectors[STAT_WRITE]),
1347 jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), 1350 jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])),
1348 inflight[0], 1351 inflight[0],
1349 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1352 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1350 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 1353 jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
1354 part_stat_read(hd, ios[STAT_DISCARD]),
1355 part_stat_read(hd, merges[STAT_DISCARD]),
1356 part_stat_read(hd, sectors[STAT_DISCARD]),
1357 jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD]))
1351 ); 1358 );
1352 } 1359 }
1353 disk_part_iter_exit(&piter); 1360 disk_part_iter_exit(&piter);
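With the discard counters appended, each /proc/diskstats line grows from 11 to 15 stat fields after the device name: the read and write quadruples (ios, merges, sectors, ticks), then in-flight, io_ticks and time_in_queue, then the same quadruple for discards. An illustrative line (values made up):

	   8       0 sda 120994 7271 6007663 68424 99410 141659 5620456 122783 0 73395 182920 1432 0 991384 2304
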
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 3dcfd4ec0e11..5a8975a1201c 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -130,19 +130,24 @@ ssize_t part_stat_show(struct device *dev,
130 return sprintf(buf, 130 return sprintf(buf,
131 "%8lu %8lu %8llu %8u " 131 "%8lu %8lu %8llu %8u "
132 "%8lu %8lu %8llu %8u " 132 "%8lu %8lu %8llu %8u "
133 "%8u %8u %8u" 133 "%8u %8u %8u "
134 "%8lu %8lu %8llu %8u"
134 "\n", 135 "\n",
135 part_stat_read(p, ios[READ]), 136 part_stat_read(p, ios[STAT_READ]),
136 part_stat_read(p, merges[READ]), 137 part_stat_read(p, merges[STAT_READ]),
137 (unsigned long long)part_stat_read(p, sectors[READ]), 138 (unsigned long long)part_stat_read(p, sectors[STAT_READ]),
138 jiffies_to_msecs(part_stat_read(p, ticks[READ])), 139 jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])),
139 part_stat_read(p, ios[WRITE]), 140 part_stat_read(p, ios[STAT_WRITE]),
140 part_stat_read(p, merges[WRITE]), 141 part_stat_read(p, merges[STAT_WRITE]),
141 (unsigned long long)part_stat_read(p, sectors[WRITE]), 142 (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
142 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), 143 jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])),
143 inflight[0], 144 inflight[0],
144 jiffies_to_msecs(part_stat_read(p, io_ticks)), 145 jiffies_to_msecs(part_stat_read(p, io_ticks)),
145 jiffies_to_msecs(part_stat_read(p, time_in_queue))); 146 jiffies_to_msecs(part_stat_read(p, time_in_queue)),
147 part_stat_read(p, ios[STAT_DISCARD]),
148 part_stat_read(p, merges[STAT_DISCARD]),
149 (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]),
150 jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD])));
146} 151}
147 152
148ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, 153ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index 007f95eea0e1..903f3ed175d0 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state)
178 u32 vgda_sector = 0; 178 u32 vgda_sector = 0;
179 u32 vgda_len = 0; 179 u32 vgda_len = 0;
180 int numlvs = 0; 180 int numlvs = 0;
181 struct pvd *pvd; 181 struct pvd *pvd = NULL;
182 struct lv_info { 182 struct lv_info {
183 unsigned short pps_per_lv; 183 unsigned short pps_per_lv;
184 unsigned short pps_found; 184 unsigned short pps_found;
@@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state)
232 if (lvip[i].pps_per_lv) 232 if (lvip[i].pps_per_lv)
233 foundlvs += 1; 233 foundlvs += 1;
234 } 234 }
235 /* pvd loops depend on n[].name and lvip[].pps_per_lv */
236 pvd = alloc_pvd(state, vgda_sector + 17);
235 } 237 }
236 put_dev_sector(sect); 238 put_dev_sector(sect);
237 } 239 }
238 pvd = alloc_pvd(state, vgda_sector + 17);
239 if (pvd) { 240 if (pvd) {
240 int numpps = be16_to_cpu(pvd->pp_count); 241 int numpps = be16_to_cpu(pvd->pp_count);
241 int psn_part1 = be32_to_cpu(pvd->psn_part1); 242 int psn_part1 = be32_to_cpu(pvd->psn_part1);
@@ -282,10 +283,14 @@ int aix_partition(struct parsed_partitions *state)
282 next_lp_ix += 1; 283 next_lp_ix += 1;
283 } 284 }
284 for (i = 0; i < state->limit; i += 1) 285 for (i = 0; i < state->limit; i += 1)
285 if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) 286 if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) {
287 char tmp[sizeof(n[i].name) + 1]; // null char
288
289 snprintf(tmp, sizeof(tmp), "%s", n[i].name);
286 pr_warn("partition %s (%u pp's found) is " 290 pr_warn("partition %s (%u pp's found) is "
287 "not contiguous\n", 291 "not contiguous\n",
288 n[i].name, lvip[i].pps_found); 292 tmp, lvip[i].pps_found);
293 }
289 kfree(pvd); 294 kfree(pvd);
290 } 295 }
291 kfree(n); 296 kfree(n);
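The temporary buffer is needed because n[i].name is a fixed-width on-disk field with no guaranteed NUL terminator, so handing it to %s directly can run past the array; copying it into a buffer one byte larger guarantees termination. The same idiom in isolation (sketch):

	char name[16];			/* fixed-width field, may lack a NUL */
	char tmp[sizeof(name) + 1];	/* +1 for the terminator */

	memcpy(tmp, name, sizeof(name));
	tmp[sizeof(name)] = '\0';
	pr_warn("partition %s\n", tmp);
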
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index 0417937dfe99..16766f267559 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
830{ 830{
831 char buf[64]; 831 char buf[64];
832 int r_objid, r_name, r_id1, r_id2, len; 832 int r_objid, r_name, r_id1, r_id2, len;
833 struct vblk_dgrp *dgrp;
834 833
835 BUG_ON (!buffer || !vb); 834 BUG_ON (!buffer || !vb);
836 835
@@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
853 if (len != get_unaligned_be32(buffer + 0x14)) 852 if (len != get_unaligned_be32(buffer + 0x14))
854 return false; 853 return false;
855 854
856 dgrp = &vb->vblk.dgrp;
857
858 ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); 855 ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
859 return true; 856 return true;
860} 857}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index a98db384048f..62aed77d0bb9 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
184 .verify_fn = t10_pi_type3_verify_ip, 184 .verify_fn = t10_pi_type3_verify_ip,
185}; 185};
186EXPORT_SYMBOL(t10_pi_type3_ip); 186EXPORT_SYMBOL(t10_pi_type3_ip);
187
188/**
 189 * t10_pi_prepare - prepare PI prior to submitting the request to the device
190 * @rq: request with PI that should be prepared
191 * @protection_type: PI type (Type 1/Type 2/Type 3)
192 *
193 * For Type 1/Type 2, the virtual start sector is the one that was
194 * originally submitted by the block layer for the ref_tag usage. Due to
195 * partitioning, MD/DM cloning, etc. the actual physical start sector is
196 * likely to be different. Remap protection information to match the
197 * physical LBA.
198 *
199 * Type 3 does not have a reference tag so no remapping is required.
200 */
201void t10_pi_prepare(struct request *rq, u8 protection_type)
202{
203 const int tuple_sz = rq->q->integrity.tuple_size;
204 u32 ref_tag = t10_pi_ref_tag(rq);
205 struct bio *bio;
206
207 if (protection_type == T10_PI_TYPE3_PROTECTION)
208 return;
209
210 __rq_for_each_bio(bio, rq) {
211 struct bio_integrity_payload *bip = bio_integrity(bio);
212 u32 virt = bip_get_seed(bip) & 0xffffffff;
213 struct bio_vec iv;
214 struct bvec_iter iter;
215
216 /* Already remapped? */
217 if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
218 break;
219
220 bip_for_each_vec(iv, bip, iter) {
221 void *p, *pmap;
222 unsigned int j;
223
224 pmap = kmap_atomic(iv.bv_page);
225 p = pmap + iv.bv_offset;
226 for (j = 0; j < iv.bv_len; j += tuple_sz) {
227 struct t10_pi_tuple *pi = p;
228
229 if (be32_to_cpu(pi->ref_tag) == virt)
230 pi->ref_tag = cpu_to_be32(ref_tag);
231 virt++;
232 ref_tag++;
233 p += tuple_sz;
234 }
235
236 kunmap_atomic(pmap);
237 }
238
239 bip->bip_flags |= BIP_MAPPED_INTEGRITY;
240 }
241}
242EXPORT_SYMBOL(t10_pi_prepare);
243
244/**
 245 * t10_pi_complete - prepare PI prior to returning the request to the block layer
246 * @rq: request with PI that should be prepared
247 * @protection_type: PI type (Type 1/Type 2/Type 3)
 248 * @intervals: total protection intervals to remap
249 *
250 * For Type 1/Type 2, the virtual start sector is the one that was
251 * originally submitted by the block layer for the ref_tag usage. Due to
252 * partitioning, MD/DM cloning, etc. the actual physical start sector is
253 * likely to be different. Since the physical start sector was submitted
254 * to the device, we should remap it back to virtual values expected by the
255 * block layer.
256 *
257 * Type 3 does not have a reference tag so no remapping is required.
258 */
259void t10_pi_complete(struct request *rq, u8 protection_type,
260 unsigned int intervals)
261{
262 const int tuple_sz = rq->q->integrity.tuple_size;
263 u32 ref_tag = t10_pi_ref_tag(rq);
264 struct bio *bio;
265
266 if (protection_type == T10_PI_TYPE3_PROTECTION)
267 return;
268
269 __rq_for_each_bio(bio, rq) {
270 struct bio_integrity_payload *bip = bio_integrity(bio);
271 u32 virt = bip_get_seed(bip) & 0xffffffff;
272 struct bio_vec iv;
273 struct bvec_iter iter;
274
275 bip_for_each_vec(iv, bip, iter) {
276 void *p, *pmap;
277 unsigned int j;
278
279 pmap = kmap_atomic(iv.bv_page);
280 p = pmap + iv.bv_offset;
281 for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
282 struct t10_pi_tuple *pi = p;
283
284 if (be32_to_cpu(pi->ref_tag) == ref_tag)
285 pi->ref_tag = cpu_to_be32(virt);
286 virt++;
287 ref_tag++;
288 intervals--;
289 p += tuple_sz;
290 }
291
292 kunmap_atomic(pmap);
293 }
294 }
295}
296EXPORT_SYMBOL(t10_pi_complete);
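The two exported helpers above are meant to be called by disk drivers that carry T10 protection information: once before a request is issued, and once when it completes. A minimal kernel-context sketch of that call pattern follows; the function names, the good_bytes/interval math, and the protection-type plumbing are illustrative assumptions, and only t10_pi_prepare() and t10_pi_complete() come from this patch.

#include <linux/blkdev.h>
#include <linux/t10-pi.h>

/* Hypothetical driver hooks; only the two t10_pi_* calls are real API. */
static void demo_issue(struct request *rq, u8 pi_type)
{
	/* Remap ref tags from virtual LBAs to the physical start sector. */
	t10_pi_prepare(rq, pi_type);
	/* ... hand rq to the hardware ... */
}

static void demo_complete(struct request *rq, u8 pi_type,
			  unsigned int good_bytes, unsigned int interval_bytes)
{
	/* Remap back only the intervals that completed without error. */
	t10_pi_complete(rq, pi_type, good_bytes / interval_bytes);
}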
diff --git a/drivers/Makefile b/drivers/Makefile
index 24cd47014657..a6abd7a856c6 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -76,7 +76,7 @@ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
76obj-$(CONFIG_NUBUS) += nubus/ 76obj-$(CONFIG_NUBUS) += nubus/
77obj-y += macintosh/ 77obj-y += macintosh/
78obj-$(CONFIG_IDE) += ide/ 78obj-$(CONFIG_IDE) += ide/
79obj-$(CONFIG_SCSI) += scsi/ 79obj-y += scsi/
80obj-y += nvme/ 80obj-y += nvme/
81obj-$(CONFIG_ATA) += ata/ 81obj-$(CONFIG_ATA) += ata/
82obj-$(CONFIG_TARGET_CORE) += target/ 82obj-$(CONFIG_TARGET_CORE) += target/
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index aad1b01447de..8e270962b2f3 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -597,8 +597,9 @@ static int ata_get_identity(struct ata_port *ap, struct scsi_device *sdev,
597int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg) 597int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
598{ 598{
599 int rc = 0; 599 int rc = 0;
600 u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
600 u8 scsi_cmd[MAX_COMMAND_SIZE]; 601 u8 scsi_cmd[MAX_COMMAND_SIZE];
601 u8 args[4], *argbuf = NULL, *sensebuf = NULL; 602 u8 args[4], *argbuf = NULL;
602 int argsize = 0; 603 int argsize = 0;
603 enum dma_data_direction data_dir; 604 enum dma_data_direction data_dir;
604 struct scsi_sense_hdr sshdr; 605 struct scsi_sense_hdr sshdr;
@@ -610,10 +611,7 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
610 if (copy_from_user(args, arg, sizeof(args))) 611 if (copy_from_user(args, arg, sizeof(args)))
611 return -EFAULT; 612 return -EFAULT;
612 613
613 sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO); 614 memset(sensebuf, 0, sizeof(sensebuf));
614 if (!sensebuf)
615 return -ENOMEM;
616
617 memset(scsi_cmd, 0, sizeof(scsi_cmd)); 615 memset(scsi_cmd, 0, sizeof(scsi_cmd));
618 616
619 if (args[3]) { 617 if (args[3]) {
@@ -685,7 +683,6 @@ int ata_cmd_ioctl(struct scsi_device *scsidev, void __user *arg)
685 && copy_to_user(arg + sizeof(args), argbuf, argsize)) 683 && copy_to_user(arg + sizeof(args), argbuf, argsize))
686 rc = -EFAULT; 684 rc = -EFAULT;
687error: 685error:
688 kfree(sensebuf);
689 kfree(argbuf); 686 kfree(argbuf);
690 return rc; 687 return rc;
691} 688}
@@ -704,8 +701,9 @@ error:
704int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg) 701int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
705{ 702{
706 int rc = 0; 703 int rc = 0;
704 u8 sensebuf[SCSI_SENSE_BUFFERSIZE];
707 u8 scsi_cmd[MAX_COMMAND_SIZE]; 705 u8 scsi_cmd[MAX_COMMAND_SIZE];
708 u8 args[7], *sensebuf = NULL; 706 u8 args[7];
709 struct scsi_sense_hdr sshdr; 707 struct scsi_sense_hdr sshdr;
710 int cmd_result; 708 int cmd_result;
711 709
@@ -715,10 +713,7 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
715 if (copy_from_user(args, arg, sizeof(args))) 713 if (copy_from_user(args, arg, sizeof(args)))
716 return -EFAULT; 714 return -EFAULT;
717 715
718 sensebuf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO); 716 memset(sensebuf, 0, sizeof(sensebuf));
719 if (!sensebuf)
720 return -ENOMEM;
721
722 memset(scsi_cmd, 0, sizeof(scsi_cmd)); 717 memset(scsi_cmd, 0, sizeof(scsi_cmd));
723 scsi_cmd[0] = ATA_16; 718 scsi_cmd[0] = ATA_16;
724 scsi_cmd[1] = (3 << 1); /* Non-data */ 719 scsi_cmd[1] = (3 << 1); /* Non-data */
@@ -769,7 +764,6 @@ int ata_task_ioctl(struct scsi_device *scsidev, void __user *arg)
769 } 764 }
770 765
771 error: 766 error:
772 kfree(sensebuf);
773 return rc; 767 return rc;
774} 768}
775 769
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index f6518067aa7d..f99e5c883368 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -21,6 +21,7 @@
21#define DAC960_DriverDate "21 Aug 2007" 21#define DAC960_DriverDate "21 Aug 2007"
22 22
23 23
24#include <linux/compiler.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/types.h> 26#include <linux/types.h>
26#include <linux/miscdevice.h> 27#include <linux/miscdevice.h>
@@ -6426,7 +6427,7 @@ static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller,
6426 return true; 6427 return true;
6427} 6428}
6428 6429
6429static int dac960_proc_show(struct seq_file *m, void *v) 6430static int __maybe_unused dac960_proc_show(struct seq_file *m, void *v)
6430{ 6431{
6431 unsigned char *StatusMessage = "OK\n"; 6432 unsigned char *StatusMessage = "OK\n";
6432 int ControllerNumber; 6433 int ControllerNumber;
@@ -6446,14 +6447,16 @@ static int dac960_proc_show(struct seq_file *m, void *v)
6446 return 0; 6447 return 0;
6447} 6448}
6448 6449
6449static int dac960_initial_status_proc_show(struct seq_file *m, void *v) 6450static int __maybe_unused dac960_initial_status_proc_show(struct seq_file *m,
6451 void *v)
6450{ 6452{
6451 DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private; 6453 DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
6452 seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer); 6454 seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer);
6453 return 0; 6455 return 0;
6454} 6456}
6455 6457
6456static int dac960_current_status_proc_show(struct seq_file *m, void *v) 6458static int __maybe_unused dac960_current_status_proc_show(struct seq_file *m,
6459 void *v)
6457{ 6460{
6458 DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private; 6461 DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private;
6459 unsigned char *StatusMessage = 6462 unsigned char *StatusMessage =
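The __maybe_unused annotations above silence the compiler's unused-function warning when every caller is compiled out, which for these proc_show helpers happens when CONFIG_PROC_FS is disabled. A standalone userspace sketch of the same mechanism, with the attribute spelled out by hand since the kernel macro is not available here:

#include <stdio.h>

#define demo_maybe_unused __attribute__((__unused__))

/* Compiles warning-free under -Wall even though the #ifdef below is false. */
static int demo_maybe_unused debug_dump(void)
{
	return puts("dump");
}

int main(void)
{
#ifdef DEMO_PROC_FS	/* illustrative guard, not defined in this sketch */
	debug_dump();
#endif
	return 0;
}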
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ad9b687a236a..d4913516823f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -74,12 +74,12 @@ config AMIGA_Z2RAM
74 74
75config CDROM 75config CDROM
76 tristate 76 tristate
77 select BLK_SCSI_REQUEST
77 78
78config GDROM 79config GDROM
79 tristate "SEGA Dreamcast GD-ROM drive" 80 tristate "SEGA Dreamcast GD-ROM drive"
80 depends on SH_DREAMCAST 81 depends on SH_DREAMCAST
81 select CDROM 82 select CDROM
82 select BLK_SCSI_REQUEST # only for the generic cdrom code
83 help 83 help
84 A standard SEGA Dreamcast comes with a modified CD ROM drive called a 84 A standard SEGA Dreamcast comes with a modified CD ROM drive called a
85 "GD-ROM" by SEGA to signify it is capable of reading special disks 85 "GD-ROM" by SEGA to signify it is capable of reading special disks
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index dc061158b403..8566b188368b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
36obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ 36obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
37 37
38obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ 38obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
39obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
40obj-$(CONFIG_ZRAM) += zram/ 39obj-$(CONFIG_ZRAM) += zram/
41 40
41obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
42null_blk-objs := null_blk_main.o
43null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o
44
42skd-y := skd_main.o 45skd-y := skd_main.o
43swim_mod-y := swim.o swim_asm.o 46swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 096882e54095..136dc507d020 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1137,6 +1137,7 @@ noskb: if (buf)
1137 break; 1137 break;
1138 } 1138 }
1139 bvcpy(skb, f->buf->bio, f->iter, n); 1139 bvcpy(skb, f->buf->bio, f->iter, n);
1140 /* fall through */
1140 case ATA_CMD_PIO_WRITE: 1141 case ATA_CMD_PIO_WRITE:
1141 case ATA_CMD_PIO_WRITE_EXT: 1142 case ATA_CMD_PIO_WRITE_EXT:
1142 spin_lock_irq(&d->lock); 1143 spin_lock_irq(&d->lock);
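This series adds explicit /* fall through */ comments ahead of case labels that are deliberately reached from the code above them (the same markers appear below in the drbd, loop, and paride hunks). gcc's -Wimplicit-fallthrough recognizes comments of this shape and stays quiet, while still flagging unannotated fall-throughs. A small self-contained illustration:

#include <stdio.h>

static const char *classify(int n)
{
	switch (n) {
	case 0:
		printf("zero, and also ");
		/* fall through */
	case 1:
		return "small";
	default:
		return "large";
	}
}

int main(void)
{
	/* classify(0) prints the prefix above, then returns "small". */
	printf("%s / %s\n", classify(0), classify(7));
	return 0;
}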
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 697f735b07a4..41060e9cedf2 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -284,8 +284,8 @@ freedev(struct aoedev *d)
284 e = t + d->ntargets; 284 e = t + d->ntargets;
285 for (; t < e && *t; t++) 285 for (; t < e && *t; t++)
286 freetgt(d, *t); 286 freetgt(d, *t);
287 if (d->bufpool) 287
288 mempool_destroy(d->bufpool); 288 mempool_destroy(d->bufpool);
289 skbpoolfree(d); 289 skbpoolfree(d);
290 minor_free(d->sysminor); 290 minor_free(d->sysminor);
291 291
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bb976598ee43..df8103dd40ac 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -254,20 +254,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
254 * Process a single bvec of a bio. 254 * Process a single bvec of a bio.
255 */ 255 */
256static int brd_do_bvec(struct brd_device *brd, struct page *page, 256static int brd_do_bvec(struct brd_device *brd, struct page *page,
257 unsigned int len, unsigned int off, bool is_write, 257 unsigned int len, unsigned int off, unsigned int op,
258 sector_t sector) 258 sector_t sector)
259{ 259{
260 void *mem; 260 void *mem;
261 int err = 0; 261 int err = 0;
262 262
263 if (is_write) { 263 if (op_is_write(op)) {
264 err = copy_to_brd_setup(brd, sector, len); 264 err = copy_to_brd_setup(brd, sector, len);
265 if (err) 265 if (err)
266 goto out; 266 goto out;
267 } 267 }
268 268
269 mem = kmap_atomic(page); 269 mem = kmap_atomic(page);
270 if (!is_write) { 270 if (!op_is_write(op)) {
271 copy_from_brd(mem + off, brd, sector, len); 271 copy_from_brd(mem + off, brd, sector, len);
272 flush_dcache_page(page); 272 flush_dcache_page(page);
273 } else { 273 } else {
@@ -296,7 +296,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
296 int err; 296 int err;
297 297
298 err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, 298 err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
299 op_is_write(bio_op(bio)), sector); 299 bio_op(bio), sector);
300 if (err) 300 if (err)
301 goto io_error; 301 goto io_error;
302 sector += len >> SECTOR_SHIFT; 302 sector += len >> SECTOR_SHIFT;
@@ -310,15 +310,15 @@ io_error:
310} 310}
311 311
312static int brd_rw_page(struct block_device *bdev, sector_t sector, 312static int brd_rw_page(struct block_device *bdev, sector_t sector,
313 struct page *page, bool is_write) 313 struct page *page, unsigned int op)
314{ 314{
315 struct brd_device *brd = bdev->bd_disk->private_data; 315 struct brd_device *brd = bdev->bd_disk->private_data;
316 int err; 316 int err;
317 317
318 if (PageTransHuge(page)) 318 if (PageTransHuge(page))
319 return -ENOTSUPP; 319 return -ENOTSUPP;
320 err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector); 320 err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
321 page_endio(page, is_write, err); 321 page_endio(page, op_is_write(op), err);
322 return err; 322 return err;
323} 323}
324 324
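brd now receives the full request operation instead of a precomputed bool, and tests the direction with op_is_write() at the point of use. In the kernel's REQ_OP_* encoding, write-type operations have the low bit set, so the test is a single AND. A userspace sketch that mirrors (but does not reuse) that encoding:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative mirror of the kernel encoding: odd opcodes are writes. */
enum demo_req_op { DEMO_OP_READ = 0, DEMO_OP_WRITE = 1, DEMO_OP_DISCARD = 3 };

static bool demo_op_is_write(unsigned int op)
{
	return op & 1;
}

int main(void)
{
	printf("read=%d write=%d discard=%d\n",
	       demo_op_is_write(DEMO_OP_READ),
	       demo_op_is_write(DEMO_OP_WRITE),
	       demo_op_is_write(DEMO_OP_DISCARD));	/* 0 1 1 */
	return 0;
}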
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index bc4ed2ed40a2..e35a234b0a8f 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -55,12 +55,10 @@
55# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) 55# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
56# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) 56# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
57# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) 57# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
58# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
59#else 58#else
60# define __protected_by(x) 59# define __protected_by(x)
61# define __protected_read_by(x) 60# define __protected_read_by(x)
62# define __protected_write_by(x) 61# define __protected_write_by(x)
63# define __must_hold(x)
64#endif 62#endif
65 63
66/* shared module parameters, defined in drbd_main.c */ 64/* shared module parameters, defined in drbd_main.c */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a80809bd3057..ef8212a4b73e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2103,14 +2103,10 @@ static void drbd_destroy_mempools(void)
2103 mempool_exit(&drbd_md_io_page_pool); 2103 mempool_exit(&drbd_md_io_page_pool);
2104 mempool_exit(&drbd_ee_mempool); 2104 mempool_exit(&drbd_ee_mempool);
2105 mempool_exit(&drbd_request_mempool); 2105 mempool_exit(&drbd_request_mempool);
2106 if (drbd_ee_cache) 2106 kmem_cache_destroy(drbd_ee_cache);
2107 kmem_cache_destroy(drbd_ee_cache); 2107 kmem_cache_destroy(drbd_request_cache);
2108 if (drbd_request_cache) 2108 kmem_cache_destroy(drbd_bm_ext_cache);
2109 kmem_cache_destroy(drbd_request_cache); 2109 kmem_cache_destroy(drbd_al_ext_cache);
2110 if (drbd_bm_ext_cache)
2111 kmem_cache_destroy(drbd_bm_ext_cache);
2112 if (drbd_al_ext_cache)
2113 kmem_cache_destroy(drbd_al_ext_cache);
2114 2110
2115 drbd_ee_cache = NULL; 2111 drbd_ee_cache = NULL;
2116 drbd_request_cache = NULL; 2112 drbd_request_cache = NULL;
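The hunk above works because kmem_cache_destroy() accepts a NULL pointer as a no-op, so the if guards were dead weight; the aoedev hunk above and the mtip32xx hunk below make the same simplification for mempool_destroy() and debugfs_remove_recursive(). The general pattern, shown with userspace free(), which carries the same NULL guarantee:

#include <stdio.h>
#include <stdlib.h>

struct demo_ctx {
	int *a;
	int *b;
};

static void demo_ctx_destroy(struct demo_ctx *c)
{
	/* free(NULL) is defined as a no-op, so no "if (c->a)" guards. */
	free(c->a);
	free(c->b);
}

int main(void)
{
	struct demo_ctx c = { .a = malloc(16), .b = NULL };	/* b never set up */

	demo_ctx_destroy(&c);	/* safe whether or not each member was allocated */
	puts("done");
	return 0;
}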
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index be9450f5ad1c..75f6b47169e6 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2674,8 +2674,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2674 if (c_min_rate == 0) 2674 if (c_min_rate == 0)
2675 return false; 2675 return false;
2676 2676
2677 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 2677 curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
2678 (int)part_stat_read(&disk->part0, sectors[1]) -
2679 atomic_read(&device->rs_sect_ev); 2678 atomic_read(&device->rs_sect_ev);
2680 2679
2681 if (atomic_read(&device->ap_actlog_cnt) 2680 if (atomic_read(&device->ap_actlog_cnt)
@@ -2790,6 +2789,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2790 then we would do something smarter here than reading 2789 then we would do something smarter here than reading
2791 the block... */ 2790 the block... */
2792 peer_req->flags |= EE_RS_THIN_REQ; 2791 peer_req->flags |= EE_RS_THIN_REQ;
2792 /* fall through */
2793 case P_RS_DATA_REQUEST: 2793 case P_RS_DATA_REQUEST:
2794 peer_req->w.cb = w_e_end_rsdata_req; 2794 peer_req->w.cb = w_e_end_rsdata_req;
2795 fault_type = DRBD_FAULT_RS_RD; 2795 fault_type = DRBD_FAULT_RS_RD;
@@ -2968,6 +2968,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
2968 /* Else fall through to one of the other strategies... */ 2968 /* Else fall through to one of the other strategies... */
2969 drbd_warn(device, "Discard younger/older primary did not find a decision\n" 2969 drbd_warn(device, "Discard younger/older primary did not find a decision\n"
2970 "Using discard-least-changes instead\n"); 2970 "Using discard-least-changes instead\n");
2971 /* fall through */
2971 case ASB_DISCARD_ZERO_CHG: 2972 case ASB_DISCARD_ZERO_CHG:
2972 if (ch_peer == 0 && ch_self == 0) { 2973 if (ch_peer == 0 && ch_self == 0) {
2973 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) 2974 rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
@@ -2979,6 +2980,7 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
2979 } 2980 }
2980 if (after_sb_0p == ASB_DISCARD_ZERO_CHG) 2981 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
2981 break; 2982 break;
2983 /* else: fall through */
2982 case ASB_DISCARD_LEAST_CHG: 2984 case ASB_DISCARD_LEAST_CHG:
2983 if (ch_self < ch_peer) 2985 if (ch_self < ch_peer)
2984 rv = -1; 2986 rv = -1;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index d146fedc38bb..19cac36e9737 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -38,7 +38,7 @@ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request
38{ 38{
39 struct request_queue *q = device->rq_queue; 39 struct request_queue *q = device->rq_queue;
40 40
41 generic_start_io_acct(q, bio_data_dir(req->master_bio), 41 generic_start_io_acct(q, bio_op(req->master_bio),
42 req->i.size >> 9, &device->vdisk->part0); 42 req->i.size >> 9, &device->vdisk->part0);
43} 43}
44 44
@@ -47,7 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
47{ 47{
48 struct request_queue *q = device->rq_queue; 48 struct request_queue *q = device->rq_queue;
49 49
50 generic_end_io_acct(q, bio_data_dir(req->master_bio), 50 generic_end_io_acct(q, bio_op(req->master_bio),
51 &device->vdisk->part0, req->start_jif); 51 &device->vdisk->part0, req->start_jif);
52} 52}
53 53
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 5e793dd7adfb..b8f77e83d456 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1690,9 +1690,7 @@ void drbd_rs_controller_reset(struct drbd_device *device)
1690 atomic_set(&device->rs_sect_in, 0); 1690 atomic_set(&device->rs_sect_in, 0);
1691 atomic_set(&device->rs_sect_ev, 0); 1691 atomic_set(&device->rs_sect_ev, 0);
1692 device->rs_in_flight = 0; 1692 device->rs_in_flight = 0;
1693 device->rs_last_events = 1693 device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors);
1694 (int)part_stat_read(&disk->part0, sectors[0]) +
1695 (int)part_stat_read(&disk->part0, sectors[1]);
1696 1694
1697 /* Updating the RCU protected object in place is necessary since 1695 /* Updating the RCU protected object in place is necessary since
1698 this function gets called from atomic context. 1696 this function gets called from atomic context.
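Both drbd hunks above replace an explicit sectors[0] + sectors[1] (read plus write) sum with part_stat_read_accum(), which folds one statistics field across every I/O direction; with this series adding discard accounting, open-coded two-term sums would silently go stale. A userspace analogue of the accumulator, with all names hypothetical:

#include <stdio.h>

enum { DEMO_READ, DEMO_WRITE, DEMO_DISCARD, DEMO_NR_GROUPS };

struct demo_stats {
	unsigned long sectors[DEMO_NR_GROUPS];
};

/* Analogue of part_stat_read_accum(): sum one field over all groups. */
static unsigned long demo_stat_read_accum(const struct demo_stats *s)
{
	unsigned long sum = 0;
	int g;

	for (g = 0; g < DEMO_NR_GROUPS; g++)
		sum += s->sectors[g];
	return sum;
}

int main(void)
{
	struct demo_stats s = { .sectors = { 100, 200, 4 } };

	printf("%lu\n", demo_stat_read_accum(&s));	/* 304 */
	return 0;
}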
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8871b5044d9e..48f622728ce6 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -1461,7 +1461,6 @@ static void setup_rw_floppy(void)
1461 int i; 1461 int i;
1462 int r; 1462 int r;
1463 int flags; 1463 int flags;
1464 int dflags;
1465 unsigned long ready_date; 1464 unsigned long ready_date;
1466 void (*function)(void); 1465 void (*function)(void);
1467 1466
@@ -1485,8 +1484,6 @@ static void setup_rw_floppy(void)
1485 if (fd_wait_for_completion(ready_date, function)) 1484 if (fd_wait_for_completion(ready_date, function))
1486 return; 1485 return;
1487 } 1486 }
1488 dflags = DRS->flags;
1489
1490 if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE)) 1487 if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE))
1491 setup_DMA(); 1488 setup_DMA();
1492 1489
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4cb1d1be3cfb..ea9debf59b22 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -690,7 +690,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
690 unsigned int arg) 690 unsigned int arg)
691{ 691{
692 struct file *file, *old_file; 692 struct file *file, *old_file;
693 struct inode *inode;
694 int error; 693 int error;
695 694
696 error = -ENXIO; 695 error = -ENXIO;
@@ -711,7 +710,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
711 if (error) 710 if (error)
712 goto out_putf; 711 goto out_putf;
713 712
714 inode = file->f_mapping->host;
715 old_file = lo->lo_backing_file; 713 old_file = lo->lo_backing_file;
716 714
717 error = -EINVAL; 715 error = -EINVAL;
@@ -1611,6 +1609,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
1611 case LOOP_GET_STATUS64: 1609 case LOOP_GET_STATUS64:
1612 case LOOP_SET_STATUS64: 1610 case LOOP_SET_STATUS64:
1613 arg = (unsigned long) compat_ptr(arg); 1611 arg = (unsigned long) compat_ptr(arg);
1612 /* fall through */
1614 case LOOP_SET_FD: 1613 case LOOP_SET_FD:
1615 case LOOP_CHANGE_FD: 1614 case LOOP_CHANGE_FD:
1616 case LOOP_SET_BLOCK_SIZE: 1615 case LOOP_SET_BLOCK_SIZE:
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index c73626decb46..db253cd5b32a 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2575,8 +2575,7 @@ static int mtip_hw_debugfs_init(struct driver_data *dd)
2575 2575
2576static void mtip_hw_debugfs_exit(struct driver_data *dd) 2576static void mtip_hw_debugfs_exit(struct driver_data *dd)
2577{ 2577{
2578 if (dd->dfs_node) 2578 debugfs_remove_recursive(dd->dfs_node);
2579 debugfs_remove_recursive(dd->dfs_node);
2580} 2579}
2581 2580
2582/* 2581/*
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
new file mode 100644
index 000000000000..d81781f22dba
--- /dev/null
+++ b/drivers/block/null_blk.h
@@ -0,0 +1,108 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef __BLK_NULL_BLK_H
3#define __BLK_NULL_BLK_H
4
5#include <linux/blkdev.h>
6#include <linux/slab.h>
7#include <linux/blk-mq.h>
8#include <linux/hrtimer.h>
9#include <linux/configfs.h>
10#include <linux/badblocks.h>
11#include <linux/fault-inject.h>
12
13struct nullb_cmd {
14 struct list_head list;
15 struct llist_node ll_list;
16 struct __call_single_data csd;
17 struct request *rq;
18 struct bio *bio;
19 unsigned int tag;
20 blk_status_t error;
21 struct nullb_queue *nq;
22 struct hrtimer timer;
23};
24
25struct nullb_queue {
26 unsigned long *tag_map;
27 wait_queue_head_t wait;
28 unsigned int queue_depth;
29 struct nullb_device *dev;
30 unsigned int requeue_selection;
31
32 struct nullb_cmd *cmds;
33};
34
35struct nullb_device {
36 struct nullb *nullb;
37 struct config_item item;
38 struct radix_tree_root data; /* data stored in the disk */
39 struct radix_tree_root cache; /* disk cache data */
40 unsigned long flags; /* device flags */
41 unsigned int curr_cache;
42 struct badblocks badblocks;
43
44 unsigned int nr_zones;
45 struct blk_zone *zones;
46 sector_t zone_size_sects;
47
48 unsigned long size; /* device size in MB */
49 unsigned long completion_nsec; /* time in ns to complete a request */
50 unsigned long cache_size; /* disk cache size in MB */
51 unsigned long zone_size; /* zone size in MB if device is zoned */
52 unsigned int submit_queues; /* number of submission queues */
53 unsigned int home_node; /* home node for the device */
54 unsigned int queue_mode; /* block interface */
55 unsigned int blocksize; /* block size */
56 unsigned int irqmode; /* IRQ completion handler */
57 unsigned int hw_queue_depth; /* queue depth */
58 unsigned int index; /* index of the disk, only valid with a disk */
59 unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
60 bool blocking; /* blocking blk-mq device */
61 bool use_per_node_hctx; /* use per-node allocation for hardware context */
62 bool power; /* power on/off the device */
63 bool memory_backed; /* if data is stored in memory */
64	bool discard; /* if discard is supported */
65 bool zoned; /* if device is zoned */
66};
67
68struct nullb {
69 struct nullb_device *dev;
70 struct list_head list;
71 unsigned int index;
72 struct request_queue *q;
73 struct gendisk *disk;
74 struct blk_mq_tag_set *tag_set;
75 struct blk_mq_tag_set __tag_set;
76 unsigned int queue_depth;
77 atomic_long_t cur_bytes;
78 struct hrtimer bw_timer;
79 unsigned long cache_flush_pos;
80 spinlock_t lock;
81
82 struct nullb_queue *queues;
83 unsigned int nr_queues;
84 char disk_name[DISK_NAME_LEN];
85};
86
87#ifdef CONFIG_BLK_DEV_ZONED
88int null_zone_init(struct nullb_device *dev);
89void null_zone_exit(struct nullb_device *dev);
90blk_status_t null_zone_report(struct nullb *nullb,
91 struct nullb_cmd *cmd);
92void null_zone_write(struct nullb_cmd *cmd);
93void null_zone_reset(struct nullb_cmd *cmd);
94#else
95static inline int null_zone_init(struct nullb_device *dev)
96{
97 return -EINVAL;
98}
99static inline void null_zone_exit(struct nullb_device *dev) {}
100static inline blk_status_t null_zone_report(struct nullb *nullb,
101 struct nullb_cmd *cmd)
102{
103 return BLK_STS_NOTSUPP;
104}
105static inline void null_zone_write(struct nullb_cmd *cmd) {}
106static inline void null_zone_reset(struct nullb_cmd *cmd) {}
107#endif /* CONFIG_BLK_DEV_ZONED */
108#endif /* __NULL_BLK_H */
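The #ifdef block at the end of the new header is the usual stub pattern: when CONFIG_BLK_DEV_ZONED is off, callers still compile against inline no-op (or error-returning) versions, and no call site needs its own #ifdef. A generic self-contained sketch of the idiom, all names invented:

#include <stdio.h>
#include <errno.h>

struct demo_dev { int unused; };

#ifdef DEMO_FEATURE_ZONED
int demo_zone_init(struct demo_dev *d);	/* real version would live elsewhere */
#else
/* Stub: keeps callers compiling, fails cleanly if ever invoked. */
static inline int demo_zone_init(struct demo_dev *d)
{
	(void)d;
	return -EINVAL;
}
#endif

int main(void)
{
	struct demo_dev d;

	printf("init: %d\n", demo_zone_init(&d));	/* -22 without the feature */
	return 0;
}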
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk_main.c
index 042c778e5a4e..6127e3ff7b4b 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk_main.c
@@ -7,14 +7,8 @@
7#include <linux/moduleparam.h> 7#include <linux/moduleparam.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/blkdev.h>
11#include <linux/init.h> 10#include <linux/init.h>
12#include <linux/slab.h> 11#include "null_blk.h"
13#include <linux/blk-mq.h>
14#include <linux/hrtimer.h>
15#include <linux/configfs.h>
16#include <linux/badblocks.h>
17#include <linux/fault-inject.h>
18 12
19#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) 13#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
20#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) 14#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
@@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps)
35 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); 29 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
36} 30}
37 31
38struct nullb_cmd {
39 struct list_head list;
40 struct llist_node ll_list;
41 struct __call_single_data csd;
42 struct request *rq;
43 struct bio *bio;
44 unsigned int tag;
45 blk_status_t error;
46 struct nullb_queue *nq;
47 struct hrtimer timer;
48};
49
50struct nullb_queue {
51 unsigned long *tag_map;
52 wait_queue_head_t wait;
53 unsigned int queue_depth;
54 struct nullb_device *dev;
55 unsigned int requeue_selection;
56
57 struct nullb_cmd *cmds;
58};
59
60/* 32/*
61 * Status flags for nullb_device. 33 * Status flags for nullb_device.
62 * 34 *
@@ -92,52 +64,6 @@ struct nullb_page {
92#define NULLB_PAGE_LOCK (MAP_SZ - 1) 64#define NULLB_PAGE_LOCK (MAP_SZ - 1)
93#define NULLB_PAGE_FREE (MAP_SZ - 2) 65#define NULLB_PAGE_FREE (MAP_SZ - 2)
94 66
95struct nullb_device {
96 struct nullb *nullb;
97 struct config_item item;
98 struct radix_tree_root data; /* data stored in the disk */
99 struct radix_tree_root cache; /* disk cache data */
100 unsigned long flags; /* device flags */
101 unsigned int curr_cache;
102 struct badblocks badblocks;
103
104 unsigned long size; /* device size in MB */
105 unsigned long completion_nsec; /* time in ns to complete a request */
106 unsigned long cache_size; /* disk cache size in MB */
107 unsigned int submit_queues; /* number of submission queues */
108 unsigned int home_node; /* home node for the device */
109 unsigned int queue_mode; /* block interface */
110 unsigned int blocksize; /* block size */
111 unsigned int irqmode; /* IRQ completion handler */
112 unsigned int hw_queue_depth; /* queue depth */
113 unsigned int index; /* index of the disk, only valid with a disk */
114 unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
115 bool blocking; /* blocking blk-mq device */
116 bool use_per_node_hctx; /* use per-node allocation for hardware context */
117 bool power; /* power on/off the device */
118 bool memory_backed; /* if data is stored in memory */
119 bool discard; /* if support discard */
120};
121
122struct nullb {
123 struct nullb_device *dev;
124 struct list_head list;
125 unsigned int index;
126 struct request_queue *q;
127 struct gendisk *disk;
128 struct blk_mq_tag_set *tag_set;
129 struct blk_mq_tag_set __tag_set;
130 unsigned int queue_depth;
131 atomic_long_t cur_bytes;
132 struct hrtimer bw_timer;
133 unsigned long cache_flush_pos;
134 spinlock_t lock;
135
136 struct nullb_queue *queues;
137 unsigned int nr_queues;
138 char disk_name[DISK_NAME_LEN];
139};
140
141static LIST_HEAD(nullb_list); 67static LIST_HEAD(nullb_list);
142static struct mutex lock; 68static struct mutex lock;
143static int null_major; 69static int null_major;
@@ -254,6 +180,14 @@ static bool g_use_per_node_hctx;
254module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); 180module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
255MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); 181MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
256 182
183static bool g_zoned;
184module_param_named(zoned, g_zoned, bool, S_IRUGO);
185MODULE_PARM_DESC(zoned, "Make the device a host-managed zoned block device. Default: false");
186
187static unsigned long g_zone_size = 256;
188module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
189MODULE_PARM_DESC(zone_size, "Zone size in MB when the block device is zoned. Must be a power of two. Default: 256");
190
257static struct nullb_device *null_alloc_dev(void); 191static struct nullb_device *null_alloc_dev(void);
258static void null_free_dev(struct nullb_device *dev); 192static void null_free_dev(struct nullb_device *dev);
259static void null_del_dev(struct nullb *nullb); 193static void null_del_dev(struct nullb *nullb);
@@ -357,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool);
357NULLB_DEVICE_ATTR(discard, bool); 291NULLB_DEVICE_ATTR(discard, bool);
358NULLB_DEVICE_ATTR(mbps, uint); 292NULLB_DEVICE_ATTR(mbps, uint);
359NULLB_DEVICE_ATTR(cache_size, ulong); 293NULLB_DEVICE_ATTR(cache_size, ulong);
294NULLB_DEVICE_ATTR(zoned, bool);
295NULLB_DEVICE_ATTR(zone_size, ulong);
360 296
361static ssize_t nullb_device_power_show(struct config_item *item, char *page) 297static ssize_t nullb_device_power_show(struct config_item *item, char *page)
362{ 298{
@@ -390,6 +326,7 @@ static ssize_t nullb_device_power_store(struct config_item *item,
390 null_del_dev(dev->nullb); 326 null_del_dev(dev->nullb);
391 mutex_unlock(&lock); 327 mutex_unlock(&lock);
392 clear_bit(NULLB_DEV_FL_UP, &dev->flags); 328 clear_bit(NULLB_DEV_FL_UP, &dev->flags);
329 clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
393 } 330 }
394 331
395 return count; 332 return count;
@@ -468,6 +405,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
468 &nullb_device_attr_mbps, 405 &nullb_device_attr_mbps,
469 &nullb_device_attr_cache_size, 406 &nullb_device_attr_cache_size,
470 &nullb_device_attr_badblocks, 407 &nullb_device_attr_badblocks,
408 &nullb_device_attr_zoned,
409 &nullb_device_attr_zone_size,
471 NULL, 410 NULL,
472}; 411};
473 412
@@ -520,7 +459,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
520 459
521static ssize_t memb_group_features_show(struct config_item *item, char *page) 460static ssize_t memb_group_features_show(struct config_item *item, char *page)
522{ 461{
523 return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n"); 462 return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n");
524} 463}
525 464
526CONFIGFS_ATTR_RO(memb_group_, features); 465CONFIGFS_ATTR_RO(memb_group_, features);
@@ -579,6 +518,8 @@ static struct nullb_device *null_alloc_dev(void)
579 dev->hw_queue_depth = g_hw_queue_depth; 518 dev->hw_queue_depth = g_hw_queue_depth;
580 dev->blocking = g_blocking; 519 dev->blocking = g_blocking;
581 dev->use_per_node_hctx = g_use_per_node_hctx; 520 dev->use_per_node_hctx = g_use_per_node_hctx;
521 dev->zoned = g_zoned;
522 dev->zone_size = g_zone_size;
582 return dev; 523 return dev;
583} 524}
584 525
@@ -587,6 +528,7 @@ static void null_free_dev(struct nullb_device *dev)
587 if (!dev) 528 if (!dev)
588 return; 529 return;
589 530
531 null_zone_exit(dev);
590 badblocks_exit(&dev->badblocks); 532 badblocks_exit(&dev->badblocks);
591 kfree(dev); 533 kfree(dev);
592} 534}
@@ -862,7 +804,9 @@ static struct nullb_page *null_lookup_page(struct nullb *nullb,
862} 804}
863 805
864static struct nullb_page *null_insert_page(struct nullb *nullb, 806static struct nullb_page *null_insert_page(struct nullb *nullb,
865 sector_t sector, bool ignore_cache) 807 sector_t sector, bool ignore_cache)
808 __releases(&nullb->lock)
809 __acquires(&nullb->lock)
866{ 810{
867 u64 idx; 811 u64 idx;
868 struct nullb_page *t_page; 812 struct nullb_page *t_page;
@@ -1219,6 +1163,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
1219 struct nullb *nullb = dev->nullb; 1163 struct nullb *nullb = dev->nullb;
1220 int err = 0; 1164 int err = 0;
1221 1165
1166 if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) {
1167 cmd->error = null_zone_report(nullb, cmd);
1168 goto out;
1169 }
1170
1222 if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { 1171 if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
1223 struct request *rq = cmd->rq; 1172 struct request *rq = cmd->rq;
1224 1173
@@ -1283,6 +1232,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
1283 } 1232 }
1284 } 1233 }
1285 cmd->error = errno_to_blk_status(err); 1234 cmd->error = errno_to_blk_status(err);
1235
1236 if (!cmd->error && dev->zoned) {
1237 if (req_op(cmd->rq) == REQ_OP_WRITE)
1238 null_zone_write(cmd);
1239 else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET)
1240 null_zone_reset(cmd);
1241 }
1286out: 1242out:
1287 /* Complete IO by inline, softirq or timer */ 1243 /* Complete IO by inline, softirq or timer */
1288 switch (dev->irqmode) { 1244 switch (dev->irqmode) {
@@ -1810,6 +1766,15 @@ static int null_add_dev(struct nullb_device *dev)
1810 blk_queue_flush_queueable(nullb->q, true); 1766 blk_queue_flush_queueable(nullb->q, true);
1811 } 1767 }
1812 1768
1769 if (dev->zoned) {
1770 rv = null_zone_init(dev);
1771 if (rv)
1772 goto out_cleanup_blk_queue;
1773
1774 blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
1775 nullb->q->limits.zoned = BLK_ZONED_HM;
1776 }
1777
1813 nullb->q->queuedata = nullb; 1778 nullb->q->queuedata = nullb;
1814 blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); 1779 blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
1815 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); 1780 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
@@ -1828,13 +1793,16 @@ static int null_add_dev(struct nullb_device *dev)
1828 1793
1829 rv = null_gendisk_register(nullb); 1794 rv = null_gendisk_register(nullb);
1830 if (rv) 1795 if (rv)
1831 goto out_cleanup_blk_queue; 1796 goto out_cleanup_zone;
1832 1797
1833 mutex_lock(&lock); 1798 mutex_lock(&lock);
1834 list_add_tail(&nullb->list, &nullb_list); 1799 list_add_tail(&nullb->list, &nullb_list);
1835 mutex_unlock(&lock); 1800 mutex_unlock(&lock);
1836 1801
1837 return 0; 1802 return 0;
1803out_cleanup_zone:
1804 if (dev->zoned)
1805 null_zone_exit(dev);
1838out_cleanup_blk_queue: 1806out_cleanup_blk_queue:
1839 blk_cleanup_queue(nullb->q); 1807 blk_cleanup_queue(nullb->q);
1840out_cleanup_tags: 1808out_cleanup_tags:
@@ -1861,6 +1829,11 @@ static int __init null_init(void)
1861 g_bs = PAGE_SIZE; 1829 g_bs = PAGE_SIZE;
1862 } 1830 }
1863 1831
1832 if (!is_power_of_2(g_zone_size)) {
1833 pr_err("null_blk: zone_size must be power-of-two\n");
1834 return -EINVAL;
1835 }
1836
1864 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { 1837 if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
1865 if (g_submit_queues != nr_online_nodes) { 1838 if (g_submit_queues != nr_online_nodes) {
1866 pr_warn("null_blk: submit_queues param is set to %u.\n", 1839 pr_warn("null_blk: submit_queues param is set to %u.\n",
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
new file mode 100644
index 000000000000..a979ca00d7be
--- /dev/null
+++ b/drivers/block/null_blk_zoned.c
@@ -0,0 +1,149 @@
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/vmalloc.h>
3#include "null_blk.h"
4
5/* zone_size in MBs to sectors. */
6#define ZONE_SIZE_SHIFT 11
7
8static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
9{
10 return sect >> ilog2(dev->zone_size_sects);
11}
12
13int null_zone_init(struct nullb_device *dev)
14{
15 sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
16 sector_t sector = 0;
17 unsigned int i;
18
19 if (!is_power_of_2(dev->zone_size)) {
20 pr_err("null_blk: zone_size must be power-of-two\n");
21 return -EINVAL;
22 }
23
24 dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
25 dev->nr_zones = dev_size >>
26 (SECTOR_SHIFT + ilog2(dev->zone_size_sects));
27 dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
28 GFP_KERNEL | __GFP_ZERO);
29 if (!dev->zones)
30 return -ENOMEM;
31
32 for (i = 0; i < dev->nr_zones; i++) {
33 struct blk_zone *zone = &dev->zones[i];
34
35 zone->start = zone->wp = sector;
36 zone->len = dev->zone_size_sects;
37 zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
38 zone->cond = BLK_ZONE_COND_EMPTY;
39
40 sector += dev->zone_size_sects;
41 }
42
43 return 0;
44}
45
46void null_zone_exit(struct nullb_device *dev)
47{
48 kvfree(dev->zones);
49}
50
51static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq,
52 unsigned int zno, unsigned int nr_zones)
53{
54 struct blk_zone_report_hdr *hdr = NULL;
55 struct bio_vec bvec;
56 struct bvec_iter iter;
57 void *addr;
58 unsigned int zones_to_cpy;
59
60 bio_for_each_segment(bvec, rq->bio, iter) {
61 addr = kmap_atomic(bvec.bv_page);
62
63 zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone);
64
65 if (!hdr) {
66 hdr = (struct blk_zone_report_hdr *)addr;
67 hdr->nr_zones = nr_zones;
68 zones_to_cpy--;
69 addr += sizeof(struct blk_zone_report_hdr);
70 }
71
72 zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones);
73
74 memcpy(addr, &dev->zones[zno],
75 zones_to_cpy * sizeof(struct blk_zone));
76
77 kunmap_atomic(addr);
78
79 nr_zones -= zones_to_cpy;
80 zno += zones_to_cpy;
81
82 if (!nr_zones)
83 break;
84 }
85}
86
87blk_status_t null_zone_report(struct nullb *nullb,
88 struct nullb_cmd *cmd)
89{
90 struct nullb_device *dev = nullb->dev;
91 struct request *rq = cmd->rq;
92 unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
93 unsigned int nr_zones = dev->nr_zones - zno;
94 unsigned int max_zones = (blk_rq_bytes(rq) /
95 sizeof(struct blk_zone)) - 1;
96
97 nr_zones = min_t(unsigned int, nr_zones, max_zones);
98
99 null_zone_fill_rq(nullb->dev, rq, zno, nr_zones);
100
101 return BLK_STS_OK;
102}
103
104void null_zone_write(struct nullb_cmd *cmd)
105{
106 struct nullb_device *dev = cmd->nq->dev;
107 struct request *rq = cmd->rq;
108 sector_t sector = blk_rq_pos(rq);
109 unsigned int rq_sectors = blk_rq_sectors(rq);
110 unsigned int zno = null_zone_no(dev, sector);
111 struct blk_zone *zone = &dev->zones[zno];
112
113 switch (zone->cond) {
114 case BLK_ZONE_COND_FULL:
115 /* Cannot write to a full zone */
116 cmd->error = BLK_STS_IOERR;
117 break;
118 case BLK_ZONE_COND_EMPTY:
119 case BLK_ZONE_COND_IMP_OPEN:
120 /* Writes must be at the write pointer position */
121 if (blk_rq_pos(rq) != zone->wp) {
122 cmd->error = BLK_STS_IOERR;
123 break;
124 }
125
126 if (zone->cond == BLK_ZONE_COND_EMPTY)
127 zone->cond = BLK_ZONE_COND_IMP_OPEN;
128
129 zone->wp += rq_sectors;
130 if (zone->wp == zone->start + zone->len)
131 zone->cond = BLK_ZONE_COND_FULL;
132 break;
133 default:
134 /* Invalid zone condition */
135 cmd->error = BLK_STS_IOERR;
136 break;
137 }
138}
139
140void null_zone_reset(struct nullb_cmd *cmd)
141{
142 struct nullb_device *dev = cmd->nq->dev;
143 struct request *rq = cmd->rq;
144 unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
145 struct blk_zone *zone = &dev->zones[zno];
146
147 zone->cond = BLK_ZONE_COND_EMPTY;
148 zone->wp = zone->start;
149}
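Because zone_size is validated as a power of two, null_zone_no() can locate a zone with a shift instead of a 64-bit division. The arithmetic, reduced to a runnable userspace example (values chosen to match the 256 MB default; __builtin_ctzl stands in for the kernel's ilog2()):

#include <stdio.h>

int main(void)
{
	unsigned long zone_size_mb = 256;			/* module default */
	unsigned long zone_size_sects = zone_size_mb << 11;	/* MB -> 512 B sectors */
	unsigned long sector = 1048576;				/* an arbitrary LBA */
	unsigned int shift = __builtin_ctzl(zone_size_sects);	/* 19 for 256 MB */

	printf("zone_size_sects=%lu, sector %lu lies in zone %lu\n",
	       zone_size_sects, sector, sector >> shift);	/* zone 2 */
	return 0;
}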
diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c
index 4f27e7392e38..f5f63ca2889d 100644
--- a/drivers/block/paride/bpck.c
+++ b/drivers/block/paride/bpck.c
@@ -347,7 +347,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
347 347
348static void bpck_read_eeprom ( PIA *pi, char * buf ) 348static void bpck_read_eeprom ( PIA *pi, char * buf )
349 349
350{ int i,j,k,n,p,v,f, om, od; 350{ int i, j, k, p, v, f, om, od;
351 351
352 bpck_force_spp(pi); 352 bpck_force_spp(pi);
353 353
@@ -356,7 +356,6 @@ static void bpck_read_eeprom ( PIA *pi, char * buf )
356 356
357 bpck_connect(pi); 357 bpck_connect(pi);
358 358
359 n = 0;
360 WR(4,0); 359 WR(4,0);
361 for (i=0;i<64;i++) { 360 for (i=0;i<64;i++) {
362 WR(6,8); 361 WR(6,8);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 8961b190e256..7cf947586fe4 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -426,6 +426,7 @@ static void run_fsm(void)
426 pd_claimed = 1; 426 pd_claimed = 1;
427 if (!pi_schedule_claimed(pi_current, run_fsm)) 427 if (!pi_schedule_claimed(pi_current, run_fsm))
428 return; 428 return;
429 /* fall through */
429 case 1: 430 case 1:
430 pd_claimed = 2; 431 pd_claimed = 2;
431 pi_current->proto->connect(pi_current); 432 pi_current->proto->connect(pi_current);
@@ -445,6 +446,7 @@ static void run_fsm(void)
445 spin_unlock_irqrestore(&pd_lock, saved_flags); 446 spin_unlock_irqrestore(&pd_lock, saved_flags);
446 if (stop) 447 if (stop)
447 return; 448 return;
449 /* fall through */
448 case Hold: 450 case Hold:
449 schedule_fsm(); 451 schedule_fsm();
450 return; 452 return;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index b3f83cd96f33..e285413d4a75 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -67,7 +67,7 @@
67#include <scsi/scsi.h> 67#include <scsi/scsi.h>
68#include <linux/debugfs.h> 68#include <linux/debugfs.h>
69#include <linux/device.h> 69#include <linux/device.h>
70 70#include <linux/nospec.h>
71#include <linux/uaccess.h> 71#include <linux/uaccess.h>
72 72
73#define DRIVER_NAME "pktcdvd" 73#define DRIVER_NAME "pktcdvd"
@@ -748,13 +748,13 @@ static const char *sense_key_string(__u8 index)
748static void pkt_dump_sense(struct pktcdvd_device *pd, 748static void pkt_dump_sense(struct pktcdvd_device *pd,
749 struct packet_command *cgc) 749 struct packet_command *cgc)
750{ 750{
751 struct request_sense *sense = cgc->sense; 751 struct scsi_sense_hdr *sshdr = cgc->sshdr;
752 752
753 if (sense) 753 if (sshdr)
754 pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", 754 pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
755 CDROM_PACKET_SIZE, cgc->cmd, 755 CDROM_PACKET_SIZE, cgc->cmd,
756 sense->sense_key, sense->asc, sense->ascq, 756 sshdr->sense_key, sshdr->asc, sshdr->ascq,
757 sense_key_string(sense->sense_key)); 757 sense_key_string(sshdr->sense_key));
758 else 758 else
759 pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); 759 pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
760} 760}
@@ -787,18 +787,19 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
787 unsigned write_speed, unsigned read_speed) 787 unsigned write_speed, unsigned read_speed)
788{ 788{
789 struct packet_command cgc; 789 struct packet_command cgc;
790 struct request_sense sense; 790 struct scsi_sense_hdr sshdr;
791 int ret; 791 int ret;
792 792
793 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); 793 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
794 cgc.sense = &sense; 794 cgc.sshdr = &sshdr;
795 cgc.cmd[0] = GPCMD_SET_SPEED; 795 cgc.cmd[0] = GPCMD_SET_SPEED;
796 cgc.cmd[2] = (read_speed >> 8) & 0xff; 796 cgc.cmd[2] = (read_speed >> 8) & 0xff;
797 cgc.cmd[3] = read_speed & 0xff; 797 cgc.cmd[3] = read_speed & 0xff;
798 cgc.cmd[4] = (write_speed >> 8) & 0xff; 798 cgc.cmd[4] = (write_speed >> 8) & 0xff;
799 cgc.cmd[5] = write_speed & 0xff; 799 cgc.cmd[5] = write_speed & 0xff;
800 800
801 if ((ret = pkt_generic_packet(pd, &cgc))) 801 ret = pkt_generic_packet(pd, &cgc);
802 if (ret)
802 pkt_dump_sense(pd, &cgc); 803 pkt_dump_sense(pd, &cgc);
803 804
804 return ret; 805 return ret;
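Throughout this file's diff, one mechanical conversion repeats: assignments buried in if conditions are split into an assignment followed by a plain test, which is the style checkpatch asks for. Both forms behave identically; a trivial illustration:

#include <stdio.h>

static int might_fail(void)
{
	return -1;
}

int main(void)
{
	int ret;

	/* old style: assignment inside the condition */
	if ((ret = might_fail()))
		printf("old: %d\n", ret);

	/* new style: assignment and test on separate lines */
	ret = might_fail();
	if (ret)
		printf("new: %d\n", ret);
	return 0;
}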
@@ -1562,7 +1563,8 @@ static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
1562 cgc.cmd[8] = cgc.buflen = 2; 1563 cgc.cmd[8] = cgc.buflen = 2;
1563 cgc.quiet = 1; 1564 cgc.quiet = 1;
1564 1565
1565 if ((ret = pkt_generic_packet(pd, &cgc))) 1566 ret = pkt_generic_packet(pd, &cgc);
1567 if (ret)
1566 return ret; 1568 return ret;
1567 1569
1568 /* not all drives have the same disc_info length, so requeue 1570 /* not all drives have the same disc_info length, so requeue
@@ -1591,7 +1593,8 @@ static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type,
1591 cgc.cmd[8] = 8; 1593 cgc.cmd[8] = 8;
1592 cgc.quiet = 1; 1594 cgc.quiet = 1;
1593 1595
1594 if ((ret = pkt_generic_packet(pd, &cgc))) 1596 ret = pkt_generic_packet(pd, &cgc);
1597 if (ret)
1595 return ret; 1598 return ret;
1596 1599
1597 cgc.buflen = be16_to_cpu(ti->track_information_length) + 1600 cgc.buflen = be16_to_cpu(ti->track_information_length) +
@@ -1612,17 +1615,20 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
1612 __u32 last_track; 1615 __u32 last_track;
1613 int ret = -1; 1616 int ret = -1;
1614 1617
1615 if ((ret = pkt_get_disc_info(pd, &di))) 1618 ret = pkt_get_disc_info(pd, &di);
1619 if (ret)
1616 return ret; 1620 return ret;
1617 1621
1618 last_track = (di.last_track_msb << 8) | di.last_track_lsb; 1622 last_track = (di.last_track_msb << 8) | di.last_track_lsb;
1619 if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) 1623 ret = pkt_get_track_info(pd, last_track, 1, &ti);
1624 if (ret)
1620 return ret; 1625 return ret;
1621 1626
1622 /* if this track is blank, try the previous. */ 1627 /* if this track is blank, try the previous. */
1623 if (ti.blank) { 1628 if (ti.blank) {
1624 last_track--; 1629 last_track--;
1625 if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) 1630 ret = pkt_get_track_info(pd, last_track, 1, &ti);
1631 if (ret)
1626 return ret; 1632 return ret;
1627 } 1633 }
1628 1634
@@ -1645,7 +1651,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
1645static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) 1651static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
1646{ 1652{
1647 struct packet_command cgc; 1653 struct packet_command cgc;
1648 struct request_sense sense; 1654 struct scsi_sense_hdr sshdr;
1649 write_param_page *wp; 1655 write_param_page *wp;
1650 char buffer[128]; 1656 char buffer[128];
1651 int ret, size; 1657 int ret, size;
@@ -1656,8 +1662,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
1656 1662
1657 memset(buffer, 0, sizeof(buffer)); 1663 memset(buffer, 0, sizeof(buffer));
1658 init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); 1664 init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
1659 cgc.sense = &sense; 1665 cgc.sshdr = &sshdr;
1660 if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { 1666 ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
1667 if (ret) {
1661 pkt_dump_sense(pd, &cgc); 1668 pkt_dump_sense(pd, &cgc);
1662 return ret; 1669 return ret;
1663 } 1670 }
@@ -1671,8 +1678,9 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
1671 * now get it all 1678 * now get it all
1672 */ 1679 */
1673 init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); 1680 init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
1674 cgc.sense = &sense; 1681 cgc.sshdr = &sshdr;
1675 if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { 1682 ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
1683 if (ret) {
1676 pkt_dump_sense(pd, &cgc); 1684 pkt_dump_sense(pd, &cgc);
1677 return ret; 1685 return ret;
1678 } 1686 }
@@ -1714,7 +1722,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
1714 wp->packet_size = cpu_to_be32(pd->settings.size >> 2); 1722 wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
1715 1723
1716 cgc.buflen = cgc.cmd[8] = size; 1724 cgc.buflen = cgc.cmd[8] = size;
1717 if ((ret = pkt_mode_select(pd, &cgc))) { 1725 ret = pkt_mode_select(pd, &cgc);
1726 if (ret) {
1718 pkt_dump_sense(pd, &cgc); 1727 pkt_dump_sense(pd, &cgc);
1719 return ret; 1728 return ret;
1720 } 1729 }
@@ -1819,7 +1828,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
1819 memset(&di, 0, sizeof(disc_information)); 1828 memset(&di, 0, sizeof(disc_information));
1820 memset(&ti, 0, sizeof(track_information)); 1829 memset(&ti, 0, sizeof(track_information));
1821 1830
1822 if ((ret = pkt_get_disc_info(pd, &di))) { 1831 ret = pkt_get_disc_info(pd, &di);
1832 if (ret) {
1823 pkt_err(pd, "failed get_disc\n"); 1833 pkt_err(pd, "failed get_disc\n");
1824 return ret; 1834 return ret;
1825 } 1835 }
@@ -1830,7 +1840,8 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
1830 pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; 1840 pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
1831 1841
1832 track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ 1842 track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
1833 if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { 1843 ret = pkt_get_track_info(pd, track, 1, &ti);
1844 if (ret) {
1834 pkt_err(pd, "failed get_track\n"); 1845 pkt_err(pd, "failed get_track\n");
1835 return ret; 1846 return ret;
1836 } 1847 }
@@ -1905,12 +1916,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
1905 int set) 1916 int set)
1906{ 1917{
1907 struct packet_command cgc; 1918 struct packet_command cgc;
1908 struct request_sense sense; 1919 struct scsi_sense_hdr sshdr;
1909 unsigned char buf[64]; 1920 unsigned char buf[64];
1910 int ret; 1921 int ret;
1911 1922
1912 init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); 1923 init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
1913 cgc.sense = &sense; 1924 cgc.sshdr = &sshdr;
1914 cgc.buflen = pd->mode_offset + 12; 1925 cgc.buflen = pd->mode_offset + 12;
1915 1926
1916 /* 1927 /*
@@ -1918,7 +1929,8 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
1918 */ 1929 */
1919 cgc.quiet = 1; 1930 cgc.quiet = 1;
1920 1931
1921 if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0))) 1932 ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
1933 if (ret)
1922 return ret; 1934 return ret;
1923 1935
1924 buf[pd->mode_offset + 10] |= (!!set << 2); 1936 buf[pd->mode_offset + 10] |= (!!set << 2);
@@ -1950,14 +1962,14 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
1950 unsigned *write_speed) 1962 unsigned *write_speed)
1951{ 1963{
1952 struct packet_command cgc; 1964 struct packet_command cgc;
1953 struct request_sense sense; 1965 struct scsi_sense_hdr sshdr;
1954 unsigned char buf[256+18]; 1966 unsigned char buf[256+18];
1955 unsigned char *cap_buf; 1967 unsigned char *cap_buf;
1956 int ret, offset; 1968 int ret, offset;
1957 1969
1958 cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; 1970 cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
1959 init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); 1971 init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
1960 cgc.sense = &sense; 1972 cgc.sshdr = &sshdr;
1961 1973
1962 ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); 1974 ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
1963 if (ret) { 1975 if (ret) {
@@ -2011,13 +2023,13 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
2011 unsigned *speed) 2023 unsigned *speed)
2012{ 2024{
2013 struct packet_command cgc; 2025 struct packet_command cgc;
2014 struct request_sense sense; 2026 struct scsi_sense_hdr sshdr;
2015 unsigned char buf[64]; 2027 unsigned char buf[64];
2016 unsigned int size, st, sp; 2028 unsigned int size, st, sp;
2017 int ret; 2029 int ret;
2018 2030
2019 init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); 2031 init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
2020 cgc.sense = &sense; 2032 cgc.sshdr = &sshdr;
2021 cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; 2033 cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
2022 cgc.cmd[1] = 2; 2034 cgc.cmd[1] = 2;
2023 cgc.cmd[2] = 4; /* READ ATIP */ 2035 cgc.cmd[2] = 4; /* READ ATIP */
@@ -2032,7 +2044,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
2032 size = sizeof(buf); 2044 size = sizeof(buf);
2033 2045
2034 init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); 2046 init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
2035 cgc.sense = &sense; 2047 cgc.sshdr = &sshdr;
2036 cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; 2048 cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
2037 cgc.cmd[1] = 2; 2049 cgc.cmd[1] = 2;
2038 cgc.cmd[2] = 4; 2050 cgc.cmd[2] = 4;
@@ -2083,17 +2095,18 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
2083static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) 2095static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
2084{ 2096{
2085 struct packet_command cgc; 2097 struct packet_command cgc;
2086 struct request_sense sense; 2098 struct scsi_sense_hdr sshdr;
2087 int ret; 2099 int ret;
2088 2100
2089 pkt_dbg(2, pd, "Performing OPC\n"); 2101 pkt_dbg(2, pd, "Performing OPC\n");
2090 2102
2091 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); 2103 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
2092 cgc.sense = &sense; 2104 cgc.sshdr = &sshdr;
2093 cgc.timeout = 60*HZ; 2105 cgc.timeout = 60*HZ;
2094 cgc.cmd[0] = GPCMD_SEND_OPC; 2106 cgc.cmd[0] = GPCMD_SEND_OPC;
2095 cgc.cmd[1] = 1; 2107 cgc.cmd[1] = 1;
2096 if ((ret = pkt_generic_packet(pd, &cgc))) 2108 ret = pkt_generic_packet(pd, &cgc);
2109 if (ret)
2097 pkt_dump_sense(pd, &cgc); 2110 pkt_dump_sense(pd, &cgc);
2098 return ret; 2111 return ret;
2099} 2112}
@@ -2103,19 +2116,22 @@ static int pkt_open_write(struct pktcdvd_device *pd)
2103 int ret; 2116 int ret;
2104 unsigned int write_speed, media_write_speed, read_speed; 2117 unsigned int write_speed, media_write_speed, read_speed;
2105 2118
2106 if ((ret = pkt_probe_settings(pd))) { 2119 ret = pkt_probe_settings(pd);
2120 if (ret) {
2107 pkt_dbg(2, pd, "failed probe\n"); 2121 pkt_dbg(2, pd, "failed probe\n");
2108 return ret; 2122 return ret;
2109 } 2123 }
2110 2124
2111 if ((ret = pkt_set_write_settings(pd))) { 2125 ret = pkt_set_write_settings(pd);
2126 if (ret) {
2112 pkt_dbg(1, pd, "failed saving write settings\n"); 2127 pkt_dbg(1, pd, "failed saving write settings\n");
2113 return -EIO; 2128 return -EIO;
2114 } 2129 }
2115 2130
2116 pkt_write_caching(pd, USE_WCACHING); 2131 pkt_write_caching(pd, USE_WCACHING);
2117 2132
2118 if ((ret = pkt_get_max_speed(pd, &write_speed))) 2133 ret = pkt_get_max_speed(pd, &write_speed);
2134 if (ret)
2119 write_speed = 16 * 177; 2135 write_speed = 16 * 177;
2120 switch (pd->mmc3_profile) { 2136 switch (pd->mmc3_profile) {
2121 case 0x13: /* DVD-RW */ 2137 case 0x13: /* DVD-RW */
@@ -2124,7 +2140,8 @@ static int pkt_open_write(struct pktcdvd_device *pd)
2124 pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); 2140 pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
2125 break; 2141 break;
2126 default: 2142 default:
2127 if ((ret = pkt_media_speed(pd, &media_write_speed))) 2143 ret = pkt_media_speed(pd, &media_write_speed);
2144 if (ret)
2128 media_write_speed = 16; 2145 media_write_speed = 16;
2129 write_speed = min(write_speed, media_write_speed * 177); 2146 write_speed = min(write_speed, media_write_speed * 177);
2130 pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); 2147 pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
@@ -2132,14 +2149,16 @@ static int pkt_open_write(struct pktcdvd_device *pd)
2132 } 2149 }
2133 read_speed = write_speed; 2150 read_speed = write_speed;
2134 2151
2135 if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { 2152 ret = pkt_set_speed(pd, write_speed, read_speed);
2153 if (ret) {
2136 pkt_dbg(1, pd, "couldn't set write speed\n"); 2154 pkt_dbg(1, pd, "couldn't set write speed\n");
2137 return -EIO; 2155 return -EIO;
2138 } 2156 }
2139 pd->write_speed = write_speed; 2157 pd->write_speed = write_speed;
2140 pd->read_speed = read_speed; 2158 pd->read_speed = read_speed;
2141 2159
2142 if ((ret = pkt_perform_opc(pd))) { 2160 ret = pkt_perform_opc(pd);
2161 if (ret) {
2143 pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); 2162 pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
2144 } 2163 }
2145 2164
@@ -2161,10 +2180,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2161 * so bdget() can't fail. 2180 * so bdget() can't fail.
2162 */ 2181 */
2163 bdget(pd->bdev->bd_dev); 2182 bdget(pd->bdev->bd_dev);
2164 if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd))) 2183 ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd);
2184 if (ret)
2165 goto out; 2185 goto out;
2166 2186
2167 if ((ret = pkt_get_last_written(pd, &lba))) { 2187 ret = pkt_get_last_written(pd, &lba);
2188 if (ret) {
2168 pkt_err(pd, "pkt_get_last_written failed\n"); 2189 pkt_err(pd, "pkt_get_last_written failed\n");
2169 goto out_putdev; 2190 goto out_putdev;
2170 } 2191 }
@@ -2175,7 +2196,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2175 2196
2176 q = bdev_get_queue(pd->bdev); 2197 q = bdev_get_queue(pd->bdev);
2177 if (write) { 2198 if (write) {
2178 if ((ret = pkt_open_write(pd))) 2199 ret = pkt_open_write(pd);
2200 if (ret)
2179 goto out_putdev; 2201 goto out_putdev;
2180 /* 2202 /*
2181 * Some CDRW drives can not handle writes larger than one packet, 2203 * Some CDRW drives can not handle writes larger than one packet,
@@ -2190,7 +2212,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
2190 clear_bit(PACKET_WRITABLE, &pd->flags); 2212 clear_bit(PACKET_WRITABLE, &pd->flags);
2191 } 2213 }
2192 2214
2193 if ((ret = pkt_set_segment_merging(pd, q))) 2215 ret = pkt_set_segment_merging(pd, q);
2216 if (ret)
2194 goto out_putdev; 2217 goto out_putdev;
2195 2218
2196 if (write) { 2219 if (write) {
@@ -2231,6 +2254,8 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
2231{ 2254{
2232 if (dev_minor >= MAX_WRITERS) 2255 if (dev_minor >= MAX_WRITERS)
2233 return NULL; 2256 return NULL;
2257
2258 dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
2234 return pkt_devs[dev_minor]; 2259 return pkt_devs[dev_minor];
2235} 2260}
2236 2261
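The array_index_nospec() insertion above is the standard Spectre variant 1 hardening from linux/nospec.h: dev_minor comes from userspace, and although it is bounds-checked, a mispredicted branch could still speculatively index pkt_devs[] out of bounds. Clamping the index after the check closes that window. A sketch of the general pattern, with lookup_nospec(), table and nr_entries as placeholder names:

#include <linux/nospec.h>

static void *lookup_nospec(void **table, unsigned int idx,
			   unsigned int nr_entries)
{
	if (idx >= nr_entries)
		return NULL;
	/* Clamp idx so it stays in bounds even under misspeculation. */
	idx = array_index_nospec(idx, nr_entries);
	return table[idx];
}
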
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index dddb3f2490b6..1a92f9e65937 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = {
112 112
113static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) 113static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
114{ 114{
115 generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio), 115 generic_start_io_acct(card->queue, bio_op(bio), bio_sectors(bio),
116 &card->gendisk->part0); 116 &card->gendisk->part0);
117} 117}
118 118
@@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card,
120 struct bio *bio, 120 struct bio *bio,
121 unsigned long start_time) 121 unsigned long start_time)
122{ 122{
123 generic_end_io_acct(card->queue, bio_data_dir(bio), 123 generic_end_io_acct(card->queue, bio_op(bio),
124 &card->gendisk->part0, start_time); 124 &card->gendisk->part0, start_time);
125} 125}
126 126
127static void bio_dma_done_cb(struct rsxx_cardinfo *card, 127static void bio_dma_done_cb(struct rsxx_cardinfo *card,
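The I/O accounting helpers in this series switch from taking a data direction (bio_data_dir()) to taking the request operation (bio_op()); the helpers then pick the read/write statistics bucket themselves, via the op_stat_group() helper introduced elsewhere in the series. A minimal sketch of a bio-based driver's accounting after the change, assuming q, bio and disk are in scope:

	unsigned long start = jiffies;

	/* Pass the full REQ_OP_* value; the helper derives direction. */
	generic_start_io_acct(q, bio_op(bio), bio_sectors(bio),
			      &disk->part0);
	/* ... service the bio ... */
	generic_end_io_acct(q, bio_op(bio), &disk->part0, start);
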
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index bc7aea6d7b7c..87b9e7fbf062 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -657,8 +657,8 @@ static bool skd_preop_sg_list(struct skd_device *skdev,
657 657
658 if (unlikely(skdev->dbg_level > 1)) { 658 if (unlikely(skdev->dbg_level > 1)) {
659 dev_dbg(&skdev->pdev->dev, 659 dev_dbg(&skdev->pdev->dev,
660 "skreq=%x sksg_list=%p sksg_dma=%llx\n", 660 "skreq=%x sksg_list=%p sksg_dma=%pad\n",
661 skreq->id, skreq->sksg_list, skreq->sksg_dma_address); 661 skreq->id, skreq->sksg_list, &skreq->sksg_dma_address);
662 for (i = 0; i < n_sg; i++) { 662 for (i = 0; i < n_sg; i++) {
663 struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; 663 struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
664 664
@@ -1190,8 +1190,8 @@ static void skd_send_fitmsg(struct skd_device *skdev,
1190{ 1190{
1191 u64 qcmd; 1191 u64 qcmd;
1192 1192
1193 dev_dbg(&skdev->pdev->dev, "dma address 0x%llx, busy=%d\n", 1193 dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n",
1194 skmsg->mb_dma_address, skd_in_flight(skdev)); 1194 &skmsg->mb_dma_address, skd_in_flight(skdev));
1195 dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf); 1195 dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf);
1196 1196
1197 qcmd = skmsg->mb_dma_address; 1197 qcmd = skmsg->mb_dma_address;
@@ -1250,9 +1250,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev,
1250 } 1250 }
1251 1251
1252 dev_dbg(&skdev->pdev->dev, 1252 dev_dbg(&skdev->pdev->dev,
1253 "skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", 1253 "skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n",
1254 skspcl, skspcl->req.id, skspcl->req.sksg_list, 1254 skspcl, skspcl->req.id, skspcl->req.sksg_list,
1255 skspcl->req.sksg_dma_address); 1255 &skspcl->req.sksg_dma_address);
1256 for (i = 0; i < skspcl->req.n_sg; i++) { 1256 for (i = 0; i < skspcl->req.n_sg; i++) {
1257 struct fit_sg_descriptor *sgd = 1257 struct fit_sg_descriptor *sgd =
1258 &skspcl->req.sksg_list[i]; 1258 &skspcl->req.sksg_list[i];
@@ -2685,8 +2685,8 @@ static int skd_cons_skmsg(struct skd_device *skdev)
2685 2685
2686 WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) & 2686 WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) &
2687 (FIT_QCMD_ALIGN - 1), 2687 (FIT_QCMD_ALIGN - 1),
2688 "not aligned: msg_buf %p mb_dma_address %#llx\n", 2688 "not aligned: msg_buf %p mb_dma_address %pad\n",
2689 skmsg->msg_buf, skmsg->mb_dma_address); 2689 skmsg->msg_buf, &skmsg->mb_dma_address);
2690 memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); 2690 memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
2691 } 2691 }
2692 2692
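The skd format-string changes replace %llx with %pad for dma_addr_t values. dma_addr_t is 32 bits on some configurations and 64 on others, so %llx is only correct with an explicit u64 cast; %pad (see Documentation/core-api/printk-formats.rst) takes a pointer to the dma_addr_t and prints it at its native width. A sketch:

#include <linux/device.h>
#include <linux/types.h>

static void report_mapping(struct device *dev, dma_addr_t dma_handle)
{
	/* Note the address-of: %pad consumes a dma_addr_t *, not the
	 * value, which is what makes it width-safe. */
	dev_dbg(dev, "mapped at %pad\n", &dma_handle);
}
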
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b5cedccb5d7d..8986adab9bf5 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -251,14 +251,9 @@ static DEFINE_SPINLOCK(minor_lock);
251#define GRANTS_PER_INDIRECT_FRAME \ 251#define GRANTS_PER_INDIRECT_FRAME \
252 (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) 252 (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
253 253
254#define PSEGS_PER_INDIRECT_FRAME \
255 (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS)
256
257#define INDIRECT_GREFS(_grants) \ 254#define INDIRECT_GREFS(_grants) \
258 DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) 255 DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
259 256
260#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)
261
262static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); 257static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
263static void blkfront_gather_backend_features(struct blkfront_info *info); 258static void blkfront_gather_backend_features(struct blkfront_info *info);
264static int negotiate_mq(struct blkfront_info *info); 259static int negotiate_mq(struct blkfront_info *info);
@@ -1441,7 +1436,7 @@ static bool blkif_completion(unsigned long *id,
1441 1436
1442 /* Wait the second response if not yet here. */ 1437 /* Wait the second response if not yet here. */
1443 if (s2->status == REQ_WAITING) 1438 if (s2->status == REQ_WAITING)
1444 return 0; 1439 return false;
1445 1440
1446 bret->status = blkif_get_final_status(s->status, 1441 bret->status = blkif_get_final_status(s->status,
1447 s2->status); 1442 s2->status);
@@ -1542,7 +1537,7 @@ static bool blkif_completion(unsigned long *id,
1542 } 1537 }
1543 } 1538 }
1544 1539
1545 return 1; 1540 return true;
1546} 1541}
1547 1542
1548static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1543static irqreturn_t blkif_interrupt(int irq, void *dev_id)
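blkif_completion() is declared bool, so the old 0/1 returns worked only through implicit conversion; returning false/true matches the declared type. A sketch of the idiom, with request_is_done() as a placeholder name around the driver's own types:

static bool request_is_done(const struct blk_shadow *s)
{
	/* Same generated code as returning 0/1, but the literals now
	 * match the bool return type. */
	return s->status != REQ_WAITING;
}
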
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a390c6d4f72d..c7acf74253a1 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1287,17 +1287,16 @@ static void zram_bio_discard(struct zram *zram, u32 index,
1287 * Returns 1 if IO request was successfully submitted. 1287 * Returns 1 if IO request was successfully submitted.
1288 */ 1288 */
1289static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, 1289static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
1290 int offset, bool is_write, struct bio *bio) 1290 int offset, unsigned int op, struct bio *bio)
1291{ 1291{
1292 unsigned long start_time = jiffies; 1292 unsigned long start_time = jiffies;
1293 int rw_acct = is_write ? REQ_OP_WRITE : REQ_OP_READ;
1294 struct request_queue *q = zram->disk->queue; 1293 struct request_queue *q = zram->disk->queue;
1295 int ret; 1294 int ret;
1296 1295
1297 generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT, 1296 generic_start_io_acct(q, op, bvec->bv_len >> SECTOR_SHIFT,
1298 &zram->disk->part0); 1297 &zram->disk->part0);
1299 1298
1300 if (!is_write) { 1299 if (!op_is_write(op)) {
1301 atomic64_inc(&zram->stats.num_reads); 1300 atomic64_inc(&zram->stats.num_reads);
1302 ret = zram_bvec_read(zram, bvec, index, offset, bio); 1301 ret = zram_bvec_read(zram, bvec, index, offset, bio);
1303 flush_dcache_page(bvec->bv_page); 1302 flush_dcache_page(bvec->bv_page);
@@ -1306,14 +1305,14 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
1306 ret = zram_bvec_write(zram, bvec, index, offset, bio); 1305 ret = zram_bvec_write(zram, bvec, index, offset, bio);
1307 } 1306 }
1308 1307
1309 generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); 1308 generic_end_io_acct(q, op, &zram->disk->part0, start_time);
1310 1309
1311 zram_slot_lock(zram, index); 1310 zram_slot_lock(zram, index);
1312 zram_accessed(zram, index); 1311 zram_accessed(zram, index);
1313 zram_slot_unlock(zram, index); 1312 zram_slot_unlock(zram, index);
1314 1313
1315 if (unlikely(ret < 0)) { 1314 if (unlikely(ret < 0)) {
1316 if (!is_write) 1315 if (!op_is_write(op))
1317 atomic64_inc(&zram->stats.failed_reads); 1316 atomic64_inc(&zram->stats.failed_reads);
1318 else 1317 else
1319 atomic64_inc(&zram->stats.failed_writes); 1318 atomic64_inc(&zram->stats.failed_writes);
@@ -1351,7 +1350,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
1351 bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, 1350 bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
1352 unwritten); 1351 unwritten);
1353 if (zram_bvec_rw(zram, &bv, index, offset, 1352 if (zram_bvec_rw(zram, &bv, index, offset,
1354 op_is_write(bio_op(bio)), bio) < 0) 1353 bio_op(bio), bio) < 0)
1355 goto out; 1354 goto out;
1356 1355
1357 bv.bv_offset += bv.bv_len; 1356 bv.bv_offset += bv.bv_len;
@@ -1403,7 +1402,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
1403} 1402}
1404 1403
1405static int zram_rw_page(struct block_device *bdev, sector_t sector, 1404static int zram_rw_page(struct block_device *bdev, sector_t sector,
1406 struct page *page, bool is_write) 1405 struct page *page, unsigned int op)
1407{ 1406{
1408 int offset, ret; 1407 int offset, ret;
1409 u32 index; 1408 u32 index;
@@ -1427,7 +1426,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
1427 bv.bv_len = PAGE_SIZE; 1426 bv.bv_len = PAGE_SIZE;
1428 bv.bv_offset = 0; 1427 bv.bv_offset = 0;
1429 1428
1430 ret = zram_bvec_rw(zram, &bv, index, offset, is_write, NULL); 1429 ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
1431out: 1430out:
1432 /* 1431 /*
1433 * If I/O fails, just return error(ie, non-zero) without 1432 * If I/O fails, just return error(ie, non-zero) without
@@ -1442,7 +1441,7 @@ out:
1442 1441
1443 switch (ret) { 1442 switch (ret) {
1444 case 0: 1443 case 0:
1445 page_endio(page, is_write, 0); 1444 page_endio(page, op_is_write(op), 0);
1446 break; 1445 break;
1447 case 1: 1446 case 1:
1448 ret = 0; 1447 ret = 0;
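The zram conversion threads the full REQ_OP_* value (an unsigned int) through zram_bvec_rw() and zram_rw_page() instead of a pre-computed is_write bool. One value now feeds both the accounting calls and the direction checks, with op_is_write() deciding direction (write-type opcodes have bit 0 set). A sketch of the direction check, with do_read()/do_write() as placeholder helpers:

	if (!op_is_write(op))
		ret = do_read(zram, bvec, index, offset, bio);
	else
		ret = do_write(zram, bvec, index, offset, bio);
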
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index a78b8e7085e9..113fc6edb2b0 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -282,6 +282,7 @@
282#include <linux/blkdev.h> 282#include <linux/blkdev.h>
283#include <linux/times.h> 283#include <linux/times.h>
284#include <linux/uaccess.h> 284#include <linux/uaccess.h>
285#include <scsi/scsi_common.h>
285#include <scsi/scsi_request.h> 286#include <scsi/scsi_request.h>
286 287
287/* used to tell the module to turn on full debugging messages */ 288/* used to tell the module to turn on full debugging messages */
@@ -345,10 +346,10 @@ static LIST_HEAD(cdrom_list);
345int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi, 346int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
346 struct packet_command *cgc) 347 struct packet_command *cgc)
347{ 348{
348 if (cgc->sense) { 349 if (cgc->sshdr) {
349 cgc->sense->sense_key = 0x05; 350 cgc->sshdr->sense_key = 0x05;
350 cgc->sense->asc = 0x20; 351 cgc->sshdr->asc = 0x20;
351 cgc->sense->ascq = 0x00; 352 cgc->sshdr->ascq = 0x00;
352 } 353 }
353 354
354 cgc->stat = -EIO; 355 cgc->stat = -EIO;
@@ -2222,9 +2223,12 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
2222 2223
2223 blk_execute_rq(q, cdi->disk, rq, 0); 2224 blk_execute_rq(q, cdi->disk, rq, 0);
2224 if (scsi_req(rq)->result) { 2225 if (scsi_req(rq)->result) {
2225 struct request_sense *s = req->sense; 2226 struct scsi_sense_hdr sshdr;
2227
2226 ret = -EIO; 2228 ret = -EIO;
2227 cdi->last_sense = s->sense_key; 2229 scsi_normalize_sense(req->sense, req->sense_len,
2230 &sshdr);
2231 cdi->last_sense = sshdr.sense_key;
2228 } 2232 }
2229 2233
2230 if (blk_rq_unmap_user(bio)) 2234 if (blk_rq_unmap_user(bio))
@@ -2943,7 +2947,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
2943 struct packet_command *cgc, 2947 struct packet_command *cgc,
2944 int cmd) 2948 int cmd)
2945{ 2949{
2946 struct request_sense sense; 2950 struct scsi_sense_hdr sshdr;
2947 struct cdrom_msf msf; 2951 struct cdrom_msf msf;
2948 int blocksize = 0, format = 0, lba; 2952 int blocksize = 0, format = 0, lba;
2949 int ret; 2953 int ret;
@@ -2971,13 +2975,13 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
2971 if (cgc->buffer == NULL) 2975 if (cgc->buffer == NULL)
2972 return -ENOMEM; 2976 return -ENOMEM;
2973 2977
2974 memset(&sense, 0, sizeof(sense)); 2978 memset(&sshdr, 0, sizeof(sshdr));
2975 cgc->sense = &sense; 2979 cgc->sshdr = &sshdr;
2976 cgc->data_direction = CGC_DATA_READ; 2980 cgc->data_direction = CGC_DATA_READ;
2977 ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize); 2981 ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
2978 if (ret && sense.sense_key == 0x05 && 2982 if (ret && sshdr.sense_key == 0x05 &&
2979 sense.asc == 0x20 && 2983 sshdr.asc == 0x20 &&
2980 sense.ascq == 0x00) { 2984 sshdr.ascq == 0x00) {
2981 /* 2985 /*
2982 * SCSI-II devices are not required to support 2986 * SCSI-II devices are not required to support
2983 * READ_CD, so let's try switching block size 2987 * READ_CD, so let's try switching block size
@@ -2986,7 +2990,7 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
2986 ret = cdrom_switch_blocksize(cdi, blocksize); 2990 ret = cdrom_switch_blocksize(cdi, blocksize);
2987 if (ret) 2991 if (ret)
2988 goto out; 2992 goto out;
2989 cgc->sense = NULL; 2993 cgc->sshdr = NULL;
2990 ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1); 2994 ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1);
2991 ret |= cdrom_switch_blocksize(cdi, blocksize); 2995 ret |= cdrom_switch_blocksize(cdi, blocksize);
2992 } 2996 }
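The cdrom conversion replaces the raw, fixed-format struct request_sense with the decoded struct scsi_sense_hdr. scsi_normalize_sense() from <scsi/scsi_common.h> parses both fixed- and descriptor-format sense buffers into the header, so callers stop depending on one on-the-wire layout. A sketch, with decode_sense() as a hypothetical caller:

#include <linux/errno.h>
#include <linux/types.h>
#include <scsi/scsi_common.h>
#include <scsi/scsi_proto.h>

static int decode_sense(const u8 *sense_buf, int sense_len)
{
	struct scsi_sense_hdr sshdr;

	/* Returns false if the buffer held no valid sense data. */
	if (!scsi_normalize_sense(sense_buf, sense_len, &sshdr))
		return -EIO;

	return sshdr.sense_key == UNIT_ATTENTION ? -EAGAIN : 0;
}
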
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 5f178384876f..44a7a255ef74 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -419,10 +419,11 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
419 419
420int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, 420int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
421 int write, void *buffer, unsigned *bufflen, 421 int write, void *buffer, unsigned *bufflen,
422 struct request_sense *sense, int timeout, 422 struct scsi_sense_hdr *sshdr, int timeout,
423 req_flags_t rq_flags) 423 req_flags_t rq_flags)
424{ 424{
425 struct cdrom_info *info = drive->driver_data; 425 struct cdrom_info *info = drive->driver_data;
426 struct scsi_sense_hdr local_sshdr;
426 int retries = 10; 427 int retries = 10;
427 bool failed; 428 bool failed;
428 429
@@ -430,6 +431,9 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
430 "rq_flags: 0x%x", 431 "rq_flags: 0x%x",
431 cmd[0], write, timeout, rq_flags); 432 cmd[0], write, timeout, rq_flags);
432 433
434 if (!sshdr)
435 sshdr = &local_sshdr;
436
433 /* start of retry loop */ 437 /* start of retry loop */
434 do { 438 do {
435 struct request *rq; 439 struct request *rq;
@@ -456,8 +460,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
456 460
457 if (buffer) 461 if (buffer)
458 *bufflen = scsi_req(rq)->resid_len; 462 *bufflen = scsi_req(rq)->resid_len;
459 if (sense) 463 scsi_normalize_sense(scsi_req(rq)->sense,
460 memcpy(sense, scsi_req(rq)->sense, sizeof(*sense)); 464 scsi_req(rq)->sense_len, sshdr);
461 465
462 /* 466 /*
463 * FIXME: we should probably abort/retry or something in case of 467 * FIXME: we should probably abort/retry or something in case of
@@ -469,12 +473,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
469 * The request failed. Retry if it was due to a unit 473 * The request failed. Retry if it was due to a unit
470 * attention status (usually means media was changed). 474 * attention status (usually means media was changed).
471 */ 475 */
472 struct request_sense *reqbuf = scsi_req(rq)->sense; 476 if (sshdr->sense_key == UNIT_ATTENTION)
473
474 if (reqbuf->sense_key == UNIT_ATTENTION)
475 cdrom_saw_media_change(drive); 477 cdrom_saw_media_change(drive);
476 else if (reqbuf->sense_key == NOT_READY && 478 else if (sshdr->sense_key == NOT_READY &&
477 reqbuf->asc == 4 && reqbuf->ascq != 4) { 479 sshdr->asc == 4 && sshdr->ascq != 4) {
478 /* 480 /*
479 * The drive is in the process of loading 481 * The drive is in the process of loading
480 * a disk. Retry, but wait a little to give 482 * a disk. Retry, but wait a little to give
@@ -864,7 +866,7 @@ static void msf_from_bcd(struct atapi_msf *msf)
864 msf->frame = bcd2bin(msf->frame); 866 msf->frame = bcd2bin(msf->frame);
865} 867}
866 868
867int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) 869int cdrom_check_status(ide_drive_t *drive, struct scsi_sense_hdr *sshdr)
868{ 870{
869 struct cdrom_info *info = drive->driver_data; 871 struct cdrom_info *info = drive->driver_data;
870 struct cdrom_device_info *cdi; 872 struct cdrom_device_info *cdi;
@@ -886,12 +888,11 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense)
886 */ 888 */
887 cmd[7] = cdi->sanyo_slot % 3; 889 cmd[7] = cdi->sanyo_slot % 3;
888 890
889 return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET); 891 return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sshdr, 0, RQF_QUIET);
890} 892}
891 893
892static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, 894static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
893 unsigned long *sectors_per_frame, 895 unsigned long *sectors_per_frame)
894 struct request_sense *sense)
895{ 896{
896 struct { 897 struct {
897 __be32 lba; 898 __be32 lba;
@@ -908,7 +909,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
908 memset(cmd, 0, BLK_MAX_CDB); 909 memset(cmd, 0, BLK_MAX_CDB);
909 cmd[0] = GPCMD_READ_CDVD_CAPACITY; 910 cmd[0] = GPCMD_READ_CDVD_CAPACITY;
910 911
911 stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0, 912 stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, NULL, 0,
912 RQF_QUIET); 913 RQF_QUIET);
913 if (stat) 914 if (stat)
914 return stat; 915 return stat;
@@ -944,8 +945,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
944} 945}
945 946
946static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, 947static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
947 int format, char *buf, int buflen, 948 int format, char *buf, int buflen)
948 struct request_sense *sense)
949{ 949{
950 unsigned char cmd[BLK_MAX_CDB]; 950 unsigned char cmd[BLK_MAX_CDB];
951 951
@@ -962,11 +962,11 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag,
962 if (msf_flag) 962 if (msf_flag)
963 cmd[1] = 2; 963 cmd[1] = 2;
964 964
965 return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET); 965 return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, NULL, 0, RQF_QUIET);
966} 966}
967 967
968/* Try to read the entire TOC for the disk into our internal buffer. */ 968/* Try to read the entire TOC for the disk into our internal buffer. */
969int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) 969int ide_cd_read_toc(ide_drive_t *drive)
970{ 970{
971 int stat, ntracks, i; 971 int stat, ntracks, i;
972 struct cdrom_info *info = drive->driver_data; 972 struct cdrom_info *info = drive->driver_data;
@@ -996,14 +996,13 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
996 * Check to see if the existing data is still valid. If it is, 996 * Check to see if the existing data is still valid. If it is,
997 * just return. 997 * just return.
998 */ 998 */
999 (void) cdrom_check_status(drive, sense); 999 (void) cdrom_check_status(drive, NULL);
1000 1000
1001 if (drive->atapi_flags & IDE_AFLAG_TOC_VALID) 1001 if (drive->atapi_flags & IDE_AFLAG_TOC_VALID)
1002 return 0; 1002 return 0;
1003 1003
1004 /* try to get the total cdrom capacity and sector size */ 1004 /* try to get the total cdrom capacity and sector size */
1005 stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame, 1005 stat = cdrom_read_capacity(drive, &toc->capacity, &sectors_per_frame);
1006 sense);
1007 if (stat) 1006 if (stat)
1008 toc->capacity = 0x1fffff; 1007 toc->capacity = 0x1fffff;
1009 1008
@@ -1016,7 +1015,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
1016 1015
1017 /* first read just the header, so we know how long the TOC is */ 1016 /* first read just the header, so we know how long the TOC is */
1018 stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr, 1017 stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
1019 sizeof(struct atapi_toc_header), sense); 1018 sizeof(struct atapi_toc_header));
1020 if (stat) 1019 if (stat)
1021 return stat; 1020 return stat;
1022 1021
@@ -1036,7 +1035,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
1036 (char *)&toc->hdr, 1035 (char *)&toc->hdr,
1037 sizeof(struct atapi_toc_header) + 1036 sizeof(struct atapi_toc_header) +
1038 (ntracks + 1) * 1037 (ntracks + 1) *
1039 sizeof(struct atapi_toc_entry), sense); 1038 sizeof(struct atapi_toc_entry));
1040 1039
1041 if (stat && toc->hdr.first_track > 1) { 1040 if (stat && toc->hdr.first_track > 1) {
1042 /* 1041 /*
@@ -1056,8 +1055,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
1056 (char *)&toc->hdr, 1055 (char *)&toc->hdr,
1057 sizeof(struct atapi_toc_header) + 1056 sizeof(struct atapi_toc_header) +
1058 (ntracks + 1) * 1057 (ntracks + 1) *
1059 sizeof(struct atapi_toc_entry), 1058 sizeof(struct atapi_toc_entry));
1060 sense);
1061 if (stat) 1059 if (stat)
1062 return stat; 1060 return stat;
1063 1061
@@ -1094,7 +1092,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
1094 if (toc->hdr.first_track != CDROM_LEADOUT) { 1092 if (toc->hdr.first_track != CDROM_LEADOUT) {
1095 /* read the multisession information */ 1093 /* read the multisession information */
1096 stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp, 1094 stat = cdrom_read_tocentry(drive, 0, 0, 1, (char *)&ms_tmp,
1097 sizeof(ms_tmp), sense); 1095 sizeof(ms_tmp));
1098 if (stat) 1096 if (stat)
1099 return stat; 1097 return stat;
1100 1098
@@ -1108,7 +1106,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
1108 if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) { 1106 if (drive->atapi_flags & IDE_AFLAG_TOCADDR_AS_BCD) {
1109 /* re-read multisession information using MSF format */ 1107 /* re-read multisession information using MSF format */
1110 stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp, 1108 stat = cdrom_read_tocentry(drive, 0, 1, 1, (char *)&ms_tmp,
1111 sizeof(ms_tmp), sense); 1109 sizeof(ms_tmp));
1112 if (stat) 1110 if (stat)
1113 return stat; 1111 return stat;
1114 1112
@@ -1412,7 +1410,7 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive)
1412{ 1410{
1413 unsigned long capacity, sectors_per_frame; 1411 unsigned long capacity, sectors_per_frame;
1414 1412
1415 if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame, NULL)) 1413 if (cdrom_read_capacity(drive, &capacity, &sectors_per_frame))
1416 return 0; 1414 return 0;
1417 1415
1418 return capacity * sectors_per_frame; 1416 return capacity * sectors_per_frame;
@@ -1710,9 +1708,8 @@ static unsigned int idecd_check_events(struct gendisk *disk,
1710static int idecd_revalidate_disk(struct gendisk *disk) 1708static int idecd_revalidate_disk(struct gendisk *disk)
1711{ 1709{
1712 struct cdrom_info *info = ide_drv_g(disk, cdrom_info); 1710 struct cdrom_info *info = ide_drv_g(disk, cdrom_info);
1713 struct request_sense sense;
1714 1711
1715 ide_cd_read_toc(info->drive, &sense); 1712 ide_cd_read_toc(info->drive);
1716 1713
1717 return 0; 1714 return 0;
1718} 1715}
@@ -1736,7 +1733,6 @@ static int ide_cd_probe(ide_drive_t *drive)
1736{ 1733{
1737 struct cdrom_info *info; 1734 struct cdrom_info *info;
1738 struct gendisk *g; 1735 struct gendisk *g;
1739 struct request_sense sense;
1740 1736
1741 ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x", 1737 ide_debug_log(IDE_DBG_PROBE, "driver_req: %s, media: 0x%x",
1742 drive->driver_req, drive->media); 1738 drive->driver_req, drive->media);
@@ -1785,7 +1781,7 @@ static int ide_cd_probe(ide_drive_t *drive)
1785 goto failed; 1781 goto failed;
1786 } 1782 }
1787 1783
1788 ide_cd_read_toc(drive, &sense); 1784 ide_cd_read_toc(drive);
1789 g->fops = &idecd_ops; 1785 g->fops = &idecd_ops;
1790 g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 1786 g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
1791 device_add_disk(&drive->gendev, g); 1787 device_add_disk(&drive->gendev, g);
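Note the local_sshdr fallback in ide_cd_queue_pc() above: callers that don't care about sense data now pass NULL, and the function redirects the pointer at a local header so its own retry logic can still inspect the decoded sense. A condensed sketch of the pattern, with issue_cmd() as a hypothetical helper:

	struct scsi_sense_hdr local_sshdr;
	int ret;

	if (!sshdr)
		sshdr = &local_sshdr;

	ret = issue_cmd(drive, cmd, sshdr);
	if (ret && sshdr->sense_key == UNIT_ATTENTION)
		cdrom_saw_media_change(drive);
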
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 04f0f310a856..a69dc7f61c4d 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -98,11 +98,11 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *);
98 98
99/* ide-cd.c functions used by ide-cd_ioctl.c */ 99/* ide-cd.c functions used by ide-cd_ioctl.c */
100int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *, 100int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *,
101 unsigned *, struct request_sense *, int, req_flags_t); 101 unsigned *, struct scsi_sense_hdr *, int, req_flags_t);
102int ide_cd_read_toc(ide_drive_t *, struct request_sense *); 102int ide_cd_read_toc(ide_drive_t *);
103int ide_cdrom_get_capabilities(ide_drive_t *, u8 *); 103int ide_cdrom_get_capabilities(ide_drive_t *, u8 *);
104void ide_cdrom_update_speed(ide_drive_t *, u8 *); 104void ide_cdrom_update_speed(ide_drive_t *, u8 *);
105int cdrom_check_status(ide_drive_t *, struct request_sense *); 105int cdrom_check_status(ide_drive_t *, struct scsi_sense_hdr *);
106 106
107/* ide-cd_ioctl.c */ 107/* ide-cd_ioctl.c */
108int ide_cdrom_open_real(struct cdrom_device_info *, int); 108int ide_cdrom_open_real(struct cdrom_device_info *, int);
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index b1322400887b..4a6e1a413ead 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -43,14 +43,14 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
43{ 43{
44 ide_drive_t *drive = cdi->handle; 44 ide_drive_t *drive = cdi->handle;
45 struct media_event_desc med; 45 struct media_event_desc med;
46 struct request_sense sense; 46 struct scsi_sense_hdr sshdr;
47 int stat; 47 int stat;
48 48
49 if (slot_nr != CDSL_CURRENT) 49 if (slot_nr != CDSL_CURRENT)
50 return -EINVAL; 50 return -EINVAL;
51 51
52 stat = cdrom_check_status(drive, &sense); 52 stat = cdrom_check_status(drive, &sshdr);
53 if (!stat || sense.sense_key == UNIT_ATTENTION) 53 if (!stat || sshdr.sense_key == UNIT_ATTENTION)
54 return CDS_DISC_OK; 54 return CDS_DISC_OK;
55 55
56 if (!cdrom_get_media_event(cdi, &med)) { 56 if (!cdrom_get_media_event(cdi, &med)) {
@@ -62,8 +62,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
62 return CDS_NO_DISC; 62 return CDS_NO_DISC;
63 } 63 }
64 64
65 if (sense.sense_key == NOT_READY && sense.asc == 0x04 65 if (sshdr.sense_key == NOT_READY && sshdr.asc == 0x04
66 && sense.ascq == 0x04) 66 && sshdr.ascq == 0x04)
67 return CDS_DISC_OK; 67 return CDS_DISC_OK;
68 68
69 /* 69 /*
@@ -71,8 +71,8 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
71 * just return TRAY_OPEN since ATAPI doesn't provide 71 * just return TRAY_OPEN since ATAPI doesn't provide
72 * any other way to detect this... 72 * any other way to detect this...
73 */ 73 */
74 if (sense.sense_key == NOT_READY) { 74 if (sshdr.sense_key == NOT_READY) {
75 if (sense.asc == 0x3a && sense.ascq == 1) 75 if (sshdr.asc == 0x3a && sshdr.ascq == 1)
76 return CDS_NO_DISC; 76 return CDS_NO_DISC;
77 else 77 else
78 return CDS_TRAY_OPEN; 78 return CDS_TRAY_OPEN;
@@ -105,8 +105,7 @@ unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
105/* Eject the disk if EJECTFLAG is 0. 105/* Eject the disk if EJECTFLAG is 0.
106 If EJECTFLAG is 1, try to reload the disk. */ 106 If EJECTFLAG is 1, try to reload the disk. */
107static 107static
108int cdrom_eject(ide_drive_t *drive, int ejectflag, 108int cdrom_eject(ide_drive_t *drive, int ejectflag)
109 struct request_sense *sense)
110{ 109{
111 struct cdrom_info *cd = drive->driver_data; 110 struct cdrom_info *cd = drive->driver_data;
112 struct cdrom_device_info *cdi = &cd->devinfo; 111 struct cdrom_device_info *cdi = &cd->devinfo;
@@ -129,20 +128,16 @@ int cdrom_eject(ide_drive_t *drive, int ejectflag,
129 cmd[0] = GPCMD_START_STOP_UNIT; 128 cmd[0] = GPCMD_START_STOP_UNIT;
130 cmd[4] = loej | (ejectflag != 0); 129 cmd[4] = loej | (ejectflag != 0);
131 130
132 return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, 0); 131 return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
133} 132}
134 133
135/* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */ 134/* Lock the door if LOCKFLAG is nonzero; unlock it otherwise. */
136static 135static
137int ide_cd_lockdoor(ide_drive_t *drive, int lockflag, 136int ide_cd_lockdoor(ide_drive_t *drive, int lockflag)
138 struct request_sense *sense)
139{ 137{
140 struct request_sense my_sense; 138 struct scsi_sense_hdr sshdr;
141 int stat; 139 int stat;
142 140
143 if (sense == NULL)
144 sense = &my_sense;
145
146 /* If the drive cannot lock the door, just pretend. */ 141 /* If the drive cannot lock the door, just pretend. */
147 if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) { 142 if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) {
148 stat = 0; 143 stat = 0;
@@ -155,14 +150,14 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
155 cmd[4] = lockflag ? 1 : 0; 150 cmd[4] = lockflag ? 1 : 0;
156 151
157 stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, 152 stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL,
158 sense, 0, 0); 153 &sshdr, 0, 0);
159 } 154 }
160 155
161 /* If we got an illegal field error, the drive 156 /* If we got an illegal field error, the drive
162 probably cannot lock the door. */ 157 probably cannot lock the door. */
163 if (stat != 0 && 158 if (stat != 0 &&
164 sense->sense_key == ILLEGAL_REQUEST && 159 sshdr.sense_key == ILLEGAL_REQUEST &&
165 (sense->asc == 0x24 || sense->asc == 0x20)) { 160 (sshdr.asc == 0x24 || sshdr.asc == 0x20)) {
166 printk(KERN_ERR "%s: door locking not supported\n", 161 printk(KERN_ERR "%s: door locking not supported\n",
167 drive->name); 162 drive->name);
168 drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; 163 drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
@@ -170,7 +165,7 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
170 } 165 }
171 166
172 /* no medium, that's alright. */ 167 /* no medium, that's alright. */
173 if (stat != 0 && sense->sense_key == NOT_READY && sense->asc == 0x3a) 168 if (stat != 0 && sshdr.sense_key == NOT_READY && sshdr.asc == 0x3a)
174 stat = 0; 169 stat = 0;
175 170
176 if (stat == 0) { 171 if (stat == 0) {
@@ -186,23 +181,22 @@ int ide_cd_lockdoor(ide_drive_t *drive, int lockflag,
186int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position) 181int ide_cdrom_tray_move(struct cdrom_device_info *cdi, int position)
187{ 182{
188 ide_drive_t *drive = cdi->handle; 183 ide_drive_t *drive = cdi->handle;
189 struct request_sense sense;
190 184
191 if (position) { 185 if (position) {
192 int stat = ide_cd_lockdoor(drive, 0, &sense); 186 int stat = ide_cd_lockdoor(drive, 0);
193 187
194 if (stat) 188 if (stat)
195 return stat; 189 return stat;
196 } 190 }
197 191
198 return cdrom_eject(drive, !position, &sense); 192 return cdrom_eject(drive, !position);
199} 193}
200 194
201int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock) 195int ide_cdrom_lock_door(struct cdrom_device_info *cdi, int lock)
202{ 196{
203 ide_drive_t *drive = cdi->handle; 197 ide_drive_t *drive = cdi->handle;
204 198
205 return ide_cd_lockdoor(drive, lock, NULL); 199 return ide_cd_lockdoor(drive, lock);
206} 200}
207 201
208/* 202/*
@@ -213,7 +207,6 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
213{ 207{
214 ide_drive_t *drive = cdi->handle; 208 ide_drive_t *drive = cdi->handle;
215 struct cdrom_info *cd = drive->driver_data; 209 struct cdrom_info *cd = drive->driver_data;
216 struct request_sense sense;
217 u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE]; 210 u8 buf[ATAPI_CAPABILITIES_PAGE_SIZE];
218 int stat; 211 int stat;
219 unsigned char cmd[BLK_MAX_CDB]; 212 unsigned char cmd[BLK_MAX_CDB];
@@ -236,7 +229,7 @@ int ide_cdrom_select_speed(struct cdrom_device_info *cdi, int speed)
236 cmd[5] = speed & 0xff; 229 cmd[5] = speed & 0xff;
237 } 230 }
238 231
239 stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0); 232 stat = ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
240 233
241 if (!ide_cdrom_get_capabilities(drive, buf)) { 234 if (!ide_cdrom_get_capabilities(drive, buf)) {
242 ide_cdrom_update_speed(drive, buf); 235 ide_cdrom_update_speed(drive, buf);
@@ -252,11 +245,10 @@ int ide_cdrom_get_last_session(struct cdrom_device_info *cdi,
252 struct atapi_toc *toc; 245 struct atapi_toc *toc;
253 ide_drive_t *drive = cdi->handle; 246 ide_drive_t *drive = cdi->handle;
254 struct cdrom_info *info = drive->driver_data; 247 struct cdrom_info *info = drive->driver_data;
255 struct request_sense sense;
256 int ret; 248 int ret;
257 249
258 if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) { 250 if ((drive->atapi_flags & IDE_AFLAG_TOC_VALID) == 0 || !info->toc) {
259 ret = ide_cd_read_toc(drive, &sense); 251 ret = ide_cd_read_toc(drive);
260 if (ret) 252 if (ret)
261 return ret; 253 return ret;
262 } 254 }
@@ -300,7 +292,6 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
300{ 292{
301 ide_drive_t *drive = cdi->handle; 293 ide_drive_t *drive = cdi->handle;
302 struct cdrom_info *cd = drive->driver_data; 294 struct cdrom_info *cd = drive->driver_data;
303 struct request_sense sense;
304 struct request *rq; 295 struct request *rq;
305 int ret; 296 int ret;
306 297
@@ -315,7 +306,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
315 * lock it again. 306 * lock it again.
316 */ 307 */
317 if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED) 308 if (drive->atapi_flags & IDE_AFLAG_DOOR_LOCKED)
318 (void)ide_cd_lockdoor(drive, 1, &sense); 309 (void)ide_cd_lockdoor(drive, 1);
319 310
320 return ret; 311 return ret;
321} 312}
@@ -355,7 +346,6 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
355 struct atapi_toc_entry *first_toc, *last_toc; 346 struct atapi_toc_entry *first_toc, *last_toc;
356 unsigned long lba_start, lba_end; 347 unsigned long lba_start, lba_end;
357 int stat; 348 int stat;
358 struct request_sense sense;
359 unsigned char cmd[BLK_MAX_CDB]; 349 unsigned char cmd[BLK_MAX_CDB];
360 350
361 stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc); 351 stat = ide_cd_get_toc_entry(drive, ti->cdti_trk0, &first_toc);
@@ -380,7 +370,7 @@ static int ide_cd_fake_play_trkind(ide_drive_t *drive, void *arg)
380 lba_to_msf(lba_start, &cmd[3], &cmd[4], &cmd[5]); 370 lba_to_msf(lba_start, &cmd[3], &cmd[4], &cmd[5]);
381 lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]); 371 lba_to_msf(lba_end - 1, &cmd[6], &cmd[7], &cmd[8]);
382 372
383 return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, &sense, 0, 0); 373 return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, NULL, 0, 0);
384} 374}
385 375
386static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg) 376static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
@@ -391,7 +381,7 @@ static int ide_cd_read_tochdr(ide_drive_t *drive, void *arg)
391 int stat; 381 int stat;
392 382
393 /* Make sure our saved TOC is valid. */ 383 /* Make sure our saved TOC is valid. */
394 stat = ide_cd_read_toc(drive, NULL); 384 stat = ide_cd_read_toc(drive);
395 if (stat) 385 if (stat)
396 return stat; 386 return stat;
397 387
@@ -461,8 +451,8 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
461 layer. the packet must be complete, as we do not 451 layer. the packet must be complete, as we do not
462 touch it at all. */ 452 touch it at all. */
463 453
464 if (cgc->sense) 454 if (cgc->sshdr)
465 memset(cgc->sense, 0, sizeof(struct request_sense)); 455 memset(cgc->sshdr, 0, sizeof(*cgc->sshdr));
466 456
467 if (cgc->quiet) 457 if (cgc->quiet)
468 flags |= RQF_QUIET; 458 flags |= RQF_QUIET;
@@ -470,7 +460,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi,
470 cgc->stat = ide_cd_queue_pc(drive, cgc->cmd, 460 cgc->stat = ide_cd_queue_pc(drive, cgc->cmd,
471 cgc->data_direction == CGC_DATA_WRITE, 461 cgc->data_direction == CGC_DATA_WRITE,
472 cgc->buffer, &len, 462 cgc->buffer, &len,
473 cgc->sense, cgc->timeout, flags); 463 cgc->sshdr, cgc->timeout, flags);
474 if (!cgc->stat) 464 if (!cgc->stat)
475 cgc->buflen -= len; 465 cgc->buflen -= len;
476 return cgc->stat; 466 return cgc->stat;
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index ca844a926e6a..130bf163f066 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -311,7 +311,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
311{ 311{
312 domain->sig_type = IB_SIG_TYPE_T10_DIF; 312 domain->sig_type = IB_SIG_TYPE_T10_DIF;
313 domain->sig.dif.pi_interval = scsi_prot_interval(sc); 313 domain->sig.dif.pi_interval = scsi_prot_interval(sc);
314 domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc); 314 domain->sig.dif.ref_tag = t10_pi_ref_tag(sc->request);
315 /* 315 /*
316 * At the moment we hard code those, but in the future 316 * At the moment we hard code those, but in the future
317 * we will take them from sc. 317 * we will take them from sc.
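The iser change takes the T10-PI reference tag from the block layer instead of the SCSI midlayer. t10_pi_ref_tag() derives the initial reference tag from the request's starting position; roughly, it rescales the 512-byte start sector to the integrity interval and truncates to 32 bits. A sketch of the computation (not the verbatim helper):

static inline u32 ref_tag_sketch(struct request *rq)
{
	/* interval_exp is the log2 of the protection interval size in
	 * bytes; subtracting 9 converts from 512-byte sectors. */
	return blk_rq_pos(rq) >>
		(rq->q->integrity.interval_exp - 9) & 0xffffffff;
}
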
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 9c03f35d9df1..439bf90d084d 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -17,23 +17,25 @@ menuconfig NVM
17 17
18if NVM 18if NVM
19 19
20config NVM_DEBUG 20config NVM_PBLK
21 bool "Open-Channel SSD debugging support" 21 tristate "Physical Block Device Open-Channel SSD target"
22 default n 22 help
23 ---help--- 23 Allows an open-channel SSD to be exposed as a block device to the
24 Exposes a debug management interface to create/remove targets at: 24 host. The target assumes the device exposes raw flash and must be
25 explicitly managed by the host.
25 26
26 /sys/module/lnvm/parameters/configure_debug 27 Please note the disk format is considered EXPERIMENTAL for now.
27 28
28 It is required to create/remove targets without IOCTLs. 29if NVM_PBLK
29 30
30config NVM_PBLK 31config NVM_PBLK_DEBUG
31 tristate "Physical Block Device Open-Channel SSD target" 32 bool "PBlk Debug Support"
32 ---help--- 33 default n
33 Allows an open-channel SSD to be exposed as a block device to the 34 help
34 host. The target assumes the device exposes raw flash and must be 35 Enables debug support for pblk. This includes extra checks, more
35 explicitly managed by the host. 36 vocal error messages, and extra tracking fields in the pblk sysfs
37 entries.
36 38
37 Please note the disk format is considered EXPERIMENTAL for now. 39endif # NVM_PBLK
38 40
39endif # NVM 41endif # NVM

diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index b1c6d7eb6115..f565a56b898a 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -27,7 +27,8 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
27 int nr_entries = pblk_get_secs(bio); 27 int nr_entries = pblk_get_secs(bio);
28 int i, ret; 28 int i, ret;
29 29
30 generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0); 30 generic_start_io_acct(q, REQ_OP_WRITE, bio_sectors(bio),
31 &pblk->disk->part0);
31 32
32 /* Update the write buffer head (mem) with the entries that we can 33 /* Update the write buffer head (mem) with the entries that we can
33 * write. The write in itself cannot fail, so there is no need to 34 * write. The write in itself cannot fail, so there is no need to
@@ -67,7 +68,7 @@ retry:
67 68
68 atomic64_add(nr_entries, &pblk->user_wa); 69 atomic64_add(nr_entries, &pblk->user_wa);
69 70
70#ifdef CONFIG_NVM_DEBUG 71#ifdef CONFIG_NVM_PBLK_DEBUG
71 atomic_long_add(nr_entries, &pblk->inflight_writes); 72 atomic_long_add(nr_entries, &pblk->inflight_writes);
72 atomic_long_add(nr_entries, &pblk->req_writes); 73 atomic_long_add(nr_entries, &pblk->req_writes);
73#endif 74#endif
@@ -75,7 +76,7 @@ retry:
75 pblk_rl_inserted(&pblk->rl, nr_entries); 76 pblk_rl_inserted(&pblk->rl, nr_entries);
76 77
77out: 78out:
78 generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time); 79 generic_end_io_acct(q, REQ_OP_WRITE, &pblk->disk->part0, start_time);
79 pblk_write_should_kick(pblk); 80 pblk_write_should_kick(pblk);
80 return ret; 81 return ret;
81} 82}
@@ -123,7 +124,7 @@ retry:
123 124
124 atomic64_add(valid_entries, &pblk->gc_wa); 125 atomic64_add(valid_entries, &pblk->gc_wa);
125 126
126#ifdef CONFIG_NVM_DEBUG 127#ifdef CONFIG_NVM_PBLK_DEBUG
127 atomic_long_add(valid_entries, &pblk->inflight_writes); 128 atomic_long_add(valid_entries, &pblk->inflight_writes);
128 atomic_long_add(valid_entries, &pblk->recov_gc_writes); 129 atomic_long_add(valid_entries, &pblk->recov_gc_writes);
129#endif 130#endif
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index ed9cc977c8b3..00984b486fea 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -35,7 +35,7 @@ static void pblk_line_mark_bb(struct work_struct *work)
35 line = &pblk->lines[pblk_ppa_to_line(*ppa)]; 35 line = &pblk->lines[pblk_ppa_to_line(*ppa)];
36 pos = pblk_ppa_to_pos(&dev->geo, *ppa); 36 pos = pblk_ppa_to_pos(&dev->geo, *ppa);
37 37
38 pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", 38 pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n",
39 line->id, pos); 39 line->id, pos);
40 } 40 }
41 41
@@ -51,12 +51,12 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
51 struct ppa_addr *ppa; 51 struct ppa_addr *ppa;
52 int pos = pblk_ppa_to_pos(geo, ppa_addr); 52 int pos = pblk_ppa_to_pos(geo, ppa_addr);
53 53
54 pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos); 54 pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos);
55 atomic_long_inc(&pblk->erase_failed); 55 atomic_long_inc(&pblk->erase_failed);
56 56
57 atomic_dec(&line->blk_in_line); 57 atomic_dec(&line->blk_in_line);
58 if (test_and_set_bit(pos, line->blk_bitmap)) 58 if (test_and_set_bit(pos, line->blk_bitmap))
59 pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", 59 pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n",
60 line->id, pos); 60 line->id, pos);
61 61
62 /* Not necessary to mark bad blocks on 2.0 spec. */ 62 /* Not necessary to mark bad blocks on 2.0 spec. */
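The pr_err/pr_debug to pblk_err/pblk_debug conversion running through the rest of this file prefixes every message with the pblk instance, so logs from multiple targets can be told apart. The helpers are defined along these lines in pblk.h (a sketch, not the verbatim definitions):

#define pblk_err(pblk, fmt, ...) \
	pr_err("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
#define pblk_debug(pblk, fmt, ...) \
	pr_debug("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
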
@@ -194,7 +194,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
194 u64 paddr; 194 u64 paddr;
195 int line_id; 195 int line_id;
196 196
197#ifdef CONFIG_NVM_DEBUG 197#ifdef CONFIG_NVM_PBLK_DEBUG
198 /* Callers must ensure that the ppa points to a device address */ 198 /* Callers must ensure that the ppa points to a device address */
199 BUG_ON(pblk_addr_in_cache(ppa)); 199 BUG_ON(pblk_addr_in_cache(ppa));
200 BUG_ON(pblk_ppa_empty(ppa)); 200 BUG_ON(pblk_ppa_empty(ppa));
@@ -264,6 +264,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
264 switch (type) { 264 switch (type) {
265 case PBLK_WRITE: 265 case PBLK_WRITE:
266 kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); 266 kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
267 /* fall through */
267 case PBLK_WRITE_INT: 268 case PBLK_WRITE_INT:
268 pool = &pblk->w_rq_pool; 269 pool = &pblk->w_rq_pool;
269 break; 270 break;
@@ -274,7 +275,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
274 pool = &pblk->e_rq_pool; 275 pool = &pblk->e_rq_pool;
275 break; 276 break;
276 default: 277 default:
277 pr_err("pblk: trying to free unknown rqd type\n"); 278 pblk_err(pblk, "trying to free unknown rqd type\n");
278 return; 279 return;
279 } 280 }
280 281
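The /* fall through */ comment added in pblk_free_rqd() marks the PBLK_WRITE case's drop into PBLK_WRITE_INT as deliberate, which is what GCC's -Wimplicit-fallthrough and static checkers look for. The shape of the idiom, with placeholder labels and helpers:

enum { FIRST, SECOND };

static void free_by_type(int type)
{
	switch (type) {
	case FIRST:
		do_extra_cleanup();	/* placeholder helper */
		/* fall through */
	case SECOND:
		do_common_cleanup();	/* placeholder helper */
		break;
	}
}
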
@@ -310,7 +311,7 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
310 311
311 ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); 312 ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
312 if (ret != PBLK_EXPOSED_PAGE_SIZE) { 313 if (ret != PBLK_EXPOSED_PAGE_SIZE) {
313 pr_err("pblk: could not add page to bio\n"); 314 pblk_err(pblk, "could not add page to bio\n");
314 mempool_free(page, &pblk->page_bio_pool); 315 mempool_free(page, &pblk->page_bio_pool);
315 goto err; 316 goto err;
316 } 317 }
@@ -410,7 +411,7 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
410 line->state = PBLK_LINESTATE_CORRUPT; 411 line->state = PBLK_LINESTATE_CORRUPT;
411 line->gc_group = PBLK_LINEGC_NONE; 412 line->gc_group = PBLK_LINEGC_NONE;
412 move_list = &l_mg->corrupt_list; 413 move_list = &l_mg->corrupt_list;
413 pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", 414 pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
414 line->id, vsc, 415 line->id, vsc,
415 line->sec_in_line, 416 line->sec_in_line,
416 lm->high_thrs, lm->mid_thrs); 417 lm->high_thrs, lm->mid_thrs);
@@ -430,7 +431,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio)
430void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) 431void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
431{ 432{
432 atomic_long_inc(&pblk->write_failed); 433 atomic_long_inc(&pblk->write_failed);
433#ifdef CONFIG_NVM_DEBUG 434#ifdef CONFIG_NVM_PBLK_DEBUG
434 pblk_print_failed_rqd(pblk, rqd, rqd->error); 435 pblk_print_failed_rqd(pblk, rqd, rqd->error);
435#endif 436#endif
436} 437}
@@ -452,9 +453,9 @@ void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
452 atomic_long_inc(&pblk->read_failed); 453 atomic_long_inc(&pblk->read_failed);
453 break; 454 break;
454 default: 455 default:
455 pr_err("pblk: unknown read error:%d\n", rqd->error); 456 pblk_err(pblk, "unknown read error:%d\n", rqd->error);
456 } 457 }
457#ifdef CONFIG_NVM_DEBUG 458#ifdef CONFIG_NVM_PBLK_DEBUG
458 pblk_print_failed_rqd(pblk, rqd, rqd->error); 459 pblk_print_failed_rqd(pblk, rqd, rqd->error);
459#endif 460#endif
460} 461}
@@ -470,7 +471,7 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
470 471
471 atomic_inc(&pblk->inflight_io); 472 atomic_inc(&pblk->inflight_io);
472 473
473#ifdef CONFIG_NVM_DEBUG 474#ifdef CONFIG_NVM_PBLK_DEBUG
474 if (pblk_check_io(pblk, rqd)) 475 if (pblk_check_io(pblk, rqd))
475 return NVM_IO_ERR; 476 return NVM_IO_ERR;
476#endif 477#endif
@@ -484,7 +485,7 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
484 485
485 atomic_inc(&pblk->inflight_io); 486 atomic_inc(&pblk->inflight_io);
486 487
487#ifdef CONFIG_NVM_DEBUG 488#ifdef CONFIG_NVM_PBLK_DEBUG
488 if (pblk_check_io(pblk, rqd)) 489 if (pblk_check_io(pblk, rqd))
489 return NVM_IO_ERR; 490 return NVM_IO_ERR;
490#endif 491#endif
@@ -517,7 +518,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
517 for (i = 0; i < nr_secs; i++) { 518 for (i = 0; i < nr_secs; i++) {
518 page = vmalloc_to_page(kaddr); 519 page = vmalloc_to_page(kaddr);
519 if (!page) { 520 if (!page) {
520 pr_err("pblk: could not map vmalloc bio\n"); 521 pblk_err(pblk, "could not map vmalloc bio\n");
521 bio_put(bio); 522 bio_put(bio);
522 bio = ERR_PTR(-ENOMEM); 523 bio = ERR_PTR(-ENOMEM);
523 goto out; 524 goto out;
@@ -525,7 +526,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
525 526
526 ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0); 527 ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
527 if (ret != PAGE_SIZE) { 528 if (ret != PAGE_SIZE) {
528 pr_err("pblk: could not add page to bio\n"); 529 pblk_err(pblk, "could not add page to bio\n");
529 bio_put(bio); 530 bio_put(bio);
530 bio = ERR_PTR(-ENOMEM); 531 bio = ERR_PTR(-ENOMEM);
531 goto out; 532 goto out;
@@ -711,7 +712,7 @@ next_rq:
711 while (test_bit(pos, line->blk_bitmap)) { 712 while (test_bit(pos, line->blk_bitmap)) {
712 paddr += min; 713 paddr += min;
713 if (pblk_boundary_paddr_checks(pblk, paddr)) { 714 if (pblk_boundary_paddr_checks(pblk, paddr)) {
714 pr_err("pblk: corrupt emeta line:%d\n", 715 pblk_err(pblk, "corrupt emeta line:%d\n",
715 line->id); 716 line->id);
716 bio_put(bio); 717 bio_put(bio);
717 ret = -EINTR; 718 ret = -EINTR;
@@ -723,7 +724,7 @@ next_rq:
723 } 724 }
724 725
725 if (pblk_boundary_paddr_checks(pblk, paddr + min)) { 726 if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
726 pr_err("pblk: corrupt emeta line:%d\n", 727 pblk_err(pblk, "corrupt emeta line:%d\n",
727 line->id); 728 line->id);
728 bio_put(bio); 729 bio_put(bio);
729 ret = -EINTR; 730 ret = -EINTR;
@@ -738,7 +739,7 @@ next_rq:
738 739
739 ret = pblk_submit_io_sync(pblk, &rqd); 740 ret = pblk_submit_io_sync(pblk, &rqd);
740 if (ret) { 741 if (ret) {
741 pr_err("pblk: emeta I/O submission failed: %d\n", ret); 742 pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
742 bio_put(bio); 743 bio_put(bio);
743 goto free_rqd_dma; 744 goto free_rqd_dma;
744 } 745 }
@@ -843,7 +844,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
843 */ 844 */
844 ret = pblk_submit_io_sync(pblk, &rqd); 845 ret = pblk_submit_io_sync(pblk, &rqd);
845 if (ret) { 846 if (ret) {
846 pr_err("pblk: smeta I/O submission failed: %d\n", ret); 847 pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
847 bio_put(bio); 848 bio_put(bio);
848 goto free_ppa_list; 849 goto free_ppa_list;
849 } 850 }
@@ -905,7 +906,7 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
905 struct nvm_tgt_dev *dev = pblk->dev; 906 struct nvm_tgt_dev *dev = pblk->dev;
906 struct nvm_geo *geo = &dev->geo; 907 struct nvm_geo *geo = &dev->geo;
907 908
908 pr_err("pblk: could not sync erase line:%d,blk:%d\n", 909 pblk_err(pblk, "could not sync erase line:%d,blk:%d\n",
909 pblk_ppa_to_line(ppa), 910 pblk_ppa_to_line(ppa),
910 pblk_ppa_to_pos(geo, ppa)); 911 pblk_ppa_to_pos(geo, ppa));
911 912
@@ -945,7 +946,7 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
945 946
946 ret = pblk_blk_erase_sync(pblk, ppa); 947 ret = pblk_blk_erase_sync(pblk, ppa);
947 if (ret) { 948 if (ret) {
948 pr_err("pblk: failed to erase line %d\n", line->id); 949 pblk_err(pblk, "failed to erase line %d\n", line->id);
949 return ret; 950 return ret;
950 } 951 }
951 } while (1); 952 } while (1);
@@ -1012,7 +1013,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
1012 list_add_tail(&line->list, &l_mg->bad_list); 1013 list_add_tail(&line->list, &l_mg->bad_list);
1013 spin_unlock(&l_mg->free_lock); 1014 spin_unlock(&l_mg->free_lock);
1014 1015
1015 pr_debug("pblk: line %d is bad\n", line->id); 1016 pblk_debug(pblk, "line %d is bad\n", line->id);
1016 1017
1017 return 0; 1018 return 0;
1018 } 1019 }
@@ -1122,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
1122 line->cur_sec = off + lm->smeta_sec; 1123 line->cur_sec = off + lm->smeta_sec;
1123 1124
1124 if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { 1125 if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
1125 pr_debug("pblk: line smeta I/O failed. Retry\n"); 1126 pblk_debug(pblk, "line smeta I/O failed. Retry\n");
1126 return 0; 1127 return 0;
1127 } 1128 }
1128 1129
@@ -1154,7 +1155,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
1154 spin_unlock(&line->lock); 1155 spin_unlock(&line->lock);
1155 1156
1156 list_add_tail(&line->list, &l_mg->bad_list); 1157 list_add_tail(&line->list, &l_mg->bad_list);
1157 pr_err("pblk: unexpected line %d is bad\n", line->id); 1158 pblk_err(pblk, "unexpected line %d is bad\n", line->id);
1158 1159
1159 return 0; 1160 return 0;
1160 } 1161 }
@@ -1299,7 +1300,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
1299 1300
1300retry: 1301retry:
1301 if (list_empty(&l_mg->free_list)) { 1302 if (list_empty(&l_mg->free_list)) {
1302 pr_err("pblk: no free lines\n"); 1303 pblk_err(pblk, "no free lines\n");
1303 return NULL; 1304 return NULL;
1304 } 1305 }
1305 1306
@@ -1315,7 +1316,7 @@ retry:
1315 1316
1316 list_add_tail(&line->list, &l_mg->bad_list); 1317 list_add_tail(&line->list, &l_mg->bad_list);
1317 1318
1318 pr_debug("pblk: line %d is bad\n", line->id); 1319 pblk_debug(pblk, "line %d is bad\n", line->id);
1319 goto retry; 1320 goto retry;
1320 } 1321 }
1321 1322
@@ -1329,7 +1330,7 @@ retry:
1329 list_add(&line->list, &l_mg->corrupt_list); 1330 list_add(&line->list, &l_mg->corrupt_list);
1330 goto retry; 1331 goto retry;
1331 default: 1332 default:
1332 pr_err("pblk: failed to prepare line %d\n", line->id); 1333 pblk_err(pblk, "failed to prepare line %d\n", line->id);
1333 list_add(&line->list, &l_mg->free_list); 1334 list_add(&line->list, &l_mg->free_list);
1334 l_mg->nr_free_lines++; 1335 l_mg->nr_free_lines++;
1335 return NULL; 1336 return NULL;
@@ -1477,7 +1478,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk)
1477 1478
1478 ret = pblk_submit_meta_io(pblk, line); 1479 ret = pblk_submit_meta_io(pblk, line);
1479 if (ret) { 1480 if (ret) {
1480 pr_err("pblk: sync meta line %d failed (%d)\n", 1481 pblk_err(pblk, "sync meta line %d failed (%d)\n",
1481 line->id, ret); 1482 line->id, ret);
1482 return; 1483 return;
1483 } 1484 }
@@ -1507,7 +1508,7 @@ void __pblk_pipeline_flush(struct pblk *pblk)
1507 1508
1508 ret = pblk_recov_pad(pblk); 1509 ret = pblk_recov_pad(pblk);
1509 if (ret) { 1510 if (ret) {
1510 pr_err("pblk: could not close data on teardown(%d)\n", ret); 1511 pblk_err(pblk, "could not close data on teardown(%d)\n", ret);
1511 return; 1512 return;
1512 } 1513 }
1513 1514
@@ -1687,7 +1688,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
1687 struct nvm_tgt_dev *dev = pblk->dev; 1688 struct nvm_tgt_dev *dev = pblk->dev;
1688 struct nvm_geo *geo = &dev->geo; 1689 struct nvm_geo *geo = &dev->geo;
1689 1690
1690 pr_err("pblk: could not async erase line:%d,blk:%d\n", 1691 pblk_err(pblk, "could not async erase line:%d,blk:%d\n",
1691 pblk_ppa_to_line(ppa), 1692 pblk_ppa_to_line(ppa),
1692 pblk_ppa_to_pos(geo, ppa)); 1693 pblk_ppa_to_pos(geo, ppa));
1693 } 1694 }
@@ -1726,7 +1727,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
1726 struct list_head *move_list; 1727 struct list_head *move_list;
1727 int i; 1728 int i;
1728 1729
1729#ifdef CONFIG_NVM_DEBUG 1730#ifdef CONFIG_NVM_PBLK_DEBUG
1730 WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), 1731 WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line),
1731 "pblk: corrupt closed line %d\n", line->id); 1732 "pblk: corrupt closed line %d\n", line->id);
1732#endif 1733#endif
@@ -1856,7 +1857,7 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
1856 * Only send one inflight I/O per LUN. Since we map at a page 1857 * Only send one inflight I/O per LUN. Since we map at a page
1857 * granularity, all ppas in the I/O will map to the same LUN 1858 * granularity, all ppas in the I/O will map to the same LUN
1858 */ 1859 */
1859#ifdef CONFIG_NVM_DEBUG 1860#ifdef CONFIG_NVM_PBLK_DEBUG
1860 int i; 1861 int i;
1861 1862
1862 for (i = 1; i < nr_ppas; i++) 1863 for (i = 1; i < nr_ppas; i++)
@@ -1866,7 +1867,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
1866 1867
1867 ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); 1868 ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
1868 if (ret == -ETIME || ret == -EINTR) 1869 if (ret == -ETIME || ret == -EINTR)
1869 pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret); 1870 pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
1871 -ret);
1870} 1872}
1871 1873
1872void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) 1874void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
@@ -1901,7 +1903,7 @@ void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
1901 struct pblk_lun *rlun; 1903 struct pblk_lun *rlun;
1902 int pos = pblk_ppa_to_pos(geo, ppa_list[0]); 1904 int pos = pblk_ppa_to_pos(geo, ppa_list[0]);
1903 1905
1904#ifdef CONFIG_NVM_DEBUG 1906#ifdef CONFIG_NVM_PBLK_DEBUG
1905 int i; 1907 int i;
1906 1908
1907 for (i = 1; i < nr_ppas; i++) 1909 for (i = 1; i < nr_ppas; i++)
@@ -1951,7 +1953,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
1951void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) 1953void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
1952{ 1954{
1953 1955
1954#ifdef CONFIG_NVM_DEBUG 1956#ifdef CONFIG_NVM_PBLK_DEBUG
1955 /* Callers must ensure that the ppa points to a cache address */ 1957 /* Callers must ensure that the ppa points to a cache address */
1956 BUG_ON(!pblk_addr_in_cache(ppa)); 1958 BUG_ON(!pblk_addr_in_cache(ppa));
1957 BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); 1959 BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
@@ -1966,7 +1968,7 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new,
1966 struct ppa_addr ppa_l2p, ppa_gc; 1968 struct ppa_addr ppa_l2p, ppa_gc;
1967 int ret = 1; 1969 int ret = 1;
1968 1970
1969#ifdef CONFIG_NVM_DEBUG 1971#ifdef CONFIG_NVM_PBLK_DEBUG
1970 /* Callers must ensure that the ppa points to a cache address */ 1972 /* Callers must ensure that the ppa points to a cache address */
1971 BUG_ON(!pblk_addr_in_cache(ppa_new)); 1973 BUG_ON(!pblk_addr_in_cache(ppa_new));
1972 BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); 1974 BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new)));
@@ -2003,14 +2005,14 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
2003{ 2005{
2004 struct ppa_addr ppa_l2p; 2006 struct ppa_addr ppa_l2p;
2005 2007
2006#ifdef CONFIG_NVM_DEBUG 2008#ifdef CONFIG_NVM_PBLK_DEBUG
2007 /* Callers must ensure that the ppa points to a device address */ 2009 /* Callers must ensure that the ppa points to a device address */
2008 BUG_ON(pblk_addr_in_cache(ppa_mapped)); 2010 BUG_ON(pblk_addr_in_cache(ppa_mapped));
2009#endif 2011#endif
2010 /* Invalidate and discard padded entries */ 2012 /* Invalidate and discard padded entries */
2011 if (lba == ADDR_EMPTY) { 2013 if (lba == ADDR_EMPTY) {
2012 atomic64_inc(&pblk->pad_wa); 2014 atomic64_inc(&pblk->pad_wa);
2013#ifdef CONFIG_NVM_DEBUG 2015#ifdef CONFIG_NVM_PBLK_DEBUG
2014 atomic_long_inc(&pblk->padded_wb); 2016 atomic_long_inc(&pblk->padded_wb);
2015#endif 2017#endif
2016 if (!pblk_ppa_empty(ppa_mapped)) 2018 if (!pblk_ppa_empty(ppa_mapped))
@@ -2036,7 +2038,7 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
2036 goto out; 2038 goto out;
2037 } 2039 }
2038 2040
2039#ifdef CONFIG_NVM_DEBUG 2041#ifdef CONFIG_NVM_PBLK_DEBUG
2040 WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); 2042 WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p));
2041#endif 2043#endif
2042 2044
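
The bulk of the pblk-core.c changes (and of the whole series) swaps bare pr_err()/pr_warn()/pr_info()/pr_debug() calls for pblk_err() and friends, which take the pblk instance as their first argument so that every message is prefixed with the target's disk name instead of an anonymous "pblk:". With several pblk instances on one host, that prefix is what makes the logs attributable. The macro definitions themselves are not in these hunks; a sketch of the likely shape, assuming they live in pblk.h and wrap the generic printk helpers:

	/* Hedged sketch: instance-aware logging wrappers (assumed form) */
	#define pblk_err(pblk, fmt, ...)					\
		pr_err("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
	#define pblk_warn(pblk, fmt, ...)					\
		pr_warn("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
	#define pblk_info(pblk, fmt, ...)					\
		pr_info("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)
	#define pblk_debug(pblk, fmt, ...)					\
		pr_debug("pblk %s: " fmt, (pblk)->disk->disk_name, ##__VA_ARGS__)

This also explains the reflowed call sites: adding the pblk argument pushes some lines past 80 columns, which is why messages such as the LUN-semaphore timeout are now split across two lines.
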
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 080469d90b40..157c2567c9e8 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -90,7 +90,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
90 90
91 gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); 91 gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs));
92 if (!gc_rq->data) { 92 if (!gc_rq->data) {
93 pr_err("pblk: could not GC line:%d (%d/%d)\n", 93 pblk_err(pblk, "could not GC line:%d (%d/%d)\n",
94 line->id, *line->vsc, gc_rq->nr_secs); 94 line->id, *line->vsc, gc_rq->nr_secs);
95 goto out; 95 goto out;
96 } 96 }
@@ -98,7 +98,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
98 /* Read from GC victim block */ 98 /* Read from GC victim block */
99 ret = pblk_submit_read_gc(pblk, gc_rq); 99 ret = pblk_submit_read_gc(pblk, gc_rq);
100 if (ret) { 100 if (ret) {
101 pr_err("pblk: failed GC read in line:%d (err:%d)\n", 101 pblk_err(pblk, "failed GC read in line:%d (err:%d)\n",
102 line->id, ret); 102 line->id, ret);
103 goto out; 103 goto out;
104 } 104 }
@@ -146,7 +146,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
146 146
147 ret = pblk_line_read_emeta(pblk, line, emeta_buf); 147 ret = pblk_line_read_emeta(pblk, line, emeta_buf);
148 if (ret) { 148 if (ret) {
149 pr_err("pblk: line %d read emeta failed (%d)\n", 149 pblk_err(pblk, "line %d read emeta failed (%d)\n",
150 line->id, ret); 150 line->id, ret);
151 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); 151 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
152 return NULL; 152 return NULL;
@@ -160,7 +160,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
160 160
161 ret = pblk_recov_check_emeta(pblk, emeta_buf); 161 ret = pblk_recov_check_emeta(pblk, emeta_buf);
162 if (ret) { 162 if (ret) {
163 pr_err("pblk: inconsistent emeta (line %d)\n", 163 pblk_err(pblk, "inconsistent emeta (line %d)\n",
164 line->id); 164 line->id);
165 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); 165 pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
166 return NULL; 166 return NULL;
@@ -201,7 +201,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
201 } else { 201 } else {
202 lba_list = get_lba_list_from_emeta(pblk, line); 202 lba_list = get_lba_list_from_emeta(pblk, line);
203 if (!lba_list) { 203 if (!lba_list) {
204 pr_err("pblk: could not interpret emeta (line %d)\n", 204 pblk_err(pblk, "could not interpret emeta (line %d)\n",
205 line->id); 205 line->id);
206 goto fail_free_invalid_bitmap; 206 goto fail_free_invalid_bitmap;
207 } 207 }
@@ -213,7 +213,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
213 spin_unlock(&line->lock); 213 spin_unlock(&line->lock);
214 214
215 if (sec_left < 0) { 215 if (sec_left < 0) {
216 pr_err("pblk: corrupted GC line (%d)\n", line->id); 216 pblk_err(pblk, "corrupted GC line (%d)\n", line->id);
217 goto fail_free_lba_list; 217 goto fail_free_lba_list;
218 } 218 }
219 219
@@ -289,7 +289,7 @@ fail_free_ws:
289 kref_put(&line->ref, pblk_line_put); 289 kref_put(&line->ref, pblk_line_put);
290 atomic_dec(&gc->read_inflight_gc); 290 atomic_dec(&gc->read_inflight_gc);
291 291
292 pr_err("pblk: Failed to GC line %d\n", line->id); 292 pblk_err(pblk, "failed to GC line %d\n", line->id);
293} 293}
294 294
295static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) 295static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
@@ -297,7 +297,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
297 struct pblk_gc *gc = &pblk->gc; 297 struct pblk_gc *gc = &pblk->gc;
298 struct pblk_line_ws *line_ws; 298 struct pblk_line_ws *line_ws;
299 299
300 pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); 300 pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id);
301 301
302 line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); 302 line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL);
303 if (!line_ws) 303 if (!line_ws)
@@ -351,7 +351,7 @@ static int pblk_gc_read(struct pblk *pblk)
351 pblk_gc_kick(pblk); 351 pblk_gc_kick(pblk);
352 352
353 if (pblk_gc_line(pblk, line)) 353 if (pblk_gc_line(pblk, line))
354 pr_err("pblk: failed to GC line %d\n", line->id); 354 pblk_err(pblk, "failed to GC line %d\n", line->id);
355 355
356 return 0; 356 return 0;
357} 357}
@@ -522,8 +522,8 @@ static int pblk_gc_reader_ts(void *data)
522 io_schedule(); 522 io_schedule();
523 } 523 }
524 524
525#ifdef CONFIG_NVM_DEBUG 525#ifdef CONFIG_NVM_PBLK_DEBUG
526 pr_info("pblk: flushing gc pipeline, %d lines left\n", 526 pblk_info(pblk, "flushing gc pipeline, %d lines left\n",
527 atomic_read(&gc->pipeline_gc)); 527 atomic_read(&gc->pipeline_gc));
528#endif 528#endif
529 529
@@ -540,7 +540,7 @@ static int pblk_gc_reader_ts(void *data)
540static void pblk_gc_start(struct pblk *pblk) 540static void pblk_gc_start(struct pblk *pblk)
541{ 541{
542 pblk->gc.gc_active = 1; 542 pblk->gc.gc_active = 1;
543 pr_debug("pblk: gc start\n"); 543 pblk_debug(pblk, "gc start\n");
544} 544}
545 545
546void pblk_gc_should_start(struct pblk *pblk) 546void pblk_gc_should_start(struct pblk *pblk)
@@ -605,14 +605,14 @@ int pblk_gc_init(struct pblk *pblk)
605 605
606 gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); 606 gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
607 if (IS_ERR(gc->gc_ts)) { 607 if (IS_ERR(gc->gc_ts)) {
608 pr_err("pblk: could not allocate GC main kthread\n"); 608 pblk_err(pblk, "could not allocate GC main kthread\n");
609 return PTR_ERR(gc->gc_ts); 609 return PTR_ERR(gc->gc_ts);
610 } 610 }
611 611
612 gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, 612 gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
613 "pblk-gc-writer-ts"); 613 "pblk-gc-writer-ts");
614 if (IS_ERR(gc->gc_writer_ts)) { 614 if (IS_ERR(gc->gc_writer_ts)) {
615 pr_err("pblk: could not allocate GC writer kthread\n"); 615 pblk_err(pblk, "could not allocate GC writer kthread\n");
616 ret = PTR_ERR(gc->gc_writer_ts); 616 ret = PTR_ERR(gc->gc_writer_ts);
617 goto fail_free_main_kthread; 617 goto fail_free_main_kthread;
618 } 618 }
@@ -620,7 +620,7 @@ int pblk_gc_init(struct pblk *pblk)
620 gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, 620 gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk,
621 "pblk-gc-reader-ts"); 621 "pblk-gc-reader-ts");
622 if (IS_ERR(gc->gc_reader_ts)) { 622 if (IS_ERR(gc->gc_reader_ts)) {
623 pr_err("pblk: could not allocate GC reader kthread\n"); 623 pblk_err(pblk, "could not allocate GC reader kthread\n");
624 ret = PTR_ERR(gc->gc_reader_ts); 624 ret = PTR_ERR(gc->gc_reader_ts);
625 goto fail_free_writer_kthread; 625 goto fail_free_writer_kthread;
626 } 626 }
@@ -641,7 +641,7 @@ int pblk_gc_init(struct pblk *pblk)
641 gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", 641 gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq",
642 WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); 642 WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS);
643 if (!gc->gc_line_reader_wq) { 643 if (!gc->gc_line_reader_wq) {
644 pr_err("pblk: could not allocate GC line reader workqueue\n"); 644 pblk_err(pblk, "could not allocate GC line reader workqueue\n");
645 ret = -ENOMEM; 645 ret = -ENOMEM;
646 goto fail_free_reader_kthread; 646 goto fail_free_reader_kthread;
647 } 647 }
@@ -650,7 +650,7 @@ int pblk_gc_init(struct pblk *pblk)
650 gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", 650 gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq",
651 WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 651 WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
652 if (!gc->gc_reader_wq) { 652 if (!gc->gc_reader_wq) {
653 pr_err("pblk: could not allocate GC reader workqueue\n"); 653 pblk_err(pblk, "could not allocate GC reader workqueue\n");
654 ret = -ENOMEM; 654 ret = -ENOMEM;
655 goto fail_free_reader_line_wq; 655 goto fail_free_reader_line_wq;
656 } 656 }
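
pblk_gc_init() above follows the standard kernel init idiom: each kthread_create()/alloc_workqueue() step is checked immediately, and on failure the function unwinds every earlier step through a chain of goto labels in reverse order. A minimal, self-contained sketch of that pattern (all names hypothetical, not taken from the patch):

	#include <linux/kthread.h>
	#include <linux/workqueue.h>
	#include <linux/delay.h>
	#include <linux/err.h>

	struct example {
		struct task_struct *ts;
		struct workqueue_struct *wq;
	};

	static int example_ts_fn(void *data)
	{
		while (!kthread_should_stop())
			msleep(100);	/* placeholder work loop */
		return 0;
	}

	static int example_init(struct example *ex)
	{
		int ret;

		ex->ts = kthread_create(example_ts_fn, ex, "example-ts");
		if (IS_ERR(ex->ts))
			return PTR_ERR(ex->ts);	/* nothing to unwind yet */

		ex->wq = alloc_workqueue("example-wq", WQ_MEM_RECLAIM, 1);
		if (!ex->wq) {
			ret = -ENOMEM;
			goto fail_stop_ts;	/* unwind in reverse order */
		}

		return 0;

	fail_stop_ts:
		kthread_stop(ex->ts);
		return ret;
	}
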
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index b57f764d6a16..537e98f2b24a 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -91,7 +91,7 @@ static size_t pblk_trans_map_size(struct pblk *pblk)
91 return entry_size * pblk->rl.nr_secs; 91 return entry_size * pblk->rl.nr_secs;
92} 92}
93 93
94#ifdef CONFIG_NVM_DEBUG 94#ifdef CONFIG_NVM_PBLK_DEBUG
95static u32 pblk_l2p_crc(struct pblk *pblk) 95static u32 pblk_l2p_crc(struct pblk *pblk)
96{ 96{
97 size_t map_size; 97 size_t map_size;
@@ -117,13 +117,13 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
117 } else { 117 } else {
118 line = pblk_recov_l2p(pblk); 118 line = pblk_recov_l2p(pblk);
119 if (IS_ERR(line)) { 119 if (IS_ERR(line)) {
120 pr_err("pblk: could not recover l2p table\n"); 120 pblk_err(pblk, "could not recover l2p table\n");
121 return -EFAULT; 121 return -EFAULT;
122 } 122 }
123 } 123 }
124 124
125#ifdef CONFIG_NVM_DEBUG 125#ifdef CONFIG_NVM_PBLK_DEBUG
126 pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); 126 pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
127#endif 127#endif
128 128
129 /* Free full lines directly as GC has not been started yet */ 129 /* Free full lines directly as GC has not been started yet */
@@ -166,7 +166,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
166static void pblk_rwb_free(struct pblk *pblk) 166static void pblk_rwb_free(struct pblk *pblk)
167{ 167{
168 if (pblk_rb_tear_down_check(&pblk->rwb)) 168 if (pblk_rb_tear_down_check(&pblk->rwb))
169 pr_err("pblk: write buffer error on tear down\n"); 169 pblk_err(pblk, "write buffer error on tear down\n");
170 170
171 pblk_rb_data_free(&pblk->rwb); 171 pblk_rb_data_free(&pblk->rwb);
172 vfree(pblk_rb_entries_ref(&pblk->rwb)); 172 vfree(pblk_rb_entries_ref(&pblk->rwb));
@@ -179,11 +179,14 @@ static int pblk_rwb_init(struct pblk *pblk)
179 struct pblk_rb_entry *entries; 179 struct pblk_rb_entry *entries;
180 unsigned long nr_entries, buffer_size; 180 unsigned long nr_entries, buffer_size;
181 unsigned int power_size, power_seg_sz; 181 unsigned int power_size, power_seg_sz;
182 int pgs_in_buffer;
182 183
183 if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer)) 184 pgs_in_buffer = max(geo->mw_cunits, geo->ws_opt) * geo->all_luns;
185
186 if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
184 buffer_size = write_buffer_size; 187 buffer_size = write_buffer_size;
185 else 188 else
186 buffer_size = pblk->pgs_in_buffer; 189 buffer_size = pgs_in_buffer;
187 190
188 nr_entries = pblk_rb_calculate_size(buffer_size); 191 nr_entries = pblk_rb_calculate_size(buffer_size);
189 192
@@ -200,7 +203,8 @@ static int pblk_rwb_init(struct pblk *pblk)
200/* Minimum pages needed within a lun */ 203/* Minimum pages needed within a lun */
201#define ADDR_POOL_SIZE 64 204#define ADDR_POOL_SIZE 64
202 205
203static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst) 206static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
207 struct nvm_addrf_12 *dst)
204{ 208{
205 struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; 209 struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
206 int power_len; 210 int power_len;
@@ -208,14 +212,14 @@ static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
208 /* Re-calculate channel and lun format to adapt to configuration */ 212 /* Re-calculate channel and lun format to adapt to configuration */
209 power_len = get_count_order(geo->num_ch); 213 power_len = get_count_order(geo->num_ch);
210 if (1 << power_len != geo->num_ch) { 214 if (1 << power_len != geo->num_ch) {
211 pr_err("pblk: supports only power-of-two channel config.\n"); 215 pblk_err(pblk, "supports only power-of-two channel config.\n");
212 return -EINVAL; 216 return -EINVAL;
213 } 217 }
214 dst->ch_len = power_len; 218 dst->ch_len = power_len;
215 219
216 power_len = get_count_order(geo->num_lun); 220 power_len = get_count_order(geo->num_lun);
217 if (1 << power_len != geo->num_lun) { 221 if (1 << power_len != geo->num_lun) {
218 pr_err("pblk: supports only power-of-two LUN config.\n"); 222 pblk_err(pblk, "supports only power-of-two LUN config.\n");
219 return -EINVAL; 223 return -EINVAL;
220 } 224 }
221 dst->lun_len = power_len; 225 dst->lun_len = power_len;
@@ -282,18 +286,19 @@ static int pblk_set_addrf(struct pblk *pblk)
282 case NVM_OCSSD_SPEC_12: 286 case NVM_OCSSD_SPEC_12:
283 div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); 287 div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
284 if (mod) { 288 if (mod) {
285 pr_err("pblk: bad configuration of sectors/pages\n"); 289 pblk_err(pblk, "bad configuration of sectors/pages\n");
286 return -EINVAL; 290 return -EINVAL;
287 } 291 }
288 292
289 pblk->addrf_len = pblk_set_addrf_12(geo, (void *)&pblk->addrf); 293 pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
294 (void *)&pblk->addrf);
290 break; 295 break;
291 case NVM_OCSSD_SPEC_20: 296 case NVM_OCSSD_SPEC_20:
292 pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, 297 pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
293 &pblk->uaddrf); 298 &pblk->uaddrf);
294 break; 299 break;
295 default: 300 default:
296 pr_err("pblk: OCSSD revision not supported (%d)\n", 301 pblk_err(pblk, "OCSSD revision not supported (%d)\n",
297 geo->version); 302 geo->version);
298 return -EINVAL; 303 return -EINVAL;
299 } 304 }
@@ -366,15 +371,13 @@ static int pblk_core_init(struct pblk *pblk)
366 atomic64_set(&pblk->nr_flush, 0); 371 atomic64_set(&pblk->nr_flush, 0);
367 pblk->nr_flush_rst = 0; 372 pblk->nr_flush_rst = 0;
368 373
369 pblk->pgs_in_buffer = geo->mw_cunits * geo->all_luns;
370
371 pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE); 374 pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
372 max_write_ppas = pblk->min_write_pgs * geo->all_luns; 375 max_write_ppas = pblk->min_write_pgs * geo->all_luns;
373 pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); 376 pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
374 pblk_set_sec_per_write(pblk, pblk->min_write_pgs); 377 pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
375 378
376 if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { 379 if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
377 pr_err("pblk: vector list too big(%u > %u)\n", 380 pblk_err(pblk, "vector list too big(%u > %u)\n",
378 pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); 381 pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
379 return -EINVAL; 382 return -EINVAL;
380 } 383 }
@@ -607,7 +610,7 @@ static int pblk_luns_init(struct pblk *pblk)
607 610
608 /* TODO: Implement unbalanced LUN support */ 611 /* TODO: Implement unbalanced LUN support */
609 if (geo->num_lun < 0) { 612 if (geo->num_lun < 0) {
610 pr_err("pblk: unbalanced LUN config.\n"); 613 pblk_err(pblk, "unbalanced LUN config.\n");
611 return -EINVAL; 614 return -EINVAL;
612 } 615 }
613 616
@@ -716,10 +719,11 @@ static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
716 719
717 /* 720 /*
718 * In 1.2 spec. chunk state is not persisted by the device. Thus 721 * In 1.2 spec. chunk state is not persisted by the device. Thus
719 * some of the values are reset each time pblk is instantiated. 722 * some of the values are reset each time pblk is instantiated,
723 * so we have to assume that the block is closed.
720 */ 724 */
721 if (lun_bb_meta[line->id] == NVM_BLK_T_FREE) 725 if (lun_bb_meta[line->id] == NVM_BLK_T_FREE)
722 chunk->state = NVM_CHK_ST_FREE; 726 chunk->state = NVM_CHK_ST_CLOSED;
723 else 727 else
724 chunk->state = NVM_CHK_ST_OFFLINE; 728 chunk->state = NVM_CHK_ST_OFFLINE;
725 729
@@ -1026,7 +1030,7 @@ add_emeta_page:
1026 lm->emeta_sec[0], geo->clba); 1030 lm->emeta_sec[0], geo->clba);
1027 1031
1028 if (lm->min_blk_line > lm->blk_per_line) { 1032 if (lm->min_blk_line > lm->blk_per_line) {
1029 pr_err("pblk: config. not supported. Min. LUN in line:%d\n", 1033 pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
1030 lm->blk_per_line); 1034 lm->blk_per_line);
1031 return -EINVAL; 1035 return -EINVAL;
1032 } 1036 }
@@ -1078,7 +1082,7 @@ static int pblk_lines_init(struct pblk *pblk)
1078 } 1082 }
1079 1083
1080 if (!nr_free_chks) { 1084 if (!nr_free_chks) {
1081 pr_err("pblk: too many bad blocks prevent for sane instance\n"); 1085 pblk_err(pblk, "too many bad blocks prevent for sane instance\n");
1082 return -EINTR; 1086 return -EINTR;
1083 } 1087 }
1084 1088
@@ -1108,7 +1112,7 @@ static int pblk_writer_init(struct pblk *pblk)
1108 int err = PTR_ERR(pblk->writer_ts); 1112 int err = PTR_ERR(pblk->writer_ts);
1109 1113
1110 if (err != -EINTR) 1114 if (err != -EINTR)
1111 pr_err("pblk: could not allocate writer kthread (%d)\n", 1115 pblk_err(pblk, "could not allocate writer kthread (%d)\n",
1112 err); 1116 err);
1113 return err; 1117 return err;
1114 } 1118 }
@@ -1154,7 +1158,7 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful)
1154 pblk_rb_sync_l2p(&pblk->rwb); 1158 pblk_rb_sync_l2p(&pblk->rwb);
1155 pblk_rl_free(&pblk->rl); 1159 pblk_rl_free(&pblk->rl);
1156 1160
1157 pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful); 1161 pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
1158} 1162}
1159 1163
1160static void pblk_exit(void *private, bool graceful) 1164static void pblk_exit(void *private, bool graceful)
@@ -1165,8 +1169,8 @@ static void pblk_exit(void *private, bool graceful)
1165 pblk_gc_exit(pblk, graceful); 1169 pblk_gc_exit(pblk, graceful);
1166 pblk_tear_down(pblk, graceful); 1170 pblk_tear_down(pblk, graceful);
1167 1171
1168#ifdef CONFIG_NVM_DEBUG 1172#ifdef CONFIG_NVM_PBLK_DEBUG
1169 pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); 1173 pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
1170#endif 1174#endif
1171 1175
1172 pblk_free(pblk); 1176 pblk_free(pblk);
@@ -1189,34 +1193,35 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1189 struct pblk *pblk; 1193 struct pblk *pblk;
1190 int ret; 1194 int ret;
1191 1195
1192 /* pblk supports 1.2 and 2.0 versions */ 1196 pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
1197 if (!pblk)
1198 return ERR_PTR(-ENOMEM);
1199
1200 pblk->dev = dev;
1201 pblk->disk = tdisk;
1202 pblk->state = PBLK_STATE_RUNNING;
1203 pblk->gc.gc_enabled = 0;
1204
1193 if (!(geo->version == NVM_OCSSD_SPEC_12 || 1205 if (!(geo->version == NVM_OCSSD_SPEC_12 ||
1194 geo->version == NVM_OCSSD_SPEC_20)) { 1206 geo->version == NVM_OCSSD_SPEC_20)) {
1195 pr_err("pblk: OCSSD version not supported (%u)\n", 1207 pblk_err(pblk, "OCSSD version not supported (%u)\n",
1196 geo->version); 1208 geo->version);
1209 kfree(pblk);
1197 return ERR_PTR(-EINVAL); 1210 return ERR_PTR(-EINVAL);
1198 } 1211 }
1199 1212
1200 if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) { 1213 if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
1201 pr_err("pblk: host-side L2P table not supported. (%x)\n", 1214 pblk_err(pblk, "host-side L2P table not supported. (%x)\n",
1202 geo->dom); 1215 geo->dom);
1216 kfree(pblk);
1203 return ERR_PTR(-EINVAL); 1217 return ERR_PTR(-EINVAL);
1204 } 1218 }
1205 1219
1206 pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
1207 if (!pblk)
1208 return ERR_PTR(-ENOMEM);
1209
1210 pblk->dev = dev;
1211 pblk->disk = tdisk;
1212 pblk->state = PBLK_STATE_RUNNING;
1213 pblk->gc.gc_enabled = 0;
1214
1215 spin_lock_init(&pblk->resubmit_lock); 1220 spin_lock_init(&pblk->resubmit_lock);
1216 spin_lock_init(&pblk->trans_lock); 1221 spin_lock_init(&pblk->trans_lock);
1217 spin_lock_init(&pblk->lock); 1222 spin_lock_init(&pblk->lock);
1218 1223
1219#ifdef CONFIG_NVM_DEBUG 1224#ifdef CONFIG_NVM_PBLK_DEBUG
1220 atomic_long_set(&pblk->inflight_writes, 0); 1225 atomic_long_set(&pblk->inflight_writes, 0);
1221 atomic_long_set(&pblk->padded_writes, 0); 1226 atomic_long_set(&pblk->padded_writes, 0);
1222 atomic_long_set(&pblk->padded_wb, 0); 1227 atomic_long_set(&pblk->padded_wb, 0);
@@ -1241,38 +1246,38 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1241 1246
1242 ret = pblk_core_init(pblk); 1247 ret = pblk_core_init(pblk);
1243 if (ret) { 1248 if (ret) {
1244 pr_err("pblk: could not initialize core\n"); 1249 pblk_err(pblk, "could not initialize core\n");
1245 goto fail; 1250 goto fail;
1246 } 1251 }
1247 1252
1248 ret = pblk_lines_init(pblk); 1253 ret = pblk_lines_init(pblk);
1249 if (ret) { 1254 if (ret) {
1250 pr_err("pblk: could not initialize lines\n"); 1255 pblk_err(pblk, "could not initialize lines\n");
1251 goto fail_free_core; 1256 goto fail_free_core;
1252 } 1257 }
1253 1258
1254 ret = pblk_rwb_init(pblk); 1259 ret = pblk_rwb_init(pblk);
1255 if (ret) { 1260 if (ret) {
1256 pr_err("pblk: could not initialize write buffer\n"); 1261 pblk_err(pblk, "could not initialize write buffer\n");
1257 goto fail_free_lines; 1262 goto fail_free_lines;
1258 } 1263 }
1259 1264
1260 ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); 1265 ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
1261 if (ret) { 1266 if (ret) {
1262 pr_err("pblk: could not initialize maps\n"); 1267 pblk_err(pblk, "could not initialize maps\n");
1263 goto fail_free_rwb; 1268 goto fail_free_rwb;
1264 } 1269 }
1265 1270
1266 ret = pblk_writer_init(pblk); 1271 ret = pblk_writer_init(pblk);
1267 if (ret) { 1272 if (ret) {
1268 if (ret != -EINTR) 1273 if (ret != -EINTR)
1269 pr_err("pblk: could not initialize write thread\n"); 1274 pblk_err(pblk, "could not initialize write thread\n");
1270 goto fail_free_l2p; 1275 goto fail_free_l2p;
1271 } 1276 }
1272 1277
1273 ret = pblk_gc_init(pblk); 1278 ret = pblk_gc_init(pblk);
1274 if (ret) { 1279 if (ret) {
1275 pr_err("pblk: could not initialize gc\n"); 1280 pblk_err(pblk, "could not initialize gc\n");
1276 goto fail_stop_writer; 1281 goto fail_stop_writer;
1277 } 1282 }
1278 1283
@@ -1287,8 +1292,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
1287 blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); 1292 blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
1288 blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); 1293 blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);
1289 1294
1290 pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n", 1295 pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
1291 tdisk->disk_name,
1292 geo->all_luns, pblk->l_mg.nr_lines, 1296 geo->all_luns, pblk->l_mg.nr_lines,
1293 (unsigned long long)pblk->rl.nr_secs, 1297 (unsigned long long)pblk->rl.nr_secs,
1294 pblk->rwb.nr_entries); 1298 pblk->rwb.nr_entries);
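
A detail worth noting in pblk_init(): the kzalloc() of struct pblk moves ahead of the geometry validation, because the new pblk_err() calls need a live instance in order to print its disk name. The trade-off is visible in the same hunk: each early validation failure must now kfree(pblk) before returning, where the old order could simply bail out. A sketch of the reordered flow (geometry_is_supported() is a hypothetical stand-in for the two version/dom checks):

	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
	if (!pblk)
		return ERR_PTR(-ENOMEM);

	pblk->dev = dev;
	pblk->disk = tdisk;	/* needed before pblk_err() can run */

	if (!geometry_is_supported(geo)) {	/* hypothetical helper */
		pblk_err(pblk, "OCSSD version not supported (%u)\n",
			 geo->version);
		kfree(pblk);	/* undo the now-earlier allocation */
		return ERR_PTR(-EINVAL);
	}

The same file also drops the cached pblk->pgs_in_buffer field: pblk_rwb_init() now derives the buffer size locally as max(geo->mw_cunits, geo->ws_opt) * geo->all_luns, which removes a struct field and also accounts for devices where ws_opt exceeds mw_cunits.
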
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 55e9442a99e2..f6eec0212dfc 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -111,7 +111,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
111 } while (iter > 0); 111 } while (iter > 0);
112 up_write(&pblk_rb_lock); 112 up_write(&pblk_rb_lock);
113 113
114#ifdef CONFIG_NVM_DEBUG 114#ifdef CONFIG_NVM_PBLK_DEBUG
115 atomic_set(&rb->inflight_flush_point, 0); 115 atomic_set(&rb->inflight_flush_point, 0);
116#endif 116#endif
117 117
@@ -308,7 +308,7 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
308 308
309 entry = &rb->entries[ring_pos]; 309 entry = &rb->entries[ring_pos];
310 flags = READ_ONCE(entry->w_ctx.flags); 310 flags = READ_ONCE(entry->w_ctx.flags);
311#ifdef CONFIG_NVM_DEBUG 311#ifdef CONFIG_NVM_PBLK_DEBUG
312 /* Caller must guarantee that the entry is free */ 312 /* Caller must guarantee that the entry is free */
313 BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); 313 BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
314#endif 314#endif
@@ -332,7 +332,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
332 332
333 entry = &rb->entries[ring_pos]; 333 entry = &rb->entries[ring_pos];
334 flags = READ_ONCE(entry->w_ctx.flags); 334 flags = READ_ONCE(entry->w_ctx.flags);
335#ifdef CONFIG_NVM_DEBUG 335#ifdef CONFIG_NVM_PBLK_DEBUG
336 /* Caller must guarantee that the entry is free */ 336 /* Caller must guarantee that the entry is free */
337 BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); 337 BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
338#endif 338#endif
@@ -362,7 +362,7 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
362 return 0; 362 return 0;
363 } 363 }
364 364
365#ifdef CONFIG_NVM_DEBUG 365#ifdef CONFIG_NVM_PBLK_DEBUG
366 atomic_inc(&rb->inflight_flush_point); 366 atomic_inc(&rb->inflight_flush_point);
367#endif 367#endif
368 368
@@ -547,7 +547,7 @@ try:
547 547
548 page = virt_to_page(entry->data); 548 page = virt_to_page(entry->data);
549 if (!page) { 549 if (!page) {
550 pr_err("pblk: could not allocate write bio page\n"); 550 pblk_err(pblk, "could not allocate write bio page\n");
551 flags &= ~PBLK_WRITTEN_DATA; 551 flags &= ~PBLK_WRITTEN_DATA;
552 flags |= PBLK_SUBMITTED_ENTRY; 552 flags |= PBLK_SUBMITTED_ENTRY;
553 /* Release flags on context. Protect from writes */ 553 /* Release flags on context. Protect from writes */
@@ -557,7 +557,7 @@ try:
557 557
558 if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != 558 if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) !=
559 rb->seg_size) { 559 rb->seg_size) {
560 pr_err("pblk: could not add page to write bio\n"); 560 pblk_err(pblk, "could not add page to write bio\n");
561 flags &= ~PBLK_WRITTEN_DATA; 561 flags &= ~PBLK_WRITTEN_DATA;
562 flags |= PBLK_SUBMITTED_ENTRY; 562 flags |= PBLK_SUBMITTED_ENTRY;
563 /* Release flags on context. Protect from writes */ 563 /* Release flags on context. Protect from writes */
@@ -576,19 +576,19 @@ try:
576 576
577 if (pad) { 577 if (pad) {
578 if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { 578 if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) {
579 pr_err("pblk: could not pad page in write bio\n"); 579 pblk_err(pblk, "could not pad page in write bio\n");
580 return NVM_IO_ERR; 580 return NVM_IO_ERR;
581 } 581 }
582 582
583 if (pad < pblk->min_write_pgs) 583 if (pad < pblk->min_write_pgs)
584 atomic64_inc(&pblk->pad_dist[pad - 1]); 584 atomic64_inc(&pblk->pad_dist[pad - 1]);
585 else 585 else
586 pr_warn("pblk: padding more than min. sectors\n"); 586 pblk_warn(pblk, "padding more than min. sectors\n");
587 587
588 atomic64_add(pad, &pblk->pad_wa); 588 atomic64_add(pad, &pblk->pad_wa);
589 } 589 }
590 590
591#ifdef CONFIG_NVM_DEBUG 591#ifdef CONFIG_NVM_PBLK_DEBUG
592 atomic_long_add(pad, &pblk->padded_writes); 592 atomic_long_add(pad, &pblk->padded_writes);
593#endif 593#endif
594 594
@@ -613,7 +613,7 @@ int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
613 int ret = 1; 613 int ret = 1;
614 614
615 615
616#ifdef CONFIG_NVM_DEBUG 616#ifdef CONFIG_NVM_PBLK_DEBUG
617 /* Caller must ensure that the access will not cause an overflow */ 617 /* Caller must ensure that the access will not cause an overflow */
618 BUG_ON(pos >= rb->nr_entries); 618 BUG_ON(pos >= rb->nr_entries);
619#endif 619#endif
@@ -820,7 +820,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
820 rb->subm, 820 rb->subm,
821 rb->sync, 821 rb->sync,
822 rb->l2p_update, 822 rb->l2p_update,
823#ifdef CONFIG_NVM_DEBUG 823#ifdef CONFIG_NVM_PBLK_DEBUG
824 atomic_read(&rb->inflight_flush_point), 824 atomic_read(&rb->inflight_flush_point),
825#else 825#else
826 0, 826 0,
@@ -838,7 +838,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
838 rb->subm, 838 rb->subm,
839 rb->sync, 839 rb->sync,
840 rb->l2p_update, 840 rb->l2p_update,
841#ifdef CONFIG_NVM_DEBUG 841#ifdef CONFIG_NVM_PBLK_DEBUG
842 atomic_read(&rb->inflight_flush_point), 842 atomic_read(&rb->inflight_flush_point),
843#else 843#else
844 0, 844 0,
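
The pblk-rb.c hunks are all the same mechanical rename, but they show how these debug counters stay zero-cost in production: the increment sites vanish under the new CONFIG_NVM_PBLK_DEBUG guard, and readers such as pblk_rb_sysfs() substitute a literal 0 when the field does not exist. Condensed from the pattern above:

	#ifdef CONFIG_NVM_PBLK_DEBUG
		atomic_inc(&rb->inflight_flush_point);	/* debug builds only */
	#endif

		/* sysfs reader: report the counter, or 0 when compiled out */
		sz = snprintf(buf, PAGE_SIZE, "flush points: %u\n",
	#ifdef CONFIG_NVM_PBLK_DEBUG
			      atomic_read(&rb->inflight_flush_point));
	#else
			      0);
	#endif
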
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 18694694e5f0..5a46d7f9302f 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -28,7 +28,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
28 sector_t lba, struct ppa_addr ppa, 28 sector_t lba, struct ppa_addr ppa,
29 int bio_iter, bool advanced_bio) 29 int bio_iter, bool advanced_bio)
30{ 30{
31#ifdef CONFIG_NVM_DEBUG 31#ifdef CONFIG_NVM_PBLK_DEBUG
32 /* Callers must ensure that the ppa points to a cache address */ 32 /* Callers must ensure that the ppa points to a cache address */
33 BUG_ON(pblk_ppa_empty(ppa)); 33 BUG_ON(pblk_ppa_empty(ppa));
34 BUG_ON(!pblk_addr_in_cache(ppa)); 34 BUG_ON(!pblk_addr_in_cache(ppa));
@@ -79,7 +79,7 @@ retry:
79 WARN_ON(test_and_set_bit(i, read_bitmap)); 79 WARN_ON(test_and_set_bit(i, read_bitmap));
80 meta_list[i].lba = cpu_to_le64(lba); 80 meta_list[i].lba = cpu_to_le64(lba);
81 advanced_bio = true; 81 advanced_bio = true;
82#ifdef CONFIG_NVM_DEBUG 82#ifdef CONFIG_NVM_PBLK_DEBUG
83 atomic_long_inc(&pblk->cache_reads); 83 atomic_long_inc(&pblk->cache_reads);
84#endif 84#endif
85 } else { 85 } else {
@@ -97,7 +97,7 @@ next:
97 else 97 else
98 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); 98 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
99 99
100#ifdef CONFIG_NVM_DEBUG 100#ifdef CONFIG_NVM_PBLK_DEBUG
101 atomic_long_add(nr_secs, &pblk->inflight_reads); 101 atomic_long_add(nr_secs, &pblk->inflight_reads);
102#endif 102#endif
103} 103}
@@ -117,13 +117,13 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
117 continue; 117 continue;
118 118
119 if (lba != blba + i) { 119 if (lba != blba + i) {
120#ifdef CONFIG_NVM_DEBUG 120#ifdef CONFIG_NVM_PBLK_DEBUG
121 struct ppa_addr *p; 121 struct ppa_addr *p;
122 122
123 p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr; 123 p = (nr_lbas == 1) ? &rqd->ppa_list[i] : &rqd->ppa_addr;
124 print_ppa(&pblk->dev->geo, p, "seq", i); 124 print_ppa(pblk, p, "seq", i);
125#endif 125#endif
126 pr_err("pblk: corrupted read LBA (%llu/%llu)\n", 126 pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
127 lba, (u64)blba + i); 127 lba, (u64)blba + i);
128 WARN_ON(1); 128 WARN_ON(1);
129 } 129 }
@@ -149,14 +149,14 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
149 meta_lba = le64_to_cpu(meta_lba_list[j].lba); 149 meta_lba = le64_to_cpu(meta_lba_list[j].lba);
150 150
151 if (lba != meta_lba) { 151 if (lba != meta_lba) {
152#ifdef CONFIG_NVM_DEBUG 152#ifdef CONFIG_NVM_PBLK_DEBUG
153 struct ppa_addr *p; 153 struct ppa_addr *p;
154 int nr_ppas = rqd->nr_ppas; 154 int nr_ppas = rqd->nr_ppas;
155 155
156 p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr; 156 p = (nr_ppas == 1) ? &rqd->ppa_list[j] : &rqd->ppa_addr;
157 print_ppa(&pblk->dev->geo, p, "seq", j); 157 print_ppa(pblk, p, "seq", j);
158#endif 158#endif
159 pr_err("pblk: corrupted read LBA (%llu/%llu)\n", 159 pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n",
160 lba, meta_lba); 160 lba, meta_lba);
161 WARN_ON(1); 161 WARN_ON(1);
162 } 162 }
@@ -185,7 +185,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
185 185
186static void pblk_end_user_read(struct bio *bio) 186static void pblk_end_user_read(struct bio *bio)
187{ 187{
188#ifdef CONFIG_NVM_DEBUG 188#ifdef CONFIG_NVM_PBLK_DEBUG
189 WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); 189 WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
190#endif 190#endif
191 bio_endio(bio); 191 bio_endio(bio);
@@ -199,7 +199,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
199 struct bio *int_bio = rqd->bio; 199 struct bio *int_bio = rqd->bio;
200 unsigned long start_time = r_ctx->start_time; 200 unsigned long start_time = r_ctx->start_time;
201 201
202 generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time); 202 generic_end_io_acct(dev->q, REQ_OP_READ, &pblk->disk->part0, start_time);
203 203
204 if (rqd->error) 204 if (rqd->error)
205 pblk_log_read_err(pblk, rqd); 205 pblk_log_read_err(pblk, rqd);
@@ -212,7 +212,7 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
212 if (put_line) 212 if (put_line)
213 pblk_read_put_rqd_kref(pblk, rqd); 213 pblk_read_put_rqd_kref(pblk, rqd);
214 214
215#ifdef CONFIG_NVM_DEBUG 215#ifdef CONFIG_NVM_PBLK_DEBUG
216 atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); 216 atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
217 atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); 217 atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
218#endif 218#endif
@@ -231,74 +231,36 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
231 __pblk_end_io_read(pblk, rqd, true); 231 __pblk_end_io_read(pblk, rqd, true);
232} 232}
233 233
234static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd, 234static void pblk_end_partial_read(struct nvm_rq *rqd)
235 struct bio *orig_bio, unsigned int bio_init_idx,
236 unsigned long *read_bitmap)
237{ 235{
238 struct pblk_sec_meta *meta_list = rqd->meta_list; 236 struct pblk *pblk = rqd->private;
239 struct bio *new_bio; 237 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
238 struct pblk_pr_ctx *pr_ctx = r_ctx->private;
239 struct bio *new_bio = rqd->bio;
240 struct bio *bio = pr_ctx->orig_bio;
240 struct bio_vec src_bv, dst_bv; 241 struct bio_vec src_bv, dst_bv;
241 void *ppa_ptr = NULL; 242 struct pblk_sec_meta *meta_list = rqd->meta_list;
242 void *src_p, *dst_p; 243 int bio_init_idx = pr_ctx->bio_init_idx;
243 dma_addr_t dma_ppa_list = 0; 244 unsigned long *read_bitmap = pr_ctx->bitmap;
244 __le64 *lba_list_mem, *lba_list_media; 245 int nr_secs = pr_ctx->orig_nr_secs;
245 int nr_secs = rqd->nr_ppas;
246 int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); 246 int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
247 int i, ret, hole; 247 __le64 *lba_list_mem, *lba_list_media;
248 248 void *src_p, *dst_p;
249 /* Re-use allocated memory for intermediate lbas */ 249 int hole, i;
250 lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
251 lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
252
253 new_bio = bio_alloc(GFP_KERNEL, nr_holes);
254
255 if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
256 goto fail_add_pages;
257
258 if (nr_holes != new_bio->bi_vcnt) {
259 pr_err("pblk: malformed bio\n");
260 goto fail;
261 }
262
263 for (i = 0; i < nr_secs; i++)
264 lba_list_mem[i] = meta_list[i].lba;
265
266 new_bio->bi_iter.bi_sector = 0; /* internal bio */
267 bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
268
269 rqd->bio = new_bio;
270 rqd->nr_ppas = nr_holes;
271 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
272
273 if (unlikely(nr_holes == 1)) {
274 ppa_ptr = rqd->ppa_list;
275 dma_ppa_list = rqd->dma_ppa_list;
276 rqd->ppa_addr = rqd->ppa_list[0];
277 }
278
279 ret = pblk_submit_io_sync(pblk, rqd);
280 if (ret) {
281 bio_put(rqd->bio);
282 pr_err("pblk: sync read IO submission failed\n");
283 goto fail;
284 }
285
286 if (rqd->error) {
287 atomic_long_inc(&pblk->read_failed);
288#ifdef CONFIG_NVM_DEBUG
289 pblk_print_failed_rqd(pblk, rqd, rqd->error);
290#endif
291 }
292 250
293 if (unlikely(nr_holes == 1)) { 251 if (unlikely(nr_holes == 1)) {
294 struct ppa_addr ppa; 252 struct ppa_addr ppa;
295 253
296 ppa = rqd->ppa_addr; 254 ppa = rqd->ppa_addr;
297 rqd->ppa_list = ppa_ptr; 255 rqd->ppa_list = pr_ctx->ppa_ptr;
298 rqd->dma_ppa_list = dma_ppa_list; 256 rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
299 rqd->ppa_list[0] = ppa; 257 rqd->ppa_list[0] = ppa;
300 } 258 }
301 259
260 /* Re-use allocated memory for intermediate lbas */
261 lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
262 lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
263
302 for (i = 0; i < nr_secs; i++) { 264 for (i = 0; i < nr_secs; i++) {
303 lba_list_media[i] = meta_list[i].lba; 265 lba_list_media[i] = meta_list[i].lba;
304 meta_list[i].lba = lba_list_mem[i]; 266 meta_list[i].lba = lba_list_mem[i];
@@ -316,7 +278,7 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
316 meta_list[hole].lba = lba_list_media[i]; 278 meta_list[hole].lba = lba_list_media[i];
317 279
318 src_bv = new_bio->bi_io_vec[i++]; 280 src_bv = new_bio->bi_io_vec[i++];
319 dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole]; 281 dst_bv = bio->bi_io_vec[bio_init_idx + hole];
320 282
321 src_p = kmap_atomic(src_bv.bv_page); 283 src_p = kmap_atomic(src_bv.bv_page);
322 dst_p = kmap_atomic(dst_bv.bv_page); 284 dst_p = kmap_atomic(dst_bv.bv_page);
@@ -334,19 +296,107 @@ static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
334 } while (hole < nr_secs); 296 } while (hole < nr_secs);
335 297
336 bio_put(new_bio); 298 bio_put(new_bio);
299 kfree(pr_ctx);
337 300
338 /* restore original request */ 301 /* restore original request */
339 rqd->bio = NULL; 302 rqd->bio = NULL;
340 rqd->nr_ppas = nr_secs; 303 rqd->nr_ppas = nr_secs;
341 304
305 bio_endio(bio);
342 __pblk_end_io_read(pblk, rqd, false); 306 __pblk_end_io_read(pblk, rqd, false);
343 return NVM_IO_DONE; 307}
344 308
345fail: 309static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
346 /* Free allocated pages in new bio */ 310 unsigned int bio_init_idx,
311 unsigned long *read_bitmap,
312 int nr_holes)
313{
314 struct pblk_sec_meta *meta_list = rqd->meta_list;
315 struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
316 struct pblk_pr_ctx *pr_ctx;
317 struct bio *new_bio, *bio = r_ctx->private;
318 __le64 *lba_list_mem;
319 int nr_secs = rqd->nr_ppas;
320 int i;
321
322 /* Re-use allocated memory for intermediate lbas */
323 lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
324
325 new_bio = bio_alloc(GFP_KERNEL, nr_holes);
326
327 if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
328 goto fail_bio_put;
329
330 if (nr_holes != new_bio->bi_vcnt) {
331 WARN_ONCE(1, "pblk: malformed bio\n");
332 goto fail_free_pages;
333 }
334
335 pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
336 if (!pr_ctx)
337 goto fail_free_pages;
338
339 for (i = 0; i < nr_secs; i++)
340 lba_list_mem[i] = meta_list[i].lba;
341
342 new_bio->bi_iter.bi_sector = 0; /* internal bio */
343 bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
344
345 rqd->bio = new_bio;
346 rqd->nr_ppas = nr_holes;
347 rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
348
349 pr_ctx->ppa_ptr = NULL;
350 pr_ctx->orig_bio = bio;
351 bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
352 pr_ctx->bio_init_idx = bio_init_idx;
353 pr_ctx->orig_nr_secs = nr_secs;
354 r_ctx->private = pr_ctx;
355
356 if (unlikely(nr_holes == 1)) {
357 pr_ctx->ppa_ptr = rqd->ppa_list;
358 pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
359 rqd->ppa_addr = rqd->ppa_list[0];
360 }
361 return 0;
362
363fail_free_pages:
347 pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt); 364 pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
348fail_add_pages: 365fail_bio_put:
349 pr_err("pblk: failed to perform partial read\n"); 366 bio_put(new_bio);
367
368 return -ENOMEM;
369}
370
371static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
372 unsigned int bio_init_idx,
373 unsigned long *read_bitmap, int nr_secs)
374{
375 int nr_holes;
376 int ret;
377
378 nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
379
380 if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
381 nr_holes))
382 return NVM_IO_ERR;
383
384 rqd->end_io = pblk_end_partial_read;
385
386 ret = pblk_submit_io(pblk, rqd);
387 if (ret) {
388 bio_put(rqd->bio);
389 pblk_err(pblk, "partial read IO submission failed\n");
390 goto err;
391 }
392
393 return NVM_IO_OK;
394
395err:
396 pblk_err(pblk, "failed to perform partial read\n");
397
398 /* Free allocated pages in new bio */
399 pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
350 __pblk_end_io_read(pblk, rqd, false); 400 __pblk_end_io_read(pblk, rqd, false);
351 return NVM_IO_ERR; 401 return NVM_IO_ERR;
352} 402}
@@ -359,7 +409,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
359 409
360 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); 410 pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
361 411
362#ifdef CONFIG_NVM_DEBUG 412#ifdef CONFIG_NVM_PBLK_DEBUG
363 atomic_long_inc(&pblk->inflight_reads); 413 atomic_long_inc(&pblk->inflight_reads);
364#endif 414#endif
365 415
@@ -382,7 +432,7 @@ retry:
382 WARN_ON(test_and_set_bit(0, read_bitmap)); 432 WARN_ON(test_and_set_bit(0, read_bitmap));
383 meta_list[0].lba = cpu_to_le64(lba); 433 meta_list[0].lba = cpu_to_le64(lba);
384 434
385#ifdef CONFIG_NVM_DEBUG 435#ifdef CONFIG_NVM_PBLK_DEBUG
386 atomic_long_inc(&pblk->cache_reads); 436 atomic_long_inc(&pblk->cache_reads);
387#endif 437#endif
388 } else { 438 } else {
@@ -401,7 +451,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
401 struct pblk_g_ctx *r_ctx; 451 struct pblk_g_ctx *r_ctx;
402 struct nvm_rq *rqd; 452 struct nvm_rq *rqd;
403 unsigned int bio_init_idx; 453 unsigned int bio_init_idx;
404 unsigned long read_bitmap; /* Max 64 ppas per request */ 454 DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
405 int ret = NVM_IO_ERR; 455 int ret = NVM_IO_ERR;
406 456
407 /* logic error: lba out-of-bounds. Ignore read request */ 457 /* logic error: lba out-of-bounds. Ignore read request */
@@ -411,9 +461,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
411 return NVM_IO_ERR; 461 return NVM_IO_ERR;
412 } 462 }
413 463
414 generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0); 464 generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
465 &pblk->disk->part0);
415 466
416 bitmap_zero(&read_bitmap, nr_secs); 467 bitmap_zero(read_bitmap, nr_secs);
417 468
418 rqd = pblk_alloc_rqd(pblk, PBLK_READ); 469 rqd = pblk_alloc_rqd(pblk, PBLK_READ);
419 470
@@ -436,7 +487,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
436 rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, 487 rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
437 &rqd->dma_meta_list); 488 &rqd->dma_meta_list);
438 if (!rqd->meta_list) { 489 if (!rqd->meta_list) {
439 pr_err("pblk: not able to allocate ppa list\n"); 490 pblk_err(pblk, "not able to allocate ppa list\n");
440 goto fail_rqd_free; 491 goto fail_rqd_free;
441 } 492 }
442 493
@@ -444,32 +495,32 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
444 rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; 495 rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
445 rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; 496 rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
446 497
447 pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap); 498 pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
448 } else { 499 } else {
449 pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap); 500 pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
450 } 501 }
451 502
452 if (bitmap_full(&read_bitmap, nr_secs)) { 503 if (bitmap_full(read_bitmap, nr_secs)) {
453 atomic_inc(&pblk->inflight_io); 504 atomic_inc(&pblk->inflight_io);
454 __pblk_end_io_read(pblk, rqd, false); 505 __pblk_end_io_read(pblk, rqd, false);
455 return NVM_IO_DONE; 506 return NVM_IO_DONE;
456 } 507 }
457 508
458 /* All sectors are to be read from the device */ 509 /* All sectors are to be read from the device */
459 if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { 510 if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
460 struct bio *int_bio = NULL; 511 struct bio *int_bio = NULL;
461 512
462 /* Clone read bio to deal with read errors internally */ 513 /* Clone read bio to deal with read errors internally */
463 int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); 514 int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
464 if (!int_bio) { 515 if (!int_bio) {
465 pr_err("pblk: could not clone read bio\n"); 516 pblk_err(pblk, "could not clone read bio\n");
466 goto fail_end_io; 517 goto fail_end_io;
467 } 518 }
468 519
469 rqd->bio = int_bio; 520 rqd->bio = int_bio;
470 521
471 if (pblk_submit_io(pblk, rqd)) { 522 if (pblk_submit_io(pblk, rqd)) {
472 pr_err("pblk: read IO submission failed\n"); 523 pblk_err(pblk, "read IO submission failed\n");
473 ret = NVM_IO_ERR; 524 ret = NVM_IO_ERR;
474 goto fail_end_io; 525 goto fail_end_io;
475 } 526 }
@@ -480,8 +531,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
480 /* The read bio request could be partially filled by the write buffer, 531 /* The read bio request could be partially filled by the write buffer,
481 * but there are some holes that need to be read from the drive. 532 * but there are some holes that need to be read from the drive.
482 */ 533 */
483 return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap); 534 ret = pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
535 nr_secs);
536 if (ret)
537 goto fail_meta_free;
538
539 return NVM_IO_OK;
484 540
541fail_meta_free:
542 nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
485fail_rqd_free: 543fail_rqd_free:
486 pblk_free_rqd(pblk, rqd, PBLK_READ); 544 pblk_free_rqd(pblk, rqd, PBLK_READ);
487 return ret; 545 return ret;
@@ -514,7 +572,7 @@ static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
514 rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; 572 rqd->ppa_list[valid_secs++] = ppa_list_l2p[i];
515 } 573 }
516 574
517#ifdef CONFIG_NVM_DEBUG 575#ifdef CONFIG_NVM_PBLK_DEBUG
518 atomic_long_add(valid_secs, &pblk->inflight_reads); 576 atomic_long_add(valid_secs, &pblk->inflight_reads);
519#endif 577#endif
520 578
@@ -548,7 +606,7 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
548 rqd->ppa_addr = ppa_l2p; 606 rqd->ppa_addr = ppa_l2p;
549 valid_secs = 1; 607 valid_secs = 1;
550 608
551#ifdef CONFIG_NVM_DEBUG 609#ifdef CONFIG_NVM_PBLK_DEBUG
552 atomic_long_inc(&pblk->inflight_reads); 610 atomic_long_inc(&pblk->inflight_reads);
553#endif 611#endif
554 612
@@ -595,7 +653,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
595 bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, 653 bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
596 PBLK_VMALLOC_META, GFP_KERNEL); 654 PBLK_VMALLOC_META, GFP_KERNEL);
597 if (IS_ERR(bio)) { 655 if (IS_ERR(bio)) {
598 pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); 656 pblk_err(pblk, "could not allocate GC bio (%lu)\n",
657 PTR_ERR(bio));
599 goto err_free_dma; 658 goto err_free_dma;
600 } 659 }
601 660
@@ -609,7 +668,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
609 668
610 if (pblk_submit_io_sync(pblk, &rqd)) { 669 if (pblk_submit_io_sync(pblk, &rqd)) {
611 ret = -EIO; 670 ret = -EIO;
612 pr_err("pblk: GC read request failed\n"); 671 pblk_err(pblk, "GC read request failed\n");
613 goto err_free_bio; 672 goto err_free_bio;
614 } 673 }
615 674
@@ -619,12 +678,12 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
619 678
620 if (rqd.error) { 679 if (rqd.error) {
621 atomic_long_inc(&pblk->read_failed_gc); 680 atomic_long_inc(&pblk->read_failed_gc);
622#ifdef CONFIG_NVM_DEBUG 681#ifdef CONFIG_NVM_PBLK_DEBUG
623 pblk_print_failed_rqd(pblk, &rqd, rqd.error); 682 pblk_print_failed_rqd(pblk, &rqd, rqd.error);
624#endif 683#endif
625 } 684 }
626 685
627#ifdef CONFIG_NVM_DEBUG 686#ifdef CONFIG_NVM_PBLK_DEBUG
628 atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); 687 atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads);
629 atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); 688 atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads);
630 atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); 689 atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads);
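
The pblk-read.c rework is the substantive change in this series: the old pblk_partial_read() filled the read holes with a blocking pblk_submit_io_sync() call, whereas the new code saves everything the completion will need (original bio, bitmap copy, hole count, saved ppa_list pointer) in a pblk_pr_ctx, submits with pblk_submit_io(), and finishes stitching cached and on-media sectors together in the pblk_end_partial_read() end_io callback. Because the bitmap must now be copied into that context, the caller's stack variable also grows from a single unsigned long to DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA), i.e. an unsigned long array sized by BITS_TO_LONGS(). A sketch of the resulting submission path, with the helper bodies elided (field and function names follow the diff):

	static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
					 unsigned int bio_init_idx,
					 unsigned long *read_bitmap, int nr_secs)
	{
		int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);

		/* stash bio, bitmap and saved pointers in rqd's pr_ctx */
		if (pblk_setup_partial_read(pblk, rqd, bio_init_idx,
					    read_bitmap, nr_holes))
			return NVM_IO_ERR;

		rqd->end_io = pblk_end_partial_read;	/* completes async */

		return pblk_submit_io(pblk, rqd) ? NVM_IO_ERR : NVM_IO_OK;
	}

The accounting calls in the same file move in step with the block layer: generic_start_io_acct()/generic_end_io_acct() now take REQ_OP_READ rather than the legacy READ flag, matching the op-based accounting API visible in the hunks above.
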
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 3a5069183859..e232e47e1353 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -77,7 +77,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
77 } 77 }
78 78
79 if (nr_valid_lbas != nr_lbas) 79 if (nr_valid_lbas != nr_lbas)
80 pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n", 80 pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n",
81 line->id, nr_valid_lbas, nr_lbas); 81 line->id, nr_valid_lbas, nr_lbas);
82 82
83 line->left_msecs = 0; 83 line->left_msecs = 0;
@@ -184,7 +184,7 @@ next_read_rq:
184 /* If read fails, more padding is needed */ 184 /* If read fails, more padding is needed */
185 ret = pblk_submit_io_sync(pblk, rqd); 185 ret = pblk_submit_io_sync(pblk, rqd);
186 if (ret) { 186 if (ret) {
187 pr_err("pblk: I/O submission failed: %d\n", ret); 187 pblk_err(pblk, "I/O submission failed: %d\n", ret);
188 return ret; 188 return ret;
189 } 189 }
190 190
@@ -194,7 +194,7 @@ next_read_rq:
194 * we cannot recover from here. Need FTL log. 194 * we cannot recover from here. Need FTL log.
195 */ 195 */
196 if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { 196 if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
197 pr_err("pblk: L2P recovery failed (%d)\n", rqd->error); 197 pblk_err(pblk, "L2P recovery failed (%d)\n", rqd->error);
198 return -EINTR; 198 return -EINTR;
199 } 199 }
200 200
@@ -273,7 +273,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
273next_pad_rq: 273next_pad_rq:
274 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0); 274 rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
275 if (rq_ppas < pblk->min_write_pgs) { 275 if (rq_ppas < pblk->min_write_pgs) {
276 pr_err("pblk: corrupted pad line %d\n", line->id); 276 pblk_err(pblk, "corrupted pad line %d\n", line->id);
277 goto fail_free_pad; 277 goto fail_free_pad;
278 } 278 }
279 279
@@ -342,7 +342,7 @@ next_pad_rq:
342 342
343 ret = pblk_submit_io(pblk, rqd); 343 ret = pblk_submit_io(pblk, rqd);
344 if (ret) { 344 if (ret) {
345 pr_err("pblk: I/O submission failed: %d\n", ret); 345 pblk_err(pblk, "I/O submission failed: %d\n", ret);
346 pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); 346 pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
347 goto fail_free_bio; 347 goto fail_free_bio;
348 } 348 }
@@ -356,12 +356,12 @@ next_pad_rq:
356 356
357 if (!wait_for_completion_io_timeout(&pad_rq->wait, 357 if (!wait_for_completion_io_timeout(&pad_rq->wait,
358 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { 358 msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
359 pr_err("pblk: pad write timed out\n"); 359 pblk_err(pblk, "pad write timed out\n");
360 ret = -ETIME; 360 ret = -ETIME;
361 } 361 }
362 362
363 if (!pblk_line_is_full(line)) 363 if (!pblk_line_is_full(line))
364 pr_err("pblk: corrupted padded line: %d\n", line->id); 364 pblk_err(pblk, "corrupted padded line: %d\n", line->id);
365 365
366 vfree(data); 366 vfree(data);
367free_rq: 367free_rq:
@@ -461,7 +461,7 @@ next_rq:
461 461
462 ret = pblk_submit_io_sync(pblk, rqd); 462 ret = pblk_submit_io_sync(pblk, rqd);
463 if (ret) { 463 if (ret) {
464 pr_err("pblk: I/O submission failed: %d\n", ret); 464 pblk_err(pblk, "I/O submission failed: %d\n", ret);
465 return ret; 465 return ret;
466 } 466 }
467 467
@@ -501,11 +501,11 @@ next_rq:
501 501
502 ret = pblk_recov_pad_oob(pblk, line, pad_secs); 502 ret = pblk_recov_pad_oob(pblk, line, pad_secs);
503 if (ret) 503 if (ret)
504 pr_err("pblk: OOB padding failed (err:%d)\n", ret); 504 pblk_err(pblk, "OOB padding failed (err:%d)\n", ret);
505 505
506 ret = pblk_recov_read_oob(pblk, line, p, r_ptr); 506 ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
507 if (ret) 507 if (ret)
508 pr_err("pblk: OOB read failed (err:%d)\n", ret); 508 pblk_err(pblk, "OOB read failed (err:%d)\n", ret);
509 509
510 left_ppas = 0; 510 left_ppas = 0;
511 } 511 }
@@ -592,7 +592,7 @@ next_rq:
592 592
593 ret = pblk_submit_io_sync(pblk, rqd); 593 ret = pblk_submit_io_sync(pblk, rqd);
594 if (ret) { 594 if (ret) {
595 pr_err("pblk: I/O submission failed: %d\n", ret); 595 pblk_err(pblk, "I/O submission failed: %d\n", ret);
596 bio_put(bio); 596 bio_put(bio);
597 return ret; 597 return ret;
598 } 598 }
@@ -671,14 +671,14 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
671 671
672 ret = pblk_recov_scan_oob(pblk, line, p, &done); 672 ret = pblk_recov_scan_oob(pblk, line, p, &done);
673 if (ret) { 673 if (ret) {
674 pr_err("pblk: could not recover L2P from OOB\n"); 674 pblk_err(pblk, "could not recover L2P from OOB\n");
675 goto out; 675 goto out;
676 } 676 }
677 677
678 if (!done) { 678 if (!done) {
679 ret = pblk_recov_scan_all_oob(pblk, line, p); 679 ret = pblk_recov_scan_all_oob(pblk, line, p);
680 if (ret) { 680 if (ret) {
681 pr_err("pblk: could not recover L2P from OOB\n"); 681 pblk_err(pblk, "could not recover L2P from OOB\n");
682 goto out; 682 goto out;
683 } 683 }
684 } 684 }
@@ -737,14 +737,15 @@ static int pblk_recov_check_line_version(struct pblk *pblk,
737 struct line_header *header = &emeta->header; 737 struct line_header *header = &emeta->header;
738 738
739 if (header->version_major != EMETA_VERSION_MAJOR) { 739 if (header->version_major != EMETA_VERSION_MAJOR) {
740 pr_err("pblk: line major version mismatch: %d, expected: %d\n", 740 pblk_err(pblk, "line major version mismatch: %d, expected: %d\n",
741 header->version_major, EMETA_VERSION_MAJOR); 741 header->version_major, EMETA_VERSION_MAJOR);
742 return 1; 742 return 1;
743 } 743 }
744 744
745#ifdef NVM_DEBUG 745#ifdef CONFIG_NVM_PBLK_DEBUG
746 if (header->version_minor > EMETA_VERSION_MINOR) 746 if (header->version_minor > EMETA_VERSION_MINOR)
747 pr_info("pblk: newer line minor version found: %d\n", line_v); 747 pblk_info(pblk, "newer line minor version found: %d\n",
748 header->version_minor);
748#endif 749#endif
749 750
750 return 0; 751 return 0;
@@ -851,7 +852,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
851 continue; 852 continue;
852 853
853 if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { 854 if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) {
854 pr_err("pblk: found incompatible line version %u\n", 855 pblk_err(pblk, "found incompatible line version %u\n",
855 smeta_buf->header.version_major); 856 smeta_buf->header.version_major);
856 return ERR_PTR(-EINVAL); 857 return ERR_PTR(-EINVAL);
857 } 858 }
@@ -863,7 +864,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
863 } 864 }
864 865
865 if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) { 866 if (memcmp(pblk->instance_uuid, smeta_buf->header.uuid, 16)) {
866 pr_debug("pblk: ignore line %u due to uuid mismatch\n", 867 pblk_debug(pblk, "ignore line %u due to uuid mismatch\n",
867 i); 868 i);
868 continue; 869 continue;
869 } 870 }
@@ -887,7 +888,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
887 888
888 pblk_recov_line_add_ordered(&recov_list, line); 889 pblk_recov_line_add_ordered(&recov_list, line);
889 found_lines++; 890 found_lines++;
890 pr_debug("pblk: recovering data line %d, seq:%llu\n", 891 pblk_debug(pblk, "recovering data line %d, seq:%llu\n",
891 line->id, smeta_buf->seq_nr); 892 line->id, smeta_buf->seq_nr);
892 } 893 }
893 894
@@ -947,7 +948,7 @@ next:
947 line->emeta = NULL; 948 line->emeta = NULL;
948 } else { 949 } else {
949 if (open_lines > 1) 950 if (open_lines > 1)
950 pr_err("pblk: failed to recover L2P\n"); 951 pblk_err(pblk, "failed to recover L2P\n");
951 952
952 open_lines++; 953 open_lines++;
953 line->meta_line = meta_line; 954 line->meta_line = meta_line;
@@ -976,7 +977,7 @@ next:
976 977
977out: 978out:
978 if (found_lines != recovered_lines) 979 if (found_lines != recovered_lines)
979 pr_err("pblk: failed to recover all found lines %d/%d\n", 980 pblk_err(pblk, "failed to recover all found lines %d/%d\n",
980 found_lines, recovered_lines); 981 found_lines, recovered_lines);
981 982
982 return data_line; 983 return data_line;
@@ -999,7 +1000,7 @@ int pblk_recov_pad(struct pblk *pblk)
999 1000
1000 ret = pblk_recov_pad_oob(pblk, line, left_msecs); 1001 ret = pblk_recov_pad_oob(pblk, line, left_msecs);
1001 if (ret) { 1002 if (ret) {
1002 pr_err("pblk: Tear down padding failed (%d)\n", ret); 1003 pblk_err(pblk, "tear down padding failed (%d)\n", ret);
1003 return ret; 1004 return ret;
1004 } 1005 }
1005 1006
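
Every pr_err/pr_info/pr_debug in pblk-recovery.c above gains a pblk instance argument. The pblk_err() family of wrappers (defined in pblk.h later in this diff) prepends the instance's disk name, so messages from multiple pblk targets can be told apart. The mechanical effect of the conversion, shown on one call site:

    /* before: no way to tell which pblk instance logged this */
    pr_err("pblk: I/O submission failed: %d\n", ret);

    /* after: */
    pblk_err(pblk, "I/O submission failed: %d\n", ret);
    /* which expands to:
     * pr_err("pblk %s: I/O submission failed: %d\n",
     *        pblk->disk->disk_name, ret);
     */

This is also why pblk_sysfs_init() below shortens its failure message: the disk name now arrives via the prefix, so interpolating tdisk->disk_name by hand would print it twice.
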
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 88a0a7c407aa..9fc3dfa168b4 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -268,7 +268,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
268 spin_unlock(&l_mg->free_lock); 268 spin_unlock(&l_mg->free_lock);
269 269
270 if (nr_free_lines != free_line_cnt) 270 if (nr_free_lines != free_line_cnt)
271 pr_err("pblk: corrupted free line list:%d/%d\n", 271 pblk_err(pblk, "corrupted free line list:%d/%d\n",
272 nr_free_lines, free_line_cnt); 272 nr_free_lines, free_line_cnt);
273 273
274 sz = snprintf(page, PAGE_SIZE - sz, 274 sz = snprintf(page, PAGE_SIZE - sz,
@@ -421,7 +421,7 @@ static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page)
421 return sz; 421 return sz;
422} 422}
423 423
424#ifdef CONFIG_NVM_DEBUG 424#ifdef CONFIG_NVM_PBLK_DEBUG
425static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) 425static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
426{ 426{
427 return snprintf(page, PAGE_SIZE, 427 return snprintf(page, PAGE_SIZE,
@@ -598,7 +598,7 @@ static struct attribute sys_padding_dist = {
598 .mode = 0644, 598 .mode = 0644,
599}; 599};
600 600
601#ifdef CONFIG_NVM_DEBUG 601#ifdef CONFIG_NVM_PBLK_DEBUG
602static struct attribute sys_stats_debug_attr = { 602static struct attribute sys_stats_debug_attr = {
603 .name = "stats", 603 .name = "stats",
604 .mode = 0444, 604 .mode = 0444,
@@ -619,7 +619,7 @@ static struct attribute *pblk_attrs[] = {
619 &sys_write_amp_mileage, 619 &sys_write_amp_mileage,
620 &sys_write_amp_trip, 620 &sys_write_amp_trip,
621 &sys_padding_dist, 621 &sys_padding_dist,
622#ifdef CONFIG_NVM_DEBUG 622#ifdef CONFIG_NVM_PBLK_DEBUG
623 &sys_stats_debug_attr, 623 &sys_stats_debug_attr,
624#endif 624#endif
625 NULL, 625 NULL,
@@ -654,7 +654,7 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
654 return pblk_sysfs_get_write_amp_trip(pblk, buf); 654 return pblk_sysfs_get_write_amp_trip(pblk, buf);
655 else if (strcmp(attr->name, "padding_dist") == 0) 655 else if (strcmp(attr->name, "padding_dist") == 0)
656 return pblk_sysfs_get_padding_dist(pblk, buf); 656 return pblk_sysfs_get_padding_dist(pblk, buf);
657#ifdef CONFIG_NVM_DEBUG 657#ifdef CONFIG_NVM_PBLK_DEBUG
658 else if (strcmp(attr->name, "stats") == 0) 658 else if (strcmp(attr->name, "stats") == 0)
659 return pblk_sysfs_stats_debug(pblk, buf); 659 return pblk_sysfs_stats_debug(pblk, buf);
660#endif 660#endif
@@ -697,8 +697,7 @@ int pblk_sysfs_init(struct gendisk *tdisk)
697 kobject_get(&parent_dev->kobj), 697 kobject_get(&parent_dev->kobj),
698 "%s", "pblk"); 698 "%s", "pblk");
699 if (ret) { 699 if (ret) {
700 pr_err("pblk: could not register %s/pblk\n", 700 pblk_err(pblk, "could not register\n");
701 tdisk->disk_name);
702 return ret; 701 return ret;
703 } 702 }
704 703
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index f353e52941f5..ee774a86cf1e 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
38 /* Release flags on context. Protect from writes */ 38 /* Release flags on context. Protect from writes */
39 smp_store_release(&w_ctx->flags, flags); 39 smp_store_release(&w_ctx->flags, flags);
40 40
41#ifdef CONFIG_NVM_DEBUG 41#ifdef CONFIG_NVM_PBLK_DEBUG
42 atomic_dec(&rwb->inflight_flush_point); 42 atomic_dec(&rwb->inflight_flush_point);
43#endif 43#endif
44 } 44 }
@@ -51,7 +51,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
51 pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, 51 pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
52 c_ctx->nr_padded); 52 c_ctx->nr_padded);
53 53
54#ifdef CONFIG_NVM_DEBUG 54#ifdef CONFIG_NVM_PBLK_DEBUG
55 atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); 55 atomic_long_add(rqd->nr_ppas, &pblk->sync_writes);
56#endif 56#endif
57 57
@@ -78,7 +78,7 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
78 unsigned long flags; 78 unsigned long flags;
79 unsigned long pos; 79 unsigned long pos;
80 80
81#ifdef CONFIG_NVM_DEBUG 81#ifdef CONFIG_NVM_PBLK_DEBUG
82 atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); 82 atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
83#endif 83#endif
84 84
@@ -196,7 +196,7 @@ static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
196 list_add_tail(&r_ctx->list, &pblk->resubmit_list); 196 list_add_tail(&r_ctx->list, &pblk->resubmit_list);
197 spin_unlock(&pblk->resubmit_lock); 197 spin_unlock(&pblk->resubmit_lock);
198 198
199#ifdef CONFIG_NVM_DEBUG 199#ifdef CONFIG_NVM_PBLK_DEBUG
200 atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); 200 atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
201#endif 201#endif
202} 202}
@@ -238,7 +238,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
238 238
239 recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); 239 recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
240 if (!recovery) { 240 if (!recovery) {
241 pr_err("pblk: could not allocate recovery work\n"); 241 pblk_err(pblk, "could not allocate recovery work\n");
242 return; 242 return;
243 } 243 }
244 244
@@ -258,7 +258,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
258 pblk_end_w_fail(pblk, rqd); 258 pblk_end_w_fail(pblk, rqd);
259 return; 259 return;
260 } 260 }
261#ifdef CONFIG_NVM_DEBUG 261#ifdef CONFIG_NVM_PBLK_DEBUG
262 else 262 else
263 WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); 263 WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
264#endif 264#endif
@@ -279,7 +279,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
279 279
280 if (rqd->error) { 280 if (rqd->error) {
281 pblk_log_write_err(pblk, rqd); 281 pblk_log_write_err(pblk, rqd);
282 pr_err("pblk: metadata I/O failed. Line %d\n", line->id); 282 pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id);
283 line->w_err_gc->has_write_err = 1; 283 line->w_err_gc->has_write_err = 1;
284 } 284 }
285 285
@@ -356,11 +356,11 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
356 356
357 secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush); 357 secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
358 358
359#ifdef CONFIG_NVM_DEBUG 359#ifdef CONFIG_NVM_PBLK_DEBUG
360 if ((!secs_to_sync && secs_to_flush) 360 if ((!secs_to_sync && secs_to_flush)
361 || (secs_to_sync < 0) 361 || (secs_to_sync < 0)
362 || (secs_to_sync > secs_avail && !secs_to_flush)) { 362 || (secs_to_sync > secs_avail && !secs_to_flush)) {
363 pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n", 363 pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n",
364 secs_avail, secs_to_sync, secs_to_flush); 364 secs_avail, secs_to_sync, secs_to_flush);
365 } 365 }
366#endif 366#endif
@@ -397,7 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
397 bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, 397 bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
398 l_mg->emeta_alloc_type, GFP_KERNEL); 398 l_mg->emeta_alloc_type, GFP_KERNEL);
399 if (IS_ERR(bio)) { 399 if (IS_ERR(bio)) {
400 pr_err("pblk: failed to map emeta io"); 400 pblk_err(pblk, "failed to map emeta io");
401 ret = PTR_ERR(bio); 401 ret = PTR_ERR(bio);
402 goto fail_free_rqd; 402 goto fail_free_rqd;
403 } 403 }
@@ -428,7 +428,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
428 428
429 ret = pblk_submit_io(pblk, rqd); 429 ret = pblk_submit_io(pblk, rqd);
430 if (ret) { 430 if (ret) {
431 pr_err("pblk: emeta I/O submission failed: %d\n", ret); 431 pblk_err(pblk, "emeta I/O submission failed: %d\n", ret);
432 goto fail_rollback; 432 goto fail_rollback;
433 } 433 }
434 434
@@ -518,7 +518,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
518 /* Assign lbas to ppas and populate request structure */ 518 /* Assign lbas to ppas and populate request structure */
519 err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); 519 err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
520 if (err) { 520 if (err) {
521 pr_err("pblk: could not setup write request: %d\n", err); 521 pblk_err(pblk, "could not setup write request: %d\n", err);
522 return NVM_IO_ERR; 522 return NVM_IO_ERR;
523 } 523 }
524 524
@@ -527,7 +527,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
527 /* Submit data write for current data line */ 527 /* Submit data write for current data line */
528 err = pblk_submit_io(pblk, rqd); 528 err = pblk_submit_io(pblk, rqd);
529 if (err) { 529 if (err) {
530 pr_err("pblk: data I/O submission failed: %d\n", err); 530 pblk_err(pblk, "data I/O submission failed: %d\n", err);
531 return NVM_IO_ERR; 531 return NVM_IO_ERR;
532 } 532 }
533 533
@@ -549,7 +549,8 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
549 /* Submit metadata write for previous data line */ 549 /* Submit metadata write for previous data line */
550 err = pblk_submit_meta_io(pblk, meta_line); 550 err = pblk_submit_meta_io(pblk, meta_line);
551 if (err) { 551 if (err) {
552 pr_err("pblk: metadata I/O submission failed: %d", err); 552 pblk_err(pblk, "metadata I/O submission failed: %d",
553 err);
553 return NVM_IO_ERR; 554 return NVM_IO_ERR;
554 } 555 }
555 } 556 }
@@ -614,7 +615,7 @@ static int pblk_submit_write(struct pblk *pblk)
614 secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, 615 secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
615 secs_to_flush); 616 secs_to_flush);
616 if (secs_to_sync > pblk->max_write_pgs) { 617 if (secs_to_sync > pblk->max_write_pgs) {
617 pr_err("pblk: bad buffer sync calculation\n"); 618 pblk_err(pblk, "bad buffer sync calculation\n");
618 return 1; 619 return 1;
619 } 620 }
620 621
@@ -633,14 +634,14 @@ static int pblk_submit_write(struct pblk *pblk)
633 634
634 if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, 635 if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
635 secs_avail)) { 636 secs_avail)) {
636 pr_err("pblk: corrupted write bio\n"); 637 pblk_err(pblk, "corrupted write bio\n");
637 goto fail_put_bio; 638 goto fail_put_bio;
638 } 639 }
639 640
640 if (pblk_submit_io_set(pblk, rqd)) 641 if (pblk_submit_io_set(pblk, rqd))
641 goto fail_free_bio; 642 goto fail_free_bio;
642 643
643#ifdef CONFIG_NVM_DEBUG 644#ifdef CONFIG_NVM_PBLK_DEBUG
644 atomic_long_add(secs_to_sync, &pblk->sub_writes); 645 atomic_long_add(secs_to_sync, &pblk->sub_writes);
645#endif 646#endif
646 647
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 34cc1d64a9d4..4760af7b6499 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -119,6 +119,16 @@ struct pblk_g_ctx {
119 u64 lba; 119 u64 lba;
120}; 120};
121 121
122/* partial read context */
123struct pblk_pr_ctx {
124 struct bio *orig_bio;
125 DECLARE_BITMAP(bitmap, NVM_MAX_VLBA);
126 unsigned int orig_nr_secs;
127 unsigned int bio_init_idx;
128 void *ppa_ptr;
129 dma_addr_t dma_ppa_list;
130};
131
122/* Pad context */ 132/* Pad context */
123struct pblk_pad_rq { 133struct pblk_pad_rq {
124 struct pblk *pblk; 134 struct pblk *pblk;
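
The new pblk_pr_ctx carries the state needed to finish a partially served read: the original bio, a bitmap over its sectors (NVM_MAX_VLBA bits, a set bit marking a sector that did not need a media read), and the saved ppa list pointer plus its DMA address. A minimal sketch of how such a bitmap is typically consumed — the helper below is hypothetical, only the bitmap call is the standard kernel API:

    /* Hypothetical helper: how many sectors still need a media read?
     * Bits set in pr_ctx->bitmap mark sectors that are already valid. */
    static unsigned int pr_secs_left(struct pblk_pr_ctx *pr_ctx)
    {
            return pr_ctx->orig_nr_secs -
                   bitmap_weight(pr_ctx->bitmap, pr_ctx->orig_nr_secs);
    }
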
@@ -193,7 +203,7 @@ struct pblk_rb {
193 spinlock_t w_lock; /* Write lock */ 203 spinlock_t w_lock; /* Write lock */
194 spinlock_t s_lock; /* Sync lock */ 204 spinlock_t s_lock; /* Sync lock */
195 205
196#ifdef CONFIG_NVM_DEBUG 206#ifdef CONFIG_NVM_PBLK_DEBUG
197 atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ 207 atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
198#endif 208#endif
199}; 209};
@@ -608,9 +618,6 @@ struct pblk {
608 618
609 int min_write_pgs; /* Minimum amount of pages required by controller */ 619 int min_write_pgs; /* Minimum amount of pages required by controller */
610 int max_write_pgs; /* Maximum amount of pages supported by controller */ 620 int max_write_pgs; /* Maximum amount of pages supported by controller */
611 int pgs_in_buffer; /* Number of pages that need to be held in buffer to
612 * guarantee successful reads.
613 */
614 621
615 sector_t capacity; /* Device capacity when bad blocks are subtracted */ 622 sector_t capacity; /* Device capacity when bad blocks are subtracted */
616 623
@@ -639,7 +646,7 @@ struct pblk {
639 u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ 646 u64 nr_flush_rst; /* Flushes reset value for pad dist.*/
640 atomic64_t nr_flush; /* Number of flush/fua I/O */ 647 atomic64_t nr_flush; /* Number of flush/fua I/O */
641 648
642#ifdef CONFIG_NVM_DEBUG 649#ifdef CONFIG_NVM_PBLK_DEBUG
643 /* Non-persistent debug counters, 4kb sector I/Os */ 650 /* Non-persistent debug counters, 4kb sector I/Os */
644 atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ 651 atomic_long_t inflight_writes; /* Inflight writes (user and gc) */
645 atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ 652 atomic_long_t padded_writes; /* Sectors padded due to flush/fua */
@@ -706,6 +713,15 @@ struct pblk_line_ws {
706#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) 713#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx))
707#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) 714#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
708 715
716#define pblk_err(pblk, fmt, ...) \
717 pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
718#define pblk_info(pblk, fmt, ...) \
719 pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
720#define pblk_warn(pblk, fmt, ...) \
721 pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
722#define pblk_debug(pblk, fmt, ...) \
723 pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__)
724
709/* 725/*
710 * pblk ring buffer operations 726 * pblk ring buffer operations
711 */ 727 */
@@ -1282,20 +1298,22 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs)
1282 return !(nr_secs % pblk->min_write_pgs); 1298 return !(nr_secs % pblk->min_write_pgs);
1283} 1299}
1284 1300
1285#ifdef CONFIG_NVM_DEBUG 1301#ifdef CONFIG_NVM_PBLK_DEBUG
1286static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p, 1302static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p,
1287 char *msg, int error) 1303 char *msg, int error)
1288{ 1304{
1305 struct nvm_geo *geo = &pblk->dev->geo;
1306
1289 if (p->c.is_cached) { 1307 if (p->c.is_cached) {
1290 pr_err("ppa: (%s: %x) cache line: %llu\n", 1308 pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n",
1291 msg, error, (u64)p->c.line); 1309 msg, error, (u64)p->c.line);
1292 } else if (geo->version == NVM_OCSSD_SPEC_12) { 1310 } else if (geo->version == NVM_OCSSD_SPEC_12) {
1293 pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", 1311 pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
1294 msg, error, 1312 msg, error,
1295 p->g.ch, p->g.lun, p->g.blk, 1313 p->g.ch, p->g.lun, p->g.blk,
1296 p->g.pg, p->g.pl, p->g.sec); 1314 p->g.pg, p->g.pl, p->g.sec);
1297 } else { 1315 } else {
1298 pr_err("ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", 1316 pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
1299 msg, error, 1317 msg, error,
1300 p->m.grp, p->m.pu, p->m.chk, p->m.sec); 1318 p->m.grp, p->m.pu, p->m.chk, p->m.sec);
1301 } 1319 }
@@ -1307,16 +1325,16 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
1307 int bit = -1; 1325 int bit = -1;
1308 1326
1309 if (rqd->nr_ppas == 1) { 1327 if (rqd->nr_ppas == 1) {
1310 print_ppa(&pblk->dev->geo, &rqd->ppa_addr, "rqd", error); 1328 print_ppa(pblk, &rqd->ppa_addr, "rqd", error);
1311 return; 1329 return;
1312 } 1330 }
1313 1331
1314 while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, 1332 while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
1315 bit + 1)) < rqd->nr_ppas) { 1333 bit + 1)) < rqd->nr_ppas) {
1316 print_ppa(&pblk->dev->geo, &rqd->ppa_list[bit], "rqd", error); 1334 print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error);
1317 } 1335 }
1318 1336
1319 pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); 1337 pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
1320} 1338}
1321 1339
1322static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, 1340static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
@@ -1347,7 +1365,7 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
1347 continue; 1365 continue;
1348 } 1366 }
1349 1367
1350 print_ppa(geo, ppa, "boundary", i); 1368 print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i);
1351 1369
1352 return 1; 1370 return 1;
1353 } 1371 }
@@ -1377,7 +1395,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
1377 1395
1378 spin_lock(&line->lock); 1396 spin_lock(&line->lock);
1379 if (line->state != PBLK_LINESTATE_OPEN) { 1397 if (line->state != PBLK_LINESTATE_OPEN) {
1380 pr_err("pblk: bad ppa: line:%d,state:%d\n", 1398 pblk_err(pblk, "bad ppa: line:%d,state:%d\n",
1381 line->id, line->state); 1399 line->id, line->state);
1382 WARN_ON(1); 1400 WARN_ON(1);
1383 spin_unlock(&line->lock); 1401 spin_unlock(&line->lock);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6bf294f3907..05f82ff6f016 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
328 */ 328 */
329 atomic_t has_dirty; 329 atomic_t has_dirty;
330 330
331 /*
332 * Set to zero by things that touch the backing volume-- except
333 * writeback. Incremented by writeback. Used to determine when to
334 * accelerate idle writeback.
335 */
336 atomic_t backing_idle;
337
338 struct bch_ratelimit writeback_rate; 331 struct bch_ratelimit writeback_rate;
339 struct delayed_work writeback_rate_update; 332 struct delayed_work writeback_rate_update;
340 333
@@ -423,9 +416,9 @@ struct cache {
423 /* 416 /*
424 * When allocating new buckets, prio_write() gets first dibs - since we 417 * When allocating new buckets, prio_write() gets first dibs - since we
 425 * may not be able to allocate at all without writing priorities and gens. 418 * may not be able to allocate at all without writing priorities and gens.
426 * prio_buckets[] contains the last buckets we wrote priorities to (so 419 * prio_last_buckets[] contains the last buckets we wrote priorities to
427 * gc can mark them as metadata), prio_next[] contains the buckets 420 * (so gc can mark them as metadata), prio_buckets[] contains the
428 * allocated for the next prio write. 421 * buckets allocated for the next prio write.
429 */ 422 */
430 uint64_t *prio_buckets; 423 uint64_t *prio_buckets;
431 uint64_t *prio_last_buckets; 424 uint64_t *prio_last_buckets;
@@ -474,6 +467,7 @@ struct cache {
474 467
475struct gc_stat { 468struct gc_stat {
476 size_t nodes; 469 size_t nodes;
470 size_t nodes_pre;
477 size_t key_bytes; 471 size_t key_bytes;
478 472
479 size_t nkeys; 473 size_t nkeys;
@@ -514,6 +508,8 @@ struct cache_set {
514 struct cache_accounting accounting; 508 struct cache_accounting accounting;
515 509
516 unsigned long flags; 510 unsigned long flags;
511 atomic_t idle_counter;
512 atomic_t at_max_writeback_rate;
517 513
518 struct cache_sb sb; 514 struct cache_sb sb;
519 515
@@ -523,8 +519,10 @@ struct cache_set {
523 519
524 struct bcache_device **devices; 520 struct bcache_device **devices;
525 unsigned devices_max_used; 521 unsigned devices_max_used;
522 atomic_t attached_dev_nr;
526 struct list_head cached_devs; 523 struct list_head cached_devs;
527 uint64_t cached_dev_sectors; 524 uint64_t cached_dev_sectors;
525 atomic_long_t flash_dev_dirty_sectors;
528 struct closure caching; 526 struct closure caching;
529 527
530 struct closure sb_write; 528 struct closure sb_write;
@@ -603,6 +601,10 @@ struct cache_set {
603 */ 601 */
604 atomic_t rescale; 602 atomic_t rescale;
605 /* 603 /*
 604 * used by GC to identify whether any front-side I/O is in flight
605 */
606 atomic_t search_inflight;
607 /*
606 * When we invalidate buckets, we use both the priority and the amount 608 * When we invalidate buckets, we use both the priority and the amount
607 * of good data to determine which buckets to reuse first - to weight 609 * of good data to determine which buckets to reuse first - to weight
608 * those together consistently we keep track of the smallest nonzero 610 * those together consistently we keep track of the smallest nonzero
@@ -995,7 +997,7 @@ void bch_open_buckets_free(struct cache_set *);
995int bch_cache_allocator_start(struct cache *ca); 997int bch_cache_allocator_start(struct cache *ca);
996 998
997void bch_debug_exit(void); 999void bch_debug_exit(void);
998int bch_debug_init(struct kobject *); 1000void bch_debug_init(struct kobject *kobj);
999void bch_request_exit(void); 1001void bch_request_exit(void);
1000int bch_request_init(void); 1002int bch_request_init(void);
1001 1003
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index f3403b45bc28..596c93b44e9b 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -366,6 +366,10 @@ EXPORT_SYMBOL(bch_btree_keys_init);
366 366
367/* Binary tree stuff for auxiliary search trees */ 367/* Binary tree stuff for auxiliary search trees */
368 368
369/*
 370 * Return the array index next to j when doing an in-order traversal
 371 * of a binary tree which is stored in a linear array
372 */
369static unsigned inorder_next(unsigned j, unsigned size) 373static unsigned inorder_next(unsigned j, unsigned size)
370{ 374{
371 if (j * 2 + 1 < size) { 375 if (j * 2 + 1 < size) {
@@ -379,6 +383,10 @@ static unsigned inorder_next(unsigned j, unsigned size)
379 return j; 383 return j;
380} 384}
381 385
386/*
 387 * Return the array index previous to j when doing an in-order traversal
 388 * of a binary tree which is stored in a linear array
389 */
382static unsigned inorder_prev(unsigned j, unsigned size) 390static unsigned inorder_prev(unsigned j, unsigned size)
383{ 391{
384 if (j * 2 < size) { 392 if (j * 2 < size) {
@@ -421,6 +429,10 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
421 return j; 429 return j;
422} 430}
423 431
432/*
 433 * Return the cacheline index in bset_tree->data, where j is the index
 434 * into the linear array which stores the auxiliary binary tree
435 */
424static unsigned to_inorder(unsigned j, struct bset_tree *t) 436static unsigned to_inorder(unsigned j, struct bset_tree *t)
425{ 437{
426 return __to_inorder(j, t->size, t->extra); 438 return __to_inorder(j, t->size, t->extra);
@@ -441,6 +453,10 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
441 return j; 453 return j;
442} 454}
443 455
456/*
 457 * Return an index into the linear array which stores the auxiliary
 458 * binary tree; j is the cacheline index of t->data.
459 */
444static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) 460static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
445{ 461{
446 return __inorder_to_tree(j, t->size, t->extra); 462 return __inorder_to_tree(j, t->size, t->extra);
@@ -546,6 +562,20 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
546 return low; 562 return low;
547} 563}
548 564
565/*
566 * Calculate mantissa value for struct bkey_float.
 567 * If the most significant bit of f->exponent is not set, then
568 * - f->exponent >> 6 is 0
569 * - p[0] points to bkey->low
570 * - p[-1] borrows bits from KEY_INODE() of bkey->high
 571 * If the most significant bit of f->exponent is set, then
572 * - f->exponent >> 6 is 1
573 * - p[0] points to bits from KEY_INODE() of bkey->high
574 * - p[-1] points to other bits from KEY_INODE() of
575 * bkey->high too.
 576 * See make_bfloat() for when the most significant bit of f->exponent
577 * is set or not.
578 */
549static inline unsigned bfloat_mantissa(const struct bkey *k, 579static inline unsigned bfloat_mantissa(const struct bkey *k,
550 struct bkey_float *f) 580 struct bkey_float *f)
551{ 581{
@@ -570,6 +600,16 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
570 BUG_ON(m < l || m > r); 600 BUG_ON(m < l || m > r);
571 BUG_ON(bkey_next(p) != m); 601 BUG_ON(bkey_next(p) != m);
572 602
603 /*
604 * If l and r have different KEY_INODE values (different backing
605 * device), f->exponent records how many least significant bits
606 * are different in KEY_INODE values and sets most significant
607 * bits to 1 (by +64).
 608 * If l and r have the same KEY_INODE value, f->exponent records
 609 * how many bits differ in the least significant bits of bkey->low.
 610 * See bfloat_mantissa() for how the most significant bit of
611 * f->exponent is used to calculate bfloat mantissa value.
612 */
573 if (KEY_INODE(l) != KEY_INODE(r)) 613 if (KEY_INODE(l) != KEY_INODE(r))
574 f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; 614 f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64;
575 else 615 else
@@ -633,6 +673,15 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
633} 673}
634EXPORT_SYMBOL(bch_bset_init_next); 674EXPORT_SYMBOL(bch_bset_init_next);
635 675
676/*
 677 * Build the auxiliary binary tree 'struct bset_tree *t'; this tree is used
 678 * to accelerate bkey search in a btree node (pointed to by bset_tree->data
 679 * in memory). After searching the auxiliary tree by calling
 680 * bset_search_tree(), a struct bset_search_iter is returned indicating the
 681 * range [l, r] in bset_tree->data where the bkey being searched for might
 682 * be. A following linear comparison then does the exact search; see __bch_bset_search() for how
683 * the auxiliary tree is used.
684 */
636void bch_bset_build_written_tree(struct btree_keys *b) 685void bch_bset_build_written_tree(struct btree_keys *b)
637{ 686{
638 struct bset_tree *t = bset_tree_last(b); 687 struct bset_tree *t = bset_tree_last(b);
@@ -898,6 +947,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
898 unsigned inorder, j, n = 1; 947 unsigned inorder, j, n = 1;
899 948
900 do { 949 do {
950 /*
951 * A bit trick here.
 952 * If p < t->size, (int)(p - t->size) is negative, so its most
 953 * significant bit is set and an arithmetic right shift by 31
 954 * yields all ones (-1). If p >= t->size, the most significant
 955 * bit is clear and the shift yields 0.
 956 * So the following 2 lines are equivalent to
957 * if (p >= t->size)
958 * p = 0;
959 * but a branch instruction is avoided.
960 */
901 unsigned p = n << 4; 961 unsigned p = n << 4;
902 p &= ((int) (p - t->size)) >> 31; 962 p &= ((int) (p - t->size)) >> 31;
903 963
@@ -907,6 +967,9 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
907 f = &t->tree[j]; 967 f = &t->tree[j];
908 968
909 /* 969 /*
 970 * Similar bit trick: a subtraction is used to avoid a branch
971 * instruction.
972 *
910 * n = (f->mantissa > bfloat_mantissa()) 973 * n = (f->mantissa > bfloat_mantissa())
911 * ? j * 2 974 * ? j * 2
912 * : j * 2 + 1; 975 * : j * 2 + 1;
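
The same branchless clamp appears twice in bset_search_tree(); isolated into user-space C it is easy to test (this assumes the arithmetic right shift of a negative signed int that the kernel relies on throughout):

    #include <assert.h>

    /* Return p when p < size, else 0 - without a branch. */
    static unsigned int clamp_below(unsigned int p, unsigned int size)
    {
        /* (int)(p - size) is negative iff p < size; shifting it right
         * by 31 gives all ones (-1) or 0, which masks p accordingly. */
        p &= ((int)(p - size)) >> 31;
        return p;
    }

    int main(void)
    {
        assert(clamp_below(5, 16) == 5);   /* p < size: p kept  */
        assert(clamp_below(16, 16) == 0);  /* p >= size: zeroed */
        assert(clamp_below(40, 16) == 0);
        return 0;
    }
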
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 547c9eedc2f4..c19f7716df88 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -90,6 +90,9 @@
90 90
91#define MAX_NEED_GC 64 91#define MAX_NEED_GC 64
92#define MAX_SAVE_PRIO 72 92#define MAX_SAVE_PRIO 72
93#define MAX_GC_TIMES 100
94#define MIN_GC_NODES 100
95#define GC_SLEEP_MS 100
93 96
94#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) 97#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
95 98
@@ -1008,6 +1011,13 @@ retry:
1008 BUG_ON(b->level != level); 1011 BUG_ON(b->level != level);
1009 } 1012 }
1010 1013
1014 if (btree_node_io_error(b)) {
1015 rw_unlock(write, b);
1016 return ERR_PTR(-EIO);
1017 }
1018
1019 BUG_ON(!b->written);
1020
1011 b->parent = parent; 1021 b->parent = parent;
1012 b->accessed = 1; 1022 b->accessed = 1;
1013 1023
@@ -1019,13 +1029,6 @@ retry:
1019 for (; i <= b->keys.nsets; i++) 1029 for (; i <= b->keys.nsets; i++)
1020 prefetch(b->keys.set[i].data); 1030 prefetch(b->keys.set[i].data);
1021 1031
1022 if (btree_node_io_error(b)) {
1023 rw_unlock(write, b);
1024 return ERR_PTR(-EIO);
1025 }
1026
1027 BUG_ON(!b->written);
1028
1029 return b; 1032 return b;
1030} 1033}
1031 1034
@@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b)
1520 return ret; 1523 return ret;
1521} 1524}
1522 1525
1526static size_t btree_gc_min_nodes(struct cache_set *c)
1527{
1528 size_t min_nodes;
1529
1530 /*
 1531 * Incremental GC sleeps for 100ms whenever front-side
 1532 * I/O arrives. If GC processed only a constant number
 1533 * (100) of nodes on each pass, a large btree would keep
 1534 * GC running for a long time, and the front-side I/Os
 1535 * would run out of buckets (since no new bucket can be
 1536 * allocated during GC) and be blocked again.
 1537 * So GC should not process a constant number of nodes,
 1538 * but a number that varies with the total btree size.
 1539 * This is realized by dividing GC into a constant (100)
 1540 * number of passes: when there are many btree nodes, GC
 1541 * processes more nodes per pass; otherwise it processes
 1542 * fewer per pass (but never fewer than MIN_GC_NODES).
1543 */
1544 min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
1545 if (min_nodes < MIN_GC_NODES)
1546 min_nodes = MIN_GC_NODES;
1547
1548 return min_nodes;
1549}
1550
1551
1523static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1552static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1524 struct closure *writes, struct gc_stat *gc) 1553 struct closure *writes, struct gc_stat *gc)
1525{ 1554{
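
In numbers, with the constants defined at the top of btree.c:

    /* c->gc_stats.nodes = 80000: 80000 / MAX_GC_TIMES = 800 nodes,
     *   above MIN_GC_NODES, so each GC pass handles at least 800 nodes.
     * c->gc_stats.nodes = 2000:  2000 / 100 = 20, which is below
     *   MIN_GC_NODES, so it is clamped up to 100 nodes per pass. */
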
@@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1585 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); 1614 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1586 r->b = NULL; 1615 r->b = NULL;
1587 1616
1617 if (atomic_read(&b->c->search_inflight) &&
1618 gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
1619 gc->nodes_pre = gc->nodes;
1620 ret = -EAGAIN;
1621 break;
1622 }
1623
1588 if (need_resched()) { 1624 if (need_resched()) {
1589 ret = -EAGAIN; 1625 ret = -EAGAIN;
1590 break; 1626 break;
@@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c)
1753 closure_sync(&writes); 1789 closure_sync(&writes);
1754 cond_resched(); 1790 cond_resched();
1755 1791
1756 if (ret && ret != -EAGAIN) 1792 if (ret == -EAGAIN)
1793 schedule_timeout_interruptible(msecs_to_jiffies
1794 (GC_SLEEP_MS));
1795 else if (ret)
1757 pr_warn("gc failed!"); 1796 pr_warn("gc failed!");
1758 } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); 1797 } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
1759 1798
@@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
1834 do { 1873 do {
1835 k = bch_btree_iter_next_filter(&iter, &b->keys, 1874 k = bch_btree_iter_next_filter(&iter, &b->keys,
1836 bch_ptr_bad); 1875 bch_ptr_bad);
1837 if (k) 1876 if (k) {
1838 btree_node_prefetch(b, k); 1877 btree_node_prefetch(b, k);
1878 /*
 1879 * initialize c->gc_stats.nodes
1880 * for incremental GC
1881 */
1882 b->c->gc_stats.nodes++;
1883 }
1839 1884
1840 if (p) 1885 if (p)
1841 ret = btree(check_recurse, p, b, op); 1886 ret = btree(check_recurse, p, b, op);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d211e2c25b6b..68e9d926134d 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -152,7 +152,7 @@ static inline bool btree_node_ ## flag(struct btree *b) \
152{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ 152{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
153 \ 153 \
154static inline void set_btree_node_ ## flag(struct btree *b) \ 154static inline void set_btree_node_ ## flag(struct btree *b) \
155{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ 155{ set_bit(BTREE_NODE_ ## flag, &b->flags); }
156 156
157enum btree_flags { 157enum btree_flags {
158 BTREE_NODE_io_error, 158 BTREE_NODE_io_error,
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 0e14969182c6..618253683d40 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -199,11 +199,16 @@ static const struct file_operations debug_ops = {
199 .release = single_release 199 .release = single_release
200}; 200};
201 201
202int __init closure_debug_init(void) 202void __init closure_debug_init(void)
203{ 203{
204 closure_debug = debugfs_create_file("closures", 204 if (!IS_ERR_OR_NULL(bcache_debug))
205 0400, bcache_debug, NULL, &debug_ops); 205 /*
 206 return IS_ERR_OR_NULL(closure_debug); 206 * it is unnecessary to check the return value of
 207 * debugfs_create_file(); we should not care
208 * about this.
209 */
210 closure_debug = debugfs_create_file(
211 "closures", 0400, bcache_debug, NULL, &debug_ops);
207} 212}
208#endif 213#endif
209 214
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 71427eb5fdae..7c2c5bc7c88b 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -186,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
186 186
187#ifdef CONFIG_BCACHE_CLOSURES_DEBUG 187#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
188 188
189int closure_debug_init(void); 189void closure_debug_init(void);
190void closure_debug_create(struct closure *cl); 190void closure_debug_create(struct closure *cl);
191void closure_debug_destroy(struct closure *cl); 191void closure_debug_destroy(struct closure *cl);
192 192
193#else 193#else
194 194
195static inline int closure_debug_init(void) { return 0; } 195static inline void closure_debug_init(void) {}
196static inline void closure_debug_create(struct closure *cl) {} 196static inline void closure_debug_create(struct closure *cl) {}
197static inline void closure_debug_destroy(struct closure *cl) {} 197static inline void closure_debug_destroy(struct closure *cl) {}
198 198
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index d030ce3025a6..12034c07257b 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
110 struct bio_vec bv, cbv; 110 struct bio_vec bv, cbv;
111 struct bvec_iter iter, citer = { 0 }; 111 struct bvec_iter iter, citer = { 0 };
112 112
113 check = bio_clone_kmalloc(bio, GFP_NOIO); 113 check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
114 if (!check) 114 if (!check)
115 return; 115 return;
116 check->bi_disk = bio->bi_disk;
116 check->bi_opf = REQ_OP_READ; 117 check->bi_opf = REQ_OP_READ;
118 check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
119 check->bi_iter.bi_size = bio->bi_iter.bi_size;
117 120
121 bch_bio_map(check, NULL);
118 if (bch_bio_alloc_pages(check, GFP_NOIO)) 122 if (bch_bio_alloc_pages(check, GFP_NOIO))
119 goto out_put; 123 goto out_put;
120 124
@@ -248,11 +252,12 @@ void bch_debug_exit(void)
248 debugfs_remove_recursive(bcache_debug); 252 debugfs_remove_recursive(bcache_debug);
249} 253}
250 254
251int __init bch_debug_init(struct kobject *kobj) 255void __init bch_debug_init(struct kobject *kobj)
252{ 256{
253 if (!IS_ENABLED(CONFIG_DEBUG_FS)) 257 /*
 254 return 0; 258 * it is unnecessary to check the return value of
 255 259 * debugfs_create_dir(); we should not care
260 * about this.
261 */
256 bcache_debug = debugfs_create_dir("bcache", NULL); 262 bcache_debug = debugfs_create_dir("bcache", NULL);
257 return IS_ERR_OR_NULL(bcache_debug);
258} 263}
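
Both conversions (closure_debug_init() above and bch_debug_init() here) follow the debugfs convention that creation failures must never affect normal driver operation, which is why the return types become void. The convention in a minimal, self-contained form (names are illustrative):

    #include <linux/debugfs.h>

    static struct dentry *my_debug_dir;

    static void my_debug_init(void)
    {
            /* best effort: the driver must work whether or not the
             * dentry exists, so the result is deliberately unchecked */
            my_debug_dir = debugfs_create_dir("mydrv", NULL);
    }
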
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 18f1b5239620..10748c626a1d 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c)
828 free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); 828 free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
829 free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); 829 free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
830 free_fifo(&c->journal.pin); 830 free_fifo(&c->journal.pin);
831 free_heap(&c->flush_btree);
831} 832}
832 833
833int bch_journal_alloc(struct cache_set *c) 834int bch_journal_alloc(struct cache_set *c)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..7dbe8b6316a0 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -107,7 +107,7 @@ static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
107 /* 107 /*
108 * The journalling code doesn't handle the case where the keys to insert 108 * The journalling code doesn't handle the case where the keys to insert
109 * is bigger than an empty write: If we just return -ENOMEM here, 109 * is bigger than an empty write: If we just return -ENOMEM here,
110 * bio_insert() and bio_invalidate() will insert the keys created so far 110 * bch_data_insert_keys() will insert the keys created so far
111 * and finish the rest when the keylist is empty. 111 * and finish the rest when the keylist is empty.
112 */ 112 */
113 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) 113 if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
@@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
667static void bio_complete(struct search *s) 667static void bio_complete(struct search *s)
668{ 668{
669 if (s->orig_bio) { 669 if (s->orig_bio) {
670 generic_end_io_acct(s->d->disk->queue, 670 generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
671 bio_data_dir(s->orig_bio),
672 &s->d->disk->part0, s->start_time); 671 &s->d->disk->part0, s->start_time);
673 672
674 trace_bcache_request_end(s->d, s->orig_bio); 673 trace_bcache_request_end(s->d, s->orig_bio);
@@ -702,6 +701,8 @@ static void search_free(struct closure *cl)
702{ 701{
703 struct search *s = container_of(cl, struct search, cl); 702 struct search *s = container_of(cl, struct search, cl);
704 703
704 atomic_dec(&s->d->c->search_inflight);
705
705 if (s->iop.bio) 706 if (s->iop.bio)
706 bio_put(s->iop.bio); 707 bio_put(s->iop.bio);
707 708
@@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio,
719 720
720 closure_init(&s->cl, NULL); 721 closure_init(&s->cl, NULL);
721 do_bio_hook(s, bio, request_endio); 722 do_bio_hook(s, bio, request_endio);
723 atomic_inc(&d->c->search_inflight);
722 724
723 s->orig_bio = bio; 725 s->orig_bio = bio;
724 s->cache_miss = NULL; 726 s->cache_miss = NULL;
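
search_inflight, declared in bcache.h earlier in this diff, is the channel between the request path and incremental GC: search_alloc() increments it when a front-side request enters, search_free() decrements it, and btree_gc_recurse() polls it to decide when to back off. Reduced to its core (the condition name below is invented for illustration):

    atomic_inc(&d->c->search_inflight);     /* request enters (search_alloc) */
    /* ... request runs ... */
    atomic_dec(&s->d->c->search_inflight);  /* request done (search_free) */

    /* GC side: yield if front-side I/O is waiting and this pass
     * has already processed its share of nodes */
    if (atomic_read(&b->c->search_inflight) && processed_enough_nodes)
            ret = -EAGAIN;
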
@@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio)
1062 bio->bi_end_io = ddip->bi_end_io; 1064 bio->bi_end_io = ddip->bi_end_io;
1063 bio->bi_private = ddip->bi_private; 1065 bio->bi_private = ddip->bi_private;
1064 1066
1065 generic_end_io_acct(ddip->d->disk->queue, 1067 generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
1066 bio_data_dir(bio),
1067 &ddip->d->disk->part0, ddip->start_time); 1068 &ddip->d->disk->part0, ddip->start_time);
1068 1069
1069 if (bio->bi_status) { 1070 if (bio->bi_status) {
@@ -1102,6 +1103,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
1102 generic_make_request(bio); 1103 generic_make_request(bio);
1103} 1104}
1104 1105
1106static void quit_max_writeback_rate(struct cache_set *c,
1107 struct cached_dev *this_dc)
1108{
1109 int i;
1110 struct bcache_device *d;
1111 struct cached_dev *dc;
1112
1113 /*
 1114 * The mutex bch_register_lock may be contended by other parallel
 1115 * requesters, or by attach/detach operations on another backing
 1116 * device. Waiting for the mutex may increase I/O request latency
 1117 * by seconds or more. To avoid that, if mutex_trylock() fails,
 1118 * only the writeback rate of the current cached device is set to
 1119 * 1, and __update_writeback_rate() will decide the writeback rate
 1120 * of the other cached devices (remember c->idle_counter is 0 now).
1121 */
1122 if (mutex_trylock(&bch_register_lock)) {
1123 for (i = 0; i < c->devices_max_used; i++) {
1124 if (!c->devices[i])
1125 continue;
1126
1127 if (UUID_FLASH_ONLY(&c->uuids[i]))
1128 continue;
1129
1130 d = c->devices[i];
1131 dc = container_of(d, struct cached_dev, disk);
1132 /*
1133 * set writeback rate to default minimum value,
1134 * then let update_writeback_rate() to decide the
1135 * upcoming rate.
1136 */
1137 atomic_long_set(&dc->writeback_rate.rate, 1);
1138 }
1139 mutex_unlock(&bch_register_lock);
1140 } else
1141 atomic_long_set(&this_dc->writeback_rate.rate, 1);
1142}
1143
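
The trylock-or-degrade shape of quit_max_writeback_rate() generalizes to any case where a slow path wants a shared lock but must never stall the fast path. Schematically (the helpers here are hypothetical):

    if (mutex_trylock(&registry_lock)) {
            /* uncontended: cheap to adjust every device */
            for (i = 0; i < ndevices; i++)
                    set_rate_minimum(devices[i]);
            mutex_unlock(&registry_lock);
    } else {
            /* contended: degrade only ourselves instead of
             * blocking the I/O submission path on the mutex */
            set_rate_minimum(this_dev);
    }
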
1105/* Cached devices - read & write stuff */ 1144/* Cached devices - read & write stuff */
1106 1145
1107static blk_qc_t cached_dev_make_request(struct request_queue *q, 1146static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1119,8 +1158,25 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
1119 return BLK_QC_T_NONE; 1158 return BLK_QC_T_NONE;
1120 } 1159 }
1121 1160
1122 atomic_set(&dc->backing_idle, 0); 1161 if (likely(d->c)) {
1123 generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); 1162 if (atomic_read(&d->c->idle_counter))
1163 atomic_set(&d->c->idle_counter, 0);
1164 /*
1165 * If at_max_writeback_rate of cache set is true and new I/O
1166 * comes, quit max writeback rate of all cached devices
1167 * attached to this cache set, and set at_max_writeback_rate
1168 * to false.
1169 */
1170 if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
1171 atomic_set(&d->c->at_max_writeback_rate, 0);
1172 quit_max_writeback_rate(d->c, dc);
1173 }
1174 }
1175
1176 generic_start_io_acct(q,
1177 bio_op(bio),
1178 bio_sectors(bio),
1179 &d->disk->part0);
1124 1180
1125 bio_set_dev(bio, dc->bdev); 1181 bio_set_dev(bio, dc->bdev);
1126 bio->bi_iter.bi_sector += dc->sb.data_offset; 1182 bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1229,7 +1285,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
1229 struct search *s; 1285 struct search *s;
1230 struct closure *cl; 1286 struct closure *cl;
1231 struct bcache_device *d = bio->bi_disk->private_data; 1287 struct bcache_device *d = bio->bi_disk->private_data;
1232 int rw = bio_data_dir(bio);
1233 1288
1234 if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { 1289 if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
1235 bio->bi_status = BLK_STS_IOERR; 1290 bio->bi_status = BLK_STS_IOERR;
@@ -1237,7 +1292,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
1237 return BLK_QC_T_NONE; 1292 return BLK_QC_T_NONE;
1238 } 1293 }
1239 1294
1240 generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); 1295 generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
1241 1296
1242 s = search_alloc(bio, d); 1297 s = search_alloc(bio, d);
1243 cl = &s->cl; 1298 cl = &s->cl;
@@ -1254,7 +1309,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
1254 flash_dev_nodata, 1309 flash_dev_nodata,
1255 bcache_wq); 1310 bcache_wq);
1256 return BLK_QC_T_NONE; 1311 return BLK_QC_T_NONE;
1257 } else if (rw) { 1312 } else if (bio_data_dir(bio)) {
1258 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, 1313 bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
1259 &KEY(d->id, bio->bi_iter.bi_sector, 0), 1314 &KEY(d->id, bio->bi_iter.bi_sector, 0),
1260 &KEY(d->id, bio_end_sector(bio), 0)); 1315 &KEY(d->id, bio_end_sector(bio), 0));
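
The rw/bio_data_dir() to bio_op() switches in this file track a block-layer API change: generic_start_io_acct() and generic_end_io_acct() now take the request operation (REQ_OP_READ, REQ_OP_WRITE, and so on) instead of a 0/1 data direction, letting the accounting core group operations itself. Side by side:

    /* old: direction flag */
    generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);

    /* new: explicit request op */
    generic_start_io_acct(q, bio_op(bio), bio_sectors(bio),
                          &d->disk->part0);

bio_data_dir() survives only where a plain read/write split is still wanted, as in the flash_dev_make_request() branch above.
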
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fa4058e43202..55a37641aa95 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
181 goto err; 181 goto err;
182 } 182 }
183 183
184 sb->last_mount = get_seconds(); 184 sb->last_mount = (u32)ktime_get_real_seconds();
185 err = NULL; 185 err = NULL;
186 186
187 get_page(bh->b_page); 187 get_page(bh->b_page);
@@ -696,12 +696,14 @@ static void bcache_device_detach(struct bcache_device *d)
696{ 696{
697 lockdep_assert_held(&bch_register_lock); 697 lockdep_assert_held(&bch_register_lock);
698 698
699 atomic_dec(&d->c->attached_dev_nr);
700
699 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { 701 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
700 struct uuid_entry *u = d->c->uuids + d->id; 702 struct uuid_entry *u = d->c->uuids + d->id;
701 703
702 SET_UUID_FLASH_ONLY(u, 0); 704 SET_UUID_FLASH_ONLY(u, 0);
703 memcpy(u->uuid, invalid_uuid, 16); 705 memcpy(u->uuid, invalid_uuid, 16);
704 u->invalidated = cpu_to_le32(get_seconds()); 706 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
705 bch_uuid_write(d->c); 707 bch_uuid_write(d->c);
706 } 708 }
707 709
@@ -796,11 +798,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
796 return idx; 798 return idx;
797 799
798 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), 800 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
799 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || 801 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
800 !(d->disk = alloc_disk(BCACHE_MINORS))) { 802 goto err;
801 ida_simple_remove(&bcache_device_idx, idx); 803
802 return -ENOMEM; 804 d->disk = alloc_disk(BCACHE_MINORS);
803 } 805 if (!d->disk)
806 goto err;
804 807
805 set_capacity(d->disk, sectors); 808 set_capacity(d->disk, sectors);
806 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); 809 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -834,6 +837,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
834 blk_queue_write_cache(q, true, true); 837 blk_queue_write_cache(q, true, true);
835 838
836 return 0; 839 return 0;
840
841err:
842 ida_simple_remove(&bcache_device_idx, idx);
843 return -ENOMEM;
844
837} 845}
838 846
839/* Cached device */ 847/* Cached device */
@@ -1027,7 +1035,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
1027int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, 1035int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1028 uint8_t *set_uuid) 1036 uint8_t *set_uuid)
1029{ 1037{
1030 uint32_t rtime = cpu_to_le32(get_seconds()); 1038 uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
1031 struct uuid_entry *u; 1039 struct uuid_entry *u;
1032 struct cached_dev *exist_dc, *t; 1040 struct cached_dev *exist_dc, *t;
1033 1041
@@ -1070,7 +1078,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1070 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || 1078 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1071 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { 1079 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1072 memcpy(u->uuid, invalid_uuid, 16); 1080 memcpy(u->uuid, invalid_uuid, 16);
1073 u->invalidated = cpu_to_le32(get_seconds()); 1081 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
1074 u = NULL; 1082 u = NULL;
1075 } 1083 }
1076 1084
@@ -1138,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1138 1146
1139 bch_cached_dev_run(dc); 1147 bch_cached_dev_run(dc);
1140 bcache_device_link(&dc->disk, c, "bdev"); 1148 bcache_device_link(&dc->disk, c, "bdev");
1149 atomic_inc(&c->attached_dev_nr);
1141 1150
1142 /* Allow the writeback thread to proceed */ 1151 /* Allow the writeback thread to proceed */
1143 up_write(&dc->writeback_lock); 1152 up_write(&dc->writeback_lock);
@@ -1285,6 +1294,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
1285 pr_info("registered backing device %s", dc->backing_dev_name); 1294 pr_info("registered backing device %s", dc->backing_dev_name);
1286 1295
1287 list_add(&dc->list, &uncached_devices); 1296 list_add(&dc->list, &uncached_devices);
1297 /* attach to a matched cache set if it exists */
1288 list_for_each_entry(c, &bch_cache_sets, list) 1298 list_for_each_entry(c, &bch_cache_sets, list)
1289 bch_cached_dev_attach(dc, c, NULL); 1299 bch_cached_dev_attach(dc, c, NULL);
1290 1300
@@ -1311,6 +1321,8 @@ static void flash_dev_free(struct closure *cl)
1311{ 1321{
1312 struct bcache_device *d = container_of(cl, struct bcache_device, cl); 1322 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1313 mutex_lock(&bch_register_lock); 1323 mutex_lock(&bch_register_lock);
1324 atomic_long_sub(bcache_dev_sectors_dirty(d),
1325 &d->c->flash_dev_dirty_sectors);
1314 bcache_device_free(d); 1326 bcache_device_free(d);
1315 mutex_unlock(&bch_register_lock); 1327 mutex_unlock(&bch_register_lock);
1316 kobject_put(&d->kobj); 1328 kobject_put(&d->kobj);
@@ -1390,7 +1402,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1390 1402
1391 get_random_bytes(u->uuid, 16); 1403 get_random_bytes(u->uuid, 16);
1392 memset(u->label, 0, 32); 1404 memset(u->label, 0, 32);
1393 u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); 1405 u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1394 1406
1395 SET_UUID_FLASH_ONLY(u, 1); 1407 SET_UUID_FLASH_ONLY(u, 1);
1396 u->sectors = size >> 9; 1408 u->sectors = size >> 9;
@@ -1687,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1687 c->block_bits = ilog2(sb->block_size); 1699 c->block_bits = ilog2(sb->block_size);
1688 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); 1700 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1689 c->devices_max_used = 0; 1701 c->devices_max_used = 0;
1702 atomic_set(&c->attached_dev_nr, 0);
1690 c->btree_pages = bucket_pages(c); 1703 c->btree_pages = bucket_pages(c);
1691 if (c->btree_pages > BTREE_MAX_PAGES) 1704 if (c->btree_pages > BTREE_MAX_PAGES)
1692 c->btree_pages = max_t(int, c->btree_pages / 4, 1705 c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1894,7 +1907,7 @@ static void run_cache_set(struct cache_set *c)
1894 goto err; 1907 goto err;
1895 1908
1896 closure_sync(&cl); 1909 closure_sync(&cl);
1897 c->sb.last_mount = get_seconds(); 1910 c->sb.last_mount = (u32)ktime_get_real_seconds();
1898 bcache_write_super(c); 1911 bcache_write_super(c);
1899 1912
1900 list_for_each_entry_safe(dc, t, &uncached_devices, list) 1913 list_for_each_entry_safe(dc, t, &uncached_devices, list)
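
The get_seconds() replacements in this file are y2038 preparation: get_seconds() returns unsigned long, which is 32 bits on 32-bit architectures and overflows in 2038, while ktime_get_real_seconds() always returns a 64-bit time64_t. The explicit (u32) casts keep the on-disk superblock and uuid_entry fields at their existing 32-bit width:

    time64_t now = ktime_get_real_seconds(); /* always 64-bit */
    sb->last_mount = (u32)now;               /* on-disk field stays 32-bit */
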
@@ -2163,8 +2176,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2163 if (!try_module_get(THIS_MODULE)) 2176 if (!try_module_get(THIS_MODULE))
2164 return -EBUSY; 2177 return -EBUSY;
2165 2178
2166 if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || 2179 path = kstrndup(buffer, size, GFP_KERNEL);
2167 !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) 2180 if (!path)
2181 goto err;
2182
2183 sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2184 if (!sb)
2168 goto err; 2185 goto err;
2169 2186
2170 err = "failed to open device"; 2187 err = "failed to open device";
@@ -2324,13 +2341,21 @@ static int __init bcache_init(void)
2324 return bcache_major; 2341 return bcache_major;
2325 } 2342 }
2326 2343
2327 if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || 2344 bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2328 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || 2345 if (!bcache_wq)
2329 bch_request_init() || 2346 goto err;
2330 bch_debug_init(bcache_kobj) || closure_debug_init() || 2347
2348 bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2349 if (!bcache_kobj)
2350 goto err;
2351
2352 if (bch_request_init() ||
2331 sysfs_create_files(bcache_kobj, files)) 2353 sysfs_create_files(bcache_kobj, files))
2332 goto err; 2354 goto err;
2333 2355
2356 bch_debug_init(bcache_kobj);
2357 closure_debug_init();
2358
2334 return 0; 2359 return 0;
2335err: 2360err:
2336 bcache_exit(); 2361 bcache_exit();
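
register_bcache() and bcache_init() get the same treatment here: one compound condition full of assignments is unrolled into discrete steps sharing a single error label, with bcache_exit() required to cope with whatever subset of the initialization actually ran. The shape of the rewrite, with hypothetical helpers:

    /* before: failures indistinguishable, assignments buried in the test */
    if (!(a = make_a()) || !(b = make_b()) || setup_c())
            goto err;

    /* after: one step at a time, same single error label */
    a = make_a();
    if (!a)
            goto err;

    b = make_b();
    if (!b)
            goto err;

    if (setup_c())
            goto err;
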
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 225b15aa0340..81d3520b0702 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -149,6 +149,7 @@ SHOW(__bch_cached_dev)
149 struct cached_dev *dc = container_of(kobj, struct cached_dev, 149 struct cached_dev *dc = container_of(kobj, struct cached_dev,
150 disk.kobj); 150 disk.kobj);
151 const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; 151 const char *states[] = { "no cache", "clean", "dirty", "inconsistent" };
152 int wb = dc->writeback_running;
152 153
153#define var(stat) (dc->stat) 154#define var(stat) (dc->stat)
154 155
@@ -170,7 +171,8 @@ SHOW(__bch_cached_dev)
170 var_printf(writeback_running, "%i"); 171 var_printf(writeback_running, "%i");
171 var_print(writeback_delay); 172 var_print(writeback_delay);
172 var_print(writeback_percent); 173 var_print(writeback_percent);
173 sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); 174 sysfs_hprint(writeback_rate,
175 wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
174 sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); 176 sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
175 sysfs_printf(io_error_limit, "%i", dc->error_limit); 177 sysfs_printf(io_error_limit, "%i", dc->error_limit);
176 sysfs_printf(io_disable, "%i", dc->io_disable); 178 sysfs_printf(io_disable, "%i", dc->io_disable);
@@ -188,15 +190,22 @@ SHOW(__bch_cached_dev)
188 char change[20]; 190 char change[20];
189 s64 next_io; 191 s64 next_io;
190 192
191 bch_hprint(rate, dc->writeback_rate.rate << 9); 193 /*
192 bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); 194 * Except for dirty and target, other values should
193 bch_hprint(target, dc->writeback_rate_target << 9); 195 * be 0 if writeback is not running.
194 bch_hprint(proportional,dc->writeback_rate_proportional << 9); 196 */
195 bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); 197 bch_hprint(rate,
196 bch_hprint(change, dc->writeback_rate_change << 9); 198 wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
197 199 : 0);
198 next_io = div64_s64(dc->writeback_rate.next - local_clock(), 200 bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
199 NSEC_PER_MSEC); 201 bch_hprint(target, dc->writeback_rate_target << 9);
202 bch_hprint(proportional,
203 wb ? dc->writeback_rate_proportional << 9 : 0);
204 bch_hprint(integral,
205 wb ? dc->writeback_rate_integral_scaled << 9 : 0);
206 bch_hprint(change, wb ? dc->writeback_rate_change << 9 : 0);
207 next_io = wb ? div64_s64(dc->writeback_rate.next-local_clock(),
208 NSEC_PER_MSEC) : 0;
200 209
201 return sprintf(buf, 210 return sprintf(buf,
202 "rate:\t\t%s/sec\n" 211 "rate:\t\t%s/sec\n"
@@ -255,8 +264,19 @@ STORE(__cached_dev)
255 264
256 sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); 265 sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
257 266
258 sysfs_strtoul_clamp(writeback_rate, 267 if (attr == &sysfs_writeback_rate) {
259 dc->writeback_rate.rate, 1, INT_MAX); 268 ssize_t ret;
269 long int v = atomic_long_read(&dc->writeback_rate.rate);
270
271 ret = strtoul_safe_clamp(buf, v, 1, INT_MAX);
272
273 if (!ret) {
274 atomic_long_set(&dc->writeback_rate.rate, v);
275 ret = size;
276 }
277
278 return ret;
279 }
260 280
261 sysfs_strtoul_clamp(writeback_rate_update_seconds, 281 sysfs_strtoul_clamp(writeback_rate_update_seconds,
262 dc->writeback_rate_update_seconds, 282 dc->writeback_rate_update_seconds,
@@ -338,8 +358,8 @@ STORE(__cached_dev)
338 if (!v) 358 if (!v)
339 return size; 359 return size;
340 } 360 }
341 361 if (v == -ENOENT)
342 pr_err("Can't attach %s: cache set not found", buf); 362 pr_err("Can't attach %s: cache set not found", buf);
343 return v; 363 return v;
344 } 364 }
345 365
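The writeback_rate store above switches from the sysfs_strtoul_clamp() macro to an explicit parse, clamp, and atomic_long_set() sequence, so concurrent readers of the rate never observe a torn or out-of-range value. A minimal userspace sketch of the same parse-clamp-store pattern, using C11 atomics in place of the kernel's atomic_long_t (names and headers here are illustrative, not the kernel API):

    #include <errno.h>
    #include <limits.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static atomic_long writeback_rate = 1024;

    /* Parse buf, clamp to [1, INT_MAX], store only on success. */
    static int store_writeback_rate(const char *buf)
    {
            char *end;
            long v = strtol(buf, &end, 10);

            if (end == buf)
                    return -EINVAL;
            if (v < 1)
                    v = 1;
            if (v > INT_MAX)
                    v = INT_MAX;

            /* One atomic store: readers never see a torn value. */
            atomic_store(&writeback_rate, v);
            return 0;
    }

    int main(void)
    {
            store_writeback_rate("4096");
            printf("rate=%ld\n", atomic_load(&writeback_rate));
            return 0;
    }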
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index fc479b026d6d..b15256bcf0e7 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
200{ 200{
201 uint64_t now = local_clock(); 201 uint64_t now = local_clock();
202 202
203 d->next += div_u64(done * NSEC_PER_SEC, d->rate); 203 d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
204 204
205 /* Bound the time. Don't let us fall further than 2 seconds behind 205 /* Bound the time. Don't let us fall further than 2 seconds behind
206 * (this prevents unnecessary backlog that would make it impossible 206 * (this prevents unnecessary backlog that would make it impossible
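bch_next_delay() above is the pacing core: each unit of completed work advances d->next by done * NSEC_PER_SEC / rate, and the caller sleeps until that deadline, which yields an average throughput of rate units per second. A rough userspace sketch of the arithmetic (simplified; the kernel version also bounds how far d->next may fall behind or run ahead of the clock):

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    struct ratelimit {
            uint64_t next;  /* deadline for the next unit of work, ns */
            uint64_t rate;  /* units per second */
    };

    /* Returns how long to wait (ns) before issuing more work. */
    static uint64_t next_delay(struct ratelimit *d, uint64_t done,
                               uint64_t now)
    {
            d->next += done * NSEC_PER_SEC / d->rate;
            return d->next > now ? d->next - now : 0;
    }

    int main(void)
    {
            struct ratelimit d = { .next = 0, .rate = 1024 };

            /* 512 units at 1024 units/s paces us ~0.5s ahead. */
            printf("delay=%llu ns\n",
                   (unsigned long long)next_delay(&d, 512, 0));
            return 0;
    }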
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cced87f8eb27..f7b0133c9d2f 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,7 +442,7 @@ struct bch_ratelimit {
442 * Rate at which we want to do work, in units per second 442 * Rate at which we want to do work, in units per second
443 * The units here correspond to the units passed to bch_next_delay() 443 * The units here correspond to the units passed to bch_next_delay()
444 */ 444 */
445 uint32_t rate; 445 atomic_long_t rate;
446}; 446};
447 447
448static inline void bch_ratelimit_reset(struct bch_ratelimit *d) 448static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ad45ebe1a74b..481d4cf38ac0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
27 * flash-only devices 27 * flash-only devices
28 */ 28 */
29 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - 29 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
30 bcache_flash_devs_sectors_dirty(c); 30 atomic_long_read(&c->flash_dev_dirty_sectors);
31 31
32 /* 32 /*
33 * Unfortunately there is no control of global dirty data. If the 33 * Unfortunately there is no control of global dirty data. If the
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
104 104
105 dc->writeback_rate_proportional = proportional_scaled; 105 dc->writeback_rate_proportional = proportional_scaled;
106 dc->writeback_rate_integral_scaled = integral_scaled; 106 dc->writeback_rate_integral_scaled = integral_scaled;
107 dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; 107 dc->writeback_rate_change = new_rate -
108 dc->writeback_rate.rate = new_rate; 108 atomic_long_read(&dc->writeback_rate.rate);
109 atomic_long_set(&dc->writeback_rate.rate, new_rate);
109 dc->writeback_rate_target = target; 110 dc->writeback_rate_target = target;
110} 111}
111 112
113static bool set_at_max_writeback_rate(struct cache_set *c,
114 struct cached_dev *dc)
115{
116 /*
 117	 * Idle_counter is increased every time update_writeback_rate() is
 118	 * called. If all backing devices attached to the same cache set have
 119	 * identical dc->writeback_rate_update_seconds values, it takes about 6
 120	 * rounds of update_writeback_rate() on each backing device before
 121	 * c->at_max_writeback_rate is set to 1, and then the max writeback
 122	 * rate is set on each dc->writeback_rate.rate.
 123	 * To avoid extra locking cost for counting the exact number of dirty
 124	 * cached devices, c->attached_dev_nr is used to calculate the idle
 125	 * threshold. It might be bigger if not all cached devices are in
 126	 * writeback mode, but it still works well with limited extra rounds
 127	 * of update_writeback_rate().
128 */
129 if (atomic_inc_return(&c->idle_counter) <
130 atomic_read(&c->attached_dev_nr) * 6)
131 return false;
132
133 if (atomic_read(&c->at_max_writeback_rate) != 1)
134 atomic_set(&c->at_max_writeback_rate, 1);
135
136 atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
137
138 /* keep writeback_rate_target as existing value */
139 dc->writeback_rate_proportional = 0;
140 dc->writeback_rate_integral_scaled = 0;
141 dc->writeback_rate_change = 0;
142
143 /*
 144	 * Check c->idle_counter and c->at_max_writeback_rate again in case
 145	 * new I/O arrives before set_at_max_writeback_rate() returns.
146 * Then the writeback rate is set to 1, and its new value should be
147 * decided via __update_writeback_rate().
148 */
149 if ((atomic_read(&c->idle_counter) <
150 atomic_read(&c->attached_dev_nr) * 6) ||
151 !atomic_read(&c->at_max_writeback_rate))
152 return false;
153
154 return true;
155}
156
112static void update_writeback_rate(struct work_struct *work) 157static void update_writeback_rate(struct work_struct *work)
113{ 158{
114 struct cached_dev *dc = container_of(to_delayed_work(work), 159 struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
136 return; 181 return;
137 } 182 }
138 183
139 down_read(&dc->writeback_lock); 184 if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
140 185 /*
141 if (atomic_read(&dc->has_dirty) && 186 * If the whole cache set is idle, set_at_max_writeback_rate()
 142 		dc->writeback_percent) 187	 * will set the writeback rate to a max number. Then it is
 143 		__update_writeback_rate(dc); 188	 * unnecessary to update the writeback rate for an idle
 189	 * cache set that is already at the maximum rate.
190 */
191 if (!set_at_max_writeback_rate(c, dc)) {
192 down_read(&dc->writeback_lock);
193 __update_writeback_rate(dc);
194 up_read(&dc->writeback_lock);
195 }
196 }
144 197
145 up_read(&dc->writeback_lock);
146 198
147 /* 199 /*
148 * CACHE_SET_IO_DISABLE might be set via sysfs interface, 200 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
422 474
423 delay = writeback_delay(dc, size); 475 delay = writeback_delay(dc, size);
424 476
425 /* If the control system would wait for at least half a
426 * second, and there's been no reqs hitting the backing disk
427 * for awhile: use an alternate mode where we have at most
428 * one contiguous set of writebacks in flight at a time. If
429 * someone wants to do IO it will be quick, as it will only
430 * have to contend with one operation in flight, and we'll
431 * be round-tripping data to the backing disk as quickly as
432 * it can accept it.
433 */
434 if (delay >= HZ / 2) {
435 /* 3 means at least 1.5 seconds, up to 7.5 if we
436 * have slowed way down.
437 */
438 if (atomic_inc_return(&dc->backing_idle) >= 3) {
439 /* Wait for current I/Os to finish */
440 closure_sync(&cl);
441 /* And immediately launch a new set. */
442 delay = 0;
443 }
444 }
445
446 while (!kthread_should_stop() && 477 while (!kthread_should_stop() &&
447 !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) && 478 !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
448 delay) { 479 delay) {
@@ -476,6 +507,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
476 if (!d) 507 if (!d)
477 return; 508 return;
478 509
510 if (UUID_FLASH_ONLY(&c->uuids[inode]))
511 atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
512
479 stripe = offset_to_stripe(d, offset); 513 stripe = offset_to_stripe(d, offset);
480 stripe_offset = offset & (d->stripe_size - 1); 514 stripe_offset = offset & (d->stripe_size - 1);
481 515
@@ -673,10 +707,14 @@ static int bch_writeback_thread(void *arg)
673} 707}
674 708
675/* Init */ 709/* Init */
710#define INIT_KEYS_EACH_TIME 500000
711#define INIT_KEYS_SLEEP_MS 100
676 712
677struct sectors_dirty_init { 713struct sectors_dirty_init {
678 struct btree_op op; 714 struct btree_op op;
679 unsigned inode; 715 unsigned inode;
716 size_t count;
717 struct bkey start;
680}; 718};
681 719
682static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, 720static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -691,18 +729,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
691 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 729 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
692 KEY_START(k), KEY_SIZE(k)); 730 KEY_START(k), KEY_SIZE(k));
693 731
732 op->count++;
733 if (atomic_read(&b->c->search_inflight) &&
734 !(op->count % INIT_KEYS_EACH_TIME)) {
735 bkey_copy_key(&op->start, k);
736 return -EAGAIN;
737 }
738
694 return MAP_CONTINUE; 739 return MAP_CONTINUE;
695} 740}
696 741
697void bch_sectors_dirty_init(struct bcache_device *d) 742void bch_sectors_dirty_init(struct bcache_device *d)
698{ 743{
699 struct sectors_dirty_init op; 744 struct sectors_dirty_init op;
745 int ret;
700 746
701 bch_btree_op_init(&op.op, -1); 747 bch_btree_op_init(&op.op, -1);
702 op.inode = d->id; 748 op.inode = d->id;
703 749 op.count = 0;
704 bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), 750 op.start = KEY(op.inode, 0, 0);
705 sectors_dirty_init_fn, 0); 751
752 do {
753 ret = bch_btree_map_keys(&op.op, d->c, &op.start,
754 sectors_dirty_init_fn, 0);
755 if (ret == -EAGAIN)
756 schedule_timeout_interruptible(
757 msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
758 else if (ret < 0) {
759 pr_warn("sectors dirty init failed, ret=%d!", ret);
760 break;
761 }
762 } while (ret == -EAGAIN);
706} 763}
707 764
708void bch_cached_dev_writeback_init(struct cached_dev *dc) 765void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -715,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
715 dc->writeback_running = true; 772 dc->writeback_running = true;
716 dc->writeback_percent = 10; 773 dc->writeback_percent = 10;
717 dc->writeback_delay = 30; 774 dc->writeback_delay = 30;
718 dc->writeback_rate.rate = 1024; 775 atomic_long_set(&dc->writeback_rate.rate, 1024);
719 dc->writeback_rate_minimum = 8; 776 dc->writeback_rate_minimum = 8;
720 777
721 dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; 778 dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
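set_at_max_writeback_rate() above avoids taking a lock by using an optimistic check, act, re-check sequence: bump the idle counter, raise the rate to INT_MAX, then verify that no new I/O reset the counter in the meantime. A compressed userspace sketch of that pattern with C11 atomics (the threshold and names are illustrative):

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int idle_counter;  /* reset to 0 by the I/O path */
    static atomic_long rate;

    static bool try_set_max_rate(int idle_threshold)
    {
            if (atomic_fetch_add(&idle_counter, 1) + 1 < idle_threshold)
                    return false;   /* not idle long enough yet */

            atomic_store(&rate, INT_MAX);   /* optimistically go full speed */

            /*
             * Re-check: if new I/O reset the counter while we stored,
             * back out and let the PI controller pick the rate instead.
             */
            if (atomic_load(&idle_counter) < idle_threshold)
                    return false;

            return true;
    }

    int main(void)
    {
            for (int i = 0; i < 7; i++)
                    printf("round %d: at max = %d\n", i,
                           try_set_max_rate(6));
            return 0;
    }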
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 610fb01de629..3745d7004c47 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
28 return ret; 28 return ret;
29} 29}
30 30
31static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
32{
33 uint64_t i, ret = 0;
34
35 mutex_lock(&bch_register_lock);
36
37 for (i = 0; i < c->devices_max_used; i++) {
38 struct bcache_device *d = c->devices[i];
39
40 if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
41 continue;
42 ret += bcache_dev_sectors_dirty(d);
43 }
44
45 mutex_unlock(&bch_register_lock);
46
47 return ret;
48}
49
50static inline unsigned offset_to_stripe(struct bcache_device *d, 31static inline unsigned offset_to_stripe(struct bcache_device *d,
51 uint64_t offset) 32 uint64_t offset)
52{ 33{
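The removal of bcache_flash_devs_sectors_dirty() trades an O(devices) walk under bch_register_lock on every rate update for a single counter, c->flash_dev_dirty_sectors, maintained at the moment dirty sectors are accounted in bcache_dev_sectors_dirty_add(). A tiny sketch of that incremental-counter approach (userspace C11 atomics standing in for atomic_long_t):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long flash_dev_dirty_sectors;

    /* Called wherever dirty sectors are accounted, as in the patch. */
    static void dirty_add(int flash_only, long nr_sectors)
    {
            if (flash_only)
                    atomic_fetch_add(&flash_dev_dirty_sectors, nr_sectors);
    }

    int main(void)
    {
            dirty_add(1, 8);
            dirty_add(0, 8);        /* backing device, not counted here */
            printf("%ld\n", atomic_load(&flash_dev_dirty_sectors));
            return 0;
    }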
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b0dd7027848b..20f7e4ef5342 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io)
609 609
610 io->start_time = jiffies; 610 io->start_time = jiffies;
611 611
612 generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); 612 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
613 &dm_disk(md)->part0);
613 614
614 atomic_set(&dm_disk(md)->part0.in_flight[rw], 615 atomic_set(&dm_disk(md)->part0.in_flight[rw],
615 atomic_inc_return(&md->pending[rw])); 616 atomic_inc_return(&md->pending[rw]));
@@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io)
628 int pending; 629 int pending;
629 int rw = bio_data_dir(bio); 630 int rw = bio_data_dir(bio);
630 631
631 generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); 632 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
633 io->start_time);
632 634
633 if (unlikely(dm_stats_used(&md->stats))) 635 if (unlikely(dm_stats_used(&md->stats)))
634 dm_stats_account_io(&md->stats, bio_data_dir(bio), 636 dm_stats_account_io(&md->stats, bio_data_dir(bio),
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 994aed2f9dff..cb4eb5faa519 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -204,10 +204,6 @@ static int start_readonly;
204 */ 204 */
205static bool create_on_open = true; 205static bool create_on_open = true;
206 206
207/* bio_clone_mddev
208 * like bio_clone_bioset, but with a local bio set
209 */
210
211struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 207struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
212 struct mddev *mddev) 208 struct mddev *mddev)
213{ 209{
@@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request);
335static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) 331static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
336{ 332{
337 const int rw = bio_data_dir(bio); 333 const int rw = bio_data_dir(bio);
334 const int sgrp = op_stat_group(bio_op(bio));
338 struct mddev *mddev = q->queuedata; 335 struct mddev *mddev = q->queuedata;
339 unsigned int sectors; 336 unsigned int sectors;
340 int cpu; 337 int cpu;
@@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
363 md_handle_request(mddev, bio); 360 md_handle_request(mddev, bio);
364 361
365 cpu = part_stat_lock(); 362 cpu = part_stat_lock();
366 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 363 part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
367 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); 364 part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
368 part_stat_unlock(); 365 part_stat_unlock();
369 366
370 return BLK_QC_T_NONE; 367 return BLK_QC_T_NONE;
@@ -8046,8 +8043,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
8046 rcu_read_lock(); 8043 rcu_read_lock();
8047 rdev_for_each_rcu(rdev, mddev) { 8044 rdev_for_each_rcu(rdev, mddev) {
8048 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 8045 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
8049 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 8046 curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
8050 (int)part_stat_read(&disk->part0, sectors[1]) -
8051 atomic_read(&disk->sync_io); 8047 atomic_read(&disk->sync_io);
8052 /* sync IO will cause sync_io to increase before the disk_stats 8048 /* sync IO will cause sync_io to increase before the disk_stats
8053 * as sync_io is counted when a request starts, and 8049 * as sync_io is counted when a request starts, and
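is_mddev_idle() above moves from summing the two hard-coded read/write sector counters to part_stat_read_accum(), which folds in every stat group now that discards are accounted separately from reads and writes. A hedged sketch of what such an accumulator does (the real macro lives in include/linux/genhd.h; the group names below follow the new op_stat_group() split):

    #include <stdio.h>

    enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, NR_STAT_GROUPS };

    struct disk_stats {
            unsigned long sectors[NR_STAT_GROUPS];
    };

    /* Sum one counter over all stat groups, like part_stat_read_accum(). */
    static unsigned long stat_read_accum(const struct disk_stats *s)
    {
            unsigned long total = 0;

            for (int g = 0; g < NR_STAT_GROUPS; g++)
                    total += s->sectors[g];
            return total;
    }

    int main(void)
    {
            struct disk_stats s = { .sectors = { 100, 200, 50 } };

            printf("total sectors: %lu\n", stat_read_accum(&s));
            return 0;
    }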
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 85de8053aa34..0360c015f658 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1423,11 +1423,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
1423 1423
1424static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, 1424static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
1425 struct page *page, unsigned int len, unsigned int off, 1425 struct page *page, unsigned int len, unsigned int off,
1426 bool is_write, sector_t sector) 1426 unsigned int op, sector_t sector)
1427{ 1427{
1428 int ret; 1428 int ret;
1429 1429
1430 if (!is_write) { 1430 if (!op_is_write(op)) {
1431 ret = btt_read_pg(btt, bip, page, off, sector, len); 1431 ret = btt_read_pg(btt, bip, page, off, sector, len);
1432 flush_dcache_page(page); 1432 flush_dcache_page(page);
1433 } else { 1433 } else {
@@ -1464,7 +1464,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
1464 } 1464 }
1465 1465
1466 err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, 1466 err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
1467 op_is_write(bio_op(bio)), iter.bi_sector); 1467 bio_op(bio), iter.bi_sector);
1468 if (err) { 1468 if (err) {
1469 dev_err(&btt->nd_btt->dev, 1469 dev_err(&btt->nd_btt->dev,
1470 "io error in %s sector %lld, len %d,\n", 1470 "io error in %s sector %lld, len %d,\n",
@@ -1483,16 +1483,16 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
1483} 1483}
1484 1484
1485static int btt_rw_page(struct block_device *bdev, sector_t sector, 1485static int btt_rw_page(struct block_device *bdev, sector_t sector,
1486 struct page *page, bool is_write) 1486 struct page *page, unsigned int op)
1487{ 1487{
1488 struct btt *btt = bdev->bd_disk->private_data; 1488 struct btt *btt = bdev->bd_disk->private_data;
1489 int rc; 1489 int rc;
1490 unsigned int len; 1490 unsigned int len;
1491 1491
1492 len = hpage_nr_pages(page) * PAGE_SIZE; 1492 len = hpage_nr_pages(page) * PAGE_SIZE;
1493 rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector); 1493 rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
1494 if (rc == 0) 1494 if (rc == 0)
1495 page_endio(page, is_write, 0); 1495 page_endio(page, op_is_write(op), 0);
1496 1496
1497 return rc; 1497 return rc;
1498} 1498}
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 32e0364b48b9..6ee7fd7e4bbd 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -396,16 +396,15 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
396 return false; 396 return false;
397 397
398 *start = jiffies; 398 *start = jiffies;
399 generic_start_io_acct(disk->queue, bio_data_dir(bio), 399 generic_start_io_acct(disk->queue, bio_op(bio), bio_sectors(bio),
400 bio_sectors(bio), &disk->part0); 400 &disk->part0);
401 return true; 401 return true;
402} 402}
403static inline void nd_iostat_end(struct bio *bio, unsigned long start) 403static inline void nd_iostat_end(struct bio *bio, unsigned long start)
404{ 404{
405 struct gendisk *disk = bio->bi_disk; 405 struct gendisk *disk = bio->bi_disk;
406 406
407 generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, 407 generic_end_io_acct(disk->queue, bio_op(bio), &disk->part0, start);
408 start);
409} 408}
410static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, 409static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
411 unsigned int len) 410 unsigned int len)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8b1fd7f1a224..dd17acd8fe68 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -120,7 +120,7 @@ static blk_status_t read_pmem(struct page *page, unsigned int off,
120} 120}
121 121
122static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, 122static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
123 unsigned int len, unsigned int off, bool is_write, 123 unsigned int len, unsigned int off, unsigned int op,
124 sector_t sector) 124 sector_t sector)
125{ 125{
126 blk_status_t rc = BLK_STS_OK; 126 blk_status_t rc = BLK_STS_OK;
@@ -131,7 +131,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
131 if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) 131 if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
132 bad_pmem = true; 132 bad_pmem = true;
133 133
134 if (!is_write) { 134 if (!op_is_write(op)) {
135 if (unlikely(bad_pmem)) 135 if (unlikely(bad_pmem))
136 rc = BLK_STS_IOERR; 136 rc = BLK_STS_IOERR;
137 else { 137 else {
@@ -180,8 +180,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
180 do_acct = nd_iostat_start(bio, &start); 180 do_acct = nd_iostat_start(bio, &start);
181 bio_for_each_segment(bvec, bio, iter) { 181 bio_for_each_segment(bvec, bio, iter) {
182 rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, 182 rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
183 bvec.bv_offset, op_is_write(bio_op(bio)), 183 bvec.bv_offset, bio_op(bio), iter.bi_sector);
184 iter.bi_sector);
185 if (rc) { 184 if (rc) {
186 bio->bi_status = rc; 185 bio->bi_status = rc;
187 break; 186 break;
@@ -198,13 +197,13 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
198} 197}
199 198
200static int pmem_rw_page(struct block_device *bdev, sector_t sector, 199static int pmem_rw_page(struct block_device *bdev, sector_t sector,
201 struct page *page, bool is_write) 200 struct page *page, unsigned int op)
202{ 201{
203 struct pmem_device *pmem = bdev->bd_queue->queuedata; 202 struct pmem_device *pmem = bdev->bd_queue->queuedata;
204 blk_status_t rc; 203 blk_status_t rc;
205 204
206 rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, 205 rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
207 0, is_write, sector); 206 0, op, sector);
208 207
209 /* 208 /*
210 * The ->rw_page interface is subtle and tricky. The core 209 * The ->rw_page interface is subtle and tricky. The core
@@ -213,7 +212,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
213 * caused by double completion. 212 * caused by double completion.
214 */ 213 */
215 if (rc == 0) 214 if (rc == 0)
216 page_endio(page, is_write, 0); 215 page_endio(page, op_is_write(op), 0);
217 216
218 return blk_status_to_errno(rc); 217 return blk_status_to_errno(rc);
219} 218}
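Both btt and pmem now pass the full request op down to their bvec helpers instead of a pre-collapsed bool, so op_is_write() decides the data direction and the op value remains available for op-specific handling later. A minimal sketch of that dispatch shape (the kernel's op_is_write() really is the low-bit test; the op constants here are abbreviated):

    #include <stdbool.h>
    #include <stdio.h>

    enum req_op { REQ_OP_READ = 0, REQ_OP_WRITE = 1 };

    /* Mirrors the kernel's op_is_write(): odd op numbers are writes. */
    static bool op_is_write(unsigned int op)
    {
            return op & 1;
    }

    static void do_bvec(unsigned int op)
    {
            if (!op_is_write(op))
                    printf("read path\n");
            else
                    printf("write path\n");
    }

    int main(void)
    {
            do_bvec(REQ_OP_READ);
            do_bvec(REQ_OP_WRITE);
            return 0;
    }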
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bf65501e6ed6..dd8ec1dd9219 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req)
252 trace_nvme_complete_rq(req); 252 trace_nvme_complete_rq(req);
253 253
254 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { 254 if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
255 if (nvme_req_needs_failover(req, status)) { 255 if ((req->cmd_flags & REQ_NVME_MPATH) &&
256 blk_path_error(status)) {
256 nvme_failover_req(req); 257 nvme_failover_req(req);
257 return; 258 return;
258 } 259 }
@@ -617,6 +618,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
617 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) 618 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
618 return BLK_STS_NOTSUPP; 619 return BLK_STS_NOTSUPP;
619 control |= NVME_RW_PRINFO_PRACT; 620 control |= NVME_RW_PRINFO_PRACT;
621 } else if (req_op(req) == REQ_OP_WRITE) {
622 t10_pi_prepare(req, ns->pi_type);
620 } 623 }
621 624
622 switch (ns->pi_type) { 625 switch (ns->pi_type) {
@@ -627,8 +630,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
627 case NVME_NS_DPS_PI_TYPE2: 630 case NVME_NS_DPS_PI_TYPE2:
628 control |= NVME_RW_PRINFO_PRCHK_GUARD | 631 control |= NVME_RW_PRINFO_PRCHK_GUARD |
629 NVME_RW_PRINFO_PRCHK_REF; 632 NVME_RW_PRINFO_PRCHK_REF;
630 cmnd->rw.reftag = cpu_to_le32( 633 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
631 nvme_block_nr(ns, blk_rq_pos(req)));
632 break; 634 break;
633 } 635 }
634 } 636 }
@@ -638,6 +640,22 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
638 return 0; 640 return 0;
639} 641}
640 642
643void nvme_cleanup_cmd(struct request *req)
644{
645 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
646 nvme_req(req)->status == 0) {
647 struct nvme_ns *ns = req->rq_disk->private_data;
648
649 t10_pi_complete(req, ns->pi_type,
650 blk_rq_bytes(req) >> ns->lba_shift);
651 }
652 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
653 kfree(page_address(req->special_vec.bv_page) +
654 req->special_vec.bv_offset);
655 }
656}
657EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
658
641blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 659blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
642 struct nvme_command *cmd) 660 struct nvme_command *cmd)
643{ 661{
@@ -668,10 +686,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
668 } 686 }
669 687
670 cmd->common.command_id = req->tag; 688 cmd->common.command_id = req->tag;
671 if (ns) 689 trace_nvme_setup_cmd(req, cmd);
672 trace_nvme_setup_nvm_cmd(req->q->id, cmd);
673 else
674 trace_nvme_setup_admin_cmd(cmd);
675 return ret; 690 return ret;
676} 691}
677EXPORT_SYMBOL_GPL(nvme_setup_cmd); 692EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -864,9 +879,6 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
864 if (unlikely(ctrl->kato == 0)) 879 if (unlikely(ctrl->kato == 0))
865 return; 880 return;
866 881
867 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
868 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
869 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
870 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); 882 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
871} 883}
872 884
@@ -1056,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1056EXPORT_SYMBOL_GPL(nvme_set_queue_count); 1068EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1057 1069
1058#define NVME_AEN_SUPPORTED \ 1070#define NVME_AEN_SUPPORTED \
1059 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT) 1071 (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
1060 1072
1061static void nvme_enable_aen(struct nvme_ctrl *ctrl) 1073static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1062{ 1074{
@@ -1472,6 +1484,12 @@ static void nvme_update_disk_info(struct gendisk *disk,
1472 1484
1473 set_capacity(disk, capacity); 1485 set_capacity(disk, capacity);
1474 nvme_config_discard(ns); 1486 nvme_config_discard(ns);
1487
1488 if (id->nsattr & (1 << 0))
1489 set_disk_ro(disk, true);
1490 else
1491 set_disk_ro(disk, false);
1492
1475 blk_mq_unfreeze_queue(disk->queue); 1493 blk_mq_unfreeze_queue(disk->queue);
1476} 1494}
1477 1495
@@ -2270,21 +2288,16 @@ out_unlock:
2270 return ret; 2288 return ret;
2271} 2289}
2272 2290
2273int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 2291int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
2274 u8 log_page, void *log, 2292 void *log, size_t size, u64 offset)
2275 size_t size, u64 offset)
2276{ 2293{
2277 struct nvme_command c = { }; 2294 struct nvme_command c = { };
2278 unsigned long dwlen = size / 4 - 1; 2295 unsigned long dwlen = size / 4 - 1;
2279 2296
2280 c.get_log_page.opcode = nvme_admin_get_log_page; 2297 c.get_log_page.opcode = nvme_admin_get_log_page;
2281 2298 c.get_log_page.nsid = cpu_to_le32(nsid);
2282 if (ns)
2283 c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id);
2284 else
2285 c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL);
2286
2287 c.get_log_page.lid = log_page; 2299 c.get_log_page.lid = log_page;
2300 c.get_log_page.lsp = lsp;
2288 c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1)); 2301 c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
2289 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); 2302 c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
2290 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); 2303 c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
@@ -2293,12 +2306,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
2293 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); 2306 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2294} 2307}
2295 2308
2296static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
2297 size_t size)
2298{
2299 return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0);
2300}
2301
2302static int nvme_get_effects_log(struct nvme_ctrl *ctrl) 2309static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
2303{ 2310{
2304 int ret; 2311 int ret;
@@ -2309,8 +2316,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
2309 if (!ctrl->effects) 2316 if (!ctrl->effects)
2310 return 0; 2317 return 0;
2311 2318
2312 ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, 2319 ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
2313 sizeof(*ctrl->effects)); 2320 ctrl->effects, sizeof(*ctrl->effects), 0);
2314 if (ret) { 2321 if (ret) {
2315 kfree(ctrl->effects); 2322 kfree(ctrl->effects);
2316 ctrl->effects = NULL; 2323 ctrl->effects = NULL;
@@ -2401,6 +2408,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2401 nvme_set_queue_limits(ctrl, ctrl->admin_q); 2408 nvme_set_queue_limits(ctrl, ctrl->admin_q);
2402 ctrl->sgls = le32_to_cpu(id->sgls); 2409 ctrl->sgls = le32_to_cpu(id->sgls);
2403 ctrl->kas = le16_to_cpu(id->kas); 2410 ctrl->kas = le16_to_cpu(id->kas);
2411 ctrl->max_namespaces = le32_to_cpu(id->mnan);
2404 2412
2405 if (id->rtd3e) { 2413 if (id->rtd3e) {
2406 /* us -> s */ 2414 /* us -> s */
@@ -2460,8 +2468,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2460 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); 2468 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
2461 } 2469 }
2462 2470
2471 ret = nvme_mpath_init(ctrl, id);
2463 kfree(id); 2472 kfree(id);
2464 2473
2474 if (ret < 0)
2475 return ret;
2476
2465 if (ctrl->apst_enabled && !prev_apst_enabled) 2477 if (ctrl->apst_enabled && !prev_apst_enabled)
2466 dev_pm_qos_expose_latency_tolerance(ctrl->device); 2478 dev_pm_qos_expose_latency_tolerance(ctrl->device);
2467 else if (!ctrl->apst_enabled && prev_apst_enabled) 2479 else if (!ctrl->apst_enabled && prev_apst_enabled)
@@ -2680,6 +2692,10 @@ static struct attribute *nvme_ns_id_attrs[] = {
2680 &dev_attr_nguid.attr, 2692 &dev_attr_nguid.attr,
2681 &dev_attr_eui.attr, 2693 &dev_attr_eui.attr,
2682 &dev_attr_nsid.attr, 2694 &dev_attr_nsid.attr,
2695#ifdef CONFIG_NVME_MULTIPATH
2696 &dev_attr_ana_grpid.attr,
2697 &dev_attr_ana_state.attr,
2698#endif
2683 NULL, 2699 NULL,
2684}; 2700};
2685 2701
@@ -2702,6 +2718,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
2702 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) 2718 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2703 return 0; 2719 return 0;
2704 } 2720 }
2721#ifdef CONFIG_NVME_MULTIPATH
2722 if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
2723 if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
2724 return 0;
2725 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
2726 return 0;
2727 }
2728#endif
2705 return a->mode; 2729 return a->mode;
2706} 2730}
2707 2731
@@ -3075,8 +3099,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3075 3099
3076 nvme_get_ctrl(ctrl); 3100 nvme_get_ctrl(ctrl);
3077 3101
3078 kfree(id);
3079
3080 device_add_disk(ctrl->device, ns->disk); 3102 device_add_disk(ctrl->device, ns->disk);
3081 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, 3103 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
3082 &nvme_ns_id_attr_group)) 3104 &nvme_ns_id_attr_group))
@@ -3086,8 +3108,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3086 pr_warn("%s: failed to register lightnvm sysfs group for identification\n", 3108 pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
3087 ns->disk->disk_name); 3109 ns->disk->disk_name);
3088 3110
3089 nvme_mpath_add_disk(ns->head); 3111 nvme_mpath_add_disk(ns, id);
3090 nvme_fault_inject_init(ns); 3112 nvme_fault_inject_init(ns);
3113 kfree(id);
3114
3091 return; 3115 return;
3092 out_unlink_ns: 3116 out_unlink_ns:
3093 mutex_lock(&ctrl->subsys->lock); 3117 mutex_lock(&ctrl->subsys->lock);
@@ -3229,7 +3253,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
3229 * raced with us in reading the log page, which could cause us to miss 3253 * raced with us in reading the log page, which could cause us to miss
3230 * updates. 3254 * updates.
3231 */ 3255 */
3232 error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); 3256 error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
3257 log_size, 0);
3233 if (error) 3258 if (error)
3234 dev_warn(ctrl->device, 3259 dev_warn(ctrl->device,
3235 "reading changed ns log failed: %d\n", error); 3260 "reading changed ns log failed: %d\n", error);
@@ -3346,9 +3371,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
3346 if (!log) 3371 if (!log)
3347 return; 3372 return;
3348 3373
3349 	if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) 3374	 if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
3350 dev_warn(ctrl->device, 3375 sizeof(*log), 0))
3351 "Get FW SLOT INFO log error\n"); 3376 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
3352 kfree(log); 3377 kfree(log);
3353} 3378}
3354 3379
@@ -3394,6 +3419,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
3394 case NVME_AER_NOTICE_FW_ACT_STARTING: 3419 case NVME_AER_NOTICE_FW_ACT_STARTING:
3395 queue_work(nvme_wq, &ctrl->fw_act_work); 3420 queue_work(nvme_wq, &ctrl->fw_act_work);
3396 break; 3421 break;
3422#ifdef CONFIG_NVME_MULTIPATH
3423 case NVME_AER_NOTICE_ANA:
3424 if (!ctrl->ana_log_buf)
3425 break;
3426 queue_work(nvme_wq, &ctrl->ana_work);
3427 break;
3428#endif
3397 default: 3429 default:
3398 dev_warn(ctrl->device, "async event result %08x\n", result); 3430 dev_warn(ctrl->device, "async event result %08x\n", result);
3399 } 3431 }
@@ -3426,6 +3458,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
3426 3458
3427void nvme_stop_ctrl(struct nvme_ctrl *ctrl) 3459void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
3428{ 3460{
3461 nvme_mpath_stop(ctrl);
3429 nvme_stop_keep_alive(ctrl); 3462 nvme_stop_keep_alive(ctrl);
3430 flush_work(&ctrl->async_event_work); 3463 flush_work(&ctrl->async_event_work);
3431 flush_work(&ctrl->scan_work); 3464 flush_work(&ctrl->scan_work);
@@ -3463,6 +3496,7 @@ static void nvme_free_ctrl(struct device *dev)
3463 3496
3464 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 3497 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3465 kfree(ctrl->effects); 3498 kfree(ctrl->effects);
3499 nvme_mpath_uninit(ctrl);
3466 3500
3467 if (subsys) { 3501 if (subsys) {
3468 mutex_lock(&subsys->lock); 3502 mutex_lock(&subsys->lock);
@@ -3499,6 +3533,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
3499 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); 3533 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
3500 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); 3534 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
3501 3535
3536 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
3537 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
3538 ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
3539
3502 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); 3540 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
3503 if (ret < 0) 3541 if (ret < 0)
3504 goto out; 3542 goto out;
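With nvme_get_log_ext() renamed to nvme_get_log() and given explicit nsid and lsp parameters, every caller now spells out the Get Log Page addressing itself. The command encodes the transfer length as a zero-based dword count split across NUMDL/NUMDU and the byte offset across LPOL/LPOU, exactly as the helper computes above. A small userspace sketch of that field packing (struct and names are illustrative):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct get_log_fields {
            uint16_t numdl, numdu;  /* zero-based dword count, low/high */
            uint32_t lpol, lpou;    /* byte offset, low/high */
    };

    static struct get_log_fields pack(size_t size, uint64_t offset)
    {
            unsigned long dwlen = size / 4 - 1;     /* 0's based dwords */

            return (struct get_log_fields){
                    .numdl = dwlen & 0xffff,
                    .numdu = dwlen >> 16,
                    .lpol  = (uint32_t)offset,
                    .lpou  = (uint32_t)(offset >> 32),
            };
    }

    int main(void)
    {
            struct get_log_fields f = pack(4096, 0);

            printf("numdl=%u numdu=%u\n", f.numdl, f.numdu);
            return 0;
    }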
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index f7efe5a58cc7..206d63cb1afc 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
474 474
475bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) 475bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
476{ 476{
477 if (ctrl->opts->max_reconnects != -1 && 477 if (ctrl->opts->max_reconnects == -1 ||
478 ctrl->nr_reconnects < ctrl->opts->max_reconnects) 478 ctrl->nr_reconnects < ctrl->opts->max_reconnects)
479 return true; 479 return true;
480 480
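The corrected predicate treats max_reconnects == -1 as "retry forever" and otherwise keeps retrying while the attempt count is below the limit; the previous "!= -1 &&" form made unlimited-retry controllers give up after the first failure. The fixed logic in isolation:

    #include <stdbool.h>
    #include <stdio.h>

    /* -1 means "no limit", matching the fabrics option semantics. */
    static bool should_reconnect(int max_reconnects, int nr_reconnects)
    {
            return max_reconnects == -1 || nr_reconnects < max_reconnects;
    }

    int main(void)
    {
            printf("%d\n", should_reconnect(-1, 100)); /* 1: unlimited */
            printf("%d\n", should_reconnect(3, 3));    /* 0: limit hit */
            return 0;
    }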
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 9bac912173ba..611e70cae754 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1737,6 +1737,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq,
1737 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; 1737 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
1738 struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; 1738 struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
1739 1739
1740 nvme_req(rq)->ctrl = &ctrl->ctrl;
1740 return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); 1741 return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
1741} 1742}
1742 1743
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 41279da799ed..6fe5923c95d4 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -414,12 +414,6 @@ static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id,
414 /* Set compacted version for upper layers */ 414 /* Set compacted version for upper layers */
415 geo->version = NVM_OCSSD_SPEC_20; 415 geo->version = NVM_OCSSD_SPEC_20;
416 416
417 if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) {
418 pr_err("nvm: OCSSD version not supported (v%d.%d)\n",
419 geo->major_ver_id, geo->minor_ver_id);
420 return -EINVAL;
421 }
422
423 geo->num_ch = le16_to_cpu(id->num_grp); 417 geo->num_ch = le16_to_cpu(id->num_grp);
424 geo->num_lun = le16_to_cpu(id->num_pu); 418 geo->num_lun = le16_to_cpu(id->num_pu);
425 geo->all_luns = geo->num_ch * geo->num_lun; 419 geo->all_luns = geo->num_ch * geo->num_lun;
@@ -583,7 +577,13 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
583 struct ppa_addr ppa; 577 struct ppa_addr ppa;
584 size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); 578 size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
585 size_t log_pos, offset, len; 579 size_t log_pos, offset, len;
586 int ret, i; 580 int ret, i, max_len;
581
582 /*
 583	 * Limit requests to a maximum of 256K to avoid issuing arbitrarily large
 584	 * requests when the device does not specify a maximum transfer size.
585 */
586 max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024);
587 587
588 /* Normalize lba address space to obtain log offset */ 588 /* Normalize lba address space to obtain log offset */
589 ppa.ppa = slba; 589 ppa.ppa = slba;
@@ -596,10 +596,11 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
596 offset = log_pos * sizeof(struct nvme_nvm_chk_meta); 596 offset = log_pos * sizeof(struct nvme_nvm_chk_meta);
597 597
598 while (left) { 598 while (left) {
599 len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9); 599 len = min_t(unsigned int, left, max_len);
600 600
601 ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK, 601 ret = nvme_get_log(ctrl, ns->head->ns_id,
602 dev_meta, len, offset); 602 NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
603 offset);
603 if (ret) { 604 if (ret) {
604 dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); 605 dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
605 break; 606 break;
@@ -662,12 +663,10 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q,
662 663
663 rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; 664 rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
664 665
665 if (rqd->bio) { 666 if (rqd->bio)
666 blk_init_request_from_bio(rq, rqd->bio); 667 blk_init_request_from_bio(rq, rqd->bio);
667 } else { 668 else
668 rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); 669 rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
669 rq->__data_len = 0;
670 }
671 670
672 return rq; 671 return rq;
673} 672}
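The REPORT CHUNK loop above clamps each transfer to min(left, max_len), where max_len itself is capped at 256 KiB in case the device advertises an enormous max_hw_sectors. A small sketch of the chunking arithmetic (constants as in the patch; the device size is made up for the example):

    #include <stdio.h>

    #define MAX_LEN_CAP (256 * 1024)        /* 256K cap from the patch */

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            unsigned int max_hw_bytes = 1024 * 1024; /* device: 1 MiB */
            unsigned int max_len = min_u(max_hw_bytes, MAX_LEN_CAP);
            unsigned int left = 600 * 1024; /* total log bytes wanted */

            while (left) {
                    unsigned int len = min_u(left, max_len);

                    printf("fetch %u bytes\n", len); /* 256K, 256K, 88K */
                    left -= len;
            }
            return 0;
    }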
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1ffd3e8b13a1..5a9562881d4e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2017 Christoph Hellwig. 2 * Copyright (c) 2017-2018 Christoph Hellwig.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
20MODULE_PARM_DESC(multipath, 20MODULE_PARM_DESC(multipath,
21 "turn on native support for multiple controllers per subsystem"); 21 "turn on native support for multiple controllers per subsystem");
22 22
23inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
24{
25 return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
26}
27
23/* 28/*
24 * If multipathing is enabled we need to always use the subsystem instance 29 * If multipathing is enabled we need to always use the subsystem instance
25 * number for numbering our devices to avoid conflicts between subsystems that 30 * number for numbering our devices to avoid conflicts between subsystems that
@@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
45void nvme_failover_req(struct request *req) 50void nvme_failover_req(struct request *req)
46{ 51{
47 struct nvme_ns *ns = req->q->queuedata; 52 struct nvme_ns *ns = req->q->queuedata;
53 u16 status = nvme_req(req)->status;
48 unsigned long flags; 54 unsigned long flags;
49 55
50 spin_lock_irqsave(&ns->head->requeue_lock, flags); 56 spin_lock_irqsave(&ns->head->requeue_lock, flags);
@@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req)
52 spin_unlock_irqrestore(&ns->head->requeue_lock, flags); 58 spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
53 blk_mq_end_request(req, 0); 59 blk_mq_end_request(req, 0);
54 60
55 nvme_reset_ctrl(ns->ctrl); 61 switch (status & 0x7ff) {
56 kblockd_schedule_work(&ns->head->requeue_work); 62 case NVME_SC_ANA_TRANSITION:
57} 63 case NVME_SC_ANA_INACCESSIBLE:
64 case NVME_SC_ANA_PERSISTENT_LOSS:
65 /*
66 * If we got back an ANA error we know the controller is alive,
 67	 * but not ready to serve this namespace. The spec suggests
68 * we should update our general state here, but due to the fact
69 * that the admin and I/O queues are not serialized that is
70 * fundamentally racy. So instead just clear the current path,
 71	 * mark the path as pending and kick off a re-read of the ANA
72 * log page ASAP.
73 */
74 nvme_mpath_clear_current_path(ns);
75 if (ns->ctrl->ana_log_buf) {
76 set_bit(NVME_NS_ANA_PENDING, &ns->flags);
77 queue_work(nvme_wq, &ns->ctrl->ana_work);
78 }
79 break;
80 default:
81 /*
82 * Reset the controller for any non-ANA error as we don't know
83 * what caused the error.
84 */
85 nvme_reset_ctrl(ns->ctrl);
86 break;
87 }
58 88
59bool nvme_req_needs_failover(struct request *req, blk_status_t error) 89 kblockd_schedule_work(&ns->head->requeue_work);
60{
61 if (!(req->cmd_flags & REQ_NVME_MPATH))
62 return false;
63 return blk_path_error(error);
64} 90}
65 91
66void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 92void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
75 up_read(&ctrl->namespaces_rwsem); 101 up_read(&ctrl->namespaces_rwsem);
76} 102}
77 103
104static const char *nvme_ana_state_names[] = {
105 [0] = "invalid state",
106 [NVME_ANA_OPTIMIZED] = "optimized",
107 [NVME_ANA_NONOPTIMIZED] = "non-optimized",
108 [NVME_ANA_INACCESSIBLE] = "inaccessible",
109 [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss",
110 [NVME_ANA_CHANGE] = "change",
111};
112
78static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) 113static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
79{ 114{
80 struct nvme_ns *ns; 115 struct nvme_ns *ns, *fallback = NULL;
81 116
82 list_for_each_entry_rcu(ns, &head->list, siblings) { 117 list_for_each_entry_rcu(ns, &head->list, siblings) {
83 if (ns->ctrl->state == NVME_CTRL_LIVE) { 118 if (ns->ctrl->state != NVME_CTRL_LIVE ||
119 test_bit(NVME_NS_ANA_PENDING, &ns->flags))
120 continue;
121 switch (ns->ana_state) {
122 case NVME_ANA_OPTIMIZED:
84 rcu_assign_pointer(head->current_path, ns); 123 rcu_assign_pointer(head->current_path, ns);
85 return ns; 124 return ns;
125 case NVME_ANA_NONOPTIMIZED:
126 fallback = ns;
127 break;
128 default:
129 break;
86 } 130 }
87 } 131 }
88 132
89 return NULL; 133 if (fallback)
134 rcu_assign_pointer(head->current_path, fallback);
135 return fallback;
136}
137
138static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
139{
140 return ns->ctrl->state == NVME_CTRL_LIVE &&
141 ns->ana_state == NVME_ANA_OPTIMIZED;
90} 142}
91 143
92inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) 144inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
93{ 145{
94 struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); 146 struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
95 147
96 if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) 148 if (unlikely(!ns || !nvme_path_is_optimized(ns)))
97 ns = __nvme_find_path(head); 149 ns = __nvme_find_path(head);
98 return ns; 150 return ns;
99} 151}
@@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
142 194
143 srcu_idx = srcu_read_lock(&head->srcu); 195 srcu_idx = srcu_read_lock(&head->srcu);
144 ns = srcu_dereference(head->current_path, &head->srcu); 196 ns = srcu_dereference(head->current_path, &head->srcu);
145 if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) 197 if (likely(ns && nvme_path_is_optimized(ns)))
146 found = ns->queue->poll_fn(q, qc); 198 found = ns->queue->poll_fn(q, qc);
147 srcu_read_unlock(&head->srcu, srcu_idx); 199 srcu_read_unlock(&head->srcu, srcu_idx);
148 return found; 200 return found;
@@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
176 struct request_queue *q; 228 struct request_queue *q;
177 bool vwc = false; 229 bool vwc = false;
178 230
231 mutex_init(&head->lock);
179 bio_list_init(&head->requeue_list); 232 bio_list_init(&head->requeue_list);
180 spin_lock_init(&head->requeue_lock); 233 spin_lock_init(&head->requeue_lock);
181 INIT_WORK(&head->requeue_work, nvme_requeue_work); 234 INIT_WORK(&head->requeue_work, nvme_requeue_work);
@@ -220,29 +273,232 @@ out:
220 return -ENOMEM; 273 return -ENOMEM;
221} 274}
222 275
223void nvme_mpath_add_disk(struct nvme_ns_head *head) 276static void nvme_mpath_set_live(struct nvme_ns *ns)
224{ 277{
278 struct nvme_ns_head *head = ns->head;
279
280 lockdep_assert_held(&ns->head->lock);
281
225 if (!head->disk) 282 if (!head->disk)
226 return; 283 return;
227 284
228 mutex_lock(&head->subsys->lock);
229 if (!(head->disk->flags & GENHD_FL_UP)) { 285 if (!(head->disk->flags & GENHD_FL_UP)) {
230 device_add_disk(&head->subsys->dev, head->disk); 286 device_add_disk(&head->subsys->dev, head->disk);
231 if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, 287 if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
232 &nvme_ns_id_attr_group)) 288 &nvme_ns_id_attr_group))
233 pr_warn("%s: failed to create sysfs group for identification\n", 289 dev_warn(&head->subsys->dev,
234 head->disk->disk_name); 290 "failed to create id group.\n");
291 }
292
293 kblockd_schedule_work(&ns->head->requeue_work);
294}
295
296static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
297 int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
298 void *))
299{
300 void *base = ctrl->ana_log_buf;
301 size_t offset = sizeof(struct nvme_ana_rsp_hdr);
302 int error, i;
303
304 lockdep_assert_held(&ctrl->ana_lock);
305
306 for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
307 struct nvme_ana_group_desc *desc = base + offset;
308 u32 nr_nsids = le32_to_cpu(desc->nnsids);
309 size_t nsid_buf_size = nr_nsids * sizeof(__le32);
310
311 if (WARN_ON_ONCE(desc->grpid == 0))
312 return -EINVAL;
313 if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
314 return -EINVAL;
315 if (WARN_ON_ONCE(desc->state == 0))
316 return -EINVAL;
317 if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
318 return -EINVAL;
319
320 offset += sizeof(*desc);
321 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
322 return -EINVAL;
323
324 error = cb(ctrl, desc, data);
325 if (error)
326 return error;
327
328 offset += nsid_buf_size;
329 if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
330 return -EINVAL;
331 }
332
333 return 0;
334}
335
336static inline bool nvme_state_is_live(enum nvme_ana_state state)
337{
338 return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
339}
340
341static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
342 struct nvme_ns *ns)
343{
344 enum nvme_ana_state old;
345
346 mutex_lock(&ns->head->lock);
347 old = ns->ana_state;
348 ns->ana_grpid = le32_to_cpu(desc->grpid);
349 ns->ana_state = desc->state;
350 clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
351
352 if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
353 nvme_mpath_set_live(ns);
354 mutex_unlock(&ns->head->lock);
355}
356
357static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
358 struct nvme_ana_group_desc *desc, void *data)
359{
360 u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
361 unsigned *nr_change_groups = data;
362 struct nvme_ns *ns;
363
364 dev_info(ctrl->device, "ANA group %d: %s.\n",
365 le32_to_cpu(desc->grpid),
366 nvme_ana_state_names[desc->state]);
367
368 if (desc->state == NVME_ANA_CHANGE)
369 (*nr_change_groups)++;
370
371 if (!nr_nsids)
372 return 0;
373
374 down_write(&ctrl->namespaces_rwsem);
375 list_for_each_entry(ns, &ctrl->namespaces, list) {
376 if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
377 continue;
378 nvme_update_ns_ana_state(desc, ns);
379 if (++n == nr_nsids)
380 break;
381 }
382 up_write(&ctrl->namespaces_rwsem);
383 WARN_ON_ONCE(n < nr_nsids);
384 return 0;
385}
386
387static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
388{
389 u32 nr_change_groups = 0;
390 int error;
391
392 mutex_lock(&ctrl->ana_lock);
393 error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
394 groups_only ? NVME_ANA_LOG_RGO : 0,
395 ctrl->ana_log_buf, ctrl->ana_log_size, 0);
396 if (error) {
397 dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
398 goto out_unlock;
399 }
400
401 error = nvme_parse_ana_log(ctrl, &nr_change_groups,
402 nvme_update_ana_state);
403 if (error)
404 goto out_unlock;
405
406 /*
407 * In theory we should have an ANATT timer per group as they might enter
408 * the change state at different times. But that is a lot of overhead
409 * just to protect against a target that keeps entering new changes
410 * states while never finishing previous ones. But we'll still
411 * eventually time out once all groups are in change state, so this
412 * isn't a big deal.
413 *
414 * We also double the ANATT value to provide some slack for transports
415 * or AEN processing overhead.
416 */
417 if (nr_change_groups)
418 mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
419 else
420 del_timer_sync(&ctrl->anatt_timer);
421out_unlock:
422 mutex_unlock(&ctrl->ana_lock);
423 return error;
424}
425
426static void nvme_ana_work(struct work_struct *work)
427{
428 struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
429
430 nvme_read_ana_log(ctrl, false);
431}
432
433static void nvme_anatt_timeout(struct timer_list *t)
434{
435 struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
436
437 dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
438 nvme_reset_ctrl(ctrl);
439}
440
441void nvme_mpath_stop(struct nvme_ctrl *ctrl)
442{
443 if (!nvme_ctrl_use_ana(ctrl))
444 return;
445 del_timer_sync(&ctrl->anatt_timer);
446 cancel_work_sync(&ctrl->ana_work);
447}
448
449static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
450 char *buf)
451{
452 return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
453}
454DEVICE_ATTR_RO(ana_grpid);
455
456static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
457 char *buf)
458{
459 struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
460
461 return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
462}
463DEVICE_ATTR_RO(ana_state);
464
465static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
466 struct nvme_ana_group_desc *desc, void *data)
467{
468 struct nvme_ns *ns = data;
469
470 if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
471 nvme_update_ns_ana_state(desc, ns);
472 return -ENXIO; /* just break out of the loop */
473 }
474
475 return 0;
476}
477
478void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
479{
480 if (nvme_ctrl_use_ana(ns->ctrl)) {
481 mutex_lock(&ns->ctrl->ana_lock);
482 ns->ana_grpid = le32_to_cpu(id->anagrpid);
483 nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
484 mutex_unlock(&ns->ctrl->ana_lock);
485 } else {
486 mutex_lock(&ns->head->lock);
487 ns->ana_state = NVME_ANA_OPTIMIZED;
488 nvme_mpath_set_live(ns);
489 mutex_unlock(&ns->head->lock);
235 } 490 }
236 mutex_unlock(&head->subsys->lock);
237} 491}
238 492
239void nvme_mpath_remove_disk(struct nvme_ns_head *head) 493void nvme_mpath_remove_disk(struct nvme_ns_head *head)
240{ 494{
241 if (!head->disk) 495 if (!head->disk)
242 return; 496 return;
243 sysfs_remove_group(&disk_to_dev(head->disk)->kobj, 497 if (head->disk->flags & GENHD_FL_UP) {
244 &nvme_ns_id_attr_group); 498 sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
245 del_gendisk(head->disk); 499 &nvme_ns_id_attr_group);
500 del_gendisk(head->disk);
501 }
246 blk_set_queue_dying(head->disk->queue); 502 blk_set_queue_dying(head->disk->queue);
247 /* make sure all pending bios are cleaned up */ 503 /* make sure all pending bios are cleaned up */
248 kblockd_schedule_work(&head->requeue_work); 504 kblockd_schedule_work(&head->requeue_work);
@@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
250 blk_cleanup_queue(head->disk->queue); 506 blk_cleanup_queue(head->disk->queue);
251 put_disk(head->disk); 507 put_disk(head->disk);
252} 508}
509
510int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
511{
512 int error;
513
514 if (!nvme_ctrl_use_ana(ctrl))
515 return 0;
516
517 ctrl->anacap = id->anacap;
518 ctrl->anatt = id->anatt;
519 ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
520 ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
521
522 mutex_init(&ctrl->ana_lock);
523 timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
524 ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
525 ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
526 if (!(ctrl->anacap & (1 << 6)))
527 ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
528
529 if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
530 dev_err(ctrl->device,
531 "ANA log page size (%zd) larger than MDTS (%d).\n",
532 ctrl->ana_log_size,
533 ctrl->max_hw_sectors << SECTOR_SHIFT);
534 dev_err(ctrl->device, "disabling ANA support.\n");
535 return 0;
536 }
537
538 INIT_WORK(&ctrl->ana_work, nvme_ana_work);
539 ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
540 if (!ctrl->ana_log_buf)
541 goto out;
542
543 error = nvme_read_ana_log(ctrl, true);
544 if (error)
545 goto out_free_ana_log_buf;
546 return 0;
547out_free_ana_log_buf:
548 kfree(ctrl->ana_log_buf);
549out:
550 return -ENOMEM;
551}
552
553void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
554{
555 kfree(ctrl->ana_log_buf);
556}
557
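nvme_parse_ana_log() above walks a response header followed by variable-length group descriptors, each trailed by its nsid list, and validates every offset against the log size before calling back into the caller. A simplified userspace walk over the same layout (structs reduced to the fields the loop touches; the real definitions live in include/linux/nvme.h):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct ana_hdr  { uint16_t ngrps; };                    /* simplified */
    struct ana_desc { uint32_t grpid, nnsids; uint8_t state; };

    static int parse_ana_log(const uint8_t *buf, size_t len,
                             int (*cb)(const struct ana_desc *))
    {
            size_t off = sizeof(struct ana_hdr);
            struct ana_hdr hdr;

            memcpy(&hdr, buf, sizeof(hdr));
            for (uint16_t i = 0; i < hdr.ngrps; i++) {
                    struct ana_desc desc;

                    if (off + sizeof(desc) > len)
                            return -1;      /* truncated descriptor */
                    memcpy(&desc, buf + off, sizeof(desc));
                    off += sizeof(desc);

                    if (off + desc.nnsids * sizeof(uint32_t) > len)
                            return -1;      /* truncated nsid list */
                    if (cb(&desc))
                            return -1;
                    off += desc.nnsids * sizeof(uint32_t);  /* skip nsids */
            }
            return 0;
    }

    static int show(const struct ana_desc *d)
    {
            printf("group %u: state %u, %u nsids\n",
                   d->grpid, d->state, d->nnsids);
            return 0;
    }

    int main(void)
    {
            uint8_t buf[64] = {0};
            struct ana_hdr hdr = { .ngrps = 1 };
            struct ana_desc desc = { .grpid = 1, .nnsids = 0, .state = 1 };

            memcpy(buf, &hdr, sizeof(hdr));
            memcpy(buf + sizeof(hdr), &desc, sizeof(desc));
            return parse_ana_log(buf, sizeof(buf), show);
    }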
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0c4a33df3b2f..bb4a2003c097 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -102,6 +102,7 @@ struct nvme_request {
102 u8 retries; 102 u8 retries;
103 u8 flags; 103 u8 flags;
104 u16 status; 104 u16 status;
105 struct nvme_ctrl *ctrl;
105}; 106};
106 107
107/* 108/*
@@ -119,6 +120,13 @@ static inline struct nvme_request *nvme_req(struct request *req)
119 return blk_mq_rq_to_pdu(req); 120 return blk_mq_rq_to_pdu(req);
120} 121}
121 122
123static inline u16 nvme_req_qid(struct request *req)
124{
125 if (!req->rq_disk)
126 return 0;
127 return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1;
128}
129
122/* The below value is the specific amount of delay needed before checking 130/* The below value is the specific amount of delay needed before checking
123 * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the 131 * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
124 * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was 132 * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
@@ -175,6 +183,7 @@ struct nvme_ctrl {
175 u16 oacs; 183 u16 oacs;
176 u16 nssa; 184 u16 nssa;
177 u16 nr_streams; 185 u16 nr_streams;
186 u32 max_namespaces;
178 atomic_t abort_limit; 187 atomic_t abort_limit;
179 u8 vwc; 188 u8 vwc;
180 u32 vs; 189 u32 vs;
@@ -197,6 +206,19 @@ struct nvme_ctrl {
197 struct work_struct fw_act_work; 206 struct work_struct fw_act_work;
198 unsigned long events; 207 unsigned long events;
199 208
209#ifdef CONFIG_NVME_MULTIPATH
210 /* asymmetric namespace access: */
211 u8 anacap;
212 u8 anatt;
213 u32 anagrpmax;
214 u32 nanagrpid;
215 struct mutex ana_lock;
216 struct nvme_ana_rsp_hdr *ana_log_buf;
217 size_t ana_log_size;
218 struct timer_list anatt_timer;
219 struct work_struct ana_work;
220#endif
221
200 /* Power saving configuration */ 222 /* Power saving configuration */
201 u64 ps_max_latency_us; 223 u64 ps_max_latency_us;
202 bool apst_enabled; 224 bool apst_enabled;
@@ -261,6 +283,7 @@ struct nvme_ns_head {
261 struct bio_list requeue_list; 283 struct bio_list requeue_list;
262 spinlock_t requeue_lock; 284 spinlock_t requeue_lock;
263 struct work_struct requeue_work; 285 struct work_struct requeue_work;
286 struct mutex lock;
264#endif 287#endif
265 struct list_head list; 288 struct list_head list;
266 struct srcu_struct srcu; 289 struct srcu_struct srcu;
@@ -287,6 +310,10 @@ struct nvme_ns {
287 struct nvme_ctrl *ctrl; 310 struct nvme_ctrl *ctrl;
288 struct request_queue *queue; 311 struct request_queue *queue;
289 struct gendisk *disk; 312 struct gendisk *disk;
313#ifdef CONFIG_NVME_MULTIPATH
314 enum nvme_ana_state ana_state;
315 u32 ana_grpid;
316#endif
290 struct list_head siblings; 317 struct list_head siblings;
291 struct nvm_dev *ndev; 318 struct nvm_dev *ndev;
292 struct kref kref; 319 struct kref kref;
@@ -299,8 +326,9 @@ struct nvme_ns {
299 bool ext; 326 bool ext;
300 u8 pi_type; 327 u8 pi_type;
301 unsigned long flags; 328 unsigned long flags;
302#define NVME_NS_REMOVING 0 329#define NVME_NS_REMOVING 0
303#define NVME_NS_DEAD 1 330#define NVME_NS_DEAD 1
331#define NVME_NS_ANA_PENDING 2
304 u16 noiob; 332 u16 noiob;
305 333
306#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 334#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -356,14 +384,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
356 return (sector >> (ns->lba_shift - 9)); 384 return (sector >> (ns->lba_shift - 9));
357} 385}
358 386
359static inline void nvme_cleanup_cmd(struct request *req)
360{
361 if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
362 kfree(page_address(req->special_vec.bv_page) +
363 req->special_vec.bv_offset);
364 }
365}
366
367static inline void nvme_end_request(struct request *req, __le16 status, 387static inline void nvme_end_request(struct request *req, __le16 status,
368 union nvme_result result) 388 union nvme_result result)
369{ 389{
@@ -420,6 +440,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
420#define NVME_QID_ANY -1 440#define NVME_QID_ANY -1
421struct request *nvme_alloc_request(struct request_queue *q, 441struct request *nvme_alloc_request(struct request_queue *q,
422 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); 442 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
443void nvme_cleanup_cmd(struct request *req);
423blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, 444blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
424 struct nvme_command *cmd); 445 struct nvme_command *cmd);
425int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 446int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
@@ -435,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
435int nvme_delete_ctrl(struct nvme_ctrl *ctrl); 456int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
436int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); 457int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
437 458
438int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 459int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
439 u8 log_page, void *log, size_t size, u64 offset); 460 void *log, size_t size, u64 offset);
440 461
441extern const struct attribute_group nvme_ns_id_attr_group; 462extern const struct attribute_group nvme_ns_id_attr_group;
442extern const struct block_device_operations nvme_ns_head_ops; 463extern const struct block_device_operations nvme_ns_head_ops;
443 464
444#ifdef CONFIG_NVME_MULTIPATH 465#ifdef CONFIG_NVME_MULTIPATH
466bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
445void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, 467void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
446 struct nvme_ctrl *ctrl, int *flags); 468 struct nvme_ctrl *ctrl, int *flags);
447void nvme_failover_req(struct request *req); 469void nvme_failover_req(struct request *req);
448bool nvme_req_needs_failover(struct request *req, blk_status_t error);
449void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); 470void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
450int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); 471int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
451void nvme_mpath_add_disk(struct nvme_ns_head *head); 472void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
452void nvme_mpath_remove_disk(struct nvme_ns_head *head); 473void nvme_mpath_remove_disk(struct nvme_ns_head *head);
474int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
475void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
476void nvme_mpath_stop(struct nvme_ctrl *ctrl);
453 477
454static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) 478static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
455{ 479{
@@ -468,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
468 kblockd_schedule_work(&head->requeue_work); 492 kblockd_schedule_work(&head->requeue_work);
469} 493}
470 494
495extern struct device_attribute dev_attr_ana_grpid;
496extern struct device_attribute dev_attr_ana_state;
497
471#else 498#else
499static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
500{
501 return false;
502}
472/* 503/*
473 * Without the multipath code enabled, multiple controller per subsystems are 504 * Without the multipath code enabled, multiple controller per subsystems are
474 * visible as devices and thus we cannot use the subsystem instance. 505 * visible as devices and thus we cannot use the subsystem instance.
@@ -482,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
482static inline void nvme_failover_req(struct request *req) 513static inline void nvme_failover_req(struct request *req)
483{ 514{
484} 515}
485static inline bool nvme_req_needs_failover(struct request *req,
486 blk_status_t error)
487{
488 return false;
489}
490static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) 516static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
491{ 517{
492} 518}
@@ -495,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
495{ 521{
496 return 0; 522 return 0;
497} 523}
498static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) 524static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
525 struct nvme_id_ns *id)
499{ 526{
500} 527}
501static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) 528static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -507,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
507static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) 534static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
508{ 535{
509} 536}
537static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
538 struct nvme_id_ctrl *id)
539{
540 return 0;
541}
542static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
543{
544}
545static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
546{
547}
510#endif /* CONFIG_NVME_MULTIPATH */ 548#endif /* CONFIG_NVME_MULTIPATH */
511 549
512#ifdef CONFIG_NVM 550#ifdef CONFIG_NVM
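The #else branch above follows the usual kernel pattern for optional features: every exported function gets a static inline no-op (or constant-returning) stub, so call sites compile unchanged whether CONFIG_NVME_MULTIPATH is set or not and the optimizer drops the calls entirely. A self-contained illustration of the idiom (CONFIG_FOO, struct ctx and foo_setup are made-up names):

#include <stdio.h>

struct ctx { int id; };

#ifdef CONFIG_FOO
static void foo_setup(struct ctx *c)
{
	printf("foo enabled for ctx %d\n", c->id);
}
#else
/* stub: compiles away, so callers need no #ifdef of their own */
static inline void foo_setup(struct ctx *c)
{
	(void)c;
}
#endif

int main(void)
{
	struct ctx c = { .id = 1 };

	foo_setup(&c);   /* identical call site either way */
	return 0;
}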
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ddd441b1516a..1b9951d2067e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -418,6 +418,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
418 418
419 BUG_ON(!nvmeq); 419 BUG_ON(!nvmeq);
420 iod->nvmeq = nvmeq; 420 iod->nvmeq = nvmeq;
421
422 nvme_req(req)->ctrl = &dev->ctrl;
421 return 0; 423 return 0;
422} 424}
423 425
@@ -535,73 +537,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
535 mempool_free(iod->sg, dev->iod_mempool); 537 mempool_free(iod->sg, dev->iod_mempool);
536} 538}
537 539
538#ifdef CONFIG_BLK_DEV_INTEGRITY
539static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
540{
541 if (be32_to_cpu(pi->ref_tag) == v)
542 pi->ref_tag = cpu_to_be32(p);
543}
544
545static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
546{
547 if (be32_to_cpu(pi->ref_tag) == p)
548 pi->ref_tag = cpu_to_be32(v);
549}
550
551/**
552 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
553 *
554 * The virtual start sector is the one that was originally submitted by the
555 * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
556 * start sector may be different. Remap protection information to match the
557 * physical LBA on writes, and back to the original seed on reads.
558 *
559 * Type 0 and 3 do not have a ref tag, so no remapping required.
560 */
561static void nvme_dif_remap(struct request *req,
562 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
563{
564 struct nvme_ns *ns = req->rq_disk->private_data;
565 struct bio_integrity_payload *bip;
566 struct t10_pi_tuple *pi;
567 void *p, *pmap;
568 u32 i, nlb, ts, phys, virt;
569
570 if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
571 return;
572
573 bip = bio_integrity(req->bio);
574 if (!bip)
575 return;
576
577 pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
578
579 p = pmap;
580 virt = bip_get_seed(bip);
581 phys = nvme_block_nr(ns, blk_rq_pos(req));
582 nlb = (blk_rq_bytes(req) >> ns->lba_shift);
583 ts = ns->disk->queue->integrity.tuple_size;
584
585 for (i = 0; i < nlb; i++, virt++, phys++) {
586 pi = (struct t10_pi_tuple *)p;
587 dif_swap(phys, virt, pi);
588 p += ts;
589 }
590 kunmap_atomic(pmap);
591}
592#else /* CONFIG_BLK_DEV_INTEGRITY */
593static void nvme_dif_remap(struct request *req,
594 void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
595{
596}
597static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
598{
599}
600static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
601{
602}
603#endif
604
605static void nvme_print_sgl(struct scatterlist *sgl, int nents) 540static void nvme_print_sgl(struct scatterlist *sgl, int nents)
606{ 541{
607 int i; 542 int i;
@@ -827,9 +762,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
827 if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) 762 if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
828 goto out_unmap; 763 goto out_unmap;
829 764
830 if (req_op(req) == REQ_OP_WRITE)
831 nvme_dif_remap(req, nvme_dif_prep);
832
833 if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) 765 if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
834 goto out_unmap; 766 goto out_unmap;
835 } 767 }
@@ -852,11 +784,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
852 784
853 if (iod->nents) { 785 if (iod->nents) {
854 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); 786 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
855 if (blk_integrity_rq(req)) { 787 if (blk_integrity_rq(req))
856 if (req_op(req) == REQ_OP_READ)
857 nvme_dif_remap(req, nvme_dif_complete);
858 dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); 788 dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
859 }
860 } 789 }
861 790
862 nvme_cleanup_cmd(req); 791 nvme_cleanup_cmd(req);
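The deleted nvme_dif_prep()/nvme_dif_complete() pair remapped T10 DIF reference tags between the seed the submitter used and the physical LBA; as this series consolidates that work, the equivalent transformation now lives in the generic T10 PI code rather than in each driver. For reference, a standalone sketch of the remap itself (tuple layout simplified; values illustrative):

#include <arpa/inet.h>   /* htonl/ntohl: the ref tag is big endian */
#include <stdint.h>
#include <stdio.h>

/* simplified T10 DIF tuple: guard, app tag, 32-bit ref tag */
struct pi_tuple {
	uint16_t guard_tag;
	uint16_t app_tag;
	uint32_t ref_tag;    /* big endian on the wire */
};

/* before a write: rewrite seed-based ref tags to the physical LBA */
static void dif_prep(struct pi_tuple *pi, unsigned int nlb,
		     uint32_t virt, uint32_t phys)
{
	for (unsigned int i = 0; i < nlb; i++, virt++, phys++)
		if (ntohl(pi[i].ref_tag) == virt)
			pi[i].ref_tag = htonl(phys);
}

int main(void)
{
	struct pi_tuple pi[4];
	uint32_t seed = 100, lba = 4196;

	for (unsigned int i = 0; i < 4; i++)
		pi[i].ref_tag = htonl(seed + i);

	dif_prep(pi, 4, seed, lba);

	for (unsigned int i = 0; i < 4; i++)
		printf("block %u: ref_tag=%u\n", i, ntohl(pi[i].ref_tag));
	return 0;
}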
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 66ec5985c9f3..0805fa6215ee 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -40,13 +40,14 @@
40 40
41#define NVME_RDMA_MAX_SEGMENTS 256 41#define NVME_RDMA_MAX_SEGMENTS 256
42 42
43#define NVME_RDMA_MAX_INLINE_SEGMENTS 1 43#define NVME_RDMA_MAX_INLINE_SEGMENTS 4
44 44
45struct nvme_rdma_device { 45struct nvme_rdma_device {
46 struct ib_device *dev; 46 struct ib_device *dev;
47 struct ib_pd *pd; 47 struct ib_pd *pd;
48 struct kref ref; 48 struct kref ref;
49 struct list_head entry; 49 struct list_head entry;
50 unsigned int num_inline_segments;
50}; 51};
51 52
52struct nvme_rdma_qe { 53struct nvme_rdma_qe {
@@ -117,6 +118,7 @@ struct nvme_rdma_ctrl {
117 struct sockaddr_storage src_addr; 118 struct sockaddr_storage src_addr;
118 119
119 struct nvme_ctrl ctrl; 120 struct nvme_ctrl ctrl;
121 bool use_inline_data;
120}; 122};
121 123
122static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl) 124static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
249 /* +1 for drain */ 251 /* +1 for drain */
250 init_attr.cap.max_recv_wr = queue->queue_size + 1; 252 init_attr.cap.max_recv_wr = queue->queue_size + 1;
251 init_attr.cap.max_recv_sge = 1; 253 init_attr.cap.max_recv_sge = 1;
252 init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS; 254 init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
253 init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 255 init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
254 init_attr.qp_type = IB_QPT_RC; 256 init_attr.qp_type = IB_QPT_RC;
255 init_attr.send_cq = queue->ib_cq; 257 init_attr.send_cq = queue->ib_cq;
@@ -286,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
286 struct ib_device *ibdev = dev->dev; 288 struct ib_device *ibdev = dev->dev;
287 int ret; 289 int ret;
288 290
291 nvme_req(rq)->ctrl = &ctrl->ctrl;
289 ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), 292 ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
290 DMA_TO_DEVICE); 293 DMA_TO_DEVICE);
291 if (ret) 294 if (ret)
@@ -374,6 +377,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
374 goto out_free_pd; 377 goto out_free_pd;
375 } 378 }
376 379
380 ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
381 ndev->dev->attrs.max_sge - 1);
377 list_add(&ndev->entry, &device_list); 382 list_add(&ndev->entry, &device_list);
378out_unlock: 383out_unlock:
379 mutex_unlock(&device_list_mutex); 384 mutex_unlock(&device_list_mutex);
@@ -868,6 +873,31 @@ out_free_io_queues:
868 return ret; 873 return ret;
869} 874}
870 875
876static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
877 bool remove)
878{
879 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
880 nvme_rdma_stop_queue(&ctrl->queues[0]);
881 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
882 &ctrl->ctrl);
883 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
884 nvme_rdma_destroy_admin_queue(ctrl, remove);
885}
886
887static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
888 bool remove)
889{
890 if (ctrl->ctrl.queue_count > 1) {
891 nvme_stop_queues(&ctrl->ctrl);
892 nvme_rdma_stop_io_queues(ctrl);
893 blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
894 &ctrl->ctrl);
895 if (remove)
896 nvme_start_queues(&ctrl->ctrl);
897 nvme_rdma_destroy_io_queues(ctrl, remove);
898 }
899}
900
871static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl) 901static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
872{ 902{
873 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); 903 struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -912,21 +942,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
912 } 942 }
913} 943}
914 944
915static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) 945static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
916{ 946{
917 struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work), 947 int ret = -EINVAL;
918 struct nvme_rdma_ctrl, reconnect_work);
919 bool changed; 948 bool changed;
920 int ret;
921 949
922 ++ctrl->ctrl.nr_reconnects; 950 ret = nvme_rdma_configure_admin_queue(ctrl, new);
923
924 ret = nvme_rdma_configure_admin_queue(ctrl, false);
925 if (ret) 951 if (ret)
926 goto requeue; 952 return ret;
953
954 if (ctrl->ctrl.icdoff) {
955 dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
956 goto destroy_admin;
957 }
958
959 if (!(ctrl->ctrl.sgls & (1 << 2))) {
960 dev_err(ctrl->ctrl.device,
961 "Mandatory keyed sgls are not supported!\n");
962 goto destroy_admin;
963 }
964
965 if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
966 dev_warn(ctrl->ctrl.device,
967 "queue_size %zu > ctrl sqsize %u, clamping down\n",
968 ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
969 }
970
971 if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
972 dev_warn(ctrl->ctrl.device,
973 "sqsize %u > ctrl maxcmd %u, clamping down\n",
974 ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
975 ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
976 }
977
978 if (ctrl->ctrl.sgls & (1 << 20))
979 ctrl->use_inline_data = true;
927 980
928 if (ctrl->ctrl.queue_count > 1) { 981 if (ctrl->ctrl.queue_count > 1) {
929 ret = nvme_rdma_configure_io_queues(ctrl, false); 982 ret = nvme_rdma_configure_io_queues(ctrl, new);
930 if (ret) 983 if (ret)
931 goto destroy_admin; 984 goto destroy_admin;
932 } 985 }
@@ -935,10 +988,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
935 if (!changed) { 988 if (!changed) {
936 /* state change failure is ok if we're in DELETING state */ 989 /* state change failure is ok if we're in DELETING state */
937 WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); 990 WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
938 return; 991 ret = -EINVAL;
992 goto destroy_io;
939 } 993 }
940 994
941 nvme_start_ctrl(&ctrl->ctrl); 995 nvme_start_ctrl(&ctrl->ctrl);
996 return 0;
997
998destroy_io:
999 if (ctrl->ctrl.queue_count > 1)
1000 nvme_rdma_destroy_io_queues(ctrl, new);
1001destroy_admin:
1002 nvme_rdma_stop_queue(&ctrl->queues[0]);
1003 nvme_rdma_destroy_admin_queue(ctrl, new);
1004 return ret;
1005}
1006
1007static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
1008{
1009 struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
1010 struct nvme_rdma_ctrl, reconnect_work);
1011
1012 ++ctrl->ctrl.nr_reconnects;
1013
1014 if (nvme_rdma_setup_ctrl(ctrl, false))
1015 goto requeue;
942 1016
943 dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", 1017 dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
944 ctrl->ctrl.nr_reconnects); 1018 ctrl->ctrl.nr_reconnects);
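Two of the checks folded into nvme_rdma_setup_ctrl() above are pure off-by-one bookkeeping: SQSIZE is a 0-based field (a value of N means N+1 entries) while queue_size and MAXCMD are 1-based, hence the +1 on one side of each comparison and the -1 when clamping. A worked sketch of the clamp:

#include <stdint.h>
#include <stdio.h>

/* sqsize is 0-based, maxcmd is 1-based: clamp so the controller is
 * never sent more commands than it advertises it can hold */
static uint16_t clamp_sqsize(uint16_t sqsize, uint16_t maxcmd)
{
	if (sqsize + 1 > maxcmd) {
		fprintf(stderr, "sqsize %u > maxcmd %u, clamping down\n",
			sqsize + 1, maxcmd);
		return maxcmd - 1;
	}
	return sqsize;
}

int main(void)
{
	/* e.g. a 128-entry queue against a controller with maxcmd 64 */
	uint16_t sq = clamp_sqsize(127, 64);

	printf("clamped sqsize: %u (= %u entries)\n", sq, sq + 1);
	return 0;
}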
@@ -947,9 +1021,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
947 1021
948 return; 1022 return;
949 1023
950destroy_admin:
951 nvme_rdma_stop_queue(&ctrl->queues[0]);
952 nvme_rdma_destroy_admin_queue(ctrl, false);
953requeue: 1024requeue:
954 dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", 1025 dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
955 ctrl->ctrl.nr_reconnects); 1026 ctrl->ctrl.nr_reconnects);
@@ -962,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
962 struct nvme_rdma_ctrl, err_work); 1033 struct nvme_rdma_ctrl, err_work);
963 1034
964 nvme_stop_keep_alive(&ctrl->ctrl); 1035 nvme_stop_keep_alive(&ctrl->ctrl);
965 1036 nvme_rdma_teardown_io_queues(ctrl, false);
966 if (ctrl->ctrl.queue_count > 1) {
967 nvme_stop_queues(&ctrl->ctrl);
968 nvme_rdma_stop_io_queues(ctrl);
969 blk_mq_tagset_busy_iter(&ctrl->tag_set,
970 nvme_cancel_request, &ctrl->ctrl);
971 nvme_rdma_destroy_io_queues(ctrl, false);
972 }
973
974 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
975 nvme_rdma_stop_queue(&ctrl->queues[0]);
976 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
977 nvme_cancel_request, &ctrl->ctrl);
978 nvme_rdma_destroy_admin_queue(ctrl, false);
979
980 /*
981 * queues are not a live anymore, so restart the queues to fail fast
982 * new IO
983 */
984 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
985 nvme_start_queues(&ctrl->ctrl); 1037 nvme_start_queues(&ctrl->ctrl);
1038 nvme_rdma_teardown_admin_queue(ctrl, false);
986 1039
987 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 1040 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
988 /* state change failure is ok if we're in DELETING state */ 1041 /* state change failure is ok if we're in DELETING state */
@@ -1090,19 +1143,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c)
1090} 1143}
1091 1144
1092static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, 1145static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
1093 struct nvme_rdma_request *req, struct nvme_command *c) 1146 struct nvme_rdma_request *req, struct nvme_command *c,
1147 int count)
1094{ 1148{
1095 struct nvme_sgl_desc *sg = &c->common.dptr.sgl; 1149 struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1150 struct scatterlist *sgl = req->sg_table.sgl;
1151 struct ib_sge *sge = &req->sge[1];
1152 u32 len = 0;
1153 int i;
1096 1154
1097 req->sge[1].addr = sg_dma_address(req->sg_table.sgl); 1155 for (i = 0; i < count; i++, sgl++, sge++) {
1098 req->sge[1].length = sg_dma_len(req->sg_table.sgl); 1156 sge->addr = sg_dma_address(sgl);
1099 req->sge[1].lkey = queue->device->pd->local_dma_lkey; 1157 sge->length = sg_dma_len(sgl);
1158 sge->lkey = queue->device->pd->local_dma_lkey;
1159 len += sge->length;
1160 }
1100 1161
1101 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); 1162 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1102 sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl)); 1163 sg->length = cpu_to_le32(len);
1103 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; 1164 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1104 1165
1105 req->num_sge++; 1166 req->num_sge += count;
1106 return 0; 1167 return 0;
1107} 1168}
1108 1169
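With NVME_RDMA_MAX_INLINE_SEGMENTS raised to 4 and clamped to the device's max_sge minus one (the first SGE stays reserved for the command capsule itself), the inline mapper above now sums a short gather list into a single in-capsule data descriptor. A sketch of that accounting (struct sge is a stand-in for struct ib_sge; the limits are illustrative):

#include <stdint.h>
#include <stdio.h>

#define MAX_INLINE_SEGMENTS 4

struct sge { uint64_t addr; uint32_t length; };

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* total length reported in the inline-data SGL descriptor */
static uint32_t inline_data_len(const struct sge *sg, int count)
{
	uint32_t len = 0;

	for (int i = 0; i < count; i++)
		len += sg[i].length;
	return len;
}

int main(void)
{
	unsigned int device_max_sge = 3;   /* illustrative HCA limit */
	unsigned int num_inline = min_u(MAX_INLINE_SEGMENTS,
					device_max_sge - 1);
	struct sge sg[2] = { { 0x1000, 512 }, { 0x3000, 1536 } };

	printf("inline segments allowed: %u\n", num_inline);
	printf("descriptor length: %u\n", inline_data_len(sg, 2));
	return 0;
}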
@@ -1195,15 +1256,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
1195 goto out_free_table; 1256 goto out_free_table;
1196 } 1257 }
1197 1258
1198 if (count == 1) { 1259 if (count <= dev->num_inline_segments) {
1199 if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && 1260 if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
1261 queue->ctrl->use_inline_data &&
1200 blk_rq_payload_bytes(rq) <= 1262 blk_rq_payload_bytes(rq) <=
1201 nvme_rdma_inline_data_size(queue)) { 1263 nvme_rdma_inline_data_size(queue)) {
1202 ret = nvme_rdma_map_sg_inline(queue, req, c); 1264 ret = nvme_rdma_map_sg_inline(queue, req, c, count);
1203 goto out; 1265 goto out;
1204 } 1266 }
1205 1267
1206 if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { 1268 if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
1207 ret = nvme_rdma_map_sg_single(queue, req, c); 1269 ret = nvme_rdma_map_sg_single(queue, req, c);
1208 goto out; 1270 goto out;
1209 } 1271 }
@@ -1574,6 +1636,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1574 case RDMA_CM_EVENT_CONNECT_ERROR: 1636 case RDMA_CM_EVENT_CONNECT_ERROR:
1575 case RDMA_CM_EVENT_UNREACHABLE: 1637 case RDMA_CM_EVENT_UNREACHABLE:
1576 nvme_rdma_destroy_queue_ib(queue); 1638 nvme_rdma_destroy_queue_ib(queue);
1639 /* fall through */
1577 case RDMA_CM_EVENT_ADDR_ERROR: 1640 case RDMA_CM_EVENT_ADDR_ERROR:
1578 dev_dbg(queue->ctrl->ctrl.device, 1641 dev_dbg(queue->ctrl->ctrl.device,
1579 "CM error event %d\n", ev->event); 1642 "CM error event %d\n", ev->event);
@@ -1736,25 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1736 1799
1737static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) 1800static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
1738{ 1801{
1739 if (ctrl->ctrl.queue_count > 1) { 1802 nvme_rdma_teardown_io_queues(ctrl, shutdown);
1740 nvme_stop_queues(&ctrl->ctrl);
1741 nvme_rdma_stop_io_queues(ctrl);
1742 blk_mq_tagset_busy_iter(&ctrl->tag_set,
1743 nvme_cancel_request, &ctrl->ctrl);
1744 nvme_rdma_destroy_io_queues(ctrl, shutdown);
1745 }
1746
1747 if (shutdown) 1803 if (shutdown)
1748 nvme_shutdown_ctrl(&ctrl->ctrl); 1804 nvme_shutdown_ctrl(&ctrl->ctrl);
1749 else 1805 else
1750 nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); 1806 nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
1751 1807 nvme_rdma_teardown_admin_queue(ctrl, shutdown);
1752 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1753 nvme_rdma_stop_queue(&ctrl->queues[0]);
1754 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
1755 nvme_cancel_request, &ctrl->ctrl);
1756 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
1757 nvme_rdma_destroy_admin_queue(ctrl, shutdown);
1758} 1808}
1759 1809
1760static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) 1810static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
@@ -1766,8 +1816,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1766{ 1816{
1767 struct nvme_rdma_ctrl *ctrl = 1817 struct nvme_rdma_ctrl *ctrl =
1768 container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); 1818 container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
1769 int ret;
1770 bool changed;
1771 1819
1772 nvme_stop_ctrl(&ctrl->ctrl); 1820 nvme_stop_ctrl(&ctrl->ctrl);
1773 nvme_rdma_shutdown_ctrl(ctrl, false); 1821 nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -1778,25 +1826,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1778 return; 1826 return;
1779 } 1827 }
1780 1828
1781 ret = nvme_rdma_configure_admin_queue(ctrl, false); 1829 if (nvme_rdma_setup_ctrl(ctrl, false))
1782 if (ret)
1783 goto out_fail; 1830 goto out_fail;
1784 1831
1785 if (ctrl->ctrl.queue_count > 1) {
1786 ret = nvme_rdma_configure_io_queues(ctrl, false);
1787 if (ret)
1788 goto out_fail;
1789 }
1790
1791 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1792 if (!changed) {
1793 /* state change failure is ok if we're in DELETING state */
1794 WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1795 return;
1796 }
1797
1798 nvme_start_ctrl(&ctrl->ctrl);
1799
1800 return; 1832 return;
1801 1833
1802out_fail: 1834out_fail:
@@ -1959,49 +1991,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1959 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); 1991 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
1960 WARN_ON_ONCE(!changed); 1992 WARN_ON_ONCE(!changed);
1961 1993
1962 ret = nvme_rdma_configure_admin_queue(ctrl, true); 1994 ret = nvme_rdma_setup_ctrl(ctrl, true);
1963 if (ret) 1995 if (ret)
1964 goto out_uninit_ctrl; 1996 goto out_uninit_ctrl;
1965 1997
1966 /* sanity check icdoff */
1967 if (ctrl->ctrl.icdoff) {
1968 dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
1969 ret = -EINVAL;
1970 goto out_remove_admin_queue;
1971 }
1972
1973 /* sanity check keyed sgls */
1974 if (!(ctrl->ctrl.sgls & (1 << 2))) {
1975 dev_err(ctrl->ctrl.device,
1976 "Mandatory keyed sgls are not supported!\n");
1977 ret = -EINVAL;
1978 goto out_remove_admin_queue;
1979 }
1980
1981 /* only warn if argument is too large here, will clamp later */
1982 if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
1983 dev_warn(ctrl->ctrl.device,
1984 "queue_size %zu > ctrl sqsize %u, clamping down\n",
1985 opts->queue_size, ctrl->ctrl.sqsize + 1);
1986 }
1987
1988 /* warn if maxcmd is lower than sqsize+1 */
1989 if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
1990 dev_warn(ctrl->ctrl.device,
1991 "sqsize %u > ctrl maxcmd %u, clamping down\n",
1992 ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
1993 ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
1994 }
1995
1996 if (opts->nr_io_queues) {
1997 ret = nvme_rdma_configure_io_queues(ctrl, true);
1998 if (ret)
1999 goto out_remove_admin_queue;
2000 }
2001
2002 changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
2003 WARN_ON_ONCE(!changed);
2004
2005 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", 1998 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
2006 ctrl->ctrl.opts->subsysnqn, &ctrl->addr); 1999 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2007 2000
@@ -2011,13 +2004,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
2011 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); 2004 list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
2012 mutex_unlock(&nvme_rdma_ctrl_mutex); 2005 mutex_unlock(&nvme_rdma_ctrl_mutex);
2013 2006
2014 nvme_start_ctrl(&ctrl->ctrl);
2015
2016 return &ctrl->ctrl; 2007 return &ctrl->ctrl;
2017 2008
2018out_remove_admin_queue:
2019 nvme_rdma_stop_queue(&ctrl->queues[0]);
2020 nvme_rdma_destroy_admin_queue(ctrl, true);
2021out_uninit_ctrl: 2009out_uninit_ctrl:
2022 nvme_uninit_ctrl(&ctrl->ctrl); 2010 nvme_uninit_ctrl(&ctrl->ctrl);
2023 nvme_put_ctrl(&ctrl->ctrl); 2011 nvme_put_ctrl(&ctrl->ctrl);
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 41944bbef835..25b0e310f4a8 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -128,3 +128,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
128 return nvme_trace_common(p, cdw10); 128 return nvme_trace_common(p, cdw10);
129 } 129 }
130} 130}
131
132const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
133{
134 const char *ret = trace_seq_buffer_ptr(p);
135
136 if (*name)
137 trace_seq_printf(p, "disk=%s, ", name);
138 trace_seq_putc(p, 0);
139
140 return ret;
141}
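nvme_trace_disk_name() follows the usual trace_seq convention: remember where this entry starts in the sequence buffer, append "disk=..., " only when the name is non-empty, NUL-terminate, and hand back the start pointer for the printk format. A userspace analog of that pattern (struct seq is a toy stand-in for struct trace_seq):

#include <stdio.h>

/* toy stand-in for trace_seq: flat buffer plus a write cursor */
struct seq {
	char   buf[128];
	size_t len;
};

static const char *format_disk(struct seq *p, const char *name)
{
	const char *ret = p->buf + p->len;

	if (*name)
		p->len += (size_t)snprintf(p->buf + p->len,
					   sizeof(p->buf) - p->len,
					   "disk=%s, ", name);
	p->buf[p->len++] = '\0';   /* terminate this entry */
	return ret;
}

int main(void)
{
	struct seq s = { .len = 0 };

	printf("[%s]\n", format_disk(&s, "nvme0n1")); /* [disk=nvme0n1, ] */
	printf("[%s]\n", format_disk(&s, ""));        /* [] */
	return 0;
}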
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 01390f0e1671..a490790d6691 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -50,13 +50,8 @@
50 nvme_admin_opcode_name(nvme_admin_security_recv), \ 50 nvme_admin_opcode_name(nvme_admin_security_recv), \
51 nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) 51 nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
52 52
53const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
54 u8 *cdw10);
55#define __parse_nvme_admin_cmd(opcode, cdw10) \
56 nvme_trace_parse_admin_cmd(p, opcode, cdw10)
57
58#define nvme_opcode_name(opcode) { opcode, #opcode } 53#define nvme_opcode_name(opcode) { opcode, #opcode }
59#define show_opcode_name(val) \ 54#define show_nvm_opcode_name(val) \
60 __print_symbolic(val, \ 55 __print_symbolic(val, \
61 nvme_opcode_name(nvme_cmd_flush), \ 56 nvme_opcode_name(nvme_cmd_flush), \
62 nvme_opcode_name(nvme_cmd_write), \ 57 nvme_opcode_name(nvme_cmd_write), \
@@ -70,85 +65,92 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
70 nvme_opcode_name(nvme_cmd_resv_acquire), \ 65 nvme_opcode_name(nvme_cmd_resv_acquire), \
71 nvme_opcode_name(nvme_cmd_resv_release)) 66 nvme_opcode_name(nvme_cmd_resv_release))
72 67
73const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, 68#define show_opcode_name(qid, opcode) \
74 u8 *cdw10); 69 (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
75#define __parse_nvme_cmd(opcode, cdw10) \
76 nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
77
78TRACE_EVENT(nvme_setup_admin_cmd,
79 TP_PROTO(struct nvme_command *cmd),
80 TP_ARGS(cmd),
81 TP_STRUCT__entry(
82 __field(u8, opcode)
83 __field(u8, flags)
84 __field(u16, cid)
85 __field(u64, metadata)
86 __array(u8, cdw10, 24)
87 ),
88 TP_fast_assign(
89 __entry->opcode = cmd->common.opcode;
90 __entry->flags = cmd->common.flags;
91 __entry->cid = cmd->common.command_id;
92 __entry->metadata = le64_to_cpu(cmd->common.metadata);
93 memcpy(__entry->cdw10, cmd->common.cdw10,
94 sizeof(__entry->cdw10));
95 ),
96 TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
97 __entry->cid, __entry->flags, __entry->metadata,
98 show_admin_opcode_name(__entry->opcode),
99 __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
100);
101
102 70
103TRACE_EVENT(nvme_setup_nvm_cmd, 71const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
104 TP_PROTO(int qid, struct nvme_command *cmd), 72 u8 *cdw10);
105 TP_ARGS(qid, cmd), 73const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
74 u8 *cdw10);
75
76#define parse_nvme_cmd(qid, opcode, cdw10) \
77 (qid ? \
78 nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \
79 nvme_trace_parse_admin_cmd(p, opcode, cdw10))
80
81const char *nvme_trace_disk_name(struct trace_seq *p, char *name);
82#define __print_disk_name(name) \
83 nvme_trace_disk_name(p, name)
84
85#ifndef TRACE_HEADER_MULTI_READ
86static inline void __assign_disk_name(char *name, struct gendisk *disk)
87{
88 if (disk)
89 memcpy(name, disk->disk_name, DISK_NAME_LEN);
90 else
91 memset(name, 0, DISK_NAME_LEN);
92}
93#endif
94
95TRACE_EVENT(nvme_setup_cmd,
96 TP_PROTO(struct request *req, struct nvme_command *cmd),
97 TP_ARGS(req, cmd),
106 TP_STRUCT__entry( 98 TP_STRUCT__entry(
107 __field(int, qid) 99 __array(char, disk, DISK_NAME_LEN)
108 __field(u8, opcode) 100 __field(int, ctrl_id)
109 __field(u8, flags) 101 __field(int, qid)
110 __field(u16, cid) 102 __field(u8, opcode)
111 __field(u32, nsid) 103 __field(u8, flags)
112 __field(u64, metadata) 104 __field(u16, cid)
113 __array(u8, cdw10, 24) 105 __field(u32, nsid)
106 __field(u64, metadata)
107 __array(u8, cdw10, 24)
114 ), 108 ),
115 TP_fast_assign( 109 TP_fast_assign(
116 __entry->qid = qid; 110 __entry->ctrl_id = nvme_req(req)->ctrl->instance;
117 __entry->opcode = cmd->common.opcode; 111 __entry->qid = nvme_req_qid(req);
118 __entry->flags = cmd->common.flags; 112 __entry->opcode = cmd->common.opcode;
119 __entry->cid = cmd->common.command_id; 113 __entry->flags = cmd->common.flags;
120 __entry->nsid = le32_to_cpu(cmd->common.nsid); 114 __entry->cid = cmd->common.command_id;
121 __entry->metadata = le64_to_cpu(cmd->common.metadata); 115 __entry->nsid = le32_to_cpu(cmd->common.nsid);
122 memcpy(__entry->cdw10, cmd->common.cdw10, 116 __entry->metadata = le64_to_cpu(cmd->common.metadata);
123 sizeof(__entry->cdw10)); 117 __assign_disk_name(__entry->disk, req->rq_disk);
118 memcpy(__entry->cdw10, cmd->common.cdw10,
119 sizeof(__entry->cdw10));
124 ), 120 ),
125 TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)", 121 TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
126 __entry->qid, __entry->nsid, __entry->cid, 122 __entry->ctrl_id, __print_disk_name(__entry->disk),
123 __entry->qid, __entry->cid, __entry->nsid,
127 __entry->flags, __entry->metadata, 124 __entry->flags, __entry->metadata,
128 show_opcode_name(__entry->opcode), 125 show_opcode_name(__entry->qid, __entry->opcode),
129 __parse_nvme_cmd(__entry->opcode, __entry->cdw10)) 126 parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10))
130); 127);
131 128
132TRACE_EVENT(nvme_complete_rq, 129TRACE_EVENT(nvme_complete_rq,
133 TP_PROTO(struct request *req), 130 TP_PROTO(struct request *req),
134 TP_ARGS(req), 131 TP_ARGS(req),
135 TP_STRUCT__entry( 132 TP_STRUCT__entry(
136 __field(int, qid) 133 __array(char, disk, DISK_NAME_LEN)
137 __field(int, cid) 134 __field(int, ctrl_id)
138 __field(u64, result) 135 __field(int, qid)
139 __field(u8, retries) 136 __field(int, cid)
140 __field(u8, flags) 137 __field(u64, result)
141 __field(u16, status) 138 __field(u8, retries)
139 __field(u8, flags)
140 __field(u16, status)
142 ), 141 ),
143 TP_fast_assign( 142 TP_fast_assign(
144 __entry->qid = req->q->id; 143 __entry->ctrl_id = nvme_req(req)->ctrl->instance;
145 __entry->cid = req->tag; 144 __entry->qid = nvme_req_qid(req);
146 __entry->result = le64_to_cpu(nvme_req(req)->result.u64); 145 __entry->cid = req->tag;
147 __entry->retries = nvme_req(req)->retries; 146 __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
148 __entry->flags = nvme_req(req)->flags; 147 __entry->retries = nvme_req(req)->retries;
149 __entry->status = nvme_req(req)->status; 148 __entry->flags = nvme_req(req)->flags;
149 __entry->status = nvme_req(req)->status;
150 __assign_disk_name(__entry->disk, req->rq_disk);
150 ), 151 ),
151 TP_printk("qid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u", 152 TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
153 __entry->ctrl_id, __print_disk_name(__entry->disk),
152 __entry->qid, __entry->cid, __entry->result, 154 __entry->qid, __entry->cid, __entry->result,
153 __entry->retries, __entry->flags, __entry->status) 155 __entry->retries, __entry->flags, __entry->status)
154 156
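Merging the admin and I/O trace events into one nvme_setup_cmd works because qid 0 is by definition the admin queue, so both the opcode table and the cdw10 parser can be selected at print time, exactly as the show_opcode_name()/parse_nvme_cmd() macros do. A compact sketch of that dispatch (opcode values from the NVMe spec; the tables are abbreviated):

#include <stdio.h>

static const char *opcode_name(int qid, unsigned char opcode)
{
	if (qid == 0) {                 /* admin queue */
		switch (opcode) {
		case 0x02: return "nvme_admin_get_log_page";
		case 0x06: return "nvme_admin_identify";
		default:   return "unknown admin opcode";
		}
	}
	switch (opcode) {               /* I/O queue */
	case 0x01: return "nvme_cmd_write";
	case 0x02: return "nvme_cmd_read";
	default:   return "unknown nvm opcode";
	}
}

int main(void)
{
	/* the same opcode byte means different things per queue type */
	printf("qid 0, 0x02: %s\n", opcode_name(0, 0x02));
	printf("qid 3, 0x02: %s\n", opcode_name(3, 0x02));
	return 0;
}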
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 38803576d5e1..a21caea1e080 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -19,6 +19,19 @@
19#include <asm/unaligned.h> 19#include <asm/unaligned.h>
20#include "nvmet.h" 20#include "nvmet.h"
21 21
22/*
 23 * This helper allows us to clear the AEN based on the RAE bit.
 24 * Use it when processing the log pages which are associated
 25 * with the AEN.
26 */
27static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
28{
29 int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
30
31 if (!rae)
32 clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
33}
34
22u32 nvmet_get_log_page_len(struct nvme_command *cmd) 35u32 nvmet_get_log_page_len(struct nvme_command *cmd)
23{ 36{
24 u32 len = le16_to_cpu(cmd->get_log_page.numdu); 37 u32 len = le16_to_cpu(cmd->get_log_page.numdu);
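The RAE (Retain Asynchronous Event) bit lives in bit 15 of Get Log Page CDW10, alongside the log page identifier in bits 7:0; only when the host leaves RAE clear does the target re-arm the matching AEN, which is what the nvmet_clear_aen() helper above implements. A sketch of pulling those fields apart:

#include <stdint.h>
#include <stdio.h>

/* Get Log Page CDW10: log page ID in bits 7:0, RAE in bit 15 */
static unsigned int cdw10_lid(uint32_t cdw10) { return cdw10 & 0xff; }
static unsigned int cdw10_rae(uint32_t cdw10) { return (cdw10 >> 15) & 1; }

int main(void)
{
	uint32_t cdw10 = (1u << 15) | 0x0c;   /* RAE set, LID 0x0c (ANA) */

	printf("lid=0x%02x rae=%u\n", cdw10_lid(cdw10), cdw10_rae(cdw10));
	if (!cdw10_rae(cdw10))
		printf("would clear the AEN mask bit here\n");
	return 0;
}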
@@ -128,6 +141,36 @@ out:
128 nvmet_req_complete(req, status); 141 nvmet_req_complete(req, status);
129} 142}
130 143
144static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req)
145{
146 u16 status = NVME_SC_INTERNAL;
147 struct nvme_effects_log *log;
148
149 log = kzalloc(sizeof(*log), GFP_KERNEL);
150 if (!log)
151 goto out;
152
153 log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0);
154 log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0);
155 log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0);
156 log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0);
157 log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0);
158 log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0);
159 log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0);
160
161 log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0);
162 log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0);
163 log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0);
164 log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0);
165 log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0);
166
167 status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
168
169 kfree(log);
170out:
171 nvmet_req_complete(req, status);
172}
173
131static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) 174static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
132{ 175{
133 struct nvmet_ctrl *ctrl = req->sq->ctrl; 176 struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -146,12 +189,76 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
146 if (!status) 189 if (!status)
147 status = nvmet_zero_sgl(req, len, req->data_len - len); 190 status = nvmet_zero_sgl(req, len, req->data_len - len);
148 ctrl->nr_changed_ns = 0; 191 ctrl->nr_changed_ns = 0;
149 clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked); 192 nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR);
150 mutex_unlock(&ctrl->lock); 193 mutex_unlock(&ctrl->lock);
151out: 194out:
152 nvmet_req_complete(req, status); 195 nvmet_req_complete(req, status);
153} 196}
154 197
198static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
199 struct nvme_ana_group_desc *desc)
200{
201 struct nvmet_ctrl *ctrl = req->sq->ctrl;
202 struct nvmet_ns *ns;
203 u32 count = 0;
204
205 if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
206 rcu_read_lock();
207 list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
208 if (ns->anagrpid == grpid)
209 desc->nsids[count++] = cpu_to_le32(ns->nsid);
210 rcu_read_unlock();
211 }
212
213 desc->grpid = cpu_to_le32(grpid);
214 desc->nnsids = cpu_to_le32(count);
215 desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
216 desc->state = req->port->ana_state[grpid];
217 memset(desc->rsvd17, 0, sizeof(desc->rsvd17));
218 return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32);
219}
220
221static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
222{
223 struct nvme_ana_rsp_hdr hdr = { 0, };
224 struct nvme_ana_group_desc *desc;
225 size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */
226 size_t len;
227 u32 grpid;
228 u16 ngrps = 0;
229 u16 status;
230
231 status = NVME_SC_INTERNAL;
232 desc = kmalloc(sizeof(struct nvme_ana_group_desc) +
233 NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL);
234 if (!desc)
235 goto out;
236
237 down_read(&nvmet_ana_sem);
238 for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) {
239 if (!nvmet_ana_group_enabled[grpid])
240 continue;
241 len = nvmet_format_ana_group(req, grpid, desc);
242 status = nvmet_copy_to_sgl(req, offset, desc, len);
243 if (status)
244 break;
245 offset += len;
246 ngrps++;
247 }
248
249 hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
250 hdr.ngrps = cpu_to_le16(ngrps);
251 nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE);
252 up_read(&nvmet_ana_sem);
253
254 kfree(desc);
255
256 /* copy the header last once we know the number of groups */
257 status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr));
258out:
259 nvmet_req_complete(req, status);
260}
261
155static void nvmet_execute_identify_ctrl(struct nvmet_req *req) 262static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
156{ 263{
157 struct nvmet_ctrl *ctrl = req->sq->ctrl; 264 struct nvmet_ctrl *ctrl = req->sq->ctrl;
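nvmet_execute_get_log_page_ana() streams one variable-length descriptor per enabled group and only then copies the header, so ngrps is exact by the time it is written at offset 0. The length arithmetic, as a standalone sketch (the 16-byte header and 32-byte descriptor match the spec layouts; the group contents are illustrative):

#include <stdint.h>
#include <stdio.h>

#define ANA_HDR_LEN   16u   /* chgcnt + ngrps + reserved */
#define ANA_DESC_LEN  32u   /* fixed part of a group descriptor */

/* total log length: header, then per group a descriptor plus its
 * NSID list -- unless RGO ("return groups only") suppresses NSIDs */
static size_t ana_log_len(const uint32_t *nnsids, unsigned int ngrps,
			  int rgo)
{
	size_t len = ANA_HDR_LEN;

	for (unsigned int i = 0; i < ngrps; i++)
		len += ANA_DESC_LEN +
		       (rgo ? 0 : nnsids[i] * sizeof(uint32_t));
	return len;
}

int main(void)
{
	uint32_t nnsids[] = { 2, 0, 5 };   /* three enabled groups */

	printf("full log: %zu bytes\n", ana_log_len(nnsids, 3, 0));
	printf("RGO log:  %zu bytes\n", ana_log_len(nnsids, 3, 1));
	return 0;
}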
@@ -183,8 +290,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
183 * the safest is to leave it as zeroes. 290 * the safest is to leave it as zeroes.
184 */ 291 */
185 292
 186 /* we support multiple ports and multiple hosts: */ 293 /* we support multiple ports, multiple hosts and ANA: */
187 id->cmic = (1 << 0) | (1 << 1); 294 id->cmic = (1 << 0) | (1 << 1) | (1 << 3);
188 295
189 /* no limit on data transfer sizes for now */ 296 /* no limit on data transfer sizes for now */
190 id->mdts = 0; 297 id->mdts = 0;
@@ -208,7 +315,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
208 315
209 /* first slot is read-only, only one slot supported */ 316 /* first slot is read-only, only one slot supported */
210 id->frmw = (1 << 0) | (1 << 1); 317 id->frmw = (1 << 0) | (1 << 1);
211 id->lpa = (1 << 0) | (1 << 2); 318 id->lpa = (1 << 0) | (1 << 1) | (1 << 2);
212 id->elpe = NVMET_ERROR_LOG_SLOTS - 1; 319 id->elpe = NVMET_ERROR_LOG_SLOTS - 1;
213 id->npss = 0; 320 id->npss = 0;
214 321
@@ -222,6 +329,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
222 id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); 329 id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
223 330
224 id->nn = cpu_to_le32(ctrl->subsys->max_nsid); 331 id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
332 id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
225 id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | 333 id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
226 NVME_CTRL_ONCS_WRITE_ZEROES); 334 NVME_CTRL_ONCS_WRITE_ZEROES);
227 335
@@ -238,19 +346,24 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
238 id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ 346 id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
239 if (ctrl->ops->has_keyed_sgls) 347 if (ctrl->ops->has_keyed_sgls)
240 id->sgls |= cpu_to_le32(1 << 2); 348 id->sgls |= cpu_to_le32(1 << 2);
241 if (ctrl->ops->sqe_inline_size) 349 if (req->port->inline_data_size)
242 id->sgls |= cpu_to_le32(1 << 20); 350 id->sgls |= cpu_to_le32(1 << 20);
243 351
244 strcpy(id->subnqn, ctrl->subsys->subsysnqn); 352 strcpy(id->subnqn, ctrl->subsys->subsysnqn);
245 353
246 /* Max command capsule size is sqe + single page of in-capsule data */ 354 /* Max command capsule size is sqe + single page of in-capsule data */
247 id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + 355 id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
248 ctrl->ops->sqe_inline_size) / 16); 356 req->port->inline_data_size) / 16);
249 /* Max response capsule size is cqe */ 357 /* Max response capsule size is cqe */
250 id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); 358 id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
251 359
252 id->msdbd = ctrl->ops->msdbd; 360 id->msdbd = ctrl->ops->msdbd;
253 361
362 id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
363 id->anatt = 10; /* random value */
364 id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS);
365 id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS);
366
254 /* 367 /*
255 * Meh, we don't really support any power state. Fake up the same 368 * Meh, we don't really support any power state. Fake up the same
256 * values that qemu does. 369 * values that qemu does.
@@ -259,6 +372,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
259 id->psd[0].entry_lat = cpu_to_le32(0x10); 372 id->psd[0].entry_lat = cpu_to_le32(0x10);
260 id->psd[0].exit_lat = cpu_to_le32(0x4); 373 id->psd[0].exit_lat = cpu_to_le32(0x4);
261 374
375 id->nwpc = 1 << 0; /* write protect and no write protect */
376
262 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); 377 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
263 378
264 kfree(id); 379 kfree(id);
@@ -292,8 +407,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
292 * nuse = ncap = nsze isn't always true, but we have no way to find 407 * nuse = ncap = nsze isn't always true, but we have no way to find
293 * that out from the underlying device. 408 * that out from the underlying device.
294 */ 409 */
295 id->ncap = id->nuse = id->nsze = 410 id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift);
296 cpu_to_le64(ns->size >> ns->blksize_shift); 411 switch (req->port->ana_state[ns->anagrpid]) {
412 case NVME_ANA_INACCESSIBLE:
413 case NVME_ANA_PERSISTENT_LOSS:
414 break;
415 default:
416 id->nuse = id->nsze;
417 break;
418 }
297 419
298 /* 420 /*
299 * We just provide a single LBA format that matches what the 421 * We just provide a single LBA format that matches what the
@@ -307,11 +429,14 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
307 * controllers, but also with any other user of the block device. 429 * controllers, but also with any other user of the block device.
308 */ 430 */
309 id->nmic = (1 << 0); 431 id->nmic = (1 << 0);
432 id->anagrpid = cpu_to_le32(ns->anagrpid);
310 433
311 memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); 434 memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid));
312 435
313 id->lbaf[0].ds = ns->blksize_shift; 436 id->lbaf[0].ds = ns->blksize_shift;
314 437
438 if (ns->readonly)
439 id->nsattr |= (1 << 0);
315 nvmet_put_namespace(ns); 440 nvmet_put_namespace(ns);
316done: 441done:
317 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); 442 status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
@@ -424,6 +549,52 @@ static void nvmet_execute_abort(struct nvmet_req *req)
424 nvmet_req_complete(req, 0); 549 nvmet_req_complete(req, 0);
425} 550}
426 551
552static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req)
553{
554 u16 status;
555
556 if (req->ns->file)
557 status = nvmet_file_flush(req);
558 else
559 status = nvmet_bdev_flush(req);
560
561 if (status)
562 pr_err("write protect flush failed nsid: %u\n", req->ns->nsid);
563 return status;
564}
565
566static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
567{
568 u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]);
569 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
570 u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE;
571
572 req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
573 if (unlikely(!req->ns))
574 return status;
575
576 mutex_lock(&subsys->lock);
577 switch (write_protect) {
578 case NVME_NS_WRITE_PROTECT:
579 req->ns->readonly = true;
580 status = nvmet_write_protect_flush_sync(req);
581 if (status)
582 req->ns->readonly = false;
583 break;
584 case NVME_NS_NO_WRITE_PROTECT:
585 req->ns->readonly = false;
586 status = 0;
587 break;
588 default:
589 break;
590 }
591
592 if (!status)
593 nvmet_ns_changed(subsys, req->ns->nsid);
594 mutex_unlock(&subsys->lock);
595 return status;
596}
597
427static void nvmet_execute_set_features(struct nvmet_req *req) 598static void nvmet_execute_set_features(struct nvmet_req *req)
428{ 599{
429 struct nvmet_subsys *subsys = req->sq->ctrl->subsys; 600 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
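The set-features handler above only lets the readonly flag stick if the flush that drains in-flight writes succeeds, rolling it back otherwise. That rollback shape, boiled down to a userspace sketch (flush() is a stand-in that can be made to fail):

#include <stdbool.h>
#include <stdio.h>

struct ns { bool readonly; };

static int flush(struct ns *ns)   /* stand-in; nonzero = failure */
{
	(void)ns;
	return 0;
}

static int set_write_protect(struct ns *ns, unsigned int wps)
{
	switch (wps) {
	case 1:                      /* write protect */
		ns->readonly = true;
		if (flush(ns)) {
			ns->readonly = false;   /* roll back on failure */
			return -1;
		}
		return 0;
	case 0:                      /* no write protect */
		ns->readonly = false;
		return 0;
	default:
		return -1;           /* feature not changeable */
	}
}

int main(void)
{
	struct ns ns = { .readonly = false };

	printf("set WP -> %d, readonly=%d\n",
	       set_write_protect(&ns, 1), ns.readonly);
	return 0;
}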
@@ -454,6 +625,9 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
454 case NVME_FEAT_HOST_ID: 625 case NVME_FEAT_HOST_ID:
455 status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; 626 status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
456 break; 627 break;
628 case NVME_FEAT_WRITE_PROTECT:
629 status = nvmet_set_feat_write_protect(req);
630 break;
457 default: 631 default:
458 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 632 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
459 break; 633 break;
@@ -462,6 +636,26 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
462 nvmet_req_complete(req, status); 636 nvmet_req_complete(req, status);
463} 637}
464 638
639static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
640{
641 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
642 u32 result;
643
644 req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid);
645 if (!req->ns)
646 return NVME_SC_INVALID_NS | NVME_SC_DNR;
647
648 mutex_lock(&subsys->lock);
 649 if (req->ns->readonly)
650 result = NVME_NS_WRITE_PROTECT;
651 else
652 result = NVME_NS_NO_WRITE_PROTECT;
653 nvmet_set_result(req, result);
654 mutex_unlock(&subsys->lock);
655
656 return 0;
657}
658
465static void nvmet_execute_get_features(struct nvmet_req *req) 659static void nvmet_execute_get_features(struct nvmet_req *req)
466{ 660{
467 struct nvmet_subsys *subsys = req->sq->ctrl->subsys; 661 struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
@@ -513,6 +707,9 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
513 status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid, 707 status = nvmet_copy_to_sgl(req, 0, &req->sq->ctrl->hostid,
514 sizeof(req->sq->ctrl->hostid)); 708 sizeof(req->sq->ctrl->hostid));
515 break; 709 break;
710 case NVME_FEAT_WRITE_PROTECT:
711 status = nvmet_get_feat_write_protect(req);
712 break;
516 default: 713 default:
517 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; 714 status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
518 break; 715 break;
@@ -586,6 +783,12 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
586 case NVME_LOG_CHANGED_NS: 783 case NVME_LOG_CHANGED_NS:
587 req->execute = nvmet_execute_get_log_changed_ns; 784 req->execute = nvmet_execute_get_log_changed_ns;
588 return 0; 785 return 0;
786 case NVME_LOG_CMD_EFFECTS:
787 req->execute = nvmet_execute_get_log_cmd_effects_ns;
788 return 0;
789 case NVME_LOG_ANA:
790 req->execute = nvmet_execute_get_log_page_ana;
791 return 0;
589 } 792 }
590 break; 793 break;
591 case nvme_admin_identify: 794 case nvme_admin_identify:
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index ebea1373d1b7..b37a8e3e3f80 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -218,6 +218,35 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item,
218 218
219CONFIGFS_ATTR(nvmet_, addr_trsvcid); 219CONFIGFS_ATTR(nvmet_, addr_trsvcid);
220 220
221static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
222 char *page)
223{
224 struct nvmet_port *port = to_nvmet_port(item);
225
226 return snprintf(page, PAGE_SIZE, "%d\n", port->inline_data_size);
227}
228
229static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
230 const char *page, size_t count)
231{
232 struct nvmet_port *port = to_nvmet_port(item);
233 int ret;
234
235 if (port->enabled) {
236 pr_err("Cannot modify inline_data_size while port enabled\n");
237 pr_err("Disable the port before modifying\n");
238 return -EACCES;
239 }
240 ret = kstrtoint(page, 0, &port->inline_data_size);
241 if (ret) {
242 pr_err("Invalid value '%s' for inline_data_size\n", page);
243 return -EINVAL;
244 }
245 return count;
246}
247
248CONFIGFS_ATTR(nvmet_, param_inline_data_size);
249
221static ssize_t nvmet_addr_trtype_show(struct config_item *item, 250static ssize_t nvmet_addr_trtype_show(struct config_item *item,
222 char *page) 251 char *page)
223{ 252{
@@ -387,6 +416,39 @@ out_unlock:
387 416
388CONFIGFS_ATTR(nvmet_ns_, device_nguid); 417CONFIGFS_ATTR(nvmet_ns_, device_nguid);
389 418
419static ssize_t nvmet_ns_ana_grpid_show(struct config_item *item, char *page)
420{
421 return sprintf(page, "%u\n", to_nvmet_ns(item)->anagrpid);
422}
423
424static ssize_t nvmet_ns_ana_grpid_store(struct config_item *item,
425 const char *page, size_t count)
426{
427 struct nvmet_ns *ns = to_nvmet_ns(item);
428 u32 oldgrpid, newgrpid;
429 int ret;
430
431 ret = kstrtou32(page, 0, &newgrpid);
432 if (ret)
433 return ret;
434
435 if (newgrpid < 1 || newgrpid > NVMET_MAX_ANAGRPS)
436 return -EINVAL;
437
438 down_write(&nvmet_ana_sem);
439 oldgrpid = ns->anagrpid;
440 nvmet_ana_group_enabled[newgrpid]++;
441 ns->anagrpid = newgrpid;
442 nvmet_ana_group_enabled[oldgrpid]--;
443 nvmet_ana_chgcnt++;
444 up_write(&nvmet_ana_sem);
445
446 nvmet_send_ana_event(ns->subsys, NULL);
447 return count;
448}
449
450CONFIGFS_ATTR(nvmet_ns_, ana_grpid);
451
390static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) 452static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
391{ 453{
392 return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled); 454 return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
@@ -412,11 +474,41 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item,
412 474
413CONFIGFS_ATTR(nvmet_ns_, enable); 475CONFIGFS_ATTR(nvmet_ns_, enable);
414 476
477static ssize_t nvmet_ns_buffered_io_show(struct config_item *item, char *page)
478{
479 return sprintf(page, "%d\n", to_nvmet_ns(item)->buffered_io);
480}
481
482static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
483 const char *page, size_t count)
484{
485 struct nvmet_ns *ns = to_nvmet_ns(item);
486 bool val;
487
488 if (strtobool(page, &val))
489 return -EINVAL;
490
491 mutex_lock(&ns->subsys->lock);
492 if (ns->enabled) {
493 pr_err("disable ns before setting buffered_io value.\n");
494 mutex_unlock(&ns->subsys->lock);
495 return -EINVAL;
496 }
497
498 ns->buffered_io = val;
499 mutex_unlock(&ns->subsys->lock);
500 return count;
501}
502
+CONFIGFS_ATTR(nvmet_ns_, buffered_io);
+
 static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_device_path,
 	&nvmet_ns_attr_device_nguid,
 	&nvmet_ns_attr_device_uuid,
+	&nvmet_ns_attr_ana_grpid,
 	&nvmet_ns_attr_enable,
+	&nvmet_ns_attr_buffered_io,
 	NULL,
 };
 
@@ -863,6 +955,134 @@ static const struct config_item_type nvmet_referrals_type = {
 	.ct_group_ops	= &nvmet_referral_group_ops,
 };
 
+static struct {
+	enum nvme_ana_state	state;
+	const char		*name;
+} nvmet_ana_state_names[] = {
+	{ NVME_ANA_OPTIMIZED,		"optimized" },
+	{ NVME_ANA_NONOPTIMIZED,	"non-optimized" },
+	{ NVME_ANA_INACCESSIBLE,	"inaccessible" },
+	{ NVME_ANA_PERSISTENT_LOSS,	"persistent-loss" },
+	{ NVME_ANA_CHANGE,		"change" },
+};
+
+static ssize_t nvmet_ana_group_ana_state_show(struct config_item *item,
+		char *page)
+{
+	struct nvmet_ana_group *grp = to_ana_group(item);
+	enum nvme_ana_state state = grp->port->ana_state[grp->grpid];
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+		if (state != nvmet_ana_state_names[i].state)
+			continue;
+		return sprintf(page, "%s\n", nvmet_ana_state_names[i].name);
+	}
+
+	return sprintf(page, "\n");
+}
+
+static ssize_t nvmet_ana_group_ana_state_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_ana_group *grp = to_ana_group(item);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nvmet_ana_state_names); i++) {
+		if (sysfs_streq(page, nvmet_ana_state_names[i].name))
+			goto found;
+	}
+
+	pr_err("Invalid value '%s' for ana_state\n", page);
+	return -EINVAL;
+
+found:
+	down_write(&nvmet_ana_sem);
+	grp->port->ana_state[grp->grpid] = nvmet_ana_state_names[i].state;
+	nvmet_ana_chgcnt++;
+	up_write(&nvmet_ana_sem);
+
+	nvmet_port_send_ana_event(grp->port);
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_ana_group_, ana_state);
+
+static struct configfs_attribute *nvmet_ana_group_attrs[] = {
+	&nvmet_ana_group_attr_ana_state,
+	NULL,
+};
+
+static void nvmet_ana_group_release(struct config_item *item)
+{
+	struct nvmet_ana_group *grp = to_ana_group(item);
+
+	if (grp == &grp->port->ana_default_group)
+		return;
+
+	down_write(&nvmet_ana_sem);
+	grp->port->ana_state[grp->grpid] = NVME_ANA_INACCESSIBLE;
+	nvmet_ana_group_enabled[grp->grpid]--;
+	up_write(&nvmet_ana_sem);
+
+	nvmet_port_send_ana_event(grp->port);
+	kfree(grp);
+}
+
+static struct configfs_item_operations nvmet_ana_group_item_ops = {
+	.release		= nvmet_ana_group_release,
+};
+
+static const struct config_item_type nvmet_ana_group_type = {
+	.ct_item_ops		= &nvmet_ana_group_item_ops,
+	.ct_attrs		= nvmet_ana_group_attrs,
+	.ct_owner		= THIS_MODULE,
+};
+
+static struct config_group *nvmet_ana_groups_make_group(
+		struct config_group *group, const char *name)
+{
+	struct nvmet_port *port = ana_groups_to_port(&group->cg_item);
+	struct nvmet_ana_group *grp;
+	u32 grpid;
+	int ret;
+
+	ret = kstrtou32(name, 0, &grpid);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	if (grpid <= 1 || grpid > NVMET_MAX_ANAGRPS)
+		goto out;
+
+	ret = -ENOMEM;
+	grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+	if (!grp)
+		goto out;
+	grp->port = port;
+	grp->grpid = grpid;
+
+	down_write(&nvmet_ana_sem);
+	nvmet_ana_group_enabled[grpid]++;
+	up_write(&nvmet_ana_sem);
+
+	nvmet_port_send_ana_event(grp->port);
+
+	config_group_init_type_name(&grp->group, name, &nvmet_ana_group_type);
+	return &grp->group;
+out:
+	return ERR_PTR(ret);
+}
+
+static struct configfs_group_operations nvmet_ana_groups_group_ops = {
+	.make_group		= nvmet_ana_groups_make_group,
+};
+
+static const struct config_item_type nvmet_ana_groups_type = {
+	.ct_group_ops		= &nvmet_ana_groups_group_ops,
+	.ct_owner		= THIS_MODULE,
+};
+
 /*
  * Ports definitions.
  */
@@ -870,6 +1090,7 @@ static void nvmet_port_release(struct config_item *item)
 {
 	struct nvmet_port *port = to_nvmet_port(item);
 
+	kfree(port->ana_state);
 	kfree(port);
 }
 
@@ -879,6 +1100,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
 	&nvmet_attr_addr_traddr,
 	&nvmet_attr_addr_trsvcid,
 	&nvmet_attr_addr_trtype,
+	&nvmet_attr_param_inline_data_size,
 	NULL,
 };
 
@@ -897,6 +1119,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
 {
 	struct nvmet_port *port;
 	u16 portid;
+	u32 i;
 
 	if (kstrtou16(name, 0, &portid))
 		return ERR_PTR(-EINVAL);
@@ -905,9 +1128,24 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
 	if (!port)
 		return ERR_PTR(-ENOMEM);
 
+	port->ana_state = kcalloc(NVMET_MAX_ANAGRPS + 1,
+			sizeof(*port->ana_state), GFP_KERNEL);
+	if (!port->ana_state) {
+		kfree(port);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for (i = 1; i <= NVMET_MAX_ANAGRPS; i++) {
+		if (i == NVMET_DEFAULT_ANA_GRPID)
+			port->ana_state[1] = NVME_ANA_OPTIMIZED;
+		else
+			port->ana_state[i] = NVME_ANA_INACCESSIBLE;
+	}
+
 	INIT_LIST_HEAD(&port->entry);
 	INIT_LIST_HEAD(&port->subsystems);
 	INIT_LIST_HEAD(&port->referrals);
+	port->inline_data_size = -1;	/* < 0 == let the transport choose */
 
 	port->disc_addr.portid = cpu_to_le16(portid);
 	config_group_init_type_name(&port->group, name, &nvmet_port_type);
@@ -920,6 +1158,18 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
 			"referrals", &nvmet_referrals_type);
 	configfs_add_default_group(&port->referrals_group, &port->group);
 
+	config_group_init_type_name(&port->ana_groups_group,
+			"ana_groups", &nvmet_ana_groups_type);
+	configfs_add_default_group(&port->ana_groups_group, &port->group);
+
+	port->ana_default_group.port = port;
+	port->ana_default_group.grpid = NVMET_DEFAULT_ANA_GRPID;
+	config_group_init_type_name(&port->ana_default_group.group,
+			__stringify(NVMET_DEFAULT_ANA_GRPID),
+			&nvmet_ana_group_type);
+	configfs_add_default_group(&port->ana_default_group.group,
+			&port->ana_groups_group);
+
 	return &port->group;
 }
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 9838103f2d62..ebf3e7a6c49e 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -18,6 +18,7 @@
 
 #include "nvmet.h"
 
+struct workqueue_struct *buffered_io_wq;
 static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
 static DEFINE_IDA(cntlid_ida);
 
@@ -39,6 +40,10 @@ static DEFINE_IDA(cntlid_ida);
  */
 DECLARE_RWSEM(nvmet_config_sem);
 
+u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+u64 nvmet_ana_chgcnt;
+DECLARE_RWSEM(nvmet_ana_sem);
+
 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 		const char *subsysnqn);
 
@@ -175,7 +180,7 @@ out_unlock:
 	mutex_unlock(&ctrl->lock);
 }
 
-static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
+void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 {
 	struct nvmet_ctrl *ctrl;
 
@@ -189,6 +194,33 @@ static void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 	}
 }
 
+void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+		struct nvmet_port *port)
+{
+	struct nvmet_ctrl *ctrl;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (port && ctrl->port != port)
+			continue;
+		if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE))
+			continue;
+		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
+	}
+	mutex_unlock(&subsys->lock);
+}
+
+void nvmet_port_send_ana_event(struct nvmet_port *port)
+{
+	struct nvmet_subsys_link *p;
+
+	down_read(&nvmet_config_sem);
+	list_for_each_entry(p, &port->subsystems, entry)
+		nvmet_send_ana_event(p->subsys, port);
+	up_read(&nvmet_config_sem);
+}
+
 int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
 {
 	int ret = 0;
@@ -241,6 +273,10 @@ int nvmet_enable_port(struct nvmet_port *port)
 		return ret;
 	}
 
+	/* If the transport didn't set inline_data_size, then disable it. */
+	if (port->inline_data_size < 0)
+		port->inline_data_size = 0;
+
 	port->enabled = true;
 	return 0;
 }
@@ -332,9 +368,13 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
 int nvmet_ns_enable(struct nvmet_ns *ns)
 {
 	struct nvmet_subsys *subsys = ns->subsys;
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&subsys->lock);
+	ret = -EMFILE;
+	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
+		goto out_unlock;
+	ret = 0;
 	if (ns->enabled)
 		goto out_unlock;
 
@@ -369,6 +409,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
 
 		list_add_tail_rcu(&ns->dev_link, &old->dev_link);
 	}
+	subsys->nr_namespaces++;
 
 	nvmet_ns_changed(subsys, ns->nsid);
 	ns->enabled = true;
@@ -409,6 +450,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
 	percpu_ref_exit(&ns->ref);
 
 	mutex_lock(&subsys->lock);
+	subsys->nr_namespaces--;
 	nvmet_ns_changed(subsys, ns->nsid);
 	nvmet_ns_dev_disable(ns);
 out_unlock:
@@ -419,6 +461,10 @@ void nvmet_ns_free(struct nvmet_ns *ns)
 {
 	nvmet_ns_disable(ns);
 
+	down_write(&nvmet_ana_sem);
+	nvmet_ana_group_enabled[ns->anagrpid]--;
+	up_write(&nvmet_ana_sem);
+
 	kfree(ns->device_path);
 	kfree(ns);
 }
@@ -436,7 +482,14 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 
 	ns->nsid = nsid;
 	ns->subsys = subsys;
+
+	down_write(&nvmet_ana_sem);
+	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
+	nvmet_ana_group_enabled[ns->anagrpid]++;
+	up_write(&nvmet_ana_sem);
+
 	uuid_gen(&ns->uuid);
+	ns->buffered_io = false;
 
 	return ns;
 }
@@ -542,6 +595,35 @@ int nvmet_sq_init(struct nvmet_sq *sq)
 }
 EXPORT_SYMBOL_GPL(nvmet_sq_init);
 
+static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
+		struct nvmet_ns *ns)
+{
+	enum nvme_ana_state state = port->ana_state[ns->anagrpid];
+
+	if (unlikely(state == NVME_ANA_INACCESSIBLE))
+		return NVME_SC_ANA_INACCESSIBLE;
+	if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
+		return NVME_SC_ANA_PERSISTENT_LOSS;
+	if (unlikely(state == NVME_ANA_CHANGE))
+		return NVME_SC_ANA_TRANSITION;
+	return 0;
+}
+
+static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
+{
+	if (unlikely(req->ns->readonly)) {
+		switch (req->cmd->common.opcode) {
+		case nvme_cmd_read:
+		case nvme_cmd_flush:
+			break;
+		default:
+			return NVME_SC_NS_WRITE_PROTECTED;
+		}
+	}
+
+	return 0;
+}
+
 static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
@@ -554,6 +636,12 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
 	if (unlikely(!req->ns))
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
+	ret = nvmet_check_ana_state(req->port, req->ns);
+	if (unlikely(ret))
+		return ret;
+	ret = nvmet_io_cmd_check_access(req);
+	if (unlikely(ret))
+		return ret;
 
 	if (req->ns->file)
 		return nvmet_file_parse_io_cmd(req);
@@ -870,6 +958,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 
 	nvmet_init_cap(ctrl);
 
+	ctrl->port = req->port;
+
 	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
 	INIT_LIST_HEAD(&ctrl->async_events);
 
@@ -1109,6 +1199,15 @@ static int __init nvmet_init(void)
 {
 	int error;
 
+	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
+
+	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
+			WQ_MEM_RECLAIM, 0);
+	if (!buffered_io_wq) {
+		error = -ENOMEM;
+		goto out;
+	}
+
 	error = nvmet_init_discovery();
 	if (error)
 		goto out;
@@ -1129,6 +1228,7 @@ static void __exit nvmet_exit(void)
 	nvmet_exit_configfs();
 	nvmet_exit_discovery();
 	ida_destroy(&cntlid_ida);
+	destroy_workqueue(buffered_io_wq);
 
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
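nvmet_check_ana_state() above is a pure decision table: three of the five ANA states fail I/O with a distinct status, while optimized and non-optimized both pass. A standalone sketch of that table with made-up status values (the real NVME_SC_ANA_* constants come from include/linux/nvme.h):

#include <assert.h>

enum ana_state {
	ANA_OPTIMIZED = 1,
	ANA_NONOPTIMIZED,
	ANA_INACCESSIBLE,
	ANA_PERSISTENT_LOSS,
	ANA_CHANGE,
};

/* Illustrative status codes only; not the kernel's NVME_SC_* values. */
#define SC_OK			0x000
#define SC_PERSISTENT_LOSS	0x301
#define SC_INACCESSIBLE		0x302
#define SC_TRANSITION		0x303

static unsigned short check_ana_state(enum ana_state state)
{
	if (state == ANA_INACCESSIBLE)
		return SC_INACCESSIBLE;
	if (state == ANA_PERSISTENT_LOSS)
		return SC_PERSISTENT_LOSS;
	if (state == ANA_CHANGE)
		return SC_TRANSITION;
	return SC_OK;	/* optimized and non-optimized both allow I/O */
}

int main(void)
{
	assert(check_ana_state(ANA_OPTIMIZED) == SC_OK);
	assert(check_ana_state(ANA_NONOPTIMIZED) == SC_OK);
	assert(check_ana_state(ANA_CHANGE) == SC_TRANSITION);
	return 0;
}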
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 08656b849bd6..eae29f493a07 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -171,7 +171,7 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
 	id->sgls = cpu_to_le32(1 << 0);	/* we always support SGLs */
 	if (ctrl->ops->has_keyed_sgls)
 		id->sgls |= cpu_to_le32(1 << 2);
-	if (ctrl->ops->sqe_inline_size)
+	if (req->port->inline_data_size)
 		id->sgls |= cpu_to_le32(1 << 20);
 
 	strcpy(id->subnqn, ctrl->subsys->subsysnqn);
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index e0b0f7df70c2..7bc9f6240432 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -124,6 +124,13 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
 	submit_bio(bio);
 }
 
+u16 nvmet_bdev_flush(struct nvmet_req *req)
+{
+	if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL, NULL))
+		return NVME_SC_INTERNAL | NVME_SC_DNR;
+	return 0;
+}
+
 static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns,
 		struct nvme_dsm_range *range, struct bio **bio)
 {
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 8c42b3a8c420..81a9dc5290a8 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -16,6 +16,8 @@
 void nvmet_file_ns_disable(struct nvmet_ns *ns)
 {
 	if (ns->file) {
+		if (ns->buffered_io)
+			flush_workqueue(buffered_io_wq);
 		mempool_destroy(ns->bvec_pool);
 		ns->bvec_pool = NULL;
 		kmem_cache_destroy(ns->bvec_cache);
@@ -27,11 +29,14 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns)
 
 int nvmet_file_ns_enable(struct nvmet_ns *ns)
 {
-	int ret;
+	int flags = O_RDWR | O_LARGEFILE;
 	struct kstat stat;
+	int ret;
+
+	if (!ns->buffered_io)
+		flags |= O_DIRECT;
 
-	ns->file = filp_open(ns->device_path,
-			O_RDWR | O_LARGEFILE | O_DIRECT, 0);
+	ns->file = filp_open(ns->device_path, flags, 0);
 	if (IS_ERR(ns->file)) {
 		pr_err("failed to open file %s: (%ld)\n",
 				ns->device_path, PTR_ERR(ns->file));
@@ -100,7 +105,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
 
 	iocb->ki_pos = pos;
 	iocb->ki_filp = req->ns->file;
-	iocb->ki_flags = IOCB_DIRECT | ki_flags;
+	iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
 
 	ret = call_iter(iocb, &iter);
 
@@ -140,6 +145,12 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 		return;
 	}
 
+	pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
+	if (unlikely(pos + req->data_len > req->ns->size)) {
+		nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+		return;
+	}
+
 	if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
 		req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 				GFP_KERNEL);
@@ -155,8 +166,6 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 		is_sync = true;
 	}
 
-	pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
-
 	memset(&req->f.iocb, 0, sizeof(struct kiocb));
 	for_each_sg_page(req->sg, &sg_pg_iter, req->sg_cnt, 0) {
 		nvmet_file_init_bvec(&req->f.bvec[bv_cnt], &sg_pg_iter);
@@ -189,14 +198,31 @@ out:
 	nvmet_file_submit_bvec(req, pos, bv_cnt, total_len);
 }
 
-static void nvmet_file_flush_work(struct work_struct *w)
+static void nvmet_file_buffered_io_work(struct work_struct *w)
 {
 	struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
-	int ret;
 
-	ret = vfs_fsync(req->ns->file, 1);
+	nvmet_file_execute_rw(req);
+}
 
-	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req)
+{
+	INIT_WORK(&req->f.work, nvmet_file_buffered_io_work);
+	queue_work(buffered_io_wq, &req->f.work);
+}
+
+u16 nvmet_file_flush(struct nvmet_req *req)
+{
+	if (vfs_fsync(req->ns->file, 1) < 0)
+		return NVME_SC_INTERNAL | NVME_SC_DNR;
+	return 0;
+}
+
+static void nvmet_file_flush_work(struct work_struct *w)
+{
+	struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
+
+	nvmet_req_complete(req, nvmet_file_flush(req));
 }
 
 static void nvmet_file_execute_flush(struct nvmet_req *req)
@@ -209,22 +235,30 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
 {
 	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
 	struct nvme_dsm_range range;
-	loff_t offset;
-	loff_t len;
-	int i, ret;
+	loff_t offset, len;
+	u16 ret;
+	int i;
 
 	for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
-		if (nvmet_copy_from_sgl(req, i * sizeof(range), &range,
-					sizeof(range)))
+		ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+					sizeof(range));
+		if (ret)
 			break;
+
 		offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
 		len = le32_to_cpu(range.nlb) << req->ns->blksize_shift;
-		ret = vfs_fallocate(req->ns->file, mode, offset, len);
-		if (ret)
+		if (offset + len > req->ns->size) {
+			ret = NVME_SC_LBA_RANGE | NVME_SC_DNR;
 			break;
+		}
+
+		if (vfs_fallocate(req->ns->file, mode, offset, len)) {
+			ret = NVME_SC_INTERNAL | NVME_SC_DNR;
+			break;
+		}
 	}
 
-	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+	nvmet_req_complete(req, ret);
 }
 
 static void nvmet_file_dsm_work(struct work_struct *w)
@@ -263,6 +297,11 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w)
 	len = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
 			req->ns->blksize_shift);
 
+	if (unlikely(offset + len > req->ns->size)) {
+		nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+		return;
+	}
+
 	ret = vfs_fallocate(req->ns->file, mode, offset, len);
 	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
 }
@@ -280,7 +319,10 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
 	switch (cmd->common.opcode) {
 	case nvme_cmd_read:
 	case nvme_cmd_write:
-		req->execute = nvmet_file_execute_rw;
+		if (req->ns->buffered_io)
+			req->execute = nvmet_file_execute_rw_buffered_io;
+		else
+			req->execute = nvmet_file_execute_rw;
 		req->data_len = nvmet_rw_len(req);
 		return 0;
 	case nvme_cmd_flush:
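Each of the new bounds checks in this file uses the same arithmetic: shift the starting LBA by blksize_shift to get a byte offset, then reject the request when offset plus length overruns the backing file. A small self-contained check of that math, assuming 512-byte blocks and a 1 MiB namespace as sample values:

#include <stdbool.h>
#include <stdio.h>

/* Reject requests that would run past the end of the backing file. */
static bool range_ok(unsigned long long slba, unsigned long long data_len,
		     unsigned int blksize_shift, unsigned long long ns_size)
{
	unsigned long long pos = slba << blksize_shift;

	return pos + data_len <= ns_size;
}

int main(void)
{
	/* 1 MiB namespace with 512-byte blocks (blksize_shift == 9). */
	printf("%d\n", range_ok(0, 4096, 9, 1 << 20));	  /* 1: fits */
	printf("%d\n", range_ok(2047, 1024, 9, 1 << 20)); /* 0: overruns by 512 */
	return 0;
}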
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index ae7586b8be07..9908082b32c4 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -227,6 +227,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set,
 {
 	struct nvme_loop_ctrl *ctrl = set->driver_data;
 
+	nvme_req(req)->ctrl = &ctrl->ctrl;
 	return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req),
 			(set == &ctrl->tag_set) ? hctx_idx + 1 : 0);
 }
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 480dfe10fad9..ec9af4ee03b6 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -30,12 +30,11 @@
 #define NVMET_ASYNC_EVENTS		4
 #define NVMET_ERROR_LOG_SLOTS		128
 
-
 /*
  * Supported optional AENs:
  */
 #define NVMET_AEN_CFG_OPTIONAL \
-	NVME_AEN_CFG_NS_ATTR
+	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE)
 
 /*
  * Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
@@ -59,12 +58,15 @@ struct nvmet_ns {
 	struct percpu_ref	ref;
 	struct block_device	*bdev;
 	struct file		*file;
+	bool			readonly;
 	u32			nsid;
 	u32			blksize_shift;
 	loff_t			size;
 	u8			nguid[16];
 	uuid_t			uuid;
+	u32			anagrpid;
 
+	bool			buffered_io;
 	bool			enabled;
 	struct nvmet_subsys	*subsys;
 	const char		*device_path;
@@ -97,6 +99,18 @@ struct nvmet_sq {
 	struct completion	confirm_done;
 };
 
+struct nvmet_ana_group {
+	struct config_group	group;
+	struct nvmet_port	*port;
+	u32			grpid;
+};
+
+static inline struct nvmet_ana_group *to_ana_group(struct config_item *item)
+{
+	return container_of(to_config_group(item), struct nvmet_ana_group,
+			group);
+}
+
 /**
  * struct nvmet_port -	Common structure to keep port
  *				information for the target.
@@ -114,8 +128,12 @@ struct nvmet_port {
 	struct list_head		subsystems;
 	struct config_group		referrals_group;
 	struct list_head		referrals;
+	struct config_group		ana_groups_group;
+	struct nvmet_ana_group		ana_default_group;
+	enum nvme_ana_state		*ana_state;
 	void				*priv;
 	bool				enabled;
+	int				inline_data_size;
 };
 
 static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
@@ -124,6 +142,13 @@ static inline struct nvmet_port *to_nvmet_port(struct config_item *item)
 			group);
 }
 
+static inline struct nvmet_port *ana_groups_to_port(
+		struct config_item *item)
+{
+	return container_of(to_config_group(item), struct nvmet_port,
+			ana_groups_group);
+}
+
 struct nvmet_ctrl {
 	struct nvmet_subsys	*subsys;
 	struct nvmet_cq		**cqs;
@@ -138,6 +163,8 @@ struct nvmet_ctrl {
 	u16			cntlid;
 	u32			kato;
 
+	struct nvmet_port	*port;
+
 	u32			aen_enabled;
 	unsigned long		aen_masked;
 	struct nvmet_req	*async_event_cmds[NVMET_ASYNC_EVENTS];
@@ -166,6 +193,7 @@ struct nvmet_subsys {
 	struct kref		ref;
 
 	struct list_head	namespaces;
+	unsigned int		nr_namespaces;
 	unsigned int		max_nsid;
 
 	struct list_head	ctrls;
@@ -225,7 +253,6 @@ struct nvmet_req;
 struct nvmet_fabrics_ops {
 	struct module *owner;
 	unsigned int type;
-	unsigned int sqe_inline_size;
 	unsigned int msdbd;
 	bool has_keyed_sgls : 1;
 	void (*queue_response)(struct nvmet_req *req);
@@ -269,6 +296,8 @@ struct nvmet_req {
 	const struct nvmet_fabrics_ops *ops;
 };
 
+extern struct workqueue_struct *buffered_io_wq;
+
 static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
 {
 	req->rsp->status = cpu_to_le16(status << 1);
@@ -337,6 +366,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns);
 struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid);
 void nvmet_ns_free(struct nvmet_ns *ns);
 
+void nvmet_send_ana_event(struct nvmet_subsys *subsys,
+		struct nvmet_port *port);
+void nvmet_port_send_ana_event(struct nvmet_port *port);
+
 int nvmet_register_transport(const struct nvmet_fabrics_ops *ops);
 void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops);
 
@@ -357,6 +390,22 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd);
 #define NVMET_QUEUE_SIZE	1024
 #define NVMET_NR_QUEUES		128
 #define NVMET_MAX_CMD		NVMET_QUEUE_SIZE
+
+/*
+ * Nice round number that makes a list of nsids fit into a page.
+ * Should become tunable at some point in the future.
+ */
+#define NVMET_MAX_NAMESPACES	1024
+
+/*
+ * 0 is not a valid ANA group ID, so we start numbering at 1.
+ *
+ * ANA Group 1 exists without manual intervention, has namespaces assigned to it
+ * by default, and is available in an optimized state through all ports.
+ */
+#define NVMET_MAX_ANAGRPS	128
+#define NVMET_DEFAULT_ANA_GRPID	1
+
 #define NVMET_KAS		10
 #define NVMET_DISC_KATO		120
 
@@ -370,6 +419,10 @@ extern struct nvmet_subsys *nvmet_disc_subsys;
 extern u64 nvmet_genctr;
 extern struct rw_semaphore nvmet_config_sem;
 
+extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
+extern u64 nvmet_ana_chgcnt;
+extern struct rw_semaphore nvmet_ana_sem;
+
 bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
 		const char *hostnqn);
 
@@ -377,6 +430,9 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns);
 int nvmet_file_ns_enable(struct nvmet_ns *ns);
 void nvmet_bdev_ns_disable(struct nvmet_ns *ns);
 void nvmet_file_ns_disable(struct nvmet_ns *ns);
+u16 nvmet_bdev_flush(struct nvmet_req *req);
+u16 nvmet_file_flush(struct nvmet_req *req);
+void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid);
 
 static inline u32 nvmet_rw_len(struct nvmet_req *req)
 {
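Because ANA group IDs start at 1 and 0 is never valid, every array indexed by group ID (nvmet_ana_group_enabled here, and the per-port ana_state allocated in configfs.c) is sized NVMET_MAX_ANAGRPS + 1 so the ID can be used as a direct index. A userspace sketch of that layout and the default initialization, with stand-in constants:

#include <stdio.h>
#include <stdlib.h>

#define MAX_ANAGRPS		128	/* stand-in for NVMET_MAX_ANAGRPS */
#define DEFAULT_ANA_GRPID	1	/* stand-in for NVMET_DEFAULT_ANA_GRPID */

enum ana_state { ANA_OPTIMIZED = 1, ANA_INACCESSIBLE = 3 };

int main(void)
{
	/* Slot 0 stays unused; slots 1..MAX_ANAGRPS are indexed by group ID. */
	enum ana_state *ana_state = calloc(MAX_ANAGRPS + 1, sizeof(*ana_state));
	unsigned int i;

	if (!ana_state)
		return 1;

	/* Group 1 comes up optimized; every other group starts inaccessible. */
	for (i = 1; i <= MAX_ANAGRPS; i++)
		ana_state[i] = (i == DEFAULT_ANA_GRPID) ?
			ANA_OPTIMIZED : ANA_INACCESSIBLE;

	printf("grp 1: %d, grp 2: %d\n", ana_state[1], ana_state[2]); /* 1, 3 */
	free(ana_state);
	return 0;
}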
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 52e0c5d579a7..e7f43d1e1779 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -33,16 +33,17 @@
 #include "nvmet.h"
 
 /*
- * We allow up to a page of inline data to go with the SQE
+ * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
-#define NVMET_RDMA_INLINE_DATA_SIZE	PAGE_SIZE
+#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
+#define NVMET_RDMA_MAX_INLINE_SGE		4
+#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)
 
 struct nvmet_rdma_cmd {
-	struct ib_sge sge[2];
+	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
 	struct ib_cqe cqe;
 	struct ib_recv_wr wr;
-	struct scatterlist inline_sg;
-	struct page *inline_page;
+	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
 	struct nvme_command *nvme_cmd;
 	struct nvmet_rdma_queue *queue;
 };
@@ -116,6 +117,8 @@ struct nvmet_rdma_device {
 	size_t			srq_size;
 	struct kref		ref;
 	struct list_head	entry;
+	int			inline_data_size;
+	int			inline_page_count;
 };
 
 static bool nvmet_rdma_use_srq;
@@ -138,6 +141,11 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
 
 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
 
+static int num_pages(int len)
+{
+	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
+}
+
 /* XXX: really should move to a generic header sooner or later.. */
 static inline u32 get_unaligned_le24(const u8 *p)
 {
@@ -184,6 +192,71 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
 	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
 }
 
+static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
+		struct nvmet_rdma_cmd *c)
+{
+	struct scatterlist *sg;
+	struct ib_sge *sge;
+	int i;
+
+	if (!ndev->inline_data_size)
+		return;
+
+	sg = c->inline_sg;
+	sge = &c->sge[1];
+
+	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+		if (sge->length)
+			ib_dma_unmap_page(ndev->device, sge->addr,
+					sge->length, DMA_FROM_DEVICE);
+		if (sg_page(sg))
+			__free_page(sg_page(sg));
+	}
+}
+
+static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
+		struct nvmet_rdma_cmd *c)
+{
+	struct scatterlist *sg;
+	struct ib_sge *sge;
+	struct page *pg;
+	int len;
+	int i;
+
+	if (!ndev->inline_data_size)
+		return 0;
+
+	sg = c->inline_sg;
+	sg_init_table(sg, ndev->inline_page_count);
+	sge = &c->sge[1];
+	len = ndev->inline_data_size;
+
+	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
+		pg = alloc_page(GFP_KERNEL);
+		if (!pg)
+			goto out_err;
+		sg_assign_page(sg, pg);
+		sge->addr = ib_dma_map_page(ndev->device,
+			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(ndev->device, sge->addr))
+			goto out_err;
+		sge->length = min_t(int, len, PAGE_SIZE);
+		sge->lkey = ndev->pd->local_dma_lkey;
+		len -= sge->length;
+	}
+
+	return 0;
+out_err:
+	for (; i >= 0; i--, sg--, sge--) {
+		if (sge->length)
+			ib_dma_unmap_page(ndev->device, sge->addr,
+					sge->length, DMA_FROM_DEVICE);
+		if (sg_page(sg))
+			__free_page(sg_page(sg));
+	}
+	return -ENOMEM;
+}
+
 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
 		struct nvmet_rdma_cmd *c, bool admin)
 {
@@ -200,33 +273,17 @@ static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
 	c->sge[0].length = sizeof(*c->nvme_cmd);
 	c->sge[0].lkey = ndev->pd->local_dma_lkey;
 
-	if (!admin) {
-		c->inline_page = alloc_pages(GFP_KERNEL,
-				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-		if (!c->inline_page)
-			goto out_unmap_cmd;
-		c->sge[1].addr = ib_dma_map_page(ndev->device,
-				c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
-				DMA_FROM_DEVICE);
-		if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
-			goto out_free_inline_page;
-		c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
-		c->sge[1].lkey = ndev->pd->local_dma_lkey;
-	}
+	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
+		goto out_unmap_cmd;
 
 	c->cqe.done = nvmet_rdma_recv_done;
 
 	c->wr.wr_cqe = &c->cqe;
 	c->wr.sg_list = c->sge;
-	c->wr.num_sge = admin ? 1 : 2;
+	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
 
 	return 0;
 
-out_free_inline_page:
-	if (!admin) {
-		__free_pages(c->inline_page,
-				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-	}
 out_unmap_cmd:
 	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
 			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
@@ -240,12 +297,8 @@ out:
 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
 		struct nvmet_rdma_cmd *c, bool admin)
 {
-	if (!admin) {
-		ib_dma_unmap_page(ndev->device, c->sge[1].addr,
-				NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
-		__free_pages(c->inline_page,
-				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
-	}
+	if (!admin)
+		nvmet_rdma_free_inline_pages(ndev, c);
 	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
 			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
 	kfree(c->nvme_cmd);
@@ -383,14 +436,21 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
 		struct nvmet_rdma_cmd *cmd)
 {
 	struct ib_recv_wr *bad_wr;
+	int ret;
 
 	ib_dma_sync_single_for_device(ndev->device,
 		cmd->sge[0].addr, cmd->sge[0].length,
 		DMA_FROM_DEVICE);
 
 	if (ndev->srq)
-		return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
-	return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
+		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
+	else
+		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
+
+	if (unlikely(ret))
+		pr_err("post_recv cmd failed\n");
+
+	return ret;
 }
 
 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
@@ -429,7 +489,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
 				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
 	}
 
-	if (rsp->req.sg != &rsp->cmd->inline_sg)
+	if (rsp->req.sg != rsp->cmd->inline_sg)
 		sgl_free(rsp->req.sg);
 
 	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
@@ -493,7 +553,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
 		rsp->send_sge.addr, rsp->send_sge.length,
 		DMA_TO_DEVICE);
 
-	if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
+	if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) {
 		pr_err("sending cmd response failed\n");
 		nvmet_rdma_release_rsp(rsp);
 	}
@@ -529,10 +589,25 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
 		u64 off)
 {
-	sg_init_table(&rsp->cmd->inline_sg, 1);
-	sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
-	rsp->req.sg = &rsp->cmd->inline_sg;
-	rsp->req.sg_cnt = 1;
+	int sg_count = num_pages(len);
+	struct scatterlist *sg;
+	int i;
+
+	sg = rsp->cmd->inline_sg;
+	for (i = 0; i < sg_count; i++, sg++) {
+		if (i < sg_count - 1)
+			sg_unmark_end(sg);
+		else
+			sg_mark_end(sg);
+		sg->offset = off;
+		sg->length = min_t(int, len, PAGE_SIZE - off);
+		len -= sg->length;
+		if (!i)
+			off = 0;
+	}
+
+	rsp->req.sg = rsp->cmd->inline_sg;
+	rsp->req.sg_cnt = sg_count;
 }
 
 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
@@ -544,7 +619,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
 	if (!nvme_is_write(rsp->req.cmd))
 		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 
-	if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
+	if (off + len > rsp->queue->dev->inline_data_size) {
 		pr_err("invalid inline data offset!\n");
 		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
 	}
@@ -743,7 +818,7 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
 	srq_size = 4095;	/* XXX: tune */
 
 	srq_attr.attr.max_wr = srq_size;
-	srq_attr.attr.max_sge = 2;
+	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
 	srq_attr.attr.srq_limit = 0;
 	srq_attr.srq_type = IB_SRQT_BASIC;
 	srq = ib_create_srq(ndev->pd, &srq_attr);
@@ -765,11 +840,16 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
 	ndev->srq = srq;
 	ndev->srq_size = srq_size;
 
-	for (i = 0; i < srq_size; i++)
-		nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
+	for (i = 0; i < srq_size; i++) {
+		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
+		if (ret)
+			goto out_free_cmds;
+	}
 
 	return 0;
 
+out_free_cmds:
+	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
 out_destroy_srq:
 	ib_destroy_srq(srq);
 	return ret;
@@ -793,7 +873,10 @@ static void nvmet_rdma_free_dev(struct kref *ref)
 static struct nvmet_rdma_device *
 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 {
+	struct nvmet_port *port = cm_id->context;
 	struct nvmet_rdma_device *ndev;
+	int inline_page_count;
+	int inline_sge_count;
 	int ret;
 
 	mutex_lock(&device_list_mutex);
@@ -807,6 +890,18 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 	if (!ndev)
 		goto out_err;
 
+	inline_page_count = num_pages(port->inline_data_size);
+	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
+				cm_id->device->attrs.max_sge) - 1;
+	if (inline_page_count > inline_sge_count) {
+		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
+			port->inline_data_size, cm_id->device->name,
+			inline_sge_count * PAGE_SIZE);
+		port->inline_data_size = inline_sge_count * PAGE_SIZE;
+		inline_page_count = inline_sge_count;
+	}
+	ndev->inline_data_size = port->inline_data_size;
+	ndev->inline_page_count = inline_page_count;
 	ndev->device = cm_id->device;
 	kref_init(&ndev->ref);
 
@@ -881,7 +976,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
 	} else {
 		/* +1 for drain */
 		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
-		qp_attr.cap.max_recv_sge = 2;
+		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
 	}
 
 	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
@@ -899,13 +994,17 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
 	if (!ndev->srq) {
 		for (i = 0; i < queue->recv_queue_size; i++) {
 			queue->cmds[i].queue = queue;
-			nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
+			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
+			if (ret)
+				goto err_destroy_qp;
 		}
 	}
 
 out:
 	return ret;
 
+err_destroy_qp:
+	rdma_destroy_qp(queue->cm_id);
 err_destroy_cq:
 	ib_free_cq(queue->cq);
 	goto out;
@@ -1379,6 +1478,15 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
 		return -EINVAL;
 	}
 
+	if (port->inline_data_size < 0) {
+		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
+	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
+		pr_warn("inline_data_size %u is too large, reducing to %u\n",
+			port->inline_data_size,
+			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
+		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
+	}
+
 	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
 			port->disc_addr.trsvcid, &addr);
 	if (ret) {
@@ -1456,7 +1564,6 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
 	.owner			= THIS_MODULE,
 	.type			= NVMF_TRTYPE_RDMA,
-	.sqe_inline_size	= NVMET_RDMA_INLINE_DATA_SIZE,
 	.msdbd			= 1,
 	.has_keyed_sgls		= 1,
 	.add_port		= nvmet_rdma_add_port,
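The num_pages() helper introduced above relies on PAGE_MASK clearing the sub-page bits, so for any positive len it equals DIV_ROUND_UP(len, PAGE_SIZE). A standalone check of that equivalence, assuming 4 KiB pages:

#include <assert.h>

#define PAGE_SHIFT	12		/* assume 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

int main(void)
{
	int len;

	/* Same result as round-up division for every positive length. */
	for (len = 1; len <= 5 * (int)PAGE_SIZE; len++)
		assert(num_pages(len) ==
		       (len + (int)PAGE_SIZE - 1) / (int)PAGE_SIZE);

	assert(num_pages(16384) == 4);	/* 16 KiB of inline data -> 4 pages */
	return 0;
}

With 4 KiB pages the 16 KiB NVMET_RDMA_MAX_INLINE_DATA_SIZE cap therefore yields 4 pages, matching the NVMET_RDMA_MAX_INLINE_SGE limit of 4 data SGEs.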
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 80aca2456353..768953881c9e 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -21,6 +21,7 @@ CFLAGS_gdth.o = # -DDEBUG_GDTH=2 -D__SERIAL__ -D__COM2__ -DGDTH_STATISTICS
 obj-$(CONFIG_PCMCIA)		+= pcmcia/
 
 obj-$(CONFIG_SCSI)		+= scsi_mod.o
+obj-$(CONFIG_BLK_SCSI_REQUEST)	+= scsi_common.o
 
 obj-$(CONFIG_RAID_ATTRS)	+= raid_class.o
 
@@ -156,7 +157,6 @@ obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/
 obj-$(CONFIG_SCSI_DEBUG)	+= scsi_debug.o
 scsi_mod-y			+= scsi.o hosts.o scsi_ioctl.o \
 				   scsicam.o scsi_error.o scsi_lib.o
-scsi_mod-y			+= scsi_common.o
 scsi_mod-$(CONFIG_SCSI_CONSTANTS) += constants.o
 scsi_mod-$(CONFIG_SCSI_DMA)	+= scsi_lib_dma.o
 scsi_mod-y			+= scsi_scan.o scsi_sysfs.o scsi_devinfo.o
diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c
index e489d89cbb45..379890c4500b 100644
--- a/drivers/scsi/cxlflash/superpipe.c
+++ b/drivers/scsi/cxlflash/superpipe.c
@@ -339,7 +339,6 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli)
 	struct scsi_sense_hdr sshdr;
 	u8 *cmd_buf = NULL;
 	u8 *scsi_cmd = NULL;
-	u8 *sense_buf = NULL;
 	int rc = 0;
 	int result = 0;
 	int retry_cnt = 0;
@@ -348,8 +347,7 @@ static int read_cap16(struct scsi_device *sdev, struct llun_info *lli)
 retry:
 	cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL);
 	scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL);
-	sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
-	if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) {
+	if (unlikely(!cmd_buf || !scsi_cmd)) {
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -364,7 +362,7 @@ retry:
 	/* Drop the ioctl read semahpore across lengthy call */
 	up_read(&cfg->ioctl_rwsem);
 	result = scsi_execute(sdev, scsi_cmd, DMA_FROM_DEVICE, cmd_buf,
-			      CMD_BUFSIZE, sense_buf, &sshdr, to, CMD_RETRIES,
+			      CMD_BUFSIZE, NULL, &sshdr, to, CMD_RETRIES,
 			      0, 0, NULL);
 	down_read(&cfg->ioctl_rwsem);
 	rc = check_state(cfg);
@@ -395,7 +393,6 @@ retry:
 		if (retry_cnt++ < 1) {
 			kfree(cmd_buf);
 			kfree(scsi_cmd);
-			kfree(sense_buf);
 			goto retry;
 		}
 	}
@@ -426,7 +423,6 @@ retry:
 out:
 	kfree(cmd_buf);
 	kfree(scsi_cmd);
-	kfree(sense_buf);
 
 	dev_dbg(dev, "%s: maxlba=%lld blklen=%d rc=%d\n",
 		__func__, gli->max_lba, gli->blk_len, rc);
diff --git a/drivers/scsi/cxlflash/vlun.c b/drivers/scsi/cxlflash/vlun.c
index 66e445a17d6c..2c904bf16b65 100644
--- a/drivers/scsi/cxlflash/vlun.c
+++ b/drivers/scsi/cxlflash/vlun.c
@@ -426,7 +426,6 @@ static int write_same16(struct scsi_device *sdev,
 {
 	u8 *cmd_buf = NULL;
 	u8 *scsi_cmd = NULL;
-	u8 *sense_buf = NULL;
 	int rc = 0;
 	int result = 0;
 	u64 offset = lba;
@@ -440,8 +439,7 @@ static int write_same16(struct scsi_device *sdev,
 
 	cmd_buf = kzalloc(CMD_BUFSIZE, GFP_KERNEL);
 	scsi_cmd = kzalloc(MAX_COMMAND_SIZE, GFP_KERNEL);
-	sense_buf = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
-	if (unlikely(!cmd_buf || !scsi_cmd || !sense_buf)) {
+	if (unlikely(!cmd_buf || !scsi_cmd)) {
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -457,7 +455,7 @@ static int write_same16(struct scsi_device *sdev,
 		/* Drop the ioctl read semahpore across lengthy call */
 		up_read(&cfg->ioctl_rwsem);
 		result = scsi_execute(sdev, scsi_cmd, DMA_TO_DEVICE, cmd_buf,
-				      CMD_BUFSIZE, sense_buf, NULL, to,
+				      CMD_BUFSIZE, NULL, NULL, to,
 				      CMD_RETRIES, 0, 0, NULL);
 		down_read(&cfg->ioctl_rwsem);
 		rc = check_state(cfg);
@@ -482,7 +480,6 @@ static int write_same16(struct scsi_device *sdev,
 out:
 	kfree(cmd_buf);
 	kfree(scsi_cmd);
-	kfree(sense_buf);
 	dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc);
 	return rc;
 }
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index b8d131a455d0..dd738ae5c75b 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -4568,7 +4568,7 @@ _scsih_setup_eedp(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
 		    MPI2_SCSIIO_EEDPFLAGS_CHECK_REFTAG |
 		    MPI2_SCSIIO_EEDPFLAGS_CHECK_GUARD;
 		mpi_request->CDB.EEDP32.PrimaryReferenceTag =
-			cpu_to_be32(scsi_prot_ref_tag(scmd));
+			cpu_to_be32(t10_pi_ref_tag(scmd->request));
 		break;
 
 	case SCSI_PROT_DIF_TYPE3:
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 41e9ac9fc138..9cb9a166fa0c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -238,7 +238,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
 
 
 /**
- * scsi_execute - insert request and wait for the result
+ * __scsi_execute - insert request and wait for the result
  * @sdev:	scsi device
  * @cmd:	scsi command
  * @data_direction: data direction
@@ -255,7 +255,7 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
  * Returns the scsi_cmnd result field if a command was executed, or a negative
  * Linux error code if we didn't get that far.
 */
-int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 		 int data_direction, void *buffer, unsigned bufflen,
 		 unsigned char *sense, struct scsi_sense_hdr *sshdr,
 		 int timeout, int retries, u64 flags, req_flags_t rq_flags,
@@ -309,7 +309,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 
 	return ret;
 }
-EXPORT_SYMBOL(scsi_execute);
+EXPORT_SYMBOL(__scsi_execute);
 
 /*
  * Function:    scsi_init_cmd_errh()
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 9421d9877730..bbebdc3769b0 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1119,7 +1119,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 		SCpnt->cmnd[0] = WRITE_6;
 
 		if (blk_integrity_rq(rq))
-			sd_dif_prepare(SCpnt);
+			t10_pi_prepare(SCpnt->request, sdkp->protection_type);
 
 	} else if (rq_data_dir(rq) == READ) {
 		SCpnt->cmnd[0] = READ_6;
@@ -2047,8 +2047,10 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 					   "sd_done: completed %d of %d bytes\n",
 					   good_bytes, scsi_bufflen(SCpnt)));
 
-	if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt))
-		sd_dif_complete(SCpnt, good_bytes);
+	if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) &&
+	    good_bytes)
+		t10_pi_complete(SCpnt->request, sdkp->protection_type,
+				good_bytes / scsi_prot_interval(SCpnt));
 
 	return good_bytes;
 }
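The sd_done() change divides the completed byte count by the protection interval before calling t10_pi_complete(), since reference tags are remapped per interval rather than per byte; the added good_bytes test replaces the early return the removed sd_dif_complete() performed internally when good_bytes was zero. A trivial sketch of the conversion, assuming a 512-byte interval:

#include <stdio.h>

/* Whole protection intervals (one PI tuple each) covered by good_bytes. */
static unsigned int completed_intervals(unsigned int good_bytes,
					unsigned int interval)
{
	return good_bytes / interval;
}

int main(void)
{
	printf("%u\n", completed_intervals(4096, 512));	/* 8 tuples done */
	printf("%u\n", completed_intervals(511, 512));	/* 0: no full interval */
	return 0;
}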
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 392c7d078ae3..a7d4f50b67d4 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -254,21 +254,12 @@ static inline unsigned int sd_prot_flag_mask(unsigned int prot_op)
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 
 extern void sd_dif_config_host(struct scsi_disk *);
-extern void sd_dif_prepare(struct scsi_cmnd *scmd);
-extern void sd_dif_complete(struct scsi_cmnd *, unsigned int);
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 static inline void sd_dif_config_host(struct scsi_disk *disk)
 {
 }
-static inline int sd_dif_prepare(struct scsi_cmnd *scmd)
-{
-	return 0;
-}
-static inline void sd_dif_complete(struct scsi_cmnd *cmd, unsigned int a)
-{
-}
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 9035380c0dda..db72c82486e3 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -95,116 +95,3 @@ out:
 	blk_integrity_register(disk, &bi);
 }
 
-/*
- * The virtual start sector is the one that was originally submitted
- * by the block layer.	Due to partitioning, MD/DM cloning, etc. the
- * actual physical start sector is likely to be different.  Remap
- * protection information to match the physical LBA.
- *
- * From a protocol perspective there's a slight difference between
- * Type 1 and 2.  The latter uses 32-byte CDBs exclusively, and the
- * reference tag is seeded in the CDB.	This gives us the potential to
- * avoid virt->phys remapping during write.  However, at read time we
- * don't know whether the virt sector is the same as when we wrote it
- * (we could be reading from real disk as opposed to MD/DM device.  So
- * we always remap Type 2 making it identical to Type 1.
- *
- * Type 3 does not have a reference tag so no remapping is required.
- */
-void sd_dif_prepare(struct scsi_cmnd *scmd)
-{
-	const int tuple_sz = sizeof(struct t10_pi_tuple);
-	struct bio *bio;
-	struct scsi_disk *sdkp;
-	struct t10_pi_tuple *pi;
-	u32 phys, virt;
-
-	sdkp = scsi_disk(scmd->request->rq_disk);
-
-	if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION)
-		return;
-
-	phys = scsi_prot_ref_tag(scmd);
-
-	__rq_for_each_bio(bio, scmd->request) {
-		struct bio_integrity_payload *bip = bio_integrity(bio);
-		struct bio_vec iv;
-		struct bvec_iter iter;
-		unsigned int j;
-
-		/* Already remapped? */
-		if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
-			break;
-
-		virt = bip_get_seed(bip) & 0xffffffff;
-
-		bip_for_each_vec(iv, bip, iter) {
-			pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
-			for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
-				if (be32_to_cpu(pi->ref_tag) == virt)
-					pi->ref_tag = cpu_to_be32(phys);
-
-				virt++;
-				phys++;
-			}
-
-			kunmap_atomic(pi);
-		}
-
-		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
-	}
-}
-
-/*
- * Remap physical sector values in the reference tag to the virtual
- * values expected by the block layer.
- */
-void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
-{
-	const int tuple_sz = sizeof(struct t10_pi_tuple);
-	struct scsi_disk *sdkp;
-	struct bio *bio;
-	struct t10_pi_tuple *pi;
-	unsigned int j, intervals;
-	u32 phys, virt;
-
-	sdkp = scsi_disk(scmd->request->rq_disk);
-
-	if (sdkp->protection_type == T10_PI_TYPE3_PROTECTION || good_bytes == 0)
-		return;
-
-	intervals = good_bytes / scsi_prot_interval(scmd);
-	phys = scsi_prot_ref_tag(scmd);
-
-	__rq_for_each_bio(bio, scmd->request) {
-		struct bio_integrity_payload *bip = bio_integrity(bio);
-		struct bio_vec iv;
-		struct bvec_iter iter;
-
-		virt = bip_get_seed(bip) & 0xffffffff;
-
-		bip_for_each_vec(iv, bip, iter) {
-			pi = kmap_atomic(iv.bv_page) + iv.bv_offset;
-
-			for (j = 0; j < iv.bv_len; j += tuple_sz, pi++) {
-
-				if (intervals == 0) {
-					kunmap_atomic(pi);
-					return;
-				}
-
-				if (be32_to_cpu(pi->ref_tag) == phys)
-					pi->ref_tag = cpu_to_be32(virt);
-
-				virt++;
-				phys++;
-				intervals--;
-			}
-
-			kunmap_atomic(pi);
-		}
-	}
-}
-
diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c
index 35fab1e18adc..ffcf902da390 100644
--- a/drivers/scsi/sr_ioctl.c
+++ b/drivers/scsi/sr_ioctl.c
@@ -186,14 +186,13 @@ static int sr_play_trkind(struct cdrom_device_info *cdi,
 int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 {
 	struct scsi_device *SDev;
-	struct scsi_sense_hdr sshdr;
+	struct scsi_sense_hdr local_sshdr, *sshdr = &local_sshdr;
 	int result, err = 0, retries = 0;
-	unsigned char sense_buffer[SCSI_SENSE_BUFFERSIZE], *senseptr = NULL;
 
 	SDev = cd->device;
 
-	if (cgc->sense)
-		senseptr = sense_buffer;
+	if (cgc->sshdr)
+		sshdr = cgc->sshdr;
 
       retry:
 	if (!scsi_block_when_processing_errors(SDev)) {
@@ -202,15 +201,12 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 	}
 
 	result = scsi_execute(SDev, cgc->cmd, cgc->data_direction,
-			      cgc->buffer, cgc->buflen, senseptr, &sshdr,
+			      cgc->buffer, cgc->buflen, NULL, sshdr,
 			      cgc->timeout, IOCTL_RETRIES, 0, 0, NULL);
 
-	if (cgc->sense)
-		memcpy(cgc->sense, sense_buffer, sizeof(*cgc->sense));
-
 	/* Minimal error checking.  Ignore cases we know about, and report the rest. */
 	if (driver_byte(result) != 0) {
-		switch (sshdr.sense_key) {
+		switch (sshdr->sense_key) {
 		case UNIT_ATTENTION:
 			SDev->changed = 1;
 			if (!cgc->quiet)
@@ -221,8 +217,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 			err = -ENOMEDIUM;
 			break;
 		case NOT_READY:	/* This happens if there is no disc in drive */
-			if (sshdr.asc == 0x04 &&
-			    sshdr.ascq == 0x01) {
+			if (sshdr->asc == 0x04 &&
+			    sshdr->ascq == 0x01) {
 				/* sense: Logical unit is in process of becoming ready */
 				if (!cgc->quiet)
 					sr_printk(KERN_INFO, cd,
@@ -245,8 +241,8 @@ int sr_do_ioctl(Scsi_CD *cd, struct packet_command *cgc)
 			break;
 		case ILLEGAL_REQUEST:
 			err = -EIO;
-			if (sshdr.asc == 0x20 &&
-			    sshdr.ascq == 0x00)
+			if (sshdr->asc == 0x20 &&
+			    sshdr->ascq == 0x00)
 				/* sense: Invalid command operation code */
 				err = -EDRIVE_CANT_DO_THIS;
 			break;
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 6dc8891ccb74..1c72db94270e 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -513,12 +513,12 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev,
 
 	if (sc->sc_data_direction == DMA_TO_DEVICE)
 		cmd_pi->pi_bytesout = cpu_to_virtio32(vdev,
-						      blk_rq_sectors(rq) *
-						      bi->tuple_size);
+						      bio_integrity_bytes(bi,
+							blk_rq_sectors(rq)));
 	else if (sc->sc_data_direction == DMA_FROM_DEVICE)
 		cmd_pi->pi_bytesin = cpu_to_virtio32(vdev,
-						     blk_rq_sectors(rq) *
-						     bi->tuple_size);
+						     bio_integrity_bytes(bi,
+							blk_rq_sectors(rq)));
 }
 #endif
 
diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig
index 4c44d7bed01a..cb6f32ce7de8 100644
--- a/drivers/target/Kconfig
+++ b/drivers/target/Kconfig
@@ -1,10 +1,10 @@
 
 menuconfig TARGET_CORE
 	tristate "Generic Target Core Mod (TCM) and ConfigFS Infrastructure"
-	depends on SCSI && BLOCK
+	depends on BLOCK
 	select CONFIGFS_FS
 	select CRC_T10DIF
-	select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
+	select BLK_SCSI_REQUEST
 	select SGL_ALLOC
 	default n
 	help
@@ -29,6 +29,7 @@ config TCM_FILEIO
 
 config TCM_PSCSI
 	tristate "TCM/pSCSI Subsystem Plugin for Linux/SCSI"
+	depends on SCSI
 	help
 	  Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered
 	  passthrough access to Linux/SCSI device
diff --git a/drivers/target/loopback/Kconfig b/drivers/target/loopback/Kconfig
index abe8ecbcdf06..158ee9d522f7 100644
--- a/drivers/target/loopback/Kconfig
+++ b/drivers/target/loopback/Kconfig
@@ -1,5 +1,6 @@
 config LOOPBACK_TARGET
 	tristate "TCM Virtual SAS target and Linux/SCSI LDD fabric loopback module"
+	depends on SCSI
 	help
 	  Say Y here to enable the TCM Virtual SAS target and Linux/SCSI LLD
 	  fabric loopback module.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aba25414231a..38b8ce05cbc7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -666,7 +666,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 	result = blk_queue_enter(bdev->bd_queue, 0);
 	if (result)
 		return result;
-	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
+	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+			      REQ_OP_READ);
 	blk_queue_exit(bdev->bd_queue);
 	return result;
 }
@@ -704,7 +705,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 		return result;
 
 	set_page_writeback(page);
-	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
+	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+			      REQ_OP_WRITE);
 	if (result) {
 		end_page_writeback(page);
 	} else {
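
Note: with the boolean argument gone, every ->rw_page() implementation has to
accept the REQ_OP_* value instead. A rough sketch of the reshaped prototype
(hypothetical driver; the mydrv_* names are made up, not from this series):

	static int mydrv_rw_page(struct block_device *bdev, sector_t sector,
				 struct page *page, unsigned int op)
	{
		/* op_is_write(op) recovers what the old bool parameter said */
		if (op_is_write(op))
			return mydrv_write_page(bdev, sector, page);
		return mydrv_read_page(bdev, sector, page);
	}
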
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 1b8b44637e70..5331a15a61f1 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -873,8 +873,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
 		struct bio *bio;
 
 		if (per_dev != master_dev) {
-			bio = bio_clone_kmalloc(master_dev->bio,
-						GFP_KERNEL);
+			bio = bio_clone_fast(master_dev->bio,
+					     GFP_KERNEL, NULL);
 			if (unlikely(!bio)) {
 				ORE_DBGMSG(
 					"Failed to allocate BIO size=%u\n",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7750bc5b85a..5863fd22e90b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3529,7 +3529,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_sb_block = sb_block;
 	if (sb->s_bdev->bd_part)
 		sbi->s_sectors_written_start =
-			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+			part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
 
 	/* Cleanup superblock name */
 	strreplace(sb->s_id, '/', '!');
@@ -4838,7 +4838,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (sb->s_bdev->bd_part)
 		es->s_kbytes_written =
 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
-			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			    ((part_stat_read(sb->s_bdev->bd_part,
+					     sectors[STAT_WRITE]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	else
 		es->s_kbytes_written =
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index e60cc5e89023..9212a026a1f1 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -58,7 +58,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
 	if (!sb->s_bdev->bd_part)
 		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%lu\n",
-			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			(part_stat_read(sb->s_bdev->bd_part,
+					sectors[STAT_WRITE]) -
 			 sbi->s_sectors_written_start) >> 1);
 }
 
@@ -70,7 +71,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf)
 		return snprintf(buf, PAGE_SIZE, "0\n");
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
 			(unsigned long long)(sbi->s_kbytes_written +
-			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			((part_stat_read(sb->s_bdev->bd_part,
+					 sectors[STAT_WRITE]) -
 			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4d8b1de83143..6799c3fc44e3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1304,7 +1304,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
  * and the return value is in kbytes. s is of struct f2fs_sb_info.
  */
 #define BD_PART_WRITTEN(s)						 \
-(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) -		 \
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) -	 \
 	(s)->sectors_written_start) >> 1)
 
 static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3995e926ba3a..17bcff789c08 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2882,7 +2882,8 @@ try_onemore:
 	/* For write statistics */
 	if (sb->s_bdev->bd_part)
 		sbi->sectors_written_start =
-			(u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+			(u64)part_stat_read(sb->s_bdev->bd_part,
+					    sectors[STAT_WRITE]);
 
 	/* Read accumulated write IO statistics if exists */
 	seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
diff --git a/fs/mpage.c b/fs/mpage.c
index b7e7f570733a..b73638db9866 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -51,8 +51,8 @@ static void mpage_end_io(struct bio *bio)
 
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
-		page_endio(page, op_is_write(bio_op(bio)),
+		page_endio(page, bio_op(bio),
 			   blk_status_to_errno(bio->bi_status));
 	}
 
 	bio_put(bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f08f5fe7bd08..51371740d2a8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -429,7 +429,6 @@ extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
 extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
-extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
 extern struct bio_set fs_bio_set;
 
@@ -443,12 +442,6 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 	return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
 }
 
-static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
-{
-	return bio_clone_bioset(bio, gfp_mask, NULL);
-
-}
-
 extern blk_qc_t submit_bio(struct bio *);
 
 extern void bio_endio(struct bio *);
@@ -496,9 +489,9 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
-void generic_start_io_acct(struct request_queue *q, int rw,
+void generic_start_io_acct(struct request_queue *q, int op,
 		unsigned long sectors, struct hd_struct *part);
-void generic_end_io_acct(struct request_queue *q, int rw,
+void generic_end_io_acct(struct request_queue *q, int op,
 		struct hd_struct *part,
 		unsigned long start_time);
 
@@ -553,8 +546,16 @@ do { \
 #define bio_dev(bio) \
 	disk_devt((bio)->bi_disk)
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+int bio_associate_blkcg_from_page(struct bio *bio, struct page *page);
+#else
+static inline int bio_associate_blkcg_from_page(struct bio *bio,
+						struct page *page) { return 0; }
+#endif
+
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
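
Note: a plausible use of the new page hook, sketched under the assumption that
swap writeback is the intended caller (the actual call site is not part of
this excerpt):

	/* when writing out a swap page, charge the IO to the memcg that
	 * owns the page rather than to whoever submits the bio */
	bio_associate_blkcg_from_page(bio, page);
	bio->bi_opf |= REQ_SWAP;
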
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 6c666fd7de3c..34aec30e06c7 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -35,6 +35,7 @@ enum blkg_rwstat_type {
 	BLKG_RWSTAT_WRITE,
 	BLKG_RWSTAT_SYNC,
 	BLKG_RWSTAT_ASYNC,
+	BLKG_RWSTAT_DISCARD,
 
 	BLKG_RWSTAT_NR,
 	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
@@ -136,6 +137,12 @@ struct blkcg_gq {
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head			rcu_head;
+
+	atomic_t			use_delay;
+	atomic64_t			delay_nsec;
+	atomic64_t			delay_start;
+	u64				last_delay;
+	int				last_use;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -148,6 +155,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
 typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf,
+				      size_t size);
 
 struct blkcg_policy {
 	int				plid;
@@ -167,6 +176,7 @@ struct blkcg_policy {
 	blkcg_pol_offline_pd_fn		*pd_offline_fn;
 	blkcg_pol_free_pd_fn		*pd_free_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
+	blkcg_pol_stat_pd_fn		*pd_stat_fn;
 };
 
 extern struct blkcg blkcg_root;
@@ -238,6 +248,42 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 	return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
+static inline bool blk_cgroup_congested(void)
+{
+	struct cgroup_subsys_state *css;
+	bool ret = false;
+
+	rcu_read_lock();
+	css = kthread_blkcg();
+	if (!css)
+		css = task_css(current, io_cgrp_id);
+	while (css) {
+		if (atomic_read(&css->cgroup->congestion_count)) {
+			ret = true;
+			break;
+		}
+		css = css->parent;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg.  The idea is we do bio_blkcg() to look up the actual context for the
+ * bio and attach the appropriate blkg to the bio.  Then we call this helper and
+ * if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+	return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
+}
+
 /**
  * blkcg_parent - get the parent of a blkcg
  * @blkcg: blkcg of interest
@@ -296,6 +342,17 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 }
 
 /**
+ * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for @q at the root level. See also blkg_lookup().
+ */
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{
+	return q->root_blkg;
+}
+
+/**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
  * @pol: policy of interest
@@ -355,6 +412,21 @@ static inline void blkg_get(struct blkcg_gq *blkg)
 	atomic_inc(&blkg->refcnt);
 }
 
+/**
+ * blkg_try_get - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+{
+	if (atomic_inc_not_zero(&blkg->refcnt))
+		return blkg;
+	return NULL;
+}
+
+
 void __blkg_release_rcu(struct rcu_head *rcu);
 
 /**
@@ -589,7 +661,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 {
 	struct percpu_counter *cnt;
 
-	if (op_is_write(op))
+	if (op_is_discard(op))
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
+	else if (op_is_write(op))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
@@ -706,8 +780,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	if (!throtl) {
 		blkg = blkg ?: q->root_blkg;
-		blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
-				bio->bi_iter.bi_size);
+		/*
+		 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
+		 * is a split bio and we would have already accounted for the
+		 * size of the bio.
+		 */
+		if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
+			blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
+					bio->bi_iter.bi_size);
 		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 	}
 
@@ -715,6 +795,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	return !throtl;
 }
 
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+	if (atomic_add_return(1, &blkg->use_delay) == 1)
+		atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+	int old = atomic_read(&blkg->use_delay);
+
+	if (old == 0)
+		return 0;
+
+	/*
+	 * We do this song and dance because we can race with somebody else
+	 * adding or removing delay.  If we just did an atomic_dec we'd end up
+	 * negative and we'd already be in trouble.  We need to subtract 1 and
+	 * then check to see if we were the last delay so we can drop the
+	 * congestion count on the cgroup.
+	 */
+	while (old) {
+		int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
+		if (cur == old)
+			break;
+		old = cur;
+	}
+
+	if (old == 0)
+		return 0;
+	if (old == 1)
+		atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+	return 1;
+}
+
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+	int old = atomic_read(&blkg->use_delay);
+	if (!old)
+		return;
+	/* We only want 1 person clearing the congestion count for this blkg. */
+	while (old) {
+		int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
+		if (cur == old) {
+			atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+			break;
+		}
+		old = cur;
+	}
+}
+
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
+void blkcg_maybe_throttle_current(void);
 #else	/* CONFIG_BLK_CGROUP */
 
 struct blkcg {
@@ -734,9 +867,16 @@ struct blkcg_policy {
 
 #define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
+static inline void blkcg_maybe_throttle_current(void) { }
+static inline bool blk_cgroup_congested(void) { return false; }
+
 #ifdef CONFIG_BLOCK
 
+static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
+
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
+{ return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
 static inline void blkcg_exit_queue(struct request_queue *q) { }
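
Note: a sketch of how a throttling policy is expected to pair these helpers
(my_pd and the surrounding callback are hypothetical, for illustration only):

	if (over_limit && !my_pd->congested) {
		my_pd->congested = true;
		blkcg_use_delay(blkg);		/* first user bumps congestion_count */
	} else if (!over_limit && my_pd->congested) {
		my_pd->congested = false;
		blkcg_unuse_delay(blkg);	/* last user drops it again */
	}
	/* charge observed latency; the debt is paid off by sleeping in
	 * blkcg_maybe_throttle_current() on return to userspace */
	blkcg_add_delay(blkg, now_ns, observed_lat_ns);
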
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ca3f2c2edd85..1da59c16f637 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -35,10 +35,12 @@ struct blk_mq_hw_ctx {
 	struct sbitmap		ctx_map;
 
 	struct blk_mq_ctx	*dispatch_from;
+	unsigned int		dispatch_busy;
 
-	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
+	struct blk_mq_ctx	**ctxs;
 
+	spinlock_t		dispatch_wait_lock;
 	wait_queue_entry_t	dispatch_wait;
 	atomic_t		wait_index;
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3c4f390aea4b..f6dfb30737d8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -179,11 +179,9 @@ struct bio {
 	 */
 	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	void			*bi_cg_private;
+	struct blkcg_gq		*bi_blkg;
 	struct bio_issue	bi_issue;
 #endif
-#endif
 	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 		struct bio_integrity_payload *bi_integrity; /* data integrity */
@@ -329,7 +327,7 @@ enum req_flag_bits {
 
 	/* for driver use */
 	__REQ_DRV,
-
+	__REQ_SWAP,		/* swapping request. */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -351,6 +349,7 @@ enum req_flag_bits {
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
+#define REQ_SWAP		(1ULL << __REQ_SWAP)
 
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -358,6 +357,14 @@ enum req_flag_bits {
 #define REQ_NOMERGE_FLAGS \
 	(REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
 
+enum stat_group {
+	STAT_READ,
+	STAT_WRITE,
+	STAT_DISCARD,
+
+	NR_STAT_GROUPS
+};
+
 #define bio_op(bio) \
 	((bio)->bi_opf & REQ_OP_MASK)
 #define req_op(req) \
@@ -395,6 +402,18 @@ static inline bool op_is_sync(unsigned int op)
 	(op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH));
 }
 
+static inline bool op_is_discard(unsigned int op)
+{
+	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
+}
+
+static inline int op_stat_group(unsigned int op)
+{
+	if (op_is_discard(op))
+		return STAT_DISCARD;
+	return op_is_write(op);
+}
+
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE		-1U
 #define BLK_QC_T_SHIFT		16
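
Note: since STAT_READ is 0 and STAT_WRITE is 1, op_stat_group() stays
compatible with the old op_is_write() indexing for reads and writes; only
discards get the new third slot. Illustrative values:

	op_stat_group(REQ_OP_READ);	/* STAT_READ (0)    */
	op_stat_group(REQ_OP_WRITE);	/* STAT_WRITE (1)   */
	op_stat_group(REQ_OP_DISCARD);	/* STAT_DISCARD (2) */
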
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 79226ca8f80f..d6869e0e2b64 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,8 +27,6 @@
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
-#include <linux/seqlock.h>
-#include <linux/u64_stats_sync.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -42,7 +40,7 @@ struct bsg_job;
 struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
-struct rq_wb;
+struct rq_qos;
 struct blk_queue_stats;
 struct blk_stat_callback;
 
@@ -442,10 +440,8 @@ struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
-	atomic_t		shared_hctx_restart;
-
 	struct blk_queue_stats	*stats;
-	struct rq_wb		*rq_wb;
+	struct rq_qos		*rq_qos;
 
 	/*
 	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
@@ -592,6 +588,7 @@ struct request_queue {
 
 	struct queue_limits	limits;
 
+#ifdef CONFIG_BLK_DEV_ZONED
 	/*
 	 * Zoned block device information for request dispatch control.
 	 * nr_zones is the total number of zones of the device. This is always
@@ -612,6 +609,7 @@ struct request_queue {
 	unsigned int		nr_zones;
 	unsigned long		*seq_zones_bitmap;
 	unsigned long		*seq_zones_wlock;
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 	/*
 	 * sg stuff
@@ -800,11 +798,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
-static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
-{
-	return q->nr_zones;
-}
-
+#ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_queue_zone_no(struct request_queue *q,
 					     sector_t sector)
 {
@@ -820,6 +814,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
 		return false;
 	return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
 }
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 static inline bool rq_is_sync(struct request *rq)
 {
@@ -1070,6 +1065,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {
 	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
@@ -1079,6 +1075,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
 {
 	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
 }
+#endif /* CONFIG_BLK_DEV_ZONED */
 
 /*
  * Some commands like WRITE SAME have a payload or data transfer size which
@@ -1437,8 +1434,6 @@ enum blk_default_limits {
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
 
-#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
-
 static inline unsigned long queue_segment_boundary(struct request_queue *q)
 {
 	return q->limits.seg_boundary_mask;
@@ -1639,15 +1634,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
 	return 0;
 }
 
-static inline unsigned int bdev_nr_zones(struct block_device *bdev)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	if (q)
-		return blk_queue_nr_zones(q);
-	return 0;
-}
-
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
@@ -1877,6 +1863,28 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 				bip_next->bip_vec[0].bv_offset);
 }
 
+/**
+ * bio_integrity_intervals - Return number of integrity intervals for a bio
+ * @bi:		blk_integrity profile for device
+ * @sectors:	Size of the bio in 512-byte sectors
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the data integrity
+ * interval size of the storage device.  Convert the block layer sectors
+ * to the appropriate number of integrity intervals.
+ */
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+						   unsigned int sectors)
+{
+	return sectors >> (bi->interval_exp - 9);
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+}
+
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 
 struct bio;
@@ -1950,12 +1958,24 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
 	return false;
 }
 
+static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
+						   unsigned int sectors)
+{
+	return 0;
+}
+
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return 0;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
-	int (*rw_page)(struct block_device *, sector_t, struct page *, bool);
+	int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	unsigned int (*check_events) (struct gendisk *disk,
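
Note: a worked example of the two new helpers, assuming a device with a
4096-byte integrity interval (interval_exp = 12) and 8-byte T10 PI tuples:

	bio_integrity_intervals(bi, 8);	/* 8 sectors (4KB): 8 >> (12 - 9) = 1 interval */
	bio_integrity_bytes(bi, 8);	/* 1 interval * 8-byte tuple = 8 bytes of PI */

For the common 512-byte interval (interval_exp = 9) this degenerates to one
tuple per sector, which is what the open-coded blk_rq_sectors(rq) *
bi->tuple_size replaced in the virtio_scsi hunk above assumed.
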
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index e75dfd1f1dec..528271c60018 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -13,6 +13,7 @@
 
 #include <linux/fs.h>		/* not really needed, later.. */
 #include <linux/list.h>
+#include <scsi/scsi_common.h>
 #include <uapi/linux/cdrom.h>
 
 struct packet_command
@@ -21,7 +22,7 @@ struct packet_command
 	unsigned char 		*buffer;
 	unsigned int 		buflen;
 	int			stat;
-	struct request_sense	*sense;
+	struct scsi_sense_hdr	*sshdr;
 	unsigned char		data_direction;
 	int			quiet;
 	int			timeout;
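
Note: callers now receive decoded sense data instead of a raw buffer; roughly
(a sketch, not a call site taken from this series):

	struct packet_command cgc;
	struct scsi_sense_hdr sshdr;

	init_cdrom_command(&cgc, buf, len, CGC_DATA_READ);
	cgc.sshdr = &sshdr;			/* was: cgc.sense = &sense */
	ret = sr_do_ioctl(cd, &cgc);
	if (ret && sshdr.sense_key == UNIT_ATTENTION)
		/* medium may have changed */;
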
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c0e68f903011..ff20b677fb9f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -438,6 +438,9 @@ struct cgroup {
 	/* used to store eBPF programs */
 	struct cgroup_bpf bpf;
 
+	/* If there is block congestion on this cgroup. */
+	atomic_t congestion_count;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6cb8a5789668..57864422a2c8 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
+#include <linux/blk_types.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -82,10 +83,10 @@ struct partition {
 } __attribute__((packed));
 
 struct disk_stats {
-	unsigned long sectors[2];	/* READs and WRITEs */
-	unsigned long ios[2];
-	unsigned long merges[2];
-	unsigned long ticks[2];
+	unsigned long sectors[NR_STAT_GROUPS];
+	unsigned long ios[NR_STAT_GROUPS];
+	unsigned long merges[NR_STAT_GROUPS];
+	unsigned long ticks[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
 };
@@ -353,6 +354,11 @@ static inline void free_part_stats(struct hd_struct *part)
 
 #endif /* CONFIG_SMP */
 
+#define part_stat_read_accum(part, field)				\
+	(part_stat_read(part, field[STAT_READ]) +			\
+	 part_stat_read(part, field[STAT_WRITE]) +			\
+	 part_stat_read(part, field[STAT_DISCARD]))
+
 #define part_stat_add(cpu, part, field, addnd)	do {			\
 	__part_stat_add((cpu), (part), field, addnd);			\
 	if ((part)->partno)						\
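
Note: callers that open-coded the READ+WRITE sum can switch to the
accumulator, which also folds in the new discard slot; illustrative
before/after:

	/* before */
	n = part_stat_read(part, ios[READ]) + part_stat_read(part, ios[WRITE]);
	/* after */
	n = part_stat_read_accum(part, ios);
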
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..680d3395fc83 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -317,6 +317,9 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
 			  bool compound);
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
+			  bool compound);
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
 			      bool lrucare, bool compound);
 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
@@ -789,6 +792,16 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mem_cgroup_try_charge_delay(struct page *page,
+					      struct mm_struct *mm,
+					      gfp_t gfp_mask,
+					      struct mem_cgroup **memcgp,
+					      bool compound)
+{
+	*memcgp = NULL;
+	return 0;
+}
+
 static inline void mem_cgroup_commit_charge(struct page *page,
 					    struct mem_cgroup *memcg,
 					    bool lrucare, bool compound)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 2950ce957656..68e91ef5494c 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -242,7 +242,12 @@ struct nvme_id_ctrl {
 	__le32			sanicap;
 	__le32			hmminds;
 	__le16			hmmaxd;
-	__u8			rsvd338[174];
+	__u8			rsvd338[4];
+	__u8			anatt;
+	__u8			anacap;
+	__le32			anagrpmax;
+	__le32			nanagrpid;
+	__u8			rsvd352[160];
 	__u8			sqes;
 	__u8			cqes;
 	__le16			maxcmd;
@@ -254,11 +259,12 @@ struct nvme_id_ctrl {
 	__le16			awun;
 	__le16			awupf;
 	__u8			nvscc;
-	__u8			rsvd531;
+	__u8			nwpc;
 	__le16			acwu;
 	__u8			rsvd534[2];
 	__le32			sgls;
-	__u8			rsvd540[228];
+	__le32			mnan;
+	__u8			rsvd544[224];
 	char			subnqn[256];
 	__u8			rsvd1024[768];
 	__le32			ioccsz;
@@ -312,7 +318,11 @@ struct nvme_id_ns {
 	__le16			nabspf;
 	__le16			noiob;
 	__u8			nvmcap[16];
-	__u8			rsvd64[40];
+	__u8			rsvd64[28];
+	__le32			anagrpid;
+	__u8			rsvd96[3];
+	__u8			nsattr;
+	__u8			rsvd100[4];
 	__u8			nguid[16];
 	__u8			eui64[8];
 	struct nvme_lbaf	lbaf[16];
@@ -425,6 +435,32 @@ struct nvme_effects_log {
 	__u8   resv[2048];
 };
 
+enum nvme_ana_state {
+	NVME_ANA_OPTIMIZED		= 0x01,
+	NVME_ANA_NONOPTIMIZED		= 0x02,
+	NVME_ANA_INACCESSIBLE		= 0x03,
+	NVME_ANA_PERSISTENT_LOSS	= 0x04,
+	NVME_ANA_CHANGE			= 0x0f,
+};
+
+struct nvme_ana_group_desc {
+	__le32	grpid;
+	__le32	nnsids;
+	__le64	chgcnt;
+	__u8	state;
+	__u8	rsvd17[15];
+	__le32	nsids[];
+};
+
+/* flag for the log specific field of the ANA log */
+#define NVME_ANA_LOG_RGO	(1 << 0)
+
+struct nvme_ana_rsp_hdr {
+	__le64	chgcnt;
+	__le16	ngrps;
+	__le16	rsvd10[3];
+};
+
 enum {
 	NVME_SMART_CRIT_SPARE		= 1 << 0,
 	NVME_SMART_CRIT_TEMPERATURE	= 1 << 1,
@@ -444,11 +480,13 @@ enum {
 enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
+	NVME_AER_NOTICE_ANA		= 0x03,
 };
 
 enum {
 	NVME_AEN_CFG_NS_ATTR		= 1 << 8,
 	NVME_AEN_CFG_FW_ACT		= 1 << 9,
+	NVME_AEN_CFG_ANA_CHANGE		= 1 << 11,
 };
 
 struct nvme_lba_range_type {
@@ -749,15 +787,22 @@ enum {
 	NVME_FEAT_HOST_MEM_BUF	= 0x0d,
 	NVME_FEAT_TIMESTAMP	= 0x0e,
 	NVME_FEAT_KATO		= 0x0f,
+	NVME_FEAT_HCTM		= 0x10,
+	NVME_FEAT_NOPSC		= 0x11,
+	NVME_FEAT_RRL		= 0x12,
+	NVME_FEAT_PLM_CONFIG	= 0x13,
+	NVME_FEAT_PLM_WINDOW	= 0x14,
 	NVME_FEAT_SW_PROGRESS	= 0x80,
 	NVME_FEAT_HOST_ID	= 0x81,
 	NVME_FEAT_RESV_MASK	= 0x82,
 	NVME_FEAT_RESV_PERSIST	= 0x83,
+	NVME_FEAT_WRITE_PROTECT	= 0x84,
 	NVME_LOG_ERROR		= 0x01,
 	NVME_LOG_SMART		= 0x02,
 	NVME_LOG_FW_SLOT	= 0x03,
 	NVME_LOG_CHANGED_NS	= 0x04,
 	NVME_LOG_CMD_EFFECTS	= 0x05,
+	NVME_LOG_ANA		= 0x0c,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
 	NVME_FWACT_REPL		= (0 << 3),
@@ -765,6 +810,14 @@ enum {
 	NVME_FWACT_ACTV		= (2 << 3),
 };
 
+/* NVMe Namespace Write Protect State */
+enum {
+	NVME_NS_NO_WRITE_PROTECT = 0,
+	NVME_NS_WRITE_PROTECT,
+	NVME_NS_WRITE_PROTECT_POWER_CYCLE,
+	NVME_NS_WRITE_PROTECT_PERMANENT,
+};
+
 #define NVME_MAX_CHANGED_NAMESPACES	1024
 
 struct nvme_identify {
@@ -880,7 +933,7 @@ struct nvme_get_log_page_command {
 	__u64			rsvd2[2];
 	union nvme_data_ptr	dptr;
 	__u8			lid;
-	__u8			rsvd10;
+	__u8			lsp; /* upper 4 bits reserved */
 	__le16			numdl;
 	__le16			numdu;
 	__u16			rsvd11;
@@ -1111,6 +1164,8 @@ enum {
 	NVME_SC_SGL_INVALID_OFFSET	= 0x16,
 	NVME_SC_SGL_INVALID_SUBTYPE	= 0x17,
 
+	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
+
 	NVME_SC_LBA_RANGE		= 0x80,
 	NVME_SC_CAP_EXCEEDED		= 0x81,
 	NVME_SC_NS_NOT_READY		= 0x82,
@@ -1180,6 +1235,13 @@ enum {
 	NVME_SC_ACCESS_DENIED		= 0x286,
 	NVME_SC_UNWRITTEN_BLOCK		= 0x287,
 
+	/*
+	 * Path-related Errors:
+	 */
+	NVME_SC_ANA_PERSISTENT_LOSS	= 0x301,
+	NVME_SC_ANA_INACCESSIBLE	= 0x302,
+	NVME_SC_ANA_TRANSITION		= 0x303,
+
 	NVME_SC_DNR			= 0x4000,
 };
 
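
Note: a host-side sketch of walking an ANA log page built from the new
structures (walk_ana_log is a hypothetical helper, shown only to illustrate
the layout: a header followed by ngrps variable-size group descriptors):

	static void walk_ana_log(void *buf)
	{
		struct nvme_ana_rsp_hdr *hdr = buf;
		struct nvme_ana_group_desc *desc = buf + sizeof(*hdr);
		int i;

		for (i = 0; i < le16_to_cpu(hdr->ngrps); i++) {
			/* desc->state holds an enum nvme_ana_state value */
			desc = (void *)desc + sizeof(*desc) +
				le32_to_cpu(desc->nnsids) * sizeof(__le32);
		}
	}
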
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dac5086e3815..95a5018c338e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,6 +734,10 @@ struct task_struct {
 	/* disallow userland-initiated cgroup migration */
 	unsigned			no_cgroup_migration:1;
 #endif
+#ifdef CONFIG_BLK_CGROUP
+	/* to be used once the psi infrastructure lands upstream. */
+	unsigned			use_memdelay:1;
+#endif
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
@@ -1150,6 +1154,10 @@ struct task_struct {
 	unsigned int			memcg_nr_pages_over_high;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+	struct request_queue		*throttle_queue;
+#endif
+
 #ifdef CONFIG_UPROBES
 	struct uprobe_task		*utask;
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c063443d8638..1a8bd05a335e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -629,7 +629,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 
 	return memcg->swappiness;
 }
-
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
@@ -637,6 +636,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 }
 #endif
 
+#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+					 gfp_t gfp_mask);
+#else
+static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg,
+						int node, gfp_t gfp_mask)
+{
+}
+#endif
+
 #ifdef CONFIG_MEMCG_SWAP
 extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
 extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index c6aa8a3c42ed..b9626aa7e90c 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -37,9 +37,33 @@ struct t10_pi_tuple {
37#define T10_PI_APP_ESCAPE cpu_to_be16(0xffff) 37#define T10_PI_APP_ESCAPE cpu_to_be16(0xffff)
38#define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff) 38#define T10_PI_REF_ESCAPE cpu_to_be32(0xffffffff)
39 39
40static inline u32 t10_pi_ref_tag(struct request *rq)
41{
42#ifdef CONFIG_BLK_DEV_INTEGRITY
43 return blk_rq_pos(rq) >>
44 (rq->q->integrity.interval_exp - 9) & 0xffffffff;
45#else
46 return -1U;
47#endif
48}
49
40extern const struct blk_integrity_profile t10_pi_type1_crc; 50extern const struct blk_integrity_profile t10_pi_type1_crc;
41extern const struct blk_integrity_profile t10_pi_type1_ip; 51extern const struct blk_integrity_profile t10_pi_type1_ip;
42extern const struct blk_integrity_profile t10_pi_type3_crc; 52extern const struct blk_integrity_profile t10_pi_type3_crc;
43extern const struct blk_integrity_profile t10_pi_type3_ip; 53extern const struct blk_integrity_profile t10_pi_type3_ip;
44 54
55#ifdef CONFIG_BLK_DEV_INTEGRITY
56extern void t10_pi_prepare(struct request *rq, u8 protection_type);
57extern void t10_pi_complete(struct request *rq, u8 protection_type,
58 unsigned int intervals);
59#else
60static inline void t10_pi_complete(struct request *rq, u8 protection_type,
61 unsigned int intervals)
62{
63}
64static inline void t10_pi_prepare(struct request *rq, u8 protection_type)
65{
66}
67#endif
68
45#endif 69#endif
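
Note: these helpers replace the sd_dif_prepare()/sd_dif_complete() pair
removed earlier in this series; a SCSI disk driver would now call them
roughly like this (a sketch based on the removed sd code; the exact sd.c
hunk is outside this excerpt):

	/* before dispatch: remap virtual ref tags to physical */
	if (blk_integrity_rq(scmd->request))
		t10_pi_prepare(scmd->request, sdkp->protection_type);

	/* on completion: remap back only the intervals that completed */
	if (blk_integrity_rq(scmd->request))
		t10_pi_complete(scmd->request, sdkp->protection_type,
				good_bytes / scsi_prot_interval(scmd));
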
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 4a8841963c2e..05589a3e37f4 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,6 +51,7 @@
 #include <linux/security.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
 struct linux_binprm;
 
 /*
@@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 	task_work_run();
 
 	mem_cgroup_handle_over_high();
+	blkcg_maybe_throttle_current();
 }
 
 #endif	/* <linux/tracehook.h> */
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index aaf1e971c6a3..c891ada3c5c2 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -4,6 +4,7 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/blkdev.h>
+#include <linux/t10-pi.h>
 #include <linux/list.h>
 #include <linux/types.h>
 #include <linux/timer.h>
@@ -14,8 +15,6 @@
 struct Scsi_Host;
 struct scsi_driver;
 
-#include <scsi/scsi_device.h>
-
 /*
  * MAX_COMMAND_SIZE is:
  * The longest fixed-length SCSI CDB as per the SCSI standard.
@@ -120,11 +119,11 @@ struct scsi_cmnd {
 	struct request *request;	/* The command we are
 				   	   working on */
 
-#define SCSI_SENSE_BUFFERSIZE 	96
 	unsigned char *sense_buffer;
 				/* obtained by REQUEST SENSE when
-				 * CHECK CONDITION is received on original
-				 * command (auto-sense) */
+				 * CHECK CONDITION is received on original
+				 * command (auto-sense). Length must be
+				 * SCSI_SENSE_BUFFERSIZE bytes. */
 
 	/* Low-level done function - can be used by low-level driver to point
 	 *        to completion function.  Not used by mid/upper level code. */
@@ -313,12 +312,6 @@ static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
 	return scmd->device->sector_size;
 }
 
-static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
-{
-	return blk_rq_pos(scmd->request) >>
-		(ilog2(scsi_prot_interval(scmd)) - 9) & 0xffffffff;
-}
-
 static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
 {
 	return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0;
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 4c36af6edd79..202f4d6a4342 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -17,6 +17,8 @@ struct scsi_sense_hdr;
 
 typedef __u64 __bitwise blist_flags_t;
 
+#define SCSI_SENSE_BUFFERSIZE	96
+
 struct scsi_mode_data {
 	__u32	length;
 	__u16	block_descriptor_length;
@@ -426,11 +428,21 @@ extern const char *scsi_device_state_name(enum scsi_device_state);
 extern int scsi_is_sdev_device(const struct device *);
 extern int scsi_is_target_device(const struct device *);
 extern void scsi_sanitize_inquiry_string(unsigned char *s, int len);
-extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
+extern int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 			int data_direction, void *buffer, unsigned bufflen,
 			unsigned char *sense, struct scsi_sense_hdr *sshdr,
 			int timeout, int retries, u64 flags,
 			req_flags_t rq_flags, int *resid);
+/* Make sure any sense buffer is the correct size. */
+#define scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense,	\
+		     sshdr, timeout, retries, flags, rq_flags, resid)	\
+({									\
+	BUILD_BUG_ON((sense) != NULL &&					\
+		     sizeof(sense) != SCSI_SENSE_BUFFERSIZE);		\
+	__scsi_execute(sdev, cmd, data_direction, buffer, bufflen,	\
+		       sense, sshdr, timeout, retries, flags, rq_flags,	\
+		       resid);						\
+})
 static inline int scsi_execute_req(struct scsi_device *sdev,
 	const unsigned char *cmd, int data_direction, void *buffer,
 	unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout,
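
Note: the sizeof() check only works when the caller passes the sense array
itself; illustrative declarations:

	unsigned char sense[SCSI_SENSE_BUFFERSIZE];	/* sizeof == 96: compiles    */
	unsigned char small[64];			/* sizeof == 64: build error */

Passing a plain pointer would also trip the BUILD_BUG_ON (sizeof yields the
pointer size), so callers hand in the array directly or NULL, which is
short-circuited by the (sense) != NULL test.
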
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index 821f71a2e48f..8d19e02d752a 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -195,7 +195,7 @@ struct cache_sb {
 	};
 	};
 
-	__u32			last_mount;	/* time_t */
+	__u32			last_mount;	/* time overflow in y2106 */
 
 	__u16			first_bucket;
 	union {
@@ -318,7 +318,7 @@ struct uuid_entry {
 	struct {
 		__u8	uuid[16];
 		__u8	label[32];
-		__u32	first_reg;
+		__u32	first_reg;		/* time overflow in y2106 */
 		__u32	last_reg;
 		__u32	invalidated;
 
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index e3c70fe6bf0f..ff5a5db8906a 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -117,7 +117,7 @@ struct blk_zone_report {
 	__u32		nr_zones;
 	__u8		reserved[4];
 	struct blk_zone zones[0];
-} __packed;
+};
 
 /**
  * struct blk_zone_range - BLKRESETZONE ioctl request
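
Dropping __packed here is a no-op for layout: an 8-byte, a 4-byte, and a 4-byte member leave no padding for the attribute to remove, so only the potentially slower byte-wise access goes away. A quick userspace sketch of that claim, mirroring the fields with stdint types and omitting the flexible zones[] tail:

#include <stdint.h>
#include <stdio.h>

/* layout mirror of struct blk_zone_report without its zones[] tail */
struct zone_report_plain {
	uint64_t sector;	/* offset 0, 8-byte aligned */
	uint32_t nr_zones;	/* offset 8 */
	uint8_t  reserved[4];	/* offset 12, fills the struct to 16 */
};

struct zone_report_packed {
	uint64_t sector;
	uint32_t nr_zones;
	uint8_t  reserved[4];
} __attribute__((packed));

int main(void)
{
	/* identical sizes: the members leave no padding to remove */
	printf("plain=%zu packed=%zu\n",
	       sizeof(struct zone_report_plain),
	       sizeof(struct zone_report_packed));
	return 0;
}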
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d8d0e016fc6..33112315b5c0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -866,6 +866,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->fail_nth = 0;
 #endif
 
+#ifdef CONFIG_BLK_CGROUP
+	tsk->throttle_queue = NULL;
+	tsk->use_memdelay = 0;
+#endif
+
 	return tsk;
 
 free_stack:
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 987d9a9ae283..b951aa1fac61 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
 
+	if (!blk_debugfs_root)
+		return -ENOENT;
+
 	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
 	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
 
@@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 
 	ret = -ENOENT;
 
-	if (!blk_debugfs_root)
-		goto err;
-
 	dir = debugfs_lookup(buts->name, blk_debugfs_root);
 	if (!dir)
 		bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 25346bd99364..a9e1e093df51 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -552,7 +552,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1142,7 +1142,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
 		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
 					       vmf->address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
-			     mem_cgroup_try_charge(pages[i], vma->vm_mm,
+			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
 				     GFP_KERNEL, &memcg, false))) {
 			if (pages[i])
 				put_page(pages[i]);
@@ -1312,7 +1312,7 @@ alloc:
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
+	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
 					huge_gfp, &memcg, true))) {
 		put_page(new_page);
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2173f7e5164..b836e7f00309 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5600,6 +5600,19 @@ out:
 	return ret;
 }
 
+int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
+			  bool compound)
+{
+	struct mem_cgroup *memcg;
+	int ret;
+
+	ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
+	memcg = *memcgp;
+	mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
+	return ret;
+}
+
 /**
  * mem_cgroup_commit_charge - commit a page charge
  * @page: page to charge
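
The new helper leaves the charge path untouched and only appends a throttle check, which is why converting call sites is a mechanical rename. A rough userspace model of that control flow follows; every function in it is a stand-in for illustration, not a kernel API:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the kernel primitives used by the wrapper */
static int try_charge(void)		{ return 0; }	/* charge ok */
static bool blkcg_congested(void)	{ return true; }

static void throttle_swaprate(void)
{
	if (blkcg_congested())
		printf("throttling: swap IO is over its latency target\n");
}

static int try_charge_delay(void)
{
	int ret = try_charge();

	/* throttle regardless of the charge result: the point is to
	 * slow memory consumers while swap IO is congested, not to
	 * fail the allocation */
	throttle_swaprate();
	return ret;
}

int main(void)
{
	return try_charge_delay();
}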
diff --git a/mm/memory.c b/mm/memory.c
index 6d175057cfd0..348279ff6e51 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2524,7 +2524,7 @@ static int wp_page_copy(struct vm_fault *vmf)
 		cow_user_page(new_page, old_page, vmf->address, vma);
 	}
 
-	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
+	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
 		goto oom_free_new;
 
 	__SetPageUptodate(new_page);
@@ -3024,8 +3024,8 @@ int do_swap_page(struct vm_fault *vmf)
 		goto out_page;
 	}
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
 				&memcg, false)) {
 		ret = VM_FAULT_OOM;
 		goto out_page;
 	}
@@ -3186,7 +3186,8 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	if (!page)
 		goto oom;
 
-	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
+				false))
 		goto oom_free_page;
 
 	/*
@@ -3682,7 +3683,7 @@ static int do_cow_fault(struct vm_fault *vmf)
 	if (!vmf->cow_page)
 		return VM_FAULT_OOM;
 
-	if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+	if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
 				&vmf->memcg, false)) {
 		put_page(vmf->cow_page);
 		return VM_FAULT_OOM;
diff --git a/mm/page_io.c b/mm/page_io.c
index b41cf9644585..aafd19ec1db4 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,7 +338,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		ret = -ENOMEM;
 		goto out;
 	}
-	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+	bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
+	bio_associate_blkcg_from_page(bio, page);
 	count_swpout_vm_event(page);
 	set_page_writeback(page);
 	unlock_page(page);
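
Marking the bio with REQ_SWAP and associating it with the blkcg of the page being written is what lets the block layer attribute swap writeback to the right cgroup. A toy model of the flag composition, with invented flag values that do not match the kernel's REQ_* encoding:

#include <stdint.h>
#include <stdio.h>

/* invented flag values for illustration only */
#define REQ_OP_WRITE	(1u << 0)
#define REQ_SWAP	(1u << 1)
#define REQ_SYNC	(1u << 2)

/* stand-in for wbc_to_write_flags(): sync writeback asks for REQ_SYNC */
static uint32_t wbc_flags(int for_sync)
{
	return for_sync ? REQ_SYNC : 0;
}

int main(void)
{
	uint32_t bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_flags(1);

	/* the swap bit survives alongside the op and writeback hints */
	printf("bi_opf=%#x swap=%s\n", bi_opf,
	       (bi_opf & REQ_SWAP) ? "yes" : "no");
	return 0;
}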
diff --git a/mm/readahead.c b/mm/readahead.c
index e273f0de3376..a59ea70527b9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
 #include <linux/syscalls.h>
 #include <linux/file.h>
 #include <linux/mm_inline.h>
+#include <linux/blk-cgroup.h>
 
 #include "internal.h"
 
@@ -385,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
 {
 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 	unsigned long max_pages = ra->ra_pages;
+	unsigned long add_pages;
 	pgoff_t prev_offset;
 
 	/*
@@ -474,10 +476,17 @@ readit:
 	 * Will this read hit the readahead marker made by itself?
 	 * If so, trigger the readahead marker hit now, and merge
 	 * the resulted next readahead window into the current one.
+	 * Take care of maximum IO pages as above.
 	 */
 	if (offset == ra->start && ra->size == ra->async_size) {
-		ra->async_size = get_next_ra_size(ra, max_pages);
-		ra->size += ra->async_size;
+		add_pages = get_next_ra_size(ra, max_pages);
+		if (ra->size + add_pages <= max_pages) {
+			ra->async_size = add_pages;
+			ra->size += add_pages;
+		} else {
+			ra->size = max_pages;
+			ra->async_size = max_pages >> 1;
+		}
 	}
 
 	return ra_submit(ra, mapping, filp);
@@ -505,6 +514,9 @@ void page_cache_sync_readahead(struct address_space *mapping,
 	if (!ra->ra_pages)
 		return;
 
+	if (blk_cgroup_congested())
+		return;
+
 	/* be dumb */
 	if (filp && (filp->f_mode & FMODE_RANDOM)) {
 		force_page_cache_readahead(mapping, filp, offset, req_size);
@@ -555,6 +567,9 @@ page_cache_async_readahead(struct address_space *mapping,
 	if (inode_read_congested(mapping->host))
 		return;
 
+	if (blk_cgroup_congested())
+		return;
+
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 }
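
The merged-window clamp above is easy to check by hand. A small userspace model, with get_next_ra_size() reduced to capped doubling, which approximates its behavior for already-large windows:

#include <stdio.h>

static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long newsize = 2 * cur;

	return newsize < max ? newsize : max;
}

static void merge_window(unsigned long *size, unsigned long *async_size,
			 unsigned long max)
{
	unsigned long add = next_ra_size(*size, max);

	if (*size + add <= max) {	/* room left: grow as before */
		*async_size = add;
		*size += add;
	} else {			/* clamp to the per-file maximum */
		*size = max;
		*async_size = max >> 1;
	}
}

int main(void)
{
	unsigned long size = 96, async = 96, max = 128;

	/* pre-patch this would have produced size = 96 + 128 = 224 > max */
	merge_window(&size, &async, max);
	printf("size=%lu async=%lu\n", size, async);	/* size=128 async=64 */
	return 0;
}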
diff --git a/mm/shmem.c b/mm/shmem.c
index 96bcc51fb9ec..06ebe17bb924 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1239,8 +1239,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
 	 * Charged back to the user (not to caller) when swap account is used.
 	 */
-	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
-			false);
+	error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
+			&memcg, false);
 	if (error)
 		goto out;
 	/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1713,7 +1713,7 @@ repeat:
 		goto failed;
 	}
 
-	error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
 			false);
 	if (!error) {
 		error = shmem_add_to_page_cache(page, mapping, index,
@@ -1819,7 +1819,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
 	if (sgp == SGP_WRITE)
 		__SetPageReferenced(page);
 
-	error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
+	error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
 			PageTransHuge(page));
 	if (error)
 		goto unacct;
@@ -2292,7 +2292,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageSwapBacked(page);
 	__SetPageUptodate(page);
 
-	ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
+	ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
 	if (ret)
 		goto out_release;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 18185ae4f223..8837b22c848d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3745,6 +3745,37 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 	}
 }
 
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
+				  gfp_t gfp_mask)
+{
+	struct swap_info_struct *si, *next;
+	if (!(gfp_mask & __GFP_IO) || !memcg)
+		return;
+
+	if (!blk_cgroup_congested())
+		return;
+
+	/*
+	 * We've already scheduled a throttle, avoid taking the global swap
+	 * lock.
+	 */
+	if (current->throttle_queue)
+		return;
+
+	spin_lock(&swap_avail_lock);
+	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
+				  avail_lists[node]) {
+		if (si->bdev) {
+			blkcg_schedule_throttle(bdev_get_queue(si->bdev),
+						true);
+			break;
+		}
+	}
+	spin_unlock(&swap_avail_lock);
+}
+#endif
+
 static int __init swapfile_init(void)
 {
 	int nid;
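
mem_cgroup_throttle_swaprate() only needs one block-backed swap device on the node to hang the throttle on, hence the break on the first si->bdev hit. A simplified userspace model of that selection, with the plist walk reduced to an array scan and all names invented:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct swap_dev {
	const char *name;
	bool is_block;	/* models si->bdev != NULL */
};

static void schedule_throttle(const struct swap_dev *dev)
{
	printf("throttle scheduled on %s\n", dev->name);
}

static void throttle_swaprate(const struct swap_dev *devs, size_t n,
			      bool congested, bool already_throttling)
{
	size_t i;

	if (!congested || already_throttling)
		return;
	for (i = 0; i < n; i++) {
		if (devs[i].is_block) {
			schedule_throttle(&devs[i]);
			break;	/* one device is enough */
		}
	}
}

int main(void)
{
	struct swap_dev devs[] = {
		{ "swapfile (file-backed)", false },
		{ "/dev/sda2", true },
	};

	throttle_swaprate(devs, 2, true, false);
	return 0;
}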