author     Linus Torvalds <torvalds@linux-foundation.org>   2019-05-07 21:14:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-05-07 21:14:36 -0400
commit     67a242223958d628f0ba33283668e3ddd192d057 (patch)
tree       a39e7039e9a2ef9ab46f8ba561175dbdc6101d11
parent     8b35ad6232c462b02e397e87ce702bcddd4ba543 (diff)
parent     b8753433fc611e23e31300e1d099001a08955c88 (diff)

Merge tag 'for-5.2/block-20190507' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:
 "Nothing major in this series, just fixes and improvements all over the
  map. This contains:

   - Series of fixes for sed-opal (David, Jonas)

   - Fixes and performance tweaks for BFQ (via Paolo)

   - Set of fixes for bcache (via Coly)

   - Set of fixes for md (via Song)

   - Enabling multi-page for passthrough requests (Ming)

   - Queue release fix series (Ming)

   - Device notification improvements (Martin)

   - Propagate underlying device rotational status in loop (Holger)

   - Removal of mtip32xx trim support, which has been disabled for years
     (Christoph)

   - Improvement and cleanup of nvme command handling (Christoph)

   - Add block SPDX tags (Christoph)

   - Cleanup/hardening of bio/bvec iteration (Christoph)

   - A few NVMe pull requests (Christoph)

   - Removal of CONFIG_LBDAF (Christoph)

   - Various little fixes here and there"

* tag 'for-5.2/block-20190507' of git://git.kernel.dk/linux-block: (164 commits)
  block: fix mismerge in bvec_advance
  block: don't drain in-progress dispatch in blk_cleanup_queue()
  blk-mq: move cancel of hctx->run_work into blk_mq_hw_sysfs_release
  blk-mq: always free hctx after request queue is freed
  blk-mq: split blk_mq_alloc_and_init_hctx into two parts
  blk-mq: free hw queue's resource in hctx's release handler
  blk-mq: move cancel of requeue_work into blk_mq_release
  blk-mq: grab .q_usage_counter when queuing request from plug code path
  block: fix function name in comment
  nvmet: protect discovery change log event list iteration
  nvme: mark nvme_core_init and nvme_core_exit static
  nvme: move command size checks to the core
  nvme-fabrics: check more command sizes
  nvme-pci: check more command sizes
  nvme-pci: remove an unneeded variable initialization
  nvme-pci: unquiesce admin queue on shutdown
  nvme-pci: shutdown on timeout during deletion
  nvme-pci: fix psdt field for single segment sgls
  nvme-multipath: don't print ANA group state by default
  nvme-multipath: split bios with the ns_head bio_set before submitting
  ...
-rw-r--r--  Documentation/block/bfq-iosched.txt | 29
-rw-r--r--  Documentation/block/null_blk.txt | 4
-rw-r--r--  Documentation/process/submit-checklist.rst | 27
-rw-r--r--  Documentation/translations/ja_JP/SubmitChecklist | 22
-rw-r--r--  arch/arc/configs/haps_hs_defconfig | 1
-rw-r--r--  arch/arc/configs/haps_hs_smp_defconfig | 1
-rw-r--r--  arch/arc/configs/nsim_700_defconfig | 1
-rw-r--r--  arch/arc/configs/nsim_hs_defconfig | 1
-rw-r--r--  arch/arc/configs/nsim_hs_smp_defconfig | 1
-rw-r--r--  arch/arc/configs/nsimosci_defconfig | 1
-rw-r--r--  arch/arc/configs/nsimosci_hs_defconfig | 1
-rw-r--r--  arch/arc/configs/nsimosci_hs_smp_defconfig | 1
-rw-r--r--  arch/arm/configs/aspeed_g4_defconfig | 1
-rw-r--r--  arch/arm/configs/aspeed_g5_defconfig | 1
-rw-r--r--  arch/arm/configs/at91_dt_defconfig | 1
-rw-r--r--  arch/arm/configs/clps711x_defconfig | 1
-rw-r--r--  arch/arm/configs/efm32_defconfig | 1
-rw-r--r--  arch/arm/configs/ezx_defconfig | 1
-rw-r--r--  arch/arm/configs/h3600_defconfig | 1
-rw-r--r--  arch/arm/configs/imote2_defconfig | 1
-rw-r--r--  arch/arm/configs/moxart_defconfig | 1
-rw-r--r--  arch/arm/configs/multi_v4t_defconfig | 1
-rw-r--r--  arch/arm/configs/omap1_defconfig | 1
-rw-r--r--  arch/arm/configs/stm32_defconfig | 1
-rw-r--r--  arch/arm/configs/u300_defconfig | 1
-rw-r--r--  arch/arm/configs/vexpress_defconfig | 1
-rw-r--r--  arch/m68k/configs/amcore_defconfig | 1
-rw-r--r--  arch/m68k/configs/m5475evb_defconfig | 1
-rw-r--r--  arch/m68k/configs/stmark2_defconfig | 1
-rw-r--r--  arch/mips/configs/ar7_defconfig | 1
-rw-r--r--  arch/mips/configs/decstation_defconfig | 1
-rw-r--r--  arch/mips/configs/decstation_r4k_defconfig | 1
-rw-r--r--  arch/mips/configs/loongson1b_defconfig | 1
-rw-r--r--  arch/mips/configs/loongson1c_defconfig | 1
-rw-r--r--  arch/mips/configs/rb532_defconfig | 1
-rw-r--r--  arch/mips/configs/rbtx49xx_defconfig | 1
-rw-r--r--  arch/parisc/configs/generic-32bit_defconfig | 1
-rw-r--r--  arch/sh/configs/apsh4ad0a_defconfig | 1
-rw-r--r--  arch/sh/configs/ecovec24-romimage_defconfig | 1
-rw-r--r--  arch/sh/configs/rsk7264_defconfig | 1
-rw-r--r--  arch/sh/configs/rsk7269_defconfig | 1
-rw-r--r--  arch/sh/configs/sh7785lcr_32bit_defconfig | 1
-rw-r--r--  block/Kconfig | 24
-rw-r--r--  block/badblocks.c | 10
-rw-r--r--  block/bfq-cgroup.c | 16
-rw-r--r--  block/bfq-iosched.c | 811
-rw-r--r--  block/bfq-iosched.h | 107
-rw-r--r--  block/bfq-wf2q.c | 23
-rw-r--r--  block/bio-integrity.c | 16
-rw-r--r--  block/bio.c | 286
-rw-r--r--  block/blk-cgroup.c | 1
-rw-r--r--  block/blk-core.c | 24
-rw-r--r--  block/blk-exec.c | 1
-rw-r--r--  block/blk-flush.c | 3
-rw-r--r--  block/blk-integrity.c | 16
-rw-r--r--  block/blk-iolatency.c | 1
-rw-r--r--  block/blk-merge.c | 147
-rw-r--r--  block/blk-mq-cpumap.c | 1
-rw-r--r--  block/blk-mq-debugfs.c | 13
-rw-r--r--  block/blk-mq-pci.c | 10
-rw-r--r--  block/blk-mq-rdma.c | 10
-rw-r--r--  block/blk-mq-sched.c | 13
-rw-r--r--  block/blk-mq-sysfs.c | 9
-rw-r--r--  block/blk-mq-tag.c | 1
-rw-r--r--  block/blk-mq-virtio.c | 10
-rw-r--r--  block/blk-mq.c | 192
-rw-r--r--  block/blk-mq.h | 2
-rw-r--r--  block/blk-rq-qos.c | 2
-rw-r--r--  block/blk-rq-qos.h | 1
-rw-r--r--  block/blk-settings.c | 17
-rw-r--r--  block/blk-stat.c | 1
-rw-r--r--  block/blk-sysfs.c | 31
-rw-r--r--  block/blk-timeout.c | 1
-rw-r--r--  block/blk-wbt.c | 1
-rw-r--r--  block/blk-zoned.c | 1
-rw-r--r--  block/blk.h | 2
-rw-r--r--  block/bounce.c | 3
-rw-r--r--  block/bsg-lib.c | 16
-rw-r--r--  block/bsg.c | 9
-rw-r--r--  block/elevator.c | 7
-rw-r--r--  block/genhd.c | 68
-rw-r--r--  block/ioctl.c | 1
-rw-r--r--  block/ioprio.c | 1
-rw-r--r--  block/kyber-iosched.c | 13
-rw-r--r--  block/mq-deadline.c | 1
-rw-r--r--  block/opal_proto.h | 12
-rw-r--r--  block/partition-generic.c | 7
-rw-r--r--  block/partitions/acorn.c | 7
-rw-r--r--  block/partitions/aix.h | 1
-rw-r--r--  block/partitions/amiga.h | 1
-rw-r--r--  block/partitions/efi.c | 16
-rw-r--r--  block/partitions/efi.h | 16
-rw-r--r--  block/partitions/ibm.h | 1
-rw-r--r--  block/partitions/karma.h | 1
-rw-r--r--  block/partitions/ldm.c | 16
-rw-r--r--  block/partitions/ldm.h | 16
-rw-r--r--  block/partitions/msdos.h | 1
-rw-r--r--  block/partitions/osf.h | 1
-rw-r--r--  block/partitions/sgi.h | 1
-rw-r--r--  block/partitions/sun.h | 1
-rw-r--r--  block/partitions/sysv68.h | 1
-rw-r--r--  block/partitions/ultrix.h | 1
-rw-r--r--  block/scsi_ioctl.c | 16
-rw-r--r--  block/sed-opal.c | 726
-rw-r--r--  block/t10-pi.c | 19
-rw-r--r--  drivers/block/amiflop.c | 1
-rw-r--r--  drivers/block/ataflop.c | 1
-rw-r--r--  drivers/block/brd.c | 7
-rw-r--r--  drivers/block/drbd/drbd_int.h | 5
-rw-r--r--  drivers/block/floppy.c | 1
-rw-r--r--  drivers/block/loop.c | 35
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 89
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.h | 17
-rw-r--r--  drivers/block/paride/pcd.c | 1
-rw-r--r--  drivers/block/paride/pd.c | 1
-rw-r--r--  drivers/block/paride/pf.c | 1
-rw-r--r--  drivers/block/pktcdvd.c | 1
-rw-r--r--  drivers/block/ps3disk.c | 4
-rw-r--r--  drivers/block/swim.c | 1
-rw-r--r--  drivers/block/swim3.c | 1
-rw-r--r--  drivers/block/virtio_blk.c | 3
-rw-r--r--  drivers/block/xsysace.c | 1
-rw-r--r--  drivers/cdrom/gdrom.c | 1
-rw-r--r--  drivers/ide/ide-cd.c | 1
-rw-r--r--  drivers/ide/ide-cd_ioctl.c | 5
-rw-r--r--  drivers/ide/ide-gd.c | 6
-rw-r--r--  drivers/md/bcache/alloc.c | 5
-rw-r--r--  drivers/md/bcache/btree.c | 12
-rw-r--r--  drivers/md/bcache/journal.c | 42
-rw-r--r--  drivers/md/bcache/request.c | 41
-rw-r--r--  drivers/md/bcache/request.h | 2
-rw-r--r--  drivers/md/bcache/super.c | 84
-rw-r--r--  drivers/md/bcache/sysfs.c | 2
-rw-r--r--  drivers/md/bcache/util.h | 26
-rw-r--r--  drivers/md/dm-crypt.c | 3
-rw-r--r--  drivers/md/dm-exception-store.h | 28
-rw-r--r--  drivers/md/dm-integrity.c | 8
-rw-r--r--  drivers/md/md-bitmap.c | 8
-rw-r--r--  drivers/md/md.c | 199
-rw-r--r--  drivers/md/md.h | 25
-rw-r--r--  drivers/md/raid1.c | 6
-rw-r--r--  drivers/md/raid5.c | 16
-rw-r--r--  drivers/nvdimm/pfn_devs.c | 4
-rw-r--r--  drivers/nvme/host/core.c | 44
-rw-r--r--  drivers/nvme/host/fabrics.c | 1
-rw-r--r--  drivers/nvme/host/multipath.c | 10
-rw-r--r--  drivers/nvme/host/nvme.h | 3
-rw-r--r--  drivers/nvme/host/pci.c | 300
-rw-r--r--  drivers/nvme/host/rdma.c | 10
-rw-r--r--  drivers/nvme/host/tcp.c | 21
-rw-r--r--  drivers/nvme/target/Kconfig | 1
-rw-r--r--  drivers/nvme/target/configfs.c | 4
-rw-r--r--  drivers/nvme/target/core.c | 38
-rw-r--r--  drivers/nvme/target/discovery.c | 9
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c | 16
-rw-r--r--  drivers/nvme/target/fc.c | 9
-rw-r--r--  drivers/nvme/target/io-cmd-bdev.c | 6
-rw-r--r--  drivers/nvme/target/io-cmd-file.c | 7
-rw-r--r--  drivers/nvme/target/loop.c | 22
-rw-r--r--  drivers/nvme/target/nvmet.h | 4
-rw-r--r--  drivers/nvme/target/rdma.c | 21
-rw-r--r--  drivers/nvme/target/tcp.c | 38
-rw-r--r--  drivers/scsi/sd.c | 33
-rw-r--r--  drivers/scsi/sr.c | 1
-rw-r--r--  drivers/staging/erofs/data.c | 3
-rw-r--r--  drivers/staging/erofs/unzip_vle.c | 3
-rw-r--r--  drivers/xen/biomerge.c | 5
-rw-r--r--  fs/block_dev.c | 6
-rw-r--r--  fs/btrfs/compression.c | 3
-rw-r--r--  fs/btrfs/disk-io.c | 4
-rw-r--r--  fs/btrfs/extent_io.c | 10
-rw-r--r--  fs/btrfs/inode.c | 8
-rw-r--r--  fs/btrfs/raid56.c | 3
-rw-r--r--  fs/crypto/bio.c | 3
-rw-r--r--  fs/direct-io.c | 3
-rw-r--r--  fs/ext4/page-io.c | 3
-rw-r--r--  fs/ext4/readpage.c | 3
-rw-r--r--  fs/ext4/resize.c | 2
-rw-r--r--  fs/ext4/super.c | 32
-rw-r--r--  fs/f2fs/data.c | 9
-rw-r--r--  fs/gfs2/Kconfig | 1
-rw-r--r--  fs/gfs2/lops.c | 3
-rw-r--r--  fs/gfs2/meta_io.c | 3
-rw-r--r--  fs/iomap.c | 6
-rw-r--r--  fs/mpage.c | 3
-rw-r--r--  fs/nfs/Kconfig | 1
-rw-r--r--  fs/ocfs2/super.c | 10
-rw-r--r--  fs/stack.c | 15
-rw-r--r--  fs/xfs/Kconfig | 1
-rw-r--r--  fs/xfs/xfs_aops.c | 3
-rw-r--r--  fs/xfs/xfs_super.c | 10
-rw-r--r--  include/linux/bio.h | 20
-rw-r--r--  include/linux/blk-mq-rdma.h | 1
-rw-r--r--  include/linux/blk-mq.h | 2
-rw-r--r--  include/linux/blk_types.h | 29
-rw-r--r--  include/linux/blkdev.h | 42
-rw-r--r--  include/linux/bsg-lib.h | 16
-rw-r--r--  include/linux/bvec.h | 36
-rw-r--r--  include/linux/genhd.h | 20
-rw-r--r--  include/linux/kernel.h | 14
-rw-r--r--  include/linux/nvme-rdma.h | 2
-rw-r--r--  include/linux/sed-opal.h | 10
-rw-r--r--  include/linux/types.h | 5
-rw-r--r--  include/uapi/linux/sed-opal.h | 11
-rw-r--r--  include/xen/xen.h | 4
-rw-r--r--  lib/Kconfig.debug | 1
-rw-r--r--  tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h | 4
207 files changed, 2312 insertions(+), 2257 deletions(-)
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 98a8dd5ee385..1a0f2ac02eb6 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -20,13 +20,26 @@ for that device, by setting low_latency to 0. See Section 3 for
 details on how to configure BFQ for the desired tradeoff between
 latency and throughput, or on how to maximize throughput.
 
-BFQ has a non-null overhead, which limits the maximum IOPS that a CPU
-can process for a device scheduled with BFQ. To give an idea of the
-limits on slow or average CPUs, here are, first, the limits of BFQ for
-three different CPUs, on, respectively, an average laptop, an old
-desktop, and a cheap embedded system, in case full hierarchical
-support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but
-CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2):
+As every I/O scheduler, BFQ adds some overhead to per-I/O-request
+processing. To give an idea of this overhead, the total,
+single-lock-protected, per-request processing time of BFQ---i.e., the
+sum of the execution times of the request insertion, dispatch and
+completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
+(dated CPU for notebooks; time measured with simple code
+instrumentation, and using the throughput-sync.sh script of the S
+suite [1], in performance-profiling mode). To put this result into
+context, the total, single-lock-protected, per-request execution time
+of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
+us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
+
+Scheduling overhead further limits the maximum IOPS that a CPU can
+process (already limited by the execution of the rest of the I/O
+stack). To give an idea of the limits with BFQ, on slow or average
+CPUs, here are, first, the limits of BFQ for three different CPUs, on,
+respectively, an average laptop, an old desktop, and a cheap embedded
+system, in case full hierarchical support is enabled (i.e.,
+CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not
+set (Section 4-2):
 - Intel i7-4850HQ: 400 KIOPS
 - AMD A8-3850: 250 KIOPS
 - ARM CortexTM-A53 Octa-core: 80 KIOPS
@@ -566,3 +579,5 @@ applications. Unset this tunable if you need/want to control weights.
     Slightly extended version:
     http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
     results.pdf
+
+[3] https://github.com/Algodev-github/S
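
Back-of-the-envelope check (not part of the patch): a per-request hook time
of 1.9 us by itself caps one CPU at roughly 1 / 1.9 us ~= 526 KIOPS for BFQ,
and 1 / 0.7 us ~= 1.4 MIOPS for mq-deadline. The measured limits quoted in
the first hunk above (400/250/80 KIOPS) are lower because the rest of the
blk-mq stack and the driver also consume CPU time per request.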
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 4cad1024fff7..41f0a3d33bbd 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -93,3 +93,7 @@ zoned=[0/1]: Default: 0
 
 zone_size=[MB]: Default: 256
   Per zone size when exposed as a zoned block device. Must be a power of two.
+
+zone_nr_conv=[nr_conv]: Default: 0
+  The number of conventional zones to create when block device is zoned. If
+  zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
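
To illustrate how a parameter like zone_nr_conv is typically wired up, here
is a minimal, hedged sketch of a kernel module parameter with the clamping
rule described above. The helper and argument names (clamp_zone_nr_conv,
nr_zones) are illustrative; this is not the actual drivers/block/null_blk
code:

    #include <linux/module.h>
    #include <linux/moduleparam.h>

    /* Hypothetical mirror of zone_nr_conv=[nr_conv], default 0. */
    static unsigned int zone_nr_conv;
    module_param(zone_nr_conv, uint, 0444);
    MODULE_PARM_DESC(zone_nr_conv,
                     "Number of conventional zones when the device is zoned. Default: 0");

    /*
     * Apply the documented rule at setup time: never let every zone be
     * conventional, so at least one sequential zone always remains.
     */
    static unsigned int clamp_zone_nr_conv(unsigned int nr_zones)
    {
            if (zone_nr_conv >= nr_zones)
                    zone_nr_conv = nr_zones - 1;
            return zone_nr_conv;
    }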
diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst
index 367353c54949..c88867b173d9 100644
--- a/Documentation/process/submit-checklist.rst
+++ b/Documentation/process/submit-checklist.rst
@@ -72,47 +72,44 @@ and elsewhere regarding submitting Linux kernel patches.
 13) Has been build- and runtime tested with and without ``CONFIG_SMP`` and
     ``CONFIG_PREEMPT.``
 
-14) If the patch affects IO/Disk, etc: has been tested with and without
-    ``CONFIG_LBDAF.``
-
-15) All codepaths have been exercised with all lockdep features enabled.
+16) All codepaths have been exercised with all lockdep features enabled.
 
-16) All new ``/proc`` entries are documented under ``Documentation/``
+17) All new ``/proc`` entries are documented under ``Documentation/``
 
-17) All new kernel boot parameters are documented in
+18) All new kernel boot parameters are documented in
     ``Documentation/admin-guide/kernel-parameters.rst``.
 
-18) All new module parameters are documented with ``MODULE_PARM_DESC()``
+19) All new module parameters are documented with ``MODULE_PARM_DESC()``
 
-19) All new userspace interfaces are documented in ``Documentation/ABI/``.
+20) All new userspace interfaces are documented in ``Documentation/ABI/``.
     See ``Documentation/ABI/README`` for more information.
     Patches that change userspace interfaces should be CCed to
     linux-api@vger.kernel.org.
 
-20) Check that it all passes ``make headers_check``.
+21) Check that it all passes ``make headers_check``.
 
-21) Has been checked with injection of at least slab and page-allocation
+22) Has been checked with injection of at least slab and page-allocation
     failures. See ``Documentation/fault-injection/``.
 
     If the new code is substantial, addition of subsystem-specific fault
     injection might be appropriate.
 
-22) Newly-added code has been compiled with ``gcc -W`` (use
+23) Newly-added code has been compiled with ``gcc -W`` (use
     ``make EXTRA_CFLAGS=-W``). This will generate lots of noise, but is good
     for finding bugs like "warning: comparison between signed and unsigned".
 
-23) Tested after it has been merged into the -mm patchset to make sure
+24) Tested after it has been merged into the -mm patchset to make sure
     that it still works with all of the other queued patches and various
     changes in the VM, VFS, and other subsystems.
 
-24) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a
+25) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a
     comment in the source code that explains the logic of what they are doing
     and why.
 
-25) If any ioctl's are added by the patch, then also update
+26) If any ioctl's are added by the patch, then also update
     ``Documentation/ioctl/ioctl-number.txt``.
 
-26) If your modified source code depends on or uses any of the kernel
+27) If your modified source code depends on or uses any of the kernel
     APIs or features that are related to the following ``Kconfig`` symbols,
     then test multiple builds with the related ``Kconfig`` symbols disabled
     and/or ``=m`` (if that option is available) [not all of these at the
diff --git a/Documentation/translations/ja_JP/SubmitChecklist b/Documentation/translations/ja_JP/SubmitChecklist
index 60c7c35ac517..b42220d3d46c 100644
--- a/Documentation/translations/ja_JP/SubmitChecklist
+++ b/Documentation/translations/ja_JP/SubmitChecklist
@@ -74,38 +74,34 @@ Linux カーネルパッチ投稿者向けチェックリスト
 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で
     ビルドした上、動作確認を行ってください。
 
-14: もしパッチがディスクのI/O性能などに影響を与えるようであれば、
-    'CONFIG_LBDAF'オプションを有効にした場合と無効にした場合の両方で
-    テストを実施してみてください。
+14: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。
 
-15: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。
-
-16: /proc に新しいエントリを追加した場合には、Documentation/ 配下に
+15: /proc に新しいエントリを追加した場合には、Documentation/ 配下に
     必ずドキュメントを追加してください。
 
-17: 新しいブートパラメータを追加した場合には、
+16: 新しいブートパラメータを追加した場合には、
     必ずDocumentation/admin-guide/kernel-parameters.rst に説明を追加してください。
 
-18: 新しくmoduleにパラメータを追加した場合には、MODULE_PARM_DESC()を
+17: 新しくmoduleにパラメータを追加した場合には、MODULE_PARM_DESC()を
     利用して必ずその説明を記述してください。
 
-19: 新しいuserspaceインタフェースを作成した場合には、Documentation/ABI/ に
+18: 新しいuserspaceインタフェースを作成した場合には、Documentation/ABI/ に
     Documentation/ABI/README を参考にして必ずドキュメントを追加してください。
 
-20: 'make headers_check'を実行して全く問題がないことを確認してください。
+19: 'make headers_check'を実行して全く問題がないことを確認してください。
 
-21: 少なくともslabアロケーションとpageアロケーションに失敗した場合の
+20: 少なくともslabアロケーションとpageアロケーションに失敗した場合の
     挙動について、fault-injectionを利用して確認してください。
     Documentation/fault-injection/ を参照してください。
 
     追加したコードがかなりの量であったならば、サブシステム特有の
     fault-injectionを追加したほうが良いかもしれません。
 
-22: 新たに追加したコードは、`gcc -W'でコンパイルしてください。
+21: 新たに追加したコードは、`gcc -W'でコンパイルしてください。
     このオプションは大量の不要なメッセージを出力しますが、
     "warning: comparison between signed and unsigned" のようなメッセージは、
     バグを見つけるのに役に立ちます。
 
-23: 投稿したパッチが -mm パッチセットにマージされた後、全ての既存のパッチや
+22: 投稿したパッチが -mm パッチセットにマージされた後、全ての既存のパッチや
     VM, VFS およびその他のサブシステムに関する様々な変更と、現時点でも共存
     できることを確認するテストを行ってください。
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index f56cc2070c11..b117e6c16d41 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -15,7 +15,6 @@ CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_SLAB=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index b6f2482c7e74..33a787c375e2 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -17,7 +17,6 @@ CONFIG_PERF_EVENTS=y
 CONFIG_SLAB=y
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index 318e4cd29629..de398c7b10b3 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -18,7 +18,6 @@ CONFIG_PERF_EVENTS=y
 CONFIG_ISA_ARCOMPACT=y
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig
index c15807b0e0c1..2dbd34a9ff07 100644
--- a/arch/arc/configs/nsim_hs_defconfig
+++ b/arch/arc/configs/nsim_hs_defconfig
@@ -20,7 +20,6 @@ CONFIG_MODULES=y
 CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig
index 65e983fd942b..c7135f1e2583 100644
--- a/arch/arc/configs/nsim_hs_smp_defconfig
+++ b/arch/arc/configs/nsim_hs_smp_defconfig
@@ -18,7 +18,6 @@ CONFIG_MODULES=y
 CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index 08c5b99ac341..385a71d3c478 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -18,7 +18,6 @@ CONFIG_PERF_EVENTS=y
 CONFIG_ISA_ARCOMPACT=y
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index 5b5e26d67955..248a2c3bdc12 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -17,7 +17,6 @@ CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 26af9b2f7fcb..1a4bc7b660fb 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -12,7 +12,6 @@ CONFIG_PERF_EVENTS=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_KPROBES=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig
index 1446262921b4..bdbade6af9c7 100644
--- a/arch/arm/configs/aspeed_g4_defconfig
+++ b/arch/arm/configs/aspeed_g4_defconfig
@@ -23,7 +23,6 @@ CONFIG_SLAB_FREELIST_RANDOM=y
 CONFIG_JUMP_LABEL=y
 CONFIG_STRICT_KERNEL_RWX=y
 CONFIG_GCC_PLUGINS=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEBUG_FS is not set
 # CONFIG_IOSCHED_DEADLINE is not set
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 02fa3a41add5..4bde84eae4eb 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -23,7 +23,6 @@ CONFIG_SLAB_FREELIST_RANDOM=y
 CONFIG_JUMP_LABEL=y
 CONFIG_STRICT_KERNEL_RWX=y
 CONFIG_GCC_PLUGINS=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEBUG_FS is not set
 # CONFIG_IOSCHED_DEADLINE is not set
diff --git a/arch/arm/configs/at91_dt_defconfig b/arch/arm/configs/at91_dt_defconfig
index e4b1be66b3f5..b7752929975c 100644
--- a/arch/arm/configs/at91_dt_defconfig
+++ b/arch/arm/configs/at91_dt_defconfig
@@ -9,7 +9,6 @@ CONFIG_EMBEDDED=y
 CONFIG_SLAB=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arm/configs/clps711x_defconfig b/arch/arm/configs/clps711x_defconfig
index fc105c9178cc..09ae750164e0 100644
--- a/arch/arm/configs/clps711x_defconfig
+++ b/arch/arm/configs/clps711x_defconfig
@@ -6,7 +6,6 @@ CONFIG_RD_LZMA=y
 CONFIG_EMBEDDED=y
 CONFIG_SLOB=y
 CONFIG_JUMP_LABEL=y
-# CONFIG_LBDAF is not set
 CONFIG_PARTITION_ADVANCED=y
 # CONFIG_IOSCHED_CFQ is not set
 CONFIG_ARCH_CLPS711X=y
diff --git a/arch/arm/configs/efm32_defconfig b/arch/arm/configs/efm32_defconfig
index ee42158f41ec..10ea92513a69 100644
--- a/arch/arm/configs/efm32_defconfig
+++ b/arch/arm/configs/efm32_defconfig
@@ -11,7 +11,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
 # CONFIG_SLUB_DEBUG is not set
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arm/configs/ezx_defconfig b/arch/arm/configs/ezx_defconfig
index 484e51fbd4a6..e3afca5bd9d6 100644
--- a/arch/arm/configs/ezx_defconfig
+++ b/arch/arm/configs/ezx_defconfig
@@ -13,7 +13,6 @@ CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_CFQ is not set
 CONFIG_ARCH_PXA=y
diff --git a/arch/arm/configs/h3600_defconfig b/arch/arm/configs/h3600_defconfig
index ebeca11faa48..175881b7da7c 100644
--- a/arch/arm/configs/h3600_defconfig
+++ b/arch/arm/configs/h3600_defconfig
@@ -4,7 +4,6 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arm/configs/imote2_defconfig b/arch/arm/configs/imote2_defconfig
index f204017c26b9..9b779e13e05d 100644
--- a/arch/arm/configs/imote2_defconfig
+++ b/arch/arm/configs/imote2_defconfig
@@ -12,7 +12,6 @@ CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_MODVERSIONS=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_CFQ is not set
 CONFIG_ARCH_PXA=y
diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig
index 078228a19339..6a11669fa536 100644
--- a/arch/arm/configs/moxart_defconfig
+++ b/arch/arm/configs/moxart_defconfig
@@ -15,7 +15,6 @@ CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 CONFIG_ARCH_MULTI_V4=y
diff --git a/arch/arm/configs/multi_v4t_defconfig b/arch/arm/configs/multi_v4t_defconfig
index 9a6390c172d6..eeea0c41138b 100644
--- a/arch/arm/configs/multi_v4t_defconfig
+++ b/arch/arm/configs/multi_v4t_defconfig
@@ -5,7 +5,6 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_EMBEDDED=y
 CONFIG_SLOB=y
 CONFIG_JUMP_LABEL=y
-# CONFIG_LBDAF is not set
 CONFIG_PARTITION_ADVANCED=y
 # CONFIG_IOSCHED_CFQ is not set
 CONFIG_ARCH_MULTI_V4T=y
diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig
index cfc00b0961ec..8448a7f407a4 100644
--- a/arch/arm/configs/omap1_defconfig
+++ b/arch/arm/configs/omap1_defconfig
@@ -17,7 +17,6 @@ CONFIG_OPROFILE=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig
index 0258ba891376..152321d2893e 100644
--- a/arch/arm/configs/stm32_defconfig
+++ b/arch/arm/configs/stm32_defconfig
@@ -13,7 +13,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
 # CONFIG_SLUB_DEBUG is not set
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arm/configs/u300_defconfig b/arch/arm/configs/u300_defconfig
index 36d77406e31b..831ba6a9ee8b 100644
--- a/arch/arm/configs/u300_defconfig
+++ b/arch/arm/configs/u300_defconfig
@@ -9,7 +9,6 @@ CONFIG_EXPERT=y
 # CONFIG_VM_EVENT_COUNTERS is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_PARTITION_ADVANCED=y
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/arm/configs/vexpress_defconfig b/arch/arm/configs/vexpress_defconfig
index 392ed3b3613c..484d77a7f589 100644
--- a/arch/arm/configs/vexpress_defconfig
+++ b/arch/arm/configs/vexpress_defconfig
@@ -14,7 +14,6 @@ CONFIG_PROFILING=y
 CONFIG_OPROFILE=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index 0857cdbfde0c..d5e683dd885d 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -12,7 +12,6 @@ CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_CFQ is not set
 # CONFIG_MMU is not set
diff --git a/arch/m68k/configs/m5475evb_defconfig b/arch/m68k/configs/m5475evb_defconfig
index 4f4ccd13c11b..434bd3750966 100644
--- a/arch/m68k/configs/m5475evb_defconfig
+++ b/arch/m68k/configs/m5475evb_defconfig
@@ -11,7 +11,6 @@ CONFIG_SYSCTL_SYSCALL=y
 # CONFIG_AIO is not set
 CONFIG_EMBEDDED=y
 CONFIG_MODULES=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig
index 69f23c7b0497..27fa9465d19d 100644
--- a/arch/m68k/configs/stmark2_defconfig
+++ b/arch/m68k/configs/stmark2_defconfig
@@ -17,7 +17,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
 # CONFIG_COMPAT_BRK is not set
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_BLK_CMDLINE_PARSER=y
 # CONFIG_MMU is not set
diff --git a/arch/mips/configs/ar7_defconfig b/arch/mips/configs/ar7_defconfig
index 9fbfb6e5c7d2..c83fdf649327 100644
--- a/arch/mips/configs/ar7_defconfig
+++ b/arch/mips/configs/ar7_defconfig
@@ -18,7 +18,6 @@ CONFIG_KEXEC=y
 # CONFIG_SECCOMP is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_BSD_DISKLABEL=y
diff --git a/arch/mips/configs/decstation_defconfig b/arch/mips/configs/decstation_defconfig
index 0c86ed86266a..30a6eafdb1d0 100644
--- a/arch/mips/configs/decstation_defconfig
+++ b/arch/mips/configs/decstation_defconfig
@@ -17,7 +17,6 @@ CONFIG_TC=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_SRCVERSION_ALL=y
-# CONFIG_LBDAF is not set
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_OSF_PARTITION=y
 # CONFIG_EFI_PARTITION is not set
diff --git a/arch/mips/configs/decstation_r4k_defconfig b/arch/mips/configs/decstation_r4k_defconfig
index 0e54ab2680ce..e2b58dbf4aa9 100644
--- a/arch/mips/configs/decstation_r4k_defconfig
+++ b/arch/mips/configs/decstation_r4k_defconfig
@@ -16,7 +16,6 @@ CONFIG_TC=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_SRCVERSION_ALL=y
-# CONFIG_LBDAF is not set
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_OSF_PARTITION=y
 # CONFIG_EFI_PARTITION is not set
diff --git a/arch/mips/configs/loongson1b_defconfig b/arch/mips/configs/loongson1b_defconfig
index b064d68a5424..aa7e98c5f5fc 100644
--- a/arch/mips/configs/loongson1b_defconfig
+++ b/arch/mips/configs/loongson1b_defconfig
@@ -19,7 +19,6 @@ CONFIG_MACH_LOONGSON32=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODVERSIONS=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_NET=y
diff --git a/arch/mips/configs/loongson1c_defconfig b/arch/mips/configs/loongson1c_defconfig
index 5d76559b56cd..520e7ef35383 100644
--- a/arch/mips/configs/loongson1c_defconfig
+++ b/arch/mips/configs/loongson1c_defconfig
@@ -20,7 +20,6 @@ CONFIG_LOONGSON1_LS1C=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODVERSIONS=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
 CONFIG_NET=y
diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig
index 7befe05fd813..ed1038f62a2c 100644
--- a/arch/mips/configs/rb532_defconfig
+++ b/arch/mips/configs/rb532_defconfig
@@ -19,7 +19,6 @@ CONFIG_PCI=y
 # CONFIG_PCI_QUIRKS is not set
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_MAC_PARTITION=y
diff --git a/arch/mips/configs/rbtx49xx_defconfig b/arch/mips/configs/rbtx49xx_defconfig
index 50a2c9ad583f..b0f0c5f9ad9d 100644
--- a/arch/mips/configs/rbtx49xx_defconfig
+++ b/arch/mips/configs/rbtx49xx_defconfig
@@ -17,7 +17,6 @@ CONFIG_TOSHIBA_RBTX4938_MPLEX_KEEP=y
 CONFIG_PCI=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_NET=y
 CONFIG_PACKET=y
diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig
index 37ae4b57c001..a8f9bbef0975 100644
--- a/arch/parisc/configs/generic-32bit_defconfig
+++ b/arch/parisc/configs/generic-32bit_defconfig
@@ -14,7 +14,6 @@ CONFIG_SLAB=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_PA7100LC=y
 CONFIG_SMP=y
diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig
index 825c641726c4..d0d9ebc7165b 100644
--- a/arch/sh/configs/apsh4ad0a_defconfig
+++ b/arch/sh/configs/apsh4ad0a_defconfig
@@ -19,7 +19,6 @@ CONFIG_SLAB=y
 CONFIG_PROFILING=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_CFQ_GROUP_IOSCHED=y
 CONFIG_CPU_SUBTYPE_SH7786=y
diff --git a/arch/sh/configs/ecovec24-romimage_defconfig b/arch/sh/configs/ecovec24-romimage_defconfig
index 0c5dfccbfe37..bdb61d1d0127 100644
--- a/arch/sh/configs/ecovec24-romimage_defconfig
+++ b/arch/sh/configs/ecovec24-romimage_defconfig
@@ -7,7 +7,6 @@ CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 # CONFIG_KALLSYMS is not set
 CONFIG_SLAB=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_CPU_SUBTYPE_SH7724=y
 CONFIG_MEMORY_SIZE=0x10000000
diff --git a/arch/sh/configs/rsk7264_defconfig b/arch/sh/configs/rsk7264_defconfig
index 2b9b731fc86b..ad003ee469ea 100644
--- a/arch/sh/configs/rsk7264_defconfig
+++ b/arch/sh/configs/rsk7264_defconfig
@@ -16,7 +16,6 @@ CONFIG_PERF_COUNTERS=y
 CONFIG_SLAB=y
 CONFIG_MMAP_ALLOW_UNINITIALIZED=y
 CONFIG_PROFILING=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_PARTITION_ADVANCED=y
 # CONFIG_IOSCHED_DEADLINE is not set
diff --git a/arch/sh/configs/rsk7269_defconfig b/arch/sh/configs/rsk7269_defconfig
index d041f7bcb84c..27fc01d58cf8 100644
--- a/arch/sh/configs/rsk7269_defconfig
+++ b/arch/sh/configs/rsk7269_defconfig
@@ -3,7 +3,6 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_EMBEDDED=y
 # CONFIG_VM_EVENT_COUNTERS is not set
 CONFIG_SLAB=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig
index 2ddf5ca7094e..a89ccc15af23 100644
--- a/arch/sh/configs/sh7785lcr_32bit_defconfig
+++ b/arch/sh/configs/sh7785lcr_32bit_defconfig
@@ -11,7 +11,6 @@ CONFIG_PROFILING=y
 CONFIG_GCOV_KERNEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-# CONFIG_LBDAF is not set
 # CONFIG_BLK_DEV_BSG is not set
 CONFIG_CPU_SUBTYPE_SH7785=y
 CONFIG_MEMORY_START=0x40000000
diff --git a/block/Kconfig b/block/Kconfig
index 028bc085dac8..1b220101a9cb 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,30 +26,6 @@ menuconfig BLOCK
 
 if BLOCK
 
-config LBDAF
-	bool "Support for large (2TB+) block devices and files"
-	depends on !64BIT
-	default y
-	help
-	  Enable block devices or files of size 2TB and larger.
-
-	  This option is required to support the full capacity of large
-	  (2TB+) block devices, including RAID, disk, Network Block Device,
-	  Logical Volume Manager (LVM) and loopback.
-
-	  This option also enables support for single files larger than
-	  2TB.
-
-	  The ext4 filesystem requires that this feature be enabled in
-	  order to support filesystems that have the huge_file feature
-	  enabled. Otherwise, it will refuse to mount in the read-write
-	  mode any filesystems that use the huge_file feature, which is
-	  enabled by default by mke2fs.ext4.
-
-	  The GFS2 filesystem also requires this feature.
-
-	  If unsure, say Y.
-
 config BLK_SCSI_REQUEST
 	bool
 
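
For context: what CONFIG_LBDAF actually gated was the width of the
block-layer sector types on 32-bit builds; with the option removed, sector_t
and blkcnt_t are unconditionally 64-bit (the include/linux/types.h change
shows up in the diffstat above). A paraphrased sketch of the pre-removal
definition, reconstructed from memory rather than taken from this patch:

    /* Approximate pre-5.2 shape of include/linux/types.h (paraphrased): */
    #ifdef CONFIG_LBDAF
    typedef u64 sector_t;
    typedef u64 blkcnt_t;
    #else
    typedef unsigned long sector_t;  /* 32-bit kernels: ~2 TiB addressing limit */
    typedef unsigned long blkcnt_t;
    #endif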
diff --git a/block/badblocks.c b/block/badblocks.c
index 91f7bcf979d3..2e5f5697db35 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -1,18 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Bad block management
  *
  * - Heavily based on MD badblocks code from Neil Brown
  *
  * Copyright (c) 2015, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
  */
 
 #include <linux/badblocks.h>
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index c6113af31960..b3796a40a61a 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * cgroups support for the BFQ I/O scheduler.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
  */
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -578,7 +569,8 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqg_and_blkg_get(bfqg);
 
 	if (bfq_bfqq_busy(bfqq)) {
-		bfq_pos_tree_add_move(bfqd, bfqq);
+		if (unlikely(!bfqd->nonrot_with_queueing))
+			bfq_pos_tree_add_move(bfqd, bfqq);
 		bfq_activate_bfqq(bfqd, bfqq);
 	}
 
@@ -1102,7 +1094,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
 	},
 #endif /* CONFIG_DEBUG_BLK_CGROUP */
 
-	/* the same statictics which cover the bfqg and its descendants */
+	/* the same statistics which cover the bfqg and its descendants */
 	{
 		.name = "bfq.io_service_bytes_recursive",
 		.private = (unsigned long)&blkcg_policy_bfq,
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 5ba1e0d841b4..f8d430f88d25 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Budget Fair Queueing (BFQ) I/O scheduler.
  *
@@ -12,16 +13,6 @@
  *
  * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
  * BFQ is a proportional-share I/O scheduler, with some extra
  * low-latency capabilities. BFQ also supports full hierarchical
  * scheduling through cgroups. Next paragraphs provide an introduction
@@ -189,7 +180,7 @@ static const int bfq_default_max_budget = 16 * 1024;
 /*
  * When a sync request is dispatched, the queue that contains that
  * request, and all the ancestor entities of that queue, are charged
- * with the number of sectors of the request. In constrast, if the
+ * with the number of sectors of the request. In contrast, if the
  * request is async, then the queue and its ancestor entities are
  * charged with the number of sectors of the request, multiplied by
  * the factor below. This throttles the bandwidth for async I/O,
@@ -217,7 +208,7 @@ const int bfq_timeout = HZ / 8;
  * queue merging.
  *
  * As can be deduced from the low time limit below, queue merging, if
- * successful, happens at the very beggining of the I/O of the involved
+ * successful, happens at the very beginning of the I/O of the involved
  * cooperating processes, as a consequence of the arrival of the very
  * first requests from each cooperator. After that, there is very
  * little chance to find cooperators.
@@ -242,6 +233,14 @@ static struct kmem_cache *bfq_pool;
 					 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
 #define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
 #define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
+/*
+ * Sync random I/O is likely to be confused with soft real-time I/O,
+ * because it is characterized by limited throughput and apparently
+ * isochronous arrival pattern. To avoid false positives, queues
+ * containing only random (seeky) I/O are prevented from being tagged
+ * as soft real-time.
+ */
+#define BFQQ_TOTALLY_SEEKY(bfqq)	(bfqq->seek_history & -1)
 
 /* Min number of samples required to perform peak-rate update */
 #define BFQ_RATE_MIN_SAMPLES	32
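
To make the new macro concrete: seek_history acts as a 32-bit shift register,
one bit per recently observed request, and a queue counts as totally seeky
only when every recorded request was seeky. A minimal stand-alone sketch of
that pattern (illustrative names, not the BFQ code itself):

    #include <stdbool.h>
    #include <stdint.h>

    struct queue_hist {
            uint32_t seek_history;  /* one bit per recent request, 1 = seeky */
    };

    /* Shift in one observation per request. */
    static void record_request(struct queue_hist *q, bool seeky)
    {
            q->seek_history = (q->seek_history << 1) | (seeky ? 1 : 0);
    }

    /*
     * "Totally seeky": all 32 recorded requests were seeky, the condition
     * the comment above uses to veto soft real-time tagging.
     */
    static bool totally_seeky(const struct queue_hist *q)
    {
            return q->seek_history == UINT32_MAX;
    }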
@@ -433,7 +432,7 @@ void bfq_schedule_dispatch(struct bfq_data *bfqd)
 
 /*
  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
- * We choose the request that is closesr to the head right now. Distance
+ * We choose the request that is closer to the head right now. Distance
  * behind the head is penalized and only allowed to a certain extent.
  */
 static struct request *bfq_choose_req(struct bfq_data *bfqd,
@@ -595,7 +594,16 @@ static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
 				       bfq_merge_time_limit);
 }
 
-void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+/*
+ * The following function is not marked as __cold because it is
+ * actually cold, but for the same performance goal described in the
+ * comments on the likely() at the beginning of
+ * bfq_setup_cooperator(). Unexpectedly, to reach an even lower
+ * execution time for the case where this function is not invoked, we
+ * had to add an unlikely() in each involved if().
+ */
+void __cold
+bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	struct rb_node **p, *parent;
 	struct bfq_queue *__bfqq;
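
This hunk, together with the bfq-cgroup.c hunk earlier that wraps the call in
unlikely(), is an instance of a common cold-path pattern: annotate the rarely
executed callee __cold and guard each call site with unlikely(), so the fast
path stays small and well predicted. A minimal sketch under assumed names
(not the BFQ code):

    #include <linux/compiler.h>

    struct dev_state {
            bool nonrot_with_queueing;
    };

    /* Rarely needed housekeeping: optimized for size, kept out of hot text. */
    static void __cold rebuild_position_tree(struct dev_state *d)
    {
            /* expensive, infrequent work */
    }

    static void on_queue_becomes_busy(struct dev_state *d)
    {
            if (unlikely(!d->nonrot_with_queueing))
                    rebuild_position_tree(d);
            /* hot path continues */
    }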
@@ -629,12 +637,19 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 }
 
 /*
- * The following function returns true if every queue must receive the
- * same share of the throughput (this condition is used when deciding
- * whether idling may be disabled, see the comments in the function
- * bfq_better_to_idle()).
+ * The following function returns false either if every active queue
+ * must receive the same share of the throughput (symmetric scenario),
+ * or, as a special case, if bfqq must receive a share of the
+ * throughput lower than or equal to the share that every other active
+ * queue must receive. If bfqq does sync I/O, then these are the only
+ * two cases where bfqq happens to be guaranteed its share of the
+ * throughput even if I/O dispatching is not plugged when bfqq remains
+ * temporarily empty (for more details, see the comments in the
+ * function bfq_better_to_idle()). For this reason, the return value
+ * of this function is used to check whether I/O-dispatch plugging can
+ * be avoided.
  *
- * Such a scenario occurs when:
+ * The above first case (symmetric scenario) occurs when:
  * 1) all active queues have the same weight,
  * 2) all active queues belong to the same I/O-priority class,
  * 3) all active groups at the same level in the groups tree have the same
@@ -654,30 +669,36 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
654 * support or the cgroups interface are not enabled, thus no state 669 * support or the cgroups interface are not enabled, thus no state
655 * needs to be maintained in this case. 670 * needs to be maintained in this case.
656 */ 671 */
657static bool bfq_symmetric_scenario(struct bfq_data *bfqd) 672static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
673 struct bfq_queue *bfqq)
658{ 674{
675 bool smallest_weight = bfqq &&
676 bfqq->weight_counter &&
677 bfqq->weight_counter ==
678 container_of(
679 rb_first_cached(&bfqd->queue_weights_tree),
680 struct bfq_weight_counter,
681 weights_node);
682
659 /* 683 /*
660 * For queue weights to differ, queue_weights_tree must contain 684 * For queue weights to differ, queue_weights_tree must contain
661 * at least two nodes. 685 * at least two nodes.
662 */ 686 */
663 bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && 687 bool varied_queue_weights = !smallest_weight &&
664 (bfqd->queue_weights_tree.rb_node->rb_left || 688 !RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
665 bfqd->queue_weights_tree.rb_node->rb_right); 689 (bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
690 bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
666 691
667 bool multiple_classes_busy = 692 bool multiple_classes_busy =
668 (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || 693 (bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
669 (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || 694 (bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
670 (bfqd->busy_queues[1] && bfqd->busy_queues[2]); 695 (bfqd->busy_queues[1] && bfqd->busy_queues[2]);
671 696
672 /* 697 return varied_queue_weights || multiple_classes_busy
673 * For queue weights to differ, queue_weights_tree must contain
674 * at least two nodes.
675 */
676 return !(varied_queue_weights || multiple_classes_busy
677#ifdef CONFIG_BFQ_GROUP_IOSCHED 698#ifdef CONFIG_BFQ_GROUP_IOSCHED
678 || bfqd->num_groups_with_pending_reqs > 0 699 || bfqd->num_groups_with_pending_reqs > 0
679#endif 700#endif
680 ); 701 ;
681} 702}
682 703
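As an aid for following the predicate above, here is a minimal standalone C sketch of the same asymmetry test over simplified, hypothetical state (not the kernel's actual data structures). It mirrors the three checks: differing queue weights, more than one busy I/O-priority class and, with group scheduling, groups with pending requests, plus the new smallest-weight shortcut for bfqq.

#include <stdbool.h>

struct scenario_state {
	bool bfqq_has_smallest_weight;  /* bfqq's counter is the tree's leftmost node */
	int  distinct_queue_weights;    /* nodes in queue_weights_tree */
	int  busy_queues[3];            /* busy queues per I/O-priority class */
	int  groups_with_pending_reqs;  /* 0 when group scheduling is unused */
};

/* Returns true when I/O-dispatch plugging is needed to protect bfqq's share. */
static bool asymmetric_scenario(const struct scenario_state *s)
{
	bool varied_queue_weights = !s->bfqq_has_smallest_weight &&
				    s->distinct_queue_weights > 1;
	bool multiple_classes_busy =
		(s->busy_queues[0] && s->busy_queues[1]) ||
		(s->busy_queues[0] && s->busy_queues[2]) ||
		(s->busy_queues[1] && s->busy_queues[2]);

	return varied_queue_weights || multiple_classes_busy ||
	       s->groups_with_pending_reqs > 0;
}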
683/* 704/*
@@ -694,10 +715,11 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
694 * should be low too. 715 * should be low too.
695 */ 716 */
696void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, 717void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
697 struct rb_root *root) 718 struct rb_root_cached *root)
698{ 719{
699 struct bfq_entity *entity = &bfqq->entity; 720 struct bfq_entity *entity = &bfqq->entity;
700 struct rb_node **new = &(root->rb_node), *parent = NULL; 721 struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
722 bool leftmost = true;
701 723
702 /* 724 /*
703 * Do not insert if the queue is already associated with a 725 * Do not insert if the queue is already associated with a
@@ -726,8 +748,10 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
726 } 748 }
727 if (entity->weight < __counter->weight) 749 if (entity->weight < __counter->weight)
728 new = &((*new)->rb_left); 750 new = &((*new)->rb_left);
729 else 751 else {
730 new = &((*new)->rb_right); 752 new = &((*new)->rb_right);
753 leftmost = false;
754 }
731 } 755 }
732 756
733 bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), 757 bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
@@ -736,7 +760,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
736 /* 760 /*
737 * In the unlucky event of an allocation failure, we just 761 * In the unlucky event of an allocation failure, we just
738 * exit. This will cause the weight of queue to not be 762 * exit. This will cause the weight of queue to not be
739 * considered in bfq_symmetric_scenario, which, in its turn, 763 * considered in bfq_asymmetric_scenario, which, in its turn,
740 * causes the scenario to be deemed wrongly symmetric in case 764 * causes the scenario to be deemed wrongly symmetric in case
741 * bfqq's weight would have been the only weight making the 765 * bfqq's weight would have been the only weight making the
742 * scenario asymmetric. On the bright side, no unbalance will 766 * scenario asymmetric. On the bright side, no unbalance will
@@ -750,7 +774,8 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
750 774
751 bfqq->weight_counter->weight = entity->weight; 775 bfqq->weight_counter->weight = entity->weight;
752 rb_link_node(&bfqq->weight_counter->weights_node, parent, new); 776 rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
753 rb_insert_color(&bfqq->weight_counter->weights_node, root); 777 rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
778 leftmost);
754 779
755inc_counter: 780inc_counter:
756 bfqq->weight_counter->num_active++; 781 bfqq->weight_counter->num_active++;
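For reference, the hunk above switches bfq_weights_tree_add() to the kernel's cached rbtree; the illustration below shows the generic insert pattern with leftmost tracking (the node type and function name are hypothetical, the rbtree calls are the real <linux/rbtree.h> interface).

#include <linux/rbtree.h>
#include <linux/types.h>

struct example_node {
	struct rb_node rb;
	unsigned int key;
};

/* Illustration only: insert into a struct rb_root_cached while tracking
 * whether the new node becomes the leftmost (i.e. smallest) one. */
static void example_insert(struct rb_root_cached *root, struct example_node *node)
{
	struct rb_node **new = &root->rb_root.rb_node, *parent = NULL;
	bool leftmost = true;

	while (*new) {
		struct example_node *cur = rb_entry(*new, struct example_node, rb);

		parent = *new;
		if (node->key < cur->key) {
			new = &(*new)->rb_left;
		} else {
			new = &(*new)->rb_right;
			leftmost = false;	/* went right at least once */
		}
	}

	rb_link_node(&node->rb, parent, new);
	rb_insert_color_cached(&node->rb, root, leftmost);
}

The smallest element then stays reachable in O(1) through rb_first_cached(root), which is what the new smallest_weight check in bfq_asymmetric_scenario() relies on.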
@@ -765,7 +790,7 @@ inc_counter:
765 */ 790 */
766void __bfq_weights_tree_remove(struct bfq_data *bfqd, 791void __bfq_weights_tree_remove(struct bfq_data *bfqd,
767 struct bfq_queue *bfqq, 792 struct bfq_queue *bfqq,
768 struct rb_root *root) 793 struct rb_root_cached *root)
769{ 794{
770 if (!bfqq->weight_counter) 795 if (!bfqq->weight_counter)
771 return; 796 return;
@@ -774,7 +799,7 @@ void __bfq_weights_tree_remove(struct bfq_data *bfqd,
774 if (bfqq->weight_counter->num_active > 0) 799 if (bfqq->weight_counter->num_active > 0)
775 goto reset_entity_pointer; 800 goto reset_entity_pointer;
776 801
777 rb_erase(&bfqq->weight_counter->weights_node, root); 802 rb_erase_cached(&bfqq->weight_counter->weights_node, root);
778 kfree(bfqq->weight_counter); 803 kfree(bfqq->weight_counter);
779 804
780reset_entity_pointer: 805reset_entity_pointer:
@@ -889,7 +914,7 @@ static unsigned long bfq_serv_to_charge(struct request *rq,
889 struct bfq_queue *bfqq) 914 struct bfq_queue *bfqq)
890{ 915{
891 if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || 916 if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
892 !bfq_symmetric_scenario(bfqq->bfqd)) 917 bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
893 return blk_rq_sectors(rq); 918 return blk_rq_sectors(rq);
894 919
895 return blk_rq_sectors(rq) * bfq_async_charge_factor; 920 return blk_rq_sectors(rq) * bfq_async_charge_factor;
@@ -955,7 +980,7 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
955 * of several files 980 * of several files
956 * mplayer took 23 seconds to start, if constantly weight-raised. 981 * mplayer took 23 seconds to start, if constantly weight-raised.
957 * 982 *
958 * As for higher values than that accomodating the above bad 983 * As for higher values than that accommodating the above bad
959 * scenario, tests show that higher values would often yield 984 * scenario, tests show that higher values would often yield
960 * the opposite of the desired result, i.e., would worsen 985 * the opposite of the desired result, i.e., would worsen
961 * responsiveness by allowing non-interactive applications to 986 * responsiveness by allowing non-interactive applications to
@@ -994,6 +1019,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
994 else 1019 else
995 bfq_clear_bfqq_IO_bound(bfqq); 1020 bfq_clear_bfqq_IO_bound(bfqq);
996 1021
1022 bfqq->entity.new_weight = bic->saved_weight;
997 bfqq->ttime = bic->saved_ttime; 1023 bfqq->ttime = bic->saved_ttime;
998 bfqq->wr_coeff = bic->saved_wr_coeff; 1024 bfqq->wr_coeff = bic->saved_wr_coeff;
999 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; 1025 bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
@@ -1041,8 +1067,18 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1041 1067
1042 hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) 1068 hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
1043 hlist_del_init(&item->burst_list_node); 1069 hlist_del_init(&item->burst_list_node);
1044 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); 1070
1045 bfqd->burst_size = 1; 1071 /*
1072 * Start the creation of a new burst list only if there is no
1073 * active queue. See comments on the conditional invocation of
1074 * bfq_handle_burst().
1075 */
1076 if (bfq_tot_busy_queues(bfqd) == 0) {
1077 hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
1078 bfqd->burst_size = 1;
1079 } else
1080 bfqd->burst_size = 0;
1081
1046 bfqd->burst_parent_entity = bfqq->entity.parent; 1082 bfqd->burst_parent_entity = bfqq->entity.parent;
1047} 1083}
1048 1084
@@ -1098,7 +1134,8 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1098 * many parallel threads/processes. Examples are systemd during boot, 1134 * many parallel threads/processes. Examples are systemd during boot,
1099 * or git grep. To help these processes get their job done as soon as 1135 * or git grep. To help these processes get their job done as soon as
1100 * possible, it is usually better to not grant either weight-raising 1136 * possible, it is usually better to not grant either weight-raising
1101 * or device idling to their queues. 1137 * or device idling to their queues, unless these queues must be
1138 * protected from the I/O flowing through other active queues.
1102 * 1139 *
1103 * In this comment we describe, firstly, the reasons why this fact 1140 * In this comment we describe, firstly, the reasons why this fact
1104 * holds, and, secondly, the next function, which implements the main 1141 * holds, and, secondly, the next function, which implements the main
@@ -1110,7 +1147,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1110 * cumulatively served, the sooner the target job of these queues gets 1147 * cumulatively served, the sooner the target job of these queues gets
1111 * completed. As a consequence, weight-raising any of these queues, 1148 * completed. As a consequence, weight-raising any of these queues,
1112 * which also implies idling the device for it, is almost always 1149 * which also implies idling the device for it, is almost always
1113 * counterproductive. In most cases it just lowers throughput. 1150 * counterproductive, unless there are other active queues to isolate
1151 * these new queues from. If there are no other active queues, then
1152 * weight-raising these new queues just lowers throughput in most
1153 * cases.
1114 * 1154 *
1115 * On the other hand, a burst of queue creations may be caused also by 1155 * On the other hand, a burst of queue creations may be caused also by
1116 * the start of an application that does not consist of a lot of 1156 * the start of an application that does not consist of a lot of
@@ -1144,14 +1184,16 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1144 * are very rare. They typically occur if some service happens to 1184 * are very rare. They typically occur if some service happens to
1145 * start doing I/O exactly when the interactive task starts. 1185 * start doing I/O exactly when the interactive task starts.
1146 * 1186 *
1147 * Turning back to the next function, it implements all the steps 1187 * Turning back to the next function, it is invoked only if there are
1148 * needed to detect the occurrence of a large burst and to properly 1188 * no active queues (apart from active queues that would belong to the
1149 * mark all the queues belonging to it (so that they can then be 1189 * same possible burst that bfqq would belong to), and it implements all
1150 * treated in a different way). This goal is achieved by maintaining a 1190 * the steps needed to detect the occurrence of a large burst and to
1151 * "burst list" that holds, temporarily, the queues that belong to the 1191 * properly mark all the queues belonging to it (so that they can then
1152 * burst in progress. The list is then used to mark these queues as 1192 * be treated in a different way). This goal is achieved by
1153 * belonging to a large burst if the burst does become large. The main 1193 * maintaining a "burst list" that holds, temporarily, the queues that
1154 * steps are the following. 1194 * belong to the burst in progress. The list is then used to mark
1195 * these queues as belonging to a large burst if the burst does become
1196 * large. The main steps are the following.
1155 * 1197 *
1156 * . when the very first queue is created, the queue is inserted into the 1198 * . when the very first queue is created, the queue is inserted into the
1157 * list (as it could be the first queue in a possible burst) 1199 * list (as it could be the first queue in a possible burst)
@@ -1596,6 +1638,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1596 */ 1638 */
1597 in_burst = bfq_bfqq_in_large_burst(bfqq); 1639 in_burst = bfq_bfqq_in_large_burst(bfqq);
1598 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && 1640 soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
1641 !BFQQ_TOTALLY_SEEKY(bfqq) &&
1599 !in_burst && 1642 !in_burst &&
1600 time_is_before_jiffies(bfqq->soft_rt_next_start) && 1643 time_is_before_jiffies(bfqq->soft_rt_next_start) &&
1601 bfqq->dispatched == 0; 1644 bfqq->dispatched == 0;
@@ -1704,6 +1747,123 @@ static void bfq_add_request(struct request *rq)
1704 bfqq->queued[rq_is_sync(rq)]++; 1747 bfqq->queued[rq_is_sync(rq)]++;
1705 bfqd->queued++; 1748 bfqd->queued++;
1706 1749
1750 if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
1751 /*
1752 * Periodically reset inject limit, to make sure that
1753 * the latter eventually drops in case workload
1754 * changes, see step (3) in the comments on
1755 * bfq_update_inject_limit().
1756 */
1757 if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
1758 msecs_to_jiffies(1000))) {
1759 /* invalidate baseline total service time */
1760 bfqq->last_serv_time_ns = 0;
1761
1762 /*
1763 * Reset pointer in case we are waiting for
1764 * some request completion.
1765 */
1766 bfqd->waited_rq = NULL;
1767
1768 /*
1769 * If bfqq has a short think time, then start
1770 * by setting the inject limit to 0
1771 * prudentially, because the service time of
1772 * an injected I/O request may be higher than
1773 * the think time of bfqq, and therefore, if
1774 * one request was injected when bfqq remains
1775 * empty, this injected request might delay
1776 * the service of the next I/O request for
1777 * bfqq significantly. In case bfqq can
1778 * actually tolerate some injection, then the
1779 * adaptive update will however raise the
1780 * limit soon. This lucky circumstance holds
1781 * exactly because bfqq has a short think
1782 * time, and thus, after remaining empty, is
1783 * likely to get new I/O enqueued---and then
1784 * completed---before being expired. This is
1785 * the very pattern that gives the
1786 * limit-update algorithm the chance to
1787 * measure the effect of injection on request
1788 * service times, and then to update the limit
1789 * accordingly.
1790 *
1791 * On the opposite end, if bfqq has a long
1792 * think time, then start directly from 1,
1793 * because:
1794 * a) on the bright side, keeping at most one
1795 * request in service in the drive is unlikely
1796 * to cause any harm to the latency of bfqq's
1797 * requests, as the service time of a single
1798 * request is likely to be lower than the
1799 * think time of bfqq;
1800 * b) on the downside, after becoming empty,
1801 * bfqq is likely to expire before getting its
1802 * next request. With this request arrival
1803 * pattern, it is very hard to sample total
1804 * service times and update the inject limit
1805 * accordingly (see comments on
1806 * bfq_update_inject_limit()). So the limit is
1807 * likely to be never, or at least seldom,
1808 * updated. As a consequence, by setting the
1809 * limit to 1, we make sure that injection can
1810 * eventually occur with bfqq. On the downside, this
1811 * proactive step further reduces chances to
1812 * actually compute the baseline total service
1813 * time. Thus it reduces chances to execute the
1814 * limit-update algorithm and possibly raise the
1815 * limit to more than 1.
1816 */
1817 if (bfq_bfqq_has_short_ttime(bfqq))
1818 bfqq->inject_limit = 0;
1819 else
1820 bfqq->inject_limit = 1;
1821 bfqq->decrease_time_jif = jiffies;
1822 }
1823
1824 /*
1825 * The following conditions must hold to setup a new
1826 * sampling of total service time, and then a new
1827 * update of the inject limit:
1828 * - bfqq is in service, because the total service
1829 * time is evaluated only for the I/O requests of
1830 * the queues in service;
1831 * - this is the right occasion to compute or to
1832 * lower the baseline total service time, because
1833 * there are actually no requests in the drive,
1834 * or
1835 * the baseline total service time is available, and
1836 * this is the right occasion to compute the other
1837 * quantity needed to update the inject limit, i.e.,
1838 * the total service time caused by the amount of
1839 * injection allowed by the current value of the
1840 * limit. It is the right occasion because injection
1841 * has actually been performed during the service
1842 * hole, and there are still in-flight requests,
1843 * which are very likely to be exactly the injected
1844 * requests, or part of them;
1845 * - the minimum interval for sampling the total
1846 * service time and updating the inject limit has
1847 * elapsed.
1848 */
1849 if (bfqq == bfqd->in_service_queue &&
1850 (bfqd->rq_in_driver == 0 ||
1851 (bfqq->last_serv_time_ns > 0 &&
1852 bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
1853 time_is_before_eq_jiffies(bfqq->decrease_time_jif +
1854 msecs_to_jiffies(100))) {
1855 bfqd->last_empty_occupied_ns = ktime_get_ns();
1856 /*
1857 * Start the state machine for measuring the
1858 * total service time of rq: setting
1859 * wait_dispatch will cause bfqd->waited_rq to
1860 * be set when rq will be dispatched.
1861 */
1862 bfqd->wait_dispatch = true;
1863 bfqd->rqs_injected = false;
1864 }
1865 }
1866
1707 elv_rb_add(&bfqq->sort_list, rq); 1867 elv_rb_add(&bfqq->sort_list, rq);
1708 1868
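Both time windows used above (the roughly 1000 ms periodic limit reset and the 100 ms minimum interval between limit updates) rest on the same jiffies idiom; a small kernel-internal illustration follows (the helper and the call in the comment are hypothetical, the jiffies macros are real).

#include <linux/jiffies.h>
#include <linux/types.h>

/* Illustrative helper: have at least @ms milliseconds elapsed since @stamp? */
static bool example_window_elapsed(unsigned long stamp, unsigned int ms)
{
	return time_is_before_eq_jiffies(stamp + msecs_to_jiffies(ms));
}

/*
 * Usage sketch, mirroring the checks above:
 *	if (example_window_elapsed(bfqq->decrease_time_jif, 1000))
 *		... reset the inject limit ...
 */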
1709 /* 1869 /*
@@ -1715,8 +1875,9 @@ static void bfq_add_request(struct request *rq)
1715 1875
1716 /* 1876 /*
1717 * Adjust priority tree position, if next_rq changes. 1877 * Adjust priority tree position, if next_rq changes.
1878 * See comments on bfq_pos_tree_add_move() for the unlikely().
1718 */ 1879 */
1719 if (prev != bfqq->next_rq) 1880 if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq))
1720 bfq_pos_tree_add_move(bfqd, bfqq); 1881 bfq_pos_tree_add_move(bfqd, bfqq);
1721 1882
1722 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ 1883 if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
@@ -1856,7 +2017,9 @@ static void bfq_remove_request(struct request_queue *q,
1856 bfqq->pos_root = NULL; 2017 bfqq->pos_root = NULL;
1857 } 2018 }
1858 } else { 2019 } else {
1859 bfq_pos_tree_add_move(bfqd, bfqq); 2020 /* see comments on bfq_pos_tree_add_move() for the unlikely() */
2021 if (unlikely(!bfqd->nonrot_with_queueing))
2022 bfq_pos_tree_add_move(bfqd, bfqq);
1860 } 2023 }
1861 2024
1862 if (rq->cmd_flags & REQ_META) 2025 if (rq->cmd_flags & REQ_META)
@@ -1941,7 +2104,12 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
1941 */ 2104 */
1942 if (prev != bfqq->next_rq) { 2105 if (prev != bfqq->next_rq) {
1943 bfq_updated_next_req(bfqd, bfqq); 2106 bfq_updated_next_req(bfqd, bfqq);
1944 bfq_pos_tree_add_move(bfqd, bfqq); 2107 /*
2108 * See comments on bfq_pos_tree_add_move() for
2109 * the unlikely().
2110 */
2111 if (unlikely(!bfqd->nonrot_with_queueing))
2112 bfq_pos_tree_add_move(bfqd, bfqq);
1945 } 2113 }
1946 } 2114 }
1947} 2115}
@@ -2224,6 +2392,46 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2224 struct bfq_queue *in_service_bfqq, *new_bfqq; 2392 struct bfq_queue *in_service_bfqq, *new_bfqq;
2225 2393
2226 /* 2394 /*
2395 * Do not perform queue merging if the device is non
2396 * rotational and performs internal queueing. In fact, such a
2397 * device reaches a high speed through internal parallelism
2398 * and pipelining. This means that, to reach a high
2399 * throughput, it must have many requests enqueued at the same
2400 * time. But, in this configuration, the internal scheduling
2401 * algorithm of the device does exactly the job of queue
2402 * merging: it reorders requests so as to obtain as much as
2403 * possible a sequential I/O pattern. As a consequence, with
2404 * the workload generated by processes doing interleaved I/O,
2405 * the throughput reached by the device is likely to be the
2406 * same, with and without queue merging.
2407 *
2408 * Disabling merging also provides a remarkable benefit in
2409 * terms of throughput. Merging tends to make many workloads
2410 * artificially more uneven, because of shared queues
2411 * remaining non empty for incomparably more time than
2412 * non-merged queues. This may accentuate workload
2413 * asymmetries. For example, if one of the queues in a set of
2414 * merged queues has a higher weight than a normal queue, then
2415 * the shared queue may inherit such a high weight and, by
2416 * staying almost always active, may force BFQ to perform I/O
2417 * plugging most of the time. This evidently makes it harder
2418 * for BFQ to let the device reach a high throughput.
2419 *
2420 * Finally, the likely() macro below is used not because one
2421 * of the two branches is more likely than the other, but to
2422 * have the code path after the following if() executed as
2423 * fast as possible for the case of a non rotational device
2424 * with queueing. We want it because this is the fastest kind
2425 * of device. On the opposite end, the likely() may lengthen
2426 * the execution time of BFQ for the case of slower devices
2427 * (rotational or at least without queueing). But in this case
2428 * the execution time of BFQ matters very little, if not at
2429 * all.
2430 */
2431 if (likely(bfqd->nonrot_with_queueing))
2432 return NULL;
2433
2434 /*
2227 * Prevent bfqq from being merged if it has been created too 2435 * Prevent bfqq from being merged if it has been created too
2228 * long ago. The idea is that true cooperating processes, and 2436 * long ago. The idea is that true cooperating processes, and
2229 * thus their associated bfq_queues, are supposed to be 2437 * thus their associated bfq_queues, are supposed to be
@@ -2286,6 +2494,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
2286 if (!bic) 2494 if (!bic)
2287 return; 2495 return;
2288 2496
2497 bic->saved_weight = bfqq->entity.orig_weight;
2289 bic->saved_ttime = bfqq->ttime; 2498 bic->saved_ttime = bfqq->ttime;
2290 bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); 2499 bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
2291 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); 2500 bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
@@ -2374,6 +2583,16 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
2374 * assignment causes no harm). 2583 * assignment causes no harm).
2375 */ 2584 */
2376 new_bfqq->bic = NULL; 2585 new_bfqq->bic = NULL;
2586 /*
2587 * If the queue is shared, the pid is the pid of one of the associated
2588 * processes. Which pid depends on the exact sequence of merge events
2589 * the queue underwent. So printing such a pid is useless and confusing
2590 * because it reports a random pid between those of the associated
2591 * processes.
2592 * We mark such a queue with a pid -1, and then print SHARED instead of
2593 * a pid in logging messages.
2594 */
2595 new_bfqq->pid = -1;
2377 bfqq->bic = NULL; 2596 bfqq->bic = NULL;
2378 /* release process reference to bfqq */ 2597 /* release process reference to bfqq */
2379 bfq_put_queue(bfqq); 2598 bfq_put_queue(bfqq);
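The SHARED convention described above pairs with the MAX_PID_STR_LENGTH definition added to bfq-iosched.h at the end of this diff. Below is a hypothetical, standalone helper sketching how such a pid string could be rendered for log messages; it is not the patch's own helper.

#include <stdio.h>

#define MAX_PID_STR_LENGTH 12

/* Render a queue's pid, printing "SHARED" for merged queues marked pid == -1. */
static void pid_to_str(int pid, char *buf, size_t len)
{
	if (pid != -1)
		snprintf(buf, len, "%d", pid);
	else
		snprintf(buf, len, "SHARED");
}

int main(void)
{
	char buf[MAX_PID_STR_LENGTH];

	pid_to_str(-1, buf, sizeof(buf));
	printf("%s\n", buf);	/* prints SHARED */
	return 0;
}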
@@ -2408,8 +2627,8 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
2408 /* 2627 /*
2409 * bic still points to bfqq, then it has not yet been 2628 * bic still points to bfqq, then it has not yet been
2410 * redirected to some other bfq_queue, and a queue 2629 * redirected to some other bfq_queue, and a queue
2411 * merge beween bfqq and new_bfqq can be safely 2630 * merge between bfqq and new_bfqq can be safely
2412 * fulfillled, i.e., bic can be redirected to new_bfqq 2631 * fulfilled, i.e., bic can be redirected to new_bfqq
2413 * and bfqq can be put. 2632 * and bfqq can be put.
2414 */ 2633 */
2415 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, 2634 bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
@@ -2543,10 +2762,14 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2543 * queue). 2762 * queue).
2544 */ 2763 */
2545 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && 2764 if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
2546 bfq_symmetric_scenario(bfqd)) 2765 !bfq_asymmetric_scenario(bfqd, bfqq))
2547 sl = min_t(u64, sl, BFQ_MIN_TT); 2766 sl = min_t(u64, sl, BFQ_MIN_TT);
2767 else if (bfqq->wr_coeff > 1)
2768 sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
2548 2769
2549 bfqd->last_idling_start = ktime_get(); 2770 bfqd->last_idling_start = ktime_get();
2771 bfqd->last_idling_start_jiffies = jiffies;
2772
2550 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), 2773 hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
2551 HRTIMER_MODE_REL); 2774 HRTIMER_MODE_REL);
2552 bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); 2775 bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
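Restating the new idle-slice clamping as a standalone sketch (the value used for BFQ_MIN_TT below is a placeholder, since its definition is not part of this hunk): seeky, non-weight-raised queues in a symmetric scenario get a very short slice, while weight-raised queues now get a floor of 20 ms.

#include <stdbool.h>
#include <stdint.h>

#define NSEC_PER_MSEC	1000000ULL
#define EXAMPLE_MIN_TT	(2 * NSEC_PER_MSEC)	/* placeholder for BFQ_MIN_TT */

static uint64_t clamp_idle_slice(uint64_t sl, bool seeky, bool weight_raised,
				 bool asymmetric)
{
	if (seeky && !weight_raised && !asymmetric)
		sl = sl < EXAMPLE_MIN_TT ? sl : EXAMPLE_MIN_TT;	/* shrink */
	else if (weight_raised)
		sl = sl > 20 * NSEC_PER_MSEC ? sl : 20 * NSEC_PER_MSEC;	/* floor */
	return sl;
}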
@@ -2848,8 +3071,10 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2848 bfq_requeue_bfqq(bfqd, bfqq, true); 3071 bfq_requeue_bfqq(bfqd, bfqq, true);
2849 /* 3072 /*
2850 * Resort priority tree of potential close cooperators. 3073 * Resort priority tree of potential close cooperators.
3074 * See comments on bfq_pos_tree_add_move() for the unlikely().
2851 */ 3075 */
2852 bfq_pos_tree_add_move(bfqd, bfqq); 3076 if (unlikely(!bfqd->nonrot_with_queueing))
3077 bfq_pos_tree_add_move(bfqd, bfqq);
2853 } 3078 }
2854 3079
2855 /* 3080 /*
@@ -3223,13 +3448,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
3223 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); 3448 jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
3224} 3449}
3225 3450
3226static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
3227{
3228 return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
3229 blk_queue_nonrot(bfqq->bfqd->queue) &&
3230 bfqq->bfqd->hw_tag;
3231}
3232
3233/** 3451/**
3234 * bfq_bfqq_expire - expire a queue. 3452 * bfq_bfqq_expire - expire a queue.
3235 * @bfqd: device owning the queue. 3453 * @bfqd: device owning the queue.
@@ -3344,6 +3562,14 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
3344 slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq)); 3562 slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq));
3345 3563
3346 /* 3564 /*
3565 * bfqq expired, so no total service time needs to be computed
3566 * any longer: reset state machine for measuring total service
3567 * times.
3568 */
3569 bfqd->rqs_injected = bfqd->wait_dispatch = false;
3570 bfqd->waited_rq = NULL;
3571
3572 /*
3347 * Increase, decrease or leave budget unchanged according to 3573 * Increase, decrease or leave budget unchanged according to
3348 * reason. 3574 * reason.
3349 */ 3575 */
@@ -3352,8 +3578,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
3352 /* bfqq is gone, no more actions on it */ 3578 /* bfqq is gone, no more actions on it */
3353 return; 3579 return;
3354 3580
3355 bfqq->injected_service = 0;
3356
3357 /* mark bfqq as waiting a request only if a bic still points to it */ 3581 /* mark bfqq as waiting a request only if a bic still points to it */
3358 if (!bfq_bfqq_busy(bfqq) && 3582 if (!bfq_bfqq_busy(bfqq) &&
3359 reason != BFQQE_BUDGET_TIMEOUT && 3583 reason != BFQQE_BUDGET_TIMEOUT &&
@@ -3497,8 +3721,9 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
3497} 3721}
3498 3722
3499/* 3723/*
3500 * There is a case where idling must be performed not for 3724 * There is a case where idling does not have to be performed for
3501 * throughput concerns, but to preserve service guarantees. 3725 * throughput concerns, but to preserve the throughput share of
3726 * the process associated with bfqq.
3502 * 3727 *
3503 * To introduce this case, we can note that allowing the drive 3728 * To introduce this case, we can note that allowing the drive
3504 * to enqueue more than one request at a time, and hence 3729 * to enqueue more than one request at a time, and hence
@@ -3514,77 +3739,83 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
3514 * concern about per-process throughput distribution, and 3739 * concern about per-process throughput distribution, and
3515 * makes its decisions only on a per-request basis. Therefore, 3740 * makes its decisions only on a per-request basis. Therefore,
3516 * the service distribution enforced by the drive's internal 3741 * the service distribution enforced by the drive's internal
3517 * scheduler is likely to coincide with the desired 3742 * scheduler is likely to coincide with the desired throughput
3518 * device-throughput distribution only in a completely 3743 * distribution only in a completely symmetric, or favorably
3519 * symmetric scenario where: 3744 * skewed scenario where:
3520 * (i) each of these processes must get the same throughput as 3745 * (i-a) each of these processes must get the same throughput as
3521 * the others; 3746 * the others,
3522 * (ii) the I/O of each process has the same properties, in 3747 * (i-b) in case (i-a) does not hold, it holds that the process
3523 * terms of locality (sequential or random), direction 3748 * associated with bfqq must receive a lower or equal
3524 * (reads or writes), request sizes, greediness 3749 * throughput than any of the other processes;
3525 * (from I/O-bound to sporadic), and so on. 3750 * (ii) the I/O of each process has the same properties, in
3526 * In fact, in such a scenario, the drive tends to treat 3751 * terms of locality (sequential or random), direction
3527 * the requests of each of these processes in about the same 3752 * (reads or writes), request sizes, greediness
3528 * way as the requests of the others, and thus to provide 3753 * (from I/O-bound to sporadic), and so on;
3529 * each of these processes with about the same throughput 3754
3530 * (which is exactly the desired throughput distribution). In 3755 * In fact, in such a scenario, the drive tends to treat the requests
3531 * contrast, in any asymmetric scenario, device idling is 3756 * of each process in about the same way as the requests of the
3532 * certainly needed to guarantee that bfqq receives its 3757 * others, and thus to provide each of these processes with about the
3533 * assigned fraction of the device throughput (see [1] for 3758 * same throughput. This is exactly the desired throughput
3534 * details). 3759 * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
3535 * The problem is that idling may significantly reduce 3760 * even more convenient distribution for (the process associated with)
3536 * throughput with certain combinations of types of I/O and 3761 * bfqq.
3537 * devices. An important example is sync random I/O, on flash 3762 *
3538 * storage with command queueing. So, unless bfqq falls in the 3763 * In contrast, in any asymmetric or unfavorable scenario, device
3539 * above cases where idling also boosts throughput, it would 3764 * idling (I/O-dispatch plugging) is certainly needed to guarantee
3540 * be important to check conditions (i) and (ii) accurately, 3765 * that bfqq receives its assigned fraction of the device throughput
3541 * so as to avoid idling when not strictly needed for service 3766 * (see [1] for details).
3542 * guarantees.
3543 * 3767 *
3544 * Unfortunately, it is extremely difficult to thoroughly 3768 * The problem is that idling may significantly reduce throughput with
3545 * check condition (ii). And, in case there are active groups, 3769 * certain combinations of types of I/O and devices. An important
3546 * it becomes very difficult to check condition (i) too. In 3770 * example is sync random I/O on flash storage with command
3547 * fact, if there are active groups, then, for condition (i) 3771 * queueing. So, unless bfqq falls in cases where idling also boosts
3548 * to become false, it is enough that an active group contains 3772 * throughput, it is important to check conditions (i-a), i(-b) and
3549 * more active processes or sub-groups than some other active 3773 * (ii) accurately, so as to avoid idling when not strictly needed for
3550 * group. More precisely, for condition (i) to hold because of 3774 * service guarantees.
3551 * such a group, it is not even necessary that the group is 3775 *
3552 * (still) active: it is sufficient that, even if the group 3776 * Unfortunately, it is extremely difficult to thoroughly check
3553 * has become inactive, some of its descendant processes still 3777 * condition (ii). And, in case there are active groups, it becomes
3554 * have some request already dispatched but still waiting for 3778 * very difficult to check conditions (i-a) and (i-b) too. In fact,
3555 * completion. In fact, requests have still to be guaranteed 3779 * if there are active groups, then, for conditions (i-a) or (i-b) to
3556 * their share of the throughput even after being 3780 * become false 'indirectly', it is enough that an active group
3557 * dispatched. In this respect, it is easy to show that, if a 3781 * contains more active processes or sub-groups than some other active
3558 * group frequently becomes inactive while still having 3782 * group. More precisely, for conditions (i-a) or (i-b) to become
3559 * in-flight requests, and if, when this happens, the group is 3783 * false because of such a group, it is not even necessary that the
3560 * not considered in the calculation of whether the scenario 3784 * group is (still) active: it is sufficient that, even if the group
3561 * is asymmetric, then the group may fail to be guaranteed its 3785 * has become inactive, some of its descendant processes still have
3562 * fair share of the throughput (basically because idling may 3786 * some request already dispatched but still waiting for
3563 * not be performed for the descendant processes of the group, 3787 * completion. In fact, requests have still to be guaranteed their
3564 * but it had to be). We address this issue with the 3788 * share of the throughput even after being dispatched. In this
3565 * following bi-modal behavior, implemented in the function 3789 * respect, it is easy to show that, if a group frequently becomes
3566 * bfq_symmetric_scenario(). 3790 * inactive while still having in-flight requests, and if, when this
3791 * happens, the group is not considered in the calculation of whether
3792 * the scenario is asymmetric, then the group may fail to be
3793 * guaranteed its fair share of the throughput (basically because
3794 * idling may not be performed for the descendant processes of the
3795 * group, but it had to be). We address this issue with the following
3796 * bi-modal behavior, implemented in the function
3797 * bfq_asymmetric_scenario().
3567 * 3798 *
3568 * If there are groups with requests waiting for completion 3799 * If there are groups with requests waiting for completion
3569 * (as commented above, some of these groups may even be 3800 * (as commented above, some of these groups may even be
3570 * already inactive), then the scenario is tagged as 3801 * already inactive), then the scenario is tagged as
3571 * asymmetric, conservatively, without checking any of the 3802 * asymmetric, conservatively, without checking any of the
3572 * conditions (i) and (ii). So the device is idled for bfqq. 3803 * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
3573 * This behavior matches also the fact that groups are created 3804 * This behavior matches also the fact that groups are created
3574 * exactly if controlling I/O is a primary concern (to 3805 * exactly if controlling I/O is a primary concern (to
3575 * preserve bandwidth and latency guarantees). 3806 * preserve bandwidth and latency guarantees).
3576 * 3807 *
3577 * On the opposite end, if there are no groups with requests 3808 * On the opposite end, if there are no groups with requests waiting
3578 * waiting for completion, then only condition (i) is actually 3809 * for completion, then only conditions (i-a) and (i-b) are actually
3579 * controlled, i.e., provided that condition (i) holds, idling 3810 * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
3580 * is not performed, regardless of whether condition (ii) 3811 * idling is not performed, regardless of whether condition (ii)
3581 * holds. In other words, only if condition (i) does not hold, 3812 * holds. In other words, only if conditions (i-a) and (i-b) do not
3582 * then idling is allowed, and the device tends to be 3813 * hold, then idling is allowed, and the device tends to be prevented
3583 * prevented from queueing many requests, possibly of several 3814 * from queueing many requests, possibly of several processes. Since
3584 * processes. Since there are no groups with requests waiting 3815 * there are no groups with requests waiting for completion, then, to
3585 * for completion, then, to control condition (i) it is enough 3816 * control conditions (i-a) and (i-b) it is enough to check just
3586 * to check just whether all the queues with requests waiting 3817 * whether all the queues with requests waiting for completion also
3587 * for completion also have the same weight. 3818 * have the same weight.
3588 * 3819 *
3589 * Not checking condition (ii) evidently exposes bfqq to the 3820 * Not checking condition (ii) evidently exposes bfqq to the
3590 * risk of getting less throughput than its fair share. 3821 * risk of getting less throughput than its fair share.
@@ -3636,7 +3867,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
3636 * compound condition that is checked below for deciding 3867 * compound condition that is checked below for deciding
3637 * whether the scenario is asymmetric. To explain this 3868 * whether the scenario is asymmetric. To explain this
3638 * compound condition, we need to add that the function 3869 * compound condition, we need to add that the function
3639 * bfq_symmetric_scenario checks the weights of only 3870 * bfq_asymmetric_scenario checks the weights of only
3640 * non-weight-raised queues, for efficiency reasons (see 3871 * non-weight-raised queues, for efficiency reasons (see
3641 * comments on bfq_weights_tree_add()). Then the fact that 3872 * comments on bfq_weights_tree_add()). Then the fact that
3642 * bfqq is weight-raised is checked explicitly here. More 3873 * bfqq is weight-raised is checked explicitly here. More
@@ -3664,7 +3895,7 @@ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
3664 return (bfqq->wr_coeff > 1 && 3895 return (bfqq->wr_coeff > 1 &&
3665 bfqd->wr_busy_queues < 3896 bfqd->wr_busy_queues <
3666 bfq_tot_busy_queues(bfqd)) || 3897 bfq_tot_busy_queues(bfqd)) ||
3667 !bfq_symmetric_scenario(bfqd); 3898 bfq_asymmetric_scenario(bfqd, bfqq);
3668} 3899}
3669 3900
3670/* 3901/*
@@ -3740,26 +3971,98 @@ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
3740 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); 3971 return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
3741} 3972}
3742 3973
3743static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) 3974/*
3975 * This function chooses the queue from which to pick the next extra
3976 * I/O request to inject, if it finds a compatible queue. See the
3977 * comments on bfq_update_inject_limit() for details on the injection
3978 * mechanism, and for the definitions of the quantities mentioned
3979 * below.
3980 */
3981static struct bfq_queue *
3982bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
3744{ 3983{
3745 struct bfq_queue *bfqq; 3984 struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
3985 unsigned int limit = in_serv_bfqq->inject_limit;
3986 /*
3987 * If
3988 * - bfqq is not weight-raised and therefore does not carry
3989 * time-critical I/O,
3990 * or
3991 * - regardless of whether bfqq is weight-raised, bfqq has
3992 * however a long think time, during which it can absorb the
3993 * effect of an appropriate number of extra I/O requests
3994 * from other queues (see bfq_update_inject_limit for
3995 * details on the computation of this number);
3996 * then injection can be performed without restrictions.
3997 */
3998 bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 ||
3999 !bfq_bfqq_has_short_ttime(in_serv_bfqq);
4000
4001 /*
4002 * If
4003 * - the baseline total service time could not be sampled yet,
4004 * so the inject limit happens to be still 0, and
4005 * - a lot of time has elapsed since the plugging of I/O
4006 * dispatching started, so drive speed is being wasted
4007 * significantly;
4008 * then temporarily raise inject limit to one request.
4009 */
4010 if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 &&
4011 bfq_bfqq_wait_request(in_serv_bfqq) &&
4012 time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies +
4013 bfqd->bfq_slice_idle)
4014 )
4015 limit = 1;
4016
4017 if (bfqd->rq_in_driver >= limit)
4018 return NULL;
3746 4019
3747 /* 4020 /*
3748 * A linear search; but, with a high probability, very few 4021 * Linear search of the source queue for injection; but, with
3749 * steps are needed to find a candidate queue, i.e., a queue 4022 * a high probability, very few steps are needed to find a
3750 * with enough budget left for its next request. In fact: 4023 * candidate queue, i.e., a queue with enough budget left for
4024 * its next request. In fact:
3751 * - BFQ dynamically updates the budget of every queue so as 4025 * - BFQ dynamically updates the budget of every queue so as
3752 * to accommodate the expected backlog of the queue; 4026 * to accommodate the expected backlog of the queue;
3753 * - if a queue gets all its requests dispatched as injected 4027 * - if a queue gets all its requests dispatched as injected
3754 * service, then the queue is removed from the active list 4028 * service, then the queue is removed from the active list
3755 * (and re-added only if it gets new requests, but with 4029 * (and re-added only if it gets new requests, but then it
3756 * enough budget for its new backlog). 4030 * is assigned again enough budget for its new backlog).
3757 */ 4031 */
3758 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) 4032 list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
3759 if (!RB_EMPTY_ROOT(&bfqq->sort_list) && 4033 if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
4034 (in_serv_always_inject || bfqq->wr_coeff > 1) &&
3760 bfq_serv_to_charge(bfqq->next_rq, bfqq) <= 4035 bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
3761 bfq_bfqq_budget_left(bfqq)) 4036 bfq_bfqq_budget_left(bfqq)) {
3762 return bfqq; 4037 /*
4038 * Allow for only one large in-flight request
4039 * on non-rotational devices, for the
4040 * following reason. On non-rotational drives,
4041 * large requests take much longer than
4042 * smaller requests to be served. In addition,
4043 * the drive prefers to serve large requests
4044 * w.r.t. small ones, if it can choose. So,
4045 * having more than one large request queued
4046 * in the drive may easily make the next first
4047 * request of the in-service queue wait for so
4048 * long as to break bfqq's service guarantees. On
4049 * the bright side, large requests let the
4050 * drive reach a very high throughput, even if
4051 * there is only one in-flight large request
4052 * at a time.
4053 */
4054 if (blk_queue_nonrot(bfqd->queue) &&
4055 blk_rq_sectors(bfqq->next_rq) >=
4056 BFQQ_SECT_THR_NONROT)
4057 limit = min_t(unsigned int, 1, limit);
4058 else
4059 limit = in_serv_bfqq->inject_limit;
4060
4061 if (bfqd->rq_in_driver < limit) {
4062 bfqd->rqs_injected = true;
4063 return bfqq;
4064 }
4065 }
3763 4066
3764 return NULL; 4067 return NULL;
3765} 4068}
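A condensed, standalone restatement of the selection loop above over an array of simplified candidate queues (the struct, names and sector threshold are hypothetical; the temporary raise of a zero limit is omitted for brevity).

#include <stdbool.h>
#include <stddef.h>

#define EXAMPLE_SECT_THR_NONROT	512	/* placeholder for BFQQ_SECT_THR_NONROT */

struct cand {
	bool has_request;		/* sort_list non-empty */
	bool fits_budget;		/* serv_to_charge(next_rq) <= budget left */
	bool weight_raised;
	unsigned int next_rq_sectors;
};

/* Return the index of a queue to inject from, or -1 if injection must wait. */
static int choose_for_injection(const struct cand *c, size_t n,
				unsigned int rq_in_driver, unsigned int limit,
				bool in_serv_always_inject, bool nonrot)
{
	size_t i;

	if (rq_in_driver >= limit)
		return -1;

	for (i = 0; i < n; i++) {
		unsigned int eff_limit = limit;

		if (!c[i].has_request || !c[i].fits_budget)
			continue;
		if (!in_serv_always_inject && !c[i].weight_raised)
			continue;
		/* at most one large in-flight request on non-rotational drives */
		if (nonrot && c[i].next_rq_sectors >= EXAMPLE_SECT_THR_NONROT)
			eff_limit = eff_limit < 1 ? eff_limit : 1;
		if (rq_in_driver < eff_limit)
			return (int)i;
	}
	return -1;
}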
@@ -3846,14 +4149,32 @@ check_queue:
3846 * for a new request, or has requests waiting for a completion and 4149 * for a new request, or has requests waiting for a completion and
3847 * may idle after their completion, then keep it anyway. 4150 * may idle after their completion, then keep it anyway.
3848 * 4151 *
3849 * Yet, to boost throughput, inject service from other queues if 4152 * Yet, inject service from other queues if it boosts
3850 * possible. 4153 * throughput and is possible.
3851 */ 4154 */
3852 if (bfq_bfqq_wait_request(bfqq) || 4155 if (bfq_bfqq_wait_request(bfqq) ||
3853 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { 4156 (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
3854 if (bfq_bfqq_injectable(bfqq) && 4157 struct bfq_queue *async_bfqq =
3855 bfqq->injected_service * bfqq->inject_coeff < 4158 bfqq->bic && bfqq->bic->bfqq[0] &&
3856 bfqq->entity.service * 10) 4159 bfq_bfqq_busy(bfqq->bic->bfqq[0]) ?
4160 bfqq->bic->bfqq[0] : NULL;
4161
4162 /*
4163 * If the process associated with bfqq also has async
4164 * I/O pending, then inject it
4165 * unconditionally. Injecting I/O from the same
4166 * process can cause no harm to the process. On the
4167 * contrary, it can only increase bandwidth and reduce
4168 * latency for the process.
4169 */
4170 if (async_bfqq &&
4171 icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
4172 bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
4173 bfq_bfqq_budget_left(async_bfqq))
4174 bfqq = bfqq->bic->bfqq[0];
4175 else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
4176 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
4177 !bfq_bfqq_has_short_ttime(bfqq)))
3857 bfqq = bfq_choose_bfqq_for_injection(bfqd); 4178 bfqq = bfq_choose_bfqq_for_injection(bfqd);
3858 else 4179 else
3859 bfqq = NULL; 4180 bfqq = NULL;
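The three-way branch above can be summarized as a small decision function; the sketch below is a condensed restatement with hypothetical parameter names, not the kernel code itself.

#include <stdbool.h>

enum inject_choice {
	INJECT_NONE,
	INJECT_SAME_PROCESS_ASYNC,	/* bfqq's own async sibling queue */
	INJECT_OTHER_QUEUE		/* bfq_choose_bfqq_for_injection() */
};

static enum inject_choice pick_injection(bool async_sibling_fits_budget,
					 bool idling_boosts_throughput,
					 bool weight_raised,
					 unsigned int wr_busy_queues,
					 bool short_think_time)
{
	if (async_sibling_fits_budget)
		return INJECT_SAME_PROCESS_ASYNC;
	if (!idling_boosts_throughput &&
	    (!weight_raised || wr_busy_queues > 1 || !short_think_time))
		return INJECT_OTHER_QUEUE;
	return INJECT_NONE;
}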
@@ -3945,15 +4266,15 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
3945 4266
3946 bfq_bfqq_served(bfqq, service_to_charge); 4267 bfq_bfqq_served(bfqq, service_to_charge);
3947 4268
3948 bfq_dispatch_remove(bfqd->queue, rq); 4269 if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) {
4270 bfqd->wait_dispatch = false;
4271 bfqd->waited_rq = rq;
4272 }
3949 4273
3950 if (bfqq != bfqd->in_service_queue) { 4274 bfq_dispatch_remove(bfqd->queue, rq);
3951 if (likely(bfqd->in_service_queue))
3952 bfqd->in_service_queue->injected_service +=
3953 bfq_serv_to_charge(rq, bfqq);
3954 4275
4276 if (bfqq != bfqd->in_service_queue)
3955 goto return_rq; 4277 goto return_rq;
3956 }
3957 4278
3958 /* 4279 /*
3959 * If weight raising has to terminate for bfqq, then next 4280 * If weight raising has to terminate for bfqq, then next
@@ -4384,13 +4705,6 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4384 bfq_mark_bfqq_has_short_ttime(bfqq); 4705 bfq_mark_bfqq_has_short_ttime(bfqq);
4385 bfq_mark_bfqq_sync(bfqq); 4706 bfq_mark_bfqq_sync(bfqq);
4386 bfq_mark_bfqq_just_created(bfqq); 4707 bfq_mark_bfqq_just_created(bfqq);
4387 /*
4388 * Aggressively inject a lot of service: up to 90%.
4389 * This coefficient remains constant during bfqq life,
4390 * but this behavior might be changed, after enough
4391 * testing and tuning.
4392 */
4393 bfqq->inject_coeff = 1;
4394 } else 4708 } else
4395 bfq_clear_bfqq_sync(bfqq); 4709 bfq_clear_bfqq_sync(bfqq);
4396 4710
@@ -4529,6 +4843,11 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
4529{ 4843{
4530 bfqq->seek_history <<= 1; 4844 bfqq->seek_history <<= 1;
4531 bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); 4845 bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
4846
4847 if (bfqq->wr_coeff > 1 &&
4848 bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
4849 BFQQ_TOTALLY_SEEKY(bfqq))
4850 bfq_bfqq_end_wr(bfqq);
4532} 4851}
4533 4852
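The seek_history field updated above acts as a shift register of recent seekiness samples; a standalone sketch of that idea follows (the totally-seeky predicate is an assumption about how BFQQ_TOTALLY_SEEKY evaluates a full window, since its definition is not shown in this diff).

#include <stdbool.h>
#include <stdint.h>

/* One bit per recently observed request: 1 = seeky, 0 = sequential. */
static uint8_t push_seek_sample(uint8_t history, bool seeky)
{
	return (uint8_t)((history << 1) | (seeky ? 1 : 0));
}

/* Assumed semantics: "totally seeky" when every sample in the window is seeky. */
static bool totally_seeky(uint8_t history)
{
	return history == 0xff;
}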
4534static void bfq_update_has_short_ttime(struct bfq_data *bfqd, 4853static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -4823,6 +5142,9 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
4823 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; 5142 bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
4824 bfqd->max_rq_in_driver = 0; 5143 bfqd->max_rq_in_driver = 0;
4825 bfqd->hw_tag_samples = 0; 5144 bfqd->hw_tag_samples = 0;
5145
5146 bfqd->nonrot_with_queueing =
5147 blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
4826} 5148}
4827 5149
4828static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 5150static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
@@ -4950,6 +5272,147 @@ static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
4950} 5272}
4951 5273
4952/* 5274/*
5275 * The processes associated with bfqq may happen to generate their
5276 * cumulative I/O at a lower rate than the rate at which the device
5277 * could serve the same I/O. This is rather probable, e.g., if only
5278 * one process is associated with bfqq and the device is an SSD. It
5279 * results in bfqq becoming often empty while in service. In this
5280 * respect, if BFQ is allowed to switch to another queue when bfqq
5281 * remains empty, then the device goes on being fed with I/O requests,
5282 * and the throughput is not affected. In contrast, if BFQ is not
5283 * allowed to switch to another queue---because bfqq is sync and
5284 * I/O-dispatch needs to be plugged while bfqq is temporarily
5285 * empty---then, during the service of bfqq, there will be frequent
5286 * "service holes", i.e., time intervals during which bfqq gets empty
5287 * and the device can only consume the I/O already queued in its
5288 * hardware queues. During service holes, the device may even end up
5289 * remaining idle. In the end, during the service of bfqq, the device
5290 * is driven at a lower speed than the one it can reach with the kind
5291 * of I/O flowing through bfqq.
5292 *
5293 * To counter this loss of throughput, BFQ implements a "request
5294 * injection mechanism", which tries to fill the above service holes
5295 * with I/O requests taken from other queues. The hard part in this
5296 * mechanism is finding the right amount of I/O to inject, so as to
5297 * both boost throughput and not break bfqq's bandwidth and latency
5298 * guarantees. In this respect, the mechanism maintains a per-queue
5299 * inject limit, computed as below. While bfqq is empty, the injection
5300 * mechanism dispatches extra I/O requests only until the total number
5301 * of I/O requests in flight---i.e., already dispatched but not yet
5302 * completed---remains lower than this limit.
5303 *
5304 * A first definition comes in handy to introduce the algorithm by
5305 * which the inject limit is computed. We define as first request for
5306 * bfqq, an I/O request for bfqq that arrives while bfqq is in
5307 * service, and causes bfqq to switch from empty to non-empty. The
5308 * algorithm updates the limit as a function of the effect of
5309 * injection on the service times of only the first requests of
5310 * bfqq. The reason for this restriction is that these are the
5311 * requests whose service time is affected most, because they are the
5312 * first to arrive after injection possibly occurred.
5313 *
5314 * To evaluate the effect of injection, the algorithm measures the
5315 * "total service time" of first requests. We define as total service
5316 * time of an I/O request, the time that elapses since when the
5317 * request is enqueued into bfqq, to when it is completed. This
5318 * quantity allows the whole effect of injection to be measured. It is
5319 * easy to see why. Suppose that some requests of other queues are
5320 * actually injected while bfqq is empty, and that a new request R
5321 * then arrives for bfqq. If the device does start to serve all or
5322 * part of the injected requests during the service hole, then,
5323 * because of this extra service, it may delay the next invocation of
5324 * the dispatch hook of BFQ. Then, even after R gets eventually
5325 * dispatched, the device may delay the actual service of R if it is
5326 * still busy serving the extra requests, or if it decides to serve,
5327 * before R, some extra request still present in its queues. As a
5328 * conclusion, the cumulative extra delay caused by injection can be
5329 * easily evaluated by just comparing the total service time of first
5330 * requests with and without injection.
5331 *
5332 * The limit-update algorithm works as follows. On the arrival of a
5333 * first request of bfqq, the algorithm measures the total time of the
5334 * request only if one of the three cases below holds, and, for each
5335 * case, it updates the limit as described below:
5336 *
5337 * (1) If there is no in-flight request. This gives a baseline for the
5338 * total service time of the requests of bfqq. If the baseline has
5339 * not been computed yet, then, after computing it, the limit is
5340 * set to 1, to start boosting throughput, and to prepare the
5341 * ground for the next case. If the baseline has already been
5342 * computed, then it is updated, in case it turns out to be lower
5343 * than the previous value.
5344 *
5345 * (2) If the limit is higher than 0 and there are in-flight
5346 * requests. By comparing the total service time in this case with
5347 * the above baseline, it is possible to know at which extent the
5348 * current value of the limit is inflating the total service
5349 * time. If the inflation is below a certain threshold, then bfqq
5350 * is assumed to be suffering from no perceivable loss of its
5351 * service guarantees, and the limit is even tentatively
5352 * increased. If the inflation is above the threshold, then the
5353 * limit is decreased. Due to the lack of any hysteresis, this
5354 * logic makes the limit oscillate even in steady workload
5355 * conditions. Yet we opted for it, because it is fast in reaching
5356 * the best value for the limit, as a function of the current I/O
5357 * workload. To reduce oscillations, this step is disabled for a
5358 * short time interval after the limit happens to be decreased.
5359 *
5360 * (3) Periodically, after resetting the limit, to make sure that the
5361 * limit eventually drops in case the workload changes. This is
5362 * needed because, after the limit has gone safely up for a
5363 * certain workload, it is impossible to guess whether the
5364 * baseline total service time may have changed, without measuring
5365 * it again without injection. A more effective version of this
5366 * step might be to just sample the baseline, by interrupting
5367 * injection only once, and then to reset/lower the limit only if
5368 * the total service time with the current limit does happen to be
5369 * too large.
5370 *
5371 * More details on each step are provided in the comments on the
5372 * pieces of code that implement these steps: the branch handling the
5373 * transition from empty to non empty in bfq_add_request(), the branch
5374 * handling injection in bfq_select_queue(), and the function
5375 * bfq_choose_bfqq_for_injection(). These comments also explain some
5376 * exceptions, made by the injection mechanism in some special cases.
5377 */
5378static void bfq_update_inject_limit(struct bfq_data *bfqd,
5379 struct bfq_queue *bfqq)
5380{
5381 u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
5382 unsigned int old_limit = bfqq->inject_limit;
5383
5384 if (bfqq->last_serv_time_ns > 0) {
5385 u64 threshold = (bfqq->last_serv_time_ns * 3) >> 1;
5386
5387 if (tot_time_ns >= threshold && old_limit > 0) {
5388 bfqq->inject_limit--;
5389 bfqq->decrease_time_jif = jiffies;
5390 } else if (tot_time_ns < threshold &&
5391 old_limit < bfqd->max_rq_in_driver << 1)
5392 bfqq->inject_limit++;
5393 }
5394
5395 /*
5396 * Either we still have to compute the base value for the
5397 * total service time, and there seem to be the right
5398 * conditions to do it, or we can lower the last base value
5399 * computed.
5400 */
5401 if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 0) ||
5402 tot_time_ns < bfqq->last_serv_time_ns) {
5403 bfqq->last_serv_time_ns = tot_time_ns;
5404 /*
5405 * Now we certainly have a base value: make sure we
5406 * start trying injection.
5407 */
5408 bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
5409 }
5410
5411 /* update complete, not waiting for any request completion any longer */
5412 bfqd->waited_rq = NULL;
5413}
5414
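To make the update rule concrete, here is a standalone restatement of its arithmetic (threshold at 3/2 of the baseline service time, limit capped at twice the observed peak queue depth) together with a tiny worked example; it sketches the logic only and omits the decrease_time_jif bookkeeping.

#include <stdint.h>
#include <stdio.h>

struct limit_state {
	uint64_t     baseline_ns;	/* last_serv_time_ns; 0 = not sampled yet */
	unsigned int limit;		/* inject_limit */
};

static void update_limit(struct limit_state *s, uint64_t tot_time_ns,
			 unsigned int max_rq_in_driver, unsigned int rq_in_driver)
{
	if (s->baseline_ns > 0) {
		uint64_t threshold = (s->baseline_ns * 3) >> 1;

		if (tot_time_ns >= threshold && s->limit > 0)
			s->limit--;		/* injection hurts: back off */
		else if (tot_time_ns < threshold &&
			 s->limit < (max_rq_in_driver << 1))
			s->limit++;		/* tolerated: probe a higher limit */
	}

	/* Establish, or lower, the baseline whenever possible. */
	if ((s->baseline_ns == 0 && rq_in_driver == 0) ||
	    tot_time_ns < s->baseline_ns) {
		s->baseline_ns = tot_time_ns;
		if (s->limit < 1)
			s->limit = 1;		/* make sure injection gets tried */
	}
}

int main(void)
{
	struct limit_state s = { 0, 0 };

	update_limit(&s, 800000, 4, 0);	 /* baseline becomes 0.8 ms, limit -> 1 */
	update_limit(&s, 900000, 4, 2);	 /* 0.9 ms < 1.2 ms threshold, limit -> 2 */
	update_limit(&s, 1500000, 4, 2); /* 1.5 ms >= 1.2 ms threshold, limit -> 1 */
	printf("limit=%u baseline=%llu ns\n",
	       s.limit, (unsigned long long)s.baseline_ns);
	return 0;
}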
5415/*
4953 * Handle either a requeue or a finish for rq. The things to do are 5416 * Handle either a requeue or a finish for rq. The things to do are
4954 * the same in both cases: all references to rq are to be dropped. In 5417 * the same in both cases: all references to rq are to be dropped. In
4955 * particular, rq is considered completed from the point of view of 5418 * particular, rq is considered completed from the point of view of
@@ -4993,6 +5456,9 @@ static void bfq_finish_requeue_request(struct request *rq)
4993 5456
4994 spin_lock_irqsave(&bfqd->lock, flags); 5457 spin_lock_irqsave(&bfqd->lock, flags);
4995 5458
5459 if (rq == bfqd->waited_rq)
5460 bfq_update_inject_limit(bfqd, bfqq);
5461
4996 bfq_completed_request(bfqq, bfqd); 5462 bfq_completed_request(bfqq, bfqd);
4997 bfq_finish_requeue_request_body(bfqq); 5463 bfq_finish_requeue_request_body(bfqq);
4998 5464
@@ -5156,7 +5622,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
5156 * preparation is that, after the prepare_request hook is invoked for 5622 * preparation is that, after the prepare_request hook is invoked for
5157 * rq, rq may still be transformed into a request with no icq, i.e., a 5623 * rq, rq may still be transformed into a request with no icq, i.e., a
5158 * request not associated with any queue. No bfq hook is invoked to 5624 * request not associated with any queue. No bfq hook is invoked to
5159 * signal this tranformation. As a consequence, should these 5625 * signal this transformation. As a consequence, should these
5160 * preparation operations be performed when the prepare_request hook 5626 * preparation operations be performed when the prepare_request hook
5161 * is invoked, and should rq be transformed one moment later, bfq 5627 * is invoked, and should rq be transformed one moment later, bfq
5162 * would end up in an inconsistent state, because it would have 5628 * would end up in an inconsistent state, because it would have
@@ -5247,7 +5713,29 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
5247 } 5713 }
5248 } 5714 }
5249 5715
5250 if (unlikely(bfq_bfqq_just_created(bfqq))) 5716 /*
5717 * Consider bfqq as possibly belonging to a burst of newly
5718 * created queues only if:
5719 * 1) A burst is actually happening (bfqd->burst_size > 0)
5720 * or
5721 * 2) There is no other active queue. In fact, if, in
5722 * contrast, there are active queues not belonging to the
5723 * possible burst bfqq may belong to, then there is no gain
5724 * in considering bfqq as belonging to a burst, and
5725 * therefore in not weight-raising bfqq. See comments on
5726 * bfq_handle_burst().
5727 *
 5728	 * This filtering also helps eliminate false positives,
5729 * occurring when bfqq does not belong to an actual large
5730 * burst, but some background task (e.g., a service) happens
5731 * to trigger the creation of new queues very close to when
5732 * bfqq and its possible companion queues are created. See
5733 * comments on bfq_handle_burst() for further details also on
5734 * this issue.
5735 */
5736 if (unlikely(bfq_bfqq_just_created(bfqq) &&
5737 (bfqd->burst_size > 0 ||
5738 bfq_tot_busy_queues(bfqd) == 0)))
5251 bfq_handle_burst(bfqd, bfqq); 5739 bfq_handle_burst(bfqd, bfqq);
5252 5740
5253 return bfqq; 5741 return bfqq;
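
A compact user-space sketch of the filter condition just added, using plain booleans and hypothetical values rather than the kernel predicate:

#include <stdbool.h>
#include <stdio.h>

/* A just-created queue is handed to the burst logic only while a burst is
 * already being tracked, or when no other queue is active. */
static bool consider_for_burst(bool just_created, int burst_size,
                               int tot_busy_queues)
{
        return just_created && (burst_size > 0 || tot_busy_queues == 0);
}

int main(void)
{
        /* background service creating a queue while other I/O is active:
         * filtered out, so it is not mistaken for part of a burst */
        printf("%d\n", consider_for_burst(true, 0, 3));   /* 0 */
        /* first queue of a possible burst on an otherwise idle device */
        printf("%d\n", consider_for_burst(true, 0, 0));   /* 1 */
        return 0;
}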
@@ -5507,7 +5995,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
5507 HRTIMER_MODE_REL); 5995 HRTIMER_MODE_REL);
5508 bfqd->idle_slice_timer.function = bfq_idle_slice_timer; 5996 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
5509 5997
5510 bfqd->queue_weights_tree = RB_ROOT; 5998 bfqd->queue_weights_tree = RB_ROOT_CACHED;
5511 bfqd->num_groups_with_pending_reqs = 0; 5999 bfqd->num_groups_with_pending_reqs = 0;
5512 6000
5513 INIT_LIST_HEAD(&bfqd->active_list); 6001 INIT_LIST_HEAD(&bfqd->active_list);
@@ -5515,6 +6003,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
5515 INIT_HLIST_HEAD(&bfqd->burst_list); 6003 INIT_HLIST_HEAD(&bfqd->burst_list);
5516 6004
5517 bfqd->hw_tag = -1; 6005 bfqd->hw_tag = -1;
6006 bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
5518 6007
5519 bfqd->bfq_max_budget = bfq_default_max_budget; 6008 bfqd->bfq_max_budget = bfq_default_max_budget;
5520 6009
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 86394e503ca9..c2faa77824f8 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -1,16 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* 2/*
2 * Header file for the BFQ I/O scheduler: data structures and 3 * Header file for the BFQ I/O scheduler: data structures and
3 * prototypes of interface functions among BFQ components. 4 * prototypes of interface functions among BFQ components.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 */ 5 */
15#ifndef _BFQ_H 6#ifndef _BFQ_H
16#define _BFQ_H 7#define _BFQ_H
@@ -32,6 +23,8 @@
32#define BFQ_DEFAULT_GRP_IOPRIO 0 23#define BFQ_DEFAULT_GRP_IOPRIO 0
33#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE 24#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
34 25
26#define MAX_PID_STR_LENGTH 12
27
35/* 28/*
36 * Soft real-time applications are extremely more latency sensitive 29 * Soft real-time applications are extremely more latency sensitive
37 * than interactive ones. Over-raise the weight of the former to 30 * than interactive ones. Over-raise the weight of the former to
@@ -89,7 +82,7 @@ struct bfq_service_tree {
89 * expiration. This peculiar definition allows for the following 82 * expiration. This peculiar definition allows for the following
90 * optimization, not yet exploited: while a given entity is still in 83 * optimization, not yet exploited: while a given entity is still in
91 * service, we already know which is the best candidate for next 84 * service, we already know which is the best candidate for next
92 * service among the other active entitities in the same parent 85 * service among the other active entities in the same parent
93 * entity. We can then quickly compare the timestamps of the 86 * entity. We can then quickly compare the timestamps of the
94 * in-service entity with those of such best candidate. 87 * in-service entity with those of such best candidate.
95 * 88 *
@@ -140,7 +133,7 @@ struct bfq_weight_counter {
140 * 133 *
141 * Unless cgroups are used, the weight value is calculated from the 134 * Unless cgroups are used, the weight value is calculated from the
142 * ioprio to export the same interface as CFQ. When dealing with 135 * ioprio to export the same interface as CFQ. When dealing with
143 * ``well-behaved'' queues (i.e., queues that do not spend too much 136 * "well-behaved" queues (i.e., queues that do not spend too much
144 * time to consume their budget and have true sequential behavior, and 137 * time to consume their budget and have true sequential behavior, and
145 * when there are no external factors breaking anticipation) the 138 * when there are no external factors breaking anticipation) the
146 * relative weights at each level of the cgroups hierarchy should be 139 * relative weights at each level of the cgroups hierarchy should be
@@ -240,6 +233,13 @@ struct bfq_queue {
240 /* next ioprio and ioprio class if a change is in progress */ 233 /* next ioprio and ioprio class if a change is in progress */
241 unsigned short new_ioprio, new_ioprio_class; 234 unsigned short new_ioprio, new_ioprio_class;
242 235
236 /* last total-service-time sample, see bfq_update_inject_limit() */
237 u64 last_serv_time_ns;
238 /* limit for request injection */
239 unsigned int inject_limit;
240 /* last time the inject limit has been decreased, in jiffies */
241 unsigned long decrease_time_jif;
242
243 /* 243 /*
244 * Shared bfq_queue if queue is cooperating with one or more 244 * Shared bfq_queue if queue is cooperating with one or more
245 * other queues. 245 * other queues.
@@ -357,29 +357,6 @@ struct bfq_queue {
357 357
358 /* max service rate measured so far */ 358 /* max service rate measured so far */
359 u32 max_service_rate; 359 u32 max_service_rate;
360 /*
361 * Ratio between the service received by bfqq while it is in
362 * service, and the cumulative service (of requests of other
363 * queues) that may be injected while bfqq is empty but still
364 * in service. To increase precision, the coefficient is
365 * measured in tenths of unit. Here are some example of (1)
366 * ratios, (2) resulting percentages of service injected
367 * w.r.t. to the total service dispatched while bfqq is in
368 * service, and (3) corresponding values of the coefficient:
369 * 1 (50%) -> 10
370 * 2 (33%) -> 20
371 * 10 (9%) -> 100
372 * 9.9 (9%) -> 99
373 * 1.5 (40%) -> 15
374 * 0.5 (66%) -> 5
375 * 0.1 (90%) -> 1
376 *
377 * So, if the coefficient is lower than 10, then
378 * injected service is more than bfqq service.
379 */
380 unsigned int inject_coeff;
381 /* amount of service injected in current service slot */
382 unsigned int injected_service;
383}; 360};
384 361
385/** 362/**
@@ -419,6 +396,15 @@ struct bfq_io_cq {
419 bool was_in_burst_list; 396 bool was_in_burst_list;
420 397
421 /* 398 /*
399 * Save the weight when a merge occurs, to be able
400 * to restore it in case of split. If the weight is not
401 * correctly resumed when the queue is recycled,
402 * then the weight of the recycled queue could differ
403 * from the weight of the original queue.
404 */
405 unsigned int saved_weight;
406
407 /*
422 * Similar to previous fields: save wr information. 408 * Similar to previous fields: save wr information.
423 */ 409 */
424 unsigned long saved_wr_coeff; 410 unsigned long saved_wr_coeff;
@@ -450,7 +436,7 @@ struct bfq_data {
450 * weight-raised @bfq_queue (see the comments to the functions 436 * weight-raised @bfq_queue (see the comments to the functions
451 * bfq_weights_tree_[add|remove] for further details). 437 * bfq_weights_tree_[add|remove] for further details).
452 */ 438 */
453 struct rb_root queue_weights_tree; 439 struct rb_root_cached queue_weights_tree;
454 440
455 /* 441 /*
456 * Number of groups with at least one descendant process that 442 * Number of groups with at least one descendant process that
@@ -513,6 +499,9 @@ struct bfq_data {
513 /* number of requests dispatched and waiting for completion */ 499 /* number of requests dispatched and waiting for completion */
514 int rq_in_driver; 500 int rq_in_driver;
515 501
 502	/* true if the device is non-rotational and performs queueing */
503 bool nonrot_with_queueing;
504
516 /* 505 /*
517 * Maximum number of requests in driver in the last 506 * Maximum number of requests in driver in the last
518 * @hw_tag_samples completed requests. 507 * @hw_tag_samples completed requests.
@@ -544,6 +533,26 @@ struct bfq_data {
544 /* time of last request completion (ns) */ 533 /* time of last request completion (ns) */
545 u64 last_completion; 534 u64 last_completion;
546 535
536 /* time of last transition from empty to non-empty (ns) */
537 u64 last_empty_occupied_ns;
538
539 /*
540 * Flag set to activate the sampling of the total service time
541 * of a just-arrived first I/O request (see
542 * bfq_update_inject_limit()). This will cause the setting of
543 * waited_rq when the request is finally dispatched.
544 */
545 bool wait_dispatch;
546 /*
547 * If set, then bfq_update_inject_limit() is invoked when
548 * waited_rq is eventually completed.
549 */
550 struct request *waited_rq;
551 /*
552 * True if some request has been injected during the last service hole.
553 */
554 bool rqs_injected;
555
547 /* time of first rq dispatch in current observation interval (ns) */ 556 /* time of first rq dispatch in current observation interval (ns) */
548 u64 first_dispatch; 557 u64 first_dispatch;
549 /* time of last rq dispatch in current observation interval (ns) */ 558 /* time of last rq dispatch in current observation interval (ns) */
@@ -553,6 +562,7 @@ struct bfq_data {
553 ktime_t last_budget_start; 562 ktime_t last_budget_start;
554 /* beginning of the last idle slice */ 563 /* beginning of the last idle slice */
555 ktime_t last_idling_start; 564 ktime_t last_idling_start;
565 unsigned long last_idling_start_jiffies;
556 566
557 /* number of samples in current observation interval */ 567 /* number of samples in current observation interval */
558 int peak_rate_samples; 568 int peak_rate_samples;
@@ -898,10 +908,10 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
898struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); 908struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
899void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); 909void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
900void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, 910void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
901 struct rb_root *root); 911 struct rb_root_cached *root);
902void __bfq_weights_tree_remove(struct bfq_data *bfqd, 912void __bfq_weights_tree_remove(struct bfq_data *bfqd,
903 struct bfq_queue *bfqq, 913 struct bfq_queue *bfqq,
904 struct rb_root *root); 914 struct rb_root_cached *root);
905void bfq_weights_tree_remove(struct bfq_data *bfqd, 915void bfq_weights_tree_remove(struct bfq_data *bfqd,
906 struct bfq_queue *bfqq); 916 struct bfq_queue *bfqq);
907void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, 917void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -1008,13 +1018,23 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
1008/* --------------- end of interface of B-WF2Q+ ---------------- */ 1018/* --------------- end of interface of B-WF2Q+ ---------------- */
1009 1019
1010/* Logging facilities. */ 1020/* Logging facilities. */
1021static inline void bfq_pid_to_str(int pid, char *str, int len)
1022{
1023 if (pid != -1)
1024 snprintf(str, len, "%d", pid);
1025 else
1026 snprintf(str, len, "SHARED-");
1027}
1028
1011#ifdef CONFIG_BFQ_GROUP_IOSCHED 1029#ifdef CONFIG_BFQ_GROUP_IOSCHED
1012struct bfq_group *bfqq_group(struct bfq_queue *bfqq); 1030struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
1013 1031
1014#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ 1032#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
1033 char pid_str[MAX_PID_STR_LENGTH]; \
1034 bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
1015 blk_add_cgroup_trace_msg((bfqd)->queue, \ 1035 blk_add_cgroup_trace_msg((bfqd)->queue, \
1016 bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ 1036 bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
1017 "bfq%d%c " fmt, (bfqq)->pid, \ 1037 "bfq%s%c " fmt, pid_str, \
1018 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ 1038 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
1019} while (0) 1039} while (0)
1020 1040
@@ -1025,10 +1045,13 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
1025 1045
1026#else /* CONFIG_BFQ_GROUP_IOSCHED */ 1046#else /* CONFIG_BFQ_GROUP_IOSCHED */
1027 1047
1028#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ 1048#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
1029 blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ 1049 char pid_str[MAX_PID_STR_LENGTH]; \
1050 bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
1051 blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
1030 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ 1052 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
1031 ##args) 1053 ##args); \
1054} while (0)
1032#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) 1055#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
1033 1056
1034#endif /* CONFIG_BFQ_GROUP_IOSCHED */ 1057#endif /* CONFIG_BFQ_GROUP_IOSCHED */
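
For illustration, a small user-space sketch of the pid-formatting path introduced above; the helper and buffer size mirror bfq_pid_to_str() and MAX_PID_STR_LENGTH (which appears to be sized for a sign, ten decimal digits and the terminating NUL), but the demo values are hypothetical:

#include <stdio.h>

#define MAX_PID_STR_LENGTH 12

static void pid_to_str(int pid, char *str, int len)
{
        if (pid != -1)
                snprintf(str, len, "%d", pid);
        else
                snprintf(str, len, "SHARED-");  /* merged queue: no single owner pid */
}

int main(void)
{
        char pid_str[MAX_PID_STR_LENGTH];

        pid_to_str(4242, pid_str, sizeof(pid_str));
        printf("bfq%s%c trace line\n", pid_str, 'S');  /* -> "bfq4242S trace line" */

        pid_to_str(-1, pid_str, sizeof(pid_str));
        printf("bfq%s%c trace line\n", pid_str, 'S');  /* -> "bfqSHARED-S trace line" */
        return 0;
}

Formatting into a stack buffer lets the trace macros print "SHARED-" for merged queues instead of a bare -1, without changing the rest of the "bfq%s%c" message layout.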
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index ae4d000ac0af..c9ba225081ce 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1,19 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * Hierarchical Budget Worst-case Fair Weighted Fair Queueing 3 * Hierarchical Budget Worst-case Fair Weighted Fair Queueing
3 * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O 4 * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O
4 * scheduler schedules generic entities. The latter can represent 5 * scheduler schedules generic entities. The latter can represent
5 * either single bfq queues (associated with processes) or groups of 6 * either single bfq queues (associated with processes) or groups of
6 * bfq queues (associated with cgroups). 7 * bfq queues (associated with cgroups).
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 */ 8 */
18#include "bfq-iosched.h" 9#include "bfq-iosched.h"
19 10
@@ -59,7 +50,7 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
59 * bfq_update_next_in_service - update sd->next_in_service 50 * bfq_update_next_in_service - update sd->next_in_service
60 * @sd: sched_data for which to perform the update. 51 * @sd: sched_data for which to perform the update.
61 * @new_entity: if not NULL, pointer to the entity whose activation, 52 * @new_entity: if not NULL, pointer to the entity whose activation,
62 * requeueing or repositionig triggered the invocation of 53 * requeueing or repositioning triggered the invocation of
63 * this function. 54 * this function.
 64 * @expiration: if true, this function is being invoked after the          55
65 * expiration of the in-service entity 56 * expiration of the in-service entity
@@ -90,7 +81,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
90 81
91 /* 82 /*
92 * If this update is triggered by the activation, requeueing 83 * If this update is triggered by the activation, requeueing
93 * or repositiong of an entity that does not coincide with 84 * or repositioning of an entity that does not coincide with
94 * sd->next_in_service, then a full lookup in the active tree 85 * sd->next_in_service, then a full lookup in the active tree
95 * can be avoided. In fact, it is enough to check whether the 86 * can be avoided. In fact, it is enough to check whether the
96 * just-modified entity has the same priority as 87 * just-modified entity has the same priority as
@@ -737,7 +728,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
737 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); 728 struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
738 unsigned int prev_weight, new_weight; 729 unsigned int prev_weight, new_weight;
739 struct bfq_data *bfqd = NULL; 730 struct bfq_data *bfqd = NULL;
740 struct rb_root *root; 731 struct rb_root_cached *root;
741#ifdef CONFIG_BFQ_GROUP_IOSCHED 732#ifdef CONFIG_BFQ_GROUP_IOSCHED
742 struct bfq_sched_data *sd; 733 struct bfq_sched_data *sd;
743 struct bfq_group *bfqg; 734 struct bfq_group *bfqg;
@@ -1396,7 +1387,7 @@ left:
1396 * In this first case, update the virtual time in @st too (see the 1387 * In this first case, update the virtual time in @st too (see the
1397 * comments on this update inside the function). 1388 * comments on this update inside the function).
1398 * 1389 *
1399 * In constrast, if there is an in-service entity, then return the 1390 * In contrast, if there is an in-service entity, then return the
1400 * entity that would be set in service if not only the above 1391 * entity that would be set in service if not only the above
1401 * conditions, but also the next one held true: the currently 1392 * conditions, but also the next one held true: the currently
1402 * in-service entity, on expiration, 1393 * in-service entity, on expiration,
@@ -1479,12 +1470,12 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
1479 * is being invoked as a part of the expiration path 1470 * is being invoked as a part of the expiration path
1480 * of the in-service queue. In this case, even if 1471 * of the in-service queue. In this case, even if
1481 * sd->in_service_entity is not NULL, 1472 * sd->in_service_entity is not NULL,
1482 * sd->in_service_entiy at this point is actually not 1473 * sd->in_service_entity at this point is actually not
1483 * in service any more, and, if needed, has already 1474 * in service any more, and, if needed, has already
1484 * been properly queued or requeued into the right 1475 * been properly queued or requeued into the right
1485 * tree. The reason why sd->in_service_entity is still 1476 * tree. The reason why sd->in_service_entity is still
1486 * not NULL here, even if expiration is true, is that 1477 * not NULL here, even if expiration is true, is that
1487 * sd->in_service_entiy is reset as a last step in the 1478 * sd->in_service_entity is reset as a last step in the
1488 * expiration path. So, if expiration is true, tell 1479 * expiration path. So, if expiration is true, tell
1489 * __bfq_lookup_next_entity that there is no 1480 * __bfq_lookup_next_entity that there is no
1490 * sd->in_service_entity. 1481 * sd->in_service_entity.
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 1b633a3526d4..42536674020a 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -1,23 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * bio-integrity.c - bio data integrity extensions 3 * bio-integrity.c - bio data integrity extensions
3 * 4 *
4 * Copyright (C) 2007, 2008, 2009 Oracle Corporation 5 * Copyright (C) 2007, 2008, 2009 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com> 6 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version
9 * 2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
19 * USA.
20 *
21 */ 7 */
22 8
23#include <linux/blkdev.h> 9#include <linux/blkdev.h>
diff --git a/block/bio.c b/block/bio.c
index 716510ecd7ff..683cbb40f051 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1,19 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> 3 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public Licens
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
16 *
17 */ 4 */
18#include <linux/mm.h> 5#include <linux/mm.h>
19#include <linux/swap.h> 6#include <linux/swap.h>
@@ -647,25 +634,72 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
647} 634}
648EXPORT_SYMBOL(bio_clone_fast); 635EXPORT_SYMBOL(bio_clone_fast);
649 636
637static inline bool page_is_mergeable(const struct bio_vec *bv,
638 struct page *page, unsigned int len, unsigned int off,
639 bool same_page)
640{
641 phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
642 bv->bv_offset + bv->bv_len - 1;
643 phys_addr_t page_addr = page_to_phys(page);
644
645 if (vec_end_addr + 1 != page_addr + off)
646 return false;
647 if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
648 return false;
649
650 if ((vec_end_addr & PAGE_MASK) != page_addr) {
651 if (same_page)
652 return false;
653 if (pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page)
654 return false;
655 }
656
657 WARN_ON_ONCE(same_page && (len + off) > PAGE_SIZE);
658
659 return true;
660}
661
662/*
 663 * Check if the @page can be added to the current segment (@bv), and make
 664 * sure to call it only if page_is_mergeable(@bv, @page) is true.
665 */
666static bool can_add_page_to_seg(struct request_queue *q,
667 struct bio_vec *bv, struct page *page, unsigned len,
668 unsigned offset)
669{
670 unsigned long mask = queue_segment_boundary(q);
671 phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
672 phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
673
674 if ((addr1 | mask) != (addr2 | mask))
675 return false;
676
677 if (bv->bv_len + len > queue_max_segment_size(q))
678 return false;
679
680 return true;
681}
682
650/** 683/**
651 * bio_add_pc_page - attempt to add page to bio 684 * __bio_add_pc_page - attempt to add page to passthrough bio
652 * @q: the target queue 685 * @q: the target queue
653 * @bio: destination bio 686 * @bio: destination bio
654 * @page: page to add 687 * @page: page to add
655 * @len: vec entry length 688 * @len: vec entry length
656 * @offset: vec entry offset 689 * @offset: vec entry offset
 690 * @put_same_page: put the page if it is the same as the last added page
657 * 691 *
658 * Attempt to add a page to the bio_vec maplist. This can fail for a 692 * Attempt to add a page to the bio_vec maplist. This can fail for a
659 * number of reasons, such as the bio being full or target block device 693 * number of reasons, such as the bio being full or target block device
 660 * limitations. The target block device must allow bios up to PAGE_SIZE,  694
661 * so it is always possible to add a single page to an empty bio. 695 * so it is always possible to add a single page to an empty bio.
662 * 696 *
663 * This should only be used by REQ_PC bios. 697 * This should only be used by passthrough bios.
664 */ 698 */
665int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page 699static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
666 *page, unsigned int len, unsigned int offset) 700 struct page *page, unsigned int len, unsigned int offset,
701 bool put_same_page)
667{ 702{
668 int retried_segments = 0;
669 struct bio_vec *bvec; 703 struct bio_vec *bvec;
670 704
671 /* 705 /*
@@ -677,18 +711,14 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
677 if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q)) 711 if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
678 return 0; 712 return 0;
679 713
680 /*
681 * For filesystems with a blocksize smaller than the pagesize
682 * we will often be called with the same page as last time and
683 * a consecutive offset. Optimize this special case.
684 */
685 if (bio->bi_vcnt > 0) { 714 if (bio->bi_vcnt > 0) {
686 struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; 715 bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
687 716
688 if (page == prev->bv_page && 717 if (page == bvec->bv_page &&
689 offset == prev->bv_offset + prev->bv_len) { 718 offset == bvec->bv_offset + bvec->bv_len) {
690 prev->bv_len += len; 719 if (put_same_page)
691 bio->bi_iter.bi_size += len; 720 put_page(page);
721 bvec->bv_len += len;
692 goto done; 722 goto done;
693 } 723 }
694 724
@@ -696,63 +726,47 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
696 * If the queue doesn't support SG gaps and adding this 726 * If the queue doesn't support SG gaps and adding this
697 * offset would create a gap, disallow it. 727 * offset would create a gap, disallow it.
698 */ 728 */
699 if (bvec_gap_to_prev(q, prev, offset)) 729 if (bvec_gap_to_prev(q, bvec, offset))
700 return 0; 730 return 0;
731
732 if (page_is_mergeable(bvec, page, len, offset, false) &&
733 can_add_page_to_seg(q, bvec, page, len, offset)) {
734 bvec->bv_len += len;
735 goto done;
736 }
701 } 737 }
702 738
703 if (bio_full(bio)) 739 if (bio_full(bio))
704 return 0; 740 return 0;
705 741
706 /* 742 if (bio->bi_phys_segments >= queue_max_segments(q))
707 * setup the new entry, we might clear it again later if we 743 return 0;
708 * cannot add the page 744
709 */
710 bvec = &bio->bi_io_vec[bio->bi_vcnt]; 745 bvec = &bio->bi_io_vec[bio->bi_vcnt];
711 bvec->bv_page = page; 746 bvec->bv_page = page;
712 bvec->bv_len = len; 747 bvec->bv_len = len;
713 bvec->bv_offset = offset; 748 bvec->bv_offset = offset;
714 bio->bi_vcnt++; 749 bio->bi_vcnt++;
715 bio->bi_phys_segments++;
716 bio->bi_iter.bi_size += len;
717
718 /*
719 * Perform a recount if the number of segments is greater
720 * than queue_max_segments(q).
721 */
722
723 while (bio->bi_phys_segments > queue_max_segments(q)) {
724
725 if (retried_segments)
726 goto failed;
727
728 retried_segments = 1;
729 blk_recount_segments(q, bio);
730 }
731
732 /* If we may be able to merge these biovecs, force a recount */
733 if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec))
734 bio_clear_flag(bio, BIO_SEG_VALID);
735
736 done: 750 done:
751 bio->bi_iter.bi_size += len;
752 bio->bi_phys_segments = bio->bi_vcnt;
753 bio_set_flag(bio, BIO_SEG_VALID);
737 return len; 754 return len;
755}
738 756
739 failed: 757int bio_add_pc_page(struct request_queue *q, struct bio *bio,
740 bvec->bv_page = NULL; 758 struct page *page, unsigned int len, unsigned int offset)
741 bvec->bv_len = 0; 759{
742 bvec->bv_offset = 0; 760 return __bio_add_pc_page(q, bio, page, len, offset, false);
743 bio->bi_vcnt--;
744 bio->bi_iter.bi_size -= len;
745 blk_recount_segments(q, bio);
746 return 0;
747} 761}
748EXPORT_SYMBOL(bio_add_pc_page); 762EXPORT_SYMBOL(bio_add_pc_page);
749 763
750/** 764/**
751 * __bio_try_merge_page - try appending data to an existing bvec. 765 * __bio_try_merge_page - try appending data to an existing bvec.
752 * @bio: destination bio 766 * @bio: destination bio
753 * @page: page to add 767 * @page: start page to add
754 * @len: length of the data to add 768 * @len: length of the data to add
755 * @off: offset of the data in @page 769 * @off: offset of the data relative to @page
756 * @same_page: if %true only merge if the new data is in the same physical 770 * @same_page: if %true only merge if the new data is in the same physical
757 * page as the last segment of the bio. 771 * page as the last segment of the bio.
758 * 772 *
@@ -760,6 +774,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
760 * a useful optimisation for file systems with a block size smaller than the 774 * a useful optimisation for file systems with a block size smaller than the
761 * page size. 775 * page size.
762 * 776 *
 777 * Warn if (@len, @off) crosses pages when @same_page is true.
778 *
763 * Return %true on success or %false on failure. 779 * Return %true on success or %false on failure.
764 */ 780 */
765bool __bio_try_merge_page(struct bio *bio, struct page *page, 781bool __bio_try_merge_page(struct bio *bio, struct page *page,
@@ -770,29 +786,23 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
770 786
771 if (bio->bi_vcnt > 0) { 787 if (bio->bi_vcnt > 0) {
772 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; 788 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
773 phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
774 bv->bv_offset + bv->bv_len - 1;
775 phys_addr_t page_addr = page_to_phys(page);
776 789
777 if (vec_end_addr + 1 != page_addr + off) 790 if (page_is_mergeable(bv, page, len, off, same_page)) {
778 return false; 791 bv->bv_len += len;
779 if (same_page && (vec_end_addr & PAGE_MASK) != page_addr) 792 bio->bi_iter.bi_size += len;
780 return false; 793 return true;
781 794 }
782 bv->bv_len += len;
783 bio->bi_iter.bi_size += len;
784 return true;
785 } 795 }
786 return false; 796 return false;
787} 797}
788EXPORT_SYMBOL_GPL(__bio_try_merge_page); 798EXPORT_SYMBOL_GPL(__bio_try_merge_page);
789 799
790/** 800/**
791 * __bio_add_page - add page to a bio in a new segment 801 * __bio_add_page - add page(s) to a bio in a new segment
792 * @bio: destination bio 802 * @bio: destination bio
793 * @page: page to add 803 * @page: start page to add
794 * @len: length of the data to add 804 * @len: length of the data to add, may cross pages
795 * @off: offset of the data in @page 805 * @off: offset of the data relative to @page, may cross pages
796 * 806 *
797 * Add the data at @page + @off to @bio as a new bvec. The caller must ensure 807 * Add the data at @page + @off to @bio as a new bvec. The caller must ensure
798 * that @bio has space for another bvec. 808 * that @bio has space for another bvec.
@@ -815,13 +825,13 @@ void __bio_add_page(struct bio *bio, struct page *page,
815EXPORT_SYMBOL_GPL(__bio_add_page); 825EXPORT_SYMBOL_GPL(__bio_add_page);
816 826
817/** 827/**
818 * bio_add_page - attempt to add page to bio 828 * bio_add_page - attempt to add page(s) to bio
819 * @bio: destination bio 829 * @bio: destination bio
820 * @page: page to add 830 * @page: start page to add
821 * @len: vec entry length 831 * @len: vec entry length, may cross pages
822 * @offset: vec entry offset 832 * @offset: vec entry offset relative to @page, may cross pages
823 * 833 *
824 * Attempt to add a page to the bio_vec maplist. This will only fail 834 * Attempt to add page(s) to the bio_vec maplist. This will only fail
825 * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. 835 * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
826 */ 836 */
827int bio_add_page(struct bio *bio, struct page *page, 837int bio_add_page(struct bio *bio, struct page *page,
@@ -836,6 +846,24 @@ int bio_add_page(struct bio *bio, struct page *page,
836} 846}
837EXPORT_SYMBOL(bio_add_page); 847EXPORT_SYMBOL(bio_add_page);
838 848
849static void bio_get_pages(struct bio *bio)
850{
851 struct bvec_iter_all iter_all;
852 struct bio_vec *bvec;
853
854 bio_for_each_segment_all(bvec, bio, iter_all)
855 get_page(bvec->bv_page);
856}
857
858static void bio_release_pages(struct bio *bio)
859{
860 struct bvec_iter_all iter_all;
861 struct bio_vec *bvec;
862
863 bio_for_each_segment_all(bvec, bio, iter_all)
864 put_page(bvec->bv_page);
865}
866
839static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) 867static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
840{ 868{
841 const struct bio_vec *bv = iter->bvec; 869 const struct bio_vec *bv = iter->bvec;
@@ -848,20 +876,10 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
848 len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); 876 len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
849 size = bio_add_page(bio, bv->bv_page, len, 877 size = bio_add_page(bio, bv->bv_page, len,
850 bv->bv_offset + iter->iov_offset); 878 bv->bv_offset + iter->iov_offset);
851 if (size == len) { 879 if (unlikely(size != len))
852 if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { 880 return -EINVAL;
853 struct page *page; 881 iov_iter_advance(iter, size);
854 int i; 882 return 0;
855
856 mp_bvec_for_each_page(page, bv, i)
857 get_page(page);
858 }
859
860 iov_iter_advance(iter, size);
861 return 0;
862 }
863
864 return -EINVAL;
865} 883}
866 884
867#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) 885#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
@@ -934,29 +952,24 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
934int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) 952int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
935{ 953{
936 const bool is_bvec = iov_iter_is_bvec(iter); 954 const bool is_bvec = iov_iter_is_bvec(iter);
937 unsigned short orig_vcnt = bio->bi_vcnt; 955 int ret;
938 956
939 /* 957 if (WARN_ON_ONCE(bio->bi_vcnt))
940 * If this is a BVEC iter, then the pages are kernel pages. Don't 958 return -EINVAL;
941 * release them on IO completion, if the caller asked us to.
942 */
943 if (is_bvec && iov_iter_bvec_no_ref(iter))
944 bio_set_flag(bio, BIO_NO_PAGE_REF);
945 959
946 do { 960 do {
947 int ret;
948
949 if (is_bvec) 961 if (is_bvec)
950 ret = __bio_iov_bvec_add_pages(bio, iter); 962 ret = __bio_iov_bvec_add_pages(bio, iter);
951 else 963 else
952 ret = __bio_iov_iter_get_pages(bio, iter); 964 ret = __bio_iov_iter_get_pages(bio, iter);
965 } while (!ret && iov_iter_count(iter) && !bio_full(bio));
953 966
954 if (unlikely(ret)) 967 if (iov_iter_bvec_no_ref(iter))
955 return bio->bi_vcnt > orig_vcnt ? 0 : ret; 968 bio_set_flag(bio, BIO_NO_PAGE_REF);
956 969 else if (is_bvec)
957 } while (iov_iter_count(iter) && !bio_full(bio)); 970 bio_get_pages(bio);
958 971
959 return 0; 972 return bio->bi_vcnt ? 0 : ret;
960} 973}
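
A small user-space sketch of the control flow adopted above, where any progress counts as success and the error code is only propagated when nothing was added; fill_one() and its failure mode are hypothetical stand-ins:

#include <stdio.h>

#define MAX_VECS 4

static int fill_one(int *vcnt, int remaining)
{
        if (remaining == 3)
                return -1;      /* simulate a failure partway through */
        (*vcnt)++;
        return 0;
}

static int fill(int *vcnt, int total)
{
        int ret;

        do {
                ret = fill_one(vcnt, total);
                if (!ret)
                        total--;
        } while (!ret && total > 0 && *vcnt < MAX_VECS);

        /* error only if nothing at all was added */
        return *vcnt ? 0 : ret;
}

int main(void)
{
        int vcnt = 0;

        printf("ret=%d vcnt=%d\n", fill(&vcnt, 6), vcnt);   /* ret=0 vcnt=3 */
        return 0;
}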
961 974
962static void submit_bio_wait_endio(struct bio *bio) 975static void submit_bio_wait_endio(struct bio *bio)
@@ -1127,11 +1140,10 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
1127 */ 1140 */
1128static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) 1141static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
1129{ 1142{
1130 int i;
1131 struct bio_vec *bvec; 1143 struct bio_vec *bvec;
1132 struct bvec_iter_all iter_all; 1144 struct bvec_iter_all iter_all;
1133 1145
1134 bio_for_each_segment_all(bvec, bio, i, iter_all) { 1146 bio_for_each_segment_all(bvec, bio, iter_all) {
1135 ssize_t ret; 1147 ssize_t ret;
1136 1148
1137 ret = copy_page_from_iter(bvec->bv_page, 1149 ret = copy_page_from_iter(bvec->bv_page,
@@ -1159,11 +1171,10 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
1159 */ 1171 */
1160static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) 1172static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
1161{ 1173{
1162 int i;
1163 struct bio_vec *bvec; 1174 struct bio_vec *bvec;
1164 struct bvec_iter_all iter_all; 1175 struct bvec_iter_all iter_all;
1165 1176
1166 bio_for_each_segment_all(bvec, bio, i, iter_all) { 1177 bio_for_each_segment_all(bvec, bio, iter_all) {
1167 ssize_t ret; 1178 ssize_t ret;
1168 1179
1169 ret = copy_page_to_iter(bvec->bv_page, 1180 ret = copy_page_to_iter(bvec->bv_page,
@@ -1184,10 +1195,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
1184void bio_free_pages(struct bio *bio) 1195void bio_free_pages(struct bio *bio)
1185{ 1196{
1186 struct bio_vec *bvec; 1197 struct bio_vec *bvec;
1187 int i;
1188 struct bvec_iter_all iter_all; 1198 struct bvec_iter_all iter_all;
1189 1199
1190 bio_for_each_segment_all(bvec, bio, i, iter_all) 1200 bio_for_each_segment_all(bvec, bio, iter_all)
1191 __free_page(bvec->bv_page); 1201 __free_page(bvec->bv_page);
1192} 1202}
1193EXPORT_SYMBOL(bio_free_pages); 1203EXPORT_SYMBOL(bio_free_pages);
@@ -1388,21 +1398,14 @@ struct bio *bio_map_user_iov(struct request_queue *q,
1388 for (j = 0; j < npages; j++) { 1398 for (j = 0; j < npages; j++) {
1389 struct page *page = pages[j]; 1399 struct page *page = pages[j];
1390 unsigned int n = PAGE_SIZE - offs; 1400 unsigned int n = PAGE_SIZE - offs;
1391 unsigned short prev_bi_vcnt = bio->bi_vcnt;
1392 1401
1393 if (n > bytes) 1402 if (n > bytes)
1394 n = bytes; 1403 n = bytes;
1395 1404
1396 if (!bio_add_pc_page(q, bio, page, n, offs)) 1405 if (!__bio_add_pc_page(q, bio, page, n, offs,
1406 true))
1397 break; 1407 break;
1398 1408
1399 /*
1400 * check if vector was merged with previous
1401 * drop page reference if needed
1402 */
1403 if (bio->bi_vcnt == prev_bi_vcnt)
1404 put_page(page);
1405
1406 added += n; 1409 added += n;
1407 bytes -= n; 1410 bytes -= n;
1408 offs = 0; 1411 offs = 0;
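
A user-space sketch of the reference-ownership rule used here: when a chunk merges into the previous entry, the caller's extra page reference is dropped at the merge point (the put_same_page case) instead of being detected and dropped afterwards. All types here are hypothetical stand-ins, not kernel structures:

#include <stdbool.h>
#include <stdio.h>

struct page_ref { int refs; };

static void get_ref(struct page_ref *p) { p->refs++; }
static void put_ref(struct page_ref *p) { p->refs--; }

/* returns true if the chunk was merged into the previous entry */
static bool add_chunk(struct page_ref *prev, struct page_ref *page,
                      bool put_same_page)
{
        if (page == prev) {
                if (put_same_page)
                        put_ref(page);  /* merged: drop the caller's extra reference now */
                return true;
        }
        return false;
}

int main(void)
{
        struct page_ref page = { .refs = 1 };

        get_ref(&page);                   /* caller pins the page before adding */
        add_chunk(&page, &page, true);    /* chunk merges with the previous entry */
        printf("refs = %d\n", page.refs); /* back to 1: no leaked reference */
        return 0;
}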
@@ -1432,7 +1435,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
1432 return bio; 1435 return bio;
1433 1436
1434 out_unmap: 1437 out_unmap:
1435 bio_for_each_segment_all(bvec, bio, j, iter_all) { 1438 bio_for_each_segment_all(bvec, bio, iter_all) {
1436 put_page(bvec->bv_page); 1439 put_page(bvec->bv_page);
1437 } 1440 }
1438 bio_put(bio); 1441 bio_put(bio);
@@ -1442,13 +1445,12 @@ struct bio *bio_map_user_iov(struct request_queue *q,
1442static void __bio_unmap_user(struct bio *bio) 1445static void __bio_unmap_user(struct bio *bio)
1443{ 1446{
1444 struct bio_vec *bvec; 1447 struct bio_vec *bvec;
1445 int i;
1446 struct bvec_iter_all iter_all; 1448 struct bvec_iter_all iter_all;
1447 1449
1448 /* 1450 /*
1449 * make sure we dirty pages we wrote to 1451 * make sure we dirty pages we wrote to
1450 */ 1452 */
1451 bio_for_each_segment_all(bvec, bio, i, iter_all) { 1453 bio_for_each_segment_all(bvec, bio, iter_all) {
1452 if (bio_data_dir(bio) == READ) 1454 if (bio_data_dir(bio) == READ)
1453 set_page_dirty_lock(bvec->bv_page); 1455 set_page_dirty_lock(bvec->bv_page);
1454 1456
@@ -1539,10 +1541,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
1539{ 1541{
1540 char *p = bio->bi_private; 1542 char *p = bio->bi_private;
1541 struct bio_vec *bvec; 1543 struct bio_vec *bvec;
1542 int i;
1543 struct bvec_iter_all iter_all; 1544 struct bvec_iter_all iter_all;
1544 1545
1545 bio_for_each_segment_all(bvec, bio, i, iter_all) { 1546 bio_for_each_segment_all(bvec, bio, iter_all) {
1546 memcpy(p, page_address(bvec->bv_page), bvec->bv_len); 1547 memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
1547 p += bvec->bv_len; 1548 p += bvec->bv_len;
1548 } 1549 }
@@ -1650,25 +1651,14 @@ cleanup:
1650void bio_set_pages_dirty(struct bio *bio) 1651void bio_set_pages_dirty(struct bio *bio)
1651{ 1652{
1652 struct bio_vec *bvec; 1653 struct bio_vec *bvec;
1653 int i;
1654 struct bvec_iter_all iter_all; 1654 struct bvec_iter_all iter_all;
1655 1655
1656 bio_for_each_segment_all(bvec, bio, i, iter_all) { 1656 bio_for_each_segment_all(bvec, bio, iter_all) {
1657 if (!PageCompound(bvec->bv_page)) 1657 if (!PageCompound(bvec->bv_page))
1658 set_page_dirty_lock(bvec->bv_page); 1658 set_page_dirty_lock(bvec->bv_page);
1659 } 1659 }
1660} 1660}
1661 1661
1662static void bio_release_pages(struct bio *bio)
1663{
1664 struct bio_vec *bvec;
1665 int i;
1666 struct bvec_iter_all iter_all;
1667
1668 bio_for_each_segment_all(bvec, bio, i, iter_all)
1669 put_page(bvec->bv_page);
1670}
1671
1672/* 1662/*
1673 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 1663 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
1674 * If they are, then fine. If, however, some pages are clean then they must 1664 * If they are, then fine. If, however, some pages are clean then they must
@@ -1712,10 +1702,9 @@ void bio_check_pages_dirty(struct bio *bio)
1712{ 1702{
1713 struct bio_vec *bvec; 1703 struct bio_vec *bvec;
1714 unsigned long flags; 1704 unsigned long flags;
1715 int i;
1716 struct bvec_iter_all iter_all; 1705 struct bvec_iter_all iter_all;
1717 1706
1718 bio_for_each_segment_all(bvec, bio, i, iter_all) { 1707 bio_for_each_segment_all(bvec, bio, iter_all) {
1719 if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) 1708 if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
1720 goto defer; 1709 goto defer;
1721 } 1710 }
@@ -2203,6 +2192,9 @@ static int __init init_bio(void)
2203 bio_slab_nr = 0; 2192 bio_slab_nr = 0;
2204 bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), 2193 bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab),
2205 GFP_KERNEL); 2194 GFP_KERNEL);
2195
2196 BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET);
2197
2206 if (!bio_slabs) 2198 if (!bio_slabs)
2207 panic("bio: can't allocate bios\n"); 2199 panic("bio: can't allocate bios\n");
2208 2200
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 617a2b3f7582..b97b479e4f64 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Common Block IO controller cgroup interface 3 * Common Block IO controller cgroup interface
3 * 4 *
diff --git a/block/blk-core.c b/block/blk-core.c
index a55389ba8779..419d600e6637 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 3 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 4 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
@@ -232,15 +233,6 @@ void blk_sync_queue(struct request_queue *q)
232{ 233{
233 del_timer_sync(&q->timeout); 234 del_timer_sync(&q->timeout);
234 cancel_work_sync(&q->timeout_work); 235 cancel_work_sync(&q->timeout_work);
235
236 if (queue_is_mq(q)) {
237 struct blk_mq_hw_ctx *hctx;
238 int i;
239
240 cancel_delayed_work_sync(&q->requeue_work);
241 queue_for_each_hw_ctx(q, hctx, i)
242 cancel_delayed_work_sync(&hctx->run_work);
243 }
244} 236}
245EXPORT_SYMBOL(blk_sync_queue); 237EXPORT_SYMBOL(blk_sync_queue);
246 238
@@ -347,18 +339,6 @@ void blk_cleanup_queue(struct request_queue *q)
347 339
348 blk_queue_flag_set(QUEUE_FLAG_DEAD, q); 340 blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
349 341
350 /*
351 * make sure all in-progress dispatch are completed because
352 * blk_freeze_queue() can only complete all requests, and
353 * dispatch may still be in-progress since we dispatch requests
354 * from more than one contexts.
355 *
356 * We rely on driver to deal with the race in case that queue
357 * initialization isn't done.
358 */
359 if (queue_is_mq(q) && blk_queue_init_done(q))
360 blk_mq_quiesce_queue(q);
361
362 /* for synchronous bio-based driver finish in-flight integrity i/o */ 342 /* for synchronous bio-based driver finish in-flight integrity i/o */
363 blk_flush_integrity(); 343 blk_flush_integrity();
364 344
@@ -375,7 +355,7 @@ void blk_cleanup_queue(struct request_queue *q)
375 blk_exit_queue(q); 355 blk_exit_queue(q);
376 356
377 if (queue_is_mq(q)) 357 if (queue_is_mq(q))
378 blk_mq_free_queue(q); 358 blk_mq_exit_queue(q);
379 359
380 percpu_ref_exit(&q->q_usage_counter); 360 percpu_ref_exit(&q->q_usage_counter);
381 361
diff --git a/block/blk-exec.c b/block/blk-exec.c
index a34b7d918742..1db44ca0f4a6 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Functions related to setting various queue properties from drivers 3 * Functions related to setting various queue properties from drivers
3 */ 4 */
diff --git a/block/blk-flush.c b/block/blk-flush.c
index d95f94892015..aedd9320e605 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -1,11 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Functions to sequence PREFLUSH and FUA writes. 3 * Functions to sequence PREFLUSH and FUA writes.
3 * 4 *
4 * Copyright (C) 2011 Max Planck Institute for Gravitational Physics 5 * Copyright (C) 2011 Max Planck Institute for Gravitational Physics
5 * Copyright (C) 2011 Tejun Heo <tj@kernel.org> 6 * Copyright (C) 2011 Tejun Heo <tj@kernel.org>
6 * 7 *
7 * This file is released under the GPLv2.
8 *
9 * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three 8 * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three
10 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request 9 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
11 * properties and hardware capability. 10 * properties and hardware capability.
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 85864c71e858..825c9c070458 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -1,23 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * blk-integrity.c - Block layer data integrity extensions 3 * blk-integrity.c - Block layer data integrity extensions
3 * 4 *
4 * Copyright (C) 2007, 2008 Oracle Corporation 5 * Copyright (C) 2007, 2008 Oracle Corporation
5 * Written by: Martin K. Petersen <martin.petersen@oracle.com> 6 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License version
9 * 2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING. If not, write to
18 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
19 * USA.
20 *
21 */ 7 */
22 8
23#include <linux/blkdev.h> 9#include <linux/blkdev.h>
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 507212d75ee2..d22e61bced86 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Block rq-qos base io controller 3 * Block rq-qos base io controller
3 * 4 *
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1c9d4f0f96ea..21e87a714a73 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -267,23 +267,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
267 goto split; 267 goto split;
268 } 268 }
269 269
270 if (bvprvp) {
271 if (seg_size + bv.bv_len > queue_max_segment_size(q))
272 goto new_segment;
273 if (!biovec_phys_mergeable(q, bvprvp, &bv))
274 goto new_segment;
275
276 seg_size += bv.bv_len;
277 bvprv = bv;
278 bvprvp = &bvprv;
279 sectors += bv.bv_len >> 9;
280
281 if (nsegs == 1 && seg_size > front_seg_size)
282 front_seg_size = seg_size;
283
284 continue;
285 }
286new_segment:
287 if (nsegs == max_segs) 270 if (nsegs == max_segs)
288 goto split; 271 goto split;
289 272
@@ -370,12 +353,12 @@ EXPORT_SYMBOL(blk_queue_split);
370static unsigned int __blk_recalc_rq_segments(struct request_queue *q, 353static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
371 struct bio *bio) 354 struct bio *bio)
372{ 355{
373 struct bio_vec bv, bvprv = { NULL }; 356 struct bio_vec uninitialized_var(bv), bvprv = { NULL };
374 int prev = 0;
375 unsigned int seg_size, nr_phys_segs; 357 unsigned int seg_size, nr_phys_segs;
376 unsigned front_seg_size; 358 unsigned front_seg_size;
377 struct bio *fbio, *bbio; 359 struct bio *fbio, *bbio;
378 struct bvec_iter iter; 360 struct bvec_iter iter;
361 bool new_bio = false;
379 362
380 if (!bio) 363 if (!bio)
381 return 0; 364 return 0;
@@ -396,7 +379,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
396 nr_phys_segs = 0; 379 nr_phys_segs = 0;
397 for_each_bio(bio) { 380 for_each_bio(bio) {
398 bio_for_each_bvec(bv, bio, iter) { 381 bio_for_each_bvec(bv, bio, iter) {
399 if (prev) { 382 if (new_bio) {
400 if (seg_size + bv.bv_len 383 if (seg_size + bv.bv_len
401 > queue_max_segment_size(q)) 384 > queue_max_segment_size(q))
402 goto new_segment; 385 goto new_segment;
@@ -404,7 +387,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
404 goto new_segment; 387 goto new_segment;
405 388
406 seg_size += bv.bv_len; 389 seg_size += bv.bv_len;
407 bvprv = bv;
408 390
409 if (nr_phys_segs == 1 && seg_size > 391 if (nr_phys_segs == 1 && seg_size >
410 front_seg_size) 392 front_seg_size)
@@ -413,12 +395,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
413 continue; 395 continue;
414 } 396 }
415new_segment: 397new_segment:
416 bvprv = bv;
417 prev = 1;
418 bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, 398 bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size,
419 &front_seg_size, NULL, UINT_MAX); 399 &front_seg_size, NULL, UINT_MAX);
400 new_bio = false;
420 } 401 }
421 bbio = bio; 402 bbio = bio;
403 if (likely(bio->bi_iter.bi_size)) {
404 bvprv = bv;
405 new_bio = true;
406 }
422 } 407 }
423 408
424 fbio->bi_seg_front_size = front_seg_size; 409 fbio->bi_seg_front_size = front_seg_size;
@@ -484,79 +469,97 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
484 struct scatterlist **sg) 469 struct scatterlist **sg)
485{ 470{
486 unsigned nbytes = bvec->bv_len; 471 unsigned nbytes = bvec->bv_len;
487 unsigned nsegs = 0, total = 0, offset = 0; 472 unsigned nsegs = 0, total = 0;
488 473
489 while (nbytes > 0) { 474 while (nbytes > 0) {
490 unsigned seg_size; 475 unsigned offset = bvec->bv_offset + total;
491 struct page *pg; 476 unsigned len = min(get_max_segment_size(q, offset), nbytes);
492 unsigned idx; 477 struct page *page = bvec->bv_page;
493
494 *sg = blk_next_sg(sg, sglist);
495 478
496 seg_size = get_max_segment_size(q, bvec->bv_offset + total); 479 /*
497 seg_size = min(nbytes, seg_size); 480 * Unfortunately a fair number of drivers barf on scatterlists
498 481 * that have an offset larger than PAGE_SIZE, despite other
499 offset = (total + bvec->bv_offset) % PAGE_SIZE; 482 * subsystems dealing with that invariant just fine. For now
500 idx = (total + bvec->bv_offset) / PAGE_SIZE; 483 * stick to the legacy format where we never present those from
501 pg = bvec_nth_page(bvec->bv_page, idx); 484 * the block layer, but the code below should be removed once
485 * these offenders (mostly MMC/SD drivers) are fixed.
486 */
487 page += (offset >> PAGE_SHIFT);
488 offset &= ~PAGE_MASK;
502 489
503 sg_set_page(*sg, pg, seg_size, offset); 490 *sg = blk_next_sg(sg, sglist);
491 sg_set_page(*sg, page, len, offset);
504 492
505 total += seg_size; 493 total += len;
506 nbytes -= seg_size; 494 nbytes -= len;
507 nsegs++; 495 nsegs++;
508 } 496 }
509 497
510 return nsegs; 498 return nsegs;
511} 499}
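
A user-space sketch of the chunking loop above, cutting one large extent into pieces bounded by a per-offset segment limit and renormalizing each piece's offset to stay below PAGE_SIZE; the limit helper and all sizes are hypothetical:

#include <stdio.h>

#define PAGE_SIZE 4096u
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define MAX_SEG   8192u  /* hypothetical max segment size */

/* simplified stand-in for get_max_segment_size(): never cross a
 * MAX_SEG-aligned boundary */
static unsigned int max_seg_at(unsigned int offset)
{
        return MAX_SEG - (offset & (MAX_SEG - 1));
}

int main(void)
{
        unsigned int bv_offset = 1024, bv_len = 20480;  /* 20 KiB starting at 1 KiB */
        unsigned int nbytes = bv_len, total = 0, nsegs = 0;

        while (nbytes > 0) {
                unsigned int offset = bv_offset + total;
                unsigned int len = max_seg_at(offset);
                unsigned int page_idx;

                if (len > nbytes)
                        len = nbytes;
                page_idx = offset / PAGE_SIZE;  /* which page of the extent */
                offset &= ~PAGE_MASK;           /* offset within that page */

                printf("sg[%u]: page +%u, offset %u, len %u\n",
                       nsegs, page_idx, offset, len);
                total += len;
                nbytes -= len;
                nsegs++;
        }
        printf("%u segments\n", nsegs);
        return 0;
}

Keeping each emitted offset below PAGE_SIZE mirrors the comment above about drivers that cannot handle larger scatterlist offsets, even though the block layer itself could express them.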
512 500
513static inline void 501static inline int __blk_bvec_map_sg(struct bio_vec bv,
514__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, 502 struct scatterlist *sglist, struct scatterlist **sg)
515 struct scatterlist *sglist, struct bio_vec *bvprv, 503{
516 struct scatterlist **sg, int *nsegs) 504 *sg = blk_next_sg(sg, sglist);
505 sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
506 return 1;
507}
508
509/* only try to merge bvecs into one sg if they are from two bios */
510static inline bool
511__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
512 struct bio_vec *bvprv, struct scatterlist **sg)
517{ 513{
518 514
519 int nbytes = bvec->bv_len; 515 int nbytes = bvec->bv_len;
520 516
521 if (*sg) { 517 if (!*sg)
522 if ((*sg)->length + nbytes > queue_max_segment_size(q)) 518 return false;
523 goto new_segment;
524 if (!biovec_phys_mergeable(q, bvprv, bvec))
525 goto new_segment;
526 519
527 (*sg)->length += nbytes; 520 if ((*sg)->length + nbytes > queue_max_segment_size(q))
528 } else { 521 return false;
529new_segment:
530 if (bvec->bv_offset + bvec->bv_len <= PAGE_SIZE) {
531 *sg = blk_next_sg(sg, sglist);
532 sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
533 (*nsegs) += 1;
534 } else
535 (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
536 }
537 *bvprv = *bvec;
538}
539 522
540static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv, 523 if (!biovec_phys_mergeable(q, bvprv, bvec))
541 struct scatterlist *sglist, struct scatterlist **sg) 524 return false;
542{ 525
543 *sg = sglist; 526 (*sg)->length += nbytes;
544 sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); 527
545 return 1; 528 return true;
546} 529}
547 530
548static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, 531static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
549 struct scatterlist *sglist, 532 struct scatterlist *sglist,
550 struct scatterlist **sg) 533 struct scatterlist **sg)
551{ 534{
552 struct bio_vec bvec, bvprv = { NULL }; 535 struct bio_vec uninitialized_var(bvec), bvprv = { NULL };
553 struct bvec_iter iter; 536 struct bvec_iter iter;
554 int nsegs = 0; 537 int nsegs = 0;
538 bool new_bio = false;
555 539
556 for_each_bio(bio) 540 for_each_bio(bio) {
557 bio_for_each_bvec(bvec, bio, iter) 541 bio_for_each_bvec(bvec, bio, iter) {
558 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, 542 /*
559 &nsegs); 543 * Only try to merge bvecs from two bios given we
544 * have done bio internal merge when adding pages
545 * to bio
546 */
547 if (new_bio &&
548 __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
549 goto next_bvec;
550
551 if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
552 nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
553 else
554 nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
555 next_bvec:
556 new_bio = false;
557 }
558 if (likely(bio->bi_iter.bi_size)) {
559 bvprv = bvec;
560 new_bio = true;
561 }
562 }
560 563
561 return nsegs; 564 return nsegs;
562} 565}
@@ -572,9 +575,9 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
572 int nsegs = 0; 575 int nsegs = 0;
573 576
574 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) 577 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
575 nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg); 578 nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg);
576 else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) 579 else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
577 nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg); 580 nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg);
578 else if (rq->bio) 581 else if (rq->bio)
579 nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); 582 nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
580 583
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 03a534820271..48bebf00a5f3 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * CPU <-> hardware queue mapping helpers 3 * CPU <-> hardware queue mapping helpers
3 * 4 *
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ec1d18cb643c..6aea0ebc3a73 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -1,17 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright (C) 2017 Facebook 3 * Copyright (C) 2017 Facebook
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <https://www.gnu.org/licenses/>.
15 */ 4 */
16 5
17#include <linux/kernel.h> 6#include <linux/kernel.h>
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index 1dce18553984..ad4545a2a98b 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright (c) 2016 Christoph Hellwig. 3 * Copyright (c) 2016 Christoph Hellwig.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */ 4 */
13#include <linux/kobject.h> 5#include <linux/kobject.h>
14#include <linux/blkdev.h> 6#include <linux/blkdev.h>
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index 45030a81a1ed..cc921e6ba709 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright (c) 2017 Sagi Grimberg. 3 * Copyright (c) 2017 Sagi Grimberg.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */ 4 */
13#include <linux/blk-mq.h> 5#include <linux/blk-mq.h>
14#include <linux/blk-mq-rdma.h> 6#include <linux/blk-mq-rdma.h>
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index aa6bc5c02643..74c6bb871f7e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * blk-mq scheduling framework 3 * blk-mq scheduling framework
3 * 4 *
@@ -413,6 +414,14 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
413 struct list_head *list, bool run_queue_async) 414 struct list_head *list, bool run_queue_async)
414{ 415{
415 struct elevator_queue *e; 416 struct elevator_queue *e;
417 struct request_queue *q = hctx->queue;
418
419 /*
420 * blk_mq_sched_insert_requests() is called from flush plug
 421	 * context only, and holds one usage counter to prevent the
 422	 * queue from being released.
423 */
424 percpu_ref_get(&q->q_usage_counter);
416 425
417 e = hctx->queue->elevator; 426 e = hctx->queue->elevator;
418 if (e && e->type->ops.insert_requests) 427 if (e && e->type->ops.insert_requests)
@@ -426,12 +435,14 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
426 if (!hctx->dispatch_busy && !e && !run_queue_async) { 435 if (!hctx->dispatch_busy && !e && !run_queue_async) {
427 blk_mq_try_issue_list_directly(hctx, list); 436 blk_mq_try_issue_list_directly(hctx, list);
428 if (list_empty(list)) 437 if (list_empty(list))
429 return; 438 goto out;
430 } 439 }
431 blk_mq_insert_requests(hctx, ctx, list); 440 blk_mq_insert_requests(hctx, ctx, list);
432 } 441 }
433 442
434 blk_mq_run_hw_queue(hctx, run_queue_async); 443 blk_mq_run_hw_queue(hctx, run_queue_async);
444 out:
445 percpu_ref_put(&q->q_usage_counter);
435} 446}
436 447
437static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, 448static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
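
For reference, the q_usage_counter hunk above pins the queue for the whole insert path: take a reference before touching the queue, and drop it on every exit. A rough user-space sketch of that pattern follows; it is not kernel code, and all names (queue, queue_get, insert_requests) are invented for illustration only.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct queue {
        atomic_int usage;              /* rough analogue of q->q_usage_counter */
        bool       dying;              /* set once teardown has started */
    };

    static bool queue_get(struct queue *q)
    {
        if (q->dying)
            return false;
        atomic_fetch_add(&q->usage, 1);
        return true;
    }

    static void queue_put(struct queue *q)
    {
        atomic_fetch_sub(&q->usage, 1);
    }

    static void insert_requests(struct queue *q)
    {
        if (!queue_get(q))
            return;
        /* ... issue or insert the plugged requests here ... */
        queue_put(q);                  /* every exit path drops the reference */
    }

    int main(void)
    {
        struct queue q = {0};

        insert_requests(&q);
        printf("usage after insert: %d\n", atomic_load(&q.usage));
        return 0;
    }
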
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 5315e538b3b1..d6e1a9bd7131 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/module.h> 3#include <linux/module.h>
3#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
@@ -10,6 +11,7 @@
10#include <linux/smp.h> 11#include <linux/smp.h>
11 12
12#include <linux/blk-mq.h> 13#include <linux/blk-mq.h>
14#include "blk.h"
13#include "blk-mq.h" 15#include "blk-mq.h"
14#include "blk-mq-tag.h" 16#include "blk-mq-tag.h"
15 17
@@ -33,6 +35,13 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
33{ 35{
34 struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, 36 struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
35 kobj); 37 kobj);
38
39 cancel_delayed_work_sync(&hctx->run_work);
40
41 if (hctx->flags & BLK_MQ_F_BLOCKING)
42 cleanup_srcu_struct(hctx->srcu);
43 blk_free_flush_queue(hctx->fq);
44 sbitmap_free(&hctx->ctx_map);
36 free_cpumask_var(hctx->cpumask); 45 free_cpumask_var(hctx->cpumask);
37 kfree(hctx->ctxs); 46 kfree(hctx->ctxs);
38 kfree(hctx); 47 kfree(hctx);
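
The blk-mq-sysfs.c hunk above moves hardware-queue teardown into the kobject release handler, so resources are only freed once the last reference is gone. A minimal user-space analogy of "free everything in release(), called from the final put()" is sketched below; hw_ctx, hw_ctx_put and the plain integer refcount are invented stand-ins (the kernel uses kobject reference counting).

    #include <stdlib.h>

    struct hw_ctx {
        int  refs;
        int *ctx_map;                  /* a resource now freed at release time */
    };

    static void hw_ctx_release(struct hw_ctx *hctx)
    {
        /* All teardown lives here, so it runs only after the last user is gone. */
        free(hctx->ctx_map);
        free(hctx);
    }

    static void hw_ctx_put(struct hw_ctx *hctx)
    {
        if (--hctx->refs == 0)
            hw_ctx_release(hctx);
    }

    int main(void)
    {
        struct hw_ctx *hctx = calloc(1, sizeof(*hctx));

        hctx->refs = 2;                /* e.g. the queue and a sysfs reader */
        hctx->ctx_map = calloc(8, sizeof(int));

        hw_ctx_put(hctx);              /* still referenced: nothing freed yet */
        hw_ctx_put(hctx);              /* last put: release runs, memory freed */
        return 0;
    }
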
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index a4931fc7be8a..7513c8eaabee 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Tag allocation using scalable bitmaps. Uses active queue tracking to support 3 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
3 * fairer distribution of tags between multiple submitters when a shared tag map 4 * fairer distribution of tags between multiple submitters when a shared tag map
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index 370827163835..75a52c18a8f6 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -1,14 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright (c) 2016 Christoph Hellwig. 3 * Copyright (c) 2016 Christoph Hellwig.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */ 4 */
13#include <linux/device.h> 5#include <linux/device.h>
14#include <linux/blk-mq.h> 6#include <linux/blk-mq.h>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fc60ed7e940e..08a6248d8536 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Block multiqueue core code 3 * Block multiqueue core code
3 * 4 *
@@ -2062,7 +2063,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2062 list_del_init(&page->lru); 2063 list_del_init(&page->lru);
2063 /* 2064 /*
2064 * Remove kmemleak object previously allocated in 2065 * Remove kmemleak object previously allocated in
2065 * blk_mq_init_rq_map(). 2066 * blk_mq_alloc_rqs().
2066 */ 2067 */
2067 kmemleak_free(page_address(page)); 2068 kmemleak_free(page_address(page));
2068 __free_pages(page, page->private); 2069 __free_pages(page, page->private);
@@ -2267,12 +2268,11 @@ static void blk_mq_exit_hctx(struct request_queue *q,
2267 if (set->ops->exit_hctx) 2268 if (set->ops->exit_hctx)
2268 set->ops->exit_hctx(hctx, hctx_idx); 2269 set->ops->exit_hctx(hctx, hctx_idx);
2269 2270
2270 if (hctx->flags & BLK_MQ_F_BLOCKING)
2271 cleanup_srcu_struct(hctx->srcu);
2272
2273 blk_mq_remove_cpuhp(hctx); 2271 blk_mq_remove_cpuhp(hctx);
2274 blk_free_flush_queue(hctx->fq); 2272
2275 sbitmap_free(&hctx->ctx_map); 2273 spin_lock(&q->unused_hctx_lock);
2274 list_add(&hctx->hctx_list, &q->unused_hctx_list);
2275 spin_unlock(&q->unused_hctx_lock);
2276} 2276}
2277 2277
2278static void blk_mq_exit_hw_queues(struct request_queue *q, 2278static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -2289,15 +2289,65 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
2289 } 2289 }
2290} 2290}
2291 2291
2292static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2293{
2294 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2295
2296 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2297 __alignof__(struct blk_mq_hw_ctx)) !=
2298 sizeof(struct blk_mq_hw_ctx));
2299
2300 if (tag_set->flags & BLK_MQ_F_BLOCKING)
2301 hw_ctx_size += sizeof(struct srcu_struct);
2302
2303 return hw_ctx_size;
2304}
2305
2292static int blk_mq_init_hctx(struct request_queue *q, 2306static int blk_mq_init_hctx(struct request_queue *q,
2293 struct blk_mq_tag_set *set, 2307 struct blk_mq_tag_set *set,
2294 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 2308 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
2295{ 2309{
2296 int node; 2310 hctx->queue_num = hctx_idx;
2311
2312 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2313
2314 hctx->tags = set->tags[hctx_idx];
2315
2316 if (set->ops->init_hctx &&
2317 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2318 goto unregister_cpu_notifier;
2319
2320 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2321 hctx->numa_node))
2322 goto exit_hctx;
2323 return 0;
2324
2325 exit_hctx:
2326 if (set->ops->exit_hctx)
2327 set->ops->exit_hctx(hctx, hctx_idx);
2328 unregister_cpu_notifier:
2329 blk_mq_remove_cpuhp(hctx);
2330 return -1;
2331}
2332
2333static struct blk_mq_hw_ctx *
2334blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2335 int node)
2336{
2337 struct blk_mq_hw_ctx *hctx;
2338 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2339
2340 hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2341 if (!hctx)
2342 goto fail_alloc_hctx;
2297 2343
2298 node = hctx->numa_node; 2344 if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2345 goto free_hctx;
2346
2347 atomic_set(&hctx->nr_active, 0);
2299 if (node == NUMA_NO_NODE) 2348 if (node == NUMA_NO_NODE)
2300 node = hctx->numa_node = set->numa_node; 2349 node = set->numa_node;
2350 hctx->numa_node = node;
2301 2351
2302 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 2352 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
2303 spin_lock_init(&hctx->lock); 2353 spin_lock_init(&hctx->lock);
@@ -2305,58 +2355,47 @@ static int blk_mq_init_hctx(struct request_queue *q,
2305 hctx->queue = q; 2355 hctx->queue = q;
2306 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 2356 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2307 2357
2308 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 2358 INIT_LIST_HEAD(&hctx->hctx_list);
2309
2310 hctx->tags = set->tags[hctx_idx];
2311 2359
2312 /* 2360 /*
2313 * Allocate space for all possible cpus to avoid allocation at 2361 * Allocate space for all possible cpus to avoid allocation at
2314 * runtime 2362 * runtime
2315 */ 2363 */
2316 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 2364 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2317 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); 2365 gfp, node);
2318 if (!hctx->ctxs) 2366 if (!hctx->ctxs)
2319 goto unregister_cpu_notifier; 2367 goto free_cpumask;
2320 2368
2321 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), 2369 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
2322 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node)) 2370 gfp, node))
2323 goto free_ctxs; 2371 goto free_ctxs;
2324
2325 hctx->nr_ctx = 0; 2372 hctx->nr_ctx = 0;
2326 2373
2327 spin_lock_init(&hctx->dispatch_wait_lock); 2374 spin_lock_init(&hctx->dispatch_wait_lock);
2328 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 2375 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2329 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 2376 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2330 2377
2331 if (set->ops->init_hctx &&
2332 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2333 goto free_bitmap;
2334
2335 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, 2378 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
2336 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); 2379 gfp);
2337 if (!hctx->fq) 2380 if (!hctx->fq)
2338 goto exit_hctx; 2381 goto free_bitmap;
2339
2340 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2341 goto free_fq;
2342 2382
2343 if (hctx->flags & BLK_MQ_F_BLOCKING) 2383 if (hctx->flags & BLK_MQ_F_BLOCKING)
2344 init_srcu_struct(hctx->srcu); 2384 init_srcu_struct(hctx->srcu);
2385 blk_mq_hctx_kobj_init(hctx);
2345 2386
2346 return 0; 2387 return hctx;
2347 2388
2348 free_fq:
2349 blk_free_flush_queue(hctx->fq);
2350 exit_hctx:
2351 if (set->ops->exit_hctx)
2352 set->ops->exit_hctx(hctx, hctx_idx);
2353 free_bitmap: 2389 free_bitmap:
2354 sbitmap_free(&hctx->ctx_map); 2390 sbitmap_free(&hctx->ctx_map);
2355 free_ctxs: 2391 free_ctxs:
2356 kfree(hctx->ctxs); 2392 kfree(hctx->ctxs);
2357 unregister_cpu_notifier: 2393 free_cpumask:
2358 blk_mq_remove_cpuhp(hctx); 2394 free_cpumask_var(hctx->cpumask);
2359 return -1; 2395 free_hctx:
2396 kfree(hctx);
2397 fail_alloc_hctx:
2398 return NULL;
2360} 2399}
2361 2400
2362static void blk_mq_init_cpu_queues(struct request_queue *q, 2401static void blk_mq_init_cpu_queues(struct request_queue *q,
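
The relocated blk_mq_hw_ctx_size() above sizes the hctx allocation so that the SRCU area is only paid for by blocking queues, relying on srcu being the last member. The idea can be sketched with a flexible trailing member in plain C; srcu_like, hw_ctx and hw_ctx_alloc below are invented for illustration and are not the kernel definitions.

    #include <stddef.h>
    #include <stdlib.h>

    struct srcu_like { long state[4]; };      /* stand-in for struct srcu_struct */

    struct hw_ctx {
        int flags;
        /* must remain the last member; storage is only present when needed */
        struct srcu_like srcu[];
    };

    static size_t hw_ctx_size(int blocking)
    {
        size_t size = sizeof(struct hw_ctx);

        if (blocking)                          /* BLK_MQ_F_BLOCKING analogue */
            size += sizeof(struct srcu_like);
        return size;
    }

    static struct hw_ctx *hw_ctx_alloc(int blocking)
    {
        return calloc(1, hw_ctx_size(blocking));
    }

    int main(void)
    {
        struct hw_ctx *a = hw_ctx_alloc(0);    /* no trailing SRCU area */
        struct hw_ctx *b = hw_ctx_alloc(1);    /* with trailing SRCU area */

        free(a);
        free(b);
        return 0;
    }
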
@@ -2631,13 +2670,17 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
2631 */ 2670 */
2632void blk_mq_release(struct request_queue *q) 2671void blk_mq_release(struct request_queue *q)
2633{ 2672{
2634 struct blk_mq_hw_ctx *hctx; 2673 struct blk_mq_hw_ctx *hctx, *next;
2635 unsigned int i; 2674 int i;
2636 2675
2637 /* hctx kobj stays in hctx */ 2676 cancel_delayed_work_sync(&q->requeue_work);
2638 queue_for_each_hw_ctx(q, hctx, i) { 2677
2639 if (!hctx) 2678 queue_for_each_hw_ctx(q, hctx, i)
2640 continue; 2679 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
2680
2681 /* all hctx are in .unused_hctx_list now */
2682 list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
2683 list_del_init(&hctx->hctx_list);
2641 kobject_put(&hctx->kobj); 2684 kobject_put(&hctx->kobj);
2642 } 2685 }
2643 2686
@@ -2700,51 +2743,38 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
2700} 2743}
2701EXPORT_SYMBOL(blk_mq_init_sq_queue); 2744EXPORT_SYMBOL(blk_mq_init_sq_queue);
2702 2745
2703static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2704{
2705 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2706
2707 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2708 __alignof__(struct blk_mq_hw_ctx)) !=
2709 sizeof(struct blk_mq_hw_ctx));
2710
2711 if (tag_set->flags & BLK_MQ_F_BLOCKING)
2712 hw_ctx_size += sizeof(struct srcu_struct);
2713
2714 return hw_ctx_size;
2715}
2716
2717static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( 2746static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
2718 struct blk_mq_tag_set *set, struct request_queue *q, 2747 struct blk_mq_tag_set *set, struct request_queue *q,
2719 int hctx_idx, int node) 2748 int hctx_idx, int node)
2720{ 2749{
2721 struct blk_mq_hw_ctx *hctx; 2750 struct blk_mq_hw_ctx *hctx = NULL, *tmp;
2722
2723 hctx = kzalloc_node(blk_mq_hw_ctx_size(set),
2724 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2725 node);
2726 if (!hctx)
2727 return NULL;
2728 2751
2729 if (!zalloc_cpumask_var_node(&hctx->cpumask, 2752 /* reuse dead hctx first */
2730 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2753 spin_lock(&q->unused_hctx_lock);
2731 node)) { 2754 list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
2732 kfree(hctx); 2755 if (tmp->numa_node == node) {
2733 return NULL; 2756 hctx = tmp;
2757 break;
2758 }
2734 } 2759 }
2760 if (hctx)
2761 list_del_init(&hctx->hctx_list);
2762 spin_unlock(&q->unused_hctx_lock);
2735 2763
2736 atomic_set(&hctx->nr_active, 0); 2764 if (!hctx)
2737 hctx->numa_node = node; 2765 hctx = blk_mq_alloc_hctx(q, set, node);
2738 hctx->queue_num = hctx_idx; 2766 if (!hctx)
2767 goto fail;
2739 2768
2740 if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) { 2769 if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
2741 free_cpumask_var(hctx->cpumask); 2770 goto free_hctx;
2742 kfree(hctx);
2743 return NULL;
2744 }
2745 blk_mq_hctx_kobj_init(hctx);
2746 2771
2747 return hctx; 2772 return hctx;
2773
2774 free_hctx:
2775 kobject_put(&hctx->kobj);
2776 fail:
2777 return NULL;
2748} 2778}
2749 2779
2750static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 2780static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
@@ -2770,10 +2800,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2770 2800
2771 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); 2801 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
2772 if (hctx) { 2802 if (hctx) {
2773 if (hctxs[i]) { 2803 if (hctxs[i])
2774 blk_mq_exit_hctx(q, set, hctxs[i], i); 2804 blk_mq_exit_hctx(q, set, hctxs[i], i);
2775 kobject_put(&hctxs[i]->kobj);
2776 }
2777 hctxs[i] = hctx; 2805 hctxs[i] = hctx;
2778 } else { 2806 } else {
2779 if (hctxs[i]) 2807 if (hctxs[i])
@@ -2804,9 +2832,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2804 if (hctx->tags) 2832 if (hctx->tags)
2805 blk_mq_free_map_and_requests(set, j); 2833 blk_mq_free_map_and_requests(set, j);
2806 blk_mq_exit_hctx(q, set, hctx, j); 2834 blk_mq_exit_hctx(q, set, hctx, j);
2807 kobject_put(&hctx->kobj);
2808 hctxs[j] = NULL; 2835 hctxs[j] = NULL;
2809
2810 } 2836 }
2811 } 2837 }
2812 mutex_unlock(&q->sysfs_lock); 2838 mutex_unlock(&q->sysfs_lock);
@@ -2849,6 +2875,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2849 if (!q->queue_hw_ctx) 2875 if (!q->queue_hw_ctx)
2850 goto err_sys_init; 2876 goto err_sys_init;
2851 2877
2878 INIT_LIST_HEAD(&q->unused_hctx_list);
2879 spin_lock_init(&q->unused_hctx_lock);
2880
2852 blk_mq_realloc_hw_ctxs(set, q); 2881 blk_mq_realloc_hw_ctxs(set, q);
2853 if (!q->nr_hw_queues) 2882 if (!q->nr_hw_queues)
2854 goto err_hctxs; 2883 goto err_hctxs;
@@ -2905,7 +2934,8 @@ err_exit:
2905} 2934}
2906EXPORT_SYMBOL(blk_mq_init_allocated_queue); 2935EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2907 2936
2908void blk_mq_free_queue(struct request_queue *q) 2937/* tags can _not_ be used after returning from blk_mq_exit_queue */
2938void blk_mq_exit_queue(struct request_queue *q)
2909{ 2939{
2910 struct blk_mq_tag_set *set = q->tag_set; 2940 struct blk_mq_tag_set *set = q->tag_set;
2911 2941
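
The blk_mq_alloc_and_init_hctx() hunk above first tries to reuse a retired hctx from q->unused_hctx_list that matches the requested NUMA node, and only allocates a new one when nothing suitable is parked there. A small user-space free-list sketch of that reuse-or-allocate pattern follows; hctx, queue and alloc_or_reuse_hctx are invented names, not the kernel API.

    #include <stdlib.h>

    struct hctx {
        int          numa_node;
        struct hctx *next;                 /* stands in for the hctx_list link */
    };

    struct queue {
        struct hctx *unused;               /* stands in for q->unused_hctx_list */
    };

    static struct hctx *alloc_or_reuse_hctx(struct queue *q, int node)
    {
        struct hctx **pp, *h;

        /* Prefer a previously retired hctx that sits on the same NUMA node. */
        for (pp = &q->unused; (h = *pp) != NULL; pp = &h->next) {
            if (h->numa_node == node) {
                *pp = h->next;
                return h;
            }
        }

        /* Nothing suitable to reuse: allocate a fresh one. */
        h = calloc(1, sizeof(*h));
        if (h)
            h->numa_node = node;
        return h;
    }

    int main(void)
    {
        struct queue q = {0};
        struct hctx *h = alloc_or_reuse_hctx(&q, 0);

        /* Retiring h would put it back on q.unused for later reuse. */
        free(h);
        return 0;
    }
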
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 423ea88ab6fb..633a5a77ee8b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -37,7 +37,7 @@ struct blk_mq_ctx {
37 struct kobject kobj; 37 struct kobject kobj;
38} ____cacheline_aligned_in_smp; 38} ____cacheline_aligned_in_smp;
39 39
40void blk_mq_free_queue(struct request_queue *q); 40void blk_mq_exit_queue(struct request_queue *q);
41int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); 41int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
42void blk_mq_wake_waiters(struct request_queue *q); 42void blk_mq_wake_waiters(struct request_queue *q);
43bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); 43bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d169d7188fa6..3f55b56f24bc 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -1,3 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0
2
1#include "blk-rq-qos.h" 3#include "blk-rq-qos.h"
2 4
3/* 5/*
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 564851889550..2300e038b9fa 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1#ifndef RQ_QOS_H 2#ifndef RQ_QOS_H
2#define RQ_QOS_H 3#define RQ_QOS_H
3 4
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 6375afaedcec..3facc41476be 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Functions related to setting various queue properties from drivers 3 * Functions related to setting various queue properties from drivers
3 */ 4 */
@@ -663,22 +664,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
663EXPORT_SYMBOL(disk_stack_limits); 664EXPORT_SYMBOL(disk_stack_limits);
664 665
665/** 666/**
666 * blk_queue_dma_pad - set pad mask
667 * @q: the request queue for the device
668 * @mask: pad mask
669 *
670 * Set dma pad mask.
671 *
672 * Appending pad buffer to a request modifies the last entry of a
673 * scatter list such that it includes the pad buffer.
674 **/
675void blk_queue_dma_pad(struct request_queue *q, unsigned int mask)
676{
677 q->dma_pad_mask = mask;
678}
679EXPORT_SYMBOL(blk_queue_dma_pad);
680
681/**
682 * blk_queue_update_dma_pad - update pad mask 667 * blk_queue_update_dma_pad - update pad mask
683 * @q: the request queue for the device 668 * @q: the request queue for the device
684 * @mask: pad mask 669 * @mask: pad mask
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 696a04176e4d..940f15d600f8 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Block stat tracking code 3 * Block stat tracking code
3 * 4 *
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7a95a1eb27e1..a16a02c52a85 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -728,7 +728,7 @@ static struct queue_sysfs_entry throtl_sample_time_entry = {
728}; 728};
729#endif 729#endif
730 730
731static struct attribute *default_attrs[] = { 731static struct attribute *queue_attrs[] = {
732 &queue_requests_entry.attr, 732 &queue_requests_entry.attr,
733 &queue_ra_entry.attr, 733 &queue_ra_entry.attr,
734 &queue_max_hw_sectors_entry.attr, 734 &queue_max_hw_sectors_entry.attr,
@@ -769,7 +769,25 @@ static struct attribute *default_attrs[] = {
769#endif 769#endif
770 NULL, 770 NULL,
771}; 771};
772ATTRIBUTE_GROUPS(default); 772
773static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
774 int n)
775{
776 struct request_queue *q =
777 container_of(kobj, struct request_queue, kobj);
778
779 if (attr == &queue_io_timeout_entry.attr &&
780 (!q->mq_ops || !q->mq_ops->timeout))
781 return 0;
782
783 return attr->mode;
784}
785
786static struct attribute_group queue_attr_group = {
787 .attrs = queue_attrs,
788 .is_visible = queue_attr_visible,
789};
790
773 791
774#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 792#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
775 793
@@ -891,7 +909,6 @@ static const struct sysfs_ops queue_sysfs_ops = {
891 909
892struct kobj_type blk_queue_ktype = { 910struct kobj_type blk_queue_ktype = {
893 .sysfs_ops = &queue_sysfs_ops, 911 .sysfs_ops = &queue_sysfs_ops,
894 .default_groups = default_groups,
895 .release = blk_release_queue, 912 .release = blk_release_queue,
896}; 913};
897 914
@@ -940,6 +957,14 @@ int blk_register_queue(struct gendisk *disk)
940 goto unlock; 957 goto unlock;
941 } 958 }
942 959
960 ret = sysfs_create_group(&q->kobj, &queue_attr_group);
961 if (ret) {
962 blk_trace_remove_sysfs(dev);
963 kobject_del(&q->kobj);
964 kobject_put(&dev->kobj);
965 goto unlock;
966 }
967
943 if (queue_is_mq(q)) { 968 if (queue_is_mq(q)) {
944 __blk_mq_register_dev(dev, q); 969 __blk_mq_register_dev(dev, q);
945 blk_mq_debugfs_register(q); 970 blk_mq_debugfs_register(q);
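
The blk-sysfs.c change above replaces the unconditional default attribute group with one that has an is_visible hook, so queue_io_timeout is hidden when the driver provides no timeout handler. The gist of filtering an attribute array through a visibility callback can be sketched in plain C; the struct attribute here and the has_timeout_hook flag are simplified stand-ins, not the sysfs types.

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    struct attribute { const char *name; int mode; };

    /* Visibility hook in the spirit of queue_attr_visible(): return 0 to hide. */
    static int attr_visible(const struct attribute *attr, int has_timeout_hook)
    {
        if (!strcmp(attr->name, "io_timeout") && !has_timeout_hook)
            return 0;
        return attr->mode;
    }

    int main(void)
    {
        const struct attribute attrs[] = {
            { "nr_requests", 0644 },
            { "io_timeout",  0644 },
        };

        for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
            if (attr_visible(&attrs[i], /*has_timeout_hook=*/0))
                printf("exposing %s\n", attrs[i].name);
        return 0;
    }
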
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 124c26128bf6..8aa68fae96ad 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Functions related to generic timeout handling of requests. 3 * Functions related to generic timeout handling of requests.
3 */ 4 */
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index fd166fbb0f65..313f45a37e9d 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * buffered writeback throttling. loosely based on CoDel. We can't drop 3 * buffered writeback throttling. loosely based on CoDel. We can't drop
3 * packets for IO scheduling, so the logic is something like this: 4 * packets for IO scheduling, so the logic is something like this:
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 2d98803faec2..ae7e91bd0618 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Zoned block device handling 3 * Zoned block device handling
3 * 4 *
diff --git a/block/blk.h b/block/blk.h
index 5d636ee41663..e27fd1512e4b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -75,7 +75,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
75 75
76 if (addr1 + vec1->bv_len != addr2) 76 if (addr1 + vec1->bv_len != addr2)
77 return false; 77 return false;
78 if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2)) 78 if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page))
79 return false; 79 return false;
80 if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) 80 if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask))
81 return false; 81 return false;
diff --git a/block/bounce.c b/block/bounce.c
index 47eb7e936e22..f8ed677a1bf7 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -163,14 +163,13 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
163{ 163{
164 struct bio *bio_orig = bio->bi_private; 164 struct bio *bio_orig = bio->bi_private;
165 struct bio_vec *bvec, orig_vec; 165 struct bio_vec *bvec, orig_vec;
166 int i;
167 struct bvec_iter orig_iter = bio_orig->bi_iter; 166 struct bvec_iter orig_iter = bio_orig->bi_iter;
168 struct bvec_iter_all iter_all; 167 struct bvec_iter_all iter_all;
169 168
170 /* 169 /*
171 * free up bounce indirect pages used 170 * free up bounce indirect pages used
172 */ 171 */
173 bio_for_each_segment_all(bvec, bio, i, iter_all) { 172 bio_for_each_segment_all(bvec, bio, iter_all) {
174 orig_vec = bio_iter_iovec(bio_orig, orig_iter); 173 orig_vec = bio_iter_iovec(bio_orig, orig_iter);
175 if (bvec->bv_page != orig_vec.bv_page) { 174 if (bvec->bv_page != orig_vec.bv_page) {
176 dec_zone_page_state(bvec->bv_page, NR_BOUNCE); 175 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 005e2b75d775..b898a1cdf872 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -1,24 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/* 2/*
2 * BSG helper library 3 * BSG helper library
3 * 4 *
4 * Copyright (C) 2008 James Smart, Emulex Corporation 5 * Copyright (C) 2008 James Smart, Emulex Corporation
5 * Copyright (C) 2011 Red Hat, Inc. All rights reserved. 6 * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
6 * Copyright (C) 2011 Mike Christie 7 * Copyright (C) 2011 Mike Christie
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 */ 8 */
23#include <linux/slab.h> 9#include <linux/slab.h>
24#include <linux/blk-mq.h> 10#include <linux/blk-mq.h>
diff --git a/block/bsg.c b/block/bsg.c
index f306853c6b08..833c44b3d458 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -1,13 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * bsg.c - block layer implementation of the sg v4 interface 3 * bsg.c - block layer implementation of the sg v4 interface
3 *
4 * Copyright (C) 2004 Jens Axboe <axboe@suse.de> SUSE Labs
5 * Copyright (C) 2004 Peter M. Jones <pjones@redhat.com>
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License version 2. See the file "COPYING" in the main directory of this
9 * archive for more details.
10 *
11 */ 4 */
12#include <linux/module.h> 5#include <linux/module.h>
13#include <linux/init.h> 6#include <linux/init.h>
diff --git a/block/elevator.c b/block/elevator.c
index d6d835a08de6..ec55d5fc0b3e 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Block device elevator/IO-scheduler. 3 * Block device elevator/IO-scheduler.
3 * 4 *
@@ -509,8 +510,6 @@ void elv_unregister_queue(struct request_queue *q)
509 510
510int elv_register(struct elevator_type *e) 511int elv_register(struct elevator_type *e)
511{ 512{
512 char *def = "";
513
514 /* create icq_cache if requested */ 513 /* create icq_cache if requested */
515 if (e->icq_size) { 514 if (e->icq_size) {
516 if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || 515 if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
@@ -535,8 +534,8 @@ int elv_register(struct elevator_type *e)
535 list_add_tail(&e->list, &elv_list); 534 list_add_tail(&e->list, &elv_list);
536 spin_unlock(&elv_list_lock); 535 spin_unlock(&elv_list_lock);
537 536
538 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 537 printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);
539 def); 538
540 return 0; 539 return 0;
541} 540}
542EXPORT_SYMBOL_GPL(elv_register); 541EXPORT_SYMBOL_GPL(elv_register);
diff --git a/block/genhd.c b/block/genhd.c
index 703267865f14..ad6826628e79 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * gendisk handling 3 * gendisk handling
3 */ 4 */
@@ -531,6 +532,18 @@ void blk_free_devt(dev_t devt)
531 } 532 }
532} 533}
533 534
535/**
 536 * We invalidate devt by assigning a NULL pointer for devt in the idr.
537 */
538void blk_invalidate_devt(dev_t devt)
539{
540 if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
541 spin_lock_bh(&ext_devt_lock);
542 idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt)));
543 spin_unlock_bh(&ext_devt_lock);
544 }
545}
546
534static char *bdevt_str(dev_t devt, char *buf) 547static char *bdevt_str(dev_t devt, char *buf)
535{ 548{
536 if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { 549 if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
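
blk_invalidate_devt() above makes future lookups of the devt fail by swapping the stored gendisk pointer for NULL while leaving the slot itself allocated. A toy table-based sketch of that "invalidate but keep reserved" idea follows; the fixed array and function names are invented stand-ins for the ext_devt_idr and idr_replace().

    #include <stddef.h>

    #define MAX_EXT_DEVT 64

    static void *ext_devt_table[MAX_EXT_DEVT];   /* stand-in for ext_devt_idr */

    /* Make future lookups of this devt fail while keeping the slot reserved. */
    static void invalidate_devt(unsigned int idx)
    {
        if (idx < MAX_EXT_DEVT)
            ext_devt_table[idx] = NULL;          /* idr_replace(..., NULL, idx) analogue */
    }

    static void *lookup_devt(unsigned int idx)
    {
        return idx < MAX_EXT_DEVT ? ext_devt_table[idx] : NULL;
    }

    int main(void)
    {
        int disk = 42;

        ext_devt_table[0] = &disk;
        invalidate_devt(0);
        return lookup_devt(0) == NULL ? 0 : 1;   /* lookup now fails */
    }
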
@@ -793,6 +806,13 @@ void del_gendisk(struct gendisk *disk)
793 806
794 if (!(disk->flags & GENHD_FL_HIDDEN)) 807 if (!(disk->flags & GENHD_FL_HIDDEN))
795 blk_unregister_region(disk_devt(disk), disk->minors); 808 blk_unregister_region(disk_devt(disk), disk->minors);
809 /*
810 * Remove gendisk pointer from idr so that it cannot be looked up
 811	 * during the RCU grace period that runs before the gendisk is freed,
 812	 * preventing use-after-free issues. Note that the device number stays
813 * "in-use" until we really free the gendisk.
814 */
815 blk_invalidate_devt(disk_devt(disk));
796 816
797 kobject_put(disk->part0.holder_dir); 817 kobject_put(disk->part0.holder_dir);
798 kobject_put(disk->slave_dir); 818 kobject_put(disk->slave_dir);
@@ -1628,12 +1648,11 @@ static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1628 1648
1629 /* 1649 /*
1630 * If device-specific poll interval is set, always use it. If 1650 * If device-specific poll interval is set, always use it. If
1631 * the default is being used, poll iff there are events which 1651 * the default is being used, poll if the POLL flag is set.
1632 * can't be monitored asynchronously.
1633 */ 1652 */
1634 if (ev->poll_msecs >= 0) 1653 if (ev->poll_msecs >= 0)
1635 intv_msecs = ev->poll_msecs; 1654 intv_msecs = ev->poll_msecs;
1636 else if (disk->events & ~disk->async_events) 1655 else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
1637 intv_msecs = disk_events_dfl_poll_msecs; 1656 intv_msecs = disk_events_dfl_poll_msecs;
1638 1657
1639 return msecs_to_jiffies(intv_msecs); 1658 return msecs_to_jiffies(intv_msecs);
@@ -1843,11 +1862,13 @@ static void disk_check_events(struct disk_events *ev,
1843 1862
1844 /* 1863 /*
1845 * Tell userland about new events. Only the events listed in 1864 * Tell userland about new events. Only the events listed in
1846 * @disk->events are reported. Unlisted events are processed the 1865 * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
1847 * same internally but never get reported to userland. 1866 * is set. Otherwise, events are processed internally but never
1867 * get reported to userland.
1848 */ 1868 */
1849 for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) 1869 for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1850 if (events & disk->events & (1 << i)) 1870 if ((events & disk->events & (1 << i)) &&
1871 (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
1851 envp[nr_events++] = disk_uevents[i]; 1872 envp[nr_events++] = disk_uevents[i];
1852 1873
1853 if (nr_events) 1874 if (nr_events)
@@ -1860,6 +1881,7 @@ static void disk_check_events(struct disk_events *ev,
1860 * 1881 *
1861 * events : list of all supported events 1882 * events : list of all supported events
1862 * events_async : list of events which can be detected w/o polling 1883 * events_async : list of events which can be detected w/o polling
1884 * (always empty, only for backwards compatibility)
1863 * events_poll_msecs : polling interval, 0: disable, -1: system default 1885 * events_poll_msecs : polling interval, 0: disable, -1: system default
1864 */ 1886 */
1865static ssize_t __disk_events_show(unsigned int events, char *buf) 1887static ssize_t __disk_events_show(unsigned int events, char *buf)
@@ -1884,15 +1906,16 @@ static ssize_t disk_events_show(struct device *dev,
1884{ 1906{
1885 struct gendisk *disk = dev_to_disk(dev); 1907 struct gendisk *disk = dev_to_disk(dev);
1886 1908
1909 if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
1910 return 0;
1911
1887 return __disk_events_show(disk->events, buf); 1912 return __disk_events_show(disk->events, buf);
1888} 1913}
1889 1914
1890static ssize_t disk_events_async_show(struct device *dev, 1915static ssize_t disk_events_async_show(struct device *dev,
1891 struct device_attribute *attr, char *buf) 1916 struct device_attribute *attr, char *buf)
1892{ 1917{
1893 struct gendisk *disk = dev_to_disk(dev); 1918 return 0;
1894
1895 return __disk_events_show(disk->async_events, buf);
1896} 1919}
1897 1920
1898static ssize_t disk_events_poll_msecs_show(struct device *dev, 1921static ssize_t disk_events_poll_msecs_show(struct device *dev,
@@ -1901,6 +1924,9 @@ static ssize_t disk_events_poll_msecs_show(struct device *dev,
1901{ 1924{
1902 struct gendisk *disk = dev_to_disk(dev); 1925 struct gendisk *disk = dev_to_disk(dev);
1903 1926
1927 if (!disk->ev)
1928 return sprintf(buf, "-1\n");
1929
1904 return sprintf(buf, "%ld\n", disk->ev->poll_msecs); 1930 return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1905} 1931}
1906 1932
@@ -1917,6 +1943,9 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev,
1917 if (intv < 0 && intv != -1) 1943 if (intv < 0 && intv != -1)
1918 return -EINVAL; 1944 return -EINVAL;
1919 1945
1946 if (!disk->ev)
1947 return -ENODEV;
1948
1920 disk_block_events(disk); 1949 disk_block_events(disk);
1921 disk->ev->poll_msecs = intv; 1950 disk->ev->poll_msecs = intv;
1922 __disk_unblock_events(disk, true); 1951 __disk_unblock_events(disk, true);
@@ -1981,7 +2010,7 @@ static void disk_alloc_events(struct gendisk *disk)
1981{ 2010{
1982 struct disk_events *ev; 2011 struct disk_events *ev;
1983 2012
1984 if (!disk->fops->check_events) 2013 if (!disk->fops->check_events || !disk->events)
1985 return; 2014 return;
1986 2015
1987 ev = kzalloc(sizeof(*ev), GFP_KERNEL); 2016 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
@@ -2003,14 +2032,14 @@ static void disk_alloc_events(struct gendisk *disk)
2003 2032
2004static void disk_add_events(struct gendisk *disk) 2033static void disk_add_events(struct gendisk *disk)
2005{ 2034{
2006 if (!disk->ev)
2007 return;
2008
2009 /* FIXME: error handling */ 2035 /* FIXME: error handling */
2010 if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) 2036 if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
2011 pr_warn("%s: failed to create sysfs files for events\n", 2037 pr_warn("%s: failed to create sysfs files for events\n",
2012 disk->disk_name); 2038 disk->disk_name);
2013 2039
2040 if (!disk->ev)
2041 return;
2042
2014 mutex_lock(&disk_events_mutex); 2043 mutex_lock(&disk_events_mutex);
2015 list_add_tail(&disk->ev->node, &disk_events); 2044 list_add_tail(&disk->ev->node, &disk_events);
2016 mutex_unlock(&disk_events_mutex); 2045 mutex_unlock(&disk_events_mutex);
@@ -2024,14 +2053,13 @@ static void disk_add_events(struct gendisk *disk)
2024 2053
2025static void disk_del_events(struct gendisk *disk) 2054static void disk_del_events(struct gendisk *disk)
2026{ 2055{
2027 if (!disk->ev) 2056 if (disk->ev) {
2028 return; 2057 disk_block_events(disk);
2029
2030 disk_block_events(disk);
2031 2058
2032 mutex_lock(&disk_events_mutex); 2059 mutex_lock(&disk_events_mutex);
2033 list_del_init(&disk->ev->node); 2060 list_del_init(&disk->ev->node);
2034 mutex_unlock(&disk_events_mutex); 2061 mutex_unlock(&disk_events_mutex);
2062 }
2035 2063
2036 sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); 2064 sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
2037} 2065}
diff --git a/block/ioctl.c b/block/ioctl.c
index 4825c78a6baa..15a0eb80ada9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1#include <linux/capability.h> 2#include <linux/capability.h>
2#include <linux/blkdev.h> 3#include <linux/blkdev.h>
3#include <linux/export.h> 4#include <linux/export.h>
diff --git a/block/ioprio.c b/block/ioprio.c
index f9821080c92c..2e0559f157c8 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * fs/ioprio.c 3 * fs/ioprio.c
3 * 4 *
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index ec6a04e01bc1..c3b05119cebd 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -1,20 +1,9 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * The Kyber I/O scheduler. Controls latency by throttling queue depths using 3 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
3 * scalable techniques. 4 * scalable techniques.
4 * 5 *
5 * Copyright (C) 2017 Facebook 6 * Copyright (C) 2017 Facebook
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License v2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <https://www.gnu.org/licenses/>.
18 */ 7 */
19 8
20#include <linux/kernel.h> 9#include <linux/kernel.h>
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 14288f864e94..1876f5712bfd 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, 3 * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
3 * for the blk-mq scheduling framework 4 * for the blk-mq scheduling framework
diff --git a/block/opal_proto.h b/block/opal_proto.h
index e20be8258854..d9a05ad02eb5 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -1,18 +1,10 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * Copyright © 2016 Intel Corporation 3 * Copyright © 2016 Intel Corporation
3 * 4 *
4 * Authors: 5 * Authors:
5 * Rafael Antognolli <rafael.antognolli@intel.com> 6 * Rafael Antognolli <rafael.antognolli@intel.com>
6 * Scott Bauer <scott.bauer@intel.com> 7 * Scott Bauer <scott.bauer@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */ 8 */
17#include <linux/types.h> 9#include <linux/types.h>
18 10
@@ -170,6 +162,8 @@ enum opal_token {
170 OPAL_READLOCKED = 0x07, 162 OPAL_READLOCKED = 0x07,
171 OPAL_WRITELOCKED = 0x08, 163 OPAL_WRITELOCKED = 0x08,
172 OPAL_ACTIVEKEY = 0x0A, 164 OPAL_ACTIVEKEY = 0x0A,
165 /* lockingsp table */
166 OPAL_LIFECYCLE = 0x06,
173 /* locking info table */ 167 /* locking info table */
174 OPAL_MAXRANGES = 0x04, 168 OPAL_MAXRANGES = 0x04,
175 /* mbr control */ 169 /* mbr control */
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 8e596a8dff32..aee643ce13d1 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -285,6 +285,13 @@ void delete_partition(struct gendisk *disk, int partno)
285 kobject_put(part->holder_dir); 285 kobject_put(part->holder_dir);
286 device_del(part_to_dev(part)); 286 device_del(part_to_dev(part));
287 287
288 /*
289 * Remove gendisk pointer from idr so that it cannot be looked up
 290	 * during the RCU grace period that runs before the gendisk is freed,
 291	 * preventing use-after-free issues. Note that the device number stays
292 * "in-use" until we really free the gendisk.
293 */
294 blk_invalidate_devt(part_devt(part));
288 hd_struct_kill(part); 295 hd_struct_kill(part);
289} 296}
290 297
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
index fbeb697374d5..7587700fad4a 100644
--- a/block/partitions/acorn.c
+++ b/block/partitions/acorn.c
@@ -1,12 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * linux/fs/partitions/acorn.c
3 *
4 * Copyright (c) 1996-2000 Russell King. 3 * Copyright (c) 1996-2000 Russell King.
5 * 4 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * Scan ADFS partitions on hard disk drives. Unfortunately, there 5 * Scan ADFS partitions on hard disk drives. Unfortunately, there
11 * isn't a standard for partitioning drives on Acorn machines, so 6 * isn't a standard for partitioning drives on Acorn machines, so
12 * every single manufacturer of SCSI and IDE cards created their own 7 * every single manufacturer of SCSI and IDE cards created their own
diff --git a/block/partitions/aix.h b/block/partitions/aix.h
index e0c66a987523..b4449f0b9f2b 100644
--- a/block/partitions/aix.h
+++ b/block/partitions/aix.h
@@ -1 +1,2 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1extern int aix_partition(struct parsed_partitions *state); 2extern int aix_partition(struct parsed_partitions *state);
diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h
index d094585cadaa..7e63f4d9d969 100644
--- a/block/partitions/amiga.h
+++ b/block/partitions/amiga.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * fs/partitions/amiga.h 3 * fs/partitions/amiga.h
3 */ 4 */
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index 39f70d968754..db2fef7dfc47 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/************************************************************ 2/************************************************************
2 * EFI GUID Partition Table handling 3 * EFI GUID Partition Table handling
3 * 4 *
@@ -7,21 +8,6 @@
7 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> 8 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
8 * Copyright 2000,2001,2002,2004 Dell Inc. 9 * Copyright 2000,2001,2002,2004 Dell Inc.
9 * 10 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 *
25 * TODO: 11 * TODO:
26 * 12 *
27 * Changelog: 13 * Changelog:
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
index abd0b19288a6..3e8576157575 100644
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/************************************************************ 2/************************************************************
2 * EFI GUID Partition Table 3 * EFI GUID Partition Table
3 * Per Intel EFI Specification v1.02 4 * Per Intel EFI Specification v1.02
@@ -5,21 +6,6 @@
5 * 6 *
6 * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000 7 * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000
7 * Copyright 2000,2001 Dell Inc. 8 * Copyright 2000,2001 Dell Inc.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 ************************************************************/ 9 ************************************************************/
24 10
25#ifndef FS_PART_EFI_H_INCLUDED 11#ifndef FS_PART_EFI_H_INCLUDED
diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h
index 08fb0804a812..8bf13febb2b6 100644
--- a/block/partitions/ibm.h
+++ b/block/partitions/ibm.h
@@ -1 +1,2 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1int ibm_partition(struct parsed_partitions *); 2int ibm_partition(struct parsed_partitions *);
diff --git a/block/partitions/karma.h b/block/partitions/karma.h
index c764b2e9df21..48e074d417fb 100644
--- a/block/partitions/karma.h
+++ b/block/partitions/karma.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * fs/partitions/karma.h 3 * fs/partitions/karma.h
3 */ 4 */
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index 16766f267559..6db573f33219 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/** 2/**
2 * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) 3 * ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
3 * 4 *
@@ -6,21 +7,6 @@
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 7 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 8 *
8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 9 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 *
10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software
12 * Foundation; either version 2 of the License, or (at your option) any later
13 * version.
14 *
15 * This program is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License along with
21 * this program (in the main directory of the source in the file COPYING); if
22 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
23 * Boston, MA 02111-1307 USA
24 */ 10 */
25 11
26#include <linux/slab.h> 12#include <linux/slab.h>
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index f4c6055df956..1ca63e97bccc 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -1,3 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0-or-later
1/** 2/**
2 * ldm - Part of the Linux-NTFS project. 3 * ldm - Part of the Linux-NTFS project.
3 * 4 *
@@ -6,21 +7,6 @@
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> 7 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 * 8 *
8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads 9 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 2 of the License, or (at your option)
13 * any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program (in the main directory of the Linux-NTFS source
22 * in the file COPYING); if not, write to the Free Software Foundation,
23 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */ 10 */
25 11
26#ifndef _FS_PT_LDM_H_ 12#ifndef _FS_PT_LDM_H_
diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h
index 38c781c490b3..fcacfc486092 100644
--- a/block/partitions/msdos.h
+++ b/block/partitions/msdos.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * fs/partitions/msdos.h 3 * fs/partitions/msdos.h
3 */ 4 */
diff --git a/block/partitions/osf.h b/block/partitions/osf.h
index 20ed2315ec16..4d8088e7ea8c 100644
--- a/block/partitions/osf.h
+++ b/block/partitions/osf.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * fs/partitions/osf.h 3 * fs/partitions/osf.h
3 */ 4 */
diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h
index b9553ebdd5a9..a5b77c3987cf 100644
--- a/block/partitions/sgi.h
+++ b/block/partitions/sgi.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * fs/partitions/sgi.h 3 * fs/partitions/sgi.h
3 */ 4 */
diff --git a/block/partitions/sun.h b/block/partitions/sun.h
index 2424baa8319f..ae1b9eed3fd7 100644
--- a/block/partitions/sun.h
+++ b/block/partitions/sun.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * fs/partitions/sun.h 3 * fs/partitions/sun.h
3 */ 4 */
diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h
index bf2f5ffa97ac..4fb6b8ec78ae 100644
--- a/block/partitions/sysv68.h
+++ b/block/partitions/sysv68.h
@@ -1 +1,2 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1extern int sysv68_partition(struct parsed_partitions *state); 2extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h
index a3cc00b2bded..9f676cead222 100644
--- a/block/partitions/ultrix.h
+++ b/block/partitions/ultrix.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * fs/partitions/ultrix.h 3 * fs/partitions/ultrix.h
3 */ 4 */
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 533f4aee8567..f5e0ad65e86a 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -1,20 +1,6 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 3 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 *
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public Licens
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 *
18 */ 4 */
19#include <linux/kernel.h> 5#include <linux/kernel.h>
20#include <linux/errno.h> 6#include <linux/errno.h>
diff --git a/block/sed-opal.c b/block/sed-opal.c
index e0de4dd448b3..a46e8d13e16d 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1,18 +1,10 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * Copyright © 2016 Intel Corporation 3 * Copyright © 2016 Intel Corporation
3 * 4 *
4 * Authors: 5 * Authors:
5 * Scott Bauer <scott.bauer@intel.com> 6 * Scott Bauer <scott.bauer@intel.com>
6 * Rafael Antognolli <rafael.antognolli@intel.com> 7 * Rafael Antognolli <rafael.antognolli@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */ 8 */
17 9
18#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt 10#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt
@@ -85,7 +77,6 @@ struct opal_dev {
85 void *data; 77 void *data;
86 sec_send_recv *send_recv; 78 sec_send_recv *send_recv;
87 79
88 const struct opal_step *steps;
89 struct mutex dev_lock; 80 struct mutex dev_lock;
90 u16 comid; 81 u16 comid;
91 u32 hsn; 82 u32 hsn;
@@ -157,7 +148,7 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
157 148
158 /* C_PIN_TABLE object ID's */ 149 /* C_PIN_TABLE object ID's */
159 150
160 [OPAL_C_PIN_MSID] = 151 [OPAL_C_PIN_MSID] =
161 { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, 152 { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
162 [OPAL_C_PIN_SID] = 153 [OPAL_C_PIN_SID] =
163 { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, 154 { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01},
@@ -181,7 +172,7 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
181 * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 172 * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
182 * Section: 6.3 Assigned UIDs 173 * Section: 6.3 Assigned UIDs
183 */ 174 */
184static const u8 opalmethod[][OPAL_UID_LENGTH] = { 175static const u8 opalmethod[][OPAL_METHOD_LENGTH] = {
185 [OPAL_PROPERTIES] = 176 [OPAL_PROPERTIES] =
186 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, 177 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 },
187 [OPAL_STARTSESSION] = 178 [OPAL_STARTSESSION] =
@@ -217,6 +208,7 @@ static const u8 opalmethod[][OPAL_UID_LENGTH] = {
217}; 208};
218 209
219static int end_opal_session_error(struct opal_dev *dev); 210static int end_opal_session_error(struct opal_dev *dev);
211static int opal_discovery0_step(struct opal_dev *dev);
220 212
221struct opal_suspend_data { 213struct opal_suspend_data {
222 struct opal_lock_unlock unlk; 214 struct opal_lock_unlock unlk;
@@ -382,37 +374,50 @@ static void check_geometry(struct opal_dev *dev, const void *data)
382 dev->lowest_lba = geo->lowest_aligned_lba; 374 dev->lowest_lba = geo->lowest_aligned_lba;
383} 375}
384 376
385static int next(struct opal_dev *dev) 377static int execute_step(struct opal_dev *dev,
378 const struct opal_step *step, size_t stepIndex)
386{ 379{
387 const struct opal_step *step; 380 int error = step->fn(dev, step->data);
388 int state = 0, error = 0;
389 381
390 do { 382 if (error) {
391 step = &dev->steps[state]; 383 pr_debug("Step %zu (%pS) failed with error %d: %s\n",
392 if (!step->fn) 384 stepIndex, step->fn, error,
393 break; 385 opal_error_to_human(error));
386 }
394 387
395 error = step->fn(dev, step->data); 388 return error;
396 if (error) { 389}
397 pr_debug("Error on step function: %d with error %d: %s\n",
398 state, error,
399 opal_error_to_human(error));
400
401 /* For each OPAL command we do a discovery0 then we
402 * start some sort of session.
403 * If we haven't passed state 1 then there was an error
404 * on discovery0 or during the attempt to start a
405 * session. Therefore we shouldn't attempt to terminate
406 * a session, as one has not yet been created.
407 */
408 if (state > 1) {
409 end_opal_session_error(dev);
410 return error;
411 }
412 390
413 } 391static int execute_steps(struct opal_dev *dev,
414 state++; 392 const struct opal_step *steps, size_t n_steps)
415 } while (!error); 393{
394 size_t state = 0;
395 int error;
396
397 /* first do a discovery0 */
398 error = opal_discovery0_step(dev);
399 if (error)
400 return error;
401
402 for (state = 0; state < n_steps; state++) {
403 error = execute_step(dev, &steps[state], state);
404 if (error)
405 goto out_error;
406 }
407
408 return 0;
409
410out_error:
411 /*
412 * For each OPAL command the first step in steps starts some sort of
413 * session. If an error occurred in the initial discovery0 or if an
414 * error occurred in the first step (and thus stopping the loop with
415 * state == 0) then there was an error before or during the attempt to
416 * start a session. Therefore we shouldn't attempt to terminate a
417 * session, as one has not yet been created.
418 */
419 if (state > 0)
420 end_opal_session_error(dev);
416 421
417 return error; 422 return error;
418} 423}
@@ -510,15 +515,32 @@ static int opal_discovery0(struct opal_dev *dev, void *data)
510 return opal_discovery0_end(dev); 515 return opal_discovery0_end(dev);
511} 516}
512 517
513static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok) 518static int opal_discovery0_step(struct opal_dev *dev)
519{
520 const struct opal_step discovery0_step = {
521 opal_discovery0,
522 };
523 return execute_step(dev, &discovery0_step, 0);
524}
525
526static bool can_add(int *err, struct opal_dev *cmd, size_t len)
514{ 527{
515 if (*err) 528 if (*err)
516 return; 529 return false;
517 if (cmd->pos >= IO_BUFFER_LENGTH - 1) { 530
518 pr_debug("Error adding u8: end of buffer.\n"); 531 if (len > IO_BUFFER_LENGTH || cmd->pos > IO_BUFFER_LENGTH - len) {
532 pr_debug("Error adding %zu bytes: end of buffer.\n", len);
519 *err = -ERANGE; 533 *err = -ERANGE;
520 return; 534 return false;
521 } 535 }
536
537 return true;
538}
539
540static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
541{
542 if (!can_add(err, cmd, 1))
543 return;
522 cmd->cmd[cmd->pos++] = tok; 544 cmd->cmd[cmd->pos++] = tok;
523} 545}
524 546
@@ -551,7 +573,6 @@ static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
551 573
552static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) 574static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
553{ 575{
554
555 size_t len; 576 size_t len;
556 int msb; 577 int msb;
557 578
@@ -563,9 +584,8 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
563 msb = fls64(number); 584 msb = fls64(number);
564 len = DIV_ROUND_UP(msb, 8); 585 len = DIV_ROUND_UP(msb, 8);
565 586
566 if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { 587 if (!can_add(err, cmd, len + 1)) {
567 pr_debug("Error adding u64: end of buffer.\n"); 588 pr_debug("Error adding u64: end of buffer.\n");
568 *err = -ERANGE;
569 return; 589 return;
570 } 590 }
571 add_short_atom_header(cmd, false, false, len); 591 add_short_atom_header(cmd, false, false, len);
@@ -573,24 +593,19 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
573 add_token_u8(err, cmd, number >> (len * 8)); 593 add_token_u8(err, cmd, number >> (len * 8));
574} 594}
575 595
576static void add_token_bytestring(int *err, struct opal_dev *cmd, 596static u8 *add_bytestring_header(int *err, struct opal_dev *cmd, size_t len)
577 const u8 *bytestring, size_t len)
578{ 597{
579 size_t header_len = 1; 598 size_t header_len = 1;
580 bool is_short_atom = true; 599 bool is_short_atom = true;
581 600
582 if (*err)
583 return;
584
585 if (len & ~SHORT_ATOM_LEN_MASK) { 601 if (len & ~SHORT_ATOM_LEN_MASK) {
586 header_len = 2; 602 header_len = 2;
587 is_short_atom = false; 603 is_short_atom = false;
588 } 604 }
589 605
590 if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) { 606 if (!can_add(err, cmd, header_len + len)) {
591 pr_debug("Error adding bytestring: end of buffer.\n"); 607 pr_debug("Error adding bytestring: end of buffer.\n");
592 *err = -ERANGE; 608 return NULL;
593 return;
594 } 609 }
595 610
596 if (is_short_atom) 611 if (is_short_atom)
@@ -598,9 +613,19 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
598 else 613 else
599 add_medium_atom_header(cmd, true, false, len); 614 add_medium_atom_header(cmd, true, false, len);
600 615
601 memcpy(&cmd->cmd[cmd->pos], bytestring, len); 616 return &cmd->cmd[cmd->pos];
602 cmd->pos += len; 617}
618
619static void add_token_bytestring(int *err, struct opal_dev *cmd,
620 const u8 *bytestring, size_t len)
621{
622 u8 *start;
603 623
624 start = add_bytestring_header(err, cmd, len);
625 if (!start)
626 return;
627 memcpy(start, bytestring, len);
628 cmd->pos += len;
604} 629}
605 630
606static int build_locking_range(u8 *buffer, size_t length, u8 lr) 631static int build_locking_range(u8 *buffer, size_t length, u8 lr)
@@ -623,7 +648,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
623static int build_locking_user(u8 *buffer, size_t length, u8 lr) 648static int build_locking_user(u8 *buffer, size_t length, u8 lr)
624{ 649{
625 if (length > OPAL_UID_LENGTH) { 650 if (length > OPAL_UID_LENGTH) {
626 pr_debug("Can't build locking range user, Length OOB\n"); 651 pr_debug("Can't build locking range user. Length OOB\n");
627 return -ERANGE; 652 return -ERANGE;
628 } 653 }
629 654
@@ -649,6 +674,9 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
649 struct opal_header *hdr; 674 struct opal_header *hdr;
650 int err = 0; 675 int err = 0;
651 676
677 /* close the parameter list opened from cmd_start */
678 add_token_u8(&err, cmd, OPAL_ENDLIST);
679
652 add_token_u8(&err, cmd, OPAL_ENDOFDATA); 680 add_token_u8(&err, cmd, OPAL_ENDOFDATA);
653 add_token_u8(&err, cmd, OPAL_STARTLIST); 681 add_token_u8(&err, cmd, OPAL_STARTLIST);
654 add_token_u8(&err, cmd, 0); 682 add_token_u8(&err, cmd, 0);
@@ -687,6 +715,11 @@ static const struct opal_resp_tok *response_get_token(
687{ 715{
688 const struct opal_resp_tok *tok; 716 const struct opal_resp_tok *tok;
689 717
718 if (!resp) {
719 pr_debug("Response is NULL\n");
720 return ERR_PTR(-EINVAL);
721 }
722
690 if (n >= resp->num) { 723 if (n >= resp->num) {
691 pr_debug("Token number doesn't exist: %d, resp: %d\n", 724 pr_debug("Token number doesn't exist: %d, resp: %d\n",
692 n, resp->num); 725 n, resp->num);
@@ -869,27 +902,19 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
869 const char **store) 902 const char **store)
870{ 903{
871 u8 skip; 904 u8 skip;
872 const struct opal_resp_tok *token; 905 const struct opal_resp_tok *tok;
873 906
874 *store = NULL; 907 *store = NULL;
875 if (!resp) { 908 tok = response_get_token(resp, n);
876 pr_debug("Response is NULL\n"); 909 if (IS_ERR(tok))
877 return 0;
878 }
879
880 if (n >= resp->num) {
881 pr_debug("Response has %d tokens. Can't access %d\n",
882 resp->num, n);
883 return 0; 910 return 0;
884 }
885 911
886 token = &resp->toks[n]; 912 if (tok->type != OPAL_DTA_TOKENID_BYTESTRING) {
887 if (token->type != OPAL_DTA_TOKENID_BYTESTRING) {
888 pr_debug("Token is not a byte string!\n"); 913 pr_debug("Token is not a byte string!\n");
889 return 0; 914 return 0;
890 } 915 }
891 916
892 switch (token->width) { 917 switch (tok->width) {
893 case OPAL_WIDTH_TINY: 918 case OPAL_WIDTH_TINY:
894 case OPAL_WIDTH_SHORT: 919 case OPAL_WIDTH_SHORT:
895 skip = 1; 920 skip = 1;
@@ -905,37 +930,29 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
905 return 0; 930 return 0;
906 } 931 }
907 932
908 *store = token->pos + skip; 933 *store = tok->pos + skip;
909 return token->len - skip; 934 return tok->len - skip;
910} 935}
911 936
912static u64 response_get_u64(const struct parsed_resp *resp, int n) 937static u64 response_get_u64(const struct parsed_resp *resp, int n)
913{ 938{
914 if (!resp) { 939 const struct opal_resp_tok *tok;
915 pr_debug("Response is NULL\n");
916 return 0;
917 }
918 940
919 if (n >= resp->num) { 941 tok = response_get_token(resp, n);
920 pr_debug("Response has %d tokens. Can't access %d\n", 942 if (IS_ERR(tok))
921 resp->num, n);
922 return 0; 943 return 0;
923 }
924 944
925 if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) { 945 if (tok->type != OPAL_DTA_TOKENID_UINT) {
926 pr_debug("Token is not unsigned it: %d\n", 946 pr_debug("Token is not unsigned int: %d\n", tok->type);
927 resp->toks[n].type);
928 return 0; 947 return 0;
929 } 948 }
930 949
931 if (!(resp->toks[n].width == OPAL_WIDTH_TINY || 950 if (tok->width != OPAL_WIDTH_TINY && tok->width != OPAL_WIDTH_SHORT) {
932 resp->toks[n].width == OPAL_WIDTH_SHORT)) { 951 pr_debug("Atom is not short or tiny: %d\n", tok->width);
933 pr_debug("Atom is not short or tiny: %d\n",
934 resp->toks[n].width);
935 return 0; 952 return 0;
936 } 953 }
937 954
938 return resp->toks[n].stored.u; 955 return tok->stored.u;
939} 956}
940 957
941static bool response_token_matches(const struct opal_resp_tok *token, u8 match) 958static bool response_token_matches(const struct opal_resp_tok *token, u8 match)
@@ -991,6 +1008,27 @@ static void clear_opal_cmd(struct opal_dev *dev)
991 memset(dev->cmd, 0, IO_BUFFER_LENGTH); 1008 memset(dev->cmd, 0, IO_BUFFER_LENGTH);
992} 1009}
993 1010
1011static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method)
1012{
1013 int err = 0;
1014
1015 clear_opal_cmd(dev);
1016 set_comid(dev, dev->comid);
1017
1018 add_token_u8(&err, dev, OPAL_CALL);
1019 add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
1020 add_token_bytestring(&err, dev, method, OPAL_METHOD_LENGTH);
1021
1022 /*
1023 * Every method call is followed by its parameters enclosed within
1024 * OPAL_STARTLIST and OPAL_ENDLIST tokens. We automatically open the
1025 * parameter list here and close it later in cmd_finalize.
1026 */
1027 add_token_u8(&err, dev, OPAL_STARTLIST);
1028
1029 return err;
1030}
1031
994static int start_opal_session_cont(struct opal_dev *dev) 1032static int start_opal_session_cont(struct opal_dev *dev)
995{ 1033{
996 u32 hsn, tsn; 1034 u32 hsn, tsn;
@@ -1050,24 +1088,47 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
1050 return opal_send_recv(dev, cont); 1088 return opal_send_recv(dev, cont);
1051} 1089}
1052 1090
1091/*
1092 * request @column from table @table on device @dev. On success, the column
1093 * data will be available in dev->resp->tok[4]
1094 */
1095static int generic_get_column(struct opal_dev *dev, const u8 *table,
1096 u64 column)
1097{
1098 int err;
1099
1100 err = cmd_start(dev, table, opalmethod[OPAL_GET]);
1101
1102 add_token_u8(&err, dev, OPAL_STARTLIST);
1103
1104 add_token_u8(&err, dev, OPAL_STARTNAME);
1105 add_token_u8(&err, dev, OPAL_STARTCOLUMN);
1106 add_token_u64(&err, dev, column);
1107 add_token_u8(&err, dev, OPAL_ENDNAME);
1108
1109 add_token_u8(&err, dev, OPAL_STARTNAME);
1110 add_token_u8(&err, dev, OPAL_ENDCOLUMN);
1111 add_token_u64(&err, dev, column);
1112 add_token_u8(&err, dev, OPAL_ENDNAME);
1113
1114 add_token_u8(&err, dev, OPAL_ENDLIST);
1115
1116 if (err)
1117 return err;
1118
1119 return finalize_and_send(dev, parse_and_check_status);
1120}
1121
1053static int gen_key(struct opal_dev *dev, void *data) 1122static int gen_key(struct opal_dev *dev, void *data)
1054{ 1123{
1055 u8 uid[OPAL_UID_LENGTH]; 1124 u8 uid[OPAL_UID_LENGTH];
1056 int err = 0; 1125 int err;
1057
1058 clear_opal_cmd(dev);
1059 set_comid(dev, dev->comid);
1060 1126
1061 memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); 1127 memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
1062 kfree(dev->prev_data); 1128 kfree(dev->prev_data);
1063 dev->prev_data = NULL; 1129 dev->prev_data = NULL;
1064 1130
1065 add_token_u8(&err, dev, OPAL_CALL); 1131 err = cmd_start(dev, uid, opalmethod[OPAL_GENKEY]);
1066 add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
1067 add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY],
1068 OPAL_UID_LENGTH);
1069 add_token_u8(&err, dev, OPAL_STARTLIST);
1070 add_token_u8(&err, dev, OPAL_ENDLIST);
1071 1132
1072 if (err) { 1133 if (err) {
1073 pr_debug("Error building gen key command\n"); 1134 pr_debug("Error building gen key command\n");
@@ -1105,62 +1166,39 @@ static int get_active_key_cont(struct opal_dev *dev)
1105static int get_active_key(struct opal_dev *dev, void *data) 1166static int get_active_key(struct opal_dev *dev, void *data)
1106{ 1167{
1107 u8 uid[OPAL_UID_LENGTH]; 1168 u8 uid[OPAL_UID_LENGTH];
1108 int err = 0; 1169 int err;
1109 u8 *lr = data; 1170 u8 *lr = data;
1110 1171
1111 clear_opal_cmd(dev);
1112 set_comid(dev, dev->comid);
1113
1114 err = build_locking_range(uid, sizeof(uid), *lr); 1172 err = build_locking_range(uid, sizeof(uid), *lr);
1115 if (err) 1173 if (err)
1116 return err; 1174 return err;
1117 1175
1118 err = 0; 1176 err = generic_get_column(dev, uid, OPAL_ACTIVEKEY);
1119 add_token_u8(&err, dev, OPAL_CALL); 1177 if (err)
1120 add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
1121 add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
1122 add_token_u8(&err, dev, OPAL_STARTLIST);
1123 add_token_u8(&err, dev, OPAL_STARTLIST);
1124 add_token_u8(&err, dev, OPAL_STARTNAME);
1125 add_token_u8(&err, dev, 3); /* startCloumn */
1126 add_token_u8(&err, dev, 10); /* ActiveKey */
1127 add_token_u8(&err, dev, OPAL_ENDNAME);
1128 add_token_u8(&err, dev, OPAL_STARTNAME);
1129 add_token_u8(&err, dev, 4); /* endColumn */
1130 add_token_u8(&err, dev, 10); /* ActiveKey */
1131 add_token_u8(&err, dev, OPAL_ENDNAME);
1132 add_token_u8(&err, dev, OPAL_ENDLIST);
1133 add_token_u8(&err, dev, OPAL_ENDLIST);
1134 if (err) {
1135 pr_debug("Error building get active key command\n");
1136 return err; 1178 return err;
1137 }
1138 1179
1139 return finalize_and_send(dev, get_active_key_cont); 1180 return get_active_key_cont(dev);
1140} 1181}
1141 1182
1142static int generic_lr_enable_disable(struct opal_dev *dev, 1183static int generic_lr_enable_disable(struct opal_dev *dev,
1143 u8 *uid, bool rle, bool wle, 1184 u8 *uid, bool rle, bool wle,
1144 bool rl, bool wl) 1185 bool rl, bool wl)
1145{ 1186{
1146 int err = 0; 1187 int err;
1147 1188
1148 add_token_u8(&err, dev, OPAL_CALL); 1189 err = cmd_start(dev, uid, opalmethod[OPAL_SET]);
1149 add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
1150 add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
1151 1190
1152 add_token_u8(&err, dev, OPAL_STARTLIST);
1153 add_token_u8(&err, dev, OPAL_STARTNAME); 1191 add_token_u8(&err, dev, OPAL_STARTNAME);
1154 add_token_u8(&err, dev, OPAL_VALUES); 1192 add_token_u8(&err, dev, OPAL_VALUES);
1155 add_token_u8(&err, dev, OPAL_STARTLIST); 1193 add_token_u8(&err, dev, OPAL_STARTLIST);
1156 1194
1157 add_token_u8(&err, dev, OPAL_STARTNAME); 1195 add_token_u8(&err, dev, OPAL_STARTNAME);
1158 add_token_u8(&err, dev, 5); /* ReadLockEnabled */ 1196 add_token_u8(&err, dev, OPAL_READLOCKENABLED);
1159 add_token_u8(&err, dev, rle); 1197 add_token_u8(&err, dev, rle);
1160 add_token_u8(&err, dev, OPAL_ENDNAME); 1198 add_token_u8(&err, dev, OPAL_ENDNAME);
1161 1199
1162 add_token_u8(&err, dev, OPAL_STARTNAME); 1200 add_token_u8(&err, dev, OPAL_STARTNAME);
1163 add_token_u8(&err, dev, 6); /* WriteLockEnabled */ 1201 add_token_u8(&err, dev, OPAL_WRITELOCKENABLED);
1164 add_token_u8(&err, dev, wle); 1202 add_token_u8(&err, dev, wle);
1165 add_token_u8(&err, dev, OPAL_ENDNAME); 1203 add_token_u8(&err, dev, OPAL_ENDNAME);
1166 1204
@@ -1176,7 +1214,6 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
1176 1214
1177 add_token_u8(&err, dev, OPAL_ENDLIST); 1215 add_token_u8(&err, dev, OPAL_ENDLIST);
1178 add_token_u8(&err, dev, OPAL_ENDNAME); 1216 add_token_u8(&err, dev, OPAL_ENDNAME);
1179 add_token_u8(&err, dev, OPAL_ENDLIST);
1180 return err; 1217 return err;
1181} 1218}
1182 1219
@@ -1197,10 +1234,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
1197 u8 uid[OPAL_UID_LENGTH]; 1234 u8 uid[OPAL_UID_LENGTH];
1198 struct opal_user_lr_setup *setup = data; 1235 struct opal_user_lr_setup *setup = data;
1199 u8 lr; 1236 u8 lr;
1200 int err = 0; 1237 int err;
1201
1202 clear_opal_cmd(dev);
1203 set_comid(dev, dev->comid);
1204 1238
1205 lr = setup->session.opal_key.lr; 1239 lr = setup->session.opal_key.lr;
1206 err = build_locking_range(uid, sizeof(uid), lr); 1240 err = build_locking_range(uid, sizeof(uid), lr);
@@ -1210,40 +1244,34 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
1210 if (lr == 0) 1244 if (lr == 0)
1211 err = enable_global_lr(dev, uid, setup); 1245 err = enable_global_lr(dev, uid, setup);
1212 else { 1246 else {
1213 add_token_u8(&err, dev, OPAL_CALL); 1247 err = cmd_start(dev, uid, opalmethod[OPAL_SET]);
1214 add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
1215 add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
1216 OPAL_UID_LENGTH);
1217 1248
1218 add_token_u8(&err, dev, OPAL_STARTLIST);
1219 add_token_u8(&err, dev, OPAL_STARTNAME); 1249 add_token_u8(&err, dev, OPAL_STARTNAME);
1220 add_token_u8(&err, dev, OPAL_VALUES); 1250 add_token_u8(&err, dev, OPAL_VALUES);
1221 add_token_u8(&err, dev, OPAL_STARTLIST); 1251 add_token_u8(&err, dev, OPAL_STARTLIST);
1222 1252
1223 add_token_u8(&err, dev, OPAL_STARTNAME); 1253 add_token_u8(&err, dev, OPAL_STARTNAME);
1224 add_token_u8(&err, dev, 3); /* Ranges Start */ 1254 add_token_u8(&err, dev, OPAL_RANGESTART);
1225 add_token_u64(&err, dev, setup->range_start); 1255 add_token_u64(&err, dev, setup->range_start);
1226 add_token_u8(&err, dev, OPAL_ENDNAME); 1256 add_token_u8(&err, dev, OPAL_ENDNAME);
1227 1257
1228 add_token_u8(&err, dev, OPAL_STARTNAME); 1258 add_token_u8(&err, dev, OPAL_STARTNAME);
1229 add_token_u8(&err, dev, 4); /* Ranges length */ 1259 add_token_u8(&err, dev, OPAL_RANGELENGTH);
1230 add_token_u64(&err, dev, setup->range_length); 1260 add_token_u64(&err, dev, setup->range_length);
1231 add_token_u8(&err, dev, OPAL_ENDNAME); 1261 add_token_u8(&err, dev, OPAL_ENDNAME);
1232 1262
1233 add_token_u8(&err, dev, OPAL_STARTNAME); 1263 add_token_u8(&err, dev, OPAL_STARTNAME);
1234 add_token_u8(&err, dev, 5); /*ReadLockEnabled */ 1264 add_token_u8(&err, dev, OPAL_READLOCKENABLED);
1235 add_token_u64(&err, dev, !!setup->RLE); 1265 add_token_u64(&err, dev, !!setup->RLE);
1236 add_token_u8(&err, dev, OPAL_ENDNAME); 1266 add_token_u8(&err, dev, OPAL_ENDNAME);
1237 1267
1238 add_token_u8(&err, dev, OPAL_STARTNAME); 1268 add_token_u8(&err, dev, OPAL_STARTNAME);
1239 add_token_u8(&err, dev, 6); /*WriteLockEnabled*/ 1269 add_token_u8(&err, dev, OPAL_WRITELOCKENABLED);
1240 add_token_u64(&err, dev, !!setup->WLE); 1270 add_token_u64(&err, dev, !!setup->WLE);
1241 add_token_u8(&err, dev, OPAL_ENDNAME); 1271 add_token_u8(&err, dev, OPAL_ENDNAME);
1242 1272
1243 add_token_u8(&err, dev, OPAL_ENDLIST); 1273 add_token_u8(&err, dev, OPAL_ENDLIST);
1244 add_token_u8(&err, dev, OPAL_ENDNAME); 1274 add_token_u8(&err, dev, OPAL_ENDNAME);
1245 add_token_u8(&err, dev, OPAL_ENDLIST);
1246
1247 } 1275 }
1248 if (err) { 1276 if (err) {
1249 pr_debug("Error building Setup Locking range command.\n"); 1277 pr_debug("Error building Setup Locking range command.\n");
@@ -1261,29 +1289,21 @@ static int start_generic_opal_session(struct opal_dev *dev,
1261 u8 key_len) 1289 u8 key_len)
1262{ 1290{
1263 u32 hsn; 1291 u32 hsn;
1264 int err = 0; 1292 int err;
1265 1293
1266 if (key == NULL && auth != OPAL_ANYBODY_UID) 1294 if (key == NULL && auth != OPAL_ANYBODY_UID)
1267 return OPAL_INVAL_PARAM; 1295 return OPAL_INVAL_PARAM;
1268 1296
1269 clear_opal_cmd(dev);
1270
1271 set_comid(dev, dev->comid);
1272 hsn = GENERIC_HOST_SESSION_NUM; 1297 hsn = GENERIC_HOST_SESSION_NUM;
1298 err = cmd_start(dev, opaluid[OPAL_SMUID_UID],
1299 opalmethod[OPAL_STARTSESSION]);
1273 1300
1274 add_token_u8(&err, dev, OPAL_CALL);
1275 add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID],
1276 OPAL_UID_LENGTH);
1277 add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION],
1278 OPAL_UID_LENGTH);
1279 add_token_u8(&err, dev, OPAL_STARTLIST);
1280 add_token_u64(&err, dev, hsn); 1301 add_token_u64(&err, dev, hsn);
1281 add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH); 1302 add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH);
1282 add_token_u8(&err, dev, 1); 1303 add_token_u8(&err, dev, 1);
1283 1304
1284 switch (auth) { 1305 switch (auth) {
1285 case OPAL_ANYBODY_UID: 1306 case OPAL_ANYBODY_UID:
1286 add_token_u8(&err, dev, OPAL_ENDLIST);
1287 break; 1307 break;
1288 case OPAL_ADMIN1_UID: 1308 case OPAL_ADMIN1_UID:
1289 case OPAL_SID_UID: 1309 case OPAL_SID_UID:
@@ -1296,7 +1316,6 @@ static int start_generic_opal_session(struct opal_dev *dev,
1296 add_token_bytestring(&err, dev, opaluid[auth], 1316 add_token_bytestring(&err, dev, opaluid[auth],
1297 OPAL_UID_LENGTH); 1317 OPAL_UID_LENGTH);
1298 add_token_u8(&err, dev, OPAL_ENDNAME); 1318 add_token_u8(&err, dev, OPAL_ENDNAME);
1299 add_token_u8(&err, dev, OPAL_ENDLIST);
1300 break; 1319 break;
1301 default: 1320 default:
1302 pr_debug("Cannot start Admin SP session with auth %d\n", auth); 1321 pr_debug("Cannot start Admin SP session with auth %d\n", auth);
@@ -1324,6 +1343,7 @@ static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
1324 1343
1325 if (!key) { 1344 if (!key) {
1326 const struct opal_key *okey = data; 1345 const struct opal_key *okey = data;
1346
1327 ret = start_generic_opal_session(dev, OPAL_SID_UID, 1347 ret = start_generic_opal_session(dev, OPAL_SID_UID,
1328 OPAL_ADMINSP_UID, 1348 OPAL_ADMINSP_UID,
1329 okey->key, 1349 okey->key,
@@ -1341,6 +1361,7 @@ static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
1341static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data) 1361static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data)
1342{ 1362{
1343 struct opal_key *key = data; 1363 struct opal_key *key = data;
1364
1344 return start_generic_opal_session(dev, OPAL_ADMIN1_UID, 1365 return start_generic_opal_session(dev, OPAL_ADMIN1_UID,
1345 OPAL_LOCKINGSP_UID, 1366 OPAL_LOCKINGSP_UID,
1346 key->key, key->key_len); 1367 key->key, key->key_len);
@@ -1356,30 +1377,21 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data)
1356 u8 *key = session->opal_key.key; 1377 u8 *key = session->opal_key.key;
1357 u32 hsn = GENERIC_HOST_SESSION_NUM; 1378 u32 hsn = GENERIC_HOST_SESSION_NUM;
1358 1379
1359 clear_opal_cmd(dev); 1380 if (session->sum)
1360 set_comid(dev, dev->comid);
1361
1362 if (session->sum) {
1363 err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), 1381 err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
1364 session->opal_key.lr); 1382 session->opal_key.lr);
1365 if (err) 1383 else if (session->who != OPAL_ADMIN1 && !session->sum)
1366 return err;
1367
1368 } else if (session->who != OPAL_ADMIN1 && !session->sum) {
1369 err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), 1384 err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
1370 session->who - 1); 1385 session->who - 1);
1371 if (err) 1386 else
1372 return err;
1373 } else
1374 memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH); 1387 memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH);
1375 1388
1376 add_token_u8(&err, dev, OPAL_CALL); 1389 if (err)
1377 add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID], 1390 return err;
1378 OPAL_UID_LENGTH); 1391
1379 add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION], 1392 err = cmd_start(dev, opaluid[OPAL_SMUID_UID],
1380 OPAL_UID_LENGTH); 1393 opalmethod[OPAL_STARTSESSION]);
1381 1394
1382 add_token_u8(&err, dev, OPAL_STARTLIST);
1383 add_token_u64(&err, dev, hsn); 1395 add_token_u64(&err, dev, hsn);
1384 add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], 1396 add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
1385 OPAL_UID_LENGTH); 1397 OPAL_UID_LENGTH);
@@ -1392,7 +1404,6 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data)
1392 add_token_u8(&err, dev, 3); 1404 add_token_u8(&err, dev, 3);
1393 add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH); 1405 add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH);
1394 add_token_u8(&err, dev, OPAL_ENDNAME); 1406 add_token_u8(&err, dev, OPAL_ENDNAME);
1395 add_token_u8(&err, dev, OPAL_ENDLIST);
1396 1407
1397 if (err) { 1408 if (err) {
1398 pr_debug("Error building STARTSESSION command.\n"); 1409 pr_debug("Error building STARTSESSION command.\n");
@@ -1404,18 +1415,10 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data)
1404 1415
1405static int revert_tper(struct opal_dev *dev, void *data) 1416static int revert_tper(struct opal_dev *dev, void *data)
1406{ 1417{
1407 int err = 0; 1418 int err;
1408
1409 clear_opal_cmd(dev);
1410 set_comid(dev, dev->comid);
1411 1419
1412 add_token_u8(&err, dev, OPAL_CALL); 1420 err = cmd_start(dev, opaluid[OPAL_ADMINSP_UID],
1413 add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID], 1421 opalmethod[OPAL_REVERT]);
1414 OPAL_UID_LENGTH);
1415 add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT],
1416 OPAL_UID_LENGTH);
1417 add_token_u8(&err, dev, OPAL_STARTLIST);
1418 add_token_u8(&err, dev, OPAL_ENDLIST);
1419 if (err) { 1422 if (err) {
1420 pr_debug("Error building REVERT TPER command.\n"); 1423 pr_debug("Error building REVERT TPER command.\n");
1421 return err; 1424 return err;
@@ -1428,18 +1431,12 @@ static int internal_activate_user(struct opal_dev *dev, void *data)
1428{ 1431{
1429 struct opal_session_info *session = data; 1432 struct opal_session_info *session = data;
1430 u8 uid[OPAL_UID_LENGTH]; 1433 u8 uid[OPAL_UID_LENGTH];
1431 int err = 0; 1434 int err;
1432
1433 clear_opal_cmd(dev);
1434 set_comid(dev, dev->comid);
1435 1435
1436 memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); 1436 memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
1437 uid[7] = session->who; 1437 uid[7] = session->who;
1438 1438
1439 add_token_u8(&err, dev, OPAL_CALL); 1439 err = cmd_start(dev, uid, opalmethod[OPAL_SET]);
1440 add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
1441 add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
1442 add_token_u8(&err, dev, OPAL_STARTLIST);
1443 add_token_u8(&err, dev, OPAL_STARTNAME); 1440 add_token_u8(&err, dev, OPAL_STARTNAME);
1444 add_token_u8(&err, dev, OPAL_VALUES); 1441 add_token_u8(&err, dev, OPAL_VALUES);
1445 add_token_u8(&err, dev, OPAL_STARTLIST); 1442 add_token_u8(&err, dev, OPAL_STARTLIST);
@@ -1449,7 +1446,6 @@ static int internal_activate_user(struct opal_dev *dev, void *data)
1449 add_token_u8(&err, dev, OPAL_ENDNAME); 1446 add_token_u8(&err, dev, OPAL_ENDNAME);
1450 add_token_u8(&err, dev, OPAL_ENDLIST); 1447 add_token_u8(&err, dev, OPAL_ENDLIST);
1451 add_token_u8(&err, dev, OPAL_ENDNAME); 1448 add_token_u8(&err, dev, OPAL_ENDNAME);
1452 add_token_u8(&err, dev, OPAL_ENDLIST);
1453 1449
1454 if (err) { 1450 if (err) {
1455 pr_debug("Error building Activate UserN command.\n"); 1451 pr_debug("Error building Activate UserN command.\n");
@@ -1463,20 +1459,12 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
1463{ 1459{
1464 struct opal_session_info *session = data; 1460 struct opal_session_info *session = data;
1465 u8 uid[OPAL_UID_LENGTH]; 1461 u8 uid[OPAL_UID_LENGTH];
1466 int err = 0; 1462 int err;
1467
1468 clear_opal_cmd(dev);
1469 set_comid(dev, dev->comid);
1470 1463
1471 if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0) 1464 if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0)
1472 return -ERANGE; 1465 return -ERANGE;
1473 1466
1474 add_token_u8(&err, dev, OPAL_CALL); 1467 err = cmd_start(dev, uid, opalmethod[OPAL_ERASE]);
1475 add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
1476 add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE],
1477 OPAL_UID_LENGTH);
1478 add_token_u8(&err, dev, OPAL_STARTLIST);
1479 add_token_u8(&err, dev, OPAL_ENDLIST);
1480 1468
1481 if (err) { 1469 if (err) {
1482 pr_debug("Error building Erase Locking Range Command.\n"); 1470 pr_debug("Error building Erase Locking Range Command.\n");
@@ -1488,26 +1476,20 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
1488static int set_mbr_done(struct opal_dev *dev, void *data) 1476static int set_mbr_done(struct opal_dev *dev, void *data)
1489{ 1477{
1490 u8 *mbr_done_tf = data; 1478 u8 *mbr_done_tf = data;
1491 int err = 0; 1479 int err;
1492 1480
1493 clear_opal_cmd(dev); 1481 err = cmd_start(dev, opaluid[OPAL_MBRCONTROL],
1494 set_comid(dev, dev->comid); 1482 opalmethod[OPAL_SET]);
1495 1483
1496 add_token_u8(&err, dev, OPAL_CALL);
1497 add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
1498 OPAL_UID_LENGTH);
1499 add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
1500 add_token_u8(&err, dev, OPAL_STARTLIST);
1501 add_token_u8(&err, dev, OPAL_STARTNAME); 1484 add_token_u8(&err, dev, OPAL_STARTNAME);
1502 add_token_u8(&err, dev, OPAL_VALUES); 1485 add_token_u8(&err, dev, OPAL_VALUES);
1503 add_token_u8(&err, dev, OPAL_STARTLIST); 1486 add_token_u8(&err, dev, OPAL_STARTLIST);
1504 add_token_u8(&err, dev, OPAL_STARTNAME); 1487 add_token_u8(&err, dev, OPAL_STARTNAME);
1505 add_token_u8(&err, dev, 2); /* Done */ 1488 add_token_u8(&err, dev, OPAL_MBRDONE);
1506 add_token_u8(&err, dev, *mbr_done_tf); /* Done T or F */ 1489 add_token_u8(&err, dev, *mbr_done_tf); /* Done T or F */
1507 add_token_u8(&err, dev, OPAL_ENDNAME); 1490 add_token_u8(&err, dev, OPAL_ENDNAME);
1508 add_token_u8(&err, dev, OPAL_ENDLIST); 1491 add_token_u8(&err, dev, OPAL_ENDLIST);
1509 add_token_u8(&err, dev, OPAL_ENDNAME); 1492 add_token_u8(&err, dev, OPAL_ENDNAME);
1510 add_token_u8(&err, dev, OPAL_ENDLIST);
1511 1493
1512 if (err) { 1494 if (err) {
1513 pr_debug("Error Building set MBR Done command\n"); 1495 pr_debug("Error Building set MBR Done command\n");
@@ -1520,26 +1502,20 @@ static int set_mbr_done(struct opal_dev *dev, void *data)
1520static int set_mbr_enable_disable(struct opal_dev *dev, void *data) 1502static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
1521{ 1503{
1522 u8 *mbr_en_dis = data; 1504 u8 *mbr_en_dis = data;
1523 int err = 0; 1505 int err;
1524 1506
1525 clear_opal_cmd(dev); 1507 err = cmd_start(dev, opaluid[OPAL_MBRCONTROL],
1526 set_comid(dev, dev->comid); 1508 opalmethod[OPAL_SET]);
1527 1509
1528 add_token_u8(&err, dev, OPAL_CALL);
1529 add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
1530 OPAL_UID_LENGTH);
1531 add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
1532 add_token_u8(&err, dev, OPAL_STARTLIST);
1533 add_token_u8(&err, dev, OPAL_STARTNAME); 1510 add_token_u8(&err, dev, OPAL_STARTNAME);
1534 add_token_u8(&err, dev, OPAL_VALUES); 1511 add_token_u8(&err, dev, OPAL_VALUES);
1535 add_token_u8(&err, dev, OPAL_STARTLIST); 1512 add_token_u8(&err, dev, OPAL_STARTLIST);
1536 add_token_u8(&err, dev, OPAL_STARTNAME); 1513 add_token_u8(&err, dev, OPAL_STARTNAME);
1537 add_token_u8(&err, dev, 1); 1514 add_token_u8(&err, dev, OPAL_MBRENABLE);
1538 add_token_u8(&err, dev, *mbr_en_dis); 1515 add_token_u8(&err, dev, *mbr_en_dis);
1539 add_token_u8(&err, dev, OPAL_ENDNAME); 1516 add_token_u8(&err, dev, OPAL_ENDNAME);
1540 add_token_u8(&err, dev, OPAL_ENDLIST); 1517 add_token_u8(&err, dev, OPAL_ENDLIST);
1541 add_token_u8(&err, dev, OPAL_ENDNAME); 1518 add_token_u8(&err, dev, OPAL_ENDNAME);
1542 add_token_u8(&err, dev, OPAL_ENDLIST);
1543 1519
1544 if (err) { 1520 if (err) {
1545 pr_debug("Error Building set MBR done command\n"); 1521 pr_debug("Error Building set MBR done command\n");
@@ -1552,26 +1528,19 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
1552static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, 1528static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
1553 struct opal_dev *dev) 1529 struct opal_dev *dev)
1554{ 1530{
1555 int err = 0; 1531 int err;
1556 1532
1557 clear_opal_cmd(dev); 1533 err = cmd_start(dev, cpin_uid, opalmethod[OPAL_SET]);
1558 set_comid(dev, dev->comid);
1559 1534
1560 add_token_u8(&err, dev, OPAL_CALL);
1561 add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH);
1562 add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
1563 OPAL_UID_LENGTH);
1564 add_token_u8(&err, dev, OPAL_STARTLIST);
1565 add_token_u8(&err, dev, OPAL_STARTNAME); 1535 add_token_u8(&err, dev, OPAL_STARTNAME);
1566 add_token_u8(&err, dev, OPAL_VALUES); 1536 add_token_u8(&err, dev, OPAL_VALUES);
1567 add_token_u8(&err, dev, OPAL_STARTLIST); 1537 add_token_u8(&err, dev, OPAL_STARTLIST);
1568 add_token_u8(&err, dev, OPAL_STARTNAME); 1538 add_token_u8(&err, dev, OPAL_STARTNAME);
1569 add_token_u8(&err, dev, 3); /* PIN */ 1539 add_token_u8(&err, dev, OPAL_PIN);
1570 add_token_bytestring(&err, dev, key, key_len); 1540 add_token_bytestring(&err, dev, key, key_len);
1571 add_token_u8(&err, dev, OPAL_ENDNAME); 1541 add_token_u8(&err, dev, OPAL_ENDNAME);
1572 add_token_u8(&err, dev, OPAL_ENDLIST); 1542 add_token_u8(&err, dev, OPAL_ENDLIST);
1573 add_token_u8(&err, dev, OPAL_ENDNAME); 1543 add_token_u8(&err, dev, OPAL_ENDNAME);
1574 add_token_u8(&err, dev, OPAL_ENDLIST);
1575 1544
1576 return err; 1545 return err;
1577} 1546}
@@ -1619,10 +1588,7 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
1619 u8 lr_buffer[OPAL_UID_LENGTH]; 1588 u8 lr_buffer[OPAL_UID_LENGTH];
1620 u8 user_uid[OPAL_UID_LENGTH]; 1589 u8 user_uid[OPAL_UID_LENGTH];
1621 struct opal_lock_unlock *lkul = data; 1590 struct opal_lock_unlock *lkul = data;
1622 int err = 0; 1591 int err;
1623
1624 clear_opal_cmd(dev);
1625 set_comid(dev, dev->comid);
1626 1592
1627 memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], 1593 memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED],
1628 OPAL_UID_LENGTH); 1594 OPAL_UID_LENGTH);
@@ -1637,12 +1603,8 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
1637 1603
1638 user_uid[7] = lkul->session.who; 1604 user_uid[7] = lkul->session.who;
1639 1605
1640 add_token_u8(&err, dev, OPAL_CALL); 1606 err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]);
1641 add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH);
1642 add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
1643 OPAL_UID_LENGTH);
1644 1607
1645 add_token_u8(&err, dev, OPAL_STARTLIST);
1646 add_token_u8(&err, dev, OPAL_STARTNAME); 1608 add_token_u8(&err, dev, OPAL_STARTNAME);
1647 add_token_u8(&err, dev, OPAL_VALUES); 1609 add_token_u8(&err, dev, OPAL_VALUES);
1648 1610
@@ -1680,7 +1642,6 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
1680 add_token_u8(&err, dev, OPAL_ENDNAME); 1642 add_token_u8(&err, dev, OPAL_ENDNAME);
1681 add_token_u8(&err, dev, OPAL_ENDLIST); 1643 add_token_u8(&err, dev, OPAL_ENDLIST);
1682 add_token_u8(&err, dev, OPAL_ENDNAME); 1644 add_token_u8(&err, dev, OPAL_ENDNAME);
1683 add_token_u8(&err, dev, OPAL_ENDLIST);
1684 1645
1685 if (err) { 1646 if (err) {
1686 pr_debug("Error building add user to locking range command.\n"); 1647 pr_debug("Error building add user to locking range command.\n");
@@ -1697,9 +1658,6 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
1697 u8 read_locked = 1, write_locked = 1; 1658 u8 read_locked = 1, write_locked = 1;
1698 int err = 0; 1659 int err = 0;
1699 1660
1700 clear_opal_cmd(dev);
1701 set_comid(dev, dev->comid);
1702
1703 if (build_locking_range(lr_buffer, sizeof(lr_buffer), 1661 if (build_locking_range(lr_buffer, sizeof(lr_buffer),
1704 lkul->session.opal_key.lr) < 0) 1662 lkul->session.opal_key.lr) < 0)
1705 return -ERANGE; 1663 return -ERANGE;
@@ -1714,17 +1672,15 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
1714 write_locked = 0; 1672 write_locked = 0;
1715 break; 1673 break;
1716 case OPAL_LK: 1674 case OPAL_LK:
1717 /* vars are initalized to locked */ 1675 /* vars are initialized to locked */
1718 break; 1676 break;
1719 default: 1677 default:
1720 pr_debug("Tried to set an invalid locking state... returning to uland\n"); 1678 pr_debug("Tried to set an invalid locking state... returning to uland\n");
1721 return OPAL_INVAL_PARAM; 1679 return OPAL_INVAL_PARAM;
1722 } 1680 }
1723 1681
1724 add_token_u8(&err, dev, OPAL_CALL); 1682 err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]);
1725 add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH); 1683
1726 add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
1727 add_token_u8(&err, dev, OPAL_STARTLIST);
1728 add_token_u8(&err, dev, OPAL_STARTNAME); 1684 add_token_u8(&err, dev, OPAL_STARTNAME);
1729 add_token_u8(&err, dev, OPAL_VALUES); 1685 add_token_u8(&err, dev, OPAL_VALUES);
1730 add_token_u8(&err, dev, OPAL_STARTLIST); 1686 add_token_u8(&err, dev, OPAL_STARTLIST);
@@ -1741,7 +1697,6 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
1741 1697
1742 add_token_u8(&err, dev, OPAL_ENDLIST); 1698 add_token_u8(&err, dev, OPAL_ENDLIST);
1743 add_token_u8(&err, dev, OPAL_ENDNAME); 1699 add_token_u8(&err, dev, OPAL_ENDNAME);
1744 add_token_u8(&err, dev, OPAL_ENDLIST);
1745 1700
1746 if (err) { 1701 if (err) {
1747 pr_debug("Error building SET command.\n"); 1702 pr_debug("Error building SET command.\n");
@@ -1775,7 +1730,7 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
1775 write_locked = 0; 1730 write_locked = 0;
1776 break; 1731 break;
1777 case OPAL_LK: 1732 case OPAL_LK:
1778 /* vars are initalized to locked */ 1733 /* vars are initialized to locked */
1779 break; 1734 break;
1780 default: 1735 default:
1781 pr_debug("Tried to set an invalid locking state.\n"); 1736 pr_debug("Tried to set an invalid locking state.\n");
@@ -1796,17 +1751,10 @@ static int activate_lsp(struct opal_dev *dev, void *data)
1796 struct opal_lr_act *opal_act = data; 1751 struct opal_lr_act *opal_act = data;
1797 u8 user_lr[OPAL_UID_LENGTH]; 1752 u8 user_lr[OPAL_UID_LENGTH];
1798 u8 uint_3 = 0x83; 1753 u8 uint_3 = 0x83;
1799 int err = 0, i; 1754 int err, i;
1800
1801 clear_opal_cmd(dev);
1802 set_comid(dev, dev->comid);
1803
1804 add_token_u8(&err, dev, OPAL_CALL);
1805 add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
1806 OPAL_UID_LENGTH);
1807 add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE],
1808 OPAL_UID_LENGTH);
1809 1755
1756 err = cmd_start(dev, opaluid[OPAL_LOCKINGSP_UID],
1757 opalmethod[OPAL_ACTIVATE]);
1810 1758
1811 if (opal_act->sum) { 1759 if (opal_act->sum) {
1812 err = build_locking_range(user_lr, sizeof(user_lr), 1760 err = build_locking_range(user_lr, sizeof(user_lr),
@@ -1814,7 +1762,6 @@ static int activate_lsp(struct opal_dev *dev, void *data)
1814 if (err) 1762 if (err)
1815 return err; 1763 return err;
1816 1764
1817 add_token_u8(&err, dev, OPAL_STARTLIST);
1818 add_token_u8(&err, dev, OPAL_STARTNAME); 1765 add_token_u8(&err, dev, OPAL_STARTNAME);
1819 add_token_u8(&err, dev, uint_3); 1766 add_token_u8(&err, dev, uint_3);
1820 add_token_u8(&err, dev, 6); 1767 add_token_u8(&err, dev, 6);
@@ -1829,11 +1776,6 @@ static int activate_lsp(struct opal_dev *dev, void *data)
1829 } 1776 }
1830 add_token_u8(&err, dev, OPAL_ENDLIST); 1777 add_token_u8(&err, dev, OPAL_ENDLIST);
1831 add_token_u8(&err, dev, OPAL_ENDNAME); 1778 add_token_u8(&err, dev, OPAL_ENDNAME);
1832 add_token_u8(&err, dev, OPAL_ENDLIST);
1833
1834 } else {
1835 add_token_u8(&err, dev, OPAL_STARTLIST);
1836 add_token_u8(&err, dev, OPAL_ENDLIST);
1837 } 1779 }
1838 1780
1839 if (err) { 1781 if (err) {
@@ -1844,17 +1786,19 @@ static int activate_lsp(struct opal_dev *dev, void *data)
1844 return finalize_and_send(dev, parse_and_check_status); 1786 return finalize_and_send(dev, parse_and_check_status);
1845} 1787}
1846 1788
1847static int get_lsp_lifecycle_cont(struct opal_dev *dev) 1789/* Determine if we're in the Manufactured Inactive or Active state */
1790static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
1848{ 1791{
1849 u8 lc_status; 1792 u8 lc_status;
1850 int error = 0; 1793 int err;
1851 1794
1852 error = parse_and_check_status(dev); 1795 err = generic_get_column(dev, opaluid[OPAL_LOCKINGSP_UID],
1853 if (error) 1796 OPAL_LIFECYCLE);
1854 return error; 1797 if (err)
1798 return err;
1855 1799
1856 lc_status = response_get_u64(&dev->parsed, 4); 1800 lc_status = response_get_u64(&dev->parsed, 4);
1857 /* 0x08 is Manufacured Inactive */ 1801 /* 0x08 is Manufactured Inactive */
1858 /* 0x09 is Manufactured */ 1802 /* 0x09 is Manufactured */
1859 if (lc_status != OPAL_MANUFACTURED_INACTIVE) { 1803 if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
1860 pr_debug("Couldn't determine the status of the Lifecycle state\n"); 1804 pr_debug("Couldn't determine the status of the Lifecycle state\n");
@@ -1864,56 +1808,19 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
1864 return 0; 1808 return 0;
1865} 1809}
1866 1810
1867/* Determine if we're in the Manufactured Inactive or Active state */ 1811static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
1868static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
1869{
1870 int err = 0;
1871
1872 clear_opal_cmd(dev);
1873 set_comid(dev, dev->comid);
1874
1875 add_token_u8(&err, dev, OPAL_CALL);
1876 add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
1877 OPAL_UID_LENGTH);
1878 add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
1879
1880 add_token_u8(&err, dev, OPAL_STARTLIST);
1881 add_token_u8(&err, dev, OPAL_STARTLIST);
1882
1883 add_token_u8(&err, dev, OPAL_STARTNAME);
1884 add_token_u8(&err, dev, 3); /* Start Column */
1885 add_token_u8(&err, dev, 6); /* Lifecycle Column */
1886 add_token_u8(&err, dev, OPAL_ENDNAME);
1887
1888 add_token_u8(&err, dev, OPAL_STARTNAME);
1889 add_token_u8(&err, dev, 4); /* End Column */
1890 add_token_u8(&err, dev, 6); /* Lifecycle Column */
1891 add_token_u8(&err, dev, OPAL_ENDNAME);
1892
1893 add_token_u8(&err, dev, OPAL_ENDLIST);
1894 add_token_u8(&err, dev, OPAL_ENDLIST);
1895
1896 if (err) {
1897 pr_debug("Error Building GET Lifecycle Status command\n");
1898 return err;
1899 }
1900
1901 return finalize_and_send(dev, get_lsp_lifecycle_cont);
1902}
1903
1904static int get_msid_cpin_pin_cont(struct opal_dev *dev)
1905{ 1812{
1906 const char *msid_pin; 1813 const char *msid_pin;
1907 size_t strlen; 1814 size_t strlen;
1908 int error = 0; 1815 int err;
1909 1816
1910 error = parse_and_check_status(dev); 1817 err = generic_get_column(dev, opaluid[OPAL_C_PIN_MSID], OPAL_PIN);
1911 if (error) 1818 if (err)
1912 return error; 1819 return err;
1913 1820
1914 strlen = response_get_string(&dev->parsed, 4, &msid_pin); 1821 strlen = response_get_string(&dev->parsed, 4, &msid_pin);
1915 if (!msid_pin) { 1822 if (!msid_pin) {
1916 pr_debug("%s: Couldn't extract PIN from response\n", __func__); 1823 pr_debug("Couldn't extract MSID_CPIN from response\n");
1917 return OPAL_INVAL_PARAM; 1824 return OPAL_INVAL_PARAM;
1918 } 1825 }
1919 1826
@@ -1926,42 +1833,6 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev)
1926 return 0; 1833 return 0;
1927} 1834}
1928 1835
1929static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
1930{
1931 int err = 0;
1932
1933 clear_opal_cmd(dev);
1934 set_comid(dev, dev->comid);
1935
1936 add_token_u8(&err, dev, OPAL_CALL);
1937 add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID],
1938 OPAL_UID_LENGTH);
1939 add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
1940
1941 add_token_u8(&err, dev, OPAL_STARTLIST);
1942 add_token_u8(&err, dev, OPAL_STARTLIST);
1943
1944 add_token_u8(&err, dev, OPAL_STARTNAME);
1945 add_token_u8(&err, dev, 3); /* Start Column */
1946 add_token_u8(&err, dev, 3); /* PIN */
1947 add_token_u8(&err, dev, OPAL_ENDNAME);
1948
1949 add_token_u8(&err, dev, OPAL_STARTNAME);
1950 add_token_u8(&err, dev, 4); /* End Column */
1951 add_token_u8(&err, dev, 3); /* Lifecycle Column */
1952 add_token_u8(&err, dev, OPAL_ENDNAME);
1953
1954 add_token_u8(&err, dev, OPAL_ENDLIST);
1955 add_token_u8(&err, dev, OPAL_ENDLIST);
1956
1957 if (err) {
1958 pr_debug("Error building Get MSID CPIN PIN command.\n");
1959 return err;
1960 }
1961
1962 return finalize_and_send(dev, get_msid_cpin_pin_cont);
1963}
1964
1965static int end_opal_session(struct opal_dev *dev, void *data) 1836static int end_opal_session(struct opal_dev *dev, void *data)
1966{ 1837{
1967 int err = 0; 1838 int err = 0;
@@ -1977,18 +1848,14 @@ static int end_opal_session(struct opal_dev *dev, void *data)
1977 1848
1978static int end_opal_session_error(struct opal_dev *dev) 1849static int end_opal_session_error(struct opal_dev *dev)
1979{ 1850{
1980 const struct opal_step error_end_session[] = { 1851 const struct opal_step error_end_session = {
1981 { end_opal_session, }, 1852 end_opal_session,
1982 { NULL, }
1983 }; 1853 };
1984 dev->steps = error_end_session; 1854 return execute_step(dev, &error_end_session, 0);
1985 return next(dev);
1986} 1855}
1987 1856
1988static inline void setup_opal_dev(struct opal_dev *dev, 1857static inline void setup_opal_dev(struct opal_dev *dev)
1989 const struct opal_step *steps)
1990{ 1858{
1991 dev->steps = steps;
1992 dev->tsn = 0; 1859 dev->tsn = 0;
1993 dev->hsn = 0; 1860 dev->hsn = 0;
1994 dev->prev_data = NULL; 1861 dev->prev_data = NULL;
@@ -1996,15 +1863,11 @@ static inline void setup_opal_dev(struct opal_dev *dev,
1996 1863
1997static int check_opal_support(struct opal_dev *dev) 1864static int check_opal_support(struct opal_dev *dev)
1998{ 1865{
1999 const struct opal_step steps[] = {
2000 { opal_discovery0, },
2001 { NULL, }
2002 };
2003 int ret; 1866 int ret;
2004 1867
2005 mutex_lock(&dev->dev_lock); 1868 mutex_lock(&dev->dev_lock);
2006 setup_opal_dev(dev, steps); 1869 setup_opal_dev(dev);
2007 ret = next(dev); 1870 ret = opal_discovery0_step(dev);
2008 dev->supported = !ret; 1871 dev->supported = !ret;
2009 mutex_unlock(&dev->dev_lock); 1872 mutex_unlock(&dev->dev_lock);
2010 return ret; 1873 return ret;
@@ -2057,18 +1920,16 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
2057 struct opal_session_info *opal_session) 1920 struct opal_session_info *opal_session)
2058{ 1921{
2059 const struct opal_step erase_steps[] = { 1922 const struct opal_step erase_steps[] = {
2060 { opal_discovery0, },
2061 { start_auth_opal_session, opal_session }, 1923 { start_auth_opal_session, opal_session },
2062 { get_active_key, &opal_session->opal_key.lr }, 1924 { get_active_key, &opal_session->opal_key.lr },
2063 { gen_key, }, 1925 { gen_key, },
2064 { end_opal_session, }, 1926 { end_opal_session, }
2065 { NULL, }
2066 }; 1927 };
2067 int ret; 1928 int ret;
2068 1929
2069 mutex_lock(&dev->dev_lock); 1930 mutex_lock(&dev->dev_lock);
2070 setup_opal_dev(dev, erase_steps); 1931 setup_opal_dev(dev);
2071 ret = next(dev); 1932 ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
2072 mutex_unlock(&dev->dev_lock); 1933 mutex_unlock(&dev->dev_lock);
2073 return ret; 1934 return ret;
2074} 1935}
@@ -2077,17 +1938,15 @@ static int opal_erase_locking_range(struct opal_dev *dev,
2077 struct opal_session_info *opal_session) 1938 struct opal_session_info *opal_session)
2078{ 1939{
2079 const struct opal_step erase_steps[] = { 1940 const struct opal_step erase_steps[] = {
2080 { opal_discovery0, },
2081 { start_auth_opal_session, opal_session }, 1941 { start_auth_opal_session, opal_session },
2082 { erase_locking_range, opal_session }, 1942 { erase_locking_range, opal_session },
2083 { end_opal_session, }, 1943 { end_opal_session, }
2084 { NULL, }
2085 }; 1944 };
2086 int ret; 1945 int ret;
2087 1946
2088 mutex_lock(&dev->dev_lock); 1947 mutex_lock(&dev->dev_lock);
2089 setup_opal_dev(dev, erase_steps); 1948 setup_opal_dev(dev);
2090 ret = next(dev); 1949 ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
2091 mutex_unlock(&dev->dev_lock); 1950 mutex_unlock(&dev->dev_lock);
2092 return ret; 1951 return ret;
2093} 1952}
@@ -2095,15 +1954,16 @@ static int opal_erase_locking_range(struct opal_dev *dev,
2095static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, 1954static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
2096 struct opal_mbr_data *opal_mbr) 1955 struct opal_mbr_data *opal_mbr)
2097{ 1956{
1957 u8 enable_disable = opal_mbr->enable_disable == OPAL_MBR_ENABLE ?
1958 OPAL_TRUE : OPAL_FALSE;
1959
2098 const struct opal_step mbr_steps[] = { 1960 const struct opal_step mbr_steps[] = {
2099 { opal_discovery0, },
2100 { start_admin1LSP_opal_session, &opal_mbr->key }, 1961 { start_admin1LSP_opal_session, &opal_mbr->key },
2101 { set_mbr_done, &opal_mbr->enable_disable }, 1962 { set_mbr_done, &enable_disable },
2102 { end_opal_session, }, 1963 { end_opal_session, },
2103 { start_admin1LSP_opal_session, &opal_mbr->key }, 1964 { start_admin1LSP_opal_session, &opal_mbr->key },
2104 { set_mbr_enable_disable, &opal_mbr->enable_disable }, 1965 { set_mbr_enable_disable, &enable_disable },
2105 { end_opal_session, }, 1966 { end_opal_session, }
2106 { NULL, }
2107 }; 1967 };
2108 int ret; 1968 int ret;
2109 1969
@@ -2112,8 +1972,8 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
2112 return -EINVAL; 1972 return -EINVAL;
2113 1973
2114 mutex_lock(&dev->dev_lock); 1974 mutex_lock(&dev->dev_lock);
2115 setup_opal_dev(dev, mbr_steps); 1975 setup_opal_dev(dev);
2116 ret = next(dev); 1976 ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
2117 mutex_unlock(&dev->dev_lock); 1977 mutex_unlock(&dev->dev_lock);
2118 return ret; 1978 return ret;
2119} 1979}
@@ -2130,7 +1990,7 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
2130 suspend->lr = lk_unlk->session.opal_key.lr; 1990 suspend->lr = lk_unlk->session.opal_key.lr;
2131 1991
2132 mutex_lock(&dev->dev_lock); 1992 mutex_lock(&dev->dev_lock);
2133 setup_opal_dev(dev, NULL); 1993 setup_opal_dev(dev);
2134 add_suspend_info(dev, suspend); 1994 add_suspend_info(dev, suspend);
2135 mutex_unlock(&dev->dev_lock); 1995 mutex_unlock(&dev->dev_lock);
2136 return 0; 1996 return 0;
@@ -2140,11 +2000,9 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
2140 struct opal_lock_unlock *lk_unlk) 2000 struct opal_lock_unlock *lk_unlk)
2141{ 2001{
2142 const struct opal_step steps[] = { 2002 const struct opal_step steps[] = {
2143 { opal_discovery0, },
2144 { start_admin1LSP_opal_session, &lk_unlk->session.opal_key }, 2003 { start_admin1LSP_opal_session, &lk_unlk->session.opal_key },
2145 { add_user_to_lr, lk_unlk }, 2004 { add_user_to_lr, lk_unlk },
2146 { end_opal_session, }, 2005 { end_opal_session, }
2147 { NULL, }
2148 }; 2006 };
2149 int ret; 2007 int ret;
2150 2008
@@ -2166,8 +2024,8 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
2166 } 2024 }
2167 2025
2168 mutex_lock(&dev->dev_lock); 2026 mutex_lock(&dev->dev_lock);
2169 setup_opal_dev(dev, steps); 2027 setup_opal_dev(dev);
2170 ret = next(dev); 2028 ret = execute_steps(dev, steps, ARRAY_SIZE(steps));
2171 mutex_unlock(&dev->dev_lock); 2029 mutex_unlock(&dev->dev_lock);
2172 return ret; 2030 return ret;
2173} 2031}
@@ -2175,16 +2033,14 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
2175static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal) 2033static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
2176{ 2034{
2177 const struct opal_step revert_steps[] = { 2035 const struct opal_step revert_steps[] = {
2178 { opal_discovery0, },
2179 { start_SIDASP_opal_session, opal }, 2036 { start_SIDASP_opal_session, opal },
2180 { revert_tper, }, /* controller will terminate session */ 2037 { revert_tper, } /* controller will terminate session */
2181 { NULL, }
2182 }; 2038 };
2183 int ret; 2039 int ret;
2184 2040
2185 mutex_lock(&dev->dev_lock); 2041 mutex_lock(&dev->dev_lock);
2186 setup_opal_dev(dev, revert_steps); 2042 setup_opal_dev(dev);
2187 ret = next(dev); 2043 ret = execute_steps(dev, revert_steps, ARRAY_SIZE(revert_steps));
2188 mutex_unlock(&dev->dev_lock); 2044 mutex_unlock(&dev->dev_lock);
2189 2045
2190 /* 2046 /*
@@ -2201,37 +2057,34 @@ static int __opal_lock_unlock(struct opal_dev *dev,
2201 struct opal_lock_unlock *lk_unlk) 2057 struct opal_lock_unlock *lk_unlk)
2202{ 2058{
2203 const struct opal_step unlock_steps[] = { 2059 const struct opal_step unlock_steps[] = {
2204 { opal_discovery0, },
2205 { start_auth_opal_session, &lk_unlk->session }, 2060 { start_auth_opal_session, &lk_unlk->session },
2206 { lock_unlock_locking_range, lk_unlk }, 2061 { lock_unlock_locking_range, lk_unlk },
2207 { end_opal_session, }, 2062 { end_opal_session, }
2208 { NULL, }
2209 }; 2063 };
2210 const struct opal_step unlock_sum_steps[] = { 2064 const struct opal_step unlock_sum_steps[] = {
2211 { opal_discovery0, },
2212 { start_auth_opal_session, &lk_unlk->session }, 2065 { start_auth_opal_session, &lk_unlk->session },
2213 { lock_unlock_locking_range_sum, lk_unlk }, 2066 { lock_unlock_locking_range_sum, lk_unlk },
2214 { end_opal_session, }, 2067 { end_opal_session, }
2215 { NULL, }
2216 }; 2068 };
2217 2069
2218 dev->steps = lk_unlk->session.sum ? unlock_sum_steps : unlock_steps; 2070 if (lk_unlk->session.sum)
2219 return next(dev); 2071 return execute_steps(dev, unlock_sum_steps,
2072 ARRAY_SIZE(unlock_sum_steps));
2073 else
2074 return execute_steps(dev, unlock_steps,
2075 ARRAY_SIZE(unlock_steps));
2220} 2076}
2221 2077
2222static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key) 2078static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
2223{ 2079{
2224 u8 mbr_done_tf = 1; 2080 u8 mbr_done_tf = OPAL_TRUE;
2225 const struct opal_step mbrdone_step [] = { 2081 const struct opal_step mbrdone_step[] = {
2226 { opal_discovery0, },
2227 { start_admin1LSP_opal_session, key }, 2082 { start_admin1LSP_opal_session, key },
2228 { set_mbr_done, &mbr_done_tf }, 2083 { set_mbr_done, &mbr_done_tf },
2229 { end_opal_session, }, 2084 { end_opal_session, }
2230 { NULL, }
2231 }; 2085 };
2232 2086
2233 dev->steps = mbrdone_step; 2087 return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step));
2234 return next(dev);
2235} 2088}
2236 2089
2237static int opal_lock_unlock(struct opal_dev *dev, 2090static int opal_lock_unlock(struct opal_dev *dev,
@@ -2252,14 +2105,12 @@ static int opal_lock_unlock(struct opal_dev *dev,
2252static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) 2105static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
2253{ 2106{
2254 const struct opal_step owner_steps[] = { 2107 const struct opal_step owner_steps[] = {
2255 { opal_discovery0, },
2256 { start_anybodyASP_opal_session, }, 2108 { start_anybodyASP_opal_session, },
2257 { get_msid_cpin_pin, }, 2109 { get_msid_cpin_pin, },
2258 { end_opal_session, }, 2110 { end_opal_session, },
2259 { start_SIDASP_opal_session, opal }, 2111 { start_SIDASP_opal_session, opal },
2260 { set_sid_cpin_pin, opal }, 2112 { set_sid_cpin_pin, opal },
2261 { end_opal_session, }, 2113 { end_opal_session, }
2262 { NULL, }
2263 }; 2114 };
2264 int ret; 2115 int ret;
2265 2116
@@ -2267,21 +2118,20 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
2267 return -ENODEV; 2118 return -ENODEV;
2268 2119
2269 mutex_lock(&dev->dev_lock); 2120 mutex_lock(&dev->dev_lock);
2270 setup_opal_dev(dev, owner_steps); 2121 setup_opal_dev(dev);
2271 ret = next(dev); 2122 ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps));
2272 mutex_unlock(&dev->dev_lock); 2123 mutex_unlock(&dev->dev_lock);
2273 return ret; 2124 return ret;
2274} 2125}
2275 2126
2276static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act) 2127static int opal_activate_lsp(struct opal_dev *dev,
2128 struct opal_lr_act *opal_lr_act)
2277{ 2129{
2278 const struct opal_step active_steps[] = { 2130 const struct opal_step active_steps[] = {
2279 { opal_discovery0, },
2280 { start_SIDASP_opal_session, &opal_lr_act->key }, 2131 { start_SIDASP_opal_session, &opal_lr_act->key },
2281 { get_lsp_lifecycle, }, 2132 { get_lsp_lifecycle, },
2282 { activate_lsp, opal_lr_act }, 2133 { activate_lsp, opal_lr_act },
2283 { end_opal_session, }, 2134 { end_opal_session, }
2284 { NULL, }
2285 }; 2135 };
2286 int ret; 2136 int ret;
2287 2137
@@ -2289,8 +2139,8 @@ static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_a
2289 return -EINVAL; 2139 return -EINVAL;
2290 2140
2291 mutex_lock(&dev->dev_lock); 2141 mutex_lock(&dev->dev_lock);
2292 setup_opal_dev(dev, active_steps); 2142 setup_opal_dev(dev);
2293 ret = next(dev); 2143 ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps));
2294 mutex_unlock(&dev->dev_lock); 2144 mutex_unlock(&dev->dev_lock);
2295 return ret; 2145 return ret;
2296} 2146}
@@ -2299,17 +2149,15 @@ static int opal_setup_locking_range(struct opal_dev *dev,
2299 struct opal_user_lr_setup *opal_lrs) 2149 struct opal_user_lr_setup *opal_lrs)
2300{ 2150{
2301 const struct opal_step lr_steps[] = { 2151 const struct opal_step lr_steps[] = {
2302 { opal_discovery0, },
2303 { start_auth_opal_session, &opal_lrs->session }, 2152 { start_auth_opal_session, &opal_lrs->session },
2304 { setup_locking_range, opal_lrs }, 2153 { setup_locking_range, opal_lrs },
2305 { end_opal_session, }, 2154 { end_opal_session, }
2306 { NULL, }
2307 }; 2155 };
2308 int ret; 2156 int ret;
2309 2157
2310 mutex_lock(&dev->dev_lock); 2158 mutex_lock(&dev->dev_lock);
2311 setup_opal_dev(dev, lr_steps); 2159 setup_opal_dev(dev);
2312 ret = next(dev); 2160 ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
2313 mutex_unlock(&dev->dev_lock); 2161 mutex_unlock(&dev->dev_lock);
2314 return ret; 2162 return ret;
2315} 2163}
@@ -2317,11 +2165,9 @@ static int opal_setup_locking_range(struct opal_dev *dev,
2317static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) 2165static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
2318{ 2166{
2319 const struct opal_step pw_steps[] = { 2167 const struct opal_step pw_steps[] = {
2320 { opal_discovery0, },
2321 { start_auth_opal_session, &opal_pw->session }, 2168 { start_auth_opal_session, &opal_pw->session },
2322 { set_new_pw, &opal_pw->new_user_pw }, 2169 { set_new_pw, &opal_pw->new_user_pw },
2323 { end_opal_session, }, 2170 { end_opal_session, }
2324 { NULL }
2325 }; 2171 };
2326 int ret; 2172 int ret;
2327 2173
@@ -2332,8 +2178,8 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
2332 return -EINVAL; 2178 return -EINVAL;
2333 2179
2334 mutex_lock(&dev->dev_lock); 2180 mutex_lock(&dev->dev_lock);
2335 setup_opal_dev(dev, pw_steps); 2181 setup_opal_dev(dev);
2336 ret = next(dev); 2182 ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
2337 mutex_unlock(&dev->dev_lock); 2183 mutex_unlock(&dev->dev_lock);
2338 return ret; 2184 return ret;
2339} 2185}
@@ -2342,11 +2188,9 @@ static int opal_activate_user(struct opal_dev *dev,
2342 struct opal_session_info *opal_session) 2188 struct opal_session_info *opal_session)
2343{ 2189{
2344 const struct opal_step act_steps[] = { 2190 const struct opal_step act_steps[] = {
2345 { opal_discovery0, },
2346 { start_admin1LSP_opal_session, &opal_session->opal_key }, 2191 { start_admin1LSP_opal_session, &opal_session->opal_key },
2347 { internal_activate_user, opal_session }, 2192 { internal_activate_user, opal_session },
2348 { end_opal_session, }, 2193 { end_opal_session, }
2349 { NULL, }
2350 }; 2194 };
2351 int ret; 2195 int ret;
2352 2196
@@ -2358,8 +2202,8 @@ static int opal_activate_user(struct opal_dev *dev,
2358 } 2202 }
2359 2203
2360 mutex_lock(&dev->dev_lock); 2204 mutex_lock(&dev->dev_lock);
2361 setup_opal_dev(dev, act_steps); 2205 setup_opal_dev(dev);
2362 ret = next(dev); 2206 ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps));
2363 mutex_unlock(&dev->dev_lock); 2207 mutex_unlock(&dev->dev_lock);
2364 return ret; 2208 return ret;
2365} 2209}
@@ -2376,7 +2220,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
2376 return false; 2220 return false;
2377 2221
2378 mutex_lock(&dev->dev_lock); 2222 mutex_lock(&dev->dev_lock);
2379 setup_opal_dev(dev, NULL); 2223 setup_opal_dev(dev);
2380 2224
2381 list_for_each_entry(suspend, &dev->unlk_lst, node) { 2225 list_for_each_entry(suspend, &dev->unlk_lst, node) {
2382 dev->tsn = 0; 2226 dev->tsn = 0;
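Note on the sed-opal hunks above: every ioctl path used to store a NULL-terminated table in dev->steps and drive it through next(); each now hands a local array plus its length to execute_steps(), and the opal_discovery0 entry drops out of every table because discovery is performed inside the helper. A minimal sketch of the shape implied by these call sites, not the verbatim in-tree function (execute_step(), opal_discovery0_step() and end_opal_session_error() are assumed helper names):

static int execute_steps(struct opal_dev *dev,
			 const struct opal_step *steps, size_t n_steps)
{
	size_t i;
	int error;

	/* discovery is implicit now instead of being the first table entry */
	error = opal_discovery0_step(dev);		/* assumed helper */
	if (error)
		return error;

	for (i = 0; i < n_steps; i++) {
		error = execute_step(dev, &steps[i], i);	/* assumed helper */
		if (error)
			goto out_error;
	}
	return 0;

out_error:
	end_opal_session_error(dev);	/* assumed: close the half-built session */
	return error;
}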
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 62aed77d0bb9..0c0094609dd6 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -1,24 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0
1/* 2/*
2 * t10_pi.c - Functions for generating and verifying T10 Protection 3 * t10_pi.c - Functions for generating and verifying T10 Protection
3 * Information. 4 * Information.
4 *
5 * Copyright (C) 2007, 2008, 2014 Oracle Corporation
6 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version
10 * 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; see the file COPYING. If not, write to
19 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
20 * USA.
21 *
22 */ 5 */
23 6
24#include <linux/t10-pi.h> 7#include <linux/t10-pi.h>
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 0903e0803ec8..92b930cb3b72 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1829,6 +1829,7 @@ static int __init fd_probe_drives(void)
1829 disk->major = FLOPPY_MAJOR; 1829 disk->major = FLOPPY_MAJOR;
1830 disk->first_minor = drive; 1830 disk->first_minor = drive;
1831 disk->fops = &floppy_fops; 1831 disk->fops = &floppy_fops;
1832 disk->events = DISK_EVENT_MEDIA_CHANGE;
1832 sprintf(disk->disk_name, "fd%d", drive); 1833 sprintf(disk->disk_name, "fd%d", drive);
1833 disk->private_data = &unit[drive]; 1834 disk->private_data = &unit[drive];
1834 set_capacity(disk, 880*2); 1835 set_capacity(disk, 880*2);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index b0dbbdfeb33e..c7b5c4671f05 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -2028,6 +2028,7 @@ static int __init atari_floppy_init (void)
2028 unit[i].disk->first_minor = i; 2028 unit[i].disk->first_minor = i;
2029 sprintf(unit[i].disk->disk_name, "fd%d", i); 2029 sprintf(unit[i].disk->disk_name, "fd%d", i);
2030 unit[i].disk->fops = &floppy_fops; 2030 unit[i].disk->fops = &floppy_fops;
2031 unit[i].disk->events = DISK_EVENT_MEDIA_CHANGE;
2031 unit[i].disk->private_data = &unit[i]; 2032 unit[i].disk->private_data = &unit[i];
2032 set_capacity(unit[i].disk, MAX_DISK_SIZE * 2); 2033 set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
2033 add_disk(unit[i].disk); 2034 add_disk(unit[i].disk);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c18586fccb6f..17defbf4f332 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -96,13 +96,8 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
96 /* 96 /*
97 * Must use NOIO because we don't want to recurse back into the 97 * Must use NOIO because we don't want to recurse back into the
98 * block or filesystem layers from page reclaim. 98 * block or filesystem layers from page reclaim.
99 *
100 * Cannot support DAX and highmem, because our ->direct_access
101 * routine for DAX must return memory that is always addressable.
102 * If DAX was reworked to use pfns and kmap throughout, this
103 * restriction might be able to be lifted.
104 */ 99 */
105 gfp_flags = GFP_NOIO | __GFP_ZERO; 100 gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
106 page = alloc_page(gfp_flags); 101 page = alloc_page(gfp_flags);
107 if (!page) 102 if (!page)
108 return NULL; 103 return NULL;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f070f7200fc0..549c64df9708 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1317,10 +1317,6 @@ struct bm_extent {
1317 1317
1318#define DRBD_MAX_SECTORS_FIXED_BM \ 1318#define DRBD_MAX_SECTORS_FIXED_BM \
1319 ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) 1319 ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
1320#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
1321#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
1322#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
1323#else
1324#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM 1320#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
1325/* 16 TB in units of sectors */ 1321/* 16 TB in units of sectors */
1326#if BITS_PER_LONG == 32 1322#if BITS_PER_LONG == 32
@@ -1333,7 +1329,6 @@ struct bm_extent {
1333#define DRBD_MAX_SECTORS_FLEX (1UL << 51) 1329#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
1334/* corresponds to (1UL << 38) bits right now. */ 1330/* corresponds to (1UL << 38) bits right now. */
1335#endif 1331#endif
1336#endif
1337 1332
1338/* Estimate max bio size as 256 * PAGE_SIZE, 1333/* Estimate max bio size as 256 * PAGE_SIZE,
1339 * so for typical PAGE_SIZE of 4k, that is (1<<20) Byte. 1334 * so for typical PAGE_SIZE of 4k, that is (1<<20) Byte.
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 49f89db0766f..b8998abd86a5 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4540,6 +4540,7 @@ static int __init do_floppy_init(void)
4540 disks[drive]->major = FLOPPY_MAJOR; 4540 disks[drive]->major = FLOPPY_MAJOR;
4541 disks[drive]->first_minor = TOMINOR(drive); 4541 disks[drive]->first_minor = TOMINOR(drive);
4542 disks[drive]->fops = &floppy_fops; 4542 disks[drive]->fops = &floppy_fops;
4543 disks[drive]->events = DISK_EVENT_MEDIA_CHANGE;
4543 sprintf(disks[drive]->disk_name, "fd%d", drive); 4544 sprintf(disks[drive]->disk_name, "fd%d", drive);
4544 4545
4545 timer_setup(&motor_off_timer[drive], motor_off_callback, 0); 4546 timer_setup(&motor_off_timer[drive], motor_off_callback, 0);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bf1c61cab8eb..102d79575895 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -264,12 +264,20 @@ lo_do_transfer(struct loop_device *lo, int cmd,
264 return ret; 264 return ret;
265} 265}
266 266
267static inline void loop_iov_iter_bvec(struct iov_iter *i,
268 unsigned int direction, const struct bio_vec *bvec,
269 unsigned long nr_segs, size_t count)
270{
271 iov_iter_bvec(i, direction, bvec, nr_segs, count);
272 i->type |= ITER_BVEC_FLAG_NO_REF;
273}
274
267static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) 275static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
268{ 276{
269 struct iov_iter i; 277 struct iov_iter i;
270 ssize_t bw; 278 ssize_t bw;
271 279
272 iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len); 280 loop_iov_iter_bvec(&i, WRITE, bvec, 1, bvec->bv_len);
273 281
274 file_start_write(file); 282 file_start_write(file);
275 bw = vfs_iter_write(file, &i, ppos, 0); 283 bw = vfs_iter_write(file, &i, ppos, 0);
@@ -347,7 +355,7 @@ static int lo_read_simple(struct loop_device *lo, struct request *rq,
347 ssize_t len; 355 ssize_t len;
348 356
349 rq_for_each_segment(bvec, rq, iter) { 357 rq_for_each_segment(bvec, rq, iter) {
350 iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len); 358 loop_iov_iter_bvec(&i, READ, &bvec, 1, bvec.bv_len);
351 len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); 359 len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
352 if (len < 0) 360 if (len < 0)
353 return len; 361 return len;
@@ -388,7 +396,7 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
388 b.bv_offset = 0; 396 b.bv_offset = 0;
389 b.bv_len = bvec.bv_len; 397 b.bv_len = bvec.bv_len;
390 398
391 iov_iter_bvec(&i, READ, &b, 1, b.bv_len); 399 loop_iov_iter_bvec(&i, READ, &b, 1, b.bv_len);
392 len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); 400 len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
393 if (len < 0) { 401 if (len < 0) {
394 ret = len; 402 ret = len;
@@ -555,7 +563,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
555 } 563 }
556 atomic_set(&cmd->ref, 2); 564 atomic_set(&cmd->ref, 2);
557 565
558 iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); 566 loop_iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
559 iter.iov_offset = offset; 567 iter.iov_offset = offset;
560 568
561 cmd->iocb.ki_pos = pos; 569 cmd->iocb.ki_pos = pos;
@@ -900,6 +908,24 @@ static int loop_prepare_queue(struct loop_device *lo)
900 return 0; 908 return 0;
901} 909}
902 910
911static void loop_update_rotational(struct loop_device *lo)
912{
913 struct file *file = lo->lo_backing_file;
914 struct inode *file_inode = file->f_mapping->host;
915 struct block_device *file_bdev = file_inode->i_sb->s_bdev;
916 struct request_queue *q = lo->lo_queue;
917 bool nonrot = true;
918
919 /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
920 if (file_bdev)
921 nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev));
922
923 if (nonrot)
924 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
925 else
926 blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
927}
928
903static int loop_set_fd(struct loop_device *lo, fmode_t mode, 929static int loop_set_fd(struct loop_device *lo, fmode_t mode,
904 struct block_device *bdev, unsigned int arg) 930 struct block_device *bdev, unsigned int arg)
905{ 931{
@@ -963,6 +989,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
963 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) 989 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
964 blk_queue_write_cache(lo->lo_queue, true, false); 990 blk_queue_write_cache(lo->lo_queue, true, false);
965 991
992 loop_update_rotational(lo);
966 loop_update_dio(lo); 993 loop_update_dio(lo);
967 set_capacity(lo->lo_disk, size); 994 set_capacity(lo->lo_disk, size);
968 bd_set_size(bdev, size << 9); 995 bd_set_size(bdev, size << 9);
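The loop hunks above make two independent changes: the bvec iterator setup is wrapped so ITER_BVEC_FLAG_NO_REF is set (the request's pages are owned by the caller, so lower layers are told not to take their own page references), and loop_set_fd() now calls loop_update_rotational() so the loop queue mirrors the backing device's rotational status. A small illustrative check of the visible effect, not part of the patch; blk_queue_nonrot() tests QUEUE_FLAG_NONROT, which is what queue/rotational in sysfs reports, inverted:

	/* Illustrative only: observing the flag set by loop_update_rotational() */
	if (blk_queue_nonrot(lo->lo_queue))
		pr_debug("loop: backing store is non-rotational, sysfs rotational=0\n");
	else
		pr_debug("loop: backing store is rotational, sysfs rotational=1\n");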
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 83302ecdc8db..f0105d118056 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1192,14 +1192,6 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
1192 else 1192 else
1193 clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); 1193 clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
1194 1194
1195#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
1196 /* Demux ID.DRAT & ID.RZAT to determine trim support */
1197 if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
1198 port->dd->trim_supp = true;
1199 else
1200#endif
1201 port->dd->trim_supp = false;
1202
1203 /* Set the identify buffer as valid. */ 1195 /* Set the identify buffer as valid. */
1204 port->identify_valid = 1; 1196 port->identify_valid = 1;
1205 1197
@@ -1387,77 +1379,6 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
1387} 1379}
1388 1380
1389/* 1381/*
1390 * Trim unused sectors
1391 *
1392 * @dd pointer to driver_data structure
1393 * @lba starting lba
1394 * @len # of 512b sectors to trim
1395 */
1396static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
1397 unsigned int len)
1398{
1399 u64 tlba, tlen, sect_left;
1400 struct mtip_trim_entry *buf;
1401 dma_addr_t dma_addr;
1402 struct host_to_dev_fis fis;
1403 blk_status_t ret = BLK_STS_OK;
1404 int i;
1405
1406 if (!len || dd->trim_supp == false)
1407 return BLK_STS_IOERR;
1408
1409 /* Trim request too big */
1410 WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES));
1411
1412 /* Trim request not aligned on 4k boundary */
1413 WARN_ON(len % 8 != 0);
1414
1415 /* Warn if vu_trim structure is too big */
1416 WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
1417
1418 /* Allocate a DMA buffer for the trim structure */
1419 buf = dma_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
1420 GFP_KERNEL);
1421 if (!buf)
1422 return BLK_STS_RESOURCE;
1423 memset(buf, 0, ATA_SECT_SIZE);
1424
1425 for (i = 0, sect_left = len, tlba = lba;
1426 i < MTIP_MAX_TRIM_ENTRIES && sect_left;
1427 i++) {
1428 tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ?
1429 MTIP_MAX_TRIM_ENTRY_LEN :
1430 sect_left);
1431 buf[i].lba = cpu_to_le32(tlba);
1432 buf[i].range = cpu_to_le16(tlen);
1433 tlba += tlen;
1434 sect_left -= tlen;
1435 }
1436 WARN_ON(sect_left != 0);
1437
1438 /* Build the fis */
1439 memset(&fis, 0, sizeof(struct host_to_dev_fis));
1440 fis.type = 0x27;
1441 fis.opts = 1 << 7;
1442 fis.command = 0xfb;
1443 fis.features = 0x60;
1444 fis.sect_count = 1;
1445 fis.device = ATA_DEVICE_OBS;
1446
1447 if (mtip_exec_internal_command(dd->port,
1448 &fis,
1449 5,
1450 dma_addr,
1451 ATA_SECT_SIZE,
1452 0,
1453 MTIP_TRIM_TIMEOUT_MS) < 0)
1454 ret = BLK_STS_IOERR;
1455
1456 dma_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
1457 return ret;
1458}
1459
1460/*
1461 * Get the drive capacity. 1382 * Get the drive capacity.
1462 * 1383 *
1463 * @dd Pointer to the device data structure. 1384 * @dd Pointer to the device data structure.
@@ -3590,8 +3511,6 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
3590 3511
3591 blk_mq_start_request(rq); 3512 blk_mq_start_request(rq);
3592 3513
3593 if (req_op(rq) == REQ_OP_DISCARD)
3594 return mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
3595 mtip_hw_submit_io(dd, rq, cmd, hctx); 3514 mtip_hw_submit_io(dd, rq, cmd, hctx);
3596 return BLK_STS_OK; 3515 return BLK_STS_OK;
3597} 3516}
@@ -3769,14 +3688,6 @@ skip_create_disk:
3769 blk_queue_max_segment_size(dd->queue, 0x400000); 3688 blk_queue_max_segment_size(dd->queue, 0x400000);
3770 blk_queue_io_min(dd->queue, 4096); 3689 blk_queue_io_min(dd->queue, 4096);
3771 3690
3772 /* Signal trim support */
3773 if (dd->trim_supp == true) {
3774 blk_queue_flag_set(QUEUE_FLAG_DISCARD, dd->queue);
3775 dd->queue->limits.discard_granularity = 4096;
3776 blk_queue_max_discard_sectors(dd->queue,
3777 MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
3778 }
3779
3780 /* Set the capacity of the device in 512 byte sectors. */ 3691 /* Set the capacity of the device in 512 byte sectors. */
3781 if (!(mtip_hw_get_capacity(dd, &capacity))) { 3692 if (!(mtip_hw_get_capacity(dd, &capacity))) {
3782 dev_warn(&dd->pdev->dev, 3693 dev_warn(&dd->pdev->dev,
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index abce25f27f57..91c1cb5b1532 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -193,21 +193,6 @@ struct mtip_work {
193 mtip_workq_sdbfx(w->port, group, w->completed); \ 193 mtip_workq_sdbfx(w->port, group, w->completed); \
194 } 194 }
195 195
196#define MTIP_TRIM_TIMEOUT_MS 240000
197#define MTIP_MAX_TRIM_ENTRIES 8
198#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8
199
200struct mtip_trim_entry {
201 __le32 lba; /* starting lba of region */
202 __le16 rsvd; /* unused */
203 __le16 range; /* # of 512b blocks to trim */
204} __packed;
205
206struct mtip_trim {
207 /* Array of regions to trim */
208 struct mtip_trim_entry entry[MTIP_MAX_TRIM_ENTRIES];
209} __packed;
210
211/* Register Frame Information Structure (FIS), host to device. */ 196/* Register Frame Information Structure (FIS), host to device. */
212struct host_to_dev_fis { 197struct host_to_dev_fis {
213 /* 198 /*
@@ -474,8 +459,6 @@ struct driver_data {
474 459
475 struct dentry *dfs_node; 460 struct dentry *dfs_node;
476 461
477 bool trim_supp; /* flag indicating trim support */
478
479 bool sr; 462 bool sr;
480 463
481 int numa_node; /* NUMA support */ 464 int numa_node; /* NUMA support */
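With the long-disabled TRIM path deleted from both mtip32xx files, the driver no longer sets QUEUE_FLAG_DISCARD or a discard limit, so the block layer treats these devices as not supporting discard and REQ_OP_DISCARD requests are never built for them. A minimal sketch of what a caller sees (illustrative, not from this patch):

/* Sketch: generic discard submission refuses early when the queue never
 * advertised support, which is now always the case for mtip32xx.
 */
static int discard_example(struct block_device *bdev, sector_t sector,
			   sector_t nr_sects)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (!blk_queue_discard(q))		/* QUEUE_FLAG_DISCARD not set */
		return -EOPNOTSUPP;		/* what blkdev_issue_discard() reports */

	return blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, 0);
}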
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 6d415b20fb70..001dbdcbf355 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -343,6 +343,7 @@ static void pcd_init_units(void)
343 strcpy(disk->disk_name, cd->name); /* umm... */ 343 strcpy(disk->disk_name, cd->name); /* umm... */
344 disk->fops = &pcd_bdops; 344 disk->fops = &pcd_bdops;
345 disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 345 disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
346 disk->events = DISK_EVENT_MEDIA_CHANGE;
346 } 347 }
347} 348}
348 349
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 0ff9b12d0e35..6f9ad3fc716f 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -897,6 +897,7 @@ static void pd_probe_drive(struct pd_unit *disk)
897 p->fops = &pd_fops; 897 p->fops = &pd_fops;
898 p->major = major; 898 p->major = major;
899 p->first_minor = (disk - pd) << PD_BITS; 899 p->first_minor = (disk - pd) << PD_BITS;
900 p->events = DISK_EVENT_MEDIA_CHANGE;
900 disk->gd = p; 901 disk->gd = p;
901 p->private_data = disk; 902 p->private_data = disk;
902 903
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 35e6e271b219..1e9c50a7256c 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -319,6 +319,7 @@ static void __init pf_init_units(void)
319 disk->first_minor = unit; 319 disk->first_minor = unit;
320 strcpy(disk->disk_name, pf->name); 320 strcpy(disk->disk_name, pf->name);
321 disk->fops = &pf_fops; 321 disk->fops = &pf_fops;
322 disk->events = DISK_EVENT_MEDIA_CHANGE;
322 if (!(*drives[unit])[D_PRT]) 323 if (!(*drives[unit])[D_PRT])
323 pf_drive_count++; 324 pf_drive_count++;
324 } 325 }
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index f5a71023f76c..024060165afa 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2761,7 +2761,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
2761 2761
2762 /* inherit events of the host device */ 2762 /* inherit events of the host device */
2763 disk->events = pd->bdev->bd_disk->events; 2763 disk->events = pd->bdev->bd_disk->events;
2764 disk->async_events = pd->bdev->bd_disk->async_events;
2765 2764
2766 add_disk(disk); 2765 add_disk(disk);
2767 2766
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 4e1d9b31f60c..cc61c5ce3ad5 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -102,7 +102,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
102 102
103 rq_for_each_segment(bvec, req, iter) { 103 rq_for_each_segment(bvec, req, iter) {
104 unsigned long flags; 104 unsigned long flags;
105 dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %lu\n", 105 dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %llu\n",
106 __func__, __LINE__, i, bio_sectors(iter.bio), 106 __func__, __LINE__, i, bio_sectors(iter.bio),
107 iter.bio->bi_iter.bi_sector); 107 iter.bio->bi_iter.bi_sector);
108 108
@@ -496,7 +496,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
496 dev->regions[dev->region_idx].size*priv->blocking_factor); 496 dev->regions[dev->region_idx].size*priv->blocking_factor);
497 497
498 dev_info(&dev->sbd.core, 498 dev_info(&dev->sbd.core,
499 "%s is a %s (%llu MiB total, %lu MiB for OtherOS)\n", 499 "%s is a %s (%llu MiB total, %llu MiB for OtherOS)\n",
500 gendisk->disk_name, priv->model, priv->raw_capacity >> 11, 500 gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
501 get_capacity(gendisk) >> 11); 501 get_capacity(gendisk) >> 11);
502 502
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 3fa6fcc34790..67b5ec281c6d 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -862,6 +862,7 @@ static int swim_floppy_init(struct swim_priv *swd)
862 swd->unit[drive].disk->first_minor = drive; 862 swd->unit[drive].disk->first_minor = drive;
863 sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); 863 sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
864 swd->unit[drive].disk->fops = &floppy_fops; 864 swd->unit[drive].disk->fops = &floppy_fops;
865 swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE;
865 swd->unit[drive].disk->private_data = &swd->unit[drive]; 866 swd->unit[drive].disk->private_data = &swd->unit[drive];
866 set_capacity(swd->unit[drive].disk, 2880); 867 set_capacity(swd->unit[drive].disk, 2880);
867 add_disk(swd->unit[drive].disk); 868 add_disk(swd->unit[drive].disk);
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 1e2ae90d7715..cf42729c788e 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1216,6 +1216,7 @@ static int swim3_attach(struct macio_dev *mdev,
1216 disk->first_minor = floppy_count; 1216 disk->first_minor = floppy_count;
1217 disk->fops = &floppy_fops; 1217 disk->fops = &floppy_fops;
1218 disk->private_data = fs; 1218 disk->private_data = fs;
1219 disk->events = DISK_EVENT_MEDIA_CHANGE;
1219 disk->flags |= GENHD_FL_REMOVABLE; 1220 disk->flags |= GENHD_FL_REMOVABLE;
1220 sprintf(disk->disk_name, "fd%d", floppy_count); 1221 sprintf(disk->disk_name, "fd%d", floppy_count);
1221 set_capacity(disk, 2880); 1222 set_capacity(disk, 2880);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2a7ca4a1e6f7..f1d90cd3dc47 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -693,7 +693,8 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
693{ 693{
694 struct virtio_blk *vblk = set->driver_data; 694 struct virtio_blk *vblk = set->driver_data;
695 695
696 return blk_mq_virtio_map_queues(&set->map[0], vblk->vdev, 0); 696 return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
697 vblk->vdev, 0);
697} 698}
698 699
699#ifdef CONFIG_VIRTIO_BLK_SCSI 700#ifdef CONFIG_VIRTIO_BLK_SCSI
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 32a21b8d1d85..464c9092bc8b 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -1032,6 +1032,7 @@ static int ace_setup(struct ace_device *ace)
1032 ace->gd->major = ace_major; 1032 ace->gd->major = ace_major;
1033 ace->gd->first_minor = ace->id * ACE_NUM_MINORS; 1033 ace->gd->first_minor = ace->id * ACE_NUM_MINORS;
1034 ace->gd->fops = &ace_fops; 1034 ace->gd->fops = &ace_fops;
1035 ace->gd->events = DISK_EVENT_MEDIA_CHANGE;
1035 ace->gd->queue = ace->queue; 1036 ace->gd->queue = ace->queue;
1036 ace->gd->private_data = ace; 1037 ace->gd->private_data = ace;
1037 snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a'); 1038 snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a');
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index f8b7345fe1cb..5cf3bade0d57 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -786,6 +786,7 @@ static int probe_gdrom(struct platform_device *devptr)
786 goto probe_fail_cdrom_register; 786 goto probe_fail_cdrom_register;
787 } 787 }
788 gd.disk->fops = &gdrom_bdops; 788 gd.disk->fops = &gdrom_bdops;
789 gd.disk->events = DISK_EVENT_MEDIA_CHANGE;
789 /* latch on to the interrupt */ 790 /* latch on to the interrupt */
790 err = gdrom_set_interrupt_handlers(); 791 err = gdrom_set_interrupt_handlers();
791 if (err) 792 if (err)
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 1f03884a6808..3b15adc6ce98 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1797,6 +1797,7 @@ static int ide_cd_probe(ide_drive_t *drive)
1797 ide_cd_read_toc(drive); 1797 ide_cd_read_toc(drive);
1798 g->fops = &idecd_ops; 1798 g->fops = &idecd_ops;
1799 g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 1799 g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
1800 g->events = DISK_EVENT_MEDIA_CHANGE;
1800 device_add_disk(&drive->gendev, g, NULL); 1801 device_add_disk(&drive->gendev, g, NULL);
1801 return 0; 1802 return 0;
1802 1803
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 4a6e1a413ead..46f2df288c6a 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -82,8 +82,9 @@ int ide_cdrom_drive_status(struct cdrom_device_info *cdi, int slot_nr)
82 82
83/* 83/*
84 * ide-cd always generates media changed event if media is missing, which 84 * ide-cd always generates media changed event if media is missing, which
85 * makes it impossible to use for proper event reporting, so disk->events 85 * makes it impossible to use for proper event reporting, so
86 * is cleared to 0 and the following function is used only to trigger 86 * DISK_EVENT_FLAG_UEVENT is cleared in disk->event_flags
87 * and the following function is used only to trigger
87 * revalidation and never propagated to userland. 88 * revalidation and never propagated to userland.
88 */ 89 */
89unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi, 90unsigned int ide_cdrom_check_events_real(struct cdrom_device_info *cdi,
diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c
index 04e008e8f6f9..f233b34ea0c0 100644
--- a/drivers/ide/ide-gd.c
+++ b/drivers/ide/ide-gd.c
@@ -299,8 +299,9 @@ static unsigned int ide_gd_check_events(struct gendisk *disk,
299 /* 299 /*
300 * The following is used to force revalidation on the first open on 300 * The following is used to force revalidation on the first open on
301 * removeable devices, and never gets reported to userland as 301 * removeable devices, and never gets reported to userland as
302 * genhd->events is 0. This is intended as removeable ide disk 302 * DISK_EVENT_FLAG_UEVENT isn't set in genhd->event_flags.
303 * can't really detect MEDIA_CHANGE events. 303 * This is intended as removable ide disk can't really detect
304 * MEDIA_CHANGE events.
304 */ 305 */
305 ret = drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED; 306 ret = drive->dev_flags & IDE_DFLAG_MEDIA_CHANGED;
306 drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED; 307 drive->dev_flags &= ~IDE_DFLAG_MEDIA_CHANGED;
@@ -416,6 +417,7 @@ static int ide_gd_probe(ide_drive_t *drive)
416 if (drive->dev_flags & IDE_DFLAG_REMOVABLE) 417 if (drive->dev_flags & IDE_DFLAG_REMOVABLE)
417 g->flags = GENHD_FL_REMOVABLE; 418 g->flags = GENHD_FL_REMOVABLE;
418 g->fops = &ide_gd_ops; 419 g->fops = &ide_gd_ops;
420 g->events = DISK_EVENT_MEDIA_CHANGE;
419 device_add_disk(&drive->gendev, g, NULL); 421 device_add_disk(&drive->gendev, g, NULL);
420 return 0; 422 return 0;
421 423
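The driver hunks above all apply one pattern from the media-change rework: each removable-media driver now declares the events it can actually detect in disk->events, and, per the comments in the ide hunks, delivery to userland is gated separately by DISK_EVENT_FLAG_UEVENT in disk->event_flags. A minimal sketch of the registration idiom (field and flag names are the ones quoted in the comments above; the function itself is illustrative):

/* Sketch of the per-driver pattern, not any one driver's code.
 * Definitions come from <linux/genhd.h>.
 */
static void example_add_removable_disk(struct device *parent,
				       struct gendisk *disk)
{
	disk->events = DISK_EVENT_MEDIA_CHANGE;	/* events the driver can signal */
	/*
	 * A driver that also wants these events pushed to userland would set
	 * disk->event_flags = DISK_EVENT_FLAG_UEVENT; ide-cd and ide-gd leave
	 * it clear so check_events only triggers revalidation (see above).
	 */
	device_add_disk(parent, disk, NULL);
}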
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 5002838ea476..f8986effcb50 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -327,10 +327,11 @@ static int bch_allocator_thread(void *arg)
327 * possibly issue discards to them, then we add the bucket to 327 * possibly issue discards to them, then we add the bucket to
328 * the free list: 328 * the free list:
329 */ 329 */
330 while (!fifo_empty(&ca->free_inc)) { 330 while (1) {
331 long bucket; 331 long bucket;
332 332
333 fifo_pop(&ca->free_inc, bucket); 333 if (!fifo_pop(&ca->free_inc, bucket))
334 break;
334 335
335 if (ca->discard) { 336 if (ca->discard) {
336 mutex_unlock(&ca->set->bucket_lock); 337 mutex_unlock(&ca->set->bucket_lock);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 64def336f053..773f5fdad25f 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -429,14 +429,14 @@ static void do_btree_node_write(struct btree *b)
429 bset_sector_offset(&b->keys, i)); 429 bset_sector_offset(&b->keys, i));
430 430
431 if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { 431 if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
432 int j;
433 struct bio_vec *bv; 432 struct bio_vec *bv;
434 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 433 void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
435 struct bvec_iter_all iter_all; 434 struct bvec_iter_all iter_all;
436 435
437 bio_for_each_segment_all(bv, b->bio, j, iter_all) 436 bio_for_each_segment_all(bv, b->bio, iter_all) {
438 memcpy(page_address(bv->bv_page), 437 memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
439 base + j * PAGE_SIZE, PAGE_SIZE); 438 addr += PAGE_SIZE;
439 }
440 440
441 bch_submit_bbio(b->bio, b->c, &k.key, 0); 441 bch_submit_bbio(b->bio, b->c, &k.key, 0);
442 442
@@ -1476,11 +1476,11 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1476 1476
1477out_nocoalesce: 1477out_nocoalesce:
1478 closure_sync(&cl); 1478 closure_sync(&cl);
1479 bch_keylist_free(&keylist);
1480 1479
1481 while ((k = bch_keylist_pop(&keylist))) 1480 while ((k = bch_keylist_pop(&keylist)))
1482 if (!bkey_cmp(k, &ZERO_KEY)) 1481 if (!bkey_cmp(k, &ZERO_KEY))
1483 atomic_dec(&b->c->prio_blocked); 1482 atomic_dec(&b->c->prio_blocked);
1483 bch_keylist_free(&keylist);
1484 1484
1485 for (i = 0; i < nodes; i++) 1485 for (i = 0; i < nodes; i++)
1486 if (!IS_ERR_OR_NULL(new_nodes[i])) { 1486 if (!IS_ERR_OR_NULL(new_nodes[i])) {
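This btree.c hunk (and the dm-crypt.c one further down) shows the reworked bio_for_each_segment_all(): the integer index argument is gone and iteration state lives in a caller-supplied struct bvec_iter_all, so any per-page offset (here the advancing addr pointer) has to be tracked explicitly. The second hunk frees the keylist only after the remaining keys have been popped, instead of before. A minimal sketch of the new iteration idiom (the per-page work is a placeholder):

	/* Sketch: walking every segment of a completed bio with the new helper */
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bv, bio, iter_all) {
		void *p = page_address(bv->bv_page);	/* assumes lowmem pages */

		memset(p + bv->bv_offset, 0, bv->bv_len);	/* placeholder work */
	}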
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index b2fd412715b1..12dae9348147 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -147,7 +147,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
147{ 147{
148#define read_bucket(b) \ 148#define read_bucket(b) \
149 ({ \ 149 ({ \
150 int ret = journal_read_bucket(ca, list, b); \ 150 ret = journal_read_bucket(ca, list, b); \
151 __set_bit(b, bitmap); \ 151 __set_bit(b, bitmap); \
152 if (ret < 0) \ 152 if (ret < 0) \
153 return ret; \ 153 return ret; \
@@ -156,6 +156,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
156 156
157 struct cache *ca; 157 struct cache *ca;
158 unsigned int iter; 158 unsigned int iter;
159 int ret = 0;
159 160
160 for_each_cache(ca, c, iter) { 161 for_each_cache(ca, c, iter) {
161 struct journal_device *ja = &ca->journal; 162 struct journal_device *ja = &ca->journal;
@@ -267,7 +268,7 @@ bsearch:
267 struct journal_replay, 268 struct journal_replay,
268 list)->j.seq; 269 list)->j.seq;
269 270
270 return 0; 271 return ret;
271#undef read_bucket 272#undef read_bucket
272} 273}
273 274
@@ -317,6 +318,18 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
317 } 318 }
318} 319}
319 320
321static bool is_discard_enabled(struct cache_set *s)
322{
323 struct cache *ca;
324 unsigned int i;
325
326 for_each_cache(ca, s, i)
327 if (ca->discard)
328 return true;
329
330 return false;
331}
332
320int bch_journal_replay(struct cache_set *s, struct list_head *list) 333int bch_journal_replay(struct cache_set *s, struct list_head *list)
321{ 334{
322 int ret = 0, keys = 0, entries = 0; 335 int ret = 0, keys = 0, entries = 0;
@@ -330,9 +343,17 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
330 list_for_each_entry(i, list, list) { 343 list_for_each_entry(i, list, list) {
331 BUG_ON(i->pin && atomic_read(i->pin) != 1); 344 BUG_ON(i->pin && atomic_read(i->pin) != 1);
332 345
333 cache_set_err_on(n != i->j.seq, s, 346 if (n != i->j.seq) {
334"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", 347 if (n == start && is_discard_enabled(s))
335 n, i->j.seq - 1, start, end); 348 pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)",
349 n, i->j.seq - 1, start, end);
350 else {
351 pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
352 n, i->j.seq - 1, start, end);
353 ret = -EIO;
354 goto err;
355 }
356 }
336 357
337 for (k = i->j.start; 358 for (k = i->j.start;
338 k < bset_bkey_last(&i->j); 359 k < bset_bkey_last(&i->j);
@@ -540,11 +561,11 @@ static void journal_reclaim(struct cache_set *c)
540 ca->sb.nr_this_dev); 561 ca->sb.nr_this_dev);
541 } 562 }
542 563
543 bkey_init(k); 564 if (n) {
544 SET_KEY_PTRS(k, n); 565 bkey_init(k);
545 566 SET_KEY_PTRS(k, n);
546 if (n)
547 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; 567 c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
568 }
548out: 569out:
549 if (!journal_full(&c->journal)) 570 if (!journal_full(&c->journal))
550 __closure_wake_up(&c->journal.wait); 571 __closure_wake_up(&c->journal.wait);
@@ -671,6 +692,9 @@ static void journal_write_unlocked(struct closure *cl)
671 ca->journal.seq[ca->journal.cur_idx] = w->data->seq; 692 ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
672 } 693 }
673 694
695 /* If KEY_PTRS(k) == 0, this jset gets lost in air */
696 BUG_ON(i == 0);
697
674 atomic_dec_bug(&fifo_back(&c->journal.pin)); 698 atomic_dec_bug(&fifo_back(&c->journal.pin));
675 bch_journal_next(&c->journal); 699 bch_journal_next(&c->journal);
676 journal_reclaim(c); 700 journal_reclaim(c);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f101bfe8657a..41adcd1546f1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -329,12 +329,13 @@ void bch_data_insert(struct closure *cl)
329 bch_data_insert_start(cl); 329 bch_data_insert_start(cl);
330} 330}
331 331
332/* Congested? */ 332/*
333 333 * Congested? Return 0 (not congested) or the limit (in sectors)
334unsigned int bch_get_congested(struct cache_set *c) 334 * beyond which we should bypass the cache due to congestion.
335 */
336unsigned int bch_get_congested(const struct cache_set *c)
335{ 337{
336 int i; 338 int i;
337 long rand;
338 339
339 if (!c->congested_read_threshold_us && 340 if (!c->congested_read_threshold_us &&
340 !c->congested_write_threshold_us) 341 !c->congested_write_threshold_us)
@@ -353,8 +354,7 @@ unsigned int bch_get_congested(struct cache_set *c)
353 if (i > 0) 354 if (i > 0)
354 i = fract_exp_two(i, 6); 355 i = fract_exp_two(i, 6);
355 356
356 rand = get_random_int(); 357 i -= hweight32(get_random_u32());
357 i -= bitmap_weight(&rand, BITS_PER_LONG);
358 358
359 return i > 0 ? i : 1; 359 return i > 0 ? i : 1;
360} 360}
@@ -376,7 +376,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
376{ 376{
377 struct cache_set *c = dc->disk.c; 377 struct cache_set *c = dc->disk.c;
378 unsigned int mode = cache_mode(dc); 378 unsigned int mode = cache_mode(dc);
379 unsigned int sectors, congested = bch_get_congested(c); 379 unsigned int sectors, congested;
380 struct task_struct *task = current; 380 struct task_struct *task = current;
381 struct io *i; 381 struct io *i;
382 382
@@ -412,6 +412,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
412 goto rescale; 412 goto rescale;
413 } 413 }
414 414
415 congested = bch_get_congested(c);
415 if (!congested && !dc->sequential_cutoff) 416 if (!congested && !dc->sequential_cutoff)
416 goto rescale; 417 goto rescale;
417 418
@@ -706,14 +707,14 @@ static void search_free(struct closure *cl)
706{ 707{
707 struct search *s = container_of(cl, struct search, cl); 708 struct search *s = container_of(cl, struct search, cl);
708 709
709 atomic_dec(&s->d->c->search_inflight); 710 atomic_dec(&s->iop.c->search_inflight);
710 711
711 if (s->iop.bio) 712 if (s->iop.bio)
712 bio_put(s->iop.bio); 713 bio_put(s->iop.bio);
713 714
714 bio_complete(s); 715 bio_complete(s);
715 closure_debug_destroy(cl); 716 closure_debug_destroy(cl);
716 mempool_free(s, &s->d->c->search); 717 mempool_free(s, &s->iop.c->search);
717} 718}
718 719
719static inline struct search *search_alloc(struct bio *bio, 720static inline struct search *search_alloc(struct bio *bio,
@@ -756,13 +757,13 @@ static void cached_dev_bio_complete(struct closure *cl)
756 struct search *s = container_of(cl, struct search, cl); 757 struct search *s = container_of(cl, struct search, cl);
757 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); 758 struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
758 759
759 search_free(cl);
760 cached_dev_put(dc); 760 cached_dev_put(dc);
761 search_free(cl);
761} 762}
762 763
763/* Process reads */ 764/* Process reads */
764 765
765static void cached_dev_cache_miss_done(struct closure *cl) 766static void cached_dev_read_error_done(struct closure *cl)
766{ 767{
767 struct search *s = container_of(cl, struct search, cl); 768 struct search *s = container_of(cl, struct search, cl);
768 769
@@ -800,7 +801,22 @@ static void cached_dev_read_error(struct closure *cl)
800 closure_bio_submit(s->iop.c, bio, cl); 801 closure_bio_submit(s->iop.c, bio, cl);
801 } 802 }
802 803
803 continue_at(cl, cached_dev_cache_miss_done, NULL); 804 continue_at(cl, cached_dev_read_error_done, NULL);
805}
806
807static void cached_dev_cache_miss_done(struct closure *cl)
808{
809 struct search *s = container_of(cl, struct search, cl);
810 struct bcache_device *d = s->d;
811
812 if (s->iop.replace_collision)
813 bch_mark_cache_miss_collision(s->iop.c, s->d);
814
815 if (s->iop.bio)
816 bio_free_pages(s->iop.bio);
817
818 cached_dev_bio_complete(cl);
819 closure_put(&d->cl);
804} 820}
805 821
806static void cached_dev_read_done(struct closure *cl) 822static void cached_dev_read_done(struct closure *cl)
@@ -833,6 +849,7 @@ static void cached_dev_read_done(struct closure *cl)
833 if (verify(dc) && s->recoverable && !s->read_dirty_data) 849 if (verify(dc) && s->recoverable && !s->read_dirty_data)
834 bch_data_verify(dc, s->orig_bio); 850 bch_data_verify(dc, s->orig_bio);
835 851
852 closure_get(&dc->disk.cl);
836 bio_complete(s); 853 bio_complete(s);
837 854
838 if (s->iop.bio && 855 if (s->iop.bio &&
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 721bf336ed1a..c64dbd7a91aa 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -33,7 +33,7 @@ struct data_insert_op {
33 BKEY_PADDED(replace_key); 33 BKEY_PADDED(replace_key);
34}; 34};
35 35
36unsigned int bch_get_congested(struct cache_set *c); 36unsigned int bch_get_congested(const struct cache_set *c);
37void bch_data_insert(struct closure *cl); 37void bch_data_insert(struct closure *cl);
38 38
39void bch_cached_dev_request_init(struct cached_dev *dc); 39void bch_cached_dev_request_init(struct cached_dev *dc);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a697a3a923cd..1b63ac876169 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -662,6 +662,11 @@ static const struct block_device_operations bcache_ops = {
662void bcache_device_stop(struct bcache_device *d) 662void bcache_device_stop(struct bcache_device *d)
663{ 663{
664 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) 664 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
665 /*
666 * closure_fn set to
667 * - cached device: cached_dev_flush()
668 * - flash dev: flash_dev_flush()
669 */
665 closure_queue(&d->cl); 670 closure_queue(&d->cl);
666} 671}
667 672
@@ -906,21 +911,18 @@ static int cached_dev_status_update(void *arg)
906void bch_cached_dev_run(struct cached_dev *dc) 911void bch_cached_dev_run(struct cached_dev *dc)
907{ 912{
908 struct bcache_device *d = &dc->disk; 913 struct bcache_device *d = &dc->disk;
909 char buf[SB_LABEL_SIZE + 1]; 914 char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
910 char *env[] = { 915 char *env[] = {
911 "DRIVER=bcache", 916 "DRIVER=bcache",
912 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), 917 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
913 NULL, 918 kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
914 NULL, 919 NULL,
915 }; 920 };
916 921
917 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
918 buf[SB_LABEL_SIZE] = '\0';
919 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
920
921 if (atomic_xchg(&dc->running, 1)) { 922 if (atomic_xchg(&dc->running, 1)) {
922 kfree(env[1]); 923 kfree(env[1]);
923 kfree(env[2]); 924 kfree(env[2]);
925 kfree(buf);
924 return; 926 return;
925 } 927 }
926 928
@@ -944,6 +946,7 @@ void bch_cached_dev_run(struct cached_dev *dc)
944 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); 946 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
945 kfree(env[1]); 947 kfree(env[1]);
946 kfree(env[2]); 948 kfree(env[2]);
949 kfree(buf);
947 950
948 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || 951 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
949 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) 952 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
@@ -1174,6 +1177,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1174 return 0; 1177 return 0;
1175} 1178}
1176 1179
1180/* when dc->disk.kobj released */
1177void bch_cached_dev_release(struct kobject *kobj) 1181void bch_cached_dev_release(struct kobject *kobj)
1178{ 1182{
1179 struct cached_dev *dc = container_of(kobj, struct cached_dev, 1183 struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -1280,7 +1284,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1280 1284
1281/* Cached device - bcache superblock */ 1285/* Cached device - bcache superblock */
1282 1286
1283static void register_bdev(struct cache_sb *sb, struct page *sb_page, 1287static int register_bdev(struct cache_sb *sb, struct page *sb_page,
1284 struct block_device *bdev, 1288 struct block_device *bdev,
1285 struct cached_dev *dc) 1289 struct cached_dev *dc)
1286{ 1290{
@@ -1318,14 +1322,16 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
1318 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) 1322 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
1319 bch_cached_dev_run(dc); 1323 bch_cached_dev_run(dc);
1320 1324
1321 return; 1325 return 0;
1322err: 1326err:
1323 pr_notice("error %s: %s", dc->backing_dev_name, err); 1327 pr_notice("error %s: %s", dc->backing_dev_name, err);
1324 bcache_device_stop(&dc->disk); 1328 bcache_device_stop(&dc->disk);
1329 return -EIO;
1325} 1330}
1326 1331
1327/* Flash only volumes */ 1332/* Flash only volumes */
1328 1333
1334/* When d->kobj released */
1329void bch_flash_dev_release(struct kobject *kobj) 1335void bch_flash_dev_release(struct kobject *kobj)
1330{ 1336{
1331 struct bcache_device *d = container_of(kobj, struct bcache_device, 1337 struct bcache_device *d = container_of(kobj, struct bcache_device,
@@ -1496,6 +1502,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1496 return true; 1502 return true;
1497} 1503}
1498 1504
1505/* When c->kobj released */
1499void bch_cache_set_release(struct kobject *kobj) 1506void bch_cache_set_release(struct kobject *kobj)
1500{ 1507{
1501 struct cache_set *c = container_of(kobj, struct cache_set, kobj); 1508 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
@@ -1516,6 +1523,7 @@ static void cache_set_free(struct closure *cl)
1516 bch_btree_cache_free(c); 1523 bch_btree_cache_free(c);
1517 bch_journal_free(c); 1524 bch_journal_free(c);
1518 1525
1526 mutex_lock(&bch_register_lock);
1519 for_each_cache(ca, c, i) 1527 for_each_cache(ca, c, i)
1520 if (ca) { 1528 if (ca) {
1521 ca->set = NULL; 1529 ca->set = NULL;
@@ -1534,7 +1542,6 @@ static void cache_set_free(struct closure *cl)
1534 mempool_exit(&c->search); 1542 mempool_exit(&c->search);
1535 kfree(c->devices); 1543 kfree(c->devices);
1536 1544
1537 mutex_lock(&bch_register_lock);
1538 list_del(&c->list); 1545 list_del(&c->list);
1539 mutex_unlock(&bch_register_lock); 1546 mutex_unlock(&bch_register_lock);
1540 1547
@@ -1673,6 +1680,7 @@ static void __cache_set_unregister(struct closure *cl)
1673void bch_cache_set_stop(struct cache_set *c) 1680void bch_cache_set_stop(struct cache_set *c)
1674{ 1681{
1675 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) 1682 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1683 /* closure_fn set to __cache_set_unregister() */
1676 closure_queue(&c->caching); 1684 closure_queue(&c->caching);
1677} 1685}
1678 1686
@@ -1775,13 +1783,15 @@ err:
1775 return NULL; 1783 return NULL;
1776} 1784}
1777 1785
1778static void run_cache_set(struct cache_set *c) 1786static int run_cache_set(struct cache_set *c)
1779{ 1787{
1780 const char *err = "cannot allocate memory"; 1788 const char *err = "cannot allocate memory";
1781 struct cached_dev *dc, *t; 1789 struct cached_dev *dc, *t;
1782 struct cache *ca; 1790 struct cache *ca;
1783 struct closure cl; 1791 struct closure cl;
1784 unsigned int i; 1792 unsigned int i;
1793 LIST_HEAD(journal);
1794 struct journal_replay *l;
1785 1795
1786 closure_init_stack(&cl); 1796 closure_init_stack(&cl);
1787 1797
@@ -1790,7 +1800,6 @@ static void run_cache_set(struct cache_set *c)
1790 set_gc_sectors(c); 1800 set_gc_sectors(c);
1791 1801
1792 if (CACHE_SYNC(&c->sb)) { 1802 if (CACHE_SYNC(&c->sb)) {
1793 LIST_HEAD(journal);
1794 struct bkey *k; 1803 struct bkey *k;
1795 struct jset *j; 1804 struct jset *j;
1796 1805
@@ -1869,7 +1878,9 @@ static void run_cache_set(struct cache_set *c)
1869 if (j->version < BCACHE_JSET_VERSION_UUID) 1878 if (j->version < BCACHE_JSET_VERSION_UUID)
1870 __uuid_write(c); 1879 __uuid_write(c);
1871 1880
1872 bch_journal_replay(c, &journal); 1881 err = "bcache: replay journal failed";
1882 if (bch_journal_replay(c, &journal))
1883 goto err;
1873 } else { 1884 } else {
1874 pr_notice("invalidating existing data"); 1885 pr_notice("invalidating existing data");
1875 1886
@@ -1937,11 +1948,19 @@ static void run_cache_set(struct cache_set *c)
1937 flash_devs_run(c); 1948 flash_devs_run(c);
1938 1949
1939 set_bit(CACHE_SET_RUNNING, &c->flags); 1950 set_bit(CACHE_SET_RUNNING, &c->flags);
1940 return; 1951 return 0;
1941err: 1952err:
1953 while (!list_empty(&journal)) {
1954 l = list_first_entry(&journal, struct journal_replay, list);
1955 list_del(&l->list);
1956 kfree(l);
1957 }
1958
1942 closure_sync(&cl); 1959 closure_sync(&cl);
1943 /* XXX: test this, it's broken */ 1960 /* XXX: test this, it's broken */
1944 bch_cache_set_error(c, "%s", err); 1961 bch_cache_set_error(c, "%s", err);
1962
1963 return -EIO;
1945} 1964}
1946 1965
1947static bool can_attach_cache(struct cache *ca, struct cache_set *c) 1966static bool can_attach_cache(struct cache *ca, struct cache_set *c)
@@ -2005,8 +2024,11 @@ found:
2005 ca->set->cache[ca->sb.nr_this_dev] = ca; 2024 ca->set->cache[ca->sb.nr_this_dev] = ca;
2006 c->cache_by_alloc[c->caches_loaded++] = ca; 2025 c->cache_by_alloc[c->caches_loaded++] = ca;
2007 2026
2008 if (c->caches_loaded == c->sb.nr_in_set) 2027 if (c->caches_loaded == c->sb.nr_in_set) {
2009 run_cache_set(c); 2028 err = "failed to run cache set";
2029 if (run_cache_set(c) < 0)
2030 goto err;
2031 }
2010 2032
2011 return NULL; 2033 return NULL;
2012err: 2034err:
@@ -2016,6 +2038,7 @@ err:
2016 2038
2017/* Cache device */ 2039/* Cache device */
2018 2040
2041/* When ca->kobj released */
2019void bch_cache_release(struct kobject *kobj) 2042void bch_cache_release(struct kobject *kobj)
2020{ 2043{
2021 struct cache *ca = container_of(kobj, struct cache, kobj); 2044 struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -2179,6 +2202,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
2179 2202
2180 ret = cache_alloc(ca); 2203 ret = cache_alloc(ca);
2181 if (ret != 0) { 2204 if (ret != 0) {
2205 /*
2206 * If we failed here, it means ca->kobj is not initialized yet,
2207 * kobject_put() won't be called and there is no chance to
2208 * call blkdev_put() to bdev in bch_cache_release(). So we
2209 * explicitly call blkdev_put() here.
2210 */
2182 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2211 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2183 if (ret == -ENOMEM) 2212 if (ret == -ENOMEM)
2184 err = "cache_alloc(): -ENOMEM"; 2213 err = "cache_alloc(): -ENOMEM";
@@ -2262,7 +2291,7 @@ static bool bch_is_open(struct block_device *bdev)
2262static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, 2291static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2263 const char *buffer, size_t size) 2292 const char *buffer, size_t size)
2264{ 2293{
2265 ssize_t ret = size; 2294 ssize_t ret = -EINVAL;
2266 const char *err = "cannot allocate memory"; 2295 const char *err = "cannot allocate memory";
2267 char *path = NULL; 2296 char *path = NULL;
2268 struct cache_sb *sb = NULL; 2297 struct cache_sb *sb = NULL;
@@ -2296,7 +2325,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2296 if (!IS_ERR(bdev)) 2325 if (!IS_ERR(bdev))
2297 bdput(bdev); 2326 bdput(bdev);
2298 if (attr == &ksysfs_register_quiet) 2327 if (attr == &ksysfs_register_quiet)
2299 goto out; 2328 goto quiet_out;
2300 } 2329 }
2301 goto err; 2330 goto err;
2302 } 2331 }
@@ -2317,17 +2346,23 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2317 goto err_close; 2346 goto err_close;
2318 2347
2319 mutex_lock(&bch_register_lock); 2348 mutex_lock(&bch_register_lock);
2320 register_bdev(sb, sb_page, bdev, dc); 2349 ret = register_bdev(sb, sb_page, bdev, dc);
2321 mutex_unlock(&bch_register_lock); 2350 mutex_unlock(&bch_register_lock);
2351 /* blkdev_put() will be called in cached_dev_free() */
2352 if (ret < 0)
2353 goto err;
2322 } else { 2354 } else {
2323 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2355 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2324 2356
2325 if (!ca) 2357 if (!ca)
2326 goto err_close; 2358 goto err_close;
2327 2359
2360 /* blkdev_put() will be called in bch_cache_release() */
2328 if (register_cache(sb, sb_page, bdev, ca) != 0) 2361 if (register_cache(sb, sb_page, bdev, ca) != 0)
2329 goto err; 2362 goto err;
2330 } 2363 }
2364quiet_out:
2365 ret = size;
2331out: 2366out:
2332 if (sb_page) 2367 if (sb_page)
2333 put_page(sb_page); 2368 put_page(sb_page);
@@ -2340,7 +2375,6 @@ err_close:
2340 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2375 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2341err: 2376err:
2342 pr_info("error %s: %s", path, err); 2377 pr_info("error %s: %s", path, err);
2343 ret = -EINVAL;
2344 goto out; 2378 goto out;
2345} 2379}
2346 2380
@@ -2370,10 +2404,19 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2370 list_for_each_entry_safe(dc, tdc, &uncached_devices, list) 2404 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2371 bcache_device_stop(&dc->disk); 2405 bcache_device_stop(&dc->disk);
2372 2406
2407 mutex_unlock(&bch_register_lock);
2408
2409 /*
2410 * Give an early chance for other kthreads and
2411 * kworkers to stop themselves
2412 */
2413 schedule();
2414
2373 /* What's a condition variable? */ 2415 /* What's a condition variable? */
2374 while (1) { 2416 while (1) {
2375 long timeout = start + 2 * HZ - jiffies; 2417 long timeout = start + 10 * HZ - jiffies;
2376 2418
2419 mutex_lock(&bch_register_lock);
2377 stopped = list_empty(&bch_cache_sets) && 2420 stopped = list_empty(&bch_cache_sets) &&
2378 list_empty(&uncached_devices); 2421 list_empty(&uncached_devices);
2379 2422
@@ -2385,7 +2428,6 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2385 2428
2386 mutex_unlock(&bch_register_lock); 2429 mutex_unlock(&bch_register_lock);
2387 schedule_timeout(timeout); 2430 schedule_timeout(timeout);
2388 mutex_lock(&bch_register_lock);
2389 } 2431 }
2390 2432
2391 finish_wait(&unregister_wait, &wait); 2433 finish_wait(&unregister_wait, &wait);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 17bae9c14ca0..6cd44d3cf906 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -996,8 +996,6 @@ SHOW(__bch_cache)
996 !cached[n - 1]) 996 !cached[n - 1])
997 --n; 997 --n;
998 998
999 unused = ca->sb.nbuckets - n;
1000
1001 while (cached < p + n && 999 while (cached < p + n &&
1002 *cached == BTREE_PRIO) 1000 *cached == BTREE_PRIO)
1003 cached++, n--; 1001 cached++, n--;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 00aab6abcfe4..1fbced94e4cc 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -560,17 +560,29 @@ static inline uint64_t bch_crc64_update(uint64_t crc,
560 return crc; 560 return crc;
561} 561}
562 562
563/* Does linear interpolation between powers of two */ 563/*
564 * A stepwise-linear pseudo-exponential. This returns 1 << (x >>
565 * frac_bits), with the less-significant bits filled in by linear
566 * interpolation.
567 *
568 * This can also be interpreted as a floating-point number format,
569 * where the low frac_bits are the mantissa (with implicit leading
570 * 1 bit), and the more significant bits are the exponent.
571 * The return value is 1.mantissa * 2^exponent.
572 *
573 * The way this is used, fract_bits is 6 and the largest possible
574 * input is CONGESTED_MAX-1 = 1023 (exponent 16, mantissa 0x1.fc),
575 * so the maximum output is 0x1fc00.
576 */
564static inline unsigned int fract_exp_two(unsigned int x, 577static inline unsigned int fract_exp_two(unsigned int x,
565 unsigned int fract_bits) 578 unsigned int fract_bits)
566{ 579{
567 unsigned int fract = x & ~(~0 << fract_bits); 580 unsigned int mantissa = 1 << fract_bits; /* Implicit bit */
568
569 x >>= fract_bits;
570 x = 1 << x;
571 x += (x * fract) >> fract_bits;
572 581
573 return x; 582 mantissa += x & (mantissa - 1);
583 x >>= fract_bits; /* The exponent */
584 /* Largest intermediate value 0x7f0000 */
585 return mantissa << x >> fract_bits;
574} 586}
575 587
576void bch_bio_map(struct bio *bio, void *base); 588void bch_bio_map(struct bio *bio, void *base);
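The rewritten fract_exp_two() computes the same stepwise-linear exponential as the old form; a worked example with fract_bits = 6, the value bch_get_congested() uses, shows the two agreeing:

/*
 * fract_exp_two(224, 6):
 *   old form: fract = 224 & 63 = 32;  x >>= 6 -> 3;  x = 1 << 3 = 8;
 *             x += (8 * 32) >> 6 = 4;  result 12
 *   new form: mantissa = 64 + (224 & 63) = 96;  exponent = 224 >> 6 = 3;
 *             96 << 3 >> 6 = 12;       result 12
 * 12 lies exactly halfway between 1 << 3 = 8 and 1 << 4 = 16, matching the
 * linear interpolation described above (fractional part 32/64 = 1/2).
 */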
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9faed1c92b52..7f6462f74ac8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1442,11 +1442,10 @@ out:
1442 1442
1443static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) 1443static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
1444{ 1444{
1445 unsigned int i;
1446 struct bio_vec *bv; 1445 struct bio_vec *bv;
1447 struct bvec_iter_all iter_all; 1446 struct bvec_iter_all iter_all;
1448 1447
1449 bio_for_each_segment_all(bv, clone, i, iter_all) { 1448 bio_for_each_segment_all(bv, clone, iter_all) {
1450 BUG_ON(!bv->bv_page); 1449 BUG_ON(!bv->bv_page);
1451 mempool_free(bv->bv_page, &cc->page_pool); 1450 mempool_free(bv->bv_page, &cc->page_pool);
1452 } 1451 }
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 12b5216c2cfe..721efc493942 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -135,9 +135,8 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
135/* 135/*
136	 * Functions to manipulate consecutive chunks				136	 * Functions to manipulate consecutive chunks
137 */ 137 */
138# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) 138#define DM_CHUNK_CONSECUTIVE_BITS 8
139# define DM_CHUNK_CONSECUTIVE_BITS 8 139#define DM_CHUNK_NUMBER_BITS 56
140# define DM_CHUNK_NUMBER_BITS 56
141 140
142static inline chunk_t dm_chunk_number(chunk_t chunk) 141static inline chunk_t dm_chunk_number(chunk_t chunk)
143{ 142{
@@ -163,29 +162,6 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
163 e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); 162 e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
164} 163}
165 164
166# else
167# define DM_CHUNK_CONSECUTIVE_BITS 0
168
169static inline chunk_t dm_chunk_number(chunk_t chunk)
170{
171 return chunk;
172}
173
174static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
175{
176 return 0;
177}
178
179static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
180{
181}
182
183static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
184{
185}
186
187# endif
188
189/* 165/*
190 * Return the number of sectors in the device. 166 * Return the number of sectors in the device.
191 */ 167 */
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 95ae4bf34203..c27c32cf4a30 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -88,14 +88,10 @@ struct journal_entry {
88 88
89#if BITS_PER_LONG == 64 89#if BITS_PER_LONG == 64
90#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) 90#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
91#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
92#elif defined(CONFIG_LBDAF)
93#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
94#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
95#else 91#else
96#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0) 92#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
97#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo)
98#endif 93#endif
94#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
99#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) 95#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1))
100#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) 96#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
101#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) 97#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2))
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 1cd4f991792c..3a62a46b75c7 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -490,10 +490,10 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
490 pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); 490 pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
491 pr_debug(" version: %d\n", le32_to_cpu(sb->version)); 491 pr_debug(" version: %d\n", le32_to_cpu(sb->version));
492 pr_debug(" uuid: %08x.%08x.%08x.%08x\n", 492 pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
493 le32_to_cpu(*(__u32 *)(sb->uuid+0)), 493 le32_to_cpu(*(__le32 *)(sb->uuid+0)),
494 le32_to_cpu(*(__u32 *)(sb->uuid+4)), 494 le32_to_cpu(*(__le32 *)(sb->uuid+4)),
495 le32_to_cpu(*(__u32 *)(sb->uuid+8)), 495 le32_to_cpu(*(__le32 *)(sb->uuid+8)),
496 le32_to_cpu(*(__u32 *)(sb->uuid+12))); 496 le32_to_cpu(*(__le32 *)(sb->uuid+12)));
497 pr_debug(" events: %llu\n", 497 pr_debug(" events: %llu\n",
498 (unsigned long long) le64_to_cpu(sb->events)); 498 (unsigned long long) le64_to_cpu(sb->events));
499 pr_debug("events cleared: %llu\n", 499 pr_debug("events cleared: %llu\n",
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 05ffffb8b769..45ffa23fa85d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -88,8 +88,7 @@ static struct kobj_type md_ktype;
88 88
89struct md_cluster_operations *md_cluster_ops; 89struct md_cluster_operations *md_cluster_ops;
90EXPORT_SYMBOL(md_cluster_ops); 90EXPORT_SYMBOL(md_cluster_ops);
91struct module *md_cluster_mod; 91static struct module *md_cluster_mod;
92EXPORT_SYMBOL(md_cluster_mod);
93 92
94static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 93static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
95static struct workqueue_struct *md_wq; 94static struct workqueue_struct *md_wq;
@@ -132,24 +131,6 @@ static inline int speed_max(struct mddev *mddev)
132 mddev->sync_speed_max : sysctl_speed_limit_max; 131 mddev->sync_speed_max : sysctl_speed_limit_max;
133} 132}
134 133
135static void * flush_info_alloc(gfp_t gfp_flags, void *data)
136{
137 return kzalloc(sizeof(struct flush_info), gfp_flags);
138}
139static void flush_info_free(void *flush_info, void *data)
140{
141 kfree(flush_info);
142}
143
144static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
145{
146 return kzalloc(sizeof(struct flush_bio), gfp_flags);
147}
148static void flush_bio_free(void *flush_bio, void *data)
149{
150 kfree(flush_bio);
151}
152
153static struct ctl_table_header *raid_table_header; 134static struct ctl_table_header *raid_table_header;
154 135
155static struct ctl_table raid_table[] = { 136static struct ctl_table raid_table[] = {
@@ -423,54 +404,31 @@ static int md_congested(void *data, int bits)
423/* 404/*
424 * Generic flush handling for md 405 * Generic flush handling for md
425 */ 406 */
426static void submit_flushes(struct work_struct *ws)
427{
428 struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
429 struct mddev *mddev = fi->mddev;
430 struct bio *bio = fi->bio;
431
432 bio->bi_opf &= ~REQ_PREFLUSH;
433 md_handle_request(mddev, bio);
434
435 mempool_free(fi, mddev->flush_pool);
436}
437 407
438static void md_end_flush(struct bio *fbio) 408static void md_end_flush(struct bio *bio)
439{ 409{
440 struct flush_bio *fb = fbio->bi_private; 410 struct md_rdev *rdev = bio->bi_private;
441 struct md_rdev *rdev = fb->rdev; 411 struct mddev *mddev = rdev->mddev;
442 struct flush_info *fi = fb->fi;
443 struct bio *bio = fi->bio;
444 struct mddev *mddev = fi->mddev;
445 412
446 rdev_dec_pending(rdev, mddev); 413 rdev_dec_pending(rdev, mddev);
447 414
448 if (atomic_dec_and_test(&fi->flush_pending)) { 415 if (atomic_dec_and_test(&mddev->flush_pending)) {
449 if (bio->bi_iter.bi_size == 0) { 416 /* The pre-request flush has finished */
450 /* an empty barrier - all done */ 417 queue_work(md_wq, &mddev->flush_work);
451 bio_endio(bio);
452 mempool_free(fi, mddev->flush_pool);
453 } else {
454 INIT_WORK(&fi->flush_work, submit_flushes);
455 queue_work(md_wq, &fi->flush_work);
456 }
457 } 418 }
458 419 bio_put(bio);
459 mempool_free(fb, mddev->flush_bio_pool);
460 bio_put(fbio);
461} 420}
462 421
463void md_flush_request(struct mddev *mddev, struct bio *bio) 422static void md_submit_flush_data(struct work_struct *ws);
423
424static void submit_flushes(struct work_struct *ws)
464{ 425{
426 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
465 struct md_rdev *rdev; 427 struct md_rdev *rdev;
466 struct flush_info *fi;
467
468 fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
469
470 fi->bio = bio;
471 fi->mddev = mddev;
472 atomic_set(&fi->flush_pending, 1);
473 428
429 mddev->start_flush = ktime_get_boottime();
430 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
431 atomic_set(&mddev->flush_pending, 1);
474 rcu_read_lock(); 432 rcu_read_lock();
475 rdev_for_each_rcu(rdev, mddev) 433 rdev_for_each_rcu(rdev, mddev)
476 if (rdev->raid_disk >= 0 && 434 if (rdev->raid_disk >= 0 &&
@@ -480,37 +438,74 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
480 * we reclaim rcu_read_lock 438 * we reclaim rcu_read_lock
481 */ 439 */
482 struct bio *bi; 440 struct bio *bi;
483 struct flush_bio *fb;
484 atomic_inc(&rdev->nr_pending); 441 atomic_inc(&rdev->nr_pending);
485 atomic_inc(&rdev->nr_pending); 442 atomic_inc(&rdev->nr_pending);
486 rcu_read_unlock(); 443 rcu_read_unlock();
487
488 fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
489 fb->fi = fi;
490 fb->rdev = rdev;
491
492 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 444 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
493 bio_set_dev(bi, rdev->bdev);
494 bi->bi_end_io = md_end_flush; 445 bi->bi_end_io = md_end_flush;
495 bi->bi_private = fb; 446 bi->bi_private = rdev;
447 bio_set_dev(bi, rdev->bdev);
496 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 448 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
497 449 atomic_inc(&mddev->flush_pending);
498 atomic_inc(&fi->flush_pending);
499 submit_bio(bi); 450 submit_bio(bi);
500
501 rcu_read_lock(); 451 rcu_read_lock();
502 rdev_dec_pending(rdev, mddev); 452 rdev_dec_pending(rdev, mddev);
503 } 453 }
504 rcu_read_unlock(); 454 rcu_read_unlock();
455 if (atomic_dec_and_test(&mddev->flush_pending))
456 queue_work(md_wq, &mddev->flush_work);
457}
458
459static void md_submit_flush_data(struct work_struct *ws)
460{
461 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
462 struct bio *bio = mddev->flush_bio;
463
464 /*
465	 * flush_bio must be reset before calling into md_handle_request, or we
466	 * can deadlock: bios that already passed the suspend check in
467	 * md_handle_request may be waiting on this flush, while the call to
468	 * md_handle_request below would in turn wait on those bios.
469 */
470 mddev->last_flush = mddev->start_flush;
471 mddev->flush_bio = NULL;
472 wake_up(&mddev->sb_wait);
473
474 if (bio->bi_iter.bi_size == 0) {
475 /* an empty barrier - all done */
476 bio_endio(bio);
477 } else {
478 bio->bi_opf &= ~REQ_PREFLUSH;
479 md_handle_request(mddev, bio);
480 }
481}
505 482
506 if (atomic_dec_and_test(&fi->flush_pending)) { 483void md_flush_request(struct mddev *mddev, struct bio *bio)
507 if (bio->bi_iter.bi_size == 0) { 484{
485 ktime_t start = ktime_get_boottime();
486 spin_lock_irq(&mddev->lock);
487 wait_event_lock_irq(mddev->sb_wait,
488 !mddev->flush_bio ||
489 ktime_after(mddev->last_flush, start),
490 mddev->lock);
491 if (!ktime_after(mddev->last_flush, start)) {
492 WARN_ON(mddev->flush_bio);
493 mddev->flush_bio = bio;
494 bio = NULL;
495 }
496 spin_unlock_irq(&mddev->lock);
497
498 if (!bio) {
499 INIT_WORK(&mddev->flush_work, submit_flushes);
500 queue_work(md_wq, &mddev->flush_work);
501 } else {
502 /* flush was performed for some other bio while we waited. */
503 if (bio->bi_iter.bi_size == 0)
508 /* an empty barrier - all done */ 504 /* an empty barrier - all done */
509 bio_endio(bio); 505 bio_endio(bio);
510 mempool_free(fi, mddev->flush_pool); 506 else {
511 } else { 507 bio->bi_opf &= ~REQ_PREFLUSH;
512 INIT_WORK(&fi->flush_work, submit_flushes); 508 mddev->pers->make_request(mddev, bio);
513 queue_work(md_wq, &fi->flush_work);
514 } 509 }
515 } 510 }
516} 511}
@@ -560,6 +555,7 @@ void mddev_init(struct mddev *mddev)
560 atomic_set(&mddev->openers, 0); 555 atomic_set(&mddev->openers, 0);
561 atomic_set(&mddev->active_io, 0); 556 atomic_set(&mddev->active_io, 0);
562 spin_lock_init(&mddev->lock); 557 spin_lock_init(&mddev->lock);
558 atomic_set(&mddev->flush_pending, 0);
563 init_waitqueue_head(&mddev->sb_wait); 559 init_waitqueue_head(&mddev->sb_wait);
564 init_waitqueue_head(&mddev->recovery_wait); 560 init_waitqueue_head(&mddev->recovery_wait);
565 mddev->reshape_position = MaxSector; 561 mddev->reshape_position = MaxSector;
@@ -1109,8 +1105,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
1109 * (not needed for Linear and RAID0 as metadata doesn't 1105 * (not needed for Linear and RAID0 as metadata doesn't
1110 * record this size) 1106 * record this size)
1111 */ 1107 */
1112 if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && 1108 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1113 sb->level >= 1)
1114 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1109 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1115 1110
1116 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1111 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
@@ -1408,8 +1403,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1408 /* Limit to 4TB as metadata cannot record more than that. 1403 /* Limit to 4TB as metadata cannot record more than that.
1409 * 4TB == 2^32 KB, or 2*2^32 sectors. 1404 * 4TB == 2^32 KB, or 2*2^32 sectors.
1410 */ 1405 */
1411 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && 1406 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1412 rdev->mddev->level >= 1)
1413 num_sectors = (sector_t)(2ULL << 32) - 2; 1407 num_sectors = (sector_t)(2ULL << 32) - 2;
1414 do { 1408 do {
1415 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1409 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -1553,7 +1547,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1553 */ 1547 */
1554 s32 offset; 1548 s32 offset;
1555 sector_t bb_sector; 1549 sector_t bb_sector;
1556 u64 *bbp; 1550 __le64 *bbp;
1557 int i; 1551 int i;
1558 int sectors = le16_to_cpu(sb->bblog_size); 1552 int sectors = le16_to_cpu(sb->bblog_size);
1559 if (sectors > (PAGE_SIZE / 512)) 1553 if (sectors > (PAGE_SIZE / 512))
@@ -1565,7 +1559,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
1565 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1559 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1566 rdev->bb_page, REQ_OP_READ, 0, true)) 1560 rdev->bb_page, REQ_OP_READ, 0, true))
1567 return -EIO; 1561 return -EIO;
1568 bbp = (u64 *)page_address(rdev->bb_page); 1562 bbp = (__le64 *)page_address(rdev->bb_page);
1569 rdev->badblocks.shift = sb->bblog_shift; 1563 rdev->badblocks.shift = sb->bblog_shift;
1570 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1564 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1571 u64 bb = le64_to_cpu(*bbp); 1565 u64 bb = le64_to_cpu(*bbp);
@@ -1877,7 +1871,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1877 md_error(mddev, rdev); 1871 md_error(mddev, rdev);
1878 else { 1872 else {
1879 struct badblocks *bb = &rdev->badblocks; 1873 struct badblocks *bb = &rdev->badblocks;
1880 u64 *bbp = (u64 *)page_address(rdev->bb_page); 1874 __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
1881 u64 *p = bb->page; 1875 u64 *p = bb->page;
1882 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 1876 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1883 if (bb->changed) { 1877 if (bb->changed) {
@@ -2855,8 +2849,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
2855 err = 0; 2849 err = 0;
2856 } 2850 }
2857 } else if (cmd_match(buf, "re-add")) { 2851 } else if (cmd_match(buf, "re-add")) {
2858 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 2852 if (!rdev->mddev->pers)
2859 rdev->saved_raid_disk >= 0) { 2853 err = -EINVAL;
2854 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
2855 rdev->saved_raid_disk >= 0) {
2860 /* clear_bit is performed _after_ all the devices 2856 /* clear_bit is performed _after_ all the devices
2861 * have their local Faulty bit cleared. If any writes 2857 * have their local Faulty bit cleared. If any writes
2862 * happen in the meantime in the local node, they 2858 * happen in the meantime in the local node, they
@@ -3384,10 +3380,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3384 return -EIO; 3380 return -EIO;
3385 if (!capable(CAP_SYS_ADMIN)) 3381 if (!capable(CAP_SYS_ADMIN))
3386 return -EACCES; 3382 return -EACCES;
3387 rv = mddev ? mddev_lock(mddev): -EBUSY; 3383 rv = mddev ? mddev_lock(mddev) : -ENODEV;
3388 if (!rv) { 3384 if (!rv) {
3389 if (rdev->mddev == NULL) 3385 if (rdev->mddev == NULL)
3390 rv = -EBUSY; 3386 rv = -ENODEV;
3391 else 3387 else
3392 rv = entry->store(rdev, page, length); 3388 rv = entry->store(rdev, page, length);
3393 mddev_unlock(mddev); 3389 mddev_unlock(mddev);
@@ -5511,22 +5507,6 @@ int md_run(struct mddev *mddev)
5511 if (err) 5507 if (err)
5512 return err; 5508 return err;
5513 } 5509 }
5514 if (mddev->flush_pool == NULL) {
5515 mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
5516 flush_info_free, mddev);
5517 if (!mddev->flush_pool) {
5518 err = -ENOMEM;
5519 goto abort;
5520 }
5521 }
5522 if (mddev->flush_bio_pool == NULL) {
5523 mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
5524 flush_bio_free, mddev);
5525 if (!mddev->flush_bio_pool) {
5526 err = -ENOMEM;
5527 goto abort;
5528 }
5529 }
5530 5510
5531 spin_lock(&pers_lock); 5511 spin_lock(&pers_lock);
5532 pers = find_pers(mddev->level, mddev->clevel); 5512 pers = find_pers(mddev->level, mddev->clevel);
@@ -5686,11 +5666,8 @@ int md_run(struct mddev *mddev)
5686 return 0; 5666 return 0;
5687 5667
5688abort: 5668abort:
5689 mempool_destroy(mddev->flush_bio_pool); 5669 bioset_exit(&mddev->bio_set);
5690 mddev->flush_bio_pool = NULL; 5670 bioset_exit(&mddev->sync_set);
5691 mempool_destroy(mddev->flush_pool);
5692 mddev->flush_pool = NULL;
5693
5694 return err; 5671 return err;
5695} 5672}
5696EXPORT_SYMBOL_GPL(md_run); 5673EXPORT_SYMBOL_GPL(md_run);
@@ -5894,14 +5871,6 @@ static void __md_stop(struct mddev *mddev)
5894 mddev->to_remove = &md_redundancy_group; 5871 mddev->to_remove = &md_redundancy_group;
5895 module_put(pers->owner); 5872 module_put(pers->owner);
5896 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5873 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5897 if (mddev->flush_bio_pool) {
5898 mempool_destroy(mddev->flush_bio_pool);
5899 mddev->flush_bio_pool = NULL;
5900 }
5901 if (mddev->flush_pool) {
5902 mempool_destroy(mddev->flush_pool);
5903 mddev->flush_pool = NULL;
5904 }
5905} 5874}
5906 5875
5907void md_stop(struct mddev *mddev) 5876void md_stop(struct mddev *mddev)
@@ -9257,7 +9226,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9257 * reshape is happening in the remote node, we need to 9226 * reshape is happening in the remote node, we need to
9258 * update reshape_position and call start_reshape. 9227 * update reshape_position and call start_reshape.
9259 */ 9228 */
9260 mddev->reshape_position = sb->reshape_position; 9229 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
9261 if (mddev->pers->update_reshape_pos) 9230 if (mddev->pers->update_reshape_pos)
9262 mddev->pers->update_reshape_pos(mddev); 9231 mddev->pers->update_reshape_pos(mddev);
9263 if (mddev->pers->start_reshape) 9232 if (mddev->pers->start_reshape)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c52afb52c776..257cb4c9e22b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,19 +252,6 @@ enum mddev_sb_flags {
252 MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ 252 MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
253}; 253};
254 254
255#define NR_FLUSH_INFOS 8
256#define NR_FLUSH_BIOS 64
257struct flush_info {
258 struct bio *bio;
259 struct mddev *mddev;
260 struct work_struct flush_work;
261 atomic_t flush_pending;
262};
263struct flush_bio {
264 struct flush_info *fi;
265 struct md_rdev *rdev;
266};
267
268struct mddev { 255struct mddev {
269 void *private; 256 void *private;
270 struct md_personality *pers; 257 struct md_personality *pers;
@@ -470,8 +457,16 @@ struct mddev {
470 * metadata and bitmap writes 457 * metadata and bitmap writes
471 */ 458 */
472 459
473 mempool_t *flush_pool; 460 /* Generic flush handling.
474 mempool_t *flush_bio_pool; 461 * The last to finish preflush schedules a worker to submit
462 * the rest of the request (without the REQ_PREFLUSH flag).
463 */
464 struct bio *flush_bio;
465 atomic_t flush_pending;
466 ktime_t start_flush, last_flush; /* last_flush is when the last completed
467 * flush was started.
468 */
469 struct work_struct flush_work;
475 struct work_struct event_work; /* used by dm to report failure event */ 470 struct work_struct event_work; /* used by dm to report failure event */
476 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 471 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
477 struct md_cluster_info *cluster_info; 472 struct md_cluster_info *cluster_info;
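The new mddev fields above replace the per-mddev flush mempools with a single shared flush. The decision the reworked md_flush_request() makes reduces to a timestamp comparison: if a flush that started after this request arrived has already completed, the preflush is already covered and the payload can be submitted directly; otherwise this bio becomes the active flush and submit_flushes() fans a preflush out to every rdev. The sketch below is an illustration only, written as if it sat in drivers/md/md.c next to the code above; the sb_wait handshake, locking, and the in-flight case are omitted, and the field and function names follow the diff.

/* Illustration only: simplified flow of the reworked md_flush_request(). */
void md_flush_request_sketch(struct mddev *mddev, struct bio *bio)
{
	ktime_t start = ktime_get_boottime();

	/*
	 * last_flush records when the most recently *completed* flush was
	 * started.  If that flush began after this request arrived, it
	 * already covers everything this request needs flushed.
	 */
	if (ktime_after(mddev->last_flush, start)) {
		if (bio->bi_iter.bi_size == 0) {
			/* an empty barrier - all done */
			bio_endio(bio);
		} else {
			/* preflush already covered; submit the payload */
			bio->bi_opf &= ~REQ_PREFLUSH;
			mddev->pers->make_request(mddev, bio);
		}
		return;
	}

	/*
	 * Become the active flush: submit_flushes() sends a preflush to
	 * every rdev, and the last completion queues md_submit_flush_data()
	 * to finish the original bio.
	 */
	mddev->flush_bio = bio;
	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}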
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fdf451aac369..0c8a098d220e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2110,7 +2110,7 @@ static void process_checks(struct r1bio *r1_bio)
2110 } 2110 }
2111 r1_bio->read_disk = primary; 2111 r1_bio->read_disk = primary;
2112 for (i = 0; i < conf->raid_disks * 2; i++) { 2112 for (i = 0; i < conf->raid_disks * 2; i++) {
2113 int j; 2113 int j = 0;
2114 struct bio *pbio = r1_bio->bios[primary]; 2114 struct bio *pbio = r1_bio->bios[primary];
2115 struct bio *sbio = r1_bio->bios[i]; 2115 struct bio *sbio = r1_bio->bios[i];
2116 blk_status_t status = sbio->bi_status; 2116 blk_status_t status = sbio->bi_status;
@@ -2125,8 +2125,8 @@ static void process_checks(struct r1bio *r1_bio)
2125 /* Now we can 'fixup' the error value */ 2125 /* Now we can 'fixup' the error value */
2126 sbio->bi_status = 0; 2126 sbio->bi_status = 0;
2127 2127
2128 bio_for_each_segment_all(bi, sbio, j, iter_all) 2128 bio_for_each_segment_all(bi, sbio, iter_all)
2129 page_len[j] = bi->bv_len; 2129 page_len[j++] = bi->bv_len;
2130 2130
2131 if (!status) { 2131 if (!status) {
2132 for (j = vcnt; j-- ; ) { 2132 for (j = vcnt; j-- ; ) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c033bfcb209e..7fde645d2e90 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -711,6 +711,8 @@ static bool is_full_stripe_write(struct stripe_head *sh)
711} 711}
712 712
713static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 713static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
714 __acquires(&sh1->stripe_lock)
715 __acquires(&sh2->stripe_lock)
714{ 716{
715 if (sh1 > sh2) { 717 if (sh1 > sh2) {
716 spin_lock_irq(&sh2->stripe_lock); 718 spin_lock_irq(&sh2->stripe_lock);
@@ -722,6 +724,8 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
722} 724}
723 725
724static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 726static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
727 __releases(&sh1->stripe_lock)
728 __releases(&sh2->stripe_lock)
725{ 729{
726 spin_unlock(&sh1->stripe_lock); 730 spin_unlock(&sh1->stripe_lock);
727 spin_unlock_irq(&sh2->stripe_lock); 731 spin_unlock_irq(&sh2->stripe_lock);
@@ -4187,7 +4191,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4187 /* now write out any block on a failed drive, 4191 /* now write out any block on a failed drive,
4188 * or P or Q if they were recomputed 4192 * or P or Q if they were recomputed
4189 */ 4193 */
4190 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4194 dev = NULL;
4191 if (s->failed == 2) { 4195 if (s->failed == 2) {
4192 dev = &sh->dev[s->failed_num[1]]; 4196 dev = &sh->dev[s->failed_num[1]];
4193 s->locked++; 4197 s->locked++;
@@ -4212,6 +4216,14 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
4212 set_bit(R5_LOCKED, &dev->flags); 4216 set_bit(R5_LOCKED, &dev->flags);
4213 set_bit(R5_Wantwrite, &dev->flags); 4217 set_bit(R5_Wantwrite, &dev->flags);
4214 } 4218 }
4219 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
4220 "%s: disk%td not up to date\n",
4221 mdname(conf->mddev),
4222 dev - (struct r5dev *) &sh->dev)) {
4223 clear_bit(R5_LOCKED, &dev->flags);
4224 clear_bit(R5_Wantwrite, &dev->flags);
4225 s->locked--;
4226 }
4215 clear_bit(STRIPE_DEGRADED, &sh->state); 4227 clear_bit(STRIPE_DEGRADED, &sh->state);
4216 4228
4217 set_bit(STRIPE_INSYNC, &sh->state); 4229 set_bit(STRIPE_INSYNC, &sh->state);
@@ -6166,6 +6178,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
6166static int handle_active_stripes(struct r5conf *conf, int group, 6178static int handle_active_stripes(struct r5conf *conf, int group,
6167 struct r5worker *worker, 6179 struct r5worker *worker,
6168 struct list_head *temp_inactive_list) 6180 struct list_head *temp_inactive_list)
6181 __releases(&conf->device_lock)
6182 __acquires(&conf->device_lock)
6169{ 6183{
6170 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6184 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
6171 int i, batch_size = 0, hash; 6185 int i, batch_size = 0, hash;
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index d271bd731af7..01f40672507f 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -391,7 +391,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
391 bb_present = badblocks_check(&nd_region->bb, meta_start, 391 bb_present = badblocks_check(&nd_region->bb, meta_start,
392 meta_num, &first_bad, &num_bad); 392 meta_num, &first_bad, &num_bad);
393 if (bb_present) { 393 if (bb_present) {
394 dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %lx\n", 394 dev_dbg(&nd_pfn->dev, "meta: %x badblocks at %llx\n",
395 num_bad, first_bad); 395 num_bad, first_bad);
396 nsoff = ALIGN_DOWN((nd_region->ndr_start 396 nsoff = ALIGN_DOWN((nd_region->ndr_start
397 + (first_bad << 9)) - nsio->res.start, 397 + (first_bad << 9)) - nsio->res.start,
@@ -410,7 +410,7 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
410 } 410 }
411 if (rc) { 411 if (rc) {
412 dev_err(&nd_pfn->dev, 412 dev_err(&nd_pfn->dev,
413 "error clearing %x badblocks at %lx\n", 413 "error clearing %x badblocks at %llx\n",
414 num_bad, first_bad); 414 num_bad, first_bad);
415 return rc; 415 return rc;
416 } 416 }
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6265d9225ec8..a6644a2c3ef7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1105,7 +1105,7 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
1105 1105
1106 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); 1106 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1107 if (error) { 1107 if (error) {
1108 dev_warn(ctrl->device, "Identify namespace failed\n"); 1108 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1109 kfree(id); 1109 kfree(id);
1110 return NULL; 1110 return NULL;
1111 } 1111 }
@@ -1588,9 +1588,13 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1588static void nvme_update_disk_info(struct gendisk *disk, 1588static void nvme_update_disk_info(struct gendisk *disk,
1589 struct nvme_ns *ns, struct nvme_id_ns *id) 1589 struct nvme_ns *ns, struct nvme_id_ns *id)
1590{ 1590{
1591 sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); 1591 sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
1592 unsigned short bs = 1 << ns->lba_shift; 1592 unsigned short bs = 1 << ns->lba_shift;
1593 1593
1594 if (ns->lba_shift > PAGE_SHIFT) {
1595 /* unsupported block size, set capacity to 0 later */
1596 bs = (1 << 9);
1597 }
1594 blk_mq_freeze_queue(disk->queue); 1598 blk_mq_freeze_queue(disk->queue);
1595 blk_integrity_unregister(disk); 1599 blk_integrity_unregister(disk);
1596 1600
@@ -1601,7 +1605,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
1601 if (ns->ms && !ns->ext && 1605 if (ns->ms && !ns->ext &&
1602 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) 1606 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1603 nvme_init_integrity(disk, ns->ms, ns->pi_type); 1607 nvme_init_integrity(disk, ns->ms, ns->pi_type);
1604 if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) 1608 if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) ||
1609 ns->lba_shift > PAGE_SHIFT)
1605 capacity = 0; 1610 capacity = 0;
1606 1611
1607 set_capacity(disk, capacity); 1612 set_capacity(disk, capacity);
@@ -2549,7 +2554,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
2549 ctrl->crdt[2] = le16_to_cpu(id->crdt3); 2554 ctrl->crdt[2] = le16_to_cpu(id->crdt3);
2550 2555
2551 ctrl->oacs = le16_to_cpu(id->oacs); 2556 ctrl->oacs = le16_to_cpu(id->oacs);
2552 ctrl->oncs = le16_to_cpup(&id->oncs); 2557 ctrl->oncs = le16_to_cpu(id->oncs);
2553 ctrl->oaes = le32_to_cpu(id->oaes); 2558 ctrl->oaes = le32_to_cpu(id->oaes);
2554 atomic_set(&ctrl->abort_limit, id->acl + 1); 2559 atomic_set(&ctrl->abort_limit, id->acl + 1);
2555 ctrl->vwc = id->vwc; 2560 ctrl->vwc = id->vwc;
@@ -3874,10 +3879,37 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
3874} 3879}
3875EXPORT_SYMBOL_GPL(nvme_start_queues); 3880EXPORT_SYMBOL_GPL(nvme_start_queues);
3876 3881
3877int __init nvme_core_init(void) 3882/*
3883 * Check we didn't inadvertently grow the command structure sizes:
3884 */
3885static inline void _nvme_check_size(void)
3886{
3887 BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
3888 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
3889 BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
3890 BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
3891 BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
3892 BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
3893 BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
3894 BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
3895 BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
3896 BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
3897 BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
3898 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
3899 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
3900 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
3901 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
3902 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
3903 BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
3904}
3905
3906
3907static int __init nvme_core_init(void)
3878{ 3908{
3879 int result = -ENOMEM; 3909 int result = -ENOMEM;
3880 3910
3911 _nvme_check_size();
3912
3881 nvme_wq = alloc_workqueue("nvme-wq", 3913 nvme_wq = alloc_workqueue("nvme-wq",
3882 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 3914 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3883 if (!nvme_wq) 3915 if (!nvme_wq)
@@ -3924,7 +3956,7 @@ out:
3924 return result; 3956 return result;
3925} 3957}
3926 3958
3927void __exit nvme_core_exit(void) 3959static void __exit nvme_core_exit(void)
3928{ 3960{
3929 ida_destroy(&nvme_subsystems_ida); 3961 ida_destroy(&nvme_subsystems_ida);
3930 class_destroy(nvme_subsys_class); 3962 class_destroy(nvme_subsys_class);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index d4cb826f58ff..592d1e61ef7e 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -1188,6 +1188,7 @@ static void __exit nvmf_exit(void)
1188 class_destroy(nvmf_class); 1188 class_destroy(nvmf_class);
1189 nvmf_host_put(nvmf_default_host); 1189 nvmf_host_put(nvmf_default_host);
1190 1190
1191 BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
1191 BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64); 1192 BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
1192 BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64); 1193 BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
1193 BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64); 1194 BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f0716f6ce41f..5c9429d41120 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -232,6 +232,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
232 blk_qc_t ret = BLK_QC_T_NONE; 232 blk_qc_t ret = BLK_QC_T_NONE;
233 int srcu_idx; 233 int srcu_idx;
234 234
235 /*
236 * The namespace might be going away and the bio might
237 * be moved to a different queue via blk_steal_bios(),
238 * so we need to use the bio_split pool from the original
239 * queue to allocate the bvecs from.
240 */
241 blk_queue_split(q, &bio);
242
235 srcu_idx = srcu_read_lock(&head->srcu); 243 srcu_idx = srcu_read_lock(&head->srcu);
236 ns = nvme_find_path(head); 244 ns = nvme_find_path(head);
237 if (likely(ns)) { 245 if (likely(ns)) {
@@ -421,7 +429,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
421 unsigned *nr_change_groups = data; 429 unsigned *nr_change_groups = data;
422 struct nvme_ns *ns; 430 struct nvme_ns *ns;
423 431
424 dev_info(ctrl->device, "ANA group %d: %s.\n", 432 dev_dbg(ctrl->device, "ANA group %d: %s.\n",
425 le32_to_cpu(desc->grpid), 433 le32_to_cpu(desc->grpid),
426 nvme_ana_state_names[desc->state]); 434 nvme_ana_state_names[desc->state]);
427 435
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 527d64545023..5ee75b5ff83f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -577,7 +577,4 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
577 return dev_to_disk(dev)->private_data; 577 return dev_to_disk(dev)->private_data;
578} 578}
579 579
580int __init nvme_core_init(void);
581void __exit nvme_core_exit(void);
582
583#endif /* _NVME_H */ 580#endif /* _NVME_H */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a90cf5d63aac..3e4fb891a95a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -146,7 +146,7 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
146 146
147static int queue_count_set(const char *val, const struct kernel_param *kp) 147static int queue_count_set(const char *val, const struct kernel_param *kp)
148{ 148{
149 int n = 0, ret; 149 int n, ret;
150 150
151 ret = kstrtoint(val, 10, &n); 151 ret = kstrtoint(val, 10, &n);
152 if (ret) 152 if (ret)
@@ -177,7 +177,6 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
177 * commands and one for I/O commands). 177 * commands and one for I/O commands).
178 */ 178 */
179struct nvme_queue { 179struct nvme_queue {
180 struct device *q_dmadev;
181 struct nvme_dev *dev; 180 struct nvme_dev *dev;
182 spinlock_t sq_lock; 181 spinlock_t sq_lock;
183 struct nvme_command *sq_cmds; 182 struct nvme_command *sq_cmds;
@@ -189,7 +188,7 @@ struct nvme_queue {
189 dma_addr_t cq_dma_addr; 188 dma_addr_t cq_dma_addr;
190 u32 __iomem *q_db; 189 u32 __iomem *q_db;
191 u16 q_depth; 190 u16 q_depth;
192 s16 cq_vector; 191 u16 cq_vector;
193 u16 sq_tail; 192 u16 sq_tail;
194 u16 last_sq_tail; 193 u16 last_sq_tail;
195 u16 cq_head; 194 u16 cq_head;
@@ -200,6 +199,7 @@ struct nvme_queue {
200#define NVMEQ_ENABLED 0 199#define NVMEQ_ENABLED 0
201#define NVMEQ_SQ_CMB 1 200#define NVMEQ_SQ_CMB 1
202#define NVMEQ_DELETE_ERROR 2 201#define NVMEQ_DELETE_ERROR 2
202#define NVMEQ_POLLED 3
203 u32 *dbbuf_sq_db; 203 u32 *dbbuf_sq_db;
204 u32 *dbbuf_cq_db; 204 u32 *dbbuf_cq_db;
205 u32 *dbbuf_sq_ei; 205 u32 *dbbuf_sq_ei;
@@ -208,10 +208,10 @@ struct nvme_queue {
208}; 208};
209 209
210/* 210/*
211 * The nvme_iod describes the data in an I/O, including the list of PRP 211 * The nvme_iod describes the data in an I/O.
212 * entries. You can't see it in this data structure because C doesn't let 212 *
213 * me express that. Use nvme_init_iod to ensure there's enough space 213 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
214 * allocated to store the PRP list. 214 * to the actual struct scatterlist.
215 */ 215 */
216struct nvme_iod { 216struct nvme_iod {
217 struct nvme_request req; 217 struct nvme_request req;
@@ -220,33 +220,12 @@ struct nvme_iod {
220 int aborted; 220 int aborted;
221 int npages; /* In the PRP list. 0 means small pool in use */ 221 int npages; /* In the PRP list. 0 means small pool in use */
222 int nents; /* Used in scatterlist */ 222 int nents; /* Used in scatterlist */
223 int length; /* Of data, in bytes */
224 dma_addr_t first_dma; 223 dma_addr_t first_dma;
225 struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ 224 unsigned int dma_len; /* length of single DMA segment mapping */
225 dma_addr_t meta_dma;
226 struct scatterlist *sg; 226 struct scatterlist *sg;
227 struct scatterlist inline_sg[0];
228}; 227};
229 228
230/*
231 * Check we didn't inadvertently grow the command struct
232 */
233static inline void _nvme_check_size(void)
234{
235 BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
236 BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
237 BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
238 BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
239 BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
240 BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
241 BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
242 BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
243 BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
244 BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
245 BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
246 BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
247 BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
248}
249
250static unsigned int max_io_queues(void) 229static unsigned int max_io_queues(void)
251{ 230{
252 return num_possible_cpus() + write_queues + poll_queues; 231 return num_possible_cpus() + write_queues + poll_queues;
@@ -372,12 +351,6 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
372} 351}
373 352
374/* 353/*
375 * Max size of iod being embedded in the request payload
376 */
377#define NVME_INT_PAGES 2
378#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size)
379
380/*
381 * Will slightly overestimate the number of pages needed. This is OK 354 * Will slightly overestimate the number of pages needed. This is OK
382 * as it only leads to a small amount of wasted memory for the lifetime of 355 * as it only leads to a small amount of wasted memory for the lifetime of
383 * the I/O. 356 * the I/O.
@@ -411,15 +384,6 @@ static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
411 return alloc_size + sizeof(struct scatterlist) * nseg; 384 return alloc_size + sizeof(struct scatterlist) * nseg;
412} 385}
413 386
414static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
415{
416 unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
417 NVME_INT_BYTES(dev), NVME_INT_PAGES,
418 use_sgl);
419
420 return sizeof(struct nvme_iod) + alloc_size;
421}
422
423static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 387static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
424 unsigned int hctx_idx) 388 unsigned int hctx_idx)
425{ 389{
@@ -584,37 +548,26 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
584 return true; 548 return true;
585} 549}
586 550
587static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) 551static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
588{ 552{
589 struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); 553 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
590 int nseg = blk_rq_nr_phys_segments(rq); 554 enum dma_data_direction dma_dir = rq_data_dir(req) ?
591 unsigned int size = blk_rq_payload_bytes(rq); 555 DMA_TO_DEVICE : DMA_FROM_DEVICE;
592 556 const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
593 iod->use_sgl = nvme_pci_use_sgls(dev, rq); 557 dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
558 int i;
594 559
595 if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { 560 if (iod->dma_len) {
596 iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); 561 dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
597 if (!iod->sg) 562 return;
598 return BLK_STS_RESOURCE;
599 } else {
600 iod->sg = iod->inline_sg;
601 } 563 }
602 564
603 iod->aborted = 0; 565 WARN_ON_ONCE(!iod->nents);
604 iod->npages = -1;
605 iod->nents = 0;
606 iod->length = size;
607
608 return BLK_STS_OK;
609}
610 566
611static void nvme_free_iod(struct nvme_dev *dev, struct request *req) 567 /* P2PDMA requests do not need to be unmapped */
612{ 568 if (!is_pci_p2pdma_page(sg_page(iod->sg)))
613 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 569 dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
614 const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
615 dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
616 570
617 int i;
618 571
619 if (iod->npages == 0) 572 if (iod->npages == 0)
620 dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], 573 dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
@@ -638,8 +591,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
638 dma_addr = next_dma_addr; 591 dma_addr = next_dma_addr;
639 } 592 }
640 593
641 if (iod->sg != iod->inline_sg) 594 mempool_free(iod->sg, dev->iod_mempool);
642 mempool_free(iod->sg, dev->iod_mempool);
643} 595}
644 596
645static void nvme_print_sgl(struct scatterlist *sgl, int nents) 597static void nvme_print_sgl(struct scatterlist *sgl, int nents)
@@ -829,80 +781,104 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
829 return BLK_STS_OK; 781 return BLK_STS_OK;
830} 782}
831 783
784static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
785 struct request *req, struct nvme_rw_command *cmnd,
786 struct bio_vec *bv)
787{
788 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
789 unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset;
790
791 iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
792 if (dma_mapping_error(dev->dev, iod->first_dma))
793 return BLK_STS_RESOURCE;
794 iod->dma_len = bv->bv_len;
795
796 cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
797 if (bv->bv_len > first_prp_len)
798 cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
799 return 0;
800}
801
802static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
803 struct request *req, struct nvme_rw_command *cmnd,
804 struct bio_vec *bv)
805{
806 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
807
808 iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
809 if (dma_mapping_error(dev->dev, iod->first_dma))
810 return BLK_STS_RESOURCE;
811 iod->dma_len = bv->bv_len;
812
813 cmnd->flags = NVME_CMD_SGL_METABUF;
814 cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
815 cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
816 cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
817 return 0;
818}
819
832static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, 820static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
833 struct nvme_command *cmnd) 821 struct nvme_command *cmnd)
834{ 822{
835 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 823 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
836 struct request_queue *q = req->q; 824 blk_status_t ret = BLK_STS_RESOURCE;
837 enum dma_data_direction dma_dir = rq_data_dir(req) ?
838 DMA_TO_DEVICE : DMA_FROM_DEVICE;
839 blk_status_t ret = BLK_STS_IOERR;
840 int nr_mapped; 825 int nr_mapped;
841 826
827 if (blk_rq_nr_phys_segments(req) == 1) {
828 struct bio_vec bv = req_bvec(req);
829
830 if (!is_pci_p2pdma_page(bv.bv_page)) {
831 if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
832 return nvme_setup_prp_simple(dev, req,
833 &cmnd->rw, &bv);
834
835 if (iod->nvmeq->qid &&
836 dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
837 return nvme_setup_sgl_simple(dev, req,
838 &cmnd->rw, &bv);
839 }
840 }
841
842 iod->dma_len = 0;
843 iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
844 if (!iod->sg)
845 return BLK_STS_RESOURCE;
842 sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); 846 sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
843 iod->nents = blk_rq_map_sg(q, req, iod->sg); 847 iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
844 if (!iod->nents) 848 if (!iod->nents)
845 goto out; 849 goto out;
846 850
847 ret = BLK_STS_RESOURCE;
848
849 if (is_pci_p2pdma_page(sg_page(iod->sg))) 851 if (is_pci_p2pdma_page(sg_page(iod->sg)))
850 nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents, 852 nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
851 dma_dir); 853 rq_dma_dir(req));
852 else 854 else
853 nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, 855 nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
854 dma_dir, DMA_ATTR_NO_WARN); 856 rq_dma_dir(req), DMA_ATTR_NO_WARN);
855 if (!nr_mapped) 857 if (!nr_mapped)
856 goto out; 858 goto out;
857 859
860 iod->use_sgl = nvme_pci_use_sgls(dev, req);
858 if (iod->use_sgl) 861 if (iod->use_sgl)
859 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); 862 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
860 else 863 else
861 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); 864 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
862
863 if (ret != BLK_STS_OK)
864 goto out_unmap;
865
866 ret = BLK_STS_IOERR;
867 if (blk_integrity_rq(req)) {
868 if (blk_rq_count_integrity_sg(q, req->bio) != 1)
869 goto out_unmap;
870
871 sg_init_table(&iod->meta_sg, 1);
872 if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
873 goto out_unmap;
874
875 if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
876 goto out_unmap;
877
878 cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
879 }
880
881 return BLK_STS_OK;
882
883out_unmap:
884 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
885out: 865out:
866 if (ret != BLK_STS_OK)
867 nvme_unmap_data(dev, req);
886 return ret; 868 return ret;
887} 869}
888 870
889static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) 871static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
872 struct nvme_command *cmnd)
890{ 873{
891 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 874 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
892 enum dma_data_direction dma_dir = rq_data_dir(req) ?
893 DMA_TO_DEVICE : DMA_FROM_DEVICE;
894
895 if (iod->nents) {
896 /* P2PDMA requests do not need to be unmapped */
897 if (!is_pci_p2pdma_page(sg_page(iod->sg)))
898 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
899
900 if (blk_integrity_rq(req))
901 dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
902 }
903 875
904 nvme_cleanup_cmd(req); 876 iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
905 nvme_free_iod(dev, req); 877 rq_dma_dir(req), 0);
878 if (dma_mapping_error(dev->dev, iod->meta_dma))
879 return BLK_STS_IOERR;
880 cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
881 return 0;
906} 882}
907 883
908/* 884/*
@@ -915,9 +891,14 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
915 struct nvme_queue *nvmeq = hctx->driver_data; 891 struct nvme_queue *nvmeq = hctx->driver_data;
916 struct nvme_dev *dev = nvmeq->dev; 892 struct nvme_dev *dev = nvmeq->dev;
917 struct request *req = bd->rq; 893 struct request *req = bd->rq;
894 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
918 struct nvme_command cmnd; 895 struct nvme_command cmnd;
919 blk_status_t ret; 896 blk_status_t ret;
920 897
898 iod->aborted = 0;
899 iod->npages = -1;
900 iod->nents = 0;
901
921 /* 902 /*
922 * We should not need to do this, but we're still using this to 903 * We should not need to do this, but we're still using this to
923 * ensure we can drain requests on a dying queue. 904 * ensure we can drain requests on a dying queue.
@@ -929,21 +910,23 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
929 if (ret) 910 if (ret)
930 return ret; 911 return ret;
931 912
932 ret = nvme_init_iod(req, dev);
933 if (ret)
934 goto out_free_cmd;
935
936 if (blk_rq_nr_phys_segments(req)) { 913 if (blk_rq_nr_phys_segments(req)) {
937 ret = nvme_map_data(dev, req, &cmnd); 914 ret = nvme_map_data(dev, req, &cmnd);
938 if (ret) 915 if (ret)
939 goto out_cleanup_iod; 916 goto out_free_cmd;
917 }
918
919 if (blk_integrity_rq(req)) {
920 ret = nvme_map_metadata(dev, req, &cmnd);
921 if (ret)
922 goto out_unmap_data;
940 } 923 }
941 924
942 blk_mq_start_request(req); 925 blk_mq_start_request(req);
943 nvme_submit_cmd(nvmeq, &cmnd, bd->last); 926 nvme_submit_cmd(nvmeq, &cmnd, bd->last);
944 return BLK_STS_OK; 927 return BLK_STS_OK;
945out_cleanup_iod: 928out_unmap_data:
946 nvme_free_iod(dev, req); 929 nvme_unmap_data(dev, req);
947out_free_cmd: 930out_free_cmd:
948 nvme_cleanup_cmd(req); 931 nvme_cleanup_cmd(req);
949 return ret; 932 return ret;
@@ -952,8 +935,14 @@ out_free_cmd:
952static void nvme_pci_complete_rq(struct request *req) 935static void nvme_pci_complete_rq(struct request *req)
953{ 936{
954 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 937 struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
938 struct nvme_dev *dev = iod->nvmeq->dev;
955 939
956 nvme_unmap_data(iod->nvmeq->dev, req); 940 nvme_cleanup_cmd(req);
941 if (blk_integrity_rq(req))
942 dma_unmap_page(dev->dev, iod->meta_dma,
943 rq_integrity_vec(req)->bv_len, rq_data_dir(req));
944 if (blk_rq_nr_phys_segments(req))
945 nvme_unmap_data(dev, req);
957 nvme_complete_rq(req); 946 nvme_complete_rq(req);
958} 947}
959 948
@@ -1088,7 +1077,7 @@ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
1088 * using the CQ lock. For normal interrupt driven threads we have 1077 * using the CQ lock. For normal interrupt driven threads we have
1089 * to disable the interrupt to avoid racing with it. 1078 * to disable the interrupt to avoid racing with it.
1090 */ 1079 */
1091 if (nvmeq->cq_vector == -1) { 1080 if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) {
1092 spin_lock(&nvmeq->cq_poll_lock); 1081 spin_lock(&nvmeq->cq_poll_lock);
1093 found = nvme_process_cq(nvmeq, &start, &end, tag); 1082 found = nvme_process_cq(nvmeq, &start, &end, tag);
1094 spin_unlock(&nvmeq->cq_poll_lock); 1083 spin_unlock(&nvmeq->cq_poll_lock);
@@ -1148,7 +1137,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1148 struct nvme_command c; 1137 struct nvme_command c;
1149 int flags = NVME_QUEUE_PHYS_CONTIG; 1138 int flags = NVME_QUEUE_PHYS_CONTIG;
1150 1139
1151 if (vector != -1) 1140 if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
1152 flags |= NVME_CQ_IRQ_ENABLED; 1141 flags |= NVME_CQ_IRQ_ENABLED;
1153 1142
1154 /* 1143 /*
@@ -1161,10 +1150,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1161 c.create_cq.cqid = cpu_to_le16(qid); 1150 c.create_cq.cqid = cpu_to_le16(qid);
1162 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); 1151 c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
1163 c.create_cq.cq_flags = cpu_to_le16(flags); 1152 c.create_cq.cq_flags = cpu_to_le16(flags);
1164 if (vector != -1) 1153 c.create_cq.irq_vector = cpu_to_le16(vector);
1165 c.create_cq.irq_vector = cpu_to_le16(vector);
1166 else
1167 c.create_cq.irq_vector = 0;
1168 1154
1169 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); 1155 return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
1170} 1156}
@@ -1271,6 +1257,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1271 struct nvme_dev *dev = nvmeq->dev; 1257 struct nvme_dev *dev = nvmeq->dev;
1272 struct request *abort_req; 1258 struct request *abort_req;
1273 struct nvme_command cmd; 1259 struct nvme_command cmd;
1260 bool shutdown = false;
1274 u32 csts = readl(dev->bar + NVME_REG_CSTS); 1261 u32 csts = readl(dev->bar + NVME_REG_CSTS);
1275 1262
1276 /* If PCI error recovery process is happening, we cannot reset or 1263 /* If PCI error recovery process is happening, we cannot reset or
@@ -1307,12 +1294,14 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1307 * shutdown, so we return BLK_EH_DONE. 1294 * shutdown, so we return BLK_EH_DONE.
1308 */ 1295 */
1309 switch (dev->ctrl.state) { 1296 switch (dev->ctrl.state) {
1297 case NVME_CTRL_DELETING:
1298 shutdown = true;
1310 case NVME_CTRL_CONNECTING: 1299 case NVME_CTRL_CONNECTING:
1311 case NVME_CTRL_RESETTING: 1300 case NVME_CTRL_RESETTING:
1312 dev_warn_ratelimited(dev->ctrl.device, 1301 dev_warn_ratelimited(dev->ctrl.device,
1313 "I/O %d QID %d timeout, disable controller\n", 1302 "I/O %d QID %d timeout, disable controller\n",
1314 req->tag, nvmeq->qid); 1303 req->tag, nvmeq->qid);
1315 nvme_dev_disable(dev, false); 1304 nvme_dev_disable(dev, shutdown);
1316 nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1305 nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1317 return BLK_EH_DONE; 1306 return BLK_EH_DONE;
1318 default: 1307 default:
@@ -1371,16 +1360,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
1371 1360
1372static void nvme_free_queue(struct nvme_queue *nvmeq) 1361static void nvme_free_queue(struct nvme_queue *nvmeq)
1373{ 1362{
1374 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1363 dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
1375 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1364 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1376 if (!nvmeq->sq_cmds) 1365 if (!nvmeq->sq_cmds)
1377 return; 1366 return;
1378 1367
1379 if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { 1368 if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
1380 pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), 1369 pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
1381 nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); 1370 nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
1382 } else { 1371 } else {
1383 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1372 dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
1384 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1373 nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1385 } 1374 }
1386} 1375}
@@ -1410,10 +1399,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
1410 nvmeq->dev->online_queues--; 1399 nvmeq->dev->online_queues--;
1411 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) 1400 if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1412 blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); 1401 blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
1413 if (nvmeq->cq_vector == -1) 1402 if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
1414 return 0; 1403 pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
1415 pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
1416 nvmeq->cq_vector = -1;
1417 return 0; 1404 return 0;
1418} 1405}
1419 1406
@@ -1498,7 +1485,6 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
1498 if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) 1485 if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
1499 goto free_cqdma; 1486 goto free_cqdma;
1500 1487
1501 nvmeq->q_dmadev = dev->dev;
1502 nvmeq->dev = dev; 1488 nvmeq->dev = dev;
1503 spin_lock_init(&nvmeq->sq_lock); 1489 spin_lock_init(&nvmeq->sq_lock);
1504 spin_lock_init(&nvmeq->cq_poll_lock); 1490 spin_lock_init(&nvmeq->cq_poll_lock);
@@ -1507,7 +1493,6 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
1507 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; 1493 nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1508 nvmeq->q_depth = depth; 1494 nvmeq->q_depth = depth;
1509 nvmeq->qid = qid; 1495 nvmeq->qid = qid;
1510 nvmeq->cq_vector = -1;
1511 dev->ctrl.queue_count++; 1496 dev->ctrl.queue_count++;
1512 1497
1513 return 0; 1498 return 0;
@@ -1552,7 +1537,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1552{ 1537{
1553 struct nvme_dev *dev = nvmeq->dev; 1538 struct nvme_dev *dev = nvmeq->dev;
1554 int result; 1539 int result;
1555 s16 vector; 1540 u16 vector = 0;
1556 1541
1557 clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); 1542 clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
1558 1543
@@ -1563,7 +1548,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1563 if (!polled) 1548 if (!polled)
1564 vector = dev->num_vecs == 1 ? 0 : qid; 1549 vector = dev->num_vecs == 1 ? 0 : qid;
1565 else 1550 else
1566 vector = -1; 1551 set_bit(NVMEQ_POLLED, &nvmeq->flags);
1567 1552
1568 result = adapter_alloc_cq(dev, qid, nvmeq, vector); 1553 result = adapter_alloc_cq(dev, qid, nvmeq, vector);
1569 if (result) 1554 if (result)
@@ -1578,7 +1563,8 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1578 nvmeq->cq_vector = vector; 1563 nvmeq->cq_vector = vector;
1579 nvme_init_queue(nvmeq, qid); 1564 nvme_init_queue(nvmeq, qid);
1580 1565
1581 if (vector != -1) { 1566 if (!polled) {
1567 nvmeq->cq_vector = vector;
1582 result = queue_request_irq(nvmeq); 1568 result = queue_request_irq(nvmeq);
1583 if (result < 0) 1569 if (result < 0)
1584 goto release_sq; 1570 goto release_sq;
@@ -1588,7 +1574,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1588 return result; 1574 return result;
1589 1575
1590release_sq: 1576release_sq:
1591 nvmeq->cq_vector = -1;
1592 dev->online_queues--; 1577 dev->online_queues--;
1593 adapter_delete_sq(dev, qid); 1578 adapter_delete_sq(dev, qid);
1594release_cq: 1579release_cq:
@@ -1639,7 +1624,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
1639 dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; 1624 dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1640 dev->admin_tagset.timeout = ADMIN_TIMEOUT; 1625 dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1641 dev->admin_tagset.numa_node = dev_to_node(dev->dev); 1626 dev->admin_tagset.numa_node = dev_to_node(dev->dev);
1642 dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); 1627 dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
1643 dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; 1628 dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
1644 dev->admin_tagset.driver_data = dev; 1629 dev->admin_tagset.driver_data = dev;
1645 1630
@@ -1730,7 +1715,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
1730 nvme_init_queue(nvmeq, 0); 1715 nvme_init_queue(nvmeq, 0);
1731 result = queue_request_irq(nvmeq); 1716 result = queue_request_irq(nvmeq);
1732 if (result) { 1717 if (result) {
1733 nvmeq->cq_vector = -1; 1718 dev->online_queues--;
1734 return result; 1719 return result;
1735 } 1720 }
1736 1721
@@ -2171,10 +2156,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
2171 * number of interrupts. 2156 * number of interrupts.
2172 */ 2157 */
2173 result = queue_request_irq(adminq); 2158 result = queue_request_irq(adminq);
2174 if (result) { 2159 if (result)
2175 adminq->cq_vector = -1;
2176 return result; 2160 return result;
2177 }
2178 set_bit(NVMEQ_ENABLED, &adminq->flags); 2161 set_bit(NVMEQ_ENABLED, &adminq->flags);
2179 2162
2180 result = nvme_create_io_queues(dev); 2163 result = nvme_create_io_queues(dev);
@@ -2286,11 +2269,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
2286 dev->tagset.numa_node = dev_to_node(dev->dev); 2269 dev->tagset.numa_node = dev_to_node(dev->dev);
2287 dev->tagset.queue_depth = 2270 dev->tagset.queue_depth =
2288 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; 2271 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
2289 dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false); 2272 dev->tagset.cmd_size = sizeof(struct nvme_iod);
2290 if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
2291 dev->tagset.cmd_size = max(dev->tagset.cmd_size,
2292 nvme_pci_cmd_size(dev, true));
2293 }
2294 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; 2273 dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
2295 dev->tagset.driver_data = dev; 2274 dev->tagset.driver_data = dev;
2296 2275
@@ -2438,8 +2417,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
2438 * must flush all entered requests to their failed completion to avoid 2417 * must flush all entered requests to their failed completion to avoid
2439 * deadlocking blk-mq hot-cpu notifier. 2418 * deadlocking blk-mq hot-cpu notifier.
2440 */ 2419 */
2441 if (shutdown) 2420 if (shutdown) {
2442 nvme_start_queues(&dev->ctrl); 2421 nvme_start_queues(&dev->ctrl);
2422 if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
2423 blk_mq_unquiesce_queue(dev->ctrl.admin_q);
2424 }
2443 mutex_unlock(&dev->shutdown_lock); 2425 mutex_unlock(&dev->shutdown_lock);
2444} 2426}
2445 2427
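The nvme_dev_disable() hunk above restarts the queues on a clean shutdown so anything already queued can run to its (possibly failed) completion instead of sitting on a quiesced admin queue. A minimal sketch of that guard, assuming the driver-private nvme.h header and a hypothetical helper name:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include "nvme.h"       /* struct nvme_ctrl, nvme_start_queues() */

/* Sketch: on a clean shutdown let everything already queued run to its
 * (possibly failed) completion rather than hang on a quiesced queue;
 * skip the admin queue if it is already being torn down. */
static void my_unquiesce_on_shutdown(struct nvme_ctrl *ctrl, bool shutdown)
{
        if (!shutdown)
                return;

        nvme_start_queues(ctrl);
        if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
                blk_mq_unquiesce_queue(ctrl->admin_q);
}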
@@ -2979,6 +2961,9 @@ static struct pci_driver nvme_driver = {
2979 2961
2980static int __init nvme_init(void) 2962static int __init nvme_init(void)
2981{ 2963{
2964 BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
2965 BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
2966 BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
2982 BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); 2967 BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
2983 return pci_register_driver(&nvme_driver); 2968 return pci_register_driver(&nvme_driver);
2984} 2969}
@@ -2987,7 +2972,6 @@ static void __exit nvme_exit(void)
2987{ 2972{
2988 pci_unregister_driver(&nvme_driver); 2973 pci_unregister_driver(&nvme_driver);
2989 flush_workqueue(nvme_wq); 2974 flush_workqueue(nvme_wq);
2990 _nvme_check_size();
2991} 2975}
2992 2976
2993MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); 2977MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 11a5ecae78c8..e1824c2e0a1c 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -914,8 +914,9 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
914{ 914{
915 blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 915 blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
916 nvme_rdma_stop_queue(&ctrl->queues[0]); 916 nvme_rdma_stop_queue(&ctrl->queues[0]);
917 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, 917 if (ctrl->ctrl.admin_tagset)
918 &ctrl->ctrl); 918 blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
919 nvme_cancel_request, &ctrl->ctrl);
919 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 920 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
920 nvme_rdma_destroy_admin_queue(ctrl, remove); 921 nvme_rdma_destroy_admin_queue(ctrl, remove);
921} 922}
@@ -926,8 +927,9 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
926 if (ctrl->ctrl.queue_count > 1) { 927 if (ctrl->ctrl.queue_count > 1) {
927 nvme_stop_queues(&ctrl->ctrl); 928 nvme_stop_queues(&ctrl->ctrl);
928 nvme_rdma_stop_io_queues(ctrl); 929 nvme_rdma_stop_io_queues(ctrl);
929 blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, 930 if (ctrl->ctrl.tagset)
930 &ctrl->ctrl); 931 blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
932 nvme_cancel_request, &ctrl->ctrl);
931 if (remove) 933 if (remove)
932 nvme_start_queues(&ctrl->ctrl); 934 nvme_start_queues(&ctrl->ctrl);
933 nvme_rdma_destroy_io_queues(ctrl, remove); 935 nvme_rdma_destroy_io_queues(ctrl, remove);
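The rdma.c teardown (and the matching host tcp.c hunks below) only walks busy tags when the tag set was actually allocated, since error paths can tear down a controller before setup got that far. A small sketch of the defensive check; the helper name is invented, the rest mirrors the driver calls:

#include <linux/blk-mq.h>
#include "nvme.h"       /* struct nvme_ctrl, nvme_cancel_request() */

/* Hypothetical helper: teardown can run on a controller whose admin tag
 * set was never allocated, so only iterate busy tags when it exists. */
static void my_cancel_admin_requests(struct nvme_ctrl *ctrl)
{
        if (ctrl->admin_tagset)
                blk_mq_tagset_busy_iter(ctrl->admin_tagset,
                                        nvme_cancel_request, ctrl);
}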
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 68c49dd67210..2b107a1d152b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -473,7 +473,6 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
473 } 473 }
474 474
475 return 0; 475 return 0;
476
477} 476}
478 477
479static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, 478static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
@@ -634,7 +633,6 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
634 nvme_end_request(rq, cpu_to_le16(status << 1), res); 633 nvme_end_request(rq, cpu_to_le16(status << 1), res);
635} 634}
636 635
637
638static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, 636static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
639 unsigned int *offset, size_t *len) 637 unsigned int *offset, size_t *len)
640{ 638{
@@ -1425,7 +1423,8 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1425 if (!ret) { 1423 if (!ret) {
1426 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); 1424 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1427 } else { 1425 } else {
1428 __nvme_tcp_stop_queue(&ctrl->queues[idx]); 1426 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1427 __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1429 dev_err(nctrl->device, 1428 dev_err(nctrl->device,
1430 "failed to connect queue: %d ret=%d\n", idx, ret); 1429 "failed to connect queue: %d ret=%d\n", idx, ret);
1431 } 1430 }
@@ -1535,7 +1534,7 @@ out_free_queue:
1535 return ret; 1534 return ret;
1536} 1535}
1537 1536
1538static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) 1537static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1539{ 1538{
1540 int i, ret; 1539 int i, ret;
1541 1540
@@ -1565,7 +1564,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1565 return nr_io_queues; 1564 return nr_io_queues;
1566} 1565}
1567 1566
1568static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl) 1567static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1569{ 1568{
1570 unsigned int nr_io_queues; 1569 unsigned int nr_io_queues;
1571 int ret; 1570 int ret;
@@ -1582,7 +1581,7 @@ static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
1582 dev_info(ctrl->device, 1581 dev_info(ctrl->device,
1583 "creating %d I/O queues.\n", nr_io_queues); 1582 "creating %d I/O queues.\n", nr_io_queues);
1584 1583
1585 return nvme_tcp_alloc_io_queues(ctrl); 1584 return __nvme_tcp_alloc_io_queues(ctrl);
1586} 1585}
1587 1586
1588static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) 1587static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
@@ -1599,7 +1598,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1599{ 1598{
1600 int ret; 1599 int ret;
1601 1600
1602 ret = nvme_alloc_io_queues(ctrl); 1601 ret = nvme_tcp_alloc_io_queues(ctrl);
1603 if (ret) 1602 if (ret)
1604 return ret; 1603 return ret;
1605 1604
@@ -1710,7 +1709,9 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1710{ 1709{
1711 blk_mq_quiesce_queue(ctrl->admin_q); 1710 blk_mq_quiesce_queue(ctrl->admin_q);
1712 nvme_tcp_stop_queue(ctrl, 0); 1711 nvme_tcp_stop_queue(ctrl, 0);
1713 blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); 1712 if (ctrl->admin_tagset)
1713 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
1714 nvme_cancel_request, ctrl);
1714 blk_mq_unquiesce_queue(ctrl->admin_q); 1715 blk_mq_unquiesce_queue(ctrl->admin_q);
1715 nvme_tcp_destroy_admin_queue(ctrl, remove); 1716 nvme_tcp_destroy_admin_queue(ctrl, remove);
1716} 1717}
@@ -1722,7 +1723,9 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1722 return; 1723 return;
1723 nvme_stop_queues(ctrl); 1724 nvme_stop_queues(ctrl);
1724 nvme_tcp_stop_io_queues(ctrl); 1725 nvme_tcp_stop_io_queues(ctrl);
1725 blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); 1726 if (ctrl->tagset)
1727 blk_mq_tagset_busy_iter(ctrl->tagset,
1728 nvme_cancel_request, ctrl);
1726 if (remove) 1729 if (remove)
1727 nvme_start_queues(ctrl); 1730 nvme_start_queues(ctrl);
1728 nvme_tcp_destroy_io_queues(ctrl, remove); 1731 nvme_tcp_destroy_io_queues(ctrl, remove);
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index d94f25cde019..3ef0a4e5eed6 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -3,6 +3,7 @@ config NVME_TARGET
3 tristate "NVMe Target support" 3 tristate "NVMe Target support"
4 depends on BLOCK 4 depends on BLOCK
5 depends on CONFIGFS_FS 5 depends on CONFIGFS_FS
6 select SGL_ALLOC
6 help 7 help
7 This enabled target side support for the NVMe protocol, that is 8 This enabled target side support for the NVMe protocol, that is
8 it allows the Linux kernel to implement NVMe subsystems and 9 it allows the Linux kernel to implement NVMe subsystems and
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index adb79545cdd7..08dd5af357f7 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -898,8 +898,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group,
898 } 898 }
899 899
900 subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); 900 subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME);
901 if (!subsys) 901 if (IS_ERR(subsys))
902 return ERR_PTR(-ENOMEM); 902 return ERR_CAST(subsys);
903 903
904 config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type); 904 config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type);
905 905
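The configfs caller now expects nvmet_subsys_alloc() to return an ERR_PTR-encoded errno rather than NULL (the allocation side is converted in core.c further down), so the real reason for a failure reaches user space instead of a blanket -ENOMEM. A minimal sketch of the ERR_PTR/IS_ERR/ERR_CAST convention with a hypothetical widget type:

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/configfs.h>

struct widget { int id; };

static struct widget *widget_alloc(int id)
{
        struct widget *w;

        if (id < 0)
                return ERR_PTR(-EINVAL);        /* errno encoded in the pointer */

        w = kzalloc(sizeof(*w), GFP_KERNEL);
        if (!w)
                return ERR_PTR(-ENOMEM);

        w->id = id;
        return w;
}

static struct config_group *widget_make_group(int id)
{
        struct widget *w = widget_alloc(id);

        if (IS_ERR(w))
                return ERR_CAST(w);     /* forward the original errno, retyped */
        /* ... wrap w in a config_group and return it ... */
        return NULL;                    /* placeholder for the real group */
}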
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b3e765a95af8..7734a6acff85 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -8,6 +8,7 @@
8#include <linux/random.h> 8#include <linux/random.h>
9#include <linux/rculist.h> 9#include <linux/rculist.h>
10#include <linux/pci-p2pdma.h> 10#include <linux/pci-p2pdma.h>
11#include <linux/scatterlist.h>
11 12
12#include "nvmet.h" 13#include "nvmet.h"
13 14
@@ -214,6 +215,8 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
214{ 215{
215 struct nvmet_ctrl *ctrl; 216 struct nvmet_ctrl *ctrl;
216 217
218 lockdep_assert_held(&subsys->lock);
219
217 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { 220 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
218 nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); 221 nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
219 if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR)) 222 if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
@@ -494,13 +497,14 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
494 int ret; 497 int ret;
495 498
496 mutex_lock(&subsys->lock); 499 mutex_lock(&subsys->lock);
497 ret = -EMFILE;
498 if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
499 goto out_unlock;
500 ret = 0; 500 ret = 0;
501 if (ns->enabled) 501 if (ns->enabled)
502 goto out_unlock; 502 goto out_unlock;
503 503
504 ret = -EMFILE;
505 if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
506 goto out_unlock;
507
504 ret = nvmet_bdev_ns_enable(ns); 508 ret = nvmet_bdev_ns_enable(ns);
505 if (ret == -ENOTBLK) 509 if (ret == -ENOTBLK)
506 ret = nvmet_file_ns_enable(ns); 510 ret = nvmet_file_ns_enable(ns);
@@ -644,7 +648,7 @@ static void nvmet_update_sq_head(struct nvmet_req *req)
644 } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != 648 } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
645 old_sqhd); 649 old_sqhd);
646 } 650 }
647 req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); 651 req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
648} 652}
649 653
650static void nvmet_set_error(struct nvmet_req *req, u16 status) 654static void nvmet_set_error(struct nvmet_req *req, u16 status)
@@ -653,7 +657,7 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status)
653 struct nvme_error_slot *new_error_slot; 657 struct nvme_error_slot *new_error_slot;
654 unsigned long flags; 658 unsigned long flags;
655 659
656 req->rsp->status = cpu_to_le16(status << 1); 660 req->cqe->status = cpu_to_le16(status << 1);
657 661
658 if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC) 662 if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
659 return; 663 return;
@@ -673,15 +677,15 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status)
673 spin_unlock_irqrestore(&ctrl->error_lock, flags); 677 spin_unlock_irqrestore(&ctrl->error_lock, flags);
674 678
675 /* set the more bit for this request */ 679 /* set the more bit for this request */
676 req->rsp->status |= cpu_to_le16(1 << 14); 680 req->cqe->status |= cpu_to_le16(1 << 14);
677} 681}
678 682
679static void __nvmet_req_complete(struct nvmet_req *req, u16 status) 683static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
680{ 684{
681 if (!req->sq->sqhd_disabled) 685 if (!req->sq->sqhd_disabled)
682 nvmet_update_sq_head(req); 686 nvmet_update_sq_head(req);
683 req->rsp->sq_id = cpu_to_le16(req->sq->qid); 687 req->cqe->sq_id = cpu_to_le16(req->sq->qid);
684 req->rsp->command_id = req->cmd->common.command_id; 688 req->cqe->command_id = req->cmd->common.command_id;
685 689
686 if (unlikely(status)) 690 if (unlikely(status))
687 nvmet_set_error(req, status); 691 nvmet_set_error(req, status);
@@ -838,8 +842,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
838 req->sg = NULL; 842 req->sg = NULL;
839 req->sg_cnt = 0; 843 req->sg_cnt = 0;
840 req->transfer_len = 0; 844 req->transfer_len = 0;
841 req->rsp->status = 0; 845 req->cqe->status = 0;
842 req->rsp->sq_head = 0; 846 req->cqe->sq_head = 0;
843 req->ns = NULL; 847 req->ns = NULL;
844 req->error_loc = NVMET_NO_ERROR_LOC; 848 req->error_loc = NVMET_NO_ERROR_LOC;
845 req->error_slba = 0; 849 req->error_slba = 0;
@@ -1066,7 +1070,7 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
1066 if (!subsys) { 1070 if (!subsys) {
1067 pr_warn("connect request for invalid subsystem %s!\n", 1071 pr_warn("connect request for invalid subsystem %s!\n",
1068 subsysnqn); 1072 subsysnqn);
1069 req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); 1073 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1070 return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; 1074 return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1071 } 1075 }
1072 1076
@@ -1087,7 +1091,7 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
1087 1091
1088 pr_warn("could not find controller %d for subsys %s / host %s\n", 1092 pr_warn("could not find controller %d for subsys %s / host %s\n",
1089 cntlid, subsysnqn, hostnqn); 1093 cntlid, subsysnqn, hostnqn);
1090 req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); 1094 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
1091 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; 1095 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
1092 1096
1093out: 1097out:
@@ -1185,7 +1189,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1185 if (!subsys) { 1189 if (!subsys) {
1186 pr_warn("connect request for invalid subsystem %s!\n", 1190 pr_warn("connect request for invalid subsystem %s!\n",
1187 subsysnqn); 1191 subsysnqn);
1188 req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); 1192 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
1189 goto out; 1193 goto out;
1190 } 1194 }
1191 1195
@@ -1194,7 +1198,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1194 if (!nvmet_host_allowed(subsys, hostnqn)) { 1198 if (!nvmet_host_allowed(subsys, hostnqn)) {
1195 pr_info("connect by host %s for subsystem %s not allowed\n", 1199 pr_info("connect by host %s for subsystem %s not allowed\n",
1196 hostnqn, subsysnqn); 1200 hostnqn, subsysnqn);
1197 req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); 1201 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
1198 up_read(&nvmet_config_sem); 1202 up_read(&nvmet_config_sem);
1199 status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; 1203 status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
1200 goto out_put_subsystem; 1204 goto out_put_subsystem;
@@ -1364,7 +1368,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1364 1368
1365 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); 1369 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
1366 if (!subsys) 1370 if (!subsys)
1367 return NULL; 1371 return ERR_PTR(-ENOMEM);
1368 1372
1369 subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ 1373 subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
1370 /* generate a random serial number as our controllers are ephemeral: */ 1374 /* generate a random serial number as our controllers are ephemeral: */
@@ -1380,14 +1384,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
1380 default: 1384 default:
1381 pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); 1385 pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
1382 kfree(subsys); 1386 kfree(subsys);
1383 return NULL; 1387 return ERR_PTR(-EINVAL);
1384 } 1388 }
1385 subsys->type = type; 1389 subsys->type = type;
1386 subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, 1390 subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
1387 GFP_KERNEL); 1391 GFP_KERNEL);
1388 if (!subsys->subsysnqn) { 1392 if (!subsys->subsysnqn) {
1389 kfree(subsys); 1393 kfree(subsys);
1390 return NULL; 1394 return ERR_PTR(-ENOMEM);
1391 } 1395 }
1392 1396
1393 kref_init(&subsys->ref); 1397 kref_init(&subsys->ref);
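core.c also annotates nvmet_ns_changed() with lockdep_assert_held(), turning the "caller holds subsys->lock" rule into something lockdep can enforce, and reorders the enable checks so re-enabling an already-enabled namespace no longer trips the -EMFILE limit first. A small sketch of the assertion pattern with a hypothetical registry:

#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/list.h>

struct registry {
        struct mutex     lock;
        struct list_head items;
};

/* Callers must hold reg->lock; lockdep complains at runtime if they do not. */
static void registry_notify_all(struct registry *reg)
{
        lockdep_assert_held(&reg->lock);
        /* ... walk reg->items knowing the list cannot change under us ... */
}

static void registry_change(struct registry *reg)
{
        mutex_lock(&reg->lock);
        registry_notify_all(reg);
        mutex_unlock(&reg->lock);
}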
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 33ed95e72d6b..5baf269f3f8a 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -30,14 +30,17 @@ void nvmet_port_disc_changed(struct nvmet_port *port,
30{ 30{
31 struct nvmet_ctrl *ctrl; 31 struct nvmet_ctrl *ctrl;
32 32
33 lockdep_assert_held(&nvmet_config_sem);
33 nvmet_genctr++; 34 nvmet_genctr++;
34 35
36 mutex_lock(&nvmet_disc_subsys->lock);
35 list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { 37 list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
36 if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn)) 38 if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn))
37 continue; 39 continue;
38 40
39 __nvmet_disc_changed(port, ctrl); 41 __nvmet_disc_changed(port, ctrl);
40 } 42 }
43 mutex_unlock(&nvmet_disc_subsys->lock);
41} 44}
42 45
43static void __nvmet_subsys_disc_changed(struct nvmet_port *port, 46static void __nvmet_subsys_disc_changed(struct nvmet_port *port,
@@ -46,12 +49,14 @@ static void __nvmet_subsys_disc_changed(struct nvmet_port *port,
46{ 49{
47 struct nvmet_ctrl *ctrl; 50 struct nvmet_ctrl *ctrl;
48 51
52 mutex_lock(&nvmet_disc_subsys->lock);
49 list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { 53 list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
50 if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn)) 54 if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn))
51 continue; 55 continue;
52 56
53 __nvmet_disc_changed(port, ctrl); 57 __nvmet_disc_changed(port, ctrl);
54 } 58 }
59 mutex_unlock(&nvmet_disc_subsys->lock);
55} 60}
56 61
57void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, 62void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
@@ -372,8 +377,8 @@ int __init nvmet_init_discovery(void)
372{ 377{
373 nvmet_disc_subsys = 378 nvmet_disc_subsys =
374 nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); 379 nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC);
375 if (!nvmet_disc_subsys) 380 if (IS_ERR(nvmet_disc_subsys))
376 return -ENOMEM; 381 return PTR_ERR(nvmet_disc_subsys);
377 return 0; 382 return 0;
378} 383}
379 384
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 3a76ebc3d155..3b9f79aba98f 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -72,7 +72,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
72 offsetof(struct nvmf_property_get_command, attrib); 72 offsetof(struct nvmf_property_get_command, attrib);
73 } 73 }
74 74
75 req->rsp->result.u64 = cpu_to_le64(val); 75 req->cqe->result.u64 = cpu_to_le64(val);
76 nvmet_req_complete(req, status); 76 nvmet_req_complete(req, status);
77} 77}
78 78
@@ -124,7 +124,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
124 124
125 if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) { 125 if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) {
126 req->sq->sqhd_disabled = true; 126 req->sq->sqhd_disabled = true;
127 req->rsp->sq_head = cpu_to_le16(0xffff); 127 req->cqe->sq_head = cpu_to_le16(0xffff);
128 } 128 }
129 129
130 if (ctrl->ops->install_queue) { 130 if (ctrl->ops->install_queue) {
@@ -158,7 +158,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
158 goto out; 158 goto out;
159 159
160 /* zero out initial completion result, assign values as needed */ 160 /* zero out initial completion result, assign values as needed */
161 req->rsp->result.u32 = 0; 161 req->cqe->result.u32 = 0;
162 162
163 if (c->recfmt != 0) { 163 if (c->recfmt != 0) {
164 pr_warn("invalid connect version (%d).\n", 164 pr_warn("invalid connect version (%d).\n",
@@ -172,7 +172,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
172 pr_warn("connect attempt for invalid controller ID %#x\n", 172 pr_warn("connect attempt for invalid controller ID %#x\n",
173 d->cntlid); 173 d->cntlid);
174 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; 174 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
175 req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); 175 req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
176 goto out; 176 goto out;
177 } 177 }
178 178
@@ -195,7 +195,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
195 195
196 pr_info("creating controller %d for subsystem %s for NQN %s.\n", 196 pr_info("creating controller %d for subsystem %s for NQN %s.\n",
197 ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn); 197 ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn);
198 req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid); 198 req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
199 199
200out: 200out:
201 kfree(d); 201 kfree(d);
@@ -222,7 +222,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
222 goto out; 222 goto out;
223 223
224 /* zero out initial completion result, assign values as needed */ 224 /* zero out initial completion result, assign values as needed */
225 req->rsp->result.u32 = 0; 225 req->cqe->result.u32 = 0;
226 226
227 if (c->recfmt != 0) { 227 if (c->recfmt != 0) {
228 pr_warn("invalid connect version (%d).\n", 228 pr_warn("invalid connect version (%d).\n",
@@ -240,14 +240,14 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
240 if (unlikely(qid > ctrl->subsys->max_qid)) { 240 if (unlikely(qid > ctrl->subsys->max_qid)) {
241 pr_warn("invalid queue id (%d)\n", qid); 241 pr_warn("invalid queue id (%d)\n", qid);
242 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; 242 status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
243 req->rsp->result.u32 = IPO_IATTR_CONNECT_SQE(qid); 243 req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(qid);
244 goto out_ctrl_put; 244 goto out_ctrl_put;
245 } 245 }
246 246
247 status = nvmet_install_queue(ctrl, req); 247 status = nvmet_install_queue(ctrl, req);
248 if (status) { 248 if (status) {
249 /* pass back cntlid that had the issue of installing queue */ 249 /* pass back cntlid that had the issue of installing queue */
250 req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid); 250 req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
251 goto out_ctrl_put; 251 goto out_ctrl_put;
252 } 252 }
253 253
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 98b7b1f4ee96..508661af0f50 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -128,12 +128,12 @@ struct nvmet_fc_tgt_queue {
128 struct nvmet_cq nvme_cq; 128 struct nvmet_cq nvme_cq;
129 struct nvmet_sq nvme_sq; 129 struct nvmet_sq nvme_sq;
130 struct nvmet_fc_tgt_assoc *assoc; 130 struct nvmet_fc_tgt_assoc *assoc;
131 struct nvmet_fc_fcp_iod *fod; /* array of fcp_iods */
132 struct list_head fod_list; 131 struct list_head fod_list;
133 struct list_head pending_cmd_list; 132 struct list_head pending_cmd_list;
134 struct list_head avail_defer_list; 133 struct list_head avail_defer_list;
135 struct workqueue_struct *work_q; 134 struct workqueue_struct *work_q;
136 struct kref ref; 135 struct kref ref;
136 struct nvmet_fc_fcp_iod fod[]; /* array of fcp_iods */
137} __aligned(sizeof(unsigned long long)); 137} __aligned(sizeof(unsigned long long));
138 138
139struct nvmet_fc_tgt_assoc { 139struct nvmet_fc_tgt_assoc {
@@ -588,9 +588,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
588 if (qid > NVMET_NR_QUEUES) 588 if (qid > NVMET_NR_QUEUES)
589 return NULL; 589 return NULL;
590 590
591 queue = kzalloc((sizeof(*queue) + 591 queue = kzalloc(struct_size(queue, fod, sqsize), GFP_KERNEL);
592 (sizeof(struct nvmet_fc_fcp_iod) * sqsize)),
593 GFP_KERNEL);
594 if (!queue) 592 if (!queue)
595 return NULL; 593 return NULL;
596 594
@@ -603,7 +601,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
603 if (!queue->work_q) 601 if (!queue->work_q)
604 goto out_a_put; 602 goto out_a_put;
605 603
606 queue->fod = (struct nvmet_fc_fcp_iod *)&queue[1];
607 queue->qid = qid; 604 queue->qid = qid;
608 queue->sqsize = sqsize; 605 queue->sqsize = sqsize;
609 queue->assoc = assoc; 606 queue->assoc = assoc;
@@ -2187,7 +2184,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
2187 } 2184 }
2188 2185
2189 fod->req.cmd = &fod->cmdiubuf.sqe; 2186 fod->req.cmd = &fod->cmdiubuf.sqe;
2190 fod->req.rsp = &fod->rspiubuf.cqe; 2187 fod->req.cqe = &fod->rspiubuf.cqe;
2191 fod->req.port = tgtport->pe->port; 2188 fod->req.port = tgtport->pe->port;
2192 2189
2193 /* clear any response payload */ 2190 /* clear any response payload */
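The fc.c change replaces the trailing fod pointer and open-coded size arithmetic with a flexible array member sized via struct_size(), which also saturates instead of overflowing on a huge sqsize. A minimal sketch under hypothetical names:

#include <linux/types.h>
#include <linux/overflow.h>
#include <linux/slab.h>

struct item { u64 tag; };

struct pool {
        unsigned int    count;
        struct item     items[];        /* flexible array replaces a trailing pointer */
};

static struct pool *pool_alloc(unsigned int count)
{
        /* struct_size() = sizeof(struct pool) + count * sizeof(struct item),
         * saturating rather than wrapping on overflow. */
        struct pool *p = kzalloc(struct_size(p, items, count), GFP_KERNEL);

        if (!p)
                return NULL;
        p->count = count;
        return p;       /* p->items[i] is usable directly, no &p[1] cast */
}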
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index a065dbfc43b1..3efc52f9c309 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -196,7 +196,7 @@ static u16 nvmet_bdev_discard_range(struct nvmet_req *req,
196 GFP_KERNEL, 0, bio); 196 GFP_KERNEL, 0, bio);
197 if (ret && ret != -EOPNOTSUPP) { 197 if (ret && ret != -EOPNOTSUPP) {
198 req->error_slba = le64_to_cpu(range->slba); 198 req->error_slba = le64_to_cpu(range->slba);
199 return blk_to_nvme_status(req, errno_to_blk_status(ret)); 199 return errno_to_nvme_status(req, ret);
200 } 200 }
201 return NVME_SC_SUCCESS; 201 return NVME_SC_SUCCESS;
202} 202}
@@ -252,7 +252,6 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
252{ 252{
253 struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes; 253 struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
254 struct bio *bio = NULL; 254 struct bio *bio = NULL;
255 u16 status = NVME_SC_SUCCESS;
256 sector_t sector; 255 sector_t sector;
257 sector_t nr_sector; 256 sector_t nr_sector;
258 int ret; 257 int ret;
@@ -264,13 +263,12 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
264 263
265 ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, 264 ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
266 GFP_KERNEL, &bio, 0); 265 GFP_KERNEL, &bio, 0);
267 status = blk_to_nvme_status(req, errno_to_blk_status(ret));
268 if (bio) { 266 if (bio) {
269 bio->bi_private = req; 267 bio->bi_private = req;
270 bio->bi_end_io = nvmet_bio_done; 268 bio->bi_end_io = nvmet_bio_done;
271 submit_bio(bio); 269 submit_bio(bio);
272 } else { 270 } else {
273 nvmet_req_complete(req, status); 271 nvmet_req_complete(req, errno_to_nvme_status(req, ret));
274 } 272 }
275} 273}
276 274
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index bc6ebb51b0bf..05453f5d1448 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -49,7 +49,12 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
49 goto err; 49 goto err;
50 50
51 ns->size = stat.size; 51 ns->size = stat.size;
52 ns->blksize_shift = file_inode(ns->file)->i_blkbits; 52 /*
53 * i_blkbits can be greater than the universally accepted upper bound,
54 * so make sure we export a sane namespace lba_shift.
55 */
56 ns->blksize_shift = min_t(u8,
57 file_inode(ns->file)->i_blkbits, 12);
53 58
54 ns->bvec_cache = kmem_cache_create("nvmet-bvec", 59 ns->bvec_cache = kmem_cache_create("nvmet-bvec",
55 NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), 60 NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec),
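The file-backed namespace now clamps the exported LBA shift to 12 (4 KiB), because a filesystem's i_blkbits may exceed what hosts accept. A one-function sketch of the clamp, with a hypothetical helper name:

#include <linux/fs.h>
#include <linux/kernel.h>       /* min_t(); linux/minmax.h on newer trees */

/* Hypothetical helper: never advertise an LBA shift above 4 KiB, even if
 * the backing inode reports a larger block size. */
static u8 my_exported_blksize_shift(struct file *file)
{
        return min_t(u8, file_inode(file)->i_blkbits, 12);
}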
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index b9f623ab01f3..9e211ad6bdd3 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -18,7 +18,7 @@
18struct nvme_loop_iod { 18struct nvme_loop_iod {
19 struct nvme_request nvme_req; 19 struct nvme_request nvme_req;
20 struct nvme_command cmd; 20 struct nvme_command cmd;
21 struct nvme_completion rsp; 21 struct nvme_completion cqe;
22 struct nvmet_req req; 22 struct nvmet_req req;
23 struct nvme_loop_queue *queue; 23 struct nvme_loop_queue *queue;
24 struct work_struct work; 24 struct work_struct work;
@@ -94,7 +94,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
94{ 94{
95 struct nvme_loop_queue *queue = 95 struct nvme_loop_queue *queue =
96 container_of(req->sq, struct nvme_loop_queue, nvme_sq); 96 container_of(req->sq, struct nvme_loop_queue, nvme_sq);
97 struct nvme_completion *cqe = req->rsp; 97 struct nvme_completion *cqe = req->cqe;
98 98
99 /* 99 /*
100 * AEN requests are special as they don't time out and can 100 * AEN requests are special as they don't time out and can
@@ -129,20 +129,6 @@ static void nvme_loop_execute_work(struct work_struct *work)
129 nvmet_req_execute(&iod->req); 129 nvmet_req_execute(&iod->req);
130} 130}
131 131
132static enum blk_eh_timer_return
133nvme_loop_timeout(struct request *rq, bool reserved)
134{
135 struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq);
136
137 /* queue error recovery */
138 nvme_reset_ctrl(&iod->queue->ctrl->ctrl);
139
140 /* fail with DNR on admin cmd timeout */
141 nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
142
143 return BLK_EH_DONE;
144}
145
146static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, 132static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
147 const struct blk_mq_queue_data *bd) 133 const struct blk_mq_queue_data *bd)
148{ 134{
@@ -207,7 +193,7 @@ static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl,
207 struct nvme_loop_iod *iod, unsigned int queue_idx) 193 struct nvme_loop_iod *iod, unsigned int queue_idx)
208{ 194{
209 iod->req.cmd = &iod->cmd; 195 iod->req.cmd = &iod->cmd;
210 iod->req.rsp = &iod->rsp; 196 iod->req.cqe = &iod->cqe;
211 iod->queue = &ctrl->queues[queue_idx]; 197 iod->queue = &ctrl->queues[queue_idx];
212 INIT_WORK(&iod->work, nvme_loop_execute_work); 198 INIT_WORK(&iod->work, nvme_loop_execute_work);
213 return 0; 199 return 0;
@@ -253,7 +239,6 @@ static const struct blk_mq_ops nvme_loop_mq_ops = {
253 .complete = nvme_loop_complete_rq, 239 .complete = nvme_loop_complete_rq,
254 .init_request = nvme_loop_init_request, 240 .init_request = nvme_loop_init_request,
255 .init_hctx = nvme_loop_init_hctx, 241 .init_hctx = nvme_loop_init_hctx,
256 .timeout = nvme_loop_timeout,
257}; 242};
258 243
259static const struct blk_mq_ops nvme_loop_admin_mq_ops = { 244static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
@@ -261,7 +246,6 @@ static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
261 .complete = nvme_loop_complete_rq, 246 .complete = nvme_loop_complete_rq,
262 .init_request = nvme_loop_init_request, 247 .init_request = nvme_loop_init_request,
263 .init_hctx = nvme_loop_init_admin_hctx, 248 .init_hctx = nvme_loop_init_admin_hctx,
264 .timeout = nvme_loop_timeout,
265}; 249};
266 250
267static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) 251static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 1653d19b187f..c25d88fc9dec 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -284,7 +284,7 @@ struct nvmet_fabrics_ops {
284 284
285struct nvmet_req { 285struct nvmet_req {
286 struct nvme_command *cmd; 286 struct nvme_command *cmd;
287 struct nvme_completion *rsp; 287 struct nvme_completion *cqe;
288 struct nvmet_sq *sq; 288 struct nvmet_sq *sq;
289 struct nvmet_cq *cq; 289 struct nvmet_cq *cq;
290 struct nvmet_ns *ns; 290 struct nvmet_ns *ns;
@@ -322,7 +322,7 @@ extern struct workqueue_struct *buffered_io_wq;
322 322
323static inline void nvmet_set_result(struct nvmet_req *req, u32 result) 323static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
324{ 324{
325 req->rsp->result.u32 = cpu_to_le32(result); 325 req->cqe->result.u32 = cpu_to_le32(result);
326} 326}
327 327
328/* 328/*
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index ef893addf341..36d906a7f70d 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -160,7 +160,7 @@ static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
160{ 160{
161 return !nvme_is_write(rsp->req.cmd) && 161 return !nvme_is_write(rsp->req.cmd) &&
162 rsp->req.transfer_len && 162 rsp->req.transfer_len &&
163 !rsp->req.rsp->status && 163 !rsp->req.cqe->status &&
164 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); 164 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
165} 165}
166 166
@@ -364,16 +364,17 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
364 struct nvmet_rdma_rsp *r) 364 struct nvmet_rdma_rsp *r)
365{ 365{
366 /* NVMe CQE / RDMA SEND */ 366 /* NVMe CQE / RDMA SEND */
367 r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); 367 r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
368 if (!r->req.rsp) 368 if (!r->req.cqe)
369 goto out; 369 goto out;
370 370
371 r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, 371 r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
372 sizeof(*r->req.rsp), DMA_TO_DEVICE); 372 sizeof(*r->req.cqe), DMA_TO_DEVICE);
373 if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) 373 if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
374 goto out_free_rsp; 374 goto out_free_rsp;
375 375
376 r->send_sge.length = sizeof(*r->req.rsp); 376 r->req.p2p_client = &ndev->device->dev;
377 r->send_sge.length = sizeof(*r->req.cqe);
377 r->send_sge.lkey = ndev->pd->local_dma_lkey; 378 r->send_sge.lkey = ndev->pd->local_dma_lkey;
378 379
379 r->send_cqe.done = nvmet_rdma_send_done; 380 r->send_cqe.done = nvmet_rdma_send_done;
@@ -388,7 +389,7 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
388 return 0; 389 return 0;
389 390
390out_free_rsp: 391out_free_rsp:
391 kfree(r->req.rsp); 392 kfree(r->req.cqe);
392out: 393out:
393 return -ENOMEM; 394 return -ENOMEM;
394} 395}
@@ -397,8 +398,8 @@ static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
397 struct nvmet_rdma_rsp *r) 398 struct nvmet_rdma_rsp *r)
398{ 399{
399 ib_dma_unmap_single(ndev->device, r->send_sge.addr, 400 ib_dma_unmap_single(ndev->device, r->send_sge.addr,
400 sizeof(*r->req.rsp), DMA_TO_DEVICE); 401 sizeof(*r->req.cqe), DMA_TO_DEVICE);
401 kfree(r->req.rsp); 402 kfree(r->req.cqe);
402} 403}
403 404
404static int 405static int
@@ -763,8 +764,6 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
763 cmd->send_sge.addr, cmd->send_sge.length, 764 cmd->send_sge.addr, cmd->send_sge.length,
764 DMA_TO_DEVICE); 765 DMA_TO_DEVICE);
765 766
766 cmd->req.p2p_client = &queue->dev->device->dev;
767
768 if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, 767 if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
769 &queue->nvme_sq, &nvmet_rdma_ops)) 768 &queue->nvme_sq, &nvmet_rdma_ops))
770 return; 769 return;
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index ad0df786fe93..69b83fa0c76c 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -161,14 +161,14 @@ static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
161 161
162static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) 162static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
163{ 163{
164 return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status; 164 return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
165} 165}
166 166
167static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) 167static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
168{ 168{
169 return !nvme_is_write(cmd->req.cmd) && 169 return !nvme_is_write(cmd->req.cmd) &&
170 cmd->req.transfer_len > 0 && 170 cmd->req.transfer_len > 0 &&
171 !cmd->req.rsp->status; 171 !cmd->req.cqe->status;
172} 172}
173 173
174static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) 174static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
@@ -371,13 +371,14 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
371 cmd->state = NVMET_TCP_SEND_DATA_PDU; 371 cmd->state = NVMET_TCP_SEND_DATA_PDU;
372 372
373 pdu->hdr.type = nvme_tcp_c2h_data; 373 pdu->hdr.type = nvme_tcp_c2h_data;
374 pdu->hdr.flags = NVME_TCP_F_DATA_LAST; 374 pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
375 NVME_TCP_F_DATA_SUCCESS : 0);
375 pdu->hdr.hlen = sizeof(*pdu); 376 pdu->hdr.hlen = sizeof(*pdu);
376 pdu->hdr.pdo = pdu->hdr.hlen + hdgst; 377 pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
377 pdu->hdr.plen = 378 pdu->hdr.plen =
378 cpu_to_le32(pdu->hdr.hlen + hdgst + 379 cpu_to_le32(pdu->hdr.hlen + hdgst +
379 cmd->req.transfer_len + ddgst); 380 cmd->req.transfer_len + ddgst);
380 pdu->command_id = cmd->req.rsp->command_id; 381 pdu->command_id = cmd->req.cqe->command_id;
381 pdu->data_length = cpu_to_le32(cmd->req.transfer_len); 382 pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
382 pdu->data_offset = cpu_to_le32(cmd->wbytes_done); 383 pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
383 384
@@ -542,8 +543,19 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
542 cmd->state = NVMET_TCP_SEND_DDGST; 543 cmd->state = NVMET_TCP_SEND_DDGST;
543 cmd->offset = 0; 544 cmd->offset = 0;
544 } else { 545 } else {
545 nvmet_setup_response_pdu(cmd); 546 if (queue->nvme_sq.sqhd_disabled) {
547 cmd->queue->snd_cmd = NULL;
548 nvmet_tcp_put_cmd(cmd);
549 } else {
550 nvmet_setup_response_pdu(cmd);
551 }
552 }
553
554 if (queue->nvme_sq.sqhd_disabled) {
555 kfree(cmd->iov);
556 sgl_free(cmd->req.sg);
546 } 557 }
558
547 return 1; 559 return 1;
548 560
549} 561}
@@ -619,7 +631,13 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
619 return ret; 631 return ret;
620 632
621 cmd->offset += ret; 633 cmd->offset += ret;
622 nvmet_setup_response_pdu(cmd); 634
635 if (queue->nvme_sq.sqhd_disabled) {
636 cmd->queue->snd_cmd = NULL;
637 nvmet_tcp_put_cmd(cmd);
638 } else {
639 nvmet_setup_response_pdu(cmd);
640 }
623 return 1; 641 return 1;
624} 642}
625 643
@@ -756,12 +774,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
756 return -EPROTO; 774 return -EPROTO;
757 } 775 }
758 776
759 if (icreq->maxr2t != 0) {
760 pr_err("queue %d: unsupported maxr2t %d\n", queue->idx,
761 le32_to_cpu(icreq->maxr2t) + 1);
762 return -EPROTO;
763 }
764
765 queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); 777 queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
766 queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); 778 queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
767 if (queue->hdr_digest || queue->data_digest) { 779 if (queue->hdr_digest || queue->data_digest) {
@@ -1206,7 +1218,7 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1206 sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1218 sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1207 if (!c->rsp_pdu) 1219 if (!c->rsp_pdu)
1208 goto out_free_cmd; 1220 goto out_free_cmd;
1209 c->req.rsp = &c->rsp_pdu->cqe; 1221 c->req.cqe = &c->rsp_pdu->cqe;
1210 1222
1211 c->data_pdu = page_frag_alloc(&queue->pf_cache, 1223 c->data_pdu = page_frag_alloc(&queue->pf_cache,
1212 sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1224 sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
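The target tcp.c hunks wire up the success optimisation: when SQ head reporting is disabled, the C2H data PDU carries NVME_TCP_F_DATA_SUCCESS and no separate response capsule is sent, so the command and its buffers are released once the data and digest are on the wire. A condensed sketch of the header flags, using the real PDU definitions from linux/nvme-tcp.h but a hypothetical helper:

#include <linux/types.h>
#include <linux/nvme-tcp.h>

/* Sketch: mark a C2H data PDU as the final, implicit completion when the
 * submission queue runs with SQ head updates disabled. */
static void my_setup_c2h_flags(struct nvme_tcp_data_pdu *pdu, bool sqhd_disabled)
{
        pdu->hdr.flags = NVME_TCP_F_DATA_LAST |
                         (sqhd_disabled ? NVME_TCP_F_DATA_SUCCESS : 0);
}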
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 2b2bc4b49d78..ebc80354714c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2256,22 +2256,6 @@ static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp,
2256 2256
2257#define READ_CAPACITY_RETRIES_ON_RESET 10 2257#define READ_CAPACITY_RETRIES_ON_RESET 10
2258 2258
2259/*
2260 * Ensure that we don't overflow sector_t when CONFIG_LBDAF is not set
2261 * and the reported logical block size is bigger than 512 bytes. Note
2262 * that last_sector is a u64 and therefore logical_to_sectors() is not
2263 * applicable.
2264 */
2265static bool sd_addressable_capacity(u64 lba, unsigned int sector_size)
2266{
2267 u64 last_sector = (lba + 1ULL) << (ilog2(sector_size) - 9);
2268
2269 if (sizeof(sector_t) == 4 && last_sector > U32_MAX)
2270 return false;
2271
2272 return true;
2273}
2274
2275static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, 2259static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
2276 unsigned char *buffer) 2260 unsigned char *buffer)
2277{ 2261{
@@ -2337,14 +2321,6 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
2337 return -ENODEV; 2321 return -ENODEV;
2338 } 2322 }
2339 2323
2340 if (!sd_addressable_capacity(lba, sector_size)) {
2341 sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a "
2342 "kernel compiled with support for large block "
2343 "devices.\n");
2344 sdkp->capacity = 0;
2345 return -EOVERFLOW;
2346 }
2347
2348 /* Logical blocks per physical block exponent */ 2324 /* Logical blocks per physical block exponent */
2349 sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size; 2325 sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size;
2350 2326
@@ -2426,14 +2402,6 @@ static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp,
2426 return sector_size; 2402 return sector_size;
2427 } 2403 }
2428 2404
2429 if (!sd_addressable_capacity(lba, sector_size)) {
2430 sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a "
2431 "kernel compiled with support for large block "
2432 "devices.\n");
2433 sdkp->capacity = 0;
2434 return -EOVERFLOW;
2435 }
2436
2437 sdkp->capacity = lba + 1; 2405 sdkp->capacity = lba + 1;
2438 sdkp->physical_block_size = sector_size; 2406 sdkp->physical_block_size = sector_size;
2439 return sector_size; 2407 return sector_size;
@@ -3325,6 +3293,7 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
3325 if (sdp->removable) { 3293 if (sdp->removable) {
3326 gd->flags |= GENHD_FL_REMOVABLE; 3294 gd->flags |= GENHD_FL_REMOVABLE;
3327 gd->events |= DISK_EVENT_MEDIA_CHANGE; 3295 gd->events |= DISK_EVENT_MEDIA_CHANGE;
3296 gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
3328 } 3297 }
3329 3298
3330 blk_pm_runtime_init(sdp->request_queue, dev); 3299 blk_pm_runtime_init(sdp->request_queue, dev);
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 039c27c2d7b3..c3f443d5aea8 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -716,6 +716,7 @@ static int sr_probe(struct device *dev)
716 disk->fops = &sr_bdops; 716 disk->fops = &sr_bdops;
717 disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; 717 disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
718 disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST; 718 disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
719 disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
719 720
720 blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT); 721 blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);
721 722
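Both sd and sr now set gd->event_flags explicitly; with the disk-event rework in this series a driver has to opt in to having its media events polled and delivered as uevents. A minimal sketch for a hypothetical removable-media driver:

#include <linux/genhd.h>

/* Sketch: declare which media events the device generates and how the
 * block core should surface them (poll for them and emit uevents). */
static void my_setup_disk_events(struct gendisk *gd)
{
        gd->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
        gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
}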
diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c
index c64ec76643d4..746685f90564 100644
--- a/drivers/staging/erofs/data.c
+++ b/drivers/staging/erofs/data.c
@@ -18,7 +18,6 @@
18static inline void read_endio(struct bio *bio) 18static inline void read_endio(struct bio *bio)
19{ 19{
20 struct super_block *const sb = bio->bi_private; 20 struct super_block *const sb = bio->bi_private;
21 int i;
22 struct bio_vec *bvec; 21 struct bio_vec *bvec;
23 blk_status_t err = bio->bi_status; 22 blk_status_t err = bio->bi_status;
24 struct bvec_iter_all iter_all; 23 struct bvec_iter_all iter_all;
@@ -28,7 +27,7 @@ static inline void read_endio(struct bio *bio)
28 err = BLK_STS_IOERR; 27 err = BLK_STS_IOERR;
29 } 28 }
30 29
31 bio_for_each_segment_all(bvec, bio, i, iter_all) { 30 bio_for_each_segment_all(bvec, bio, iter_all) {
32 struct page *page = bvec->bv_page; 31 struct page *page = bvec->bv_page;
33 32
34 /* page is already locked */ 33 /* page is already locked */
diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c
index a2e03c932102..9ecaa872bae8 100644
--- a/drivers/staging/erofs/unzip_vle.c
+++ b/drivers/staging/erofs/unzip_vle.c
@@ -846,11 +846,10 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
846{ 846{
847 struct erofs_sb_info *sbi = NULL; 847 struct erofs_sb_info *sbi = NULL;
848 blk_status_t err = bio->bi_status; 848 blk_status_t err = bio->bi_status;
849 unsigned int i;
850 struct bio_vec *bvec; 849 struct bio_vec *bvec;
851 struct bvec_iter_all iter_all; 850 struct bvec_iter_all iter_all;
852 851
853 bio_for_each_segment_all(bvec, bio, i, iter_all) { 852 bio_for_each_segment_all(bvec, bio, iter_all) {
854 struct page *page = bvec->bv_page; 853 struct page *page = bvec->bv_page;
855 bool cachemngd = false; 854 bool cachemngd = false;
856 855
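From here to the end of the series, every bio_for_each_segment_all() caller moves to the three-argument form: the iteration state lives entirely in struct bvec_iter_all and the separate integer index is gone (callers that still need a position, such as btrfs_retry_endio below, keep their own counter). A minimal endio-style sketch with a hypothetical completion handler:

#include <linux/bio.h>
#include <linux/mm.h>

/* Sketch of the new iteration style: no integer index argument. */
static void my_read_endio(struct bio *bio)
{
        struct bvec_iter_all iter_all;
        struct bio_vec *bvec;
        int i = 0;              /* keep a counter only if a position is needed */

        bio_for_each_segment_all(bvec, bio, iter_all) {
                struct page *page = bvec->bv_page;

                SetPageUptodate(page);  /* per-segment work goes here */
                i++;
        }
        bio_put(bio);
}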
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
index f3fbb700f569..05a286d24f14 100644
--- a/drivers/xen/biomerge.c
+++ b/drivers/xen/biomerge.c
@@ -4,12 +4,13 @@
4#include <xen/xen.h> 4#include <xen/xen.h>
5#include <xen/page.h> 5#include <xen/page.h>
6 6
7/* check if @page can be merged with 'vec1' */
7bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, 8bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
8 const struct bio_vec *vec2) 9 const struct page *page)
9{ 10{
10#if XEN_PAGE_SIZE == PAGE_SIZE 11#if XEN_PAGE_SIZE == PAGE_SIZE
11 unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); 12 unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page));
12 unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page)); 13 unsigned long bfn2 = pfn_to_bfn(page_to_pfn(page));
13 14
14 return bfn1 + PFN_DOWN(vec1->bv_offset + vec1->bv_len) == bfn2; 15 return bfn1 + PFN_DOWN(vec1->bv_offset + vec1->bv_len) == bfn2;
15#else 16#else
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9ee3117ee0bf..500aaa3e5990 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -210,7 +210,6 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
210 struct bio bio; 210 struct bio bio;
211 ssize_t ret; 211 ssize_t ret;
212 blk_qc_t qc; 212 blk_qc_t qc;
213 int i;
214 struct bvec_iter_all iter_all; 213 struct bvec_iter_all iter_all;
215 214
216 if ((pos | iov_iter_alignment(iter)) & 215 if ((pos | iov_iter_alignment(iter)) &
@@ -261,7 +260,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
261 } 260 }
262 __set_current_state(TASK_RUNNING); 261 __set_current_state(TASK_RUNNING);
263 262
264 bio_for_each_segment_all(bvec, &bio, i, iter_all) { 263 bio_for_each_segment_all(bvec, &bio, iter_all) {
265 if (should_dirty && !PageCompound(bvec->bv_page)) 264 if (should_dirty && !PageCompound(bvec->bv_page))
266 set_page_dirty_lock(bvec->bv_page); 265 set_page_dirty_lock(bvec->bv_page);
267 if (!bio_flagged(&bio, BIO_NO_PAGE_REF)) 266 if (!bio_flagged(&bio, BIO_NO_PAGE_REF))
@@ -340,9 +339,8 @@ static void blkdev_bio_end_io(struct bio *bio)
340 if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { 339 if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
341 struct bvec_iter_all iter_all; 340 struct bvec_iter_all iter_all;
342 struct bio_vec *bvec; 341 struct bio_vec *bvec;
343 int i;
344 342
345 bio_for_each_segment_all(bvec, bio, i, iter_all) 343 bio_for_each_segment_all(bvec, bio, iter_all)
346 put_page(bvec->bv_page); 344 put_page(bvec->bv_page);
347 } 345 }
348 bio_put(bio); 346 bio_put(bio);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1463e14af2fb..daf7908d1e35 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -160,7 +160,6 @@ csum_failed:
160 if (cb->errors) { 160 if (cb->errors) {
161 bio_io_error(cb->orig_bio); 161 bio_io_error(cb->orig_bio);
162 } else { 162 } else {
163 int i;
164 struct bio_vec *bvec; 163 struct bio_vec *bvec;
165 struct bvec_iter_all iter_all; 164 struct bvec_iter_all iter_all;
166 165
@@ -169,7 +168,7 @@ csum_failed:
169 * checked so the end_io handlers know about it 168 * checked so the end_io handlers know about it
170 */ 169 */
171 ASSERT(!bio_flagged(bio, BIO_CLONED)); 170 ASSERT(!bio_flagged(bio, BIO_CLONED));
172 bio_for_each_segment_all(bvec, cb->orig_bio, i, iter_all) 171 bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
173 SetPageChecked(bvec->bv_page); 172 SetPageChecked(bvec->bv_page);
174 173
175 bio_endio(cb->orig_bio); 174 bio_endio(cb->orig_bio);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 663efce22d98..deb74a8c191a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -849,11 +849,11 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
849{ 849{
850 struct bio_vec *bvec; 850 struct bio_vec *bvec;
851 struct btrfs_root *root; 851 struct btrfs_root *root;
852 int i, ret = 0; 852 int ret = 0;
853 struct bvec_iter_all iter_all; 853 struct bvec_iter_all iter_all;
854 854
855 ASSERT(!bio_flagged(bio, BIO_CLONED)); 855 ASSERT(!bio_flagged(bio, BIO_CLONED));
856 bio_for_each_segment_all(bvec, bio, i, iter_all) { 856 bio_for_each_segment_all(bvec, bio, iter_all) {
857 root = BTRFS_I(bvec->bv_page->mapping->host)->root; 857 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
858 ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); 858 ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
859 if (ret) 859 if (ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 13fca7bfc1f2..db337e53aab3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2582,11 +2582,10 @@ static void end_bio_extent_writepage(struct bio *bio)
2582 struct bio_vec *bvec; 2582 struct bio_vec *bvec;
2583 u64 start; 2583 u64 start;
2584 u64 end; 2584 u64 end;
2585 int i;
2586 struct bvec_iter_all iter_all; 2585 struct bvec_iter_all iter_all;
2587 2586
2588 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2587 ASSERT(!bio_flagged(bio, BIO_CLONED));
2589 bio_for_each_segment_all(bvec, bio, i, iter_all) { 2588 bio_for_each_segment_all(bvec, bio, iter_all) {
2590 struct page *page = bvec->bv_page; 2589 struct page *page = bvec->bv_page;
2591 struct inode *inode = page->mapping->host; 2590 struct inode *inode = page->mapping->host;
2592 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2591 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2654,11 +2653,10 @@ static void end_bio_extent_readpage(struct bio *bio)
2654 u64 extent_len = 0; 2653 u64 extent_len = 0;
2655 int mirror; 2654 int mirror;
2656 int ret; 2655 int ret;
2657 int i;
2658 struct bvec_iter_all iter_all; 2656 struct bvec_iter_all iter_all;
2659 2657
2660 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2658 ASSERT(!bio_flagged(bio, BIO_CLONED));
2661 bio_for_each_segment_all(bvec, bio, i, iter_all) { 2659 bio_for_each_segment_all(bvec, bio, iter_all) {
2662 struct page *page = bvec->bv_page; 2660 struct page *page = bvec->bv_page;
2663 struct inode *inode = page->mapping->host; 2661 struct inode *inode = page->mapping->host;
2664 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2662 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3755,11 +3753,11 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
3755{ 3753{
3756 struct bio_vec *bvec; 3754 struct bio_vec *bvec;
3757 struct extent_buffer *eb; 3755 struct extent_buffer *eb;
3758 int i, done; 3756 int done;
3759 struct bvec_iter_all iter_all; 3757 struct bvec_iter_all iter_all;
3760 3758
3761 ASSERT(!bio_flagged(bio, BIO_CLONED)); 3759 ASSERT(!bio_flagged(bio, BIO_CLONED));
3762 bio_for_each_segment_all(bvec, bio, i, iter_all) { 3760 bio_for_each_segment_all(bvec, bio, iter_all) {
3763 struct page *page = bvec->bv_page; 3761 struct page *page = bvec->bv_page;
3764 3762
3765 eb = (struct extent_buffer *)page->private; 3763 eb = (struct extent_buffer *)page->private;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 56929daea0f7..9aba9660efe5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7872,7 +7872,6 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
7872 struct inode *inode = done->inode; 7872 struct inode *inode = done->inode;
7873 struct bio_vec *bvec; 7873 struct bio_vec *bvec;
7874 struct extent_io_tree *io_tree, *failure_tree; 7874 struct extent_io_tree *io_tree, *failure_tree;
7875 int i;
7876 struct bvec_iter_all iter_all; 7875 struct bvec_iter_all iter_all;
7877 7876
7878 if (bio->bi_status) 7877 if (bio->bi_status)
@@ -7885,7 +7884,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
7885 7884
7886 done->uptodate = 1; 7885 done->uptodate = 1;
7887 ASSERT(!bio_flagged(bio, BIO_CLONED)); 7886 ASSERT(!bio_flagged(bio, BIO_CLONED));
7888 bio_for_each_segment_all(bvec, bio, i, iter_all) 7887 bio_for_each_segment_all(bvec, bio, iter_all)
7889 clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, 7888 clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
7890 io_tree, done->start, bvec->bv_page, 7889 io_tree, done->start, bvec->bv_page,
7891 btrfs_ino(BTRFS_I(inode)), 0); 7890 btrfs_ino(BTRFS_I(inode)), 0);
@@ -7963,7 +7962,7 @@ static void btrfs_retry_endio(struct bio *bio)
7963 struct bio_vec *bvec; 7962 struct bio_vec *bvec;
7964 int uptodate; 7963 int uptodate;
7965 int ret; 7964 int ret;
7966 int i; 7965 int i = 0;
7967 struct bvec_iter_all iter_all; 7966 struct bvec_iter_all iter_all;
7968 7967
7969 if (bio->bi_status) 7968 if (bio->bi_status)
@@ -7978,7 +7977,7 @@ static void btrfs_retry_endio(struct bio *bio)
7978 failure_tree = &BTRFS_I(inode)->io_failure_tree; 7977 failure_tree = &BTRFS_I(inode)->io_failure_tree;
7979 7978
7980 ASSERT(!bio_flagged(bio, BIO_CLONED)); 7979 ASSERT(!bio_flagged(bio, BIO_CLONED));
7981 bio_for_each_segment_all(bvec, bio, i, iter_all) { 7980 bio_for_each_segment_all(bvec, bio, iter_all) {
7982 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, 7981 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
7983 bvec->bv_offset, done->start, 7982 bvec->bv_offset, done->start,
7984 bvec->bv_len); 7983 bvec->bv_len);
@@ -7990,6 +7989,7 @@ static void btrfs_retry_endio(struct bio *bio)
7990 bvec->bv_offset); 7989 bvec->bv_offset);
7991 else 7990 else
7992 uptodate = 0; 7991 uptodate = 0;
7992 i++;
7993 } 7993 }
7994 7994
7995 done->uptodate = uptodate; 7995 done->uptodate = uptodate;
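Most callers of the updated iterator simply drop the index, but btrfs_retry_endio() above still needs a per-segment ordinal to pass to __readpage_endio_check(), so it now initializes and increments i by hand. The same pattern sketched in isolation, as a hedged example only; check_segment() is a hypothetical per-segment helper, not something from this series:

static void example_end_io_with_index(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;
	int i = 0;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		/* callers that still need a per-segment index keep it themselves */
		check_segment(i, bvec->bv_page, bvec->bv_offset, bvec->bv_len);
		i++;
	}
}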
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 67a6f7d47402..f3d0576dd327 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1442,12 +1442,11 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1442static void set_bio_pages_uptodate(struct bio *bio) 1442static void set_bio_pages_uptodate(struct bio *bio)
1443{ 1443{
1444 struct bio_vec *bvec; 1444 struct bio_vec *bvec;
1445 int i;
1446 struct bvec_iter_all iter_all; 1445 struct bvec_iter_all iter_all;
1447 1446
1448 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1447 ASSERT(!bio_flagged(bio, BIO_CLONED));
1449 1448
1450 bio_for_each_segment_all(bvec, bio, i, iter_all) 1449 bio_for_each_segment_all(bvec, bio, iter_all)
1451 SetPageUptodate(bvec->bv_page); 1450 SetPageUptodate(bvec->bv_page);
1452} 1451}
1453 1452
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 5759bcd018cd..8f3a8bc15d98 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -29,10 +29,9 @@
29static void __fscrypt_decrypt_bio(struct bio *bio, bool done) 29static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
30{ 30{
31 struct bio_vec *bv; 31 struct bio_vec *bv;
32 int i;
33 struct bvec_iter_all iter_all; 32 struct bvec_iter_all iter_all;
34 33
35 bio_for_each_segment_all(bv, bio, i, iter_all) { 34 bio_for_each_segment_all(bv, bio, iter_all) {
36 struct page *page = bv->bv_page; 35 struct page *page = bv->bv_page;
37 int ret = fscrypt_decrypt_page(page->mapping->host, page, 36 int ret = fscrypt_decrypt_page(page->mapping->host, page,
38 PAGE_SIZE, 0, page->index); 37 PAGE_SIZE, 0, page->index);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9bb015bc4a83..fbe885d68035 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -538,7 +538,6 @@ static struct bio *dio_await_one(struct dio *dio)
538static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) 538static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
539{ 539{
540 struct bio_vec *bvec; 540 struct bio_vec *bvec;
541 unsigned i;
542 blk_status_t err = bio->bi_status; 541 blk_status_t err = bio->bi_status;
543 542
544 if (err) { 543 if (err) {
@@ -553,7 +552,7 @@ static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
553 } else { 552 } else {
554 struct bvec_iter_all iter_all; 553 struct bvec_iter_all iter_all;
555 554
556 bio_for_each_segment_all(bvec, bio, i, iter_all) { 555 bio_for_each_segment_all(bvec, bio, iter_all) {
557 struct page *page = bvec->bv_page; 556 struct page *page = bvec->bv_page;
558 557
559 if (dio->op == REQ_OP_READ && !PageCompound(page) && 558 if (dio->op == REQ_OP_READ && !PageCompound(page) &&
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 3e9298e6a705..4690618a92e9 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -61,11 +61,10 @@ static void buffer_io_error(struct buffer_head *bh)
61 61
62static void ext4_finish_bio(struct bio *bio) 62static void ext4_finish_bio(struct bio *bio)
63{ 63{
64 int i;
65 struct bio_vec *bvec; 64 struct bio_vec *bvec;
66 struct bvec_iter_all iter_all; 65 struct bvec_iter_all iter_all;
67 66
68 bio_for_each_segment_all(bvec, bio, i, iter_all) { 67 bio_for_each_segment_all(bvec, bio, iter_all) {
69 struct page *page = bvec->bv_page; 68 struct page *page = bvec->bv_page;
70#ifdef CONFIG_FS_ENCRYPTION 69#ifdef CONFIG_FS_ENCRYPTION
71 struct page *data_page = NULL; 70 struct page *data_page = NULL;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3adadf461825..3629a74b7f94 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -71,7 +71,6 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
71static void mpage_end_io(struct bio *bio) 71static void mpage_end_io(struct bio *bio)
72{ 72{
73 struct bio_vec *bv; 73 struct bio_vec *bv;
74 int i;
75 struct bvec_iter_all iter_all; 74 struct bvec_iter_all iter_all;
76 75
77 if (ext4_bio_encrypted(bio)) { 76 if (ext4_bio_encrypted(bio)) {
@@ -82,7 +81,7 @@ static void mpage_end_io(struct bio *bio)
82 return; 81 return;
83 } 82 }
84 } 83 }
85 bio_for_each_segment_all(bv, bio, i, iter_all) { 84 bio_for_each_segment_all(bv, bio, iter_all) {
86 struct page *page = bv->bv_page; 85 struct page *page = bv->bv_page;
87 86
88 if (!bio->bi_status) { 87 if (!bio->bi_status) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index e7ae26e36c9c..38faf661e237 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1760,8 +1760,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1760 ext4_msg(sb, KERN_ERR, 1760 ext4_msg(sb, KERN_ERR,
1761 "filesystem too large to resize to %llu blocks safely", 1761 "filesystem too large to resize to %llu blocks safely",
1762 n_blocks_count); 1762 n_blocks_count);
1763 if (sizeof(sector_t) < 8)
1764 ext4_warning(sb, "CONFIG_LBDAF not enabled");
1765 return -EINVAL; 1763 return -EINVAL;
1766 } 1764 }
1767 1765
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 981f702848e7..0e63069b9d5b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2705,13 +2705,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
2705 loff_t res; 2705 loff_t res;
2706 loff_t upper_limit = MAX_LFS_FILESIZE; 2706 loff_t upper_limit = MAX_LFS_FILESIZE;
2707 2707
2708 /* small i_blocks in vfs inode? */ 2708 BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
2709 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 2709
2710 /* 2710 if (!has_huge_files) {
2711 * CONFIG_LBDAF is not enabled implies the inode
2712 * i_block represent total blocks in 512 bytes
2713 * 32 == size of vfs inode i_blocks * 8
2714 */
2715 upper_limit = (1LL << 32) - 1; 2711 upper_limit = (1LL << 32) - 1;
2716 2712
2717 /* total blocks in file system block size */ 2713 /* total blocks in file system block size */
@@ -2752,11 +2748,11 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
2752 * number of 512-byte sectors of the file. 2748 * number of 512-byte sectors of the file.
2753 */ 2749 */
2754 2750
2755 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 2751 if (!has_huge_files) {
2756 /* 2752 /*
2757 * !has_huge_files or CONFIG_LBDAF not enabled implies that 2753 * !has_huge_files or implies that the inode i_block field
2758 * the inode i_block field represents total file blocks in 2754 * represents total file blocks in 2^32 512-byte sectors ==
2759 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 2755 * size of vfs inode i_blocks * 8
2760 */ 2756 */
2761 upper_limit = (1LL << 32) - 1; 2757 upper_limit = (1LL << 32) - 1;
2762 2758
@@ -2896,18 +2892,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2896 ~EXT4_FEATURE_RO_COMPAT_SUPP)); 2892 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2897 return 0; 2893 return 0;
2898 } 2894 }
2899 /*
2900 * Large file size enabled file system can only be mounted
2901 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2902 */
2903 if (ext4_has_feature_huge_file(sb)) {
2904 if (sizeof(blkcnt_t) < sizeof(u64)) {
2905 ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2906 "cannot be mounted RDWR without "
2907 "CONFIG_LBDAF");
2908 return 0;
2909 }
2910 }
2911 if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { 2895 if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
2912 ext4_msg(sb, KERN_ERR, 2896 ext4_msg(sb, KERN_ERR,
2913 "Can't support bigalloc feature without " 2897 "Can't support bigalloc feature without "
@@ -4056,8 +4040,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4056 if (err) { 4040 if (err) {
4057 ext4_msg(sb, KERN_ERR, "filesystem" 4041 ext4_msg(sb, KERN_ERR, "filesystem"
4058 " too large to mount safely on this system"); 4042 " too large to mount safely on this system");
4059 if (sizeof(sector_t) < 8)
4060 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
4061 goto failed_mount; 4043 goto failed_mount;
4062 } 4044 }
4063 4045
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9727944139f2..64040e998439 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -86,10 +86,9 @@ static void __read_end_io(struct bio *bio)
86{ 86{
87 struct page *page; 87 struct page *page;
88 struct bio_vec *bv; 88 struct bio_vec *bv;
89 int i;
90 struct bvec_iter_all iter_all; 89 struct bvec_iter_all iter_all;
91 90
92 bio_for_each_segment_all(bv, bio, i, iter_all) { 91 bio_for_each_segment_all(bv, bio, iter_all) {
93 page = bv->bv_page; 92 page = bv->bv_page;
94 93
95 /* PG_error was set if any post_read step failed */ 94 /* PG_error was set if any post_read step failed */
@@ -164,7 +163,6 @@ static void f2fs_write_end_io(struct bio *bio)
164{ 163{
165 struct f2fs_sb_info *sbi = bio->bi_private; 164 struct f2fs_sb_info *sbi = bio->bi_private;
166 struct bio_vec *bvec; 165 struct bio_vec *bvec;
167 int i;
168 struct bvec_iter_all iter_all; 166 struct bvec_iter_all iter_all;
169 167
170 if (time_to_inject(sbi, FAULT_WRITE_IO)) { 168 if (time_to_inject(sbi, FAULT_WRITE_IO)) {
@@ -172,7 +170,7 @@ static void f2fs_write_end_io(struct bio *bio)
172 bio->bi_status = BLK_STS_IOERR; 170 bio->bi_status = BLK_STS_IOERR;
173 } 171 }
174 172
175 bio_for_each_segment_all(bvec, bio, i, iter_all) { 173 bio_for_each_segment_all(bvec, bio, iter_all) {
176 struct page *page = bvec->bv_page; 174 struct page *page = bvec->bv_page;
177 enum count_type type = WB_DATA_TYPE(page); 175 enum count_type type = WB_DATA_TYPE(page);
178 176
@@ -349,7 +347,6 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
349{ 347{
350 struct bio_vec *bvec; 348 struct bio_vec *bvec;
351 struct page *target; 349 struct page *target;
352 int i;
353 struct bvec_iter_all iter_all; 350 struct bvec_iter_all iter_all;
354 351
355 if (!io->bio) 352 if (!io->bio)
@@ -358,7 +355,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
358 if (!inode && !page && !ino) 355 if (!inode && !page && !ino)
359 return true; 356 return true;
360 357
361 bio_for_each_segment_all(bvec, io->bio, i, iter_all) { 358 bio_for_each_segment_all(bvec, io->bio, iter_all) {
362 359
363 if (bvec->bv_page->mapping) 360 if (bvec->bv_page->mapping)
364 target = bvec->bv_page; 361 target = bvec->bv_page;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3ed2b088dcfd..6a1e499543f5 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,5 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on (64BIT || LBDAF)
4 select FS_POSIX_ACL 3 select FS_POSIX_ACL
5 select CRC32 4 select CRC32
6 select LIBCRC32C 5 select LIBCRC32C
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8722c60b11fe..6f09b5e3dd6e 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -207,7 +207,6 @@ static void gfs2_end_log_write(struct bio *bio)
207 struct gfs2_sbd *sdp = bio->bi_private; 207 struct gfs2_sbd *sdp = bio->bi_private;
208 struct bio_vec *bvec; 208 struct bio_vec *bvec;
209 struct page *page; 209 struct page *page;
210 int i;
211 struct bvec_iter_all iter_all; 210 struct bvec_iter_all iter_all;
212 211
213 if (bio->bi_status) { 212 if (bio->bi_status) {
@@ -216,7 +215,7 @@ static void gfs2_end_log_write(struct bio *bio)
216 wake_up(&sdp->sd_logd_waitq); 215 wake_up(&sdp->sd_logd_waitq);
217 } 216 }
218 217
219 bio_for_each_segment_all(bvec, bio, i, iter_all) { 218 bio_for_each_segment_all(bvec, bio, iter_all) {
220 page = bvec->bv_page; 219 page = bvec->bv_page;
221 if (page_has_buffers(page)) 220 if (page_has_buffers(page))
222 gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); 221 gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 3201342404a7..ff86e1d4f8ff 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -189,10 +189,9 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
189static void gfs2_meta_read_endio(struct bio *bio) 189static void gfs2_meta_read_endio(struct bio *bio)
190{ 190{
191 struct bio_vec *bvec; 191 struct bio_vec *bvec;
192 int i;
193 struct bvec_iter_all iter_all; 192 struct bvec_iter_all iter_all;
194 193
195 bio_for_each_segment_all(bvec, bio, i, iter_all) { 194 bio_for_each_segment_all(bvec, bio, iter_all) {
196 struct page *page = bvec->bv_page; 195 struct page *page = bvec->bv_page;
197 struct buffer_head *bh = page_buffers(page); 196 struct buffer_head *bh = page_buffers(page);
198 unsigned int len = bvec->bv_len; 197 unsigned int len = bvec->bv_len;
diff --git a/fs/iomap.c b/fs/iomap.c
index 9ef049d61e8a..23ef63fd1669 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -245,10 +245,9 @@ iomap_read_end_io(struct bio *bio)
245{ 245{
246 int error = blk_status_to_errno(bio->bi_status); 246 int error = blk_status_to_errno(bio->bi_status);
247 struct bio_vec *bvec; 247 struct bio_vec *bvec;
248 int i;
249 struct bvec_iter_all iter_all; 248 struct bvec_iter_all iter_all;
250 249
251 bio_for_each_segment_all(bvec, bio, i, iter_all) 250 bio_for_each_segment_all(bvec, bio, iter_all)
252 iomap_read_page_end_io(bvec, error); 251 iomap_read_page_end_io(bvec, error);
253 bio_put(bio); 252 bio_put(bio);
254} 253}
@@ -1599,9 +1598,8 @@ static void iomap_dio_bio_end_io(struct bio *bio)
1599 if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { 1598 if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
1600 struct bvec_iter_all iter_all; 1599 struct bvec_iter_all iter_all;
1601 struct bio_vec *bvec; 1600 struct bio_vec *bvec;
1602 int i;
1603 1601
1604 bio_for_each_segment_all(bvec, bio, i, iter_all) 1602 bio_for_each_segment_all(bvec, bio, iter_all)
1605 put_page(bvec->bv_page); 1603 put_page(bvec->bv_page);
1606 } 1604 }
1607 bio_put(bio); 1605 bio_put(bio);
diff --git a/fs/mpage.c b/fs/mpage.c
index 3f19da75178b..436a85260394 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -47,10 +47,9 @@
47static void mpage_end_io(struct bio *bio) 47static void mpage_end_io(struct bio *bio)
48{ 48{
49 struct bio_vec *bv; 49 struct bio_vec *bv;
50 int i;
51 struct bvec_iter_all iter_all; 50 struct bvec_iter_all iter_all;
52 51
53 bio_for_each_segment_all(bv, bio, i, iter_all) { 52 bio_for_each_segment_all(bv, bio, iter_all) {
54 struct page *page = bv->bv_page; 53 struct page *page = bv->bv_page;
55 page_endio(page, bio_op(bio), 54 page_endio(page, bio_op(bio),
56 blk_status_to_errno(bio->bi_status)); 55 blk_status_to_errno(bio->bi_status));
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 5f93cfacb3d1..69d02cf8cf37 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -121,7 +121,6 @@ config PNFS_FILE_LAYOUT
121config PNFS_BLOCK 121config PNFS_BLOCK
122 tristate 122 tristate
123 depends on NFS_V4_1 && BLK_DEV_DM 123 depends on NFS_V4_1 && BLK_DEV_DM
124 depends on 64BIT || LBDAF
125 default NFS_V4 124 default NFS_V4
126 125
127config PNFS_FLEXFILE_LAYOUT 126config PNFS_FLEXFILE_LAYOUT
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7982a93e630f..8821bc7b9c72 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -594,7 +594,6 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
594 */ 594 */
595 595
596#if BITS_PER_LONG == 32 596#if BITS_PER_LONG == 32
597# if defined(CONFIG_LBDAF)
598 BUILD_BUG_ON(sizeof(sector_t) != 8); 597 BUILD_BUG_ON(sizeof(sector_t) != 8);
599 /* 598 /*
600 * We might be limited by page cache size. 599 * We might be limited by page cache size.
@@ -608,15 +607,6 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
608 */ 607 */
609 bitshift = 31; 608 bitshift = 31;
610 } 609 }
611# else
612 /*
613 * We are limited by the size of sector_t. Use block size, as
614 * that's what we expose to the VFS.
615 */
616 bytes = 1 << bbits;
617 trim = 1;
618 bitshift = 31;
619# endif
620#endif 610#endif
621 611
622 /* 612 /*
diff --git a/fs/stack.c b/fs/stack.c
index a54e33ed10f1..664ed35558bd 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -21,11 +21,10 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
21 i_size = i_size_read(src); 21 i_size = i_size_read(src);
22 22
23 /* 23 /*
24 * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to 24 * But on 32-bit, we ought to make an effort to keep the two halves of
25 * keep the two halves of i_blocks in sync despite SMP or PREEMPT - 25 * i_blocks in sync despite SMP or PREEMPT - though stat's
26 * though stat's generic_fillattr() doesn't bother, and we won't be 26 * generic_fillattr() doesn't bother, and we won't be applying quotas
27 * applying quotas (where i_blocks does become important) at the 27 * (where i_blocks does become important) at the upper level.
28 * upper level.
29 * 28 *
30 * We don't actually know what locking is used at the lower level; 29 * We don't actually know what locking is used at the lower level;
31 * but if it's a filesystem that supports quotas, it will be using 30 * but if it's a filesystem that supports quotas, it will be using
@@ -44,9 +43,9 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
44 * include/linux/fs.h). We don't necessarily hold i_mutex when this 43 * include/linux/fs.h). We don't necessarily hold i_mutex when this
45 * is called, so take i_lock for that case. 44 * is called, so take i_lock for that case.
46 * 45 *
47 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the 46 * And if on 32-bit, continue our effort to keep the two halves of
48 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock 47 * i_blocks in sync despite SMP or PREEMPT: use i_lock for that case
49 * for that case too, and do both at once by combining the tests. 48 * too, and do both at once by combining the tests.
50 * 49 *
51 * There is none of this locking overhead in the 64-bit case. 50 * There is none of this locking overhead in the 64-bit case.
52 */ 51 */
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 457ac9f97377..99af5e5bda9f 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,7 +1,6 @@
1config XFS_FS 1config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on (64BIT || LBDAF)
5 select EXPORTFS 4 select EXPORTFS
6 select LIBCRC32C 5 select LIBCRC32C
7 select FS_IOMAP 6 select FS_IOMAP
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 09ac1bb4c2b7..a6f0f4761a37 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -98,7 +98,6 @@ xfs_destroy_ioend(
98 98
99 for (bio = &ioend->io_inline_bio; bio; bio = next) { 99 for (bio = &ioend->io_inline_bio; bio; bio = next) {
100 struct bio_vec *bvec; 100 struct bio_vec *bvec;
101 int i;
102 struct bvec_iter_all iter_all; 101 struct bvec_iter_all iter_all;
103 102
104 /* 103 /*
@@ -111,7 +110,7 @@ xfs_destroy_ioend(
111 next = bio->bi_private; 110 next = bio->bi_private;
112 111
113 /* walk each page on bio, ending page IO on them */ 112 /* walk each page on bio, ending page IO on them */
114 bio_for_each_segment_all(bvec, bio, i, iter_all) 113 bio_for_each_segment_all(bvec, bio, iter_all)
115 xfs_finish_page_writeback(inode, bvec, error); 114 xfs_finish_page_writeback(inode, bvec, error);
116 bio_put(bio); 115 bio_put(bio);
117 } 116 }
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b56c6e585ece..a14d11d78bd8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -534,26 +534,18 @@ xfs_max_file_offset(
534 534
535 /* Figure out maximum filesize, on Linux this can depend on 535 /* Figure out maximum filesize, on Linux this can depend on
536 * the filesystem blocksize (on 32 bit platforms). 536 * the filesystem blocksize (on 32 bit platforms).
537 * __block_write_begin does this in an [unsigned] long... 537 * __block_write_begin does this in an [unsigned] long long...
538 * page->index << (PAGE_SHIFT - bbits) 538 * page->index << (PAGE_SHIFT - bbits)
539 * So, for page sized blocks (4K on 32 bit platforms), 539 * So, for page sized blocks (4K on 32 bit platforms),
540 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is 540 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
541 * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) 541 * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
542 * but for smaller blocksizes it is less (bbits = log2 bsize). 542 * but for smaller blocksizes it is less (bbits = log2 bsize).
543 * Note1: get_block_t takes a long (implicit cast from above)
544 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
545 * can optionally convert the [unsigned] long from above into
546 * an [unsigned] long long.
547 */ 543 */
548 544
549#if BITS_PER_LONG == 32 545#if BITS_PER_LONG == 32
550# if defined(CONFIG_LBDAF)
551 ASSERT(sizeof(sector_t) == 8); 546 ASSERT(sizeof(sector_t) == 8);
552 pagefactor = PAGE_SIZE; 547 pagefactor = PAGE_SIZE;
553 bitshift = BITS_PER_LONG; 548 bitshift = BITS_PER_LONG;
554# else
555 pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);
556# endif
557#endif 549#endif
558 550
559 return (((uint64_t)pagefactor) << bitshift) - 1; 551 return (((uint64_t)pagefactor) << bitshift) - 1;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e584673c1881..ea73df36529a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -1,19 +1,6 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 3 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 *
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public Licens
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */ 4 */
18#ifndef __LINUX_BIO_H 5#ifndef __LINUX_BIO_H
19#define __LINUX_BIO_H 6#define __LINUX_BIO_H
@@ -134,9 +121,8 @@ static inline bool bio_next_segment(const struct bio *bio,
134 * drivers should _never_ use the all version - the bio may have been split 121 * drivers should _never_ use the all version - the bio may have been split
135 * before it got to the driver and the driver won't own all of it 122 * before it got to the driver and the driver won't own all of it
136 */ 123 */
137#define bio_for_each_segment_all(bvl, bio, i, iter) \ 124#define bio_for_each_segment_all(bvl, bio, iter) \
138 for (i = 0, bvl = bvec_init_iter_all(&iter); \ 125 for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); )
139 bio_next_segment((bio), &iter); i++)
140 126
141static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, 127static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
142 unsigned bytes) 128 unsigned bytes)
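With the index parameter gone from bio_for_each_segment_all(), completion handlers iterate with just the bvec pointer and a struct bvec_iter_all on the stack. A minimal hedged sketch of a read-completion handler against the new three-argument form; the function name is illustrative and the body mirrors the fs/mpage.c pattern updated earlier in this diff:

static void example_read_end_io(struct bio *bio)
{
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;

	/* no integer index any more: all iterator state lives in iter_all */
	bio_for_each_segment_all(bv, bio, iter_all)
		page_endio(bv->bv_page, bio_op(bio),
			   blk_status_to_errno(bio->bi_status));
	bio_put(bio);
}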
diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h
index 7b6ecf9ac4c3..5cc5f0f36218 100644
--- a/include/linux/blk-mq-rdma.h
+++ b/include/linux/blk-mq-rdma.h
@@ -1,3 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1#ifndef _LINUX_BLK_MQ_RDMA_H 2#ifndef _LINUX_BLK_MQ_RDMA_H
2#define _LINUX_BLK_MQ_RDMA_H 3#define _LINUX_BLK_MQ_RDMA_H
3 4
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index db29928de467..15d1aa53d96c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -70,6 +70,8 @@ struct blk_mq_hw_ctx {
70 struct dentry *sched_debugfs_dir; 70 struct dentry *sched_debugfs_dir;
71#endif 71#endif
72 72
73 struct list_head hctx_list;
74
73 /* Must be the last member - see also blk_mq_hw_ctx_size(). */ 75 /* Must be the last member - see also blk_mq_hw_ctx_size(). */
74 struct srcu_struct srcu[0]; 76 struct srcu_struct srcu[0];
75}; 77};
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 791fee35df88..be418275763c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -215,21 +215,24 @@ struct bio {
215/* 215/*
216 * bio flags 216 * bio flags
217 */ 217 */
218#define BIO_NO_PAGE_REF 0 /* don't put release vec pages */ 218enum {
219#define BIO_SEG_VALID 1 /* bi_phys_segments valid */ 219 BIO_NO_PAGE_REF, /* don't put release vec pages */
220#define BIO_CLONED 2 /* doesn't own data */ 220 BIO_SEG_VALID, /* bi_phys_segments valid */
221#define BIO_BOUNCED 3 /* bio is a bounce bio */ 221 BIO_CLONED, /* doesn't own data */
222#define BIO_USER_MAPPED 4 /* contains user pages */ 222 BIO_BOUNCED, /* bio is a bounce bio */
223#define BIO_NULL_MAPPED 5 /* contains invalid user pages */ 223 BIO_USER_MAPPED, /* contains user pages */
224#define BIO_QUIET 6 /* Make BIO Quiet */ 224 BIO_NULL_MAPPED, /* contains invalid user pages */
225#define BIO_CHAIN 7 /* chained bio, ->bi_remaining in effect */ 225 BIO_QUIET, /* Make BIO Quiet */
226#define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ 226 BIO_CHAIN, /* chained bio, ->bi_remaining in effect */
227#define BIO_THROTTLED 9 /* This bio has already been subjected to 227 BIO_REFFED, /* bio has elevated ->bi_cnt */
228 BIO_THROTTLED, /* This bio has already been subjected to
228 * throttling rules. Don't do it again. */ 229 * throttling rules. Don't do it again. */
229#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion 230 BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion
230 * of this bio. */ 231 * of this bio. */
231#define BIO_QUEUE_ENTERED 11 /* can use blk_queue_enter_live() */ 232 BIO_QUEUE_ENTERED, /* can use blk_queue_enter_live() */
232#define BIO_TRACKED 12 /* set if bio goes through the rq_qos path */ 233 BIO_TRACKED, /* set if bio goes through the rq_qos path */
234 BIO_FLAG_LAST
235};
233 236
234/* See BVEC_POOL_OFFSET below before adding new flags */ 237/* See BVEC_POOL_OFFSET below before adding new flags */
235 238
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 317ab30d2904..1aafeb923e7b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -535,6 +535,13 @@ struct request_queue {
535 535
536 struct mutex sysfs_lock; 536 struct mutex sysfs_lock;
537 537
538 /*
539 * for reusing dead hctx instance in case of updating
540 * nr_hw_queues
541 */
542 struct list_head unused_hctx_list;
543 spinlock_t unused_hctx_lock;
544
538 atomic_t mq_freeze_depth; 545 atomic_t mq_freeze_depth;
539 546
540#if defined(CONFIG_BLK_DEV_BSG) 547#if defined(CONFIG_BLK_DEV_BSG)
@@ -640,6 +647,13 @@ static inline bool blk_account_rq(struct request *rq)
640 647
641#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) 648#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ)
642 649
650#define rq_dma_dir(rq) \
651 (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
652
653#define dma_map_bvec(dev, bv, dir, attrs) \
654 dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
655 (dir), (attrs))
656
643static inline bool queue_is_mq(struct request_queue *q) 657static inline bool queue_is_mq(struct request_queue *q)
644{ 658{
645 return q->mq_ops; 659 return q->mq_ops;
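The rq_dma_dir() and dma_map_bvec() helpers added above give drivers a shorthand for deriving the DMA direction from a request and mapping a single bio_vec. A hedged sketch under the assumption of a driver-owned struct device; the function name and the bare error handling are illustrative, not from this series:

static blk_status_t example_map_one_bvec(struct device *dev,
					 struct request *rq,
					 struct bio_vec *bv,
					 dma_addr_t *dma_addr)
{
	/* DMA direction follows the data direction of the request */
	*dma_addr = dma_map_bvec(dev, bv, rq_dma_dir(rq), 0);
	if (dma_mapping_error(dev, *dma_addr))
		return BLK_STS_RESOURCE;
	return BLK_STS_OK;
}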
@@ -931,6 +945,17 @@ static inline unsigned int blk_rq_payload_bytes(struct request *rq)
931 return blk_rq_bytes(rq); 945 return blk_rq_bytes(rq);
932} 946}
933 947
948/*
949 * Return the first full biovec in the request. The caller needs to check that
950 * there are any bvecs before calling this helper.
951 */
952static inline struct bio_vec req_bvec(struct request *rq)
953{
954 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
955 return rq->special_vec;
956 return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
957}
958
934static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, 959static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
935 int op) 960 int op)
936{ 961{
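req_bvec() above is intended for drivers that handle at most a single data segment per request, returning either the special payload vector or the first bvec of the first bio. A hedged sketch of combining it with the DMA helpers; the function name is illustrative, and the caller is assumed to have checked that the request actually carries data:

static blk_status_t example_map_simple_rq(struct device *dev,
					  struct request *rq,
					  dma_addr_t *dma_addr,
					  unsigned int *len)
{
	/* assumes blk_rq_payload_bytes(rq) > 0, as the comment above requires */
	struct bio_vec bv = req_bvec(rq);

	*len = bv.bv_len;
	*dma_addr = dma_map_bvec(dev, &bv, rq_dma_dir(rq), 0);
	if (dma_mapping_error(dev, *dma_addr))
		return BLK_STS_RESOURCE;
	return BLK_STS_OK;
}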
@@ -1051,7 +1076,6 @@ extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
1051extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, 1076extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
1052 sector_t offset); 1077 sector_t offset);
1053extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); 1078extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
1054extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
1055extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); 1079extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
1056extern int blk_queue_dma_drain(struct request_queue *q, 1080extern int blk_queue_dma_drain(struct request_queue *q,
1057 dma_drain_needed_fn *dma_drain_needed, 1081 dma_drain_needed_fn *dma_drain_needed,
@@ -1547,6 +1571,17 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
1547 return bio_integrity_intervals(bi, sectors) * bi->tuple_size; 1571 return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
1548} 1572}
1549 1573
1574/*
1575 * Return the first bvec that contains integrity data. Only drivers that are
1576 * limited to a single integrity segment should use this helper.
1577 */
1578static inline struct bio_vec *rq_integrity_vec(struct request *rq)
1579{
1580 if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
1581 return NULL;
1582 return rq->bio->bi_integrity->bip_vec;
1583}
1584
1550#else /* CONFIG_BLK_DEV_INTEGRITY */ 1585#else /* CONFIG_BLK_DEV_INTEGRITY */
1551 1586
1552struct bio; 1587struct bio;
@@ -1621,6 +1656,11 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
1621 return 0; 1656 return 0;
1622} 1657}
1623 1658
1659static inline struct bio_vec *rq_integrity_vec(struct request *rq)
1660{
1661 return NULL;
1662}
1663
1624#endif /* CONFIG_BLK_DEV_INTEGRITY */ 1664#endif /* CONFIG_BLK_DEV_INTEGRITY */
1625 1665
1626struct block_device_operations { 1666struct block_device_operations {
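rq_integrity_vec() above (and its stub returning NULL when CONFIG_BLK_DEV_INTEGRITY is off) is only meant for drivers limited to one integrity segment. A hedged sketch of mapping the metadata buffer with it; the function name is illustrative and the error codes are placeholders:

static int example_map_metadata(struct device *dev, struct request *rq,
				dma_addr_t *meta_dma)
{
	struct bio_vec *iv = rq_integrity_vec(rq);

	/* NULL on stub kernels or if more than one integrity segment exists */
	if (!iv)
		return -EINVAL;
	*meta_dma = dma_map_bvec(dev, iv, rq_dma_dir(rq), 0);
	return dma_mapping_error(dev, *meta_dma) ? -ENOMEM : 0;
}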
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 7f14517a559b..960988d42f77 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -1,24 +1,10 @@
1/* SPDX-License-Identifier: GPL-2.0-or-later */
1/* 2/*
2 * BSG helper library 3 * BSG helper library
3 * 4 *
4 * Copyright (C) 2008 James Smart, Emulex Corporation 5 * Copyright (C) 2008 James Smart, Emulex Corporation
5 * Copyright (C) 2011 Red Hat, Inc. All rights reserved. 6 * Copyright (C) 2011 Red Hat, Inc. All rights reserved.
6 * Copyright (C) 2011 Mike Christie 7 * Copyright (C) 2011 Mike Christie
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 */ 8 */
23#ifndef _BLK_BSG_ 9#ifndef _BLK_BSG_
24#define _BLK_BSG_ 10#define _BLK_BSG_
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index ff13cbc1887d..a032f01e928c 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -1,21 +1,8 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * bvec iterator 3 * bvec iterator
3 * 4 *
4 * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com> 5 * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 *
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public Licens
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
19 */ 6 */
20#ifndef __LINUX_BVEC_ITER_H 7#ifndef __LINUX_BVEC_ITER_H
21#define __LINUX_BVEC_ITER_H 8#define __LINUX_BVEC_ITER_H
@@ -51,11 +38,6 @@ struct bvec_iter_all {
51 unsigned done; 38 unsigned done;
52}; 39};
53 40
54static inline struct page *bvec_nth_page(struct page *page, int idx)
55{
56 return idx == 0 ? page : nth_page(page, idx);
57}
58
59/* 41/*
60 * various member access, note that bio_data should of course not be used 42 * various member access, note that bio_data should of course not be used
61 * on highmem page vectors 43 * on highmem page vectors
@@ -92,8 +74,8 @@ static inline struct page *bvec_nth_page(struct page *page, int idx)
92 PAGE_SIZE - bvec_iter_offset((bvec), (iter))) 74 PAGE_SIZE - bvec_iter_offset((bvec), (iter)))
93 75
94#define bvec_iter_page(bvec, iter) \ 76#define bvec_iter_page(bvec, iter) \
95 bvec_nth_page(mp_bvec_iter_page((bvec), (iter)), \ 77 (mp_bvec_iter_page((bvec), (iter)) + \
96 mp_bvec_iter_page_idx((bvec), (iter))) 78 mp_bvec_iter_page_idx((bvec), (iter)))
97 79
98#define bvec_iter_bvec(bvec, iter) \ 80#define bvec_iter_bvec(bvec, iter) \
99((struct bio_vec) { \ 81((struct bio_vec) { \
@@ -157,11 +139,10 @@ static inline void bvec_advance(const struct bio_vec *bvec,
157 struct bio_vec *bv = &iter_all->bv; 139 struct bio_vec *bv = &iter_all->bv;
158 140
159 if (iter_all->done) { 141 if (iter_all->done) {
160 bv->bv_page = nth_page(bv->bv_page, 1); 142 bv->bv_page++;
161 bv->bv_offset = 0; 143 bv->bv_offset = 0;
162 } else { 144 } else {
163 bv->bv_page = bvec_nth_page(bvec->bv_page, bvec->bv_offset / 145 bv->bv_page = bvec->bv_page + (bvec->bv_offset >> PAGE_SHIFT);
164 PAGE_SIZE);
165 bv->bv_offset = bvec->bv_offset & ~PAGE_MASK; 146 bv->bv_offset = bvec->bv_offset & ~PAGE_MASK;
166 } 147 }
167 bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, 148 bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset,
@@ -184,7 +165,7 @@ static inline void mp_bvec_last_segment(const struct bio_vec *bvec,
184 unsigned total = bvec->bv_offset + bvec->bv_len; 165 unsigned total = bvec->bv_offset + bvec->bv_len;
185 unsigned last_page = (total - 1) / PAGE_SIZE; 166 unsigned last_page = (total - 1) / PAGE_SIZE;
186 167
187 seg->bv_page = bvec_nth_page(bvec->bv_page, last_page); 168 seg->bv_page = bvec->bv_page + last_page;
188 169
189 /* the whole segment is inside the last page */ 170 /* the whole segment is inside the last page */
190 if (bvec->bv_offset >= last_page * PAGE_SIZE) { 171 if (bvec->bv_offset >= last_page * PAGE_SIZE) {
@@ -196,9 +177,4 @@ static inline void mp_bvec_last_segment(const struct bio_vec *bvec,
196 } 177 }
197} 178}
198 179
199#define mp_bvec_for_each_page(pg, bv, i) \
200 for (i = (bv)->bv_offset / PAGE_SIZE; \
201 (i <= (((bv)->bv_offset + (bv)->bv_len - 1) / PAGE_SIZE)) && \
202 (pg = bvec_nth_page((bv)->bv_page, i)); i += 1)
203
204#endif /* __LINUX_BVEC_ITER_H */ 180#endif /* __LINUX_BVEC_ITER_H */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 06c0fd594097..8b5330dd5ac0 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -150,6 +150,13 @@ enum {
150 DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ 150 DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
151}; 151};
152 152
153enum {
154 /* Poll even if events_poll_msecs is unset */
155 DISK_EVENT_FLAG_POLL = 1 << 0,
156 /* Forward events to udev */
157 DISK_EVENT_FLAG_UEVENT = 1 << 1,
158};
159
153struct disk_part_tbl { 160struct disk_part_tbl {
154 struct rcu_head rcu_head; 161 struct rcu_head rcu_head;
155 int len; 162 int len;
@@ -184,8 +191,8 @@ struct gendisk {
184 char disk_name[DISK_NAME_LEN]; /* name of major driver */ 191 char disk_name[DISK_NAME_LEN]; /* name of major driver */
185 char *(*devnode)(struct gendisk *gd, umode_t *mode); 192 char *(*devnode)(struct gendisk *gd, umode_t *mode);
186 193
187 unsigned int events; /* supported events */ 194 unsigned short events; /* supported events */
188 unsigned int async_events; /* async events, subset of all */ 195 unsigned short event_flags; /* flags related to event processing */
189 196
190 /* Array of pointers to partitions indexed by partno. 197 /* Array of pointers to partitions indexed by partno.
191 * Protected with matching bdev lock but stat and other 198 * Protected with matching bdev lock but stat and other
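Together with the DISK_EVENT_FLAG_* enum above, the narrowed events/event_flags fields let a driver state both which media events it generates and how they are delivered (polled and/or forwarded to udev). A hedged sketch of how a removable-media driver might fill them in; the event choice and function name are illustrative:

static void example_setup_disk_events(struct gendisk *disk)
{
	/* declare the supported event and opt in to polling plus uevents */
	disk->events = DISK_EVENT_MEDIA_CHANGE;
	disk->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
}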
@@ -610,6 +617,7 @@ struct unixware_disklabel {
610 617
611extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt); 618extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
612extern void blk_free_devt(dev_t devt); 619extern void blk_free_devt(dev_t devt);
620extern void blk_invalidate_devt(dev_t devt);
613extern dev_t blk_lookup_devt(const char *name, int partno); 621extern dev_t blk_lookup_devt(const char *name, int partno);
614extern char *disk_name (struct gendisk *hd, int partno, char *buf); 622extern char *disk_name (struct gendisk *hd, int partno, char *buf);
615 623
@@ -714,7 +722,7 @@ static inline void hd_free_part(struct hd_struct *part)
714 */ 722 */
715static inline sector_t part_nr_sects_read(struct hd_struct *part) 723static inline sector_t part_nr_sects_read(struct hd_struct *part)
716{ 724{
717#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) 725#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
718 sector_t nr_sects; 726 sector_t nr_sects;
719 unsigned seq; 727 unsigned seq;
720 do { 728 do {
@@ -722,7 +730,7 @@ static inline sector_t part_nr_sects_read(struct hd_struct *part)
722 nr_sects = part->nr_sects; 730 nr_sects = part->nr_sects;
723 } while (read_seqcount_retry(&part->nr_sects_seq, seq)); 731 } while (read_seqcount_retry(&part->nr_sects_seq, seq));
724 return nr_sects; 732 return nr_sects;
725#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) 733#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
726 sector_t nr_sects; 734 sector_t nr_sects;
727 735
728 preempt_disable(); 736 preempt_disable();
@@ -741,11 +749,11 @@ static inline sector_t part_nr_sects_read(struct hd_struct *part)
741 */ 749 */
742static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) 750static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
743{ 751{
744#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) 752#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
745 write_seqcount_begin(&part->nr_sects_seq); 753 write_seqcount_begin(&part->nr_sects_seq);
746 part->nr_sects = size; 754 part->nr_sects = size;
747 write_seqcount_end(&part->nr_sects_seq); 755 write_seqcount_end(&part->nr_sects_seq);
748#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) 756#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
749 preempt_disable(); 757 preempt_disable();
750 part->nr_sects = size; 758 part->nr_sects = size;
751 preempt_enable(); 759 preempt_enable();
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2d14e21c16c0..a3b59d143afb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -17,6 +17,7 @@
17#include <asm/byteorder.h> 17#include <asm/byteorder.h>
18#include <asm/div64.h> 18#include <asm/div64.h>
19#include <uapi/linux/kernel.h> 19#include <uapi/linux/kernel.h>
20#include <asm/div64.h>
20 21
21#define STACK_MAGIC 0xdeadbeef 22#define STACK_MAGIC 0xdeadbeef
22 23
@@ -175,18 +176,7 @@
175#define _RET_IP_ (unsigned long)__builtin_return_address(0) 176#define _RET_IP_ (unsigned long)__builtin_return_address(0)
176#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; }) 177#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
177 178
178#ifdef CONFIG_LBDAF 179#define sector_div(a, b) do_div(a, b)
179# define sector_div(a, b) do_div(a, b)
180#else
181# define sector_div(n, b)( \
182{ \
183 int _res; \
184 _res = (n) % (b); \
185 (n) /= (b); \
186 _res; \
187} \
188)
189#endif
190 180
191/** 181/**
192 * upper_32_bits - return bits 32-63 of a number 182 * upper_32_bits - return bits 32-63 of a number
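With CONFIG_LBDAF gone, sector_t is always 64-bit and sector_div() is simply do_div(): it divides the sector count in place and evaluates to the 32-bit remainder. A small hedged sketch (function name illustrative):

static void example_report_chunks(struct gendisk *disk,
				  unsigned int chunk_sectors)
{
	sector_t nr_chunks = get_capacity(disk);
	u32 rem;

	/* divides nr_chunks in place and returns the remainder */
	rem = sector_div(nr_chunks, chunk_sectors);
	pr_info("%llu full chunks, %u trailing sectors\n",
		(unsigned long long)nr_chunks, rem);
}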
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index 3aa97b98dc89..3ec8e50efa16 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -77,7 +77,7 @@ struct nvme_rdma_cm_rep {
77 * struct nvme_rdma_cm_rej - rdma connect reject 77 * struct nvme_rdma_cm_rej - rdma connect reject
78 * 78 *
79 * @recfmt: format of the RDMA Private Data 79 * @recfmt: format of the RDMA Private Data
80 * @fsts: error status for the associated connect request 80 * @sts: error status for the associated connect request
81 */ 81 */
82struct nvme_rdma_cm_rej { 82struct nvme_rdma_cm_rej {
83 __le16 recfmt; 83 __le16 recfmt;
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 04b124fca51e..3e76b6d7d97f 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -1,18 +1,10 @@
1/* SPDX-License-Identifier: GPL-2.0 */
1/* 2/*
2 * Copyright © 2016 Intel Corporation 3 * Copyright © 2016 Intel Corporation
3 * 4 *
4 * Authors: 5 * Authors:
5 * Rafael Antognolli <rafael.antognolli@intel.com> 6 * Rafael Antognolli <rafael.antognolli@intel.com>
6 * Scott Bauer <scott.bauer@intel.com> 7 * Scott Bauer <scott.bauer@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 */ 8 */
17 9
18#ifndef LINUX_OPAL_H 10#ifndef LINUX_OPAL_H
diff --git a/include/linux/types.h b/include/linux/types.h
index cc0dbbe551d5..231114ae38f4 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -127,13 +127,8 @@ typedef s64 int64_t;
127 * 127 *
128 * blkcnt_t is the type of the inode's block count. 128 * blkcnt_t is the type of the inode's block count.
129 */ 129 */
130#ifdef CONFIG_LBDAF
131typedef u64 sector_t; 130typedef u64 sector_t;
132typedef u64 blkcnt_t; 131typedef u64 blkcnt_t;
133#else
134typedef unsigned long sector_t;
135typedef unsigned long blkcnt_t;
136#endif
137 132
138/* 133/*
139 * The type of an index into the pagecache. 134 * The type of an index into the pagecache.
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 627624d35030..33e53b80cd1f 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -5,15 +5,6 @@
5 * Authors: 5 * Authors:
6 * Rafael Antognolli <rafael.antognolli@intel.com> 6 * Rafael Antognolli <rafael.antognolli@intel.com>
7 * Scott Bauer <scott.bauer@intel.com> 7 * Scott Bauer <scott.bauer@intel.com>
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms and conditions of the GNU General Public License,
11 * version 2, as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope it will be useful, but WITHOUT
14 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 * more details.
17 */ 8 */
18 9
19#ifndef _UAPI_SED_OPAL_H 10#ifndef _UAPI_SED_OPAL_H
@@ -58,7 +49,7 @@ struct opal_key {
58struct opal_lr_act { 49struct opal_lr_act {
59 struct opal_key key; 50 struct opal_key key;
60 __u32 sum; 51 __u32 sum;
61 __u8 num_lrs; 52 __u8 num_lrs;
62 __u8 lr[OPAL_MAX_LRS]; 53 __u8 lr[OPAL_MAX_LRS];
63 __u8 align[2]; /* Align to 8 byte boundary */ 54 __u8 align[2]; /* Align to 8 byte boundary */
64}; 55};
diff --git a/include/xen/xen.h b/include/xen/xen.h
index 19d032373de5..19a72f591e2b 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -43,8 +43,10 @@ extern struct hvm_start_info pvh_start_info;
43#endif /* CONFIG_XEN_DOM0 */ 43#endif /* CONFIG_XEN_DOM0 */
44 44
45struct bio_vec; 45struct bio_vec;
46struct page;
47
46bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, 48bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
47 const struct bio_vec *vec2); 49 const struct page *page);
48 50
49#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON) 51#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
50extern u64 xen_saved_max_mem_size; 52extern u64 xen_saved_max_mem_size;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4c54a89f06ee..971c6c70891e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1930,7 +1930,6 @@ config TEST_STATIC_KEYS
1930config TEST_KMOD 1930config TEST_KMOD
1931 tristate "kmod stress tester" 1931 tristate "kmod stress tester"
1932 depends on m 1932 depends on m
1933 depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS
1934 depends on NETDEVICES && NET_CORE && INET # for TUN 1933 depends on NETDEVICES && NET_CORE && INET # for TUN
1935 depends on BLOCK 1934 depends on BLOCK
1936 select TEST_LKM 1935 select TEST_LKM
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h
index d27285f8ee82..8bc960e5e713 100644
--- a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h
@@ -59,11 +59,7 @@ typedef __u32 uint32_t;
59 * 59 *
60 * blkcnt_t is the type of the inode's block count. 60 * blkcnt_t is the type of the inode's block count.
61 */ 61 */
62#ifdef CONFIG_LBDAF
63typedef u64 sector_t; 62typedef u64 sector_t;
64#else
65typedef unsigned long sector_t;
66#endif
67 63
68/* 64/*
69 * The type of an index into the pagecache. 65 * The type of an index into the pagecache.