author    Linus Torvalds <torvalds@linux-foundation.org>  2019-07-16 00:20:52 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2019-07-16 00:20:52 -0400
commit    9637d517347e80ee2fe1c5d8ce45ba1b88d8b5cd (patch)
tree      3cee2a1d8b3c6ea466924517307a1f98ada1e92f
parent    273cbf61c3ddee9574ef1f4959b9bc6db5b24271 (diff)
parent    787c79d6393fc028887cc1b6066915f0b094e92f (diff)
Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block
Pull more block updates from Jens Axboe:
 "A later pull request with some followup items. I had some vacation
  coming up to the merge window, so certain items were delayed a bit.
  This pull request also contains fixes that came in within the last
  few days of the merge window, which I didn't want to push right
  before sending you a pull request.

  This contains:

   - NVMe pull request, mostly fixes, but also a few minor items on the
     feature side that were timing constrained (Christoph et al)
   - Report zones fixes (Damien)
   - Removal of dead code (Damien)
   - Turn on cgroup psi memstall (Josef)
   - block cgroup MAINTAINERS entry (Konstantin)
   - Flush init fix (Josef)
   - blk-throttle low iops timing fix (Konstantin)
   - nbd resize fixes (Mike)
   - nbd 0 blocksize crash fix (Xiubo)
   - block integrity error leak fix (Wenwen)
   - blk-cgroup writeback and priority inheritance fixes (Tejun)"

* tag 'for-linus-20190715' of git://git.kernel.dk/linux-block: (42 commits)
  MAINTAINERS: add entry for block io cgroup
  null_blk: fixup ->report_zones() for !CONFIG_BLK_DEV_ZONED
  block: Limit zone array allocation size
  sd_zbc: Fix report zones buffer allocation
  block: Kill gfp_t argument of blkdev_report_zones()
  block: Allow mapping of vmalloc-ed buffers
  block/bio-integrity: fix a memory leak bug
  nvme: fix NULL deref for fabrics options
  nbd: add netlink reconfigure resize support
  nbd: fix crash when the blksize is zero
  block: Disable write plugging for zoned block devices
  block: Fix elevator name declaration
  block: Remove unused definitions
  nvme: fix regression upon hot device removal and insertion
  blk-throttle: fix zero wait time for iops throttled group
  block: Fix potential overflow in blk_report_zones()
  blkcg: implement REQ_CGROUP_PUNT
  blkcg, writeback: Implement wbc_blkcg_css()
  blkcg, writeback: Add wbc->no_cgroup_owner
  blkcg, writeback: Rename wbc_account_io() to wbc_account_cgroup_owner()
  ...
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst      2
-rw-r--r--  Documentation/block/biodoc.txt               5
-rw-r--r--  MAINTAINERS                                 13
-rw-r--r--  block/bio-integrity.c                        8
-rw-r--r--  block/bio.c                                 28
-rw-r--r--  block/blk-cgroup.c                          66
-rw-r--r--  block/blk-core.c                             6
-rw-r--r--  block/blk-mq.c                               2
-rw-r--r--  block/blk-mq.h                              32
-rw-r--r--  block/blk-throttle.c                         9
-rw-r--r--  block/blk-zoned.c                           69
-rw-r--r--  drivers/block/nbd.c                         59
-rw-r--r--  drivers/block/null_blk.h                     5
-rw-r--r--  drivers/block/null_blk_zoned.c               3
-rw-r--r--  drivers/md/dm-flakey.c                       5
-rw-r--r--  drivers/md/dm-linear.c                       5
-rw-r--r--  drivers/md/dm-zoned-metadata.c              16
-rw-r--r--  drivers/md/dm.c                              6
-rw-r--r--  drivers/nvme/host/core.c                    43
-rw-r--r--  drivers/nvme/host/fc.c                      51
-rw-r--r--  drivers/nvme/host/multipath.c               18
-rw-r--r--  drivers/nvme/host/nvme.h                     1
-rw-r--r--  drivers/nvme/host/pci.c                     26
-rw-r--r--  drivers/nvme/host/tcp.c                      9
-rw-r--r--  drivers/nvme/host/trace.c                   28
-rw-r--r--  drivers/nvme/target/admin-cmd.c              3
-rw-r--r--  drivers/nvme/target/configfs.c               4
-rw-r--r--  drivers/nvme/target/fcloop.c                44
-rw-r--r--  drivers/nvme/target/io-cmd-bdev.c           39
-rw-r--r--  drivers/nvme/target/nvmet.h                  8
-rw-r--r--  drivers/nvme/target/trace.c                  2
-rw-r--r--  drivers/scsi/sd.h                            3
-rw-r--r--  drivers/scsi/sd_zbc.c                      108
-rw-r--r--  fs/btrfs/extent_io.c                         4
-rw-r--r--  fs/buffer.c                                  2
-rw-r--r--  fs/ext4/page-io.c                            2
-rw-r--r--  fs/f2fs/data.c                               6
-rw-r--r--  fs/f2fs/super.c                              4
-rw-r--r--  fs/fs-writeback.c                           13
-rw-r--r--  fs/mpage.c                                   2
-rw-r--r--  fs/xfs/xfs_aops.c                            2
-rw-r--r--  include/linux/backing-dev.h                  1
-rw-r--r--  include/linux/blk-cgroup.h                  16
-rw-r--r--  include/linux/blk_types.h                   10
-rw-r--r--  include/linux/blkdev.h                      14
-rw-r--r--  include/linux/cgroup.h                       1
-rw-r--r--  include/linux/device-mapper.h                3
-rw-r--r--  include/linux/elevator.h                    11
-rw-r--r--  include/linux/nvme.h                        12
-rw-r--r--  include/linux/writeback.h                   41
50 files changed, 660 insertions, 210 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index a9548de56ac9..8269e869cb1e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2124,7 +2124,7 @@ following two functions.
2124 a queue (device) has been associated with the bio and 2124 a queue (device) has been associated with the bio and
2125 before submission. 2125 before submission.
2126 2126
2127 wbc_account_io(@wbc, @page, @bytes) 2127 wbc_account_cgroup_owner(@wbc, @page, @bytes)
2128 Should be called for each data segment being written out. 2128 Should be called for each data segment being written out.
2129 While this function doesn't care exactly when it's called 2129 While this function doesn't care exactly when it's called
2130 during the writeback session, it's the easiest and most 2130 during the writeback session, it's the easiest and most
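For reference, a minimal sketch (not part of the patch) of how a filesystem writeback path is expected to use the renamed helper; fs_submit_write_page() and the single-page bio setup are illustrative placeholders, only the wbc_init_bio()/wbc_account_cgroup_owner() pairing is the point:

    static void fs_submit_write_page(struct writeback_control *wbc,
                                     struct page *page, struct bio *bio)
    {
            /* attribute the bio to the cgroup that owns this writeback */
            wbc_init_bio(wbc, bio);
            bio_add_page(bio, page, PAGE_SIZE, 0);
            /* charge this data segment to the same cgroup owner */
            wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
            submit_bio(bio);
    }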
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 31c177663ed5..5a4a799fe61b 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -843,11 +843,6 @@ elevator_latter_req_fn These return the request before or after the
843 843
844elevator_completed_req_fn called when a request is completed. 844elevator_completed_req_fn called when a request is completed.
845 845
846elevator_may_queue_fn returns true if the scheduler wants to allow the
847 current context to queue a new request even if
848 it is over the queue limit. This must be used
849 very carefully!!
850
851elevator_set_req_fn 846elevator_set_req_fn
852elevator_put_req_fn Must be used to allocate and free any elevator 847elevator_put_req_fn Must be used to allocate and free any elevator
853 specific storage for a request. 848 specific storage for a request.
diff --git a/MAINTAINERS b/MAINTAINERS
index 4aee3a1de331..6debe6829716 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4183,6 +4183,19 @@ S: Maintained
4183F: mm/memcontrol.c 4183F: mm/memcontrol.c
4184F: mm/swap_cgroup.c 4184F: mm/swap_cgroup.c
4185 4185
4186CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO)
4187M: Tejun Heo <tj@kernel.org>
4188M: Jens Axboe <axboe@kernel.dk>
4189L: cgroups@vger.kernel.org
4190L: linux-block@vger.kernel.org
4191T: git git://git.kernel.dk/linux-block
4192F: Documentation/cgroup-v1/blkio-controller.rst
4193F: block/blk-cgroup.c
4194F: include/linux/blk-cgroup.h
4195F: block/blk-throttle.c
4196F: block/blk-iolatency.c
4197F: block/bfq-cgroup.c
4198
4186CORETEMP HARDWARE MONITORING DRIVER 4199CORETEMP HARDWARE MONITORING DRIVER
4187M: Fenghua Yu <fenghua.yu@intel.com> 4200M: Fenghua Yu <fenghua.yu@intel.com>
4188L: linux-hwmon@vger.kernel.org 4201L: linux-hwmon@vger.kernel.org
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4db620849515..fb95dbb21dd8 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio)
276 ret = bio_integrity_add_page(bio, virt_to_page(buf), 276 ret = bio_integrity_add_page(bio, virt_to_page(buf),
277 bytes, offset); 277 bytes, offset);
278 278
279 if (ret == 0) 279 if (ret == 0) {
280 return false; 280 printk(KERN_ERR "could not attach integrity payload\n");
281 kfree(buf);
282 status = BLK_STS_RESOURCE;
283 goto err_end_io;
284 }
281 285
282 if (ret < bytes) 286 if (ret < bytes)
283 break; 287 break;
diff --git a/block/bio.c b/block/bio.c
index 29cd6cf4da51..299a0e7651ec 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,6 +16,7 @@
16#include <linux/workqueue.h> 16#include <linux/workqueue.h>
17#include <linux/cgroup.h> 17#include <linux/cgroup.h>
18#include <linux/blk-cgroup.h> 18#include <linux/blk-cgroup.h>
19#include <linux/highmem.h>
19 20
20#include <trace/events/block.h> 21#include <trace/events/block.h>
21#include "blk.h" 22#include "blk.h"
@@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio)
1441 bio_put(bio); 1442 bio_put(bio);
1442} 1443}
1443 1444
1445static void bio_invalidate_vmalloc_pages(struct bio *bio)
1446{
1447#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
1448 if (bio->bi_private && !op_is_write(bio_op(bio))) {
1449 unsigned long i, len = 0;
1450
1451 for (i = 0; i < bio->bi_vcnt; i++)
1452 len += bio->bi_io_vec[i].bv_len;
1453 invalidate_kernel_vmap_range(bio->bi_private, len);
1454 }
1455#endif
1456}
1457
1444static void bio_map_kern_endio(struct bio *bio) 1458static void bio_map_kern_endio(struct bio *bio)
1445{ 1459{
1460 bio_invalidate_vmalloc_pages(bio);
1446 bio_put(bio); 1461 bio_put(bio);
1447} 1462}
1448 1463
@@ -1463,6 +1478,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1463 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1478 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1464 unsigned long start = kaddr >> PAGE_SHIFT; 1479 unsigned long start = kaddr >> PAGE_SHIFT;
1465 const int nr_pages = end - start; 1480 const int nr_pages = end - start;
1481 bool is_vmalloc = is_vmalloc_addr(data);
1482 struct page *page;
1466 int offset, i; 1483 int offset, i;
1467 struct bio *bio; 1484 struct bio *bio;
1468 1485
@@ -1470,6 +1487,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1470 if (!bio) 1487 if (!bio)
1471 return ERR_PTR(-ENOMEM); 1488 return ERR_PTR(-ENOMEM);
1472 1489
1490 if (is_vmalloc) {
1491 flush_kernel_vmap_range(data, len);
1492 bio->bi_private = data;
1493 }
1494
1473 offset = offset_in_page(kaddr); 1495 offset = offset_in_page(kaddr);
1474 for (i = 0; i < nr_pages; i++) { 1496 for (i = 0; i < nr_pages; i++) {
1475 unsigned int bytes = PAGE_SIZE - offset; 1497 unsigned int bytes = PAGE_SIZE - offset;
@@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1480 if (bytes > len) 1502 if (bytes > len)
1481 bytes = len; 1503 bytes = len;
1482 1504
1483 if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, 1505 if (!is_vmalloc)
1506 page = virt_to_page(data);
1507 else
1508 page = vmalloc_to_page(data);
1509 if (bio_add_pc_page(q, bio, page, bytes,
1484 offset) < bytes) { 1510 offset) < bytes) {
1485 /* we don't support partial mappings */ 1511 /* we don't support partial mappings */
1486 bio_put(bio); 1512 bio_put(bio);
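A minimal sketch of what the vmalloc support enables for kernel-buffer callers; the helper name and the request setup around it are assumptions, the point is only that the buffer handed to blk_rq_map_kern() (which calls bio_map_kern()) may now be vmalloc-ed:

    static int map_report_buffer(struct request_queue *q, struct request *rq,
                                 size_t bufsize)
    {
            /* the buffer may now come from vzalloc() rather than kmalloc() */
            void *buf = vzalloc(bufsize);
            int ret;

            if (!buf)
                    return -ENOMEM;

            /* bio_map_kern() handles the vmap flush/invalidate internally */
            ret = blk_rq_map_kern(q, rq, buf, bufsize, GFP_KERNEL);
            if (ret)
                    vfree(buf);
            return ret;
    }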
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 53b7bd4c7000..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,7 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/blk-cgroup.h> 30#include <linux/blk-cgroup.h>
31#include <linux/tracehook.h> 31#include <linux/tracehook.h>
32#include <linux/psi.h>
32#include "blk.h" 33#include "blk.h"
33 34
34#define MAX_KEY_LEN 100 35#define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
47EXPORT_SYMBOL_GPL(blkcg_root); 48EXPORT_SYMBOL_GPL(blkcg_root);
48 49
49struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; 50struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
51EXPORT_SYMBOL_GPL(blkcg_root_css);
50 52
51static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 53static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
52 54
53static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ 55static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
54 56
55static bool blkcg_debug_stats = false; 57static bool blkcg_debug_stats = false;
58static struct workqueue_struct *blkcg_punt_bio_wq;
56 59
57static bool blkcg_policy_enabled(struct request_queue *q, 60static bool blkcg_policy_enabled(struct request_queue *q,
58 const struct blkcg_policy *pol) 61 const struct blkcg_policy *pol)
@@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
87{ 90{
88 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); 91 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
89 92
93 WARN_ON(!bio_list_empty(&blkg->async_bios));
94
90 /* release the blkcg and parent blkg refs this blkg has been holding */ 95 /* release the blkcg and parent blkg refs this blkg has been holding */
91 css_put(&blkg->blkcg->css); 96 css_put(&blkg->blkcg->css);
92 if (blkg->parent) 97 if (blkg->parent)
@@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
112 call_rcu(&blkg->rcu_head, __blkg_release); 117 call_rcu(&blkg->rcu_head, __blkg_release);
113} 118}
114 119
120static void blkg_async_bio_workfn(struct work_struct *work)
121{
122 struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
123 async_bio_work);
124 struct bio_list bios = BIO_EMPTY_LIST;
125 struct bio *bio;
126
127 /* as long as there are pending bios, @blkg can't go away */
128 spin_lock_bh(&blkg->async_bio_lock);
129 bio_list_merge(&bios, &blkg->async_bios);
130 bio_list_init(&blkg->async_bios);
131 spin_unlock_bh(&blkg->async_bio_lock);
132
133 while ((bio = bio_list_pop(&bios)))
134 submit_bio(bio);
135}
136
115/** 137/**
116 * blkg_alloc - allocate a blkg 138 * blkg_alloc - allocate a blkg
117 * @blkcg: block cgroup the new blkg is associated with 139 * @blkcg: block cgroup the new blkg is associated with
@@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
140 162
141 blkg->q = q; 163 blkg->q = q;
142 INIT_LIST_HEAD(&blkg->q_node); 164 INIT_LIST_HEAD(&blkg->q_node);
165 spin_lock_init(&blkg->async_bio_lock);
166 bio_list_init(&blkg->async_bios);
167 INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
143 blkg->blkcg = blkcg; 168 blkg->blkcg = blkcg;
144 169
145 for (i = 0; i < BLKCG_MAX_POLS; i++) { 170 for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1526,6 +1551,25 @@ out_unlock:
1526} 1551}
1527EXPORT_SYMBOL_GPL(blkcg_policy_unregister); 1552EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1528 1553
1554bool __blkcg_punt_bio_submit(struct bio *bio)
1555{
1556 struct blkcg_gq *blkg = bio->bi_blkg;
1557
1558 /* consume the flag first */
1559 bio->bi_opf &= ~REQ_CGROUP_PUNT;
1560
1561 /* never bounce for the root cgroup */
1562 if (!blkg->parent)
1563 return false;
1564
1565 spin_lock_bh(&blkg->async_bio_lock);
1566 bio_list_add(&blkg->async_bios, bio);
1567 spin_unlock_bh(&blkg->async_bio_lock);
1568
1569 queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
1570 return true;
1571}
1572
1529/* 1573/*
1530 * Scale the accumulated delay based on how long it has been since we updated 1574 * Scale the accumulated delay based on how long it has been since we updated
1531 * the delay. We only call this when we are adding delay, in case it's been a 1575 * the delay. We only call this when we are adding delay, in case it's been a
@@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
1587 */ 1631 */
1588static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) 1632static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1589{ 1633{
1634 unsigned long pflags;
1590 u64 now = ktime_to_ns(ktime_get()); 1635 u64 now = ktime_to_ns(ktime_get());
1591 u64 exp; 1636 u64 exp;
1592 u64 delay_nsec = 0; 1637 u64 delay_nsec = 0;
@@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1613 */ 1658 */
1614 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); 1659 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
1615 1660
1616 /* 1661 if (use_memdelay)
1617 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff 1662 psi_memstall_enter(&pflags);
1618 * that hasn't landed upstream yet. Once that stuff is in place we need
1619 * to do a psi_memstall_enter/leave if memdelay is set.
1620 */
1621 1663
1622 exp = ktime_add_ns(now, delay_nsec); 1664 exp = ktime_add_ns(now, delay_nsec);
1623 tok = io_schedule_prepare(); 1665 tok = io_schedule_prepare();
@@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
1627 break; 1669 break;
1628 } while (!fatal_signal_pending(current)); 1670 } while (!fatal_signal_pending(current));
1629 io_schedule_finish(tok); 1671 io_schedule_finish(tok);
1672
1673 if (use_memdelay)
1674 psi_memstall_leave(&pflags);
1630} 1675}
1631 1676
1632/** 1677/**
@@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
1726 atomic64_add(delta, &blkg->delay_nsec); 1771 atomic64_add(delta, &blkg->delay_nsec);
1727} 1772}
1728 1773
1774static int __init blkcg_init(void)
1775{
1776 blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
1777 WQ_MEM_RECLAIM | WQ_FREEZABLE |
1778 WQ_UNBOUND | WQ_SYSFS, 0);
1779 if (!blkcg_punt_bio_wq)
1780 return -ENOMEM;
1781 return 0;
1782}
1783subsys_initcall(blkcg_init);
1784
1729module_param(blkcg_debug_stats, bool, 0644); 1785module_param(blkcg_debug_stats, bool, 0644);
1730MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); 1786MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index 5d1fc8e17dd1..d0cc6e14d2f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
117 rq->internal_tag = -1; 117 rq->internal_tag = -1;
118 rq->start_time_ns = ktime_get_ns(); 118 rq->start_time_ns = ktime_get_ns();
119 rq->part = NULL; 119 rq->part = NULL;
120 refcount_set(&rq->ref, 1);
120} 121}
121EXPORT_SYMBOL(blk_rq_init); 122EXPORT_SYMBOL(blk_rq_init);
122 123
@@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
687 struct request *rq; 688 struct request *rq;
688 struct list_head *plug_list; 689 struct list_head *plug_list;
689 690
690 plug = current->plug; 691 plug = blk_mq_plug(q, bio);
691 if (!plug) 692 if (!plug)
692 return false; 693 return false;
693 694
@@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
1127 */ 1128 */
1128blk_qc_t submit_bio(struct bio *bio) 1129blk_qc_t submit_bio(struct bio *bio)
1129{ 1130{
1131 if (blkcg_punt_bio_submit(bio))
1132 return BLK_QC_T_NONE;
1133
1130 /* 1134 /*
1131 * If it's a regular read/write or a barrier with data attached, 1135 * If it's a regular read/write or a barrier with data attached,
1132 * go through the normal accounting stuff before submission. 1136 * go through the normal accounting stuff before submission.
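A minimal sketch of how an issuer opts into the new punt mechanism; the bio construction around it is assumed to already exist:

    static void issue_punted_write(struct bio *bio)
    {
            /*
             * REQ_CGROUP_PUNT asks submit_bio() to hand the bio to the
             * owning blkcg's punt workqueue (see __blkcg_punt_bio_submit()
             * above) instead of issuing, and possibly being throttled, in
             * the current context.
             */
            bio->bi_opf |= REQ_CGROUP_PUNT;
            submit_bio(bio);
    }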
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e5ef40c603ca..b038ec680e84 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1973 1973
1974 blk_mq_bio_to_request(rq, bio, nr_segs); 1974 blk_mq_bio_to_request(rq, bio, nr_segs);
1975 1975
1976 plug = current->plug; 1976 plug = blk_mq_plug(q, bio);
1977 if (unlikely(is_flush_fua)) { 1977 if (unlikely(is_flush_fua)) {
1978 /* bypass scheduler for flush rq */ 1978 /* bypass scheduler for flush rq */
1979 blk_insert_flush(rq); 1979 blk_insert_flush(rq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4bf5161333e..32c62c64e6c2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
233 qmap->mq_map[cpu] = 0; 233 qmap->mq_map[cpu] = 0;
234} 234}
235 235
236/*
237 * blk_mq_plug() - Get caller context plug
238 * @q: request queue
239 * @bio : the bio being submitted by the caller context
240 *
241 * Plugging, by design, may delay the insertion of BIOs into the elevator in
242 * order to increase BIO merging opportunities. This however can cause BIO
243 * insertion order to change from the order in which submit_bio() is being
244 * executed in the case of multiple contexts concurrently issuing BIOs to a
245 * device, even if these context are synchronized to tightly control BIO issuing
246 * order. While this is not a problem with regular block devices, this ordering
247 * change can cause write BIO failures with zoned block devices as these
248 * require sequential write patterns to zones. Prevent this from happening by
249 * ignoring the plug state of a BIO issuing context if the target request queue
250 * is for a zoned block device and the BIO to plug is a write operation.
251 *
252 * Return current->plug if the bio can be plugged and NULL otherwise
253 */
254static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
255 struct bio *bio)
256{
257 /*
258 * For regular block devices or read operations, use the context plug
259 * which may be NULL if blk_start_plug() was not executed.
260 */
261 if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
262 return current->plug;
263
264 /* Zoned block device write operation case: do not plug the BIO */
265 return NULL;
266}
267
236#endif 268#endif
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9ea7c0ecad10..8ab6c8153223 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
881 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 881 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
882 u64 tmp; 882 u64 tmp;
883 883
884 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 884 jiffy_elapsed = jiffies - tg->slice_start[rw];
885
886 /* Slice has just started. Consider one slice interval */
887 if (!jiffy_elapsed)
888 jiffy_elapsed_rnd = tg->td->throtl_slice;
889 885
890 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); 886 /* Round up to the next throttle slice, wait time must be nonzero */
887 jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
891 888
892 /* 889 /*
893 * jiffy_elapsed_rnd should not be a big value as minimum iops can be 890 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
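A worked example of the rounding change, assuming a throtl_slice of 100 jiffies and a group that is already over its iops budget with jiffy_elapsed = 100:

    old: jiffy_elapsed_rnd = roundup(100, 100)     = 100  ->  wait = 100 - 100 = 0
    new: jiffy_elapsed_rnd = roundup(100 + 1, 100) = 200  ->  wait = 200 - 100 = 100

The old code could therefore report a zero wait time for a throttled group that happened to land exactly on a slice boundary; rounding up past the elapsed time always leaves a nonzero wait.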
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..6c503824ba3f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,9 @@
14#include <linux/rbtree.h> 14#include <linux/rbtree.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/blk-mq.h> 16#include <linux/blk-mq.h>
17#include <linux/mm.h>
18#include <linux/vmalloc.h>
19#include <linux/sched/mm.h>
17 20
18#include "blk.h" 21#include "blk.h"
19 22
@@ -70,7 +73,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
70static inline unsigned int __blkdev_nr_zones(struct request_queue *q, 73static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
71 sector_t nr_sectors) 74 sector_t nr_sectors)
72{ 75{
73 unsigned long zone_sectors = blk_queue_zone_sectors(q); 76 sector_t zone_sectors = blk_queue_zone_sectors(q);
74 77
75 return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); 78 return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
76} 79}
@@ -117,8 +120,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
117} 120}
118 121
119static int blk_report_zones(struct gendisk *disk, sector_t sector, 122static int blk_report_zones(struct gendisk *disk, sector_t sector,
120 struct blk_zone *zones, unsigned int *nr_zones, 123 struct blk_zone *zones, unsigned int *nr_zones)
121 gfp_t gfp_mask)
122{ 124{
123 struct request_queue *q = disk->queue; 125 struct request_queue *q = disk->queue;
124 unsigned int z = 0, n, nrz = *nr_zones; 126 unsigned int z = 0, n, nrz = *nr_zones;
@@ -127,8 +129,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
127 129
128 while (z < nrz && sector < capacity) { 130 while (z < nrz && sector < capacity) {
129 n = nrz - z; 131 n = nrz - z;
130 ret = disk->fops->report_zones(disk, sector, &zones[z], &n, 132 ret = disk->fops->report_zones(disk, sector, &zones[z], &n);
131 gfp_mask);
132 if (ret) 133 if (ret)
133 return ret; 134 return ret;
134 if (!n) 135 if (!n)
@@ -149,17 +150,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
149 * @sector: Sector from which to report zones 150 * @sector: Sector from which to report zones
150 * @zones: Array of zone structures where to return the zones information 151 * @zones: Array of zone structures where to return the zones information
151 * @nr_zones: Number of zone structures in the zone array 152 * @nr_zones: Number of zone structures in the zone array
152 * @gfp_mask: Memory allocation flags (for bio_alloc)
153 * 153 *
154 * Description: 154 * Description:
155 * Get zone information starting from the zone containing @sector. 155 * Get zone information starting from the zone containing @sector.
156 * The number of zone information reported may be less than the number 156 * The number of zone information reported may be less than the number
157 * requested by @nr_zones. The number of zones actually reported is 157 * requested by @nr_zones. The number of zones actually reported is
158 * returned in @nr_zones. 158 * returned in @nr_zones.
159 * The caller must use memalloc_noXX_save/restore() calls to control
160 * memory allocations done within this function (zone array and command
161 * buffer allocation by the device driver).
159 */ 162 */
160int blkdev_report_zones(struct block_device *bdev, sector_t sector, 163int blkdev_report_zones(struct block_device *bdev, sector_t sector,
161 struct blk_zone *zones, unsigned int *nr_zones, 164 struct blk_zone *zones, unsigned int *nr_zones)
162 gfp_t gfp_mask)
163{ 165{
164 struct request_queue *q = bdev_get_queue(bdev); 166 struct request_queue *q = bdev_get_queue(bdev);
165 unsigned int i, nrz; 167 unsigned int i, nrz;
@@ -184,7 +186,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
184 nrz = min(*nr_zones, 186 nrz = min(*nr_zones,
185 __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); 187 __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
186 ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, 188 ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
187 zones, &nrz, gfp_mask); 189 zones, &nrz);
188 if (ret) 190 if (ret)
189 return ret; 191 return ret;
190 192
@@ -305,9 +307,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
305 if (!zones) 307 if (!zones)
306 return -ENOMEM; 308 return -ENOMEM;
307 309
308 ret = blkdev_report_zones(bdev, rep.sector, 310 ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones);
309 zones, &rep.nr_zones,
310 GFP_KERNEL);
311 if (ret) 311 if (ret)
312 goto out; 312 goto out;
313 313
@@ -373,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
373 * Allocate an array of struct blk_zone to get nr_zones zone information. 373 * Allocate an array of struct blk_zone to get nr_zones zone information.
374 * The allocated array may be smaller than nr_zones. 374 * The allocated array may be smaller than nr_zones.
375 */ 375 */
376static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) 376static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
377{ 377{
378 size_t size = *nr_zones * sizeof(struct blk_zone); 378 struct blk_zone *zones;
379 struct page *page; 379 size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
380 int order; 380
381 381 /*
382 for (order = get_order(size); order >= 0; order--) { 382 * GFP_KERNEL here is meaningless as the caller task context has
383 page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); 383 * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
384 if (page) { 384 * with memalloc_noio_save().
385 *nr_zones = min_t(unsigned int, *nr_zones, 385 */
386 (PAGE_SIZE << order) / sizeof(struct blk_zone)); 386 zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
387 return page_address(page); 387 if (!zones) {
388 } 388 *nr_zones = 0;
389 return NULL;
389 } 390 }
390 391
391 return NULL; 392 *nr_zones = nrz;
393
394 return zones;
392} 395}
393 396
394void blk_queue_free_zone_bitmaps(struct request_queue *q) 397void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -415,6 +418,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
415 unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; 418 unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
416 unsigned int i, rep_nr_zones = 0, z = 0, nrz; 419 unsigned int i, rep_nr_zones = 0, z = 0, nrz;
417 struct blk_zone *zones = NULL; 420 struct blk_zone *zones = NULL;
421 unsigned int noio_flag;
418 sector_t sector = 0; 422 sector_t sector = 0;
419 int ret = 0; 423 int ret = 0;
420 424
@@ -427,6 +431,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
427 return 0; 431 return 0;
428 } 432 }
429 433
434 /*
435 * Ensure that all memory allocations in this context are done as
436 * if GFP_NOIO was specified.
437 */
438 noio_flag = memalloc_noio_save();
439
430 if (!blk_queue_is_zoned(q) || !nr_zones) { 440 if (!blk_queue_is_zoned(q) || !nr_zones) {
431 nr_zones = 0; 441 nr_zones = 0;
432 goto update; 442 goto update;
@@ -443,13 +453,13 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
443 453
444 /* Get zone information and initialize seq_zones_bitmap */ 454 /* Get zone information and initialize seq_zones_bitmap */
445 rep_nr_zones = nr_zones; 455 rep_nr_zones = nr_zones;
446 zones = blk_alloc_zones(q->node, &rep_nr_zones); 456 zones = blk_alloc_zones(&rep_nr_zones);
447 if (!zones) 457 if (!zones)
448 goto out; 458 goto out;
449 459
450 while (z < nr_zones) { 460 while (z < nr_zones) {
451 nrz = min(nr_zones - z, rep_nr_zones); 461 nrz = min(nr_zones - z, rep_nr_zones);
452 ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO); 462 ret = blk_report_zones(disk, sector, zones, &nrz);
453 if (ret) 463 if (ret)
454 goto out; 464 goto out;
455 if (!nrz) 465 if (!nrz)
@@ -480,8 +490,9 @@ update:
480 blk_mq_unfreeze_queue(q); 490 blk_mq_unfreeze_queue(q);
481 491
482out: 492out:
483 free_pages((unsigned long)zones, 493 memalloc_noio_restore(noio_flag);
484 get_order(rep_nr_zones * sizeof(struct blk_zone))); 494
495 kvfree(zones);
485 kfree(seq_zones_wlock); 496 kfree(seq_zones_wlock);
486 kfree(seq_zones_bitmap); 497 kfree(seq_zones_bitmap);
487 498
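A minimal sketch of the new calling convention, matching what the dm-zoned update later in this series does; bdev and sector are whatever the caller already has at hand:

    unsigned int noio_flag, nr_zones = 1;
    struct blk_zone zone;
    int ret;

    /* make allocations inside the report behave as if GFP_NOIO was passed */
    noio_flag = memalloc_noio_save();
    ret = blkdev_report_zones(bdev, sector, &zone, &nr_zones);
    memalloc_noio_restore(noio_flag);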
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3a9bca3aa093..9bcde2325893 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -134,6 +134,8 @@ static struct dentry *nbd_dbg_dir;
134 134
135#define NBD_MAGIC 0x68797548 135#define NBD_MAGIC 0x68797548
136 136
137#define NBD_DEF_BLKSIZE 1024
138
137static unsigned int nbds_max = 16; 139static unsigned int nbds_max = 16;
138static int max_part = 16; 140static int max_part = 16;
139static struct workqueue_struct *recv_workqueue; 141static struct workqueue_struct *recv_workqueue;
@@ -1236,6 +1238,14 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1236 nbd_config_put(nbd); 1238 nbd_config_put(nbd);
1237} 1239}
1238 1240
1241static bool nbd_is_valid_blksize(unsigned long blksize)
1242{
1243 if (!blksize || !is_power_of_2(blksize) || blksize < 512 ||
1244 blksize > PAGE_SIZE)
1245 return false;
1246 return true;
1247}
1248
1239/* Must be called with config_lock held */ 1249/* Must be called with config_lock held */
1240static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, 1250static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1241 unsigned int cmd, unsigned long arg) 1251 unsigned int cmd, unsigned long arg)
@@ -1251,8 +1261,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1251 case NBD_SET_SOCK: 1261 case NBD_SET_SOCK:
1252 return nbd_add_socket(nbd, arg, false); 1262 return nbd_add_socket(nbd, arg, false);
1253 case NBD_SET_BLKSIZE: 1263 case NBD_SET_BLKSIZE:
1254 if (!arg || !is_power_of_2(arg) || arg < 512 || 1264 if (!arg)
1255 arg > PAGE_SIZE) 1265 arg = NBD_DEF_BLKSIZE;
1266 if (!nbd_is_valid_blksize(arg))
1256 return -EINVAL; 1267 return -EINVAL;
1257 nbd_size_set(nbd, arg, 1268 nbd_size_set(nbd, arg,
1258 div_s64(config->bytesize, arg)); 1269 div_s64(config->bytesize, arg));
@@ -1332,7 +1343,7 @@ static struct nbd_config *nbd_alloc_config(void)
1332 atomic_set(&config->recv_threads, 0); 1343 atomic_set(&config->recv_threads, 0);
1333 init_waitqueue_head(&config->recv_wq); 1344 init_waitqueue_head(&config->recv_wq);
1334 init_waitqueue_head(&config->conn_wait); 1345 init_waitqueue_head(&config->conn_wait);
1335 config->blksize = 1024; 1346 config->blksize = NBD_DEF_BLKSIZE;
1336 atomic_set(&config->live_connections, 0); 1347 atomic_set(&config->live_connections, 0);
1337 try_module_get(THIS_MODULE); 1348 try_module_get(THIS_MODULE);
1338 return config; 1349 return config;
@@ -1673,6 +1684,30 @@ nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1673 [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, 1684 [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 },
1674}; 1685};
1675 1686
1687static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
1688{
1689 struct nbd_config *config = nbd->config;
1690 u64 bsize = config->blksize;
1691 u64 bytes = config->bytesize;
1692
1693 if (info->attrs[NBD_ATTR_SIZE_BYTES])
1694 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1695
1696 if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1697 bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1698 if (!bsize)
1699 bsize = NBD_DEF_BLKSIZE;
1700 if (!nbd_is_valid_blksize(bsize)) {
1701 printk(KERN_ERR "Invalid block size %llu\n", bsize);
1702 return -EINVAL;
1703 }
1704 }
1705
1706 if (bytes != config->bytesize || bsize != config->blksize)
1707 nbd_size_set(nbd, bsize, div64_u64(bytes, bsize));
1708 return 0;
1709}
1710
1676static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) 1711static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1677{ 1712{
1678 struct nbd_device *nbd = NULL; 1713 struct nbd_device *nbd = NULL;
@@ -1760,16 +1795,10 @@ again:
1760 refcount_set(&nbd->config_refs, 1); 1795 refcount_set(&nbd->config_refs, 1);
1761 set_bit(NBD_BOUND, &config->runtime_flags); 1796 set_bit(NBD_BOUND, &config->runtime_flags);
1762 1797
1763 if (info->attrs[NBD_ATTR_SIZE_BYTES]) { 1798 ret = nbd_genl_size_set(info, nbd);
1764 u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); 1799 if (ret)
1765 nbd_size_set(nbd, config->blksize, 1800 goto out;
1766 div64_u64(bytes, config->blksize)); 1801
1767 }
1768 if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1769 u64 bsize =
1770 nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1771 nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
1772 }
1773 if (info->attrs[NBD_ATTR_TIMEOUT]) { 1802 if (info->attrs[NBD_ATTR_TIMEOUT]) {
1774 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); 1803 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1775 nbd->tag_set.timeout = timeout * HZ; 1804 nbd->tag_set.timeout = timeout * HZ;
@@ -1938,6 +1967,10 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
1938 goto out; 1967 goto out;
1939 } 1968 }
1940 1969
1970 ret = nbd_genl_size_set(info, nbd);
1971 if (ret)
1972 goto out;
1973
1941 if (info->attrs[NBD_ATTR_TIMEOUT]) { 1974 if (info->attrs[NBD_ATTR_TIMEOUT]) {
1942 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); 1975 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1943 nbd->tag_set.timeout = timeout * HZ; 1976 nbd->tag_set.timeout = timeout * HZ;
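For illustration, with a 4K PAGE_SIZE the validation above accepts only power-of-two sizes between 512 and 4096, and a size of 0 from either the ioctl or the netlink path now falls back to NBD_DEF_BLKSIZE (1024) instead of being used directly:

    nbd_is_valid_blksize(512)   -> true
    nbd_is_valid_blksize(4096)  -> true
    nbd_is_valid_blksize(3000)  -> false  /* not a power of two */
    nbd_is_valid_blksize(8192)  -> false  /* larger than PAGE_SIZE */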
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 34b22d6523ba..a1b9929bd911 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -89,8 +89,7 @@ struct nullb {
89int null_zone_init(struct nullb_device *dev); 89int null_zone_init(struct nullb_device *dev);
90void null_zone_exit(struct nullb_device *dev); 90void null_zone_exit(struct nullb_device *dev);
91int null_zone_report(struct gendisk *disk, sector_t sector, 91int null_zone_report(struct gendisk *disk, sector_t sector,
92 struct blk_zone *zones, unsigned int *nr_zones, 92 struct blk_zone *zones, unsigned int *nr_zones);
93 gfp_t gfp_mask);
94void null_zone_write(struct nullb_cmd *cmd, sector_t sector, 93void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
95 unsigned int nr_sectors); 94 unsigned int nr_sectors);
96void null_zone_reset(struct nullb_cmd *cmd, sector_t sector); 95void null_zone_reset(struct nullb_cmd *cmd, sector_t sector);
@@ -103,7 +102,7 @@ static inline int null_zone_init(struct nullb_device *dev)
103static inline void null_zone_exit(struct nullb_device *dev) {} 102static inline void null_zone_exit(struct nullb_device *dev) {}
104static inline int null_zone_report(struct gendisk *disk, sector_t sector, 103static inline int null_zone_report(struct gendisk *disk, sector_t sector,
105 struct blk_zone *zones, 104 struct blk_zone *zones,
106 unsigned int *nr_zones, gfp_t gfp_mask) 105 unsigned int *nr_zones)
107{ 106{
108 return -EOPNOTSUPP; 107 return -EOPNOTSUPP;
109} 108}
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index fca0c97ff1aa..cb28d93f2bd1 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -67,8 +67,7 @@ void null_zone_exit(struct nullb_device *dev)
67} 67}
68 68
69int null_zone_report(struct gendisk *disk, sector_t sector, 69int null_zone_report(struct gendisk *disk, sector_t sector,
70 struct blk_zone *zones, unsigned int *nr_zones, 70 struct blk_zone *zones, unsigned int *nr_zones)
71 gfp_t gfp_mask)
72{ 71{
73 struct nullb *nullb = disk->private_data; 72 struct nullb *nullb = disk->private_data;
74 struct nullb_device *dev = nullb->dev; 73 struct nullb_device *dev = nullb->dev;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index a9bc518156f2..2900fbde89b3 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -461,15 +461,14 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
461 461
462#ifdef CONFIG_BLK_DEV_ZONED 462#ifdef CONFIG_BLK_DEV_ZONED
463static int flakey_report_zones(struct dm_target *ti, sector_t sector, 463static int flakey_report_zones(struct dm_target *ti, sector_t sector,
464 struct blk_zone *zones, unsigned int *nr_zones, 464 struct blk_zone *zones, unsigned int *nr_zones)
465 gfp_t gfp_mask)
466{ 465{
467 struct flakey_c *fc = ti->private; 466 struct flakey_c *fc = ti->private;
468 int ret; 467 int ret;
469 468
470 /* Do report and remap it */ 469 /* Do report and remap it */
471 ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector), 470 ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector),
472 zones, nr_zones, gfp_mask); 471 zones, nr_zones);
473 if (ret != 0) 472 if (ret != 0)
474 return ret; 473 return ret;
475 474
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ad980a38fb1e..ecefe6703736 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -137,15 +137,14 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
137 137
138#ifdef CONFIG_BLK_DEV_ZONED 138#ifdef CONFIG_BLK_DEV_ZONED
139static int linear_report_zones(struct dm_target *ti, sector_t sector, 139static int linear_report_zones(struct dm_target *ti, sector_t sector,
140 struct blk_zone *zones, unsigned int *nr_zones, 140 struct blk_zone *zones, unsigned int *nr_zones)
141 gfp_t gfp_mask)
142{ 141{
143 struct linear_c *lc = (struct linear_c *) ti->private; 142 struct linear_c *lc = (struct linear_c *) ti->private;
144 int ret; 143 int ret;
145 144
146 /* Do report and remap it */ 145 /* Do report and remap it */
147 ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector), 146 ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
148 zones, nr_zones, gfp_mask); 147 zones, nr_zones);
149 if (ret != 0) 148 if (ret != 0)
150 return ret; 149 return ret;
151 150
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index d8334cd45d7c..9faf3e49c7af 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/crc32.h> 10#include <linux/crc32.h>
11#include <linux/sched/mm.h>
11 12
12#define DM_MSG_PREFIX "zoned metadata" 13#define DM_MSG_PREFIX "zoned metadata"
13 14
@@ -1162,8 +1163,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
1162 while (sector < dev->capacity) { 1163 while (sector < dev->capacity) {
1163 /* Get zone information */ 1164 /* Get zone information */
1164 nr_blkz = DMZ_REPORT_NR_ZONES; 1165 nr_blkz = DMZ_REPORT_NR_ZONES;
1165 ret = blkdev_report_zones(dev->bdev, sector, blkz, 1166 ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
1166 &nr_blkz, GFP_KERNEL);
1167 if (ret) { 1167 if (ret) {
1168 dmz_dev_err(dev, "Report zones failed %d", ret); 1168 dmz_dev_err(dev, "Report zones failed %d", ret);
1169 goto out; 1169 goto out;
@@ -1201,12 +1201,20 @@ out:
1201static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1201static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1202{ 1202{
1203 unsigned int nr_blkz = 1; 1203 unsigned int nr_blkz = 1;
1204 unsigned int noio_flag;
1204 struct blk_zone blkz; 1205 struct blk_zone blkz;
1205 int ret; 1206 int ret;
1206 1207
1207 /* Get zone information from disk */ 1208 /*
1209 * Get zone information from disk. Since blkdev_report_zones() uses
1210 * GFP_KERNEL by default for memory allocations, set the per-task
1211 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
1212 * GFP_NOIO was specified.
1213 */
1214 noio_flag = memalloc_noio_save();
1208 ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1215 ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
1209 &blkz, &nr_blkz, GFP_NOIO); 1216 &blkz, &nr_blkz);
1217 memalloc_noio_restore(noio_flag);
1210 if (!nr_blkz) 1218 if (!nr_blkz)
1211 ret = -EIO; 1219 ret = -EIO;
1212 if (ret) { 1220 if (ret) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5475081dcbd6..61f1152b74e9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -441,8 +441,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
441} 441}
442 442
443static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, 443static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
444 struct blk_zone *zones, unsigned int *nr_zones, 444 struct blk_zone *zones, unsigned int *nr_zones)
445 gfp_t gfp_mask)
446{ 445{
447#ifdef CONFIG_BLK_DEV_ZONED 446#ifdef CONFIG_BLK_DEV_ZONED
448 struct mapped_device *md = disk->private_data; 447 struct mapped_device *md = disk->private_data;
@@ -480,8 +479,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
480 * So there is no need to loop here trying to fill the entire array 479 * So there is no need to loop here trying to fill the entire array
481 * of zones. 480 * of zones.
482 */ 481 */
483 ret = tgt->type->report_zones(tgt, sector, zones, 482 ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
484 nr_zones, gfp_mask);
485 483
486out: 484out:
487 dm_put_live_table(md, srcu_idx); 485 dm_put_live_table(md, srcu_idx);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b2dd4e391f5c..cc09b81fc7f4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -11,6 +11,7 @@
11#include <linux/hdreg.h> 11#include <linux/hdreg.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/backing-dev.h>
14#include <linux/list_sort.h> 15#include <linux/list_sort.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/types.h> 17#include <linux/types.h>
@@ -1626,6 +1627,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
1626{ 1627{
1627 sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9); 1628 sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
1628 unsigned short bs = 1 << ns->lba_shift; 1629 unsigned short bs = 1 << ns->lba_shift;
1630 u32 atomic_bs, phys_bs, io_opt;
1629 1631
1630 if (ns->lba_shift > PAGE_SHIFT) { 1632 if (ns->lba_shift > PAGE_SHIFT) {
1631 /* unsupported block size, set capacity to 0 later */ 1633 /* unsupported block size, set capacity to 0 later */
@@ -1634,9 +1636,37 @@ static void nvme_update_disk_info(struct gendisk *disk,
1634 blk_mq_freeze_queue(disk->queue); 1636 blk_mq_freeze_queue(disk->queue);
1635 blk_integrity_unregister(disk); 1637 blk_integrity_unregister(disk);
1636 1638
1639 if (id->nabo == 0) {
1640 /*
1641 * Bit 1 indicates whether NAWUPF is defined for this namespace
1642 * and whether it should be used instead of AWUPF. If NAWUPF ==
1643 * 0 then AWUPF must be used instead.
1644 */
1645 if (id->nsfeat & (1 << 1) && id->nawupf)
1646 atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
1647 else
1648 atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
1649 } else {
1650 atomic_bs = bs;
1651 }
1652 phys_bs = bs;
1653 io_opt = bs;
1654 if (id->nsfeat & (1 << 4)) {
1655 /* NPWG = Namespace Preferred Write Granularity */
1656 phys_bs *= 1 + le16_to_cpu(id->npwg);
1657 /* NOWS = Namespace Optimal Write Size */
1658 io_opt *= 1 + le16_to_cpu(id->nows);
1659 }
1660
1637 blk_queue_logical_block_size(disk->queue, bs); 1661 blk_queue_logical_block_size(disk->queue, bs);
1638 blk_queue_physical_block_size(disk->queue, bs); 1662 /*
1639 blk_queue_io_min(disk->queue, bs); 1663 * Linux filesystems assume writing a single physical block is
1664 * an atomic operation. Hence limit the physical block size to the
1665 * value of the Atomic Write Unit Power Fail parameter.
1666 */
1667 blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
1668 blk_queue_io_min(disk->queue, phys_bs);
1669 blk_queue_io_opt(disk->queue, io_opt);
1640 1670
1641 if (ns->ms && !ns->ext && 1671 if (ns->ms && !ns->ext &&
1642 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) 1672 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
@@ -2386,8 +2416,8 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2386 lockdep_assert_held(&nvme_subsystems_lock); 2416 lockdep_assert_held(&nvme_subsystems_lock);
2387 2417
2388 list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { 2418 list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2389 if (ctrl->state == NVME_CTRL_DELETING || 2419 if (tmp->state == NVME_CTRL_DELETING ||
2390 ctrl->state == NVME_CTRL_DEAD) 2420 tmp->state == NVME_CTRL_DEAD)
2391 continue; 2421 continue;
2392 2422
2393 if (tmp->cntlid == ctrl->cntlid) { 2423 if (tmp->cntlid == ctrl->cntlid) {
@@ -2433,6 +2463,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2433 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); 2463 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2434 subsys->vendor_id = le16_to_cpu(id->vid); 2464 subsys->vendor_id = le16_to_cpu(id->vid);
2435 subsys->cmic = id->cmic; 2465 subsys->cmic = id->cmic;
2466 subsys->awupf = le16_to_cpu(id->awupf);
2436#ifdef CONFIG_NVME_MULTIPATH 2467#ifdef CONFIG_NVME_MULTIPATH
2437 subsys->iopolicy = NVME_IOPOLICY_NUMA; 2468 subsys->iopolicy = NVME_IOPOLICY_NUMA;
2438#endif 2469#endif
@@ -3274,6 +3305,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3274 goto out_free_ns; 3305 goto out_free_ns;
3275 } 3306 }
3276 3307
3308 if (ctrl->opts && ctrl->opts->data_digest)
3309 ns->queue->backing_dev_info->capabilities
3310 |= BDI_CAP_STABLE_WRITES;
3311
3277 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 3312 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3278 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) 3313 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3279 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); 3314 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
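Worked through with illustrative values only (bs = 512, a 0's based nawupf of 7, npwg = 7, nows = 15), the new queue limits come out as:

    atomic_bs = (1 + 7)  * 512 = 4096   /* largest power-fail-atomic write        */
    phys_bs   = 512 * (1 + 7)  = 4096   /* namespace preferred write granularity  */
    io_opt    = 512 * (1 + 15) = 8192   /* namespace optimal write size           */

    blk_queue_physical_block_size(queue, min(phys_bs, atomic_bs));  /* 4096 */
    blk_queue_io_min(queue, phys_bs);                               /* 4096 */
    blk_queue_io_opt(queue, io_opt);                                /* 8192 */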
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index dcb2b799966f..232d8094091b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -204,6 +204,9 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
204 204
205static struct workqueue_struct *nvme_fc_wq; 205static struct workqueue_struct *nvme_fc_wq;
206 206
207static bool nvme_fc_waiting_to_unload;
208static DECLARE_COMPLETION(nvme_fc_unload_proceed);
209
207/* 210/*
208 * These items are short-term. They will eventually be moved into 211 * These items are short-term. They will eventually be moved into
209 * a generic FC class. See comments in module init. 212 * a generic FC class. See comments in module init.
@@ -229,6 +232,8 @@ nvme_fc_free_lport(struct kref *ref)
229 /* remove from transport list */ 232 /* remove from transport list */
230 spin_lock_irqsave(&nvme_fc_lock, flags); 233 spin_lock_irqsave(&nvme_fc_lock, flags);
231 list_del(&lport->port_list); 234 list_del(&lport->port_list);
235 if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list))
236 complete(&nvme_fc_unload_proceed);
232 spin_unlock_irqrestore(&nvme_fc_lock, flags); 237 spin_unlock_irqrestore(&nvme_fc_lock, flags);
233 238
234 ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); 239 ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num);
@@ -3457,11 +3462,51 @@ out_destroy_wq:
3457 return ret; 3462 return ret;
3458} 3463}
3459 3464
3465static void
3466nvme_fc_delete_controllers(struct nvme_fc_rport *rport)
3467{
3468 struct nvme_fc_ctrl *ctrl;
3469
3470 spin_lock(&rport->lock);
3471 list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
3472 dev_warn(ctrl->ctrl.device,
3473 "NVME-FC{%d}: transport unloading: deleting ctrl\n",
3474 ctrl->cnum);
3475 nvme_delete_ctrl(&ctrl->ctrl);
3476 }
3477 spin_unlock(&rport->lock);
3478}
3479
3480static void
3481nvme_fc_cleanup_for_unload(void)
3482{
3483 struct nvme_fc_lport *lport;
3484 struct nvme_fc_rport *rport;
3485
3486 list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
3487 list_for_each_entry(rport, &lport->endp_list, endp_list) {
3488 nvme_fc_delete_controllers(rport);
3489 }
3490 }
3491}
3492
3460static void __exit nvme_fc_exit_module(void) 3493static void __exit nvme_fc_exit_module(void)
3461{ 3494{
3462 /* sanity check - all lports should be removed */ 3495 unsigned long flags;
3463 if (!list_empty(&nvme_fc_lport_list)) 3496 bool need_cleanup = false;
3464 pr_warn("%s: localport list not empty\n", __func__); 3497
3498 spin_lock_irqsave(&nvme_fc_lock, flags);
3499 nvme_fc_waiting_to_unload = true;
3500 if (!list_empty(&nvme_fc_lport_list)) {
3501 need_cleanup = true;
3502 nvme_fc_cleanup_for_unload();
3503 }
3504 spin_unlock_irqrestore(&nvme_fc_lock, flags);
3505 if (need_cleanup) {
3506 pr_info("%s: waiting for ctlr deletes\n", __func__);
3507 wait_for_completion(&nvme_fc_unload_proceed);
3508 pr_info("%s: ctrl deletes complete\n", __func__);
3509 }
3465 3510
3466 nvmf_unregister_transport(&nvme_fc_transport); 3511 nvmf_unregister_transport(&nvme_fc_transport);
3467 3512
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 499acf07d61a..a9a927677970 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -123,14 +123,20 @@ void nvme_mpath_clear_current_path(struct nvme_ns *ns)
123 } 123 }
124} 124}
125 125
126static bool nvme_path_is_disabled(struct nvme_ns *ns)
127{
128 return ns->ctrl->state != NVME_CTRL_LIVE ||
129 test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
130 test_bit(NVME_NS_REMOVING, &ns->flags);
131}
132
126static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) 133static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
127{ 134{
128 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; 135 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
129 struct nvme_ns *found = NULL, *fallback = NULL, *ns; 136 struct nvme_ns *found = NULL, *fallback = NULL, *ns;
130 137
131 list_for_each_entry_rcu(ns, &head->list, siblings) { 138 list_for_each_entry_rcu(ns, &head->list, siblings) {
132 if (ns->ctrl->state != NVME_CTRL_LIVE || 139 if (nvme_path_is_disabled(ns))
133 test_bit(NVME_NS_ANA_PENDING, &ns->flags))
134 continue; 140 continue;
135 141
136 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) 142 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
@@ -178,14 +184,16 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
178{ 184{
179 struct nvme_ns *ns, *found, *fallback = NULL; 185 struct nvme_ns *ns, *found, *fallback = NULL;
180 186
181 if (list_is_singular(&head->list)) 187 if (list_is_singular(&head->list)) {
188 if (nvme_path_is_disabled(old))
189 return NULL;
182 return old; 190 return old;
191 }
183 192
184 for (ns = nvme_next_ns(head, old); 193 for (ns = nvme_next_ns(head, old);
185 ns != old; 194 ns != old;
186 ns = nvme_next_ns(head, ns)) { 195 ns = nvme_next_ns(head, ns)) {
187 if (ns->ctrl->state != NVME_CTRL_LIVE || 196 if (nvme_path_is_disabled(ns))
188 test_bit(NVME_NS_ANA_PENDING, &ns->flags))
189 continue; 197 continue;
190 198
191 if (ns->ana_state == NVME_ANA_OPTIMIZED) { 199 if (ns->ana_state == NVME_ANA_OPTIMIZED) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ea45d7d393ad..716a876119c8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -283,6 +283,7 @@ struct nvme_subsystem {
283 char firmware_rev[8]; 283 char firmware_rev[8];
284 u8 cmic; 284 u8 cmic;
285 u16 vendor_id; 285 u16 vendor_id;
286 u16 awupf; /* 0's based awupf value. */
286 struct ida ns_ida; 287 struct ida ns_ida;
287#ifdef CONFIG_NVME_MULTIPATH 288#ifdef CONFIG_NVME_MULTIPATH
288 enum nvme_iopolicy iopolicy; 289 enum nvme_iopolicy iopolicy;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 189352081994..bb970ca82517 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1439,11 +1439,15 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1439 1439
1440 if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { 1440 if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1441 nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); 1441 nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
1442 nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, 1442 if (nvmeq->sq_cmds) {
1443 nvmeq->sq_cmds); 1443 nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
1444 if (nvmeq->sq_dma_addr) { 1444 nvmeq->sq_cmds);
1445 set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); 1445 if (nvmeq->sq_dma_addr) {
1446 return 0; 1446 set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
1447 return 0;
1448 }
1449
1450 pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth));
1447 } 1451 }
1448 } 1452 }
1449 1453
@@ -2250,7 +2254,9 @@ static int nvme_dev_add(struct nvme_dev *dev)
2250 if (!dev->ctrl.tagset) { 2254 if (!dev->ctrl.tagset) {
2251 dev->tagset.ops = &nvme_mq_ops; 2255 dev->tagset.ops = &nvme_mq_ops;
2252 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2256 dev->tagset.nr_hw_queues = dev->online_queues - 1;
2253 dev->tagset.nr_maps = 2; /* default + read */ 2257 dev->tagset.nr_maps = 1; /* default */
2258 if (dev->io_queues[HCTX_TYPE_READ])
2259 dev->tagset.nr_maps++;
2254 if (dev->io_queues[HCTX_TYPE_POLL]) 2260 if (dev->io_queues[HCTX_TYPE_POLL])
2255 dev->tagset.nr_maps++; 2261 dev->tagset.nr_maps++;
2256 dev->tagset.timeout = NVME_IO_TIMEOUT; 2262 dev->tagset.timeout = NVME_IO_TIMEOUT;
@@ -2289,8 +2295,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
2289 2295
2290 pci_set_master(pdev); 2296 pci_set_master(pdev);
2291 2297
2292 if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && 2298 if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
2293 dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
2294 goto disable; 2299 goto disable;
2295 2300
2296 if (readl(dev->bar + NVME_REG_CSTS) == -1) { 2301 if (readl(dev->bar + NVME_REG_CSTS) == -1) {
@@ -2498,7 +2503,8 @@ static void nvme_reset_work(struct work_struct *work)
2498 * Limit the max command size to prevent iod->sg allocations going 2503 * Limit the max command size to prevent iod->sg allocations going
2499 * over a single page. 2504 * over a single page.
2500 */ 2505 */
2501 dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; 2506 dev->ctrl.max_hw_sectors = min_t(u32,
2507 NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
2502 dev->ctrl.max_segments = NVME_MAX_SEGS; 2508 dev->ctrl.max_segments = NVME_MAX_SEGS;
2503 2509
2504 /* 2510 /*
@@ -2923,7 +2929,7 @@ static int nvme_simple_resume(struct device *dev)
2923 return 0; 2929 return 0;
2924} 2930}
2925 2931
2926const struct dev_pm_ops nvme_dev_pm_ops = { 2932static const struct dev_pm_ops nvme_dev_pm_ops = {
2927 .suspend = nvme_suspend, 2933 .suspend = nvme_suspend,
2928 .resume = nvme_resume, 2934 .resume = nvme_resume,
2929 .freeze = nvme_simple_suspend, 2935 .freeze = nvme_simple_suspend,
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 08a2501b9357..606b13d35d16 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -860,7 +860,14 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
860 else 860 else
861 flags |= MSG_MORE; 861 flags |= MSG_MORE;
862 862
863 ret = kernel_sendpage(queue->sock, page, offset, len, flags); 863 /* can't zcopy slab pages */
864 if (unlikely(PageSlab(page))) {
865 ret = sock_no_sendpage(queue->sock, page, offset, len,
866 flags);
867 } else {
868 ret = kernel_sendpage(queue->sock, page, offset, len,
869 flags);
870 }
864 if (ret <= 0) 871 if (ret <= 0)
865 return ret; 872 return ret;
866 873
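
The tcp.c change stops handing slab-backed pages to kernel_sendpage(), which would try to pin the page for zero-copy, and routes them through the copying sock_no_sendpage() path instead. A hedged userspace analogue of that dispatch; send_copy() and send_zerocopy() are illustrative stubs, not kernel or socket APIs:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static size_t send_copy(const void *buf, size_t len)
{
	(void)buf;              /* stub: pretend the data was copied and sent */
	return len;
}

static size_t send_zerocopy(const void *buf, size_t len)
{
	(void)buf;              /* stub: pretend the page was handed off by reference */
	return len;
}

static size_t send_data(const void *buf, size_t len, bool refcountable)
{
	/* mirrors: PageSlab(page) ? sock_no_sendpage() : kernel_sendpage() */
	if (!refcountable)
		return send_copy(buf, len);
	return send_zerocopy(buf, len);
}

int main(void)
{
	char slab_like[64] = "small allocation";

	printf("sent %zu bytes via the copy path\n",
	       send_data(slab_like, strlen(slab_like), false));
	return 0;
}
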
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index f01ad0fd60bb..9778eb0406b3 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -7,6 +7,17 @@
7#include <asm/unaligned.h> 7#include <asm/unaligned.h>
8#include "trace.h" 8#include "trace.h"
9 9
10static const char *nvme_trace_delete_sq(struct trace_seq *p, u8 *cdw10)
11{
12 const char *ret = trace_seq_buffer_ptr(p);
13 u16 sqid = get_unaligned_le16(cdw10);
14
15 trace_seq_printf(p, "sqid=%u", sqid);
16 trace_seq_putc(p, 0);
17
18 return ret;
19}
20
10static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) 21static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
11{ 22{
12 const char *ret = trace_seq_buffer_ptr(p); 23 const char *ret = trace_seq_buffer_ptr(p);
@@ -23,6 +34,17 @@ static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
23 return ret; 34 return ret;
24} 35}
25 36
37static const char *nvme_trace_delete_cq(struct trace_seq *p, u8 *cdw10)
38{
39 const char *ret = trace_seq_buffer_ptr(p);
40 u16 cqid = get_unaligned_le16(cdw10);
41
42 trace_seq_printf(p, "cqid=%u", cqid);
43 trace_seq_putc(p, 0);
44
45 return ret;
46}
47
26static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) 48static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
27{ 49{
28 const char *ret = trace_seq_buffer_ptr(p); 50 const char *ret = trace_seq_buffer_ptr(p);
@@ -107,8 +129,12 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
107 u8 opcode, u8 *cdw10) 129 u8 opcode, u8 *cdw10)
108{ 130{
109 switch (opcode) { 131 switch (opcode) {
132 case nvme_admin_delete_sq:
133 return nvme_trace_delete_sq(p, cdw10);
110 case nvme_admin_create_sq: 134 case nvme_admin_create_sq:
111 return nvme_trace_create_sq(p, cdw10); 135 return nvme_trace_create_sq(p, cdw10);
136 case nvme_admin_delete_cq:
137 return nvme_trace_delete_cq(p, cdw10);
112 case nvme_admin_create_cq: 138 case nvme_admin_create_cq:
113 return nvme_trace_create_cq(p, cdw10); 139 return nvme_trace_create_cq(p, cdw10);
114 case nvme_admin_identify: 140 case nvme_admin_identify:
@@ -178,7 +204,7 @@ static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
178{ 204{
179 const char *ret = trace_seq_buffer_ptr(p); 205 const char *ret = trace_seq_buffer_ptr(p);
180 206
181 trace_seq_printf(p, "spcecific=%*ph", 24, spc); 207 trace_seq_printf(p, "specific=%*ph", 24, spc);
182 trace_seq_putc(p, 0); 208 trace_seq_putc(p, 0);
183 return ret; 209 return ret;
184} 210}
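
The new delete_sq/delete_cq decoders simply read a 16-bit little-endian queue ID from the first two bytes of CDW10 and print it. A standalone sketch of that parsing, with get_unaligned_le16() reimplemented locally for userspace:

#include <stdio.h>
#include <stdint.h>

static uint16_t get_unaligned_le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

int main(void)
{
	/* CDW10 as it appears in the submission queue entry, little-endian */
	const uint8_t cdw10[4] = { 0x05, 0x00, 0x00, 0x00 };

	printf("sqid=%u\n", (unsigned)get_unaligned_le16(cdw10));  /* prints sqid=5 */
	return 0;
}
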
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 9f72d515fc4b..4dc12ea52f23 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -442,6 +442,9 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
442 break; 442 break;
443 } 443 }
444 444
445 if (ns->bdev)
446 nvmet_bdev_set_limits(ns->bdev, id);
447
445 /* 448 /*
446 * We just provide a single LBA format that matches what the 449 * We just provide a single LBA format that matches what the
447 * underlying device reports. 450 * underlying device reports.
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 08dd5af357f7..cd52b9f15376 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -588,8 +588,10 @@ static struct config_group *nvmet_ns_make(struct config_group *group,
588 goto out; 588 goto out;
589 589
590 ret = -EINVAL; 590 ret = -EINVAL;
591 if (nsid == 0 || nsid == NVME_NSID_ALL) 591 if (nsid == 0 || nsid == NVME_NSID_ALL) {
592 pr_err("invalid nsid %#x", nsid);
592 goto out; 593 goto out;
594 }
593 595
594 ret = -ENOMEM; 596 ret = -ENOMEM;
595 ns = nvmet_ns_alloc(subsys, nsid); 597 ns = nvmet_ns_alloc(subsys, nsid);
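
The configfs hunk adds a log line explaining why namespace creation was rejected. A small sketch of the same validity check, spelling out NVME_NSID_ALL as 0xffffffff (its value in the NVMe headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NVME_NSID_ALL 0xffffffffU

static bool nsid_is_valid(uint32_t nsid)
{
	if (nsid == 0 || nsid == NVME_NSID_ALL) {
		fprintf(stderr, "invalid nsid %#x\n", nsid);
		return false;
	}
	return true;
}

int main(void)
{
	printf("nsid 1: %s\n", nsid_is_valid(1) ? "ok" : "rejected");
	printf("nsid 0: %s\n", nsid_is_valid(0) ? "ok" : "rejected");
	return 0;
}
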
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index b8c1cc54a0db..b50b53db3746 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -434,7 +434,7 @@ fcloop_fcp_recv_work(struct work_struct *work)
434 int ret = 0; 434 int ret = 0;
435 bool aborted = false; 435 bool aborted = false;
436 436
437 spin_lock(&tfcp_req->reqlock); 437 spin_lock_irq(&tfcp_req->reqlock);
438 switch (tfcp_req->inistate) { 438 switch (tfcp_req->inistate) {
439 case INI_IO_START: 439 case INI_IO_START:
440 tfcp_req->inistate = INI_IO_ACTIVE; 440 tfcp_req->inistate = INI_IO_ACTIVE;
@@ -443,11 +443,11 @@ fcloop_fcp_recv_work(struct work_struct *work)
443 aborted = true; 443 aborted = true;
444 break; 444 break;
445 default: 445 default:
446 spin_unlock(&tfcp_req->reqlock); 446 spin_unlock_irq(&tfcp_req->reqlock);
447 WARN_ON(1); 447 WARN_ON(1);
448 return; 448 return;
449 } 449 }
450 spin_unlock(&tfcp_req->reqlock); 450 spin_unlock_irq(&tfcp_req->reqlock);
451 451
452 if (unlikely(aborted)) 452 if (unlikely(aborted))
453 ret = -ECANCELED; 453 ret = -ECANCELED;
@@ -469,7 +469,7 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
469 struct nvmefc_fcp_req *fcpreq; 469 struct nvmefc_fcp_req *fcpreq;
470 bool completed = false; 470 bool completed = false;
471 471
472 spin_lock(&tfcp_req->reqlock); 472 spin_lock_irq(&tfcp_req->reqlock);
473 fcpreq = tfcp_req->fcpreq; 473 fcpreq = tfcp_req->fcpreq;
474 switch (tfcp_req->inistate) { 474 switch (tfcp_req->inistate) {
475 case INI_IO_ABORTED: 475 case INI_IO_ABORTED:
@@ -478,11 +478,11 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
478 completed = true; 478 completed = true;
479 break; 479 break;
480 default: 480 default:
481 spin_unlock(&tfcp_req->reqlock); 481 spin_unlock_irq(&tfcp_req->reqlock);
482 WARN_ON(1); 482 WARN_ON(1);
483 return; 483 return;
484 } 484 }
485 spin_unlock(&tfcp_req->reqlock); 485 spin_unlock_irq(&tfcp_req->reqlock);
486 486
487 if (unlikely(completed)) { 487 if (unlikely(completed)) {
488 /* remove reference taken in original abort downcall */ 488 /* remove reference taken in original abort downcall */
@@ -494,9 +494,9 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
494 nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, 494 nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
495 &tfcp_req->tgt_fcp_req); 495 &tfcp_req->tgt_fcp_req);
496 496
497 spin_lock(&tfcp_req->reqlock); 497 spin_lock_irq(&tfcp_req->reqlock);
498 tfcp_req->fcpreq = NULL; 498 tfcp_req->fcpreq = NULL;
499 spin_unlock(&tfcp_req->reqlock); 499 spin_unlock_irq(&tfcp_req->reqlock);
500 500
501 fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); 501 fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
502 /* call_host_done releases reference for abort downcall */ 502 /* call_host_done releases reference for abort downcall */
@@ -513,10 +513,10 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
513 container_of(work, struct fcloop_fcpreq, tio_done_work); 513 container_of(work, struct fcloop_fcpreq, tio_done_work);
514 struct nvmefc_fcp_req *fcpreq; 514 struct nvmefc_fcp_req *fcpreq;
515 515
516 spin_lock(&tfcp_req->reqlock); 516 spin_lock_irq(&tfcp_req->reqlock);
517 fcpreq = tfcp_req->fcpreq; 517 fcpreq = tfcp_req->fcpreq;
518 tfcp_req->inistate = INI_IO_COMPLETED; 518 tfcp_req->inistate = INI_IO_COMPLETED;
519 spin_unlock(&tfcp_req->reqlock); 519 spin_unlock_irq(&tfcp_req->reqlock);
520 520
521 fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); 521 fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
522} 522}
@@ -535,7 +535,7 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
535 if (!rport->targetport) 535 if (!rport->targetport)
536 return -ECONNREFUSED; 536 return -ECONNREFUSED;
537 537
538 tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL); 538 tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_ATOMIC);
539 if (!tfcp_req) 539 if (!tfcp_req)
540 return -ENOMEM; 540 return -ENOMEM;
541 541
@@ -621,12 +621,12 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
621 int fcp_err = 0, active, aborted; 621 int fcp_err = 0, active, aborted;
622 u8 op = tgt_fcpreq->op; 622 u8 op = tgt_fcpreq->op;
623 623
624 spin_lock(&tfcp_req->reqlock); 624 spin_lock_irq(&tfcp_req->reqlock);
625 fcpreq = tfcp_req->fcpreq; 625 fcpreq = tfcp_req->fcpreq;
626 active = tfcp_req->active; 626 active = tfcp_req->active;
627 aborted = tfcp_req->aborted; 627 aborted = tfcp_req->aborted;
628 tfcp_req->active = true; 628 tfcp_req->active = true;
629 spin_unlock(&tfcp_req->reqlock); 629 spin_unlock_irq(&tfcp_req->reqlock);
630 630
631 if (unlikely(active)) 631 if (unlikely(active))
632 /* illegal - call while i/o active */ 632 /* illegal - call while i/o active */
@@ -634,9 +634,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
634 634
635 if (unlikely(aborted)) { 635 if (unlikely(aborted)) {
636 /* target transport has aborted i/o prior */ 636 /* target transport has aborted i/o prior */
637 spin_lock(&tfcp_req->reqlock); 637 spin_lock_irq(&tfcp_req->reqlock);
638 tfcp_req->active = false; 638 tfcp_req->active = false;
639 spin_unlock(&tfcp_req->reqlock); 639 spin_unlock_irq(&tfcp_req->reqlock);
640 tgt_fcpreq->transferred_length = 0; 640 tgt_fcpreq->transferred_length = 0;
641 tgt_fcpreq->fcp_error = -ECANCELED; 641 tgt_fcpreq->fcp_error = -ECANCELED;
642 tgt_fcpreq->done(tgt_fcpreq); 642 tgt_fcpreq->done(tgt_fcpreq);
@@ -693,9 +693,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
693 break; 693 break;
694 } 694 }
695 695
696 spin_lock(&tfcp_req->reqlock); 696 spin_lock_irq(&tfcp_req->reqlock);
697 tfcp_req->active = false; 697 tfcp_req->active = false;
698 spin_unlock(&tfcp_req->reqlock); 698 spin_unlock_irq(&tfcp_req->reqlock);
699 699
700 tgt_fcpreq->transferred_length = xfrlen; 700 tgt_fcpreq->transferred_length = xfrlen;
701 tgt_fcpreq->fcp_error = fcp_err; 701 tgt_fcpreq->fcp_error = fcp_err;
@@ -715,9 +715,9 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport,
715 * (one doing io, other doing abort) and only kills ops posted 715 * (one doing io, other doing abort) and only kills ops posted
716 * after the abort request 716 * after the abort request
717 */ 717 */
718 spin_lock(&tfcp_req->reqlock); 718 spin_lock_irq(&tfcp_req->reqlock);
719 tfcp_req->aborted = true; 719 tfcp_req->aborted = true;
720 spin_unlock(&tfcp_req->reqlock); 720 spin_unlock_irq(&tfcp_req->reqlock);
721 721
722 tfcp_req->status = NVME_SC_INTERNAL; 722 tfcp_req->status = NVME_SC_INTERNAL;
723 723
@@ -765,7 +765,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
765 return; 765 return;
766 766
767 /* break initiator/target relationship for io */ 767 /* break initiator/target relationship for io */
768 spin_lock(&tfcp_req->reqlock); 768 spin_lock_irq(&tfcp_req->reqlock);
769 switch (tfcp_req->inistate) { 769 switch (tfcp_req->inistate) {
770 case INI_IO_START: 770 case INI_IO_START:
771 case INI_IO_ACTIVE: 771 case INI_IO_ACTIVE:
@@ -775,11 +775,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
775 abortio = false; 775 abortio = false;
776 break; 776 break;
777 default: 777 default:
778 spin_unlock(&tfcp_req->reqlock); 778 spin_unlock_irq(&tfcp_req->reqlock);
779 WARN_ON(1); 779 WARN_ON(1);
780 return; 780 return;
781 } 781 }
782 spin_unlock(&tfcp_req->reqlock); 782 spin_unlock_irq(&tfcp_req->reqlock);
783 783
784 if (abortio) 784 if (abortio)
785 /* leave the reference while the work item is scheduled */ 785 /* leave the reference while the work item is scheduled */
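
The fcloop changes switch reqlock to the irq-disabling spin_lock_irq() variants and move the tfcp_req allocation to GFP_ATOMIC, since the callers can no longer sleep or tolerate interrupt-time recursion on the lock. A userspace program cannot reproduce the irq masking, but the locked inistate transition pattern itself looks roughly like the sketch below; a pthread mutex stands in for the spinlock, and this is an analogue, not the driver code:

#include <pthread.h>
#include <stdio.h>

enum ini_state { INI_IO_START, INI_IO_ACTIVE, INI_IO_ABORTED, INI_IO_COMPLETED };

struct fake_req {
	pthread_mutex_t lock;   /* stands in for tfcp_req->reqlock */
	enum ini_state inistate;
};

/* Returns 1 if the request was picked up for processing, 0 if it had
 * already been aborted before the work item ran. */
static int recv_work(struct fake_req *req)
{
	int aborted = 0;

	pthread_mutex_lock(&req->lock);
	switch (req->inistate) {
	case INI_IO_START:
		req->inistate = INI_IO_ACTIVE;
		break;
	case INI_IO_ABORTED:
		aborted = 1;
		break;
	default:
		break;
	}
	pthread_mutex_unlock(&req->lock);

	return !aborted;
}

int main(void)
{
	struct fake_req req = { PTHREAD_MUTEX_INITIALIZER, INI_IO_START };

	printf("picked up: %d, state now: %d\n", recv_work(&req), (int)req.inistate);
	return 0;
}
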
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 7a1cf6437a6a..de0bff70ebb6 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -8,6 +8,45 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "nvmet.h" 9#include "nvmet.h"
10 10
11void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
12{
13 const struct queue_limits *ql = &bdev_get_queue(bdev)->limits;
14 /* Number of physical blocks per logical block. */
15 const u32 ppl = ql->physical_block_size / ql->logical_block_size;
16 /* Physical blocks per logical block, 0's based. */
17 const __le16 ppl0b = to0based(ppl);
18
19 /*
20 * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN,
21 * NAWUPF, and NACWU are defined for this namespace and should be
22 * used by the host for this namespace instead of the AWUN, AWUPF,
23 * and ACWU fields in the Identify Controller data structure. If
24 * any of these fields are zero that means that the corresponding
25 * field from the identify controller data structure should be used.
26 */
27 id->nsfeat |= 1 << 1;
28 id->nawun = ppl0b;
29 id->nawupf = ppl0b;
30 id->nacwu = ppl0b;
31
32 /*
33 * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and
34 * NOWS are defined for this namespace and should be used by
35 * the host for I/O optimization.
36 */
37 id->nsfeat |= 1 << 4;
38 /* NPWG = Namespace Preferred Write Granularity. 0's based */
39 id->npwg = ppl0b;
40 /* NPWA = Namespace Preferred Write Alignment. 0's based */
41 id->npwa = id->npwg;
42 /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */
43 id->npdg = to0based(ql->discard_granularity / ql->logical_block_size);
44	/* NPDA = Namespace Preferred Deallocate Alignment */
45 id->npda = id->npdg;
46 /* NOWS = Namespace Optimal Write Size */
47 id->nows = to0based(ql->io_opt / ql->logical_block_size);
48}
49
11int nvmet_bdev_ns_enable(struct nvmet_ns *ns) 50int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
12{ 51{
13 int ret; 52 int ret;
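
nvmet_bdev_set_limits() derives all of the new Identify Namespace hints from ratios of the queue limits and reports them 0's based. A self-contained sketch of that arithmetic with plain integers; the queue limit values are made up, and to0based() here mirrors the helper added to nvmet.h below minus the cpu_to_le16() conversion:

#include <stdint.h>
#include <stdio.h>

/* Same clamp as the to0based() helper added to nvmet.h: 1..65536 maps to
 * 0..65535 (endianness conversion dropped for the sketch). */
static uint16_t to0based(uint32_t a)
{
	if (a < 1)
		a = 1;
	if (a > 65536)
		a = 65536;
	return (uint16_t)(a - 1);
}

int main(void)
{
	/* made-up queue limits: 4K physical / 512B logical, 4K discard, 128K io_opt */
	const uint32_t logical  = 512;
	const uint32_t physical = 4096;
	const uint32_t discard_granularity = 4096;
	const uint32_t io_opt   = 131072;

	unsigned int ppl0b = to0based(physical / logical);

	printf("nawun/nawupf/nacwu = %u (i.e. %u blocks)\n", ppl0b, ppl0b + 1);
	printf("npwg/npwa          = %u\n", ppl0b);
	printf("npdg/npda          = %u\n", (unsigned)to0based(discard_granularity / logical));
	printf("nows               = %u (i.e. %u blocks)\n",
	       (unsigned)to0based(io_opt / logical),
	       (unsigned)to0based(io_opt / logical) + 1);
	return 0;
}
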
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index dc270944bb25..6ee66c610739 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -365,6 +365,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask);
365void nvmet_execute_async_event(struct nvmet_req *req); 365void nvmet_execute_async_event(struct nvmet_req *req);
366 366
367u16 nvmet_parse_connect_cmd(struct nvmet_req *req); 367u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
368void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id);
368u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); 369u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
369u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); 370u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
370u16 nvmet_parse_admin_cmd(struct nvmet_req *req); 371u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
@@ -492,4 +493,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req)
492} 493}
493 494
494u16 errno_to_nvme_status(struct nvmet_req *req, int errno); 495u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
496
497/* Convert a 32-bit number to a 16-bit 0's based number */
498static inline __le16 to0based(u32 a)
499{
500 return cpu_to_le16(max(1U, min(1U << 16, a)) - 1);
501}
502
495#endif /* _NVMET_H */ 503#endif /* _NVMET_H */
diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c
index cdcdd14c6408..6af11d493271 100644
--- a/drivers/nvme/target/trace.c
+++ b/drivers/nvme/target/trace.c
@@ -146,7 +146,7 @@ static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc)
146{ 146{
147 const char *ret = trace_seq_buffer_ptr(p); 147 const char *ret = trace_seq_buffer_ptr(p);
148 148
149 trace_seq_printf(p, "spcecific=%*ph", 24, spc); 149 trace_seq_printf(p, "specific=%*ph", 24, spc);
150 trace_seq_putc(p, 0); 150 trace_seq_putc(p, 0);
151 return ret; 151 return ret;
152} 152}
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 5796ace76225..38c50946fc42 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -213,8 +213,7 @@ extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
213extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, 213extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
214 struct scsi_sense_hdr *sshdr); 214 struct scsi_sense_hdr *sshdr);
215extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, 215extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
216 struct blk_zone *zones, unsigned int *nr_zones, 216 struct blk_zone *zones, unsigned int *nr_zones);
217 gfp_t gfp_mask);
218 217
219#else /* CONFIG_BLK_DEV_ZONED */ 218#else /* CONFIG_BLK_DEV_ZONED */
220 219
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7334024b64f1..db16c19e05c4 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -9,6 +9,8 @@
9 */ 9 */
10 10
11#include <linux/blkdev.h> 11#include <linux/blkdev.h>
12#include <linux/vmalloc.h>
13#include <linux/sched/mm.h>
12 14
13#include <asm/unaligned.h> 15#include <asm/unaligned.h>
14 16
@@ -50,7 +52,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
50/** 52/**
51 * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command. 53 * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
52 * @sdkp: The target disk 54 * @sdkp: The target disk
53 * @buf: Buffer to use for the reply 55 * @buf: vmalloc-ed buffer to use for the reply
54 * @buflen: the buffer size 56 * @buflen: the buffer size
55 * @lba: Start LBA of the report 57 * @lba: Start LBA of the report
56 * @partial: Do partial report 58 * @partial: Do partial report
@@ -79,7 +81,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
79 put_unaligned_be32(buflen, &cmd[10]); 81 put_unaligned_be32(buflen, &cmd[10]);
80 if (partial) 82 if (partial)
81 cmd[14] = ZBC_REPORT_ZONE_PARTIAL; 83 cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
82 memset(buf, 0, buflen);
83 84
84 result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, 85 result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
85 buf, buflen, &sshdr, 86 buf, buflen, &sshdr,
@@ -103,45 +104,83 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
103 return 0; 104 return 0;
104} 105}
105 106
107/*
108 * Maximum number of zones to get with one report zones command.
109 */
110#define SD_ZBC_REPORT_MAX_ZONES 8192U
111
112/**
113 * Allocate a buffer for report zones reply.
114 * @sdkp: The target disk
115 * @nr_zones: Maximum number of zones to report
116 * @buflen: Size of the buffer allocated
117 *
118 * Try to allocate a reply buffer for the number of requested zones.
119 * The size of the buffer allocated may be smaller than requested to
120	 * satisfy the device constraint (max_hw_sectors, max_segments, etc).
121 *
122 * Return the address of the allocated buffer and update @buflen with
123 * the size of the allocated buffer.
124 */
125static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
126 unsigned int nr_zones, size_t *buflen)
127{
128 struct request_queue *q = sdkp->disk->queue;
129 size_t bufsize;
130 void *buf;
131
132 /*
133 * Report zone buffer size should be at most 64B times the number of
134 * zones requested plus the 64B reply header, but should be at least
135 * SECTOR_SIZE for ATA devices.
136 * Make sure that this size does not exceed the hardware capabilities.
137 * Furthermore, since the report zone command cannot be split, make
138 * sure that the allocated buffer can always be mapped by limiting the
139 * number of pages allocated to the HBA max segments limit.
140 */
141 nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
142 bufsize = roundup((nr_zones + 1) * 64, 512);
143 bufsize = min_t(size_t, bufsize,
144 queue_max_hw_sectors(q) << SECTOR_SHIFT);
145 bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
146
147 buf = vzalloc(bufsize);
148 if (buf)
149 *buflen = bufsize;
150
151 return buf;
152}
153
106/** 154/**
107 * sd_zbc_report_zones - Disk report zones operation. 155 * sd_zbc_report_zones - Disk report zones operation.
108 * @disk: The target disk 156 * @disk: The target disk
109 * @sector: Start 512B sector of the report 157 * @sector: Start 512B sector of the report
110 * @zones: Array of zone descriptors 158 * @zones: Array of zone descriptors
111 * @nr_zones: Number of descriptors in the array 159 * @nr_zones: Number of descriptors in the array
112 * @gfp_mask: Memory allocation mask
113 * 160 *
114 * Execute a report zones command on the target disk. 161 * Execute a report zones command on the target disk.
115 */ 162 */
116int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, 163int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
117 struct blk_zone *zones, unsigned int *nr_zones, 164 struct blk_zone *zones, unsigned int *nr_zones)
118 gfp_t gfp_mask)
119{ 165{
120 struct scsi_disk *sdkp = scsi_disk(disk); 166 struct scsi_disk *sdkp = scsi_disk(disk);
121 unsigned int i, buflen, nrz = *nr_zones; 167 unsigned int i, nrz = *nr_zones;
122 unsigned char *buf; 168 unsigned char *buf;
123 size_t offset = 0; 169 size_t buflen = 0, offset = 0;
124 int ret = 0; 170 int ret = 0;
125 171
126 if (!sd_is_zoned(sdkp)) 172 if (!sd_is_zoned(sdkp))
127 /* Not a zoned device */ 173 /* Not a zoned device */
128 return -EOPNOTSUPP; 174 return -EOPNOTSUPP;
129 175
130 /* 176 buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
131 * Get a reply buffer for the number of requested zones plus a header,
132 * without exceeding the device maximum command size. For ATA disks,
133 * buffers must be aligned to 512B.
134 */
135 buflen = min(queue_max_hw_sectors(disk->queue) << 9,
136 roundup((nrz + 1) * 64, 512));
137 buf = kmalloc(buflen, gfp_mask);
138 if (!buf) 177 if (!buf)
139 return -ENOMEM; 178 return -ENOMEM;
140 179
141 ret = sd_zbc_do_report_zones(sdkp, buf, buflen, 180 ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
142 sectors_to_logical(sdkp->device, sector), true); 181 sectors_to_logical(sdkp->device, sector), true);
143 if (ret) 182 if (ret)
144 goto out_free_buf; 183 goto out;
145 184
146 nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64); 185 nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
147 for (i = 0; i < nrz; i++) { 186 for (i = 0; i < nrz; i++) {
@@ -152,8 +191,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
152 191
153 *nr_zones = nrz; 192 *nr_zones = nrz;
154 193
155out_free_buf: 194out:
156 kfree(buf); 195 kvfree(buf);
157 196
158 return ret; 197 return ret;
159} 198}
@@ -287,8 +326,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
287 return 0; 326 return 0;
288} 327}
289 328
290#define SD_ZBC_BUF_SIZE 131072U
291
292/** 329/**
293 * sd_zbc_check_zones - Check the device capacity and zone sizes 330 * sd_zbc_check_zones - Check the device capacity and zone sizes
294 * @sdkp: Target disk 331 * @sdkp: Target disk
@@ -304,22 +341,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
304 */ 341 */
305static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) 342static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
306{ 343{
344 size_t bufsize, buflen;
345 unsigned int noio_flag;
307 u64 zone_blocks = 0; 346 u64 zone_blocks = 0;
308 sector_t max_lba, block = 0; 347 sector_t max_lba, block = 0;
309 unsigned char *buf; 348 unsigned char *buf;
310 unsigned char *rec; 349 unsigned char *rec;
311 unsigned int buf_len;
312 unsigned int list_length;
313 int ret; 350 int ret;
314 u8 same; 351 u8 same;
315 352
353 /* Do all memory allocations as if GFP_NOIO was specified */
354 noio_flag = memalloc_noio_save();
355
316 /* Get a buffer */ 356 /* Get a buffer */
317 buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL); 357 buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
318 if (!buf) 358 &bufsize);
319 return -ENOMEM; 359 if (!buf) {
360 ret = -ENOMEM;
361 goto out;
362 }
320 363
321 /* Do a report zone to get max_lba and the same field */ 364 /* Do a report zone to get max_lba and the same field */
322 ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false); 365 ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
323 if (ret) 366 if (ret)
324 goto out_free; 367 goto out_free;
325 368
@@ -355,12 +398,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
355 do { 398 do {
356 399
357 /* Parse REPORT ZONES header */ 400 /* Parse REPORT ZONES header */
358 list_length = get_unaligned_be32(&buf[0]) + 64; 401 buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
402 bufsize);
359 rec = buf + 64; 403 rec = buf + 64;
360 buf_len = min(list_length, SD_ZBC_BUF_SIZE);
361 404
362 /* Parse zone descriptors */ 405 /* Parse zone descriptors */
363 while (rec < buf + buf_len) { 406 while (rec < buf + buflen) {
364 u64 this_zone_blocks = get_unaligned_be64(&rec[8]); 407 u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
365 408
366 if (zone_blocks == 0) { 409 if (zone_blocks == 0) {
@@ -376,8 +419,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
376 } 419 }
377 420
378 if (block < sdkp->capacity) { 421 if (block < sdkp->capacity) {
379 ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 422 ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
380 block, true); 423 true);
381 if (ret) 424 if (ret)
382 goto out_free; 425 goto out_free;
383 } 426 }
@@ -408,7 +451,8 @@ out:
408 } 451 }
409 452
410out_free: 453out_free:
411 kfree(buf); 454 memalloc_noio_restore(noio_flag);
455 kvfree(buf);
412 456
413 return ret; 457 return ret;
414} 458}
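
sd_zbc_alloc_report_buffer() sizes the reply buffer as 64B per requested zone plus a 64B header, rounds up to 512B for ATA, then clamps the result so one report zones command still fits within the HBA's transfer and segment limits. A standalone sketch of that sizing math; the queue limits passed in are assumed example values:

#include <stddef.h>
#include <stdio.h>

#define SECTOR_SHIFT 9
#define PAGE_SHIFT   12
#define SD_ZBC_REPORT_MAX_ZONES 8192U

static size_t roundup_512(size_t x) { return (x + 511) & ~(size_t)511; }
static size_t min_size(size_t a, size_t b) { return a < b ? a : b; }

static size_t report_bufsize(unsigned int nr_zones,
			     unsigned int max_hw_sectors,
			     unsigned int max_segments)
{
	size_t bufsize;

	if (nr_zones > SD_ZBC_REPORT_MAX_ZONES)
		nr_zones = SD_ZBC_REPORT_MAX_ZONES;

	/* 64B per zone descriptor plus a 64B header, 512B aligned for ATA */
	bufsize = roundup_512((size_t)(nr_zones + 1) * 64);
	/* cannot exceed what one command can transfer ... */
	bufsize = min_size(bufsize, (size_t)max_hw_sectors << SECTOR_SHIFT);
	/* ... or what the HBA can map as a single scatter/gather list */
	bufsize = min_size(bufsize, (size_t)max_segments << PAGE_SHIFT);
	return bufsize;
}

int main(void)
{
	/* example limits: 1024-sector transfers, 128 segments, 4K pages */
	printf("8192 zones -> %zu bytes\n", report_bufsize(8192, 1024, 128));
	printf("  16 zones -> %zu bytes\n", report_bufsize(16, 1024, 128));
	return 0;
}
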
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index db337e53aab3..5106008f5e28 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2911,7 +2911,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2911 bio = NULL; 2911 bio = NULL;
2912 } else { 2912 } else {
2913 if (wbc) 2913 if (wbc)
2914 wbc_account_io(wbc, page, page_size); 2914 wbc_account_cgroup_owner(wbc, page, page_size);
2915 return 0; 2915 return 0;
2916 } 2916 }
2917 } 2917 }
@@ -2924,7 +2924,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2924 bio->bi_opf = opf; 2924 bio->bi_opf = opf;
2925 if (wbc) { 2925 if (wbc) {
2926 wbc_init_bio(wbc, bio); 2926 wbc_init_bio(wbc, bio);
2927 wbc_account_io(wbc, page, page_size); 2927 wbc_account_cgroup_owner(wbc, page, page_size);
2928 } 2928 }
2929 2929
2930 *bio_ret = bio; 2930 *bio_ret = bio;
diff --git a/fs/buffer.c b/fs/buffer.c
index 49a871570092..86a38b979323 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3089,7 +3089,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3089 3089
3090 if (wbc) { 3090 if (wbc) {
3091 wbc_init_bio(wbc, bio); 3091 wbc_init_bio(wbc, bio);
3092 wbc_account_io(wbc, bh->b_page, bh->b_size); 3092 wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
3093 } 3093 }
3094 3094
3095 submit_bio(bio); 3095 submit_bio(bio);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index a18a47a2a1d1..12ceadef32c5 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -396,7 +396,7 @@ submit_and_retry:
396 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); 396 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
397 if (ret != bh->b_size) 397 if (ret != bh->b_size)
398 goto submit_and_retry; 398 goto submit_and_retry;
399 wbc_account_io(io->io_wbc, page, bh->b_size); 399 wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
400 io->io_next_block++; 400 io->io_next_block++;
401 return 0; 401 return 0;
402} 402}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 0ca530afc684..4eb2f3920140 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -470,7 +470,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
470 } 470 }
471 471
472 if (fio->io_wbc && !is_read_io(fio->op)) 472 if (fio->io_wbc && !is_read_io(fio->op))
473 wbc_account_io(fio->io_wbc, page, PAGE_SIZE); 473 wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
474 474
475 bio_set_op_attrs(bio, fio->op, fio->op_flags); 475 bio_set_op_attrs(bio, fio->op, fio->op_flags);
476 476
@@ -513,7 +513,7 @@ alloc_new:
513 } 513 }
514 514
515 if (fio->io_wbc) 515 if (fio->io_wbc)
516 wbc_account_io(fio->io_wbc, page, PAGE_SIZE); 516 wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
517 517
518 inc_page_count(fio->sbi, WB_DATA_TYPE(page)); 518 inc_page_count(fio->sbi, WB_DATA_TYPE(page));
519 519
@@ -592,7 +592,7 @@ alloc_new:
592 } 592 }
593 593
594 if (fio->io_wbc) 594 if (fio->io_wbc)
595 wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); 595 wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
596 596
597 io->last_block_in_bio = fio->new_blkaddr; 597 io->last_block_in_bio = fio->new_blkaddr;
598 f2fs_trace_ios(fio, 0); 598 f2fs_trace_ios(fio, 0);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d95a681ef7c9..6de6cda44031 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2818,9 +2818,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
2818 while (zones && sector < nr_sectors) { 2818 while (zones && sector < nr_sectors) {
2819 2819
2820 nr_zones = F2FS_REPORT_NR_ZONES; 2820 nr_zones = F2FS_REPORT_NR_ZONES;
2821 err = blkdev_report_zones(bdev, sector, 2821 err = blkdev_report_zones(bdev, sector, zones, &nr_zones);
2822 zones, &nr_zones,
2823 GFP_KERNEL);
2824 if (err) 2822 if (err)
2825 break; 2823 break;
2826 if (!nr_zones) { 2824 if (!nr_zones) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9ebfb1b28430..542b02d170f8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
270 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) 270 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
271 wb_put(wb); 271 wb_put(wb);
272} 272}
273EXPORT_SYMBOL_GPL(__inode_attach_wb);
273 274
274/** 275/**
275 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it 276 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
@@ -582,6 +583,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
582 if (unlikely(wb_dying(wbc->wb))) 583 if (unlikely(wb_dying(wbc->wb)))
583 inode_switch_wbs(inode, wbc->wb_id); 584 inode_switch_wbs(inode, wbc->wb_id);
584} 585}
586EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
585 587
586/** 588/**
587 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection 589 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -701,9 +703,10 @@ void wbc_detach_inode(struct writeback_control *wbc)
701 wb_put(wbc->wb); 703 wb_put(wbc->wb);
702 wbc->wb = NULL; 704 wbc->wb = NULL;
703} 705}
706EXPORT_SYMBOL_GPL(wbc_detach_inode);
704 707
705/** 708/**
706 * wbc_account_io - account IO issued during writeback 709 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
707 * @wbc: writeback_control of the writeback in progress 710 * @wbc: writeback_control of the writeback in progress
708 * @page: page being written out 711 * @page: page being written out
709 * @bytes: number of bytes being written out 712 * @bytes: number of bytes being written out
@@ -712,8 +715,8 @@ void wbc_detach_inode(struct writeback_control *wbc)
712 * controlled by @wbc. Keep the book for foreign inode detection. See 715 * controlled by @wbc. Keep the book for foreign inode detection. See
713 * wbc_detach_inode(). 716 * wbc_detach_inode().
714 */ 717 */
715void wbc_account_io(struct writeback_control *wbc, struct page *page, 718void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
716 size_t bytes) 719 size_t bytes)
717{ 720{
718 struct cgroup_subsys_state *css; 721 struct cgroup_subsys_state *css;
719 int id; 722 int id;
@@ -724,7 +727,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
724 * behind a slow cgroup. Ultimately, we want pageout() to kick off 727 * behind a slow cgroup. Ultimately, we want pageout() to kick off
725 * regular writeback instead of writing things out itself. 728 * regular writeback instead of writing things out itself.
726 */ 729 */
727 if (!wbc->wb) 730 if (!wbc->wb || wbc->no_cgroup_owner)
728 return; 731 return;
729 732
730 css = mem_cgroup_css_from_page(page); 733 css = mem_cgroup_css_from_page(page);
@@ -750,7 +753,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
750 else 753 else
751 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); 754 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
752} 755}
753EXPORT_SYMBOL_GPL(wbc_account_io); 756EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
754 757
755/** 758/**
756 * inode_congested - test whether an inode is congested 759 * inode_congested - test whether an inode is congested
diff --git a/fs/mpage.c b/fs/mpage.c
index 436a85260394..a63620cdb73a 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -647,7 +647,7 @@ alloc_new:
647 * the confused fail path above (OOM) will be very confused when 647 * the confused fail path above (OOM) will be very confused when
648 * it finds all bh marked clean (i.e. it will not write anything) 648 * it finds all bh marked clean (i.e. it will not write anything)
649 */ 649 */
650 wbc_account_io(wbc, page, PAGE_SIZE); 650 wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
651 length = first_unmapped << blkbits; 651 length = first_unmapped << blkbits;
652 if (bio_add_page(bio, page, length, 0) < length) { 652 if (bio_add_page(bio, page, length, 0) < length) {
653 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); 653 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 761248ee2778..f16d5f196c6b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -796,7 +796,7 @@ xfs_add_to_ioend(
796 } 796 }
797 797
798 wpc->ioend->io_size += len; 798 wpc->ioend->io_size += len;
799 wbc_account_io(wbc, page, len); 799 wbc_account_cgroup_owner(wbc, page, len);
800} 800}
801 801
802STATIC void 802STATIC void
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f9b029180241..35b31d176f74 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
48extern struct list_head bdi_list; 48extern struct list_head bdi_list;
49 49
50extern struct workqueue_struct *bdi_wq; 50extern struct workqueue_struct *bdi_wq;
51extern struct workqueue_struct *bdi_async_bio_wq;
51 52
52static inline bool wb_has_dirty_io(struct bdi_writeback *wb) 53static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
53{ 54{
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 33f23a858438..689a58231288 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -132,13 +132,17 @@ struct blkcg_gq {
132 132
133 struct blkg_policy_data *pd[BLKCG_MAX_POLS]; 133 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
134 134
135 struct rcu_head rcu_head; 135 spinlock_t async_bio_lock;
136 struct bio_list async_bios;
137 struct work_struct async_bio_work;
136 138
137 atomic_t use_delay; 139 atomic_t use_delay;
138 atomic64_t delay_nsec; 140 atomic64_t delay_nsec;
139 atomic64_t delay_start; 141 atomic64_t delay_start;
140 u64 last_delay; 142 u64 last_delay;
141 int last_use; 143 int last_use;
144
145 struct rcu_head rcu_head;
142}; 146};
143 147
144typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); 148typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
701 struct bio *bio) { return false; } 705 struct bio *bio) { return false; }
702#endif 706#endif
703 707
708bool __blkcg_punt_bio_submit(struct bio *bio);
709
710static inline bool blkcg_punt_bio_submit(struct bio *bio)
711{
712 if (bio->bi_opf & REQ_CGROUP_PUNT)
713 return __blkcg_punt_bio_submit(bio);
714 else
715 return false;
716}
704 717
705static inline void blkcg_bio_issue_init(struct bio *bio) 718static inline void blkcg_bio_issue_init(struct bio *bio)
706{ 719{
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
848static inline void blkg_get(struct blkcg_gq *blkg) { } 861static inline void blkg_get(struct blkcg_gq *blkg) { }
849static inline void blkg_put(struct blkcg_gq *blkg) { } 862static inline void blkg_put(struct blkcg_gq *blkg) { }
850 863
864static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
851static inline void blkcg_bio_issue_init(struct bio *bio) { } 865static inline void blkcg_bio_issue_init(struct bio *bio) { }
852static inline bool blkcg_bio_issue_check(struct request_queue *q, 866static inline bool blkcg_bio_issue_check(struct request_queue *q,
853 struct bio *bio) { return true; } 867 struct bio *bio) { return true; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6a53799c3fe2..feff3fe4467e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,14 @@ enum req_flag_bits {
311 __REQ_RAHEAD, /* read ahead, can fail anytime */ 311 __REQ_RAHEAD, /* read ahead, can fail anytime */
312 __REQ_BACKGROUND, /* background IO */ 312 __REQ_BACKGROUND, /* background IO */
313 __REQ_NOWAIT, /* Don't wait if request will block */ 313 __REQ_NOWAIT, /* Don't wait if request will block */
314 /*
315 * When a shared kthread needs to issue a bio for a cgroup, doing
316 * so synchronously can lead to priority inversions as the kthread
317 * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes
318 * submit_bio() punt the actual issuing to a dedicated per-blkcg
319 * work item to avoid such priority inversions.
320 */
321 __REQ_CGROUP_PUNT,
314 322
315 /* command specific flags for REQ_OP_WRITE_ZEROES: */ 323 /* command specific flags for REQ_OP_WRITE_ZEROES: */
316 __REQ_NOUNMAP, /* do not free blocks when zeroing */ 324 __REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
337#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) 345#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
338#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) 346#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
339#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) 347#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
348#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)
349
340#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) 350#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
341#define REQ_HIPRI (1ULL << __REQ_HIPRI) 351#define REQ_HIPRI (1ULL << __REQ_HIPRI)
342 352
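
__REQ_CGROUP_PUNT follows the usual pattern of naming a bit position in the enum and deriving the mask with a shift, and blkcg_punt_bio_submit() in the blk-cgroup.h hunk further up only takes the punt path when that bit is set on the bio. A minimal userspace sketch of the pattern, with the bio and the punt worker stubbed out (do_punt() is not a kernel function):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum req_flag_bits {
	__REQ_SYNC,
	__REQ_CGROUP_PUNT,
	/* ... */
};

#define REQ_SYNC        (1ULL << __REQ_SYNC)
#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)

struct fake_bio { uint64_t bi_opf; };

static bool do_punt(struct fake_bio *bio)
{
	printf("punted bio with flags %#llx to a per-cgroup worker\n",
	       (unsigned long long)bio->bi_opf);
	return true;
}

static bool punt_bio_submit(struct fake_bio *bio)
{
	if (bio->bi_opf & REQ_CGROUP_PUNT)
		return do_punt(bio);
	return false;
}

int main(void)
{
	struct fake_bio direct = { .bi_opf = REQ_SYNC };
	struct fake_bio punted = { .bi_opf = REQ_SYNC | REQ_CGROUP_PUNT };

	printf("direct submit punted? %d\n", (int)punt_bio_submit(&direct));
	printf("cgroup writeback punted? %d\n", (int)punt_bio_submit(&punted));
	return 0;
}
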
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0c482371c8b3..1ef375dafb1c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -344,10 +344,15 @@ struct queue_limits {
344 344
345#ifdef CONFIG_BLK_DEV_ZONED 345#ifdef CONFIG_BLK_DEV_ZONED
346 346
347/*
348 * Maximum number of zones to report with a single report zones command.
349 */
350#define BLK_ZONED_REPORT_MAX_ZONES 8192U
351
347extern unsigned int blkdev_nr_zones(struct block_device *bdev); 352extern unsigned int blkdev_nr_zones(struct block_device *bdev);
348extern int blkdev_report_zones(struct block_device *bdev, 353extern int blkdev_report_zones(struct block_device *bdev,
349 sector_t sector, struct blk_zone *zones, 354 sector_t sector, struct blk_zone *zones,
350 unsigned int *nr_zones, gfp_t gfp_mask); 355 unsigned int *nr_zones);
351extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, 356extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
352 sector_t nr_sectors, gfp_t gfp_mask); 357 sector_t nr_sectors, gfp_t gfp_mask);
353extern int blk_revalidate_disk_zones(struct gendisk *disk); 358extern int blk_revalidate_disk_zones(struct gendisk *disk);
@@ -681,7 +686,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
681 } 686 }
682} 687}
683 688
684static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) 689static inline sector_t blk_queue_zone_sectors(struct request_queue *q)
685{ 690{
686 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; 691 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
687} 692}
@@ -1418,7 +1423,7 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
1418 return false; 1423 return false;
1419} 1424}
1420 1425
1421static inline unsigned int bdev_zone_sectors(struct block_device *bdev) 1426static inline sector_t bdev_zone_sectors(struct block_device *bdev)
1422{ 1427{
1423 struct request_queue *q = bdev_get_queue(bdev); 1428 struct request_queue *q = bdev_get_queue(bdev);
1424 1429
@@ -1673,8 +1678,7 @@ struct block_device_operations {
1673 /* this callback is with swap_lock and sometimes page table lock held */ 1678 /* this callback is with swap_lock and sometimes page table lock held */
1674 void (*swap_slot_free_notify) (struct block_device *, unsigned long); 1679 void (*swap_slot_free_notify) (struct block_device *, unsigned long);
1675 int (*report_zones)(struct gendisk *, sector_t sector, 1680 int (*report_zones)(struct gendisk *, sector_t sector,
1676 struct blk_zone *zones, unsigned int *nr_zones, 1681 struct blk_zone *zones, unsigned int *nr_zones);
1677 gfp_t gfp_mask);
1678 struct module *owner; 1682 struct module *owner;
1679 const struct pr_ops *pr_ops; 1683 const struct pr_ops *pr_ops;
1680}; 1684};
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2af9b1b419f1..f6b048902d6c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -699,6 +699,7 @@ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
699struct cgroup_subsys_state; 699struct cgroup_subsys_state;
700struct cgroup; 700struct cgroup;
701 701
702static inline void css_get(struct cgroup_subsys_state *css) {}
702static inline void css_put(struct cgroup_subsys_state *css) {} 703static inline void css_put(struct cgroup_subsys_state *css) {}
703static inline int cgroup_attach_task_all(struct task_struct *from, 704static inline int cgroup_attach_task_all(struct task_struct *from,
704 struct task_struct *t) { return 0; } 705 struct task_struct *t) { return 0; }
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e1f51d607cc5..3b470cb03b66 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -95,8 +95,7 @@ typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **
95 95
96typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector, 96typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector,
97 struct blk_zone *zones, 97 struct blk_zone *zones,
98 unsigned int *nr_zones, 98 unsigned int *nr_zones);
99 gfp_t gfp_mask);
100 99
101/* 100/*
102 * These iteration functions are typically used to check (and combine) 101 * These iteration functions are typically used to check (and combine)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 169bb2e02516..17cd0078377c 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -75,7 +75,7 @@ struct elevator_type
75 size_t icq_size; /* see iocontext.h */ 75 size_t icq_size; /* see iocontext.h */
76 size_t icq_align; /* ditto */ 76 size_t icq_align; /* ditto */
77 struct elv_fs_entry *elevator_attrs; 77 struct elv_fs_entry *elevator_attrs;
78 char elevator_name[ELV_NAME_MAX]; 78 const char *elevator_name;
79 const char *elevator_alias; 79 const char *elevator_alias;
80 struct module *elevator_owner; 80 struct module *elevator_owner;
81#ifdef CONFIG_BLK_DEBUG_FS 81#ifdef CONFIG_BLK_DEBUG_FS
@@ -160,15 +160,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
160#define ELEVATOR_INSERT_FLUSH 5 160#define ELEVATOR_INSERT_FLUSH 5
161#define ELEVATOR_INSERT_SORT_MERGE 6 161#define ELEVATOR_INSERT_SORT_MERGE 6
162 162
163/*
164 * return values from elevator_may_queue_fn
165 */
166enum {
167 ELV_MQUEUE_MAY,
168 ELV_MQUEUE_NO,
169 ELV_MQUEUE_MUST,
170};
171
172#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) 163#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq))
173#define rb_entry_rq(node) rb_entry((node), struct request, rb_node) 164#define rb_entry_rq(node) rb_entry((node), struct request, rb_node)
174 165
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d98b2d8baf4e..01aa6a6c241d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -315,7 +315,7 @@ struct nvme_id_ns {
315 __u8 nmic; 315 __u8 nmic;
316 __u8 rescap; 316 __u8 rescap;
317 __u8 fpi; 317 __u8 fpi;
318 __u8 rsvd33; 318 __u8 dlfeat;
319 __le16 nawun; 319 __le16 nawun;
320 __le16 nawupf; 320 __le16 nawupf;
321 __le16 nacwu; 321 __le16 nacwu;
@@ -324,11 +324,17 @@ struct nvme_id_ns {
324 __le16 nabspf; 324 __le16 nabspf;
325 __le16 noiob; 325 __le16 noiob;
326 __u8 nvmcap[16]; 326 __u8 nvmcap[16];
327 __u8 rsvd64[28]; 327 __le16 npwg;
328 __le16 npwa;
329 __le16 npdg;
330 __le16 npda;
331 __le16 nows;
332 __u8 rsvd74[18];
328 __le32 anagrpid; 333 __le32 anagrpid;
329 __u8 rsvd96[3]; 334 __u8 rsvd96[3];
330 __u8 nsattr; 335 __u8 nsattr;
331 __u8 rsvd100[4]; 336 __le16 nvmsetid;
337 __le16 endgid;
332 __u8 nguid[16]; 338 __u8 nguid[16];
333 __u8 eui64[8]; 339 __u8 eui64[8];
334 struct nvme_lbaf lbaf[16]; 340 struct nvme_lbaf lbaf[16];
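
The nvme_id_ns change carves the NVMe 1.4 I/O-hint fields (NPWG through NOWS) and NVMSETID/ENDGID out of what used to be reserved bytes; in the raw Identify Namespace data, NPWG sits at byte offset 64 and NOWS at 72, each a little-endian 16-bit 0's based count. A hedged sketch of reading them out of a raw identify buffer; the offsets follow my reading of the NVMe 1.4 layout and should be double-checked against the spec, and the buffer contents are invented:

#include <stdint.h>
#include <stdio.h>

static unsigned int le16_at(const uint8_t *buf, size_t off)
{
	return (unsigned int)buf[off] | ((unsigned int)buf[off + 1] << 8);
}

int main(void)
{
	uint8_t id_ns[4096] = { 0 };

	/* pretend the controller reported npwg=7 and nows=255 (0's based) */
	id_ns[64] = 7;
	id_ns[72] = 255;

	printf("npwg = %u logical blocks\n", le16_at(id_ns, 64) + 1);
	printf("nows = %u logical blocks\n", le16_at(id_ns, 72) + 1);
	return 0;
}
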
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 738a0c24874f..8945aac31392 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -11,6 +11,7 @@
11#include <linux/flex_proportions.h> 11#include <linux/flex_proportions.h>
12#include <linux/backing-dev-defs.h> 12#include <linux/backing-dev-defs.h>
13#include <linux/blk_types.h> 13#include <linux/blk_types.h>
14#include <linux/blk-cgroup.h>
14 15
15struct bio; 16struct bio;
16 17
@@ -68,6 +69,17 @@ struct writeback_control {
68 unsigned for_reclaim:1; /* Invoked from the page allocator */ 69 unsigned for_reclaim:1; /* Invoked from the page allocator */
69 unsigned range_cyclic:1; /* range_start is cyclic */ 70 unsigned range_cyclic:1; /* range_start is cyclic */
70 unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ 71 unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
72
73 /*
74 * When writeback IOs are bounced through async layers, only the
75 * initial synchronous phase should be accounted towards inode
76 * cgroup ownership arbitration to avoid confusion. Later stages
77 * can set the following flag to disable the accounting.
78 */
79 unsigned no_cgroup_owner:1;
80
81 unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */
82
71#ifdef CONFIG_CGROUP_WRITEBACK 83#ifdef CONFIG_CGROUP_WRITEBACK
72 struct bdi_writeback *wb; /* wb this writeback is issued under */ 84 struct bdi_writeback *wb; /* wb this writeback is issued under */
73 struct inode *inode; /* inode being written out */ 85 struct inode *inode; /* inode being written out */
@@ -84,12 +96,27 @@ struct writeback_control {
84 96
85static inline int wbc_to_write_flags(struct writeback_control *wbc) 97static inline int wbc_to_write_flags(struct writeback_control *wbc)
86{ 98{
99 int flags = 0;
100
101 if (wbc->punt_to_cgroup)
102 flags = REQ_CGROUP_PUNT;
103
87 if (wbc->sync_mode == WB_SYNC_ALL) 104 if (wbc->sync_mode == WB_SYNC_ALL)
88 return REQ_SYNC; 105 flags |= REQ_SYNC;
89 else if (wbc->for_kupdate || wbc->for_background) 106 else if (wbc->for_kupdate || wbc->for_background)
90 return REQ_BACKGROUND; 107 flags |= REQ_BACKGROUND;
91 108
92 return 0; 109 return flags;
110}
111
112static inline struct cgroup_subsys_state *
113wbc_blkcg_css(struct writeback_control *wbc)
114{
115#ifdef CONFIG_CGROUP_WRITEBACK
116 if (wbc->wb)
117 return wbc->wb->blkcg_css;
118#endif
119 return blkcg_root_css;
93} 120}
94 121
95/* 122/*
@@ -188,8 +215,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
188 struct inode *inode) 215 struct inode *inode)
189 __releases(&inode->i_lock); 216 __releases(&inode->i_lock);
190void wbc_detach_inode(struct writeback_control *wbc); 217void wbc_detach_inode(struct writeback_control *wbc);
191void wbc_account_io(struct writeback_control *wbc, struct page *page, 218void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
192 size_t bytes); 219 size_t bytes);
193void cgroup_writeback_umount(void); 220void cgroup_writeback_umount(void);
194 221
195/** 222/**
@@ -291,8 +318,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
291{ 318{
292} 319}
293 320
294static inline void wbc_account_io(struct writeback_control *wbc, 321static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
295 struct page *page, size_t bytes) 322 struct page *page, size_t bytes)
296{ 323{
297} 324}
298 325
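
wbc_to_write_flags() now builds the flag word by OR-ing, so a punt_to_cgroup writeback can carry REQ_CGROUP_PUNT together with REQ_SYNC or REQ_BACKGROUND instead of returning a single value. A standalone sketch of that composition; the flag values and the wbc struct below are simplified stand-ins, not the kernel definitions:

#include <stdbool.h>
#include <stdio.h>

#define REQ_SYNC        0x1
#define REQ_BACKGROUND  0x2
#define REQ_CGROUP_PUNT 0x4

struct fake_wbc {
	bool sync_all;        /* sync_mode == WB_SYNC_ALL */
	bool for_kupdate;
	bool for_background;
	bool punt_to_cgroup;
};

static int wbc_to_write_flags(const struct fake_wbc *wbc)
{
	int flags = 0;

	if (wbc->punt_to_cgroup)
		flags = REQ_CGROUP_PUNT;

	if (wbc->sync_all)
		flags |= REQ_SYNC;
	else if (wbc->for_kupdate || wbc->for_background)
		flags |= REQ_BACKGROUND;

	return flags;
}

int main(void)
{
	struct fake_wbc punted_sync = { .sync_all = true, .punt_to_cgroup = true };

	printf("flags = %#x\n", wbc_to_write_flags(&punted_sync)); /* 0x5: SYNC | PUNT */
	return 0;
}
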