author		Linus Torvalds <torvalds@linux-foundation.org>	2016-03-04 21:17:17 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-03-04 21:17:17 -0500
commit		e5322c54068162846fdbac0f0891cd95f37d4c4e (patch)
tree		21d22ea5f1226073e1e106053d16a055be95c387
parent		bdf9d29799296fd787a9d2b99cb8feff77e2b9db (diff)
parent		4d6af73d9e43f78651a43ee4c5ad221107ac8365 (diff)
Merge branch 'for-linus2' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
 "Round 2 of this. I cut back to the bare necessities, the patch is
  still larger than it usually would be at this time, due to the number
  of NVMe fixes in there. This pull request contains:

   - The 4 core fixes from Ming, that fix both problems with exceeding
     the virtual boundary limit in case of merging, and the gap checking
     for cloned bio's.

   - NVMe fixes from Keith and Christoph:

        - Regression on larger user commands, causing problems with
          reading log pages (for instance). This touches both NVMe, and
          the block core since that is now generally utilized also for
          these types of commands.

        - Hot removal fixes.

        - User exploitable issue with passthrough IO commands, if
          !length is given, causing us to fault on writing to the zero
          page.

        - Fix for a hang under error conditions.

   - And finally, the current series regression for umount with cgroup
     writeback, where the final flush would happen async and hence open
     up a window after umount where the device wasn't consistent. fsck
     right after umount would show this. From Tejun"

* 'for-linus2' of git://git.kernel.dk/linux-block:
  block: support large requests in blk_rq_map_user_iov
  block: fix blk_rq_get_max_sectors for driver private requests
  nvme: fix max_segments integer truncation
  nvme: set queue limits for the admin queue
  writeback: flush inode cgroup wb switches instead of pinning super_block
  NVMe: Fix 0-length integrity payload
  NVMe: Don't allow unsupported flags
  NVMe: Move error handling to failed reset handler
  NVMe: Simplify device reset failure
  NVMe: Fix namespace removal deadlock
  NVMe: Use IDA for namespace disk naming
  NVMe: Don't unmap controller registers on reset
  block: merge: get the 1st and last bvec via helpers
  block: get the 1st and last bvec via helpers
  block: check virt boundary in bio_will_gap()
  block: bio: introduce helpers to get the 1st and last bvec
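The virt-boundary "gap" rule that the core block fixes below revolve around
can be illustrated outside the kernel. The following is a minimal userspace
sketch, not kernel code: "struct seg", "gap_between()" and the example values
are hypothetical and only mirror the logic of __bvec_gap_to_prev() added to
include/linux/blkdev.h further down in this diff.

	/* Illustrative sketch only: two adjacent segments may share an SG
	 * entry only if the first one ends on the queue's virtual boundary
	 * and the second one starts at offset 0 within it. */
	#include <stdbool.h>
	#include <stdio.h>

	struct seg {
		unsigned int offset;	/* data offset within its page */
		unsigned int len;	/* segment length in bytes */
	};

	/* boundary_mask is e.g. ctrl->page_size - 1 for NVMe */
	static bool gap_between(const struct seg *prev, const struct seg *next,
				unsigned int boundary_mask)
	{
		return next->offset ||
			((prev->offset + prev->len) & boundary_mask);
	}

	int main(void)
	{
		struct seg a = { .offset = 512, .len = 3584 };	/* ends on 4K */
		struct seg b = { .offset = 0, .len = 4096 };	/* starts on 4K */

		printf("gap? %s\n", gap_between(&a, &b, 4096 - 1) ? "yes" : "no");
		return 0;
	}

With a 4K boundary this prints "gap? no"; change b.offset to 8 and the pair
can no longer be merged into one segment, which is the check bio_will_gap()
now performs on the real first and last bvecs of two bios.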
-rw-r--r--	block/blk-map.c            |  91
-rw-r--r--	block/blk-merge.c          |   8
-rw-r--r--	drivers/nvme/host/core.c   | 111
-rw-r--r--	drivers/nvme/host/nvme.h   |   8
-rw-r--r--	drivers/nvme/host/pci.c    | 149
-rw-r--r--	fs/fs-writeback.c          |  54
-rw-r--r--	fs/super.c                 |   1
-rw-r--r--	include/linux/bio.h        |  37
-rw-r--r--	include/linux/blkdev.h     |  25
-rw-r--r--	include/linux/writeback.h  |   5
10 files changed, 341 insertions, 148 deletions
diff --git a/block/blk-map.c b/block/blk-map.c
index f565e11f465a..a54f0543b956 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio)
 	return ret;
 }
 
+static int __blk_rq_map_user_iov(struct request *rq,
+		struct rq_map_data *map_data, struct iov_iter *iter,
+		gfp_t gfp_mask, bool copy)
+{
+	struct request_queue *q = rq->q;
+	struct bio *bio, *orig_bio;
+	int ret;
+
+	if (copy)
+		bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
+	else
+		bio = bio_map_user_iov(q, iter, gfp_mask);
+
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	if (map_data && map_data->null_mapped)
+		bio_set_flag(bio, BIO_NULL_MAPPED);
+
+	iov_iter_advance(iter, bio->bi_iter.bi_size);
+	if (map_data)
+		map_data->offset += bio->bi_iter.bi_size;
+
+	orig_bio = bio;
+	blk_queue_bounce(q, &bio);
+
+	/*
+	 * We link the bounce buffer in and could have to traverse it
+	 * later so we have to get a ref to prevent it from being freed
+	 */
+	bio_get(bio);
+
+	ret = blk_rq_append_bio(q, rq, bio);
+	if (ret) {
+		bio_endio(bio);
+		__blk_rq_unmap_user(orig_bio);
+		bio_put(bio);
+		return ret;
+	}
+
+	return 0;
+}
+
 /**
  * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:		request queue where request should be inserted
@@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 			struct rq_map_data *map_data,
 			const struct iov_iter *iter, gfp_t gfp_mask)
 {
-	struct bio *bio;
-	int unaligned = 0;
-	struct iov_iter i;
 	struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
+	bool copy = (q->dma_pad_mask & iter->count) || map_data;
+	struct bio *bio = NULL;
+	struct iov_iter i;
+	int ret;
 
 	if (!iter || !iter->count)
 		return -EINVAL;
@@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		 */
 		if ((uaddr & queue_dma_alignment(q)) ||
 		    iovec_gap_to_prv(q, &prv, &iov))
-			unaligned = 1;
+			copy = true;
 
 		prv.iov_base = iov.iov_base;
 		prv.iov_len = iov.iov_len;
 	}
 
-	if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
-		bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
-	else
-		bio = bio_map_user_iov(q, iter, gfp_mask);
-
-	if (IS_ERR(bio))
-		return PTR_ERR(bio);
-
-	if (map_data && map_data->null_mapped)
-		bio_set_flag(bio, BIO_NULL_MAPPED);
-
-	if (bio->bi_iter.bi_size != iter->count) {
-		/*
-		 * Grab an extra reference to this bio, as bio_unmap_user()
-		 * expects to be able to drop it twice as it happens on the
-		 * normal IO completion path
-		 */
-		bio_get(bio);
-		bio_endio(bio);
-		__blk_rq_unmap_user(bio);
-		return -EINVAL;
-	}
+	i = *iter;
+	do {
+		ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+		if (ret)
+			goto unmap_rq;
+		if (!bio)
+			bio = rq->bio;
+	} while (iov_iter_count(&i));
 
 	if (!bio_flagged(bio, BIO_USER_MAPPED))
 		rq->cmd_flags |= REQ_COPY_USER;
-
-	blk_queue_bounce(q, &bio);
-	bio_get(bio);
-	blk_rq_bio_prep(q, rq, bio);
 	return 0;
+
+unmap_rq:
+	__blk_rq_unmap_user(bio);
+	rq->bio = NULL;
+	return -EINVAL;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 888a7fec81f7..261353166dcf 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -304,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 				   struct bio *nxt)
 {
 	struct bio_vec end_bv = { NULL }, nxt_bv;
-	struct bvec_iter iter;
 
 	if (!blk_queue_cluster(q))
 		return 0;
@@ -316,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	if (!bio_has_data(bio))
 		return 1;
 
-	bio_for_each_segment(end_bv, bio, iter)
-		if (end_bv.bv_len == iter.bi_size)
-			break;
-
-	nxt_bv = bio_iovec(nxt);
+	bio_get_last_bvec(bio, &end_bv);
+	bio_get_first_bvec(nxt, &nxt_bv);
 
 	if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
 		return 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3cd921e6121e..03c46412fff4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -55,8 +55,9 @@ static void nvme_free_ns(struct kref *kref)
 	ns->disk->private_data = NULL;
 	spin_unlock(&dev_list_lock);
 
-	nvme_put_ctrl(ns->ctrl);
 	put_disk(ns->disk);
+	ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+	nvme_put_ctrl(ns->ctrl);
 	kfree(ns);
 }
 
@@ -183,7 +184,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
 			goto out_unmap;
 		}
 
-		if (meta_buffer) {
+		if (meta_buffer && meta_len) {
 			struct bio_integrity_payload *bip;
 
 			meta = kmalloc(meta_len, GFP_KERNEL);
@@ -373,6 +374,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
 	if (copy_from_user(&io, uio, sizeof(io)))
 		return -EFAULT;
+	if (io.flags)
+		return -EINVAL;
 
 	switch (io.opcode) {
 	case nvme_cmd_write:
@@ -424,6 +427,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		return -EACCES;
 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
 		return -EFAULT;
+	if (cmd.flags)
+		return -EINVAL;
 
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = cmd.opcode;
@@ -556,6 +561,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	u16 old_ms;
 	unsigned short bs;
 
+	if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+		set_capacity(disk, 0);
+		return -ENODEV;
+	}
 	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
 		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
 				__func__, ns->ctrl->instance, ns->ns_id);
@@ -831,6 +840,23 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
 	return ret;
 }
 
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+		struct request_queue *q)
+{
+	if (ctrl->max_hw_sectors) {
+		u32 max_segments =
+			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+
+		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+	}
+	if (ctrl->stripe_size)
+		blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
+	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+		blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+	blk_queue_virt_boundary(q, ctrl->page_size - 1);
+}
+
 /*
  * Initialize the cached copies of the Identify data and various controller
  * register in our nvme_ctrl structure. This should be called as soon as
@@ -888,6 +914,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		}
 	}
 
+	nvme_set_queue_limits(ctrl, ctrl->admin_q);
+
 	kfree(id);
 	return 0;
 }
@@ -1118,9 +1146,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (!ns)
 		return;
 
+	ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
+	if (ns->instance < 0)
+		goto out_free_ns;
+
 	ns->queue = blk_mq_init_queue(ctrl->tagset);
 	if (IS_ERR(ns->queue))
-		goto out_free_ns;
+		goto out_release_instance;
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
 	ns->queue->queuedata = ns;
 	ns->ctrl = ctrl;
@@ -1134,17 +1166,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	ns->disk = disk;
 	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
 
+
 	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-	if (ctrl->max_hw_sectors) {
-		blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
-		blk_queue_max_segments(ns->queue,
-			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
-	}
-	if (ctrl->stripe_size)
-		blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
-	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
-		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
-	blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
+	nvme_set_queue_limits(ctrl, ns->queue);
 
 	disk->major = nvme_major;
 	disk->first_minor = 0;
@@ -1153,7 +1177,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	disk->queue = ns->queue;
 	disk->driverfs_dev = ctrl->device;
 	disk->flags = GENHD_FL_EXT_DEVT;
-	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
 
 	if (nvme_revalidate_disk(ns->disk))
 		goto out_free_disk;
@@ -1173,40 +1197,29 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	kfree(disk);
  out_free_queue:
 	blk_cleanup_queue(ns->queue);
+ out_release_instance:
+	ida_simple_remove(&ctrl->ns_ida, ns->instance);
  out_free_ns:
 	kfree(ns);
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
 {
-	bool kill = nvme_io_incapable(ns->ctrl) &&
-			!blk_queue_dying(ns->queue);
-
-	lockdep_assert_held(&ns->ctrl->namespaces_mutex);
-
-	if (kill) {
-		blk_set_queue_dying(ns->queue);
+	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
+		return;
 
-		/*
-		 * The controller was shutdown first if we got here through
-		 * device removal. The shutdown may requeue outstanding
-		 * requests. These need to be aborted immediately so
-		 * del_gendisk doesn't block indefinitely for their completion.
-		 */
-		blk_mq_abort_requeue_list(ns->queue);
-	}
 	if (ns->disk->flags & GENHD_FL_UP) {
 		if (blk_get_integrity(ns->disk))
 			blk_integrity_unregister(ns->disk);
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_attr_group);
 		del_gendisk(ns->disk);
-	}
-	if (kill || !blk_queue_dying(ns->queue)) {
 		blk_mq_abort_requeue_list(ns->queue);
 		blk_cleanup_queue(ns->queue);
 	}
+	mutex_lock(&ns->ctrl->namespaces_mutex);
 	list_del_init(&ns->list);
+	mutex_unlock(&ns->ctrl->namespaces_mutex);
 	nvme_put_ns(ns);
 }
 
@@ -1300,10 +1313,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns, *next;
 
-	mutex_lock(&ctrl->namespaces_mutex);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
 		nvme_ns_remove(ns);
-	mutex_unlock(&ctrl->namespaces_mutex);
 }
 
 static DEFINE_IDA(nvme_instance_ida);
@@ -1350,6 +1361,7 @@ static void nvme_free_ctrl(struct kref *kref)
 
 	put_device(ctrl->device);
 	nvme_release_instance(ctrl);
+	ida_destroy(&ctrl->ns_ida);
 
 	ctrl->ops->free_ctrl(ctrl);
 }
@@ -1390,6 +1402,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	}
 	get_device(ctrl->device);
 	dev_set_drvdata(ctrl->device, ctrl);
+	ida_init(&ctrl->ns_ida);
 
 	spin_lock(&dev_list_lock);
 	list_add_tail(&ctrl->node, &nvme_ctrl_list);
@@ -1402,6 +1415,38 @@ out:
 	return ret;
 }
 
+/**
+ * nvme_kill_queues(): Ends all namespace queues
+ * @ctrl: the dead controller that needs to end
+ *
+ * Call this function when the driver determines it is unable to get the
+ * controller in a state capable of servicing IO.
+ */
+void nvme_kill_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (!kref_get_unless_zero(&ns->kref))
+			continue;
+
+		/*
+		 * Revalidating a dead namespace sets capacity to 0. This will
+		 * end buffered writers dirtying pages that can't be synced.
+		 */
+		if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+			revalidate_disk(ns->disk);
+
+		blk_set_queue_dying(ns->queue);
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+		nvme_put_ns(ns);
+	}
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9664d07d807d..fb15ba5f5d19 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -72,6 +72,7 @@ struct nvme_ctrl {
 	struct mutex namespaces_mutex;
 	struct device *device;	/* char device */
 	struct list_head node;
+	struct ida ns_ida;
 
 	char name[12];
 	char serial[20];
@@ -102,6 +103,7 @@ struct nvme_ns {
 	struct request_queue *queue;
 	struct gendisk *disk;
 	struct kref kref;
+	int instance;
 
 	u8 eui[8];
 	u8 uuid[16];
@@ -112,6 +114,11 @@ struct nvme_ns {
 	bool ext;
 	u8 pi_type;
 	int type;
+	unsigned long flags;
+
+#define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD     1
+
 	u64 mode_select_num_blocks;
 	u32 mode_select_block_len;
 };
@@ -240,6 +247,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_kill_queues(struct nvme_ctrl *ctrl);
 
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, unsigned int flags);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a128672472ec..680f5780750c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,7 +86,6 @@ struct nvme_queue;
 
 static int nvme_reset(struct nvme_dev *dev);
 static void nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
 /*
@@ -120,6 +119,7 @@ struct nvme_dev {
 	unsigned long flags;
 
 #define NVME_CTRL_RESETTING	0
+#define NVME_CTRL_REMOVING	1
 
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
@@ -286,6 +286,17 @@ static int nvme_init_request(void *data, struct request *req,
 	return 0;
 }
 
+static void nvme_queue_scan(struct nvme_dev *dev)
+{
+	/*
+	 * Do not queue new scan work when a controller is reset during
+	 * removal.
+	 */
+	if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+		return;
+	queue_work(nvme_workq, &dev->scan_work);
+}
+
 static void nvme_complete_async_event(struct nvme_dev *dev,
 		struct nvme_completion *cqe)
 {
@@ -300,7 +311,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
 	switch (result & 0xff07) {
 	case NVME_AER_NOTICE_NS_CHANGED:
 		dev_info(dev->dev, "rescanning\n");
-		queue_work(nvme_workq, &dev->scan_work);
+		nvme_queue_scan(dev);
 	default:
 		dev_warn(dev->dev, "async event result %08x\n", result);
 	}
@@ -679,7 +690,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	spin_lock_irq(&nvmeq->q_lock);
 	if (unlikely(nvmeq->cq_vector < 0)) {
-		ret = BLK_MQ_RQ_QUEUE_BUSY;
+		if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+			ret = BLK_MQ_RQ_QUEUE_BUSY;
+		else
+			ret = BLK_MQ_RQ_QUEUE_ERROR;
 		spin_unlock_irq(&nvmeq->q_lock);
 		goto out;
 	}
@@ -1250,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = {
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
 	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+		/*
+		 * If the controller was reset during removal, it's possible
+		 * user requests may be waiting on a stopped queue. Start the
+		 * queue to flush these to completion.
+		 */
+		blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
 		blk_cleanup_queue(dev->ctrl.admin_q);
 		blk_mq_free_tag_set(&dev->admin_tagset);
 	}
@@ -1690,14 +1710,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
 			return 0;
 		dev->ctrl.tagset = &dev->tagset;
 	}
-	queue_work(nvme_workq, &dev->scan_work);
+	nvme_queue_scan(dev);
 	return 0;
 }
 
-static int nvme_dev_map(struct nvme_dev *dev)
+static int nvme_pci_enable(struct nvme_dev *dev)
 {
 	u64 cap;
-	int bars, result = -ENOMEM;
+	int result = -ENOMEM;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_enable_device_mem(pdev))
@@ -1705,24 +1725,14 @@ static int nvme_dev_map(struct nvme_dev *dev)
 
 	dev->entry[0].vector = pdev->irq;
 	pci_set_master(pdev);
-	bars = pci_select_bars(pdev, IORESOURCE_MEM);
-	if (!bars)
-		goto disable_pci;
-
-	if (pci_request_selected_regions(pdev, bars, "nvme"))
-		goto disable_pci;
 
 	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
 	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
 		goto disable;
 
-	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
-	if (!dev->bar)
-		goto disable;
-
 	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
 		result = -ENODEV;
-		goto unmap;
+		goto disable;
 	}
 
 	/*
@@ -1732,7 +1742,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	if (!pdev->irq) {
 		result = pci_enable_msix(pdev, dev->entry, 1);
 		if (result < 0)
-			goto unmap;
+			goto disable;
 	}
 
 	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
@@ -1759,18 +1769,20 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	pci_save_state(pdev);
 	return 0;
 
- unmap:
-	iounmap(dev->bar);
-	dev->bar = NULL;
  disable:
-	pci_release_regions(pdev);
- disable_pci:
 	pci_disable_device(pdev);
 	return result;
 }
 
 static void nvme_dev_unmap(struct nvme_dev *dev)
 {
+	if (dev->bar)
+		iounmap(dev->bar);
+	pci_release_regions(to_pci_dev(dev->dev));
+}
+
+static void nvme_pci_disable(struct nvme_dev *dev)
+{
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pdev->msi_enabled)
@@ -1778,12 +1790,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
 	else if (pdev->msix_enabled)
 		pci_disable_msix(pdev);
 
-	if (dev->bar) {
-		iounmap(dev->bar);
-		dev->bar = NULL;
-		pci_release_regions(pdev);
-	}
-
 	if (pci_is_enabled(pdev)) {
 		pci_disable_pcie_error_reporting(pdev);
 		pci_disable_device(pdev);
@@ -1842,7 +1848,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	nvme_dev_list_remove(dev);
 
 	mutex_lock(&dev->shutdown_lock);
-	if (dev->bar) {
+	if (pci_is_enabled(to_pci_dev(dev->dev))) {
 		nvme_stop_queues(&dev->ctrl);
 		csts = readl(dev->bar + NVME_REG_CSTS);
 	}
@@ -1855,7 +1861,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		nvme_disable_io_queues(dev);
 		nvme_disable_admin_queue(dev, shutdown);
 	}
-	nvme_dev_unmap(dev);
+	nvme_pci_disable(dev);
 
 	for (i = dev->queue_count - 1; i >= 0; i--)
 		nvme_clear_queue(dev->queues[i]);
@@ -1899,10 +1905,20 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	kfree(dev);
 }
 
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+	dev_warn(dev->dev, "Removing after probe failure status: %d\n", status);
+
+	kref_get(&dev->ctrl.kref);
+	nvme_dev_disable(dev, false);
+	if (!schedule_work(&dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
 static void nvme_reset_work(struct work_struct *work)
 {
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
-	int result;
+	int result = -ENODEV;
 
 	if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
 		goto out;
@@ -1911,37 +1927,37 @@ static void nvme_reset_work(struct work_struct *work)
 	 * If we're called to reset a live controller first shut it down before
 	 * moving on.
 	 */
-	if (dev->bar)
+	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
 
 	set_bit(NVME_CTRL_RESETTING, &dev->flags);
 
-	result = nvme_dev_map(dev);
+	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;
 
 	result = nvme_configure_admin_queue(dev);
 	if (result)
-		goto unmap;
+		goto out;
 
 	nvme_init_queue(dev->queues[0], 0);
 	result = nvme_alloc_admin_tags(dev);
 	if (result)
-		goto disable;
+		goto out;
 
 	result = nvme_init_identify(&dev->ctrl);
 	if (result)
-		goto free_tags;
+		goto out;
 
 	result = nvme_setup_io_queues(dev);
 	if (result)
-		goto free_tags;
+		goto out;
 
 	dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
 
 	result = nvme_dev_list_add(dev);
 	if (result)
-		goto remove;
+		goto out;
 
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
@@ -1958,19 +1974,8 @@ static void nvme_reset_work(struct work_struct *work)
 	clear_bit(NVME_CTRL_RESETTING, &dev->flags);
 	return;
 
- remove:
-	nvme_dev_list_remove(dev);
- free_tags:
-	nvme_dev_remove_admin(dev);
-	blk_put_queue(dev->ctrl.admin_q);
-	dev->ctrl.admin_q = NULL;
-	dev->queues[0]->tags = NULL;
- disable:
-	nvme_disable_admin_queue(dev, false);
- unmap:
-	nvme_dev_unmap(dev);
  out:
-	nvme_remove_dead_ctrl(dev);
+	nvme_remove_dead_ctrl(dev, result);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -1978,19 +1983,12 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
+	nvme_kill_queues(&dev->ctrl);
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
 	nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
-	dev_warn(dev->dev, "Removing after probe failure\n");
-	kref_get(&dev->ctrl.kref);
-	if (!schedule_work(&dev->remove_work))
-		nvme_put_ctrl(&dev->ctrl);
-}
-
 static int nvme_reset(struct nvme_dev *dev)
 {
 	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
@@ -2042,6 +2040,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.free_ctrl		= nvme_pci_free_ctrl,
 };
 
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+	int bars;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	bars = pci_select_bars(pdev, IORESOURCE_MEM);
+	if (!bars)
+		return -ENODEV;
+	if (pci_request_selected_regions(pdev, bars, "nvme"))
+		return -ENODEV;
+
+	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+	if (!dev->bar)
+		goto release;
+
+	return 0;
+ release:
+	pci_release_regions(pdev);
+	return -ENODEV;
+}
+
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	int node, result = -ENOMEM;
@@ -2066,6 +2085,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	dev->dev = get_device(&pdev->dev);
 	pci_set_drvdata(pdev, dev);
 
+	result = nvme_dev_map(dev);
+	if (result)
+		goto free;
+
 	INIT_LIST_HEAD(&dev->node);
 	INIT_WORK(&dev->scan_work, nvme_dev_scan);
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
@@ -2089,6 +2112,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_release_prp_pools(dev);
  put_pci:
 	put_device(dev->dev);
+	nvme_dev_unmap(dev);
 free:
 	kfree(dev->queues);
 	kfree(dev->entry);
@@ -2112,10 +2136,16 @@ static void nvme_shutdown(struct pci_dev *pdev)
 	nvme_dev_disable(dev, true);
 }
 
+/*
+ * The driver's remove may be called on a device in a partially initialized
+ * state. This function must not have any dependencies on the device state in
+ * order to proceed.
+ */
 static void nvme_remove(struct pci_dev *pdev)
 {
 	struct nvme_dev *dev = pci_get_drvdata(pdev);
 
+	set_bit(NVME_CTRL_REMOVING, &dev->flags);
 	pci_set_drvdata(pdev, NULL);
 	flush_work(&dev->scan_work);
 	nvme_remove_namespaces(&dev->ctrl);
@@ -2126,6 +2156,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_free_queues(dev, 0);
 	nvme_release_cmb(dev);
 	nvme_release_prp_pools(dev);
+	nvme_dev_unmap(dev);
 	nvme_put_ctrl(&dev->ctrl);
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f76d8950a57..5c46ed9f3e14 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 #define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
 					/* one round can affect upto 5 slots */
 
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@ skip_switch:
 	wb_put(new_wb);
 
 	iput(inode);
-	deactivate_super(sb);
 	kfree(isw);
+
+	atomic_dec(&isw_nr_in_flight);
 }
 
 static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
 
 	/* needs to grab bh-unsafe locks, bounce to work item */
 	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
-	schedule_work(&isw->work);
+	queue_work(isw_wq, &isw->work);
 }
 
 /**
@@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 
 	/* while holding I_WB_SWITCH, no one else can update the association */
 	spin_lock(&inode->i_lock);
-
-	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
-	    inode_to_wb(inode) == isw->new_wb)
-		goto out_unlock;
-
-	if (!atomic_inc_not_zero(&inode->i_sb->s_active))
-		goto out_unlock;
-
+	if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+	    inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
 	inode->i_state |= I_WB_SWITCH;
 	spin_unlock(&inode->i_lock);
 
 	ihold(inode);
 	isw->inode = inode;
 
+	atomic_inc(&isw_nr_in_flight);
+
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
 	return;
 
-out_unlock:
-	spin_unlock(&inode->i_lock);
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
@@ -847,6 +848,33 @@ restart:
 		wb_put(last_wb);
 }
 
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+	if (atomic_read(&isw_nr_in_flight)) {
+		synchronize_rcu();
+		flush_workqueue(isw_wq);
+	}
+}
+
+static int __init cgroup_writeback_init(void)
+{
+	isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+	if (!isw_wq)
+		return -ENOMEM;
+	return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static struct bdi_writeback *
diff --git a/fs/super.c b/fs/super.c
index 1182af8fd5ff..74914b1bae70 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
 		sb->s_flags &= ~MS_ACTIVE;
 
 		fsnotify_unmount_inodes(sb);
+		cgroup_writeback_umount();
 
 		evict_inodes(sb);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5349e6816cbb..cb6888824108 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -310,6 +310,43 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
 	bio->bi_flags &= ~(1U << bit);
 }
 
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+	*bv = bio_iovec(bio);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+	struct bvec_iter iter = bio->bi_iter;
+	int idx;
+
+	if (!bio_flagged(bio, BIO_CLONED)) {
+		*bv = bio->bi_io_vec[bio->bi_vcnt - 1];
+		return;
+	}
+
+	if (unlikely(!bio_multiple_segments(bio))) {
+		*bv = bio_iovec(bio);
+		return;
+	}
+
+	bio_advance_iter(bio, &iter, iter.bi_size);
+
+	if (!iter.bi_bvec_done)
+		idx = iter.bi_idx - 1;
+	else	/* in the middle of bvec */
+		idx = iter.bi_idx;
+
+	*bv = bio->bi_io_vec[idx];
+
+	/*
+	 * iter.bi_bvec_done records actual length of the last bvec
+	 * if this bio ends in the middle of one io vector
+	 */
+	if (iter.bi_bvec_done)
+		bv->bv_len = iter.bi_bvec_done;
+}
+
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4571ef1a12a9..413c84fbc4ed 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -895,7 +895,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
-	if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
+	if (unlikely(rq->cmd_type != REQ_TYPE_FS))
 		return q->limits.max_hw_sectors;
 
 	if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
@@ -1372,6 +1372,13 @@ static inline void put_dev_sector(Sector p)
 	page_cache_release(p.v);
 }
 
+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+				struct bio_vec *bprv, unsigned int offset)
+{
+	return offset ||
+		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
 /*
  * Check if adding a bio_vec after bprv with offset would create a gap in
  * the SG list. Most drivers don't care about this, but some do.
@@ -1381,18 +1388,22 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
 {
 	if (!queue_virt_boundary(q))
 		return false;
-	return offset ||
-		((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+	return __bvec_gap_to_prev(q, bprv, offset);
 }
 
 static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
 			 struct bio *next)
 {
-	if (!bio_has_data(prev))
-		return false;
+	if (bio_has_data(prev) && queue_virt_boundary(q)) {
+		struct bio_vec pb, nb;
+
+		bio_get_last_bvec(prev, &pb);
+		bio_get_first_bvec(next, &nb);
 
-	return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1],
-				next->bi_io_vec[0].bv_offset);
+		return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+	}
+
+	return false;
 }
 
 static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b333c945e571..d0b5ca5d4e08 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_io(struct writeback_control *wbc, struct page *page,
 		    size_t bytes);
+void cgroup_writeback_umount(void);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc,
 {
 }
 
+static inline void cgroup_writeback_umount(void)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*