-rw-r--r--  Documentation/DocBook/filesystems.tmpl                    |     2
-rw-r--r--  block/Makefile                                            |     7
-rw-r--r--  block/bio-integrity.c (renamed from fs/bio-integrity.c)   |     2
-rw-r--r--  block/bio.c (renamed from fs/bio.c)                       |    11
-rw-r--r--  block/blk-core.c                                          |   110
-rw-r--r--  block/blk-flush.c                                         |    36
-rw-r--r--  block/blk-iopoll.c                                        |     4
-rw-r--r--  block/blk-lib.c                                           |     4
-rw-r--r--  block/blk-map.c                                           |     3
-rw-r--r--  block/blk-mq-cpu.c                                        |    17
-rw-r--r--  block/blk-mq-cpumap.c                                     |    27
-rw-r--r--  block/blk-mq-sysfs.c                                      |    58
-rw-r--r--  block/blk-mq-tag.c                                        |   561
-rw-r--r--  block/blk-mq-tag.h                                        |    71
-rw-r--r--  block/blk-mq.c                                            |  1399
-rw-r--r--  block/blk-mq.h                                            |    26
-rw-r--r--  block/blk-sysfs.c                                         |    47
-rw-r--r--  block/blk-throttle.c                                      |    10
-rw-r--r--  block/blk-timeout.c                                       |    60
-rw-r--r--  block/blk.h                                               |     9
-rw-r--r--  block/bounce.c (renamed from mm/bounce.c)                 |     0
-rw-r--r--  block/bsg.c                                               |     2
-rw-r--r--  block/cfq-iosched.c                                       |     4
-rw-r--r--  block/ioprio.c (renamed from fs/ioprio.c)                 |     0
-rw-r--r--  drivers/block/amiflop.c                                   |     2
-rw-r--r--  drivers/block/ataflop.c                                   |     2
-rw-r--r--  drivers/block/cciss.c                                     |     6
-rw-r--r--  drivers/block/drbd/drbd_actlog.c                          |    23
-rw-r--r--  drivers/block/drbd/drbd_int.h                             |    92
-rw-r--r--  drivers/block/drbd/drbd_main.c                            |    28
-rw-r--r--  drivers/block/drbd/drbd_nl.c                              |   485
-rw-r--r--  drivers/block/drbd/drbd_nla.c                             |     1
-rw-r--r--  drivers/block/drbd/drbd_proc.c                            |     2
-rw-r--r--  drivers/block/drbd/drbd_protocol.h                        |    12
-rw-r--r--  drivers/block/drbd/drbd_receiver.c                        |   196
-rw-r--r--  drivers/block/drbd/drbd_req.c                             |    74
-rw-r--r--  drivers/block/drbd/drbd_req.h                             |     6
-rw-r--r--  drivers/block/drbd/drbd_state.c                           |    38
-rw-r--r--  drivers/block/drbd/drbd_worker.c                          |   107
-rw-r--r--  drivers/block/drbd/drbd_wrappers.h                        |    54
-rw-r--r--  drivers/block/floppy.c                                    |    20
-rw-r--r--  drivers/block/hd.c                                        |    10
-rw-r--r--  drivers/block/mg_disk.c                                   |    12
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c                         |  1089
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.h                         |    32
-rw-r--r--  drivers/block/null_blk.c                                  |   119
-rw-r--r--  drivers/block/paride/pcd.c                                |     2
-rw-r--r--  drivers/block/paride/pd.c                                 |     4
-rw-r--r--  drivers/block/paride/pf.c                                 |     4
-rw-r--r--  drivers/block/skd_main.c                                  |    12
-rw-r--r--  drivers/block/swim.c                                      |     2
-rw-r--r--  drivers/block/swim3.c                                     |     6
-rw-r--r--  drivers/block/virtio_blk.c                                |    79
-rw-r--r--  drivers/block/xen-blkfront.c                              |     4
-rw-r--r--  drivers/block/xsysace.c                                   |     4
-rw-r--r--  drivers/block/z2ram.c                                     |     6
-rw-r--r--  drivers/cdrom/cdrom.c                                     |  1271
-rw-r--r--  drivers/cdrom/gdrom.c                                     |     2
-rw-r--r--  drivers/char/random.c                                     |     1
-rw-r--r--  drivers/ide/ide-disk.c                                    |     5
-rw-r--r--  drivers/md/dm.c                                           |     1
-rw-r--r--  drivers/mtd/mtd_blkdevs.c                                 |     3
-rw-r--r--  drivers/mtd/ubi/block.c                                   |     2
-rw-r--r--  drivers/sbus/char/jsflash.c                               |     2
-rw-r--r--  drivers/scsi/scsi_lib.c                                   |     5
-rw-r--r--  drivers/scsi/sd.c                                         |    13
-rw-r--r--  fs/Makefile                                               |     3
-rw-r--r--  include/linux/bio.h                                       |     2
-rw-r--r--  include/linux/blk-mq.h                                    |    99
-rw-r--r--  include/linux/blk_types.h                                 |     2
-rw-r--r--  include/linux/blkdev.h                                    |    34
-rw-r--r--  mm/Makefile                                               |     1
72 files changed, 3761 insertions, 2688 deletions
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 4f676838da06..bcdfdb9a9277 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -62,7 +62,7 @@
62!Efs/mpage.c 62!Efs/mpage.c
63!Efs/namei.c 63!Efs/namei.c
64!Efs/buffer.c 64!Efs/buffer.c
65!Efs/bio.c 65!Eblock/bio.c
66!Efs/seq_file.c 66!Efs/seq_file.c
67!Efs/filesystems.c 67!Efs/filesystems.c
68!Efs/fs-writeback.c 68!Efs/fs-writeback.c
diff --git a/block/Makefile b/block/Makefile
index 20645e88fb57..a2ce6ac935ec 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,13 +2,15 @@
2# Makefile for the kernel block layer 2# Makefile for the kernel block layer
3# 3#
4 4
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ 8 blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
9 blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ 9 blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
10 genhd.o scsi_ioctl.o partition-generic.o partitions/ 10 genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
11 partitions/
11 12
13obj-$(CONFIG_BOUNCE) += bounce.o
12obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 14obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
13obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o 15obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
14obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 16obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
@@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
20obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o 22obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
21obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o 23obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
22obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o 24obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
25obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
diff --git a/fs/bio-integrity.c b/block/bio-integrity.c
index 1c2ce0c87711..9e241063a616 100644
--- a/fs/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
617 if (!bs->bio_integrity_pool) 617 if (!bs->bio_integrity_pool)
618 return -1; 618 return -1;
619 619
620 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); 620 bs->bvec_integrity_pool = biovec_create_pool(pool_size);
621 if (!bs->bvec_integrity_pool) { 621 if (!bs->bvec_integrity_pool) {
622 mempool_destroy(bs->bio_integrity_pool); 622 mempool_destroy(bs->bio_integrity_pool);
623 return -1; 623 return -1;
diff --git a/fs/bio.c b/block/bio.c
index 6f0362b77806..96d28eee8a1e 100644
--- a/fs/bio.c
+++ b/block/bio.c
@@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error)
305 305
306/** 306/**
307 * bio_chain - chain bio completions 307 * bio_chain - chain bio completions
308 * @bio: the target bio
309 * @parent: the @bio's parent bio
308 * 310 *
309 * The caller won't have a bi_end_io called when @bio completes - instead, 311 * The caller won't have a bi_end_io called when @bio completes - instead,
310 * @parent's bi_end_io won't be called until both @parent and @bio have 312 * @parent's bi_end_io won't be called until both @parent and @bio have
@@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
1011 bio->bi_private = bmd; 1013 bio->bi_private = bmd;
1012} 1014}
1013 1015
1014static struct bio_map_data *bio_alloc_map_data(int nr_segs, 1016static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
1015 unsigned int iov_count,
1016 gfp_t gfp_mask) 1017 gfp_t gfp_mask)
1017{ 1018{
1018 if (iov_count > UIO_MAXIOV) 1019 if (iov_count > UIO_MAXIOV)
@@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
1154 if (offset) 1155 if (offset)
1155 nr_pages++; 1156 nr_pages++;
1156 1157
1157 bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); 1158 bmd = bio_alloc_map_data(iov_count, gfp_mask);
1158 if (!bmd) 1159 if (!bmd)
1159 return ERR_PTR(-ENOMEM); 1160 return ERR_PTR(-ENOMEM);
1160 1161
@@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
1859 * create memory pools for biovec's in a bio_set. 1860 * create memory pools for biovec's in a bio_set.
1860 * use the global biovec slabs created for general use. 1861 * use the global biovec slabs created for general use.
1861 */ 1862 */
1862mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) 1863mempool_t *biovec_create_pool(int pool_entries)
1863{ 1864{
1864 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; 1865 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1865 1866
@@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1922 if (!bs->bio_pool) 1923 if (!bs->bio_pool)
1923 goto bad; 1924 goto bad;
1924 1925
1925 bs->bvec_pool = biovec_create_pool(bs, pool_size); 1926 bs->bvec_pool = biovec_create_pool(pool_size);
1926 if (!bs->bvec_pool) 1927 if (!bs->bvec_pool)
1927 goto bad; 1928 goto bad;
1928 1929
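
Among the bio.c hunks above, bio_chain() gains its missing @bio/@parent parameter documentation. As a reading aid, here is a minimal sketch, not part of this patch, of how a chained child bio is typically issued so that the parent's bi_end_io runs only once both bios have completed; the helper name issue_chained_flush() is invented for illustration.

    static void issue_chained_flush(struct bio *parent, struct block_device *bdev)
    {
            struct bio *flush = bio_alloc(GFP_NOIO, 0);

            if (!flush)
                    return;

            flush->bi_bdev = bdev;

            /* @parent's bi_end_io is deferred until both bios complete */
            bio_chain(flush, parent);
            submit_bio(WRITE_FLUSH, flush);
    }
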
diff --git a/block/blk-core.c b/block/blk-core.c
index a0e3096c4bb5..d87be5b4e554 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
147 (unsigned long long)blk_rq_pos(rq), 147 (unsigned long long)blk_rq_pos(rq),
148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
149 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", 149 printk(KERN_INFO " bio %p, biotail %p, len %u\n",
150 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); 150 rq->bio, rq->biotail, blk_rq_bytes(rq));
151 151
152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
153 printk(KERN_INFO " cdb: "); 153 printk(KERN_INFO " cdb: ");
@@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q)
251 struct blk_mq_hw_ctx *hctx; 251 struct blk_mq_hw_ctx *hctx;
252 int i; 252 int i;
253 253
254 queue_for_each_hw_ctx(q, hctx, i) 254 queue_for_each_hw_ctx(q, hctx, i) {
255 cancel_delayed_work_sync(&hctx->delayed_work); 255 cancel_delayed_work_sync(&hctx->run_work);
256 cancel_delayed_work_sync(&hctx->delay_work);
257 }
256 } else { 258 } else {
257 cancel_delayed_work_sync(&q->delay_work); 259 cancel_delayed_work_sync(&q->delay_work);
258 } 260 }
@@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
574 if (!q) 576 if (!q)
575 return NULL; 577 return NULL;
576 578
577 if (percpu_counter_init(&q->mq_usage_counter, 0))
578 goto fail_q;
579
580 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 579 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
581 if (q->id < 0) 580 if (q->id < 0)
582 goto fail_c; 581 goto fail_q;
583 582
584 q->backing_dev_info.ra_pages = 583 q->backing_dev_info.ra_pages =
585 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 584 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -637,8 +636,6 @@ fail_bdi:
637 bdi_destroy(&q->backing_dev_info); 636 bdi_destroy(&q->backing_dev_info);
638fail_id: 637fail_id:
639 ida_simple_remove(&blk_queue_ida, q->id); 638 ida_simple_remove(&blk_queue_ida, q->id);
640fail_c:
641 percpu_counter_destroy(&q->mq_usage_counter);
642fail_q: 639fail_q:
643 kmem_cache_free(blk_requestq_cachep, q); 640 kmem_cache_free(blk_requestq_cachep, q);
644 return NULL; 641 return NULL;
@@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
846 __freed_request(rl, sync ^ 1); 843 __freed_request(rl, sync ^ 1);
847} 844}
848 845
846int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
847{
848 struct request_list *rl;
849
850 spin_lock_irq(q->queue_lock);
851 q->nr_requests = nr;
852 blk_queue_congestion_threshold(q);
853
854 /* congestion isn't cgroup aware and follows root blkcg for now */
855 rl = &q->root_rl;
856
857 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
858 blk_set_queue_congested(q, BLK_RW_SYNC);
859 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
860 blk_clear_queue_congested(q, BLK_RW_SYNC);
861
862 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
863 blk_set_queue_congested(q, BLK_RW_ASYNC);
864 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
865 blk_clear_queue_congested(q, BLK_RW_ASYNC);
866
867 blk_queue_for_each_rl(rl, q) {
868 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
869 blk_set_rl_full(rl, BLK_RW_SYNC);
870 } else {
871 blk_clear_rl_full(rl, BLK_RW_SYNC);
872 wake_up(&rl->wait[BLK_RW_SYNC]);
873 }
874
875 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
876 blk_set_rl_full(rl, BLK_RW_ASYNC);
877 } else {
878 blk_clear_rl_full(rl, BLK_RW_ASYNC);
879 wake_up(&rl->wait[BLK_RW_ASYNC]);
880 }
881 }
882
883 spin_unlock_irq(q->queue_lock);
884 return 0;
885}
886
849/* 887/*
850 * Determine if elevator data should be initialized when allocating the 888 * Determine if elevator data should be initialized when allocating the
851 * request associated with @bio. 889 * request associated with @bio.
@@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
1135struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1173struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1136{ 1174{
1137 if (q->mq_ops) 1175 if (q->mq_ops)
1138 return blk_mq_alloc_request(q, rw, gfp_mask); 1176 return blk_mq_alloc_request(q, rw, gfp_mask, false);
1139 else 1177 else
1140 return blk_old_get_request(q, rw, gfp_mask); 1178 return blk_old_get_request(q, rw, gfp_mask);
1141} 1179}
@@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
1231static void part_round_stats_single(int cpu, struct hd_struct *part, 1269static void part_round_stats_single(int cpu, struct hd_struct *part,
1232 unsigned long now) 1270 unsigned long now)
1233{ 1271{
1272 int inflight;
1273
1234 if (now == part->stamp) 1274 if (now == part->stamp)
1235 return; 1275 return;
1236 1276
1237 if (part_in_flight(part)) { 1277 inflight = part_in_flight(part);
1278 if (inflight) {
1238 __part_stat_add(cpu, part, time_in_queue, 1279 __part_stat_add(cpu, part, time_in_queue,
1239 part_in_flight(part) * (now - part->stamp)); 1280 inflight * (now - part->stamp));
1240 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1281 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1241 } 1282 }
1242 part->stamp = now; 1283 part->stamp = now;
@@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1360 1401
1361 rq->__data_len = rq->resid_len = len; 1402 rq->__data_len = rq->resid_len = len;
1362 rq->nr_phys_segments = 1; 1403 rq->nr_phys_segments = 1;
1363 rq->buffer = bio_data(bio);
1364} 1404}
1365EXPORT_SYMBOL_GPL(blk_add_request_payload); 1405EXPORT_SYMBOL_GPL(blk_add_request_payload);
1366 1406
@@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1402 bio->bi_next = req->bio; 1442 bio->bi_next = req->bio;
1403 req->bio = bio; 1443 req->bio = bio;
1404 1444
1405 /*
1406 * may not be valid. if the low level driver said
1407 * it didn't need a bounce buffer then it better
1408 * not touch req->buffer either...
1409 */
1410 req->buffer = bio_data(bio);
1411 req->__sector = bio->bi_iter.bi_sector; 1445 req->__sector = bio->bi_iter.bi_sector;
1412 req->__data_len += bio->bi_iter.bi_size; 1446 req->__data_len += bio->bi_iter.bi_size;
1413 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1447 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
@@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1432 * added on the elevator at this point. In addition, we don't have 1466 * added on the elevator at this point. In addition, we don't have
1433 * reliable access to the elevator outside queue lock. Only check basic 1467 * reliable access to the elevator outside queue lock. Only check basic
1434 * merging parameters without querying the elevator. 1468 * merging parameters without querying the elevator.
1469 *
1470 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1435 */ 1471 */
1436bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1472bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1437 unsigned int *request_count) 1473 unsigned int *request_count)
@@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1441 bool ret = false; 1477 bool ret = false;
1442 struct list_head *plug_list; 1478 struct list_head *plug_list;
1443 1479
1444 if (blk_queue_nomerges(q))
1445 goto out;
1446
1447 plug = current->plug; 1480 plug = current->plug;
1448 if (!plug) 1481 if (!plug)
1449 goto out; 1482 goto out;
@@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1522 * Check if we can merge with the plugged list before grabbing 1555 * Check if we can merge with the plugged list before grabbing
1523 * any locks. 1556 * any locks.
1524 */ 1557 */
1525 if (blk_attempt_plug_merge(q, bio, &request_count)) 1558 if (!blk_queue_nomerges(q) &&
1559 blk_attempt_plug_merge(q, bio, &request_count))
1526 return; 1560 return;
1527 1561
1528 spin_lock_irq(q->queue_lock); 1562 spin_lock_irq(q->queue_lock);
@@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void)
1654 struct dentry *dir = fault_create_debugfs_attr("fail_make_request", 1688 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
1655 NULL, &fail_make_request); 1689 NULL, &fail_make_request);
1656 1690
1657 return IS_ERR(dir) ? PTR_ERR(dir) : 0; 1691 return PTR_ERR_OR_ZERO(dir);
1658} 1692}
1659 1693
1660late_initcall(fail_make_request_debugfs); 1694late_initcall(fail_make_request_debugfs);
@@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2434 } 2468 }
2435 2469
2436 req->__data_len -= total_bytes; 2470 req->__data_len -= total_bytes;
2437 req->buffer = bio_data(req->bio);
2438 2471
2439 /* update sector only for requests with clear definition of sector */ 2472 /* update sector only for requests with clear definition of sector */
2440 if (req->cmd_type == REQ_TYPE_FS) 2473 if (req->cmd_type == REQ_TYPE_FS)
@@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
2503/* 2536/*
2504 * queue lock must be held 2537 * queue lock must be held
2505 */ 2538 */
2506static void blk_finish_request(struct request *req, int error) 2539void blk_finish_request(struct request *req, int error)
2507{ 2540{
2508 if (blk_rq_tagged(req)) 2541 if (blk_rq_tagged(req))
2509 blk_queue_end_tag(req->q, req); 2542 blk_queue_end_tag(req->q, req);
@@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error)
2529 __blk_put_request(req->q, req); 2562 __blk_put_request(req->q, req);
2530 } 2563 }
2531} 2564}
2565EXPORT_SYMBOL(blk_finish_request);
2532 2566
2533/** 2567/**
2534 * blk_end_bidi_request - Complete a bidi request 2568 * blk_end_bidi_request - Complete a bidi request
@@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2752 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ 2786 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2753 rq->cmd_flags |= bio->bi_rw & REQ_WRITE; 2787 rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
2754 2788
2755 if (bio_has_data(bio)) { 2789 if (bio_has_data(bio))
2756 rq->nr_phys_segments = bio_phys_segments(q, bio); 2790 rq->nr_phys_segments = bio_phys_segments(q, bio);
2757 rq->buffer = bio_data(bio); 2791
2758 }
2759 rq->__data_len = bio->bi_iter.bi_size; 2792 rq->__data_len = bio->bi_iter.bi_size;
2760 rq->bio = rq->biotail = bio; 2793 rq->bio = rq->biotail = bio;
2761 2794
@@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2831 2864
2832/* 2865/*
2833 * Copy attributes of the original request to the clone request. 2866 * Copy attributes of the original request to the clone request.
2834 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. 2867 * The actual data parts (e.g. ->cmd, ->sense) are not copied.
2835 */ 2868 */
2836static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2869static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2837{ 2870{
@@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2857 * 2890 *
2858 * Description: 2891 * Description:
2859 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 2892 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2860 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) 2893 * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
2861 * are not copied, and copying such parts is the caller's responsibility. 2894 * are not copied, and copying such parts is the caller's responsibility.
2862 * Also, pages which the original bios are pointing to are not copied 2895 * Also, pages which the original bios are pointing to are not copied
2863 * and the cloned bios just point same pages. 2896 * and the cloned bios just point same pages.
@@ -2904,19 +2937,26 @@ free_and_out:
2904} 2937}
2905EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2938EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2906 2939
2907int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2940int kblockd_schedule_work(struct work_struct *work)
2908{ 2941{
2909 return queue_work(kblockd_workqueue, work); 2942 return queue_work(kblockd_workqueue, work);
2910} 2943}
2911EXPORT_SYMBOL(kblockd_schedule_work); 2944EXPORT_SYMBOL(kblockd_schedule_work);
2912 2945
2913int kblockd_schedule_delayed_work(struct request_queue *q, 2946int kblockd_schedule_delayed_work(struct delayed_work *dwork,
2914 struct delayed_work *dwork, unsigned long delay) 2947 unsigned long delay)
2915{ 2948{
2916 return queue_delayed_work(kblockd_workqueue, dwork, delay); 2949 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2917} 2950}
2918EXPORT_SYMBOL(kblockd_schedule_delayed_work); 2951EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2919 2952
2953int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2954 unsigned long delay)
2955{
2956 return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
2957}
2958EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
2959
2920#define PLUG_MAGIC 0x91827364 2960#define PLUG_MAGIC 0x91827364
2921 2961
2922/** 2962/**
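
The blk-core.c hunks above drop the unused request_queue argument from the kblockd helpers and add a CPU-pinned delayed variant. A minimal sketch, not part of this patch, of how a block driver would call the reworked helpers; my_run_queue, my_work, my_dwork and my_kick are invented names.

    static void my_run_queue(struct work_struct *work)
    {
            /* deferred queue processing would go here */
    }

    static DECLARE_WORK(my_work, my_run_queue);
    static DECLARE_DELAYED_WORK(my_dwork, my_run_queue);

    static void my_kick(int cpu, bool delayed)
    {
            if (!delayed)
                    kblockd_schedule_work(&my_work);
            else
                    kblockd_schedule_delayed_work_on(cpu, &my_dwork,
                                                     msecs_to_jiffies(3));
    }
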
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43e6b4755e9a..ef608b35d9be 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq)
130 blk_clear_rq_complete(rq); 130 blk_clear_rq_complete(rq);
131} 131}
132 132
133static void mq_flush_run(struct work_struct *work)
134{
135 struct request *rq;
136
137 rq = container_of(work, struct request, mq_flush_work);
138
139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_insert_request(rq, false, true, false);
141}
142
143static bool blk_flush_queue_rq(struct request *rq, bool add_front) 133static bool blk_flush_queue_rq(struct request *rq, bool add_front)
144{ 134{
145 if (rq->q->mq_ops) { 135 if (rq->q->mq_ops) {
146 INIT_WORK(&rq->mq_flush_work, mq_flush_run); 136 struct request_queue *q = rq->q;
147 kblockd_schedule_work(rq->q, &rq->mq_flush_work); 137
138 blk_mq_add_to_requeue_list(rq, add_front);
139 blk_mq_kick_requeue_list(q);
148 return false; 140 return false;
149 } else { 141 } else {
150 if (add_front) 142 if (add_front)
@@ -306,23 +298,9 @@ static bool blk_kick_flush(struct request_queue *q)
306 */ 298 */
307 q->flush_pending_idx ^= 1; 299 q->flush_pending_idx ^= 1;
308 300
309 if (q->mq_ops) { 301 blk_rq_init(q, q->flush_rq);
310 struct blk_mq_ctx *ctx = first_rq->mq_ctx; 302 if (q->mq_ops)
311 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 303 blk_mq_clone_flush_request(q->flush_rq, first_rq);
312
313 blk_mq_rq_init(hctx, q->flush_rq);
314 q->flush_rq->mq_ctx = ctx;
315
316 /*
317 * Reuse the tag value from the fist waiting request,
318 * with blk-mq the tag is generated during request
319 * allocation and drivers can rely on it being inside
320 * the range they asked for.
321 */
322 q->flush_rq->tag = first_rq->tag;
323 } else {
324 blk_rq_init(q, q->flush_rq);
325 }
326 304
327 q->flush_rq->cmd_type = REQ_TYPE_FS; 305 q->flush_rq->cmd_type = REQ_TYPE_FS;
328 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 306 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index c11d24e379e2..d828b44a404b 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete);
64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep() 64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
65 * is called. 65 * is called.
66 **/ 66 **/
67void blk_iopoll_complete(struct blk_iopoll *iopoll) 67void blk_iopoll_complete(struct blk_iopoll *iop)
68{ 68{
69 unsigned long flags; 69 unsigned long flags;
70 70
71 local_irq_save(flags); 71 local_irq_save(flags);
72 __blk_iopoll_complete(iopoll); 72 __blk_iopoll_complete(iop);
73 local_irq_restore(flags); 73 local_irq_restore(flags);
74} 74}
75EXPORT_SYMBOL(blk_iopoll_complete); 75EXPORT_SYMBOL(blk_iopoll_complete);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 97a733cf3d5f..8411be3c19d3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
226 * Generate and issue number of bios with zerofiled pages. 226 * Generate and issue number of bios with zerofiled pages.
227 */ 227 */
228 228
229int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 229static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
230 sector_t nr_sects, gfp_t gfp_mask) 230 sector_t nr_sects, gfp_t gfp_mask)
231{ 231{
232 int ret; 232 int ret;
233 struct bio *bio; 233 struct bio *bio;
diff --git a/block/blk-map.c b/block/blk-map.c
index f7b22bc21518..f890d4345b0c 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
155 if (!bio_flagged(bio, BIO_USER_MAPPED)) 155 if (!bio_flagged(bio, BIO_USER_MAPPED))
156 rq->cmd_flags |= REQ_COPY_USER; 156 rq->cmd_flags |= REQ_COPY_USER;
157 157
158 rq->buffer = NULL;
159 return 0; 158 return 0;
160unmap_rq: 159unmap_rq:
161 blk_rq_unmap_user(bio); 160 blk_rq_unmap_user(bio);
@@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
238 blk_queue_bounce(q, &bio); 237 blk_queue_bounce(q, &bio);
239 bio_get(bio); 238 bio_get(bio);
240 blk_rq_bio_prep(q, rq, bio); 239 blk_rq_bio_prep(q, rq, bio);
241 rq->buffer = NULL;
242 return 0; 240 return 0;
243} 241}
244EXPORT_SYMBOL(blk_rq_map_user_iov); 242EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
325 } 323 }
326 324
327 blk_queue_bounce(q, &rq->bio); 325 blk_queue_bounce(q, &rq->bio);
328 rq->buffer = NULL;
329 return 0; 326 return 0;
330} 327}
331EXPORT_SYMBOL(blk_rq_map_kern); 328EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef8643bba..bb3ed488f7b5 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -1,3 +1,8 @@
1/*
2 * CPU notifier helper code for blk-mq
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/module.h> 7#include <linux/module.h>
3#include <linux/init.h> 8#include <linux/init.h>
@@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
18{ 23{
19 unsigned int cpu = (unsigned long) hcpu; 24 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify; 25 struct blk_mq_cpu_notifier *notify;
26 int ret = NOTIFY_OK;
21 27
22 raw_spin_lock(&blk_mq_cpu_notify_lock); 28 raw_spin_lock(&blk_mq_cpu_notify_lock);
23 29
24 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) 30 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
25 notify->notify(notify->data, action, cpu); 31 ret = notify->notify(notify->data, action, cpu);
32 if (ret != NOTIFY_OK)
33 break;
34 }
26 35
27 raw_spin_unlock(&blk_mq_cpu_notify_lock); 36 raw_spin_unlock(&blk_mq_cpu_notify_lock);
28 return NOTIFY_OK; 37 return ret;
29} 38}
30 39
31void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 40void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
@@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
45} 54}
46 55
47void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 56void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
48 void (*fn)(void *, unsigned long, unsigned int), 57 int (*fn)(void *, unsigned long, unsigned int),
49 void *data) 58 void *data)
50{ 59{
51 notifier->notify = fn; 60 notifier->notify = fn;
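
With the hunk above, blk-mq CPU notifier callbacks return an int and the chain stops at the first callback that does not return NOTIFY_OK. A hedged sketch of a callback under the new contract; the function, its data pointer being a request_queue, and the my_migrate_ctx() helper are all hypothetical.

    static int my_mq_cpu_notify(void *data, unsigned long action, unsigned int cpu)
    {
            struct request_queue *q = data;

            if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
                    /* e.g. migrate the dead CPU's software queue elsewhere */
                    if (!my_migrate_ctx(q, cpu))
                            return NOTIFY_BAD;
            }

            return NOTIFY_OK;
    }
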
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 097921329619..1065d7c65fa1 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -1,3 +1,8 @@
1/*
2 * CPU <-> hardware queue mapping helpers
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/threads.h> 7#include <linux/threads.h>
3#include <linux/module.h> 8#include <linux/module.h>
@@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
80 return 0; 85 return 0;
81} 86}
82 87
83unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) 88unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
84{ 89{
85 unsigned int *map; 90 unsigned int *map;
86 91
87 /* If cpus are offline, map them to first hctx */ 92 /* If cpus are offline, map them to first hctx */
88 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, 93 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
89 reg->numa_node); 94 set->numa_node);
90 if (!map) 95 if (!map)
91 return NULL; 96 return NULL;
92 97
93 if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) 98 if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
94 return map; 99 return map;
95 100
96 kfree(map); 101 kfree(map);
97 return NULL; 102 return NULL;
98} 103}
104
105/*
106 * We have no quick way of doing reverse lookups. This is only used at
107 * queue init time, so runtime isn't important.
108 */
109int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
110{
111 int i;
112
113 for_each_possible_cpu(i) {
114 if (index == mq_map[i])
115 return cpu_to_node(i);
116 }
117
118 return NUMA_NO_NODE;
119}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index b0ba264b0522..99a60a829e69 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
203 return ret; 203 return ret;
204} 204}
205 205
206static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) 206static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
207{
208 ssize_t ret;
209
210 spin_lock(&hctx->lock);
211 ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
212 spin_unlock(&hctx->lock);
213
214 return ret;
215}
216
217static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
218 const char *page, size_t len)
219{ 207{
220 struct blk_mq_ctx *ctx; 208 return blk_mq_tag_sysfs_show(hctx->tags, page);
221 unsigned long ret;
222 unsigned int i;
223
224 if (kstrtoul(page, 10, &ret)) {
225 pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
226 return -EINVAL;
227 }
228
229 spin_lock(&hctx->lock);
230 if (ret)
231 hctx->flags |= BLK_MQ_F_SHOULD_IPI;
232 else
233 hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
234 spin_unlock(&hctx->lock);
235
236 hctx_for_each_ctx(hctx, ctx, i)
237 ctx->ipi_redirect = !!ret;
238
239 return len;
240} 209}
241 210
242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 211static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
243{ 212{
244 return blk_mq_tag_sysfs_show(hctx->tags, page); 213 return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
245} 214}
246 215
247static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) 216static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
248{ 217{
249 unsigned int i, queue_num, first = 1; 218 unsigned int i, first = 1;
250 ssize_t ret = 0; 219 ssize_t ret = 0;
251 220
252 blk_mq_disable_hotplug(); 221 blk_mq_disable_hotplug();
253 222
254 for_each_online_cpu(i) { 223 for_each_cpu(i, hctx->cpumask) {
255 queue_num = hctx->queue->mq_map[i];
256 if (queue_num != hctx->queue_num)
257 continue;
258
259 if (first) 224 if (first)
260 ret += sprintf(ret + page, "%u", i); 225 ret += sprintf(ret + page, "%u", i);
261 else 226 else
@@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
307 .attr = {.name = "dispatched", .mode = S_IRUGO }, 272 .attr = {.name = "dispatched", .mode = S_IRUGO },
308 .show = blk_mq_hw_sysfs_dispatched_show, 273 .show = blk_mq_hw_sysfs_dispatched_show,
309}; 274};
275static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
276 .attr = {.name = "active", .mode = S_IRUGO },
277 .show = blk_mq_hw_sysfs_active_show,
278};
310static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 279static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
311 .attr = {.name = "pending", .mode = S_IRUGO }, 280 .attr = {.name = "pending", .mode = S_IRUGO },
312 .show = blk_mq_hw_sysfs_rq_list_show, 281 .show = blk_mq_hw_sysfs_rq_list_show,
313}; 282};
314static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
315 .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
316 .show = blk_mq_hw_sysfs_ipi_show,
317 .store = blk_mq_hw_sysfs_ipi_store,
318};
319static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 283static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
320 .attr = {.name = "tags", .mode = S_IRUGO }, 284 .attr = {.name = "tags", .mode = S_IRUGO },
321 .show = blk_mq_hw_sysfs_tags_show, 285 .show = blk_mq_hw_sysfs_tags_show,
@@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = {
330 &blk_mq_hw_sysfs_run.attr, 294 &blk_mq_hw_sysfs_run.attr,
331 &blk_mq_hw_sysfs_dispatched.attr, 295 &blk_mq_hw_sysfs_dispatched.attr,
332 &blk_mq_hw_sysfs_pending.attr, 296 &blk_mq_hw_sysfs_pending.attr,
333 &blk_mq_hw_sysfs_ipi.attr,
334 &blk_mq_hw_sysfs_tags.attr, 297 &blk_mq_hw_sysfs_tags.attr,
335 &blk_mq_hw_sysfs_cpus.attr, 298 &blk_mq_hw_sysfs_cpus.attr,
299 &blk_mq_hw_sysfs_active.attr,
336 NULL, 300 NULL,
337}; 301};
338 302
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c51a27..d90c4aeb7dd3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,78 +1,345 @@
1/*
2 * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
3 * over multiple cachelines to avoid ping-pong between multiple submitters
4 * or submitter and completer. Uses rolling wakeups to avoid falling of
5 * the scaling cliff when we run out of tags and have to start putting
6 * submitters to sleep.
7 *
8 * Uses active queue tracking to support fairer distribution of tags
9 * between multiple submitters when a shared tag map is used.
10 *
11 * Copyright (C) 2013-2014 Jens Axboe
12 */
1#include <linux/kernel.h> 13#include <linux/kernel.h>
2#include <linux/module.h> 14#include <linux/module.h>
3#include <linux/percpu_ida.h> 15#include <linux/random.h>
4 16
5#include <linux/blk-mq.h> 17#include <linux/blk-mq.h>
6#include "blk.h" 18#include "blk.h"
7#include "blk-mq.h" 19#include "blk-mq.h"
8#include "blk-mq-tag.h" 20#include "blk-mq-tag.h"
9 21
22static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
23{
24 int i;
25
26 for (i = 0; i < bt->map_nr; i++) {
27 struct blk_align_bitmap *bm = &bt->map[i];
28 int ret;
29
30 ret = find_first_zero_bit(&bm->word, bm->depth);
31 if (ret < bm->depth)
32 return true;
33 }
34
35 return false;
36}
37
38bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
39{
40 if (!tags)
41 return true;
42
43 return bt_has_free_tags(&tags->bitmap_tags);
44}
45
46static inline void bt_index_inc(unsigned int *index)
47{
48 *index = (*index + 1) & (BT_WAIT_QUEUES - 1);
49}
50
10/* 51/*
11 * Per tagged queue (tag address space) map 52 * If a previously inactive queue goes active, bump the active user count.
12 */ 53 */
13struct blk_mq_tags { 54bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
14 unsigned int nr_tags; 55{
15 unsigned int nr_reserved_tags; 56 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
16 unsigned int nr_batch_move; 57 !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
17 unsigned int nr_max_cache; 58 atomic_inc(&hctx->tags->active_queues);
18 59
19 struct percpu_ida free_tags; 60 return true;
20 struct percpu_ida reserved_tags; 61}
21};
22 62
23void blk_mq_wait_for_tags(struct blk_mq_tags *tags) 63/*
64 * Wakeup all potentially sleeping on normal (non-reserved) tags
65 */
66static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
24{ 67{
25 int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); 68 struct blk_mq_bitmap_tags *bt;
26 blk_mq_put_tag(tags, tag); 69 int i, wake_index;
70
71 bt = &tags->bitmap_tags;
72 wake_index = bt->wake_index;
73 for (i = 0; i < BT_WAIT_QUEUES; i++) {
74 struct bt_wait_state *bs = &bt->bs[wake_index];
75
76 if (waitqueue_active(&bs->wait))
77 wake_up(&bs->wait);
78
79 bt_index_inc(&wake_index);
80 }
27} 81}
28 82
29bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 83/*
84 * If a previously busy queue goes inactive, potential waiters could now
85 * be allowed to queue. Wake them up and check.
86 */
87void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
88{
89 struct blk_mq_tags *tags = hctx->tags;
90
91 if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
92 return;
93
94 atomic_dec(&tags->active_queues);
95
96 blk_mq_tag_wakeup_all(tags);
97}
98
99/*
100 * For shared tag users, we track the number of currently active users
101 * and attempt to provide a fair share of the tag depth for each of them.
102 */
103static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
104 struct blk_mq_bitmap_tags *bt)
105{
106 unsigned int depth, users;
107
108 if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
109 return true;
110 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
111 return true;
112
113 /*
114 * Don't try dividing an ant
115 */
116 if (bt->depth == 1)
117 return true;
118
119 users = atomic_read(&hctx->tags->active_queues);
120 if (!users)
121 return true;
122
123 /*
124 * Allow at least some tags
125 */
126 depth = max((bt->depth + users - 1) / users, 4U);
127 return atomic_read(&hctx->nr_active) < depth;
128}
129
130static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
30{ 131{
31 return !tags || 132 int tag, org_last_tag, end;
32 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; 133
134 org_last_tag = last_tag;
135 end = bm->depth;
136 do {
137restart:
138 tag = find_next_zero_bit(&bm->word, end, last_tag);
139 if (unlikely(tag >= end)) {
140 /*
141 * We started with an offset, start from 0 to
142 * exhaust the map.
143 */
144 if (org_last_tag && last_tag) {
145 end = last_tag;
146 last_tag = 0;
147 goto restart;
148 }
149 return -1;
150 }
151 last_tag = tag + 1;
152 } while (test_and_set_bit_lock(tag, &bm->word));
153
154 return tag;
33} 155}
34 156
35static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) 157/*
158 * Straight forward bitmap tag implementation, where each bit is a tag
159 * (cleared == free, and set == busy). The small twist is using per-cpu
160 * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
161 * contexts. This enables us to drastically limit the space searched,
162 * without dirtying an extra shared cacheline like we would if we stored
163 * the cache value inside the shared blk_mq_bitmap_tags structure. On top
164 * of that, each word of tags is in a separate cacheline. This means that
165 * multiple users will tend to stick to different cachelines, at least
166 * until the map is exhausted.
167 */
168static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
169 unsigned int *tag_cache)
36{ 170{
171 unsigned int last_tag, org_last_tag;
172 int index, i, tag;
173
174 if (!hctx_may_queue(hctx, bt))
175 return -1;
176
177 last_tag = org_last_tag = *tag_cache;
178 index = TAG_TO_INDEX(bt, last_tag);
179
180 for (i = 0; i < bt->map_nr; i++) {
181 tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
182 if (tag != -1) {
183 tag += (index << bt->bits_per_word);
184 goto done;
185 }
186
187 last_tag = 0;
188 if (++index >= bt->map_nr)
189 index = 0;
190 }
191
192 *tag_cache = 0;
193 return -1;
194
195 /*
196 * Only update the cache from the allocation path, if we ended
197 * up using the specific cached tag.
198 */
199done:
200 if (tag == org_last_tag) {
201 last_tag = tag + 1;
202 if (last_tag >= bt->depth - 1)
203 last_tag = 0;
204
205 *tag_cache = last_tag;
206 }
207
208 return tag;
209}
210
211static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
212 struct blk_mq_hw_ctx *hctx)
213{
214 struct bt_wait_state *bs;
215
216 if (!hctx)
217 return &bt->bs[0];
218
219 bs = &bt->bs[hctx->wait_index];
220 bt_index_inc(&hctx->wait_index);
221 return bs;
222}
223
224static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
225 unsigned int *last_tag, gfp_t gfp)
226{
227 struct bt_wait_state *bs;
228 DEFINE_WAIT(wait);
37 int tag; 229 int tag;
38 230
39 tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? 231 tag = __bt_get(hctx, bt, last_tag);
40 TASK_UNINTERRUPTIBLE : TASK_RUNNING); 232 if (tag != -1)
41 if (tag < 0) 233 return tag;
42 return BLK_MQ_TAG_FAIL; 234
43 return tag + tags->nr_reserved_tags; 235 if (!(gfp & __GFP_WAIT))
236 return -1;
237
238 bs = bt_wait_ptr(bt, hctx);
239 do {
240 bool was_empty;
241
242 was_empty = list_empty(&wait.task_list);
243 prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
244
245 tag = __bt_get(hctx, bt, last_tag);
246 if (tag != -1)
247 break;
248
249 if (was_empty)
250 atomic_set(&bs->wait_cnt, bt->wake_cnt);
251
252 io_schedule();
253 } while (1);
254
255 finish_wait(&bs->wait, &wait);
256 return tag;
257}
258
259static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
260 struct blk_mq_hw_ctx *hctx,
261 unsigned int *last_tag, gfp_t gfp)
262{
263 int tag;
264
265 tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
266 if (tag >= 0)
267 return tag + tags->nr_reserved_tags;
268
269 return BLK_MQ_TAG_FAIL;
44} 270}
45 271
46static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, 272static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
47 gfp_t gfp) 273 gfp_t gfp)
48{ 274{
49 int tag; 275 int tag, zero = 0;
50 276
51 if (unlikely(!tags->nr_reserved_tags)) { 277 if (unlikely(!tags->nr_reserved_tags)) {
52 WARN_ON_ONCE(1); 278 WARN_ON_ONCE(1);
53 return BLK_MQ_TAG_FAIL; 279 return BLK_MQ_TAG_FAIL;
54 } 280 }
55 281
56 tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? 282 tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
57 TASK_UNINTERRUPTIBLE : TASK_RUNNING);
58 if (tag < 0) 283 if (tag < 0)
59 return BLK_MQ_TAG_FAIL; 284 return BLK_MQ_TAG_FAIL;
285
60 return tag; 286 return tag;
61} 287}
62 288
63unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) 289unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
290 gfp_t gfp, bool reserved)
64{ 291{
65 if (!reserved) 292 if (!reserved)
66 return __blk_mq_get_tag(tags, gfp); 293 return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
67 294
68 return __blk_mq_get_reserved_tag(tags, gfp); 295 return __blk_mq_get_reserved_tag(hctx->tags, gfp);
296}
297
298static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
299{
300 int i, wake_index;
301
302 wake_index = bt->wake_index;
303 for (i = 0; i < BT_WAIT_QUEUES; i++) {
304 struct bt_wait_state *bs = &bt->bs[wake_index];
305
306 if (waitqueue_active(&bs->wait)) {
307 if (wake_index != bt->wake_index)
308 bt->wake_index = wake_index;
309
310 return bs;
311 }
312
313 bt_index_inc(&wake_index);
314 }
315
316 return NULL;
317}
318
319static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
320{
321 const int index = TAG_TO_INDEX(bt, tag);
322 struct bt_wait_state *bs;
323
324 /*
325 * The unlock memory barrier need to order access to req in free
326 * path and clearing tag bit
327 */
328 clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
329
330 bs = bt_wake_ptr(bt);
331 if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
332 atomic_set(&bs->wait_cnt, bt->wake_cnt);
333 bt_index_inc(&bt->wake_index);
334 wake_up(&bs->wait);
335 }
69} 336}
70 337
71static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 338static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
72{ 339{
73 BUG_ON(tag >= tags->nr_tags); 340 BUG_ON(tag >= tags->nr_tags);
74 341
75 percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); 342 bt_clear_tag(&tags->bitmap_tags, tag);
76} 343}
77 344
78static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, 345static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
@@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
80{ 347{
81 BUG_ON(tag >= tags->nr_reserved_tags); 348 BUG_ON(tag >= tags->nr_reserved_tags);
82 349
83 percpu_ida_free(&tags->reserved_tags, tag); 350 bt_clear_tag(&tags->breserved_tags, tag);
84} 351}
85 352
86void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 353void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
354 unsigned int *last_tag)
87{ 355{
88 if (tag >= tags->nr_reserved_tags) 356 struct blk_mq_tags *tags = hctx->tags;
89 __blk_mq_put_tag(tags, tag); 357
90 else 358 if (tag >= tags->nr_reserved_tags) {
359 const int real_tag = tag - tags->nr_reserved_tags;
360
361 __blk_mq_put_tag(tags, real_tag);
362 *last_tag = real_tag;
363 } else
91 __blk_mq_put_reserved_tag(tags, tag); 364 __blk_mq_put_reserved_tag(tags, tag);
92} 365}
93 366
94static int __blk_mq_tag_iter(unsigned id, void *data) 367static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
368 unsigned long *free_map, unsigned int off)
95{ 369{
96 unsigned long *tag_map = data; 370 int i;
97 __set_bit(id, tag_map); 371
98 return 0; 372 for (i = 0; i < bt->map_nr; i++) {
373 struct blk_align_bitmap *bm = &bt->map[i];
374 int bit = 0;
375
376 do {
377 bit = find_next_zero_bit(&bm->word, bm->depth, bit);
378 if (bit >= bm->depth)
379 break;
380
381 __set_bit(bit + off, free_map);
382 bit++;
383 } while (1);
384
385 off += (1 << bt->bits_per_word);
386 }
99} 387}
100 388
101void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, 389void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
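
hctx_may_queue() in the hunk above is what keeps shared tag maps fair: each active queue is limited to roughly ceil(depth / users) tags, with a floor of four. A stand-alone restatement of that arithmetic, assuming nothing beyond what the hunk shows; the function name is invented.

    static bool may_queue_example(unsigned int map_depth, unsigned int users,
                                  unsigned int nr_active)
    {
            unsigned int share;

            if (map_depth == 1 || !users)
                    return true;

            /* ceil(map_depth / users), but never fewer than 4 tags */
            share = max(DIV_ROUND_UP(map_depth, users), 4U);

            /* e.g. 256 shared tags and 3 active queues -> share == 86 */
            return nr_active < share;
    }
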
@@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
109 if (!tag_map) 397 if (!tag_map)
110 return; 398 return;
111 399
112 percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); 400 bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
113 if (tags->nr_reserved_tags) 401 if (tags->nr_reserved_tags)
114 percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, 402 bt_for_each_free(&tags->breserved_tags, tag_map, 0);
115 tag_map);
116 403
117 fn(data, tag_map); 404 fn(data, tag_map);
118 kfree(tag_map); 405 kfree(tag_map);
119} 406}
407EXPORT_SYMBOL(blk_mq_tag_busy_iter);
408
409static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
410{
411 unsigned int i, used;
412
413 for (i = 0, used = 0; i < bt->map_nr; i++) {
414 struct blk_align_bitmap *bm = &bt->map[i];
415
416 used += bitmap_weight(&bm->word, bm->depth);
417 }
418
419 return bt->depth - used;
420}
421
422static void bt_update_count(struct blk_mq_bitmap_tags *bt,
423 unsigned int depth)
424{
425 unsigned int tags_per_word = 1U << bt->bits_per_word;
426 unsigned int map_depth = depth;
427
428 if (depth) {
429 int i;
430
431 for (i = 0; i < bt->map_nr; i++) {
432 bt->map[i].depth = min(map_depth, tags_per_word);
433 map_depth -= bt->map[i].depth;
434 }
435 }
436
437 bt->wake_cnt = BT_WAIT_BATCH;
438 if (bt->wake_cnt > depth / 4)
439 bt->wake_cnt = max(1U, depth / 4);
440
441 bt->depth = depth;
442}
443
444static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
445 int node, bool reserved)
446{
447 int i;
448
449 bt->bits_per_word = ilog2(BITS_PER_LONG);
450
451 /*
452 * Depth can be zero for reserved tags, that's not a failure
453 * condition.
454 */
455 if (depth) {
456 unsigned int nr, tags_per_word;
457
458 tags_per_word = (1 << bt->bits_per_word);
459
460 /*
461 * If the tag space is small, shrink the number of tags
462 * per word so we spread over a few cachelines, at least.
463 * If less than 4 tags, just forget about it, it's not
464 * going to work optimally anyway.
465 */
466 if (depth >= 4) {
467 while (tags_per_word * 4 > depth) {
468 bt->bits_per_word--;
469 tags_per_word = (1 << bt->bits_per_word);
470 }
471 }
472
473 nr = ALIGN(depth, tags_per_word) / tags_per_word;
474 bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
475 GFP_KERNEL, node);
476 if (!bt->map)
477 return -ENOMEM;
478
479 bt->map_nr = nr;
480 }
481
482 bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
483 if (!bt->bs) {
484 kfree(bt->map);
485 return -ENOMEM;
486 }
487
488 for (i = 0; i < BT_WAIT_QUEUES; i++)
489 init_waitqueue_head(&bt->bs[i].wait);
490
491 bt_update_count(bt, depth);
492 return 0;
493}
494
495static void bt_free(struct blk_mq_bitmap_tags *bt)
496{
497 kfree(bt->map);
498 kfree(bt->bs);
499}
500
501static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
502 int node)
503{
504 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
505
506 if (bt_alloc(&tags->bitmap_tags, depth, node, false))
507 goto enomem;
508 if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
509 goto enomem;
510
511 return tags;
512enomem:
513 bt_free(&tags->bitmap_tags);
514 kfree(tags);
515 return NULL;
516}
120 517
121struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 518struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
122 unsigned int reserved_tags, int node) 519 unsigned int reserved_tags, int node)
123{ 520{
124 unsigned int nr_tags, nr_cache;
125 struct blk_mq_tags *tags; 521 struct blk_mq_tags *tags;
126 int ret;
127 522
128 if (total_tags > BLK_MQ_TAG_MAX) { 523 if (total_tags > BLK_MQ_TAG_MAX) {
129 pr_err("blk-mq: tag depth too large\n"); 524 pr_err("blk-mq: tag depth too large\n");
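
bt_alloc() in the hunk above shrinks the number of tags per bitmap word for small tag spaces so the map still spreads across several cachelines. A hedged restatement of that sizing rule with one worked value, not part of the patch; the function name is invented.

    static unsigned int example_bits_per_word(unsigned int depth)
    {
            unsigned int bits = ilog2(BITS_PER_LONG);       /* 6 on 64-bit */

            /* same shrink rule as bt_alloc(): halve while 4 words overshoot */
            if (depth >= 4)
                    while ((1U << bits) * 4 > depth)
                            bits--;

            /* depth == 32 -> bits == 3, i.e. 4 words of 8 tags each */
            return bits;
    }
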
@@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
134 if (!tags) 529 if (!tags)
135 return NULL; 530 return NULL;
136 531
137 nr_tags = total_tags - reserved_tags;
138 nr_cache = nr_tags / num_possible_cpus();
139
140 if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
141 nr_cache = BLK_MQ_TAG_CACHE_MIN;
142 else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
143 nr_cache = BLK_MQ_TAG_CACHE_MAX;
144
145 tags->nr_tags = total_tags; 532 tags->nr_tags = total_tags;
146 tags->nr_reserved_tags = reserved_tags; 533 tags->nr_reserved_tags = reserved_tags;
147 tags->nr_max_cache = nr_cache;
148 tags->nr_batch_move = max(1u, nr_cache / 2);
149 534
150 ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - 535 return blk_mq_init_bitmap_tags(tags, node);
151 tags->nr_reserved_tags, 536}
152 tags->nr_max_cache,
153 tags->nr_batch_move);
154 if (ret)
155 goto err_free_tags;
156 537
157 if (reserved_tags) { 538void blk_mq_free_tags(struct blk_mq_tags *tags)
158 /* 539{
159 * With max_cahe and batch set to 1, the allocator fallbacks to 540 bt_free(&tags->bitmap_tags);
160 * no cached. It's fine reserved tags allocation is slow. 541 bt_free(&tags->breserved_tags);
161 */ 542 kfree(tags);
162 ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, 543}
163 1, 1);
164 if (ret)
165 goto err_reserved_tags;
166 }
167 544
168 return tags; 545void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
546{
547 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
169 548
170err_reserved_tags: 549 *tag = prandom_u32() % depth;
171 percpu_ida_destroy(&tags->free_tags);
172err_free_tags:
173 kfree(tags);
174 return NULL;
175} 550}
176 551
177void blk_mq_free_tags(struct blk_mq_tags *tags) 552int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
178{ 553{
179 percpu_ida_destroy(&tags->free_tags); 554 tdepth -= tags->nr_reserved_tags;
180 percpu_ida_destroy(&tags->reserved_tags); 555 if (tdepth > tags->nr_tags)
181 kfree(tags); 556 return -EINVAL;
557
558 /*
559 * Don't need (or can't) update reserved tags here, they remain
560 * static and should never need resizing.
561 */
562 bt_update_count(&tags->bitmap_tags, tdepth);
563 blk_mq_tag_wakeup_all(tags);
564 return 0;
182} 565}
183 566
184ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 567ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
185{ 568{
186 char *orig_page = page; 569 char *orig_page = page;
187 unsigned int cpu; 570 unsigned int free, res;
188 571
189 if (!tags) 572 if (!tags)
190 return 0; 573 return 0;
191 574
192 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," 575 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
193 " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, 576 "bits_per_word=%u\n",
194 tags->nr_batch_move, tags->nr_max_cache); 577 tags->nr_tags, tags->nr_reserved_tags,
578 tags->bitmap_tags.bits_per_word);
195 579
196 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", 580 free = bt_unused_tags(&tags->bitmap_tags);
197 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), 581 res = bt_unused_tags(&tags->breserved_tags);
198 percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
199 582
200 for_each_possible_cpu(cpu) { 583 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
201 page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, 584 page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
202 percpu_ida_free_tags(&tags->free_tags, cpu));
203 }
204 585
205 return page - orig_page; 586 return page - orig_page;
206} 587}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 947ba2c6148e..c959de58d2a5 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,17 +1,59 @@
1#ifndef INT_BLK_MQ_TAG_H 1#ifndef INT_BLK_MQ_TAG_H
2#define INT_BLK_MQ_TAG_H 2#define INT_BLK_MQ_TAG_H
3 3
4struct blk_mq_tags; 4#include "blk-mq.h"
5
6enum {
7 BT_WAIT_QUEUES = 8,
8 BT_WAIT_BATCH = 8,
9};
10
11struct bt_wait_state {
12 atomic_t wait_cnt;
13 wait_queue_head_t wait;
14} ____cacheline_aligned_in_smp;
15
16#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
17#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
18
19struct blk_mq_bitmap_tags {
20 unsigned int depth;
21 unsigned int wake_cnt;
22 unsigned int bits_per_word;
23
24 unsigned int map_nr;
25 struct blk_align_bitmap *map;
26
27 unsigned int wake_index;
28 struct bt_wait_state *bs;
29};
30
31/*
32 * Tag address space map.
33 */
34struct blk_mq_tags {
35 unsigned int nr_tags;
36 unsigned int nr_reserved_tags;
37
38 atomic_t active_queues;
39
40 struct blk_mq_bitmap_tags bitmap_tags;
41 struct blk_mq_bitmap_tags breserved_tags;
42
43 struct request **rqs;
44 struct list_head page_list;
45};
46
5 47
6extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 48extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
7extern void blk_mq_free_tags(struct blk_mq_tags *tags); 49extern void blk_mq_free_tags(struct blk_mq_tags *tags);
8 50
9extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); 51extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
10extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); 52extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
11extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
12extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
13extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 53extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
14extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 54extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
55extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
56extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
15 57
16enum { 58enum {
17 BLK_MQ_TAG_CACHE_MIN = 1, 59 BLK_MQ_TAG_CACHE_MIN = 1,
@@ -24,4 +66,23 @@ enum {
24 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 66 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
25}; 67};
26 68
69extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
70extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
71
72static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
73{
74 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
75 return false;
76
77 return __blk_mq_tag_busy(hctx);
78}
79
80static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
81{
82 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
83 return;
84
85 __blk_mq_tag_idle(hctx);
86}
87
27#endif 88#endif
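
TAG_TO_INDEX() and TAG_TO_BIT() in this header split a tag number into a bitmap word and a bit within that word. An illustration only, with values picked for the example rather than taken from the patch:

    static void tag_mapping_example(void)
    {
            unsigned int tag = 71;                          /* picked for the example */
            unsigned int index = tag >> 5;                  /* TAG_TO_INDEX: word 2, with bits_per_word == 5 */
            unsigned int bit = tag & ((1U << 5) - 1);       /* TAG_TO_BIT:   bit 7  */

            /* so bt->map[index].word bit 'bit' tracks tag 71 */
    }
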
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1d2a9bdbee57..f27fe44230c2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1,3 +1,9 @@
1/*
2 * Block multiqueue core code
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */
1#include <linux/kernel.h> 7#include <linux/kernel.h>
2#include <linux/module.h> 8#include <linux/module.h>
3#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
@@ -56,38 +62,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
56{ 62{
57 unsigned int i; 63 unsigned int i;
58 64
59 for (i = 0; i < hctx->nr_ctx_map; i++) 65 for (i = 0; i < hctx->ctx_map.map_size; i++)
60 if (hctx->ctx_map[i]) 66 if (hctx->ctx_map.map[i].word)
61 return true; 67 return true;
62 68
63 return false; 69 return false;
64} 70}
65 71
72static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
73 struct blk_mq_ctx *ctx)
74{
75 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
76}
77
78#define CTX_TO_BIT(hctx, ctx) \
79 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
80
66/* 81/*
67 * Mark this ctx as having pending work in this hardware queue 82 * Mark this ctx as having pending work in this hardware queue
68 */ 83 */
69static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 84static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
70 struct blk_mq_ctx *ctx) 85 struct blk_mq_ctx *ctx)
71{ 86{
72 if (!test_bit(ctx->index_hw, hctx->ctx_map)) 87 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
73 set_bit(ctx->index_hw, hctx->ctx_map); 88
89 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
90 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
74} 91}
75 92
76static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 93static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
77 gfp_t gfp, bool reserved) 94 struct blk_mq_ctx *ctx)
78{ 95{
79 struct request *rq; 96 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
80 unsigned int tag;
81 97
82 tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 98 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
83 if (tag != BLK_MQ_TAG_FAIL) {
84 rq = hctx->rqs[tag];
85 rq->tag = tag;
86
87 return rq;
88 }
89
90 return NULL;
91} 99}
92 100
93static int blk_mq_queue_enter(struct request_queue *q) 101static int blk_mq_queue_enter(struct request_queue *q)
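
The per-hctx pending map is now split into cacheline-aligned words, and blk_mq_hctx_mark_pending() reads the bit before setting it, presumably because set_bit() is an atomic read-modify-write that takes the cacheline exclusive even when the bit is already set, while the plain test lets hot submitters skip that. A minimal C11 sketch of this "test before atomic set" pattern; the names are illustrative and not from the patch.

    #include <stdatomic.h>
    #include <stdio.h>

    /* illustrative stand-in for one blk_align_bitmap word */
    static atomic_ulong pending_word;

    static void mark_pending(unsigned int bit)
    {
            unsigned long mask = 1UL << bit;

            /* plain read first: no ownership change if already marked */
            if (atomic_load_explicit(&pending_word, memory_order_relaxed) & mask)
                    return;

            /* otherwise fall back to the atomic RMW, like set_bit() */
            atomic_fetch_or(&pending_word, mask);
    }

    int main(void)
    {
            mark_pending(3);
            mark_pending(3);        /* second call takes the cheap path */
            printf("word = %#lx\n", (unsigned long)atomic_load(&pending_word));
            return 0;
    }
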
@@ -186,78 +194,109 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
186 if (blk_queue_io_stat(q)) 194 if (blk_queue_io_stat(q))
187 rw_flags |= REQ_IO_STAT; 195 rw_flags |= REQ_IO_STAT;
188 196
197 INIT_LIST_HEAD(&rq->queuelist);
198 /* csd/requeue_work/fifo_time is initialized before use */
199 rq->q = q;
189 rq->mq_ctx = ctx; 200 rq->mq_ctx = ctx;
190 rq->cmd_flags = rw_flags; 201 rq->cmd_flags |= rw_flags;
202 rq->cmd_type = 0;
203 /* do not touch atomic flags, it needs atomic ops against the timer */
204 rq->cpu = -1;
205 rq->__data_len = 0;
206 rq->__sector = (sector_t) -1;
207 rq->bio = NULL;
208 rq->biotail = NULL;
209 INIT_HLIST_NODE(&rq->hash);
210 RB_CLEAR_NODE(&rq->rb_node);
211 memset(&rq->flush, 0, max(sizeof(rq->flush), sizeof(rq->elv)));
212 rq->rq_disk = NULL;
213 rq->part = NULL;
191 rq->start_time = jiffies; 214 rq->start_time = jiffies;
215#ifdef CONFIG_BLK_CGROUP
216 rq->rl = NULL;
192 set_start_time_ns(rq); 217 set_start_time_ns(rq);
218 rq->io_start_time_ns = 0;
219#endif
220 rq->nr_phys_segments = 0;
221#if defined(CONFIG_BLK_DEV_INTEGRITY)
222 rq->nr_integrity_segments = 0;
223#endif
224 rq->ioprio = 0;
225 rq->special = NULL;
226 /* tag was already set */
227 rq->errors = 0;
228 memset(rq->__cmd, 0, sizeof(rq->__cmd));
229 rq->cmd = rq->__cmd;
230 rq->cmd_len = BLK_MAX_CDB;
231
232 rq->extra_len = 0;
233 rq->sense_len = 0;
234 rq->resid_len = 0;
235 rq->sense = NULL;
236
237 rq->deadline = 0;
238 INIT_LIST_HEAD(&rq->timeout_list);
239 rq->timeout = 0;
240 rq->retries = 0;
241 rq->end_io = NULL;
242 rq->end_io_data = NULL;
243 rq->next_rq = NULL;
244
193 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 245 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
194} 246}
195 247
196static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 248static struct request *
197 int rw, gfp_t gfp, 249__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
198 bool reserved) 250 struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
199{ 251{
200 struct request *rq; 252 struct request *rq;
253 unsigned int tag;
201 254
202 do { 255 tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
203 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 256 if (tag != BLK_MQ_TAG_FAIL) {
204 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 257 rq = hctx->tags->rqs[tag];
205 258
206 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 259 rq->cmd_flags = 0;
207 if (rq) { 260 if (blk_mq_tag_busy(hctx)) {
208 blk_mq_rq_ctx_init(q, ctx, rq, rw); 261 rq->cmd_flags = REQ_MQ_INFLIGHT;
209 break; 262 atomic_inc(&hctx->nr_active);
210 } 263 }
211 264
212 blk_mq_put_ctx(ctx); 265 rq->tag = tag;
213 if (!(gfp & __GFP_WAIT)) 266 blk_mq_rq_ctx_init(q, ctx, rq, rw);
214 break; 267 return rq;
215 268 }
216 __blk_mq_run_hw_queue(hctx);
217 blk_mq_wait_for_tags(hctx->tags);
218 } while (1);
219 269
220 return rq; 270 return NULL;
221} 271}
222 272
223struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) 273struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
274 bool reserved)
224{ 275{
276 struct blk_mq_ctx *ctx;
277 struct blk_mq_hw_ctx *hctx;
225 struct request *rq; 278 struct request *rq;
226 279
227 if (blk_mq_queue_enter(q)) 280 if (blk_mq_queue_enter(q))
228 return NULL; 281 return NULL;
229 282
230 rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); 283 ctx = blk_mq_get_ctx(q);
231 if (rq) 284 hctx = q->mq_ops->map_queue(q, ctx->cpu);
232 blk_mq_put_ctx(rq->mq_ctx);
233 return rq;
234}
235
236struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
237 gfp_t gfp)
238{
239 struct request *rq;
240 285
241 if (blk_mq_queue_enter(q)) 286 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT,
242 return NULL; 287 reserved);
288 if (!rq && (gfp & __GFP_WAIT)) {
289 __blk_mq_run_hw_queue(hctx);
290 blk_mq_put_ctx(ctx);
243 291
244 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 292 ctx = blk_mq_get_ctx(q);
245 if (rq) 293 hctx = q->mq_ops->map_queue(q, ctx->cpu);
246 blk_mq_put_ctx(rq->mq_ctx); 294 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved);
295 }
296 blk_mq_put_ctx(ctx);
247 return rq; 297 return rq;
248} 298}
249EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 299EXPORT_SYMBOL(blk_mq_alloc_request);
250
251/*
252 * Re-init and set pdu, if we have it
253 */
254void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
255{
256 blk_rq_init(hctx->queue, rq);
257
258 if (hctx->cmd_size)
259 rq->special = blk_mq_rq_to_pdu(rq);
260}
261 300
262static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 301static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
263 struct blk_mq_ctx *ctx, struct request *rq) 302 struct blk_mq_ctx *ctx, struct request *rq)
@@ -265,9 +304,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
265 const int tag = rq->tag; 304 const int tag = rq->tag;
266 struct request_queue *q = rq->q; 305 struct request_queue *q = rq->q;
267 306
268 blk_mq_rq_init(hctx, rq); 307 if (rq->cmd_flags & REQ_MQ_INFLIGHT)
269 blk_mq_put_tag(hctx->tags, tag); 308 atomic_dec(&hctx->nr_active);
270 309
310 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
311 blk_mq_put_tag(hctx, tag, &ctx->last_tag);
271 blk_mq_queue_exit(q); 312 blk_mq_queue_exit(q);
272} 313}
273 314
@@ -283,20 +324,47 @@ void blk_mq_free_request(struct request *rq)
283 __blk_mq_free_request(hctx, ctx, rq); 324 __blk_mq_free_request(hctx, ctx, rq);
284} 325}
285 326
286bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) 327/*
328 * Clone all relevant state from a request that has been put on hold in
329 * the flush state machine into the preallocated flush request that hangs
330 * off the request queue.
331 *
332 * For a driver the flush request should be invisible, that's why we are
333 * impersonating the original request here.
334 */
335void blk_mq_clone_flush_request(struct request *flush_rq,
336 struct request *orig_rq)
287{ 337{
288 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 338 struct blk_mq_hw_ctx *hctx =
289 return true; 339 orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
290 340
341 flush_rq->mq_ctx = orig_rq->mq_ctx;
342 flush_rq->tag = orig_rq->tag;
343 memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
344 hctx->cmd_size);
345}
346
347inline void __blk_mq_end_io(struct request *rq, int error)
348{
291 blk_account_io_done(rq); 349 blk_account_io_done(rq);
292 350
293 if (rq->end_io) 351 if (rq->end_io) {
294 rq->end_io(rq, error); 352 rq->end_io(rq, error);
295 else 353 } else {
354 if (unlikely(blk_bidi_rq(rq)))
355 blk_mq_free_request(rq->next_rq);
296 blk_mq_free_request(rq); 356 blk_mq_free_request(rq);
297 return false; 357 }
298} 358}
299EXPORT_SYMBOL(blk_mq_end_io_partial); 359EXPORT_SYMBOL(__blk_mq_end_io);
360
361void blk_mq_end_io(struct request *rq, int error)
362{
363 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
364 BUG();
365 __blk_mq_end_io(rq, error);
366}
367EXPORT_SYMBOL(blk_mq_end_io);
300 368
301static void __blk_mq_complete_request_remote(void *data) 369static void __blk_mq_complete_request_remote(void *data)
302{ 370{
@@ -308,15 +376,19 @@ static void __blk_mq_complete_request_remote(void *data)
308void __blk_mq_complete_request(struct request *rq) 376void __blk_mq_complete_request(struct request *rq)
309{ 377{
310 struct blk_mq_ctx *ctx = rq->mq_ctx; 378 struct blk_mq_ctx *ctx = rq->mq_ctx;
379 bool shared = false;
311 int cpu; 380 int cpu;
312 381
313 if (!ctx->ipi_redirect) { 382 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
314 rq->q->softirq_done_fn(rq); 383 rq->q->softirq_done_fn(rq);
315 return; 384 return;
316 } 385 }
317 386
318 cpu = get_cpu(); 387 cpu = get_cpu();
319 if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { 388 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
389 shared = cpus_share_cache(cpu, ctx->cpu);
390
391 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
320 rq->csd.func = __blk_mq_complete_request_remote; 392 rq->csd.func = __blk_mq_complete_request_remote;
321 rq->csd.info = rq; 393 rq->csd.info = rq;
322 rq->csd.flags = 0; 394 rq->csd.flags = 0;
@@ -337,10 +409,16 @@ void __blk_mq_complete_request(struct request *rq)
337 **/ 409 **/
338void blk_mq_complete_request(struct request *rq) 410void blk_mq_complete_request(struct request *rq)
339{ 411{
340 if (unlikely(blk_should_fake_timeout(rq->q))) 412 struct request_queue *q = rq->q;
413
414 if (unlikely(blk_should_fake_timeout(q)))
341 return; 415 return;
342 if (!blk_mark_rq_complete(rq)) 416 if (!blk_mark_rq_complete(rq)) {
343 __blk_mq_complete_request(rq); 417 if (q->softirq_done_fn)
418 __blk_mq_complete_request(rq);
419 else
420 blk_mq_end_io(rq, rq->errors);
421 }
344} 422}
345EXPORT_SYMBOL(blk_mq_complete_request); 423EXPORT_SYMBOL(blk_mq_complete_request);
346 424
@@ -350,13 +428,29 @@ static void blk_mq_start_request(struct request *rq, bool last)
350 428
351 trace_block_rq_issue(q, rq); 429 trace_block_rq_issue(q, rq);
352 430
431 rq->resid_len = blk_rq_bytes(rq);
432 if (unlikely(blk_bidi_rq(rq)))
433 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
434
353 /* 435 /*
354 * Just mark start time and set the started bit. Due to memory 436 * Just mark start time and set the started bit. Due to memory
355 * ordering, we know we'll see the correct deadline as long as 437 * ordering, we know we'll see the correct deadline as long as
 356 * REQ_ATOM_STARTED is seen. Use the default queue timeout, 438
439 * unless one has been set in the request.
440 */
441 if (!rq->timeout)
442 rq->deadline = jiffies + q->rq_timeout;
443 else
444 rq->deadline = jiffies + rq->timeout;
445
446 /*
447 * Mark us as started and clear complete. Complete might have been
448 * set if requeue raced with timeout, which then marked it as
449 * complete. So be sure to clear complete again when we start
450 * the request, otherwise we'll ignore the completion event.
357 */ 451 */
358 rq->deadline = jiffies + q->rq_timeout;
359 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 452 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
453 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
360 454
361 if (q->dma_drain_size && blk_rq_bytes(rq)) { 455 if (q->dma_drain_size && blk_rq_bytes(rq)) {
362 /* 456 /*
@@ -378,7 +472,7 @@ static void blk_mq_start_request(struct request *rq, bool last)
378 rq->cmd_flags |= REQ_END; 472 rq->cmd_flags |= REQ_END;
379} 473}
380 474
381static void blk_mq_requeue_request(struct request *rq) 475static void __blk_mq_requeue_request(struct request *rq)
382{ 476{
383 struct request_queue *q = rq->q; 477 struct request_queue *q = rq->q;
384 478
@@ -391,6 +485,80 @@ static void blk_mq_requeue_request(struct request *rq)
391 rq->nr_phys_segments--; 485 rq->nr_phys_segments--;
392} 486}
393 487
488void blk_mq_requeue_request(struct request *rq)
489{
490 __blk_mq_requeue_request(rq);
491 blk_clear_rq_complete(rq);
492
493 BUG_ON(blk_queued_rq(rq));
494 blk_mq_add_to_requeue_list(rq, true);
495}
496EXPORT_SYMBOL(blk_mq_requeue_request);
497
498static void blk_mq_requeue_work(struct work_struct *work)
499{
500 struct request_queue *q =
501 container_of(work, struct request_queue, requeue_work);
502 LIST_HEAD(rq_list);
503 struct request *rq, *next;
504 unsigned long flags;
505
506 spin_lock_irqsave(&q->requeue_lock, flags);
507 list_splice_init(&q->requeue_list, &rq_list);
508 spin_unlock_irqrestore(&q->requeue_lock, flags);
509
510 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
511 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
512 continue;
513
514 rq->cmd_flags &= ~REQ_SOFTBARRIER;
515 list_del_init(&rq->queuelist);
516 blk_mq_insert_request(rq, true, false, false);
517 }
518
519 while (!list_empty(&rq_list)) {
520 rq = list_entry(rq_list.next, struct request, queuelist);
521 list_del_init(&rq->queuelist);
522 blk_mq_insert_request(rq, false, false, false);
523 }
524
525 blk_mq_run_queues(q, false);
526}
527
528void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
529{
530 struct request_queue *q = rq->q;
531 unsigned long flags;
532
533 /*
534 * We abuse this flag that is otherwise used by the I/O scheduler to
 535 * request head insertion from the workqueue.
536 */
537 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
538
539 spin_lock_irqsave(&q->requeue_lock, flags);
540 if (at_head) {
541 rq->cmd_flags |= REQ_SOFTBARRIER;
542 list_add(&rq->queuelist, &q->requeue_list);
543 } else {
544 list_add_tail(&rq->queuelist, &q->requeue_list);
545 }
546 spin_unlock_irqrestore(&q->requeue_lock, flags);
547}
548EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
549
550void blk_mq_kick_requeue_list(struct request_queue *q)
551{
552 kblockd_schedule_work(&q->requeue_work);
553}
554EXPORT_SYMBOL(blk_mq_kick_requeue_list);
555
556struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
557{
558 return tags->rqs[tag];
559}
560EXPORT_SYMBOL(blk_mq_tag_to_rq);
561
394struct blk_mq_timeout_data { 562struct blk_mq_timeout_data {
395 struct blk_mq_hw_ctx *hctx; 563 struct blk_mq_hw_ctx *hctx;
396 unsigned long *next; 564 unsigned long *next;
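
The requeue list added above gives drivers a sanctioned way to hand a request back to the core outside of ->queue_rq(): blk_mq_requeue_request() parks it, blk_mq_add_to_requeue_list() can force head insertion via the REQ_SOFTBARRIER trick described in the comment, and blk_mq_kick_requeue_list() schedules blk_mq_requeue_work() to re-insert everything and rerun the queues. A hypothetical completion handler using it could look like the fragment below; the driver name, the -EBUSY convention and mydev_request_done() are assumptions for the sketch, not code from this series.

    #include <linux/blk-mq.h>

    /*
     * Hypothetical driver fragment, for illustration only.  On a transient
     * "device busy" completion the request is pushed back to the core
     * instead of being failed.
     */
    static void mydev_request_done(struct request *rq, int error)
    {
            if (error == -EBUSY) {
                    /* put it on q->requeue_list (tail; no head insertion needed) */
                    blk_mq_requeue_request(rq);
                    /* schedule blk_mq_requeue_work() to re-insert and rerun */
                    blk_mq_kick_requeue_list(rq->q);
                    return;
            }

            blk_mq_end_io(rq, error);
    }
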
@@ -412,12 +580,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
412 do { 580 do {
413 struct request *rq; 581 struct request *rq;
414 582
415 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 583 tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
416 if (tag >= hctx->queue_depth) 584 if (tag >= hctx->tags->nr_tags)
417 break; 585 break;
418 586
419 rq = hctx->rqs[tag++]; 587 rq = blk_mq_tag_to_rq(hctx->tags, tag++);
420 588 if (rq->q != hctx->queue)
589 continue;
421 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 590 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
422 continue; 591 continue;
423 592
@@ -442,6 +611,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
442 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 611 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
443} 612}
444 613
614static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
615{
616 struct request_queue *q = rq->q;
617
618 /*
619 * We know that complete is set at this point. If STARTED isn't set
620 * anymore, then the request isn't active and the "timeout" should
621 * just be ignored. This can happen due to the bitflag ordering.
622 * Timeout first checks if STARTED is set, and if it is, assumes
623 * the request is active. But if we race with completion, then
 624 * both flags will get cleared. So check here again, and ignore
625 * a timeout event with a request that isn't active.
626 */
627 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
628 return BLK_EH_NOT_HANDLED;
629
630 if (!q->mq_ops->timeout)
631 return BLK_EH_RESET_TIMER;
632
633 return q->mq_ops->timeout(rq);
634}
635
445static void blk_mq_rq_timer(unsigned long data) 636static void blk_mq_rq_timer(unsigned long data)
446{ 637{
447 struct request_queue *q = (struct request_queue *) data; 638 struct request_queue *q = (struct request_queue *) data;
@@ -449,11 +640,24 @@ static void blk_mq_rq_timer(unsigned long data)
449 unsigned long next = 0; 640 unsigned long next = 0;
450 int i, next_set = 0; 641 int i, next_set = 0;
451 642
452 queue_for_each_hw_ctx(q, hctx, i) 643 queue_for_each_hw_ctx(q, hctx, i) {
644 /*
 645 * If no software queues are currently mapped to this
646 * hardware queue, there's nothing to check
647 */
648 if (!hctx->nr_ctx || !hctx->tags)
649 continue;
650
453 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 651 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
652 }
454 653
455 if (next_set) 654 if (next_set) {
456 mod_timer(&q->timeout, round_jiffies_up(next)); 655 next = blk_rq_timeout(round_jiffies_up(next));
656 mod_timer(&q->timeout, next);
657 } else {
658 queue_for_each_hw_ctx(q, hctx, i)
659 blk_mq_tag_idle(hctx);
660 }
457} 661}
458 662
459/* 663/*
@@ -495,9 +699,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
495 return false; 699 return false;
496} 700}
497 701
498void blk_mq_add_timer(struct request *rq) 702/*
703 * Process software queues that have been marked busy, splicing them
 704 * to the for-dispatch list
705 */
706static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
499{ 707{
500 __blk_add_timer(rq, NULL); 708 struct blk_mq_ctx *ctx;
709 int i;
710
711 for (i = 0; i < hctx->ctx_map.map_size; i++) {
712 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
713 unsigned int off, bit;
714
715 if (!bm->word)
716 continue;
717
718 bit = 0;
719 off = i * hctx->ctx_map.bits_per_word;
720 do {
721 bit = find_next_bit(&bm->word, bm->depth, bit);
722 if (bit >= bm->depth)
723 break;
724
725 ctx = hctx->ctxs[bit + off];
726 clear_bit(bit, &bm->word);
727 spin_lock(&ctx->lock);
728 list_splice_tail_init(&ctx->rq_list, list);
729 spin_unlock(&ctx->lock);
730
731 bit++;
732 } while (1);
733 }
501} 734}
502 735
503/* 736/*
@@ -509,10 +742,11 @@ void blk_mq_add_timer(struct request *rq)
509static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 742static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
510{ 743{
511 struct request_queue *q = hctx->queue; 744 struct request_queue *q = hctx->queue;
512 struct blk_mq_ctx *ctx;
513 struct request *rq; 745 struct request *rq;
514 LIST_HEAD(rq_list); 746 LIST_HEAD(rq_list);
515 int bit, queued; 747 int queued;
748
749 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
516 750
517 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 751 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
518 return; 752 return;
@@ -522,15 +756,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
522 /* 756 /*
523 * Touch any software queue that has pending entries. 757 * Touch any software queue that has pending entries.
524 */ 758 */
525 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 759 flush_busy_ctxs(hctx, &rq_list);
526 clear_bit(bit, hctx->ctx_map);
527 ctx = hctx->ctxs[bit];
528 BUG_ON(bit != ctx->index_hw);
529
530 spin_lock(&ctx->lock);
531 list_splice_tail_init(&ctx->rq_list, &rq_list);
532 spin_unlock(&ctx->lock);
533 }
534 760
535 /* 761 /*
536 * If we have previous entries on our dispatch list, grab them 762 * If we have previous entries on our dispatch list, grab them
@@ -544,13 +770,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
544 } 770 }
545 771
546 /* 772 /*
547 * Delete and return all entries from our dispatch list
548 */
549 queued = 0;
550
551 /*
552 * Now process all the entries, sending them to the driver. 773 * Now process all the entries, sending them to the driver.
553 */ 774 */
775 queued = 0;
554 while (!list_empty(&rq_list)) { 776 while (!list_empty(&rq_list)) {
555 int ret; 777 int ret;
556 778
@@ -565,13 +787,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
565 queued++; 787 queued++;
566 continue; 788 continue;
567 case BLK_MQ_RQ_QUEUE_BUSY: 789 case BLK_MQ_RQ_QUEUE_BUSY:
568 /*
569 * FIXME: we should have a mechanism to stop the queue
570 * like blk_stop_queue, otherwise we will waste cpu
571 * time
572 */
573 list_add(&rq->queuelist, &rq_list); 790 list_add(&rq->queuelist, &rq_list);
574 blk_mq_requeue_request(rq); 791 __blk_mq_requeue_request(rq);
575 break; 792 break;
576 default: 793 default:
577 pr_err("blk-mq: bad return on queue: %d\n", ret); 794 pr_err("blk-mq: bad return on queue: %d\n", ret);
@@ -601,17 +818,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
601 } 818 }
602} 819}
603 820
821/*
822 * It'd be great if the workqueue API had a way to pass
823 * in a mask and had some smarts for more clever placement.
824 * For now we just round-robin here, switching for every
825 * BLK_MQ_CPU_WORK_BATCH queued items.
826 */
827static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
828{
829 int cpu = hctx->next_cpu;
830
831 if (--hctx->next_cpu_batch <= 0) {
832 int next_cpu;
833
834 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
835 if (next_cpu >= nr_cpu_ids)
836 next_cpu = cpumask_first(hctx->cpumask);
837
838 hctx->next_cpu = next_cpu;
839 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
840 }
841
842 return cpu;
843}
844
604void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 845void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
605{ 846{
606 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 847 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
607 return; 848 return;
608 849
609 if (!async) 850 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
610 __blk_mq_run_hw_queue(hctx); 851 __blk_mq_run_hw_queue(hctx);
852 else if (hctx->queue->nr_hw_queues == 1)
853 kblockd_schedule_delayed_work(&hctx->run_work, 0);
611 else { 854 else {
612 struct request_queue *q = hctx->queue; 855 unsigned int cpu;
613 856
614 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 857 cpu = blk_mq_hctx_next_cpu(hctx);
858 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
615 } 859 }
616} 860}
617 861
@@ -626,14 +870,17 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
626 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 870 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
627 continue; 871 continue;
628 872
873 preempt_disable();
629 blk_mq_run_hw_queue(hctx, async); 874 blk_mq_run_hw_queue(hctx, async);
875 preempt_enable();
630 } 876 }
631} 877}
632EXPORT_SYMBOL(blk_mq_run_queues); 878EXPORT_SYMBOL(blk_mq_run_queues);
633 879
634void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 880void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
635{ 881{
636 cancel_delayed_work(&hctx->delayed_work); 882 cancel_delayed_work(&hctx->run_work);
883 cancel_delayed_work(&hctx->delay_work);
637 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 884 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
638} 885}
639EXPORT_SYMBOL(blk_mq_stop_hw_queue); 886EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -651,11 +898,25 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues);
651void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 898void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
652{ 899{
653 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 900 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
901
902 preempt_disable();
654 __blk_mq_run_hw_queue(hctx); 903 __blk_mq_run_hw_queue(hctx);
904 preempt_enable();
655} 905}
656EXPORT_SYMBOL(blk_mq_start_hw_queue); 906EXPORT_SYMBOL(blk_mq_start_hw_queue);
657 907
658void blk_mq_start_stopped_hw_queues(struct request_queue *q) 908void blk_mq_start_hw_queues(struct request_queue *q)
909{
910 struct blk_mq_hw_ctx *hctx;
911 int i;
912
913 queue_for_each_hw_ctx(q, hctx, i)
914 blk_mq_start_hw_queue(hctx);
915}
916EXPORT_SYMBOL(blk_mq_start_hw_queues);
917
918
919void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
659{ 920{
660 struct blk_mq_hw_ctx *hctx; 921 struct blk_mq_hw_ctx *hctx;
661 int i; 922 int i;
@@ -665,19 +926,47 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
665 continue; 926 continue;
666 927
667 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 928 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
668 blk_mq_run_hw_queue(hctx, true); 929 preempt_disable();
930 blk_mq_run_hw_queue(hctx, async);
931 preempt_enable();
669 } 932 }
670} 933}
671EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 934EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
672 935
673static void blk_mq_work_fn(struct work_struct *work) 936static void blk_mq_run_work_fn(struct work_struct *work)
674{ 937{
675 struct blk_mq_hw_ctx *hctx; 938 struct blk_mq_hw_ctx *hctx;
676 939
677 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 940 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
941
678 __blk_mq_run_hw_queue(hctx); 942 __blk_mq_run_hw_queue(hctx);
679} 943}
680 944
945static void blk_mq_delay_work_fn(struct work_struct *work)
946{
947 struct blk_mq_hw_ctx *hctx;
948
949 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
950
951 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
952 __blk_mq_run_hw_queue(hctx);
953}
954
955void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
956{
957 unsigned long tmo = msecs_to_jiffies(msecs);
958
959 if (hctx->queue->nr_hw_queues == 1)
960 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
961 else {
962 unsigned int cpu;
963
964 cpu = blk_mq_hctx_next_cpu(hctx);
965 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
966 }
967}
968EXPORT_SYMBOL(blk_mq_delay_queue);
969
681static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 970static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
682 struct request *rq, bool at_head) 971 struct request *rq, bool at_head)
683{ 972{
@@ -689,12 +978,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
689 list_add(&rq->queuelist, &ctx->rq_list); 978 list_add(&rq->queuelist, &ctx->rq_list);
690 else 979 else
691 list_add_tail(&rq->queuelist, &ctx->rq_list); 980 list_add_tail(&rq->queuelist, &ctx->rq_list);
981
692 blk_mq_hctx_mark_pending(hctx, ctx); 982 blk_mq_hctx_mark_pending(hctx, ctx);
693 983
694 /* 984 /*
695 * We do this early, to ensure we are on the right CPU. 985 * We do this early, to ensure we are on the right CPU.
696 */ 986 */
697 blk_mq_add_timer(rq); 987 blk_add_timer(rq);
698} 988}
699 989
700void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 990void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
@@ -719,10 +1009,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
719 spin_unlock(&ctx->lock); 1009 spin_unlock(&ctx->lock);
720 } 1010 }
721 1011
722 blk_mq_put_ctx(current_ctx);
723
724 if (run_queue) 1012 if (run_queue)
725 blk_mq_run_hw_queue(hctx, async); 1013 blk_mq_run_hw_queue(hctx, async);
1014
1015 blk_mq_put_ctx(current_ctx);
726} 1016}
727 1017
728static void blk_mq_insert_requests(struct request_queue *q, 1018static void blk_mq_insert_requests(struct request_queue *q,
@@ -758,9 +1048,8 @@ static void blk_mq_insert_requests(struct request_queue *q,
758 } 1048 }
759 spin_unlock(&ctx->lock); 1049 spin_unlock(&ctx->lock);
760 1050
761 blk_mq_put_ctx(current_ctx);
762
763 blk_mq_run_hw_queue(hctx, from_schedule); 1051 blk_mq_run_hw_queue(hctx, from_schedule);
1052 blk_mq_put_ctx(current_ctx);
764} 1053}
765 1054
766static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1055static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -826,21 +1115,161 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
826 blk_account_io_start(rq, 1); 1115 blk_account_io_start(rq, 1);
827} 1116}
828 1117
829static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 1118static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1119 struct blk_mq_ctx *ctx,
1120 struct request *rq, struct bio *bio)
830{ 1121{
1122 struct request_queue *q = hctx->queue;
1123
1124 if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
1125 blk_mq_bio_to_request(rq, bio);
1126 spin_lock(&ctx->lock);
1127insert_rq:
1128 __blk_mq_insert_request(hctx, rq, false);
1129 spin_unlock(&ctx->lock);
1130 return false;
1131 } else {
1132 spin_lock(&ctx->lock);
1133 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1134 blk_mq_bio_to_request(rq, bio);
1135 goto insert_rq;
1136 }
1137
1138 spin_unlock(&ctx->lock);
1139 __blk_mq_free_request(hctx, ctx, rq);
1140 return true;
1141 }
1142}
1143
1144struct blk_map_ctx {
831 struct blk_mq_hw_ctx *hctx; 1145 struct blk_mq_hw_ctx *hctx;
832 struct blk_mq_ctx *ctx; 1146 struct blk_mq_ctx *ctx;
1147};
1148
1149static struct request *blk_mq_map_request(struct request_queue *q,
1150 struct bio *bio,
1151 struct blk_map_ctx *data)
1152{
1153 struct blk_mq_hw_ctx *hctx;
1154 struct blk_mq_ctx *ctx;
1155 struct request *rq;
1156 int rw = bio_data_dir(bio);
1157
1158 if (unlikely(blk_mq_queue_enter(q))) {
1159 bio_endio(bio, -EIO);
1160 return NULL;
1161 }
1162
1163 ctx = blk_mq_get_ctx(q);
1164 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1165
1166 if (rw_is_sync(bio->bi_rw))
1167 rw |= REQ_SYNC;
1168
1169 trace_block_getrq(q, bio, rw);
1170 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
1171 if (unlikely(!rq)) {
1172 __blk_mq_run_hw_queue(hctx);
1173 blk_mq_put_ctx(ctx);
1174 trace_block_sleeprq(q, bio, rw);
1175
1176 ctx = blk_mq_get_ctx(q);
1177 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1178 rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
1179 __GFP_WAIT|GFP_ATOMIC, false);
1180 }
1181
1182 hctx->queued++;
1183 data->hctx = hctx;
1184 data->ctx = ctx;
1185 return rq;
1186}
1187
1188/*
1189 * Multiple hardware queue variant. This will not use per-process plugs,
1190 * but will attempt to bypass the hctx queueing if we can go straight to
1191 * hardware for SYNC IO.
1192 */
1193static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1194{
833 const int is_sync = rw_is_sync(bio->bi_rw); 1195 const int is_sync = rw_is_sync(bio->bi_rw);
834 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1196 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
835 int rw = bio_data_dir(bio); 1197 struct blk_map_ctx data;
836 struct request *rq; 1198 struct request *rq;
1199
1200 blk_queue_bounce(q, &bio);
1201
1202 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1203 bio_endio(bio, -EIO);
1204 return;
1205 }
1206
1207 rq = blk_mq_map_request(q, bio, &data);
1208 if (unlikely(!rq))
1209 return;
1210
1211 if (unlikely(is_flush_fua)) {
1212 blk_mq_bio_to_request(rq, bio);
1213 blk_insert_flush(rq);
1214 goto run_queue;
1215 }
1216
1217 if (is_sync) {
1218 int ret;
1219
1220 blk_mq_bio_to_request(rq, bio);
1221 blk_mq_start_request(rq, true);
1222
1223 /*
1224 * For OK queue, we are done. For error, kill it. Any other
1225 * error (busy), just add it to our list as we previously
1226 * would have done
1227 */
1228 ret = q->mq_ops->queue_rq(data.hctx, rq);
1229 if (ret == BLK_MQ_RQ_QUEUE_OK)
1230 goto done;
1231 else {
1232 __blk_mq_requeue_request(rq);
1233
1234 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1235 rq->errors = -EIO;
1236 blk_mq_end_io(rq, rq->errors);
1237 goto done;
1238 }
1239 }
1240 }
1241
1242 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1243 /*
1244 * For a SYNC request, send it to the hardware immediately. For
1245 * an ASYNC request, just ensure that we run it later on. The
1246 * latter allows for merging opportunities and more efficient
1247 * dispatching.
1248 */
1249run_queue:
1250 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1251 }
1252done:
1253 blk_mq_put_ctx(data.ctx);
1254}
1255
1256/*
1257 * Single hardware queue variant. This will attempt to use any per-process
1258 * plug for merging and IO deferral.
1259 */
1260static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1261{
1262 const int is_sync = rw_is_sync(bio->bi_rw);
1263 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
837 unsigned int use_plug, request_count = 0; 1264 unsigned int use_plug, request_count = 0;
1265 struct blk_map_ctx data;
1266 struct request *rq;
838 1267
839 /* 1268 /*
840 * If we have multiple hardware queues, just go directly to 1269 * If we have multiple hardware queues, just go directly to
841 * one of those for sync IO. 1270 * one of those for sync IO.
842 */ 1271 */
843 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 1272 use_plug = !is_flush_fua && !is_sync;
844 1273
845 blk_queue_bounce(q, &bio); 1274 blk_queue_bounce(q, &bio);
846 1275
@@ -849,37 +1278,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
849 return; 1278 return;
850 } 1279 }
851 1280
852 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 1281 if (use_plug && !blk_queue_nomerges(q) &&
853 return; 1282 blk_attempt_plug_merge(q, bio, &request_count))
854
855 if (blk_mq_queue_enter(q)) {
856 bio_endio(bio, -EIO);
857 return; 1283 return;
858 }
859 1284
860 ctx = blk_mq_get_ctx(q); 1285 rq = blk_mq_map_request(q, bio, &data);
861 hctx = q->mq_ops->map_queue(q, ctx->cpu);
862
863 if (is_sync)
864 rw |= REQ_SYNC;
865 trace_block_getrq(q, bio, rw);
866 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
867 if (likely(rq))
868 blk_mq_rq_ctx_init(q, ctx, rq, rw);
869 else {
870 blk_mq_put_ctx(ctx);
871 trace_block_sleeprq(q, bio, rw);
872 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
873 false);
874 ctx = rq->mq_ctx;
875 hctx = q->mq_ops->map_queue(q, ctx->cpu);
876 }
877
878 hctx->queued++;
879 1286
880 if (unlikely(is_flush_fua)) { 1287 if (unlikely(is_flush_fua)) {
881 blk_mq_bio_to_request(rq, bio); 1288 blk_mq_bio_to_request(rq, bio);
882 blk_mq_put_ctx(ctx);
883 blk_insert_flush(rq); 1289 blk_insert_flush(rq);
884 goto run_queue; 1290 goto run_queue;
885 } 1291 }
@@ -901,31 +1307,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
901 trace_block_plug(q); 1307 trace_block_plug(q);
902 } 1308 }
903 list_add_tail(&rq->queuelist, &plug->mq_list); 1309 list_add_tail(&rq->queuelist, &plug->mq_list);
904 blk_mq_put_ctx(ctx); 1310 blk_mq_put_ctx(data.ctx);
905 return; 1311 return;
906 } 1312 }
907 } 1313 }
908 1314
909 spin_lock(&ctx->lock); 1315 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
910 1316 /*
911 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1317 * For a SYNC request, send it to the hardware immediately. For
912 blk_mq_attempt_merge(q, ctx, bio)) 1318 * an ASYNC request, just ensure that we run it later on. The
913 __blk_mq_free_request(hctx, ctx, rq); 1319 * latter allows for merging opportunities and more efficient
914 else { 1320 * dispatching.
915 blk_mq_bio_to_request(rq, bio); 1321 */
916 __blk_mq_insert_request(hctx, rq, false); 1322run_queue:
1323 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
917 } 1324 }
918 1325
919 spin_unlock(&ctx->lock); 1326 blk_mq_put_ctx(data.ctx);
920 blk_mq_put_ctx(ctx);
921
922 /*
923 * For a SYNC request, send it to the hardware immediately. For an
924 * ASYNC request, just ensure that we run it later on. The latter
925 * allows for merging opportunities and more efficient dispatching.
926 */
927run_queue:
928 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
929} 1327}
930 1328
931/* 1329/*
@@ -937,32 +1335,153 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
937} 1335}
938EXPORT_SYMBOL(blk_mq_map_queue); 1336EXPORT_SYMBOL(blk_mq_map_queue);
939 1337
940struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 1338static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
941 unsigned int hctx_index) 1339 struct blk_mq_tags *tags, unsigned int hctx_idx)
942{ 1340{
943 return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 1341 struct page *page;
944 GFP_KERNEL | __GFP_ZERO, reg->numa_node); 1342
1343 if (tags->rqs && set->ops->exit_request) {
1344 int i;
1345
1346 for (i = 0; i < tags->nr_tags; i++) {
1347 if (!tags->rqs[i])
1348 continue;
1349 set->ops->exit_request(set->driver_data, tags->rqs[i],
1350 hctx_idx, i);
1351 }
1352 }
1353
1354 while (!list_empty(&tags->page_list)) {
1355 page = list_first_entry(&tags->page_list, struct page, lru);
1356 list_del_init(&page->lru);
1357 __free_pages(page, page->private);
1358 }
1359
1360 kfree(tags->rqs);
1361
1362 blk_mq_free_tags(tags);
945} 1363}
946EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
947 1364
948void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 1365static size_t order_to_size(unsigned int order)
949 unsigned int hctx_index)
950{ 1366{
951 kfree(hctx); 1367 return (size_t)PAGE_SIZE << order;
952} 1368}
953EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
954 1369
955static void blk_mq_hctx_notify(void *data, unsigned long action, 1370static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
956 unsigned int cpu) 1371 unsigned int hctx_idx)
1372{
1373 struct blk_mq_tags *tags;
1374 unsigned int i, j, entries_per_page, max_order = 4;
1375 size_t rq_size, left;
1376
1377 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1378 set->numa_node);
1379 if (!tags)
1380 return NULL;
1381
1382 INIT_LIST_HEAD(&tags->page_list);
1383
1384 tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
1385 GFP_KERNEL, set->numa_node);
1386 if (!tags->rqs) {
1387 blk_mq_free_tags(tags);
1388 return NULL;
1389 }
1390
1391 /*
1392 * rq_size is the size of the request plus driver payload, rounded
1393 * to the cacheline size
1394 */
1395 rq_size = round_up(sizeof(struct request) + set->cmd_size,
1396 cache_line_size());
1397 left = rq_size * set->queue_depth;
1398
1399 for (i = 0; i < set->queue_depth; ) {
1400 int this_order = max_order;
1401 struct page *page;
1402 int to_do;
1403 void *p;
1404
1405 while (left < order_to_size(this_order - 1) && this_order)
1406 this_order--;
1407
1408 do {
1409 page = alloc_pages_node(set->numa_node, GFP_KERNEL,
1410 this_order);
1411 if (page)
1412 break;
1413 if (!this_order--)
1414 break;
1415 if (order_to_size(this_order) < rq_size)
1416 break;
1417 } while (1);
1418
1419 if (!page)
1420 goto fail;
1421
1422 page->private = this_order;
1423 list_add_tail(&page->lru, &tags->page_list);
1424
1425 p = page_address(page);
1426 entries_per_page = order_to_size(this_order) / rq_size;
1427 to_do = min(entries_per_page, set->queue_depth - i);
1428 left -= to_do * rq_size;
1429 for (j = 0; j < to_do; j++) {
1430 tags->rqs[i] = p;
1431 if (set->ops->init_request) {
1432 if (set->ops->init_request(set->driver_data,
1433 tags->rqs[i], hctx_idx, i,
1434 set->numa_node))
1435 goto fail;
1436 }
1437
1438 p += rq_size;
1439 i++;
1440 }
1441 }
1442
1443 return tags;
1444
1445fail:
1446 pr_warn("%s: failed to allocate requests\n", __func__);
1447 blk_mq_free_rq_map(set, tags, hctx_idx);
1448 return NULL;
1449}
1450
1451static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1452{
1453 kfree(bitmap->map);
1454}
1455
1456static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1457{
1458 unsigned int bpw = 8, total, num_maps, i;
1459
1460 bitmap->bits_per_word = bpw;
1461
1462 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1463 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1464 GFP_KERNEL, node);
1465 if (!bitmap->map)
1466 return -ENOMEM;
1467
1468 bitmap->map_size = num_maps;
1469
1470 total = nr_cpu_ids;
1471 for (i = 0; i < num_maps; i++) {
1472 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1473 total -= bitmap->map[i].depth;
1474 }
1475
1476 return 0;
1477}
1478
1479static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
957{ 1480{
958 struct blk_mq_hw_ctx *hctx = data;
959 struct request_queue *q = hctx->queue; 1481 struct request_queue *q = hctx->queue;
960 struct blk_mq_ctx *ctx; 1482 struct blk_mq_ctx *ctx;
961 LIST_HEAD(tmp); 1483 LIST_HEAD(tmp);
962 1484
963 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
964 return;
965
966 /* 1485 /*
967 * Move ctx entries to new CPU, if this one is going away. 1486 * Move ctx entries to new CPU, if this one is going away.
968 */ 1487 */
@@ -971,12 +1490,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
971 spin_lock(&ctx->lock); 1490 spin_lock(&ctx->lock);
972 if (!list_empty(&ctx->rq_list)) { 1491 if (!list_empty(&ctx->rq_list)) {
973 list_splice_init(&ctx->rq_list, &tmp); 1492 list_splice_init(&ctx->rq_list, &tmp);
974 clear_bit(ctx->index_hw, hctx->ctx_map); 1493 blk_mq_hctx_clear_pending(hctx, ctx);
975 } 1494 }
976 spin_unlock(&ctx->lock); 1495 spin_unlock(&ctx->lock);
977 1496
978 if (list_empty(&tmp)) 1497 if (list_empty(&tmp))
979 return; 1498 return NOTIFY_OK;
980 1499
981 ctx = blk_mq_get_ctx(q); 1500 ctx = blk_mq_get_ctx(q);
982 spin_lock(&ctx->lock); 1501 spin_lock(&ctx->lock);
@@ -993,210 +1512,103 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
993 blk_mq_hctx_mark_pending(hctx, ctx); 1512 blk_mq_hctx_mark_pending(hctx, ctx);
994 1513
995 spin_unlock(&ctx->lock); 1514 spin_unlock(&ctx->lock);
996 blk_mq_put_ctx(ctx);
997 1515
998 blk_mq_run_hw_queue(hctx, true); 1516 blk_mq_run_hw_queue(hctx, true);
1517 blk_mq_put_ctx(ctx);
1518 return NOTIFY_OK;
999} 1519}
1000 1520
1001static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1521static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1002 int (*init)(void *, struct blk_mq_hw_ctx *,
1003 struct request *, unsigned int),
1004 void *data)
1005{ 1522{
1006 unsigned int i; 1523 struct request_queue *q = hctx->queue;
1007 int ret = 0; 1524 struct blk_mq_tag_set *set = q->tag_set;
1008
1009 for (i = 0; i < hctx->queue_depth; i++) {
1010 struct request *rq = hctx->rqs[i];
1011
1012 ret = init(data, hctx, rq, i);
1013 if (ret)
1014 break;
1015 }
1016
1017 return ret;
1018}
1019 1525
1020int blk_mq_init_commands(struct request_queue *q, 1526 if (set->tags[hctx->queue_num])
1021 int (*init)(void *, struct blk_mq_hw_ctx *, 1527 return NOTIFY_OK;
1022 struct request *, unsigned int),
1023 void *data)
1024{
1025 struct blk_mq_hw_ctx *hctx;
1026 unsigned int i;
1027 int ret = 0;
1028 1528
1029 queue_for_each_hw_ctx(q, hctx, i) { 1529 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1030 ret = blk_mq_init_hw_commands(hctx, init, data); 1530 if (!set->tags[hctx->queue_num])
1031 if (ret) 1531 return NOTIFY_STOP;
1032 break;
1033 }
1034 1532
1035 return ret; 1533 hctx->tags = set->tags[hctx->queue_num];
1534 return NOTIFY_OK;
1036} 1535}
1037EXPORT_SYMBOL(blk_mq_init_commands);
1038 1536
1039static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx, 1537static int blk_mq_hctx_notify(void *data, unsigned long action,
1040 void (*free)(void *, struct blk_mq_hw_ctx *, 1538 unsigned int cpu)
1041 struct request *, unsigned int),
1042 void *data)
1043{ 1539{
1044 unsigned int i; 1540 struct blk_mq_hw_ctx *hctx = data;
1045 1541
1046 for (i = 0; i < hctx->queue_depth; i++) { 1542 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1047 struct request *rq = hctx->rqs[i]; 1543 return blk_mq_hctx_cpu_offline(hctx, cpu);
1544 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1545 return blk_mq_hctx_cpu_online(hctx, cpu);
1048 1546
1049 free(data, hctx, rq, i); 1547 return NOTIFY_OK;
1050 }
1051} 1548}
1052 1549
1053void blk_mq_free_commands(struct request_queue *q, 1550static void blk_mq_exit_hw_queues(struct request_queue *q,
1054 void (*free)(void *, struct blk_mq_hw_ctx *, 1551 struct blk_mq_tag_set *set, int nr_queue)
1055 struct request *, unsigned int),
1056 void *data)
1057{ 1552{
1058 struct blk_mq_hw_ctx *hctx; 1553 struct blk_mq_hw_ctx *hctx;
1059 unsigned int i; 1554 unsigned int i;
1060 1555
1061 queue_for_each_hw_ctx(q, hctx, i) 1556 queue_for_each_hw_ctx(q, hctx, i) {
1062 blk_mq_free_hw_commands(hctx, free, data); 1557 if (i == nr_queue)
1063} 1558 break;
1064EXPORT_SYMBOL(blk_mq_free_commands);
1065 1559
1066static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1560 if (set->ops->exit_hctx)
1067{ 1561 set->ops->exit_hctx(hctx, i);
1068 struct page *page;
1069 1562
1070 while (!list_empty(&hctx->page_list)) { 1563 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1071 page = list_first_entry(&hctx->page_list, struct page, lru); 1564 kfree(hctx->ctxs);
1072 list_del_init(&page->lru); 1565 blk_mq_free_bitmap(&hctx->ctx_map);
1073 __free_pages(page, page->private);
1074 } 1566 }
1075 1567
1076 kfree(hctx->rqs);
1077
1078 if (hctx->tags)
1079 blk_mq_free_tags(hctx->tags);
1080}
1081
1082static size_t order_to_size(unsigned int order)
1083{
1084 size_t ret = PAGE_SIZE;
1085
1086 while (order--)
1087 ret *= 2;
1088
1089 return ret;
1090} 1568}
1091 1569
1092static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1570static void blk_mq_free_hw_queues(struct request_queue *q,
1093 unsigned int reserved_tags, int node) 1571 struct blk_mq_tag_set *set)
1094{ 1572{
1095 unsigned int i, j, entries_per_page, max_order = 4; 1573 struct blk_mq_hw_ctx *hctx;
1096 size_t rq_size, left; 1574 unsigned int i;
1097
1098 INIT_LIST_HEAD(&hctx->page_list);
1099
1100 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1101 GFP_KERNEL, node);
1102 if (!hctx->rqs)
1103 return -ENOMEM;
1104
1105 /*
1106 * rq_size is the size of the request plus driver payload, rounded
1107 * to the cacheline size
1108 */
1109 rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1110 cache_line_size());
1111 left = rq_size * hctx->queue_depth;
1112
1113 for (i = 0; i < hctx->queue_depth;) {
1114 int this_order = max_order;
1115 struct page *page;
1116 int to_do;
1117 void *p;
1118
1119 while (left < order_to_size(this_order - 1) && this_order)
1120 this_order--;
1121
1122 do {
1123 page = alloc_pages_node(node, GFP_KERNEL, this_order);
1124 if (page)
1125 break;
1126 if (!this_order--)
1127 break;
1128 if (order_to_size(this_order) < rq_size)
1129 break;
1130 } while (1);
1131
1132 if (!page)
1133 break;
1134
1135 page->private = this_order;
1136 list_add_tail(&page->lru, &hctx->page_list);
1137
1138 p = page_address(page);
1139 entries_per_page = order_to_size(this_order) / rq_size;
1140 to_do = min(entries_per_page, hctx->queue_depth - i);
1141 left -= to_do * rq_size;
1142 for (j = 0; j < to_do; j++) {
1143 hctx->rqs[i] = p;
1144 blk_mq_rq_init(hctx, hctx->rqs[i]);
1145 p += rq_size;
1146 i++;
1147 }
1148 }
1149
1150 if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1151 goto err_rq_map;
1152 else if (i != hctx->queue_depth) {
1153 hctx->queue_depth = i;
1154 pr_warn("%s: queue depth set to %u because of low memory\n",
1155 __func__, i);
1156 }
1157 1575
1158 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1576 queue_for_each_hw_ctx(q, hctx, i) {
1159 if (!hctx->tags) { 1577 free_cpumask_var(hctx->cpumask);
1160err_rq_map: 1578 kfree(hctx);
1161 blk_mq_free_rq_map(hctx);
1162 return -ENOMEM;
1163 } 1579 }
1164
1165 return 0;
1166} 1580}
1167 1581
1168static int blk_mq_init_hw_queues(struct request_queue *q, 1582static int blk_mq_init_hw_queues(struct request_queue *q,
1169 struct blk_mq_reg *reg, void *driver_data) 1583 struct blk_mq_tag_set *set)
1170{ 1584{
1171 struct blk_mq_hw_ctx *hctx; 1585 struct blk_mq_hw_ctx *hctx;
1172 unsigned int i, j; 1586 unsigned int i;
1173 1587
1174 /* 1588 /*
1175 * Initialize hardware queues 1589 * Initialize hardware queues
1176 */ 1590 */
1177 queue_for_each_hw_ctx(q, hctx, i) { 1591 queue_for_each_hw_ctx(q, hctx, i) {
1178 unsigned int num_maps;
1179 int node; 1592 int node;
1180 1593
1181 node = hctx->numa_node; 1594 node = hctx->numa_node;
1182 if (node == NUMA_NO_NODE) 1595 if (node == NUMA_NO_NODE)
1183 node = hctx->numa_node = reg->numa_node; 1596 node = hctx->numa_node = set->numa_node;
1184 1597
1185 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1598 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1599 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1186 spin_lock_init(&hctx->lock); 1600 spin_lock_init(&hctx->lock);
1187 INIT_LIST_HEAD(&hctx->dispatch); 1601 INIT_LIST_HEAD(&hctx->dispatch);
1188 hctx->queue = q; 1602 hctx->queue = q;
1189 hctx->queue_num = i; 1603 hctx->queue_num = i;
1190 hctx->flags = reg->flags; 1604 hctx->flags = set->flags;
1191 hctx->queue_depth = reg->queue_depth; 1605 hctx->cmd_size = set->cmd_size;
1192 hctx->cmd_size = reg->cmd_size;
1193 1606
1194 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1607 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1195 blk_mq_hctx_notify, hctx); 1608 blk_mq_hctx_notify, hctx);
1196 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1609 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1197 1610
1198 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1611 hctx->tags = set->tags[i];
1199 break;
1200 1612
1201 /* 1613 /*
1202 * Allocate space for all possible cpus to avoid allocation in 1614 * Allocate space for all possible cpus to avoid allocation in
@@ -1207,17 +1619,13 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1207 if (!hctx->ctxs) 1619 if (!hctx->ctxs)
1208 break; 1620 break;
1209 1621
1210 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1622 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1211 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1212 GFP_KERNEL, node);
1213 if (!hctx->ctx_map)
1214 break; 1623 break;
1215 1624
1216 hctx->nr_ctx_map = num_maps;
1217 hctx->nr_ctx = 0; 1625 hctx->nr_ctx = 0;
1218 1626
1219 if (reg->ops->init_hctx && 1627 if (set->ops->init_hctx &&
1220 reg->ops->init_hctx(hctx, driver_data, i)) 1628 set->ops->init_hctx(hctx, set->driver_data, i))
1221 break; 1629 break;
1222 } 1630 }
1223 1631
@@ -1227,17 +1635,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1227 /* 1635 /*
1228 * Init failed 1636 * Init failed
1229 */ 1637 */
1230 queue_for_each_hw_ctx(q, hctx, j) { 1638 blk_mq_exit_hw_queues(q, set, i);
1231 if (i == j)
1232 break;
1233
1234 if (reg->ops->exit_hctx)
1235 reg->ops->exit_hctx(hctx, j);
1236
1237 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1238 blk_mq_free_rq_map(hctx);
1239 kfree(hctx->ctxs);
1240 }
1241 1639
1242 return 1; 1640 return 1;
1243} 1641}
@@ -1258,12 +1656,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
1258 __ctx->queue = q; 1656 __ctx->queue = q;
1259 1657
1260 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1658 /* If the cpu isn't online, the cpu is mapped to first hctx */
1261 hctx = q->mq_ops->map_queue(q, i);
1262 hctx->nr_ctx++;
1263
1264 if (!cpu_online(i)) 1659 if (!cpu_online(i))
1265 continue; 1660 continue;
1266 1661
1662 hctx = q->mq_ops->map_queue(q, i);
1663 cpumask_set_cpu(i, hctx->cpumask);
1664 hctx->nr_ctx++;
1665
1267 /* 1666 /*
1268 * Set local node, IFF we have more than one hw queue. If 1667 * Set local node, IFF we have more than one hw queue. If
1269 * not, we remain on the home node of the device 1668 * not, we remain on the home node of the device
@@ -1280,6 +1679,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1280 struct blk_mq_ctx *ctx; 1679 struct blk_mq_ctx *ctx;
1281 1680
1282 queue_for_each_hw_ctx(q, hctx, i) { 1681 queue_for_each_hw_ctx(q, hctx, i) {
1682 cpumask_clear(hctx->cpumask);
1283 hctx->nr_ctx = 0; 1683 hctx->nr_ctx = 0;
1284 } 1684 }
1285 1685
@@ -1288,115 +1688,205 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1288 */ 1688 */
1289 queue_for_each_ctx(q, ctx, i) { 1689 queue_for_each_ctx(q, ctx, i) {
1290 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1690 /* If the cpu isn't online, the cpu is mapped to first hctx */
1691 if (!cpu_online(i))
1692 continue;
1693
1291 hctx = q->mq_ops->map_queue(q, i); 1694 hctx = q->mq_ops->map_queue(q, i);
1695 cpumask_set_cpu(i, hctx->cpumask);
1292 ctx->index_hw = hctx->nr_ctx; 1696 ctx->index_hw = hctx->nr_ctx;
1293 hctx->ctxs[hctx->nr_ctx++] = ctx; 1697 hctx->ctxs[hctx->nr_ctx++] = ctx;
1294 } 1698 }
1699
1700 queue_for_each_hw_ctx(q, hctx, i) {
1701 /*
 1702 * If no software queues are mapped to this hardware queue,
1703 * disable it and free the request entries
1704 */
1705 if (!hctx->nr_ctx) {
1706 struct blk_mq_tag_set *set = q->tag_set;
1707
1708 if (set->tags[i]) {
1709 blk_mq_free_rq_map(set, set->tags[i], i);
1710 set->tags[i] = NULL;
1711 hctx->tags = NULL;
1712 }
1713 continue;
1714 }
1715
1716 /*
1717 * Initialize batch roundrobin counts
1718 */
1719 hctx->next_cpu = cpumask_first(hctx->cpumask);
1720 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1721 }
1295} 1722}
1296 1723
1297struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1724static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1298 void *driver_data)
1299{ 1725{
1300 struct blk_mq_hw_ctx **hctxs; 1726 struct blk_mq_hw_ctx *hctx;
1301 struct blk_mq_ctx *ctx;
1302 struct request_queue *q; 1727 struct request_queue *q;
1728 bool shared;
1303 int i; 1729 int i;
1304 1730
1305 if (!reg->nr_hw_queues || 1731 if (set->tag_list.next == set->tag_list.prev)
1306 !reg->ops->queue_rq || !reg->ops->map_queue || 1732 shared = false;
1307 !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1733 else
1308 return ERR_PTR(-EINVAL); 1734 shared = true;
1309 1735
1310 if (!reg->queue_depth) 1736 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1311 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1737 blk_mq_freeze_queue(q);
1312 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1738
1313 pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1739 queue_for_each_hw_ctx(q, hctx, i) {
1314 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1740 if (shared)
1741 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1742 else
1743 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1744 }
1745 blk_mq_unfreeze_queue(q);
1315 } 1746 }
1747}
1748
1749static void blk_mq_del_queue_tag_set(struct request_queue *q)
1750{
1751 struct blk_mq_tag_set *set = q->tag_set;
1752
1753 blk_mq_freeze_queue(q);
1754
1755 mutex_lock(&set->tag_list_lock);
1756 list_del_init(&q->tag_set_list);
1757 blk_mq_update_tag_set_depth(set);
1758 mutex_unlock(&set->tag_list_lock);
1759
1760 blk_mq_unfreeze_queue(q);
1761}
1762
1763static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1764 struct request_queue *q)
1765{
1766 q->tag_set = set;
1767
1768 mutex_lock(&set->tag_list_lock);
1769 list_add_tail(&q->tag_set_list, &set->tag_list);
1770 blk_mq_update_tag_set_depth(set);
1771 mutex_unlock(&set->tag_list_lock);
1772}
1316 1773
1317 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1774struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1318 return ERR_PTR(-EINVAL); 1775{
1776 struct blk_mq_hw_ctx **hctxs;
1777 struct blk_mq_ctx *ctx;
1778 struct request_queue *q;
1779 unsigned int *map;
1780 int i;
1319 1781
1320 ctx = alloc_percpu(struct blk_mq_ctx); 1782 ctx = alloc_percpu(struct blk_mq_ctx);
1321 if (!ctx) 1783 if (!ctx)
1322 return ERR_PTR(-ENOMEM); 1784 return ERR_PTR(-ENOMEM);
1323 1785
1324 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1786 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1325 reg->numa_node); 1787 set->numa_node);
1326 1788
1327 if (!hctxs) 1789 if (!hctxs)
1328 goto err_percpu; 1790 goto err_percpu;
1329 1791
1330 for (i = 0; i < reg->nr_hw_queues; i++) { 1792 map = blk_mq_make_queue_map(set);
1331 hctxs[i] = reg->ops->alloc_hctx(reg, i); 1793 if (!map)
1794 goto err_map;
1795
1796 for (i = 0; i < set->nr_hw_queues; i++) {
1797 int node = blk_mq_hw_queue_to_node(map, i);
1798
1799 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1800 GFP_KERNEL, node);
1332 if (!hctxs[i]) 1801 if (!hctxs[i])
1333 goto err_hctxs; 1802 goto err_hctxs;
1334 1803
1335 hctxs[i]->numa_node = NUMA_NO_NODE; 1804 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
1805 goto err_hctxs;
1806
1807 atomic_set(&hctxs[i]->nr_active, 0);
1808 hctxs[i]->numa_node = node;
1336 hctxs[i]->queue_num = i; 1809 hctxs[i]->queue_num = i;
1337 } 1810 }
1338 1811
1339 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1812 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1340 if (!q) 1813 if (!q)
1341 goto err_hctxs; 1814 goto err_hctxs;
1342 1815
1343 q->mq_map = blk_mq_make_queue_map(reg); 1816 if (percpu_counter_init(&q->mq_usage_counter, 0))
1344 if (!q->mq_map)
1345 goto err_map; 1817 goto err_map;
1346 1818
1347 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1819 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1348 blk_queue_rq_timeout(q, 30000); 1820 blk_queue_rq_timeout(q, 30000);
1349 1821
1350 q->nr_queues = nr_cpu_ids; 1822 q->nr_queues = nr_cpu_ids;
1351 q->nr_hw_queues = reg->nr_hw_queues; 1823 q->nr_hw_queues = set->nr_hw_queues;
1824 q->mq_map = map;
1352 1825
1353 q->queue_ctx = ctx; 1826 q->queue_ctx = ctx;
1354 q->queue_hw_ctx = hctxs; 1827 q->queue_hw_ctx = hctxs;
1355 1828
1356 q->mq_ops = reg->ops; 1829 q->mq_ops = set->ops;
1357 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1830 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1358 1831
1359 q->sg_reserved_size = INT_MAX; 1832 q->sg_reserved_size = INT_MAX;
1360 1833
1361 blk_queue_make_request(q, blk_mq_make_request); 1834 INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1362 blk_queue_rq_timed_out(q, reg->ops->timeout); 1835 INIT_LIST_HEAD(&q->requeue_list);
1363 if (reg->timeout) 1836 spin_lock_init(&q->requeue_lock);
1364 blk_queue_rq_timeout(q, reg->timeout); 1837
1838 if (q->nr_hw_queues > 1)
1839 blk_queue_make_request(q, blk_mq_make_request);
1840 else
1841 blk_queue_make_request(q, blk_sq_make_request);
1842
1843 blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
1844 if (set->timeout)
1845 blk_queue_rq_timeout(q, set->timeout);
1365 1846
1366 if (reg->ops->complete) 1847 /*
1367 blk_queue_softirq_done(q, reg->ops->complete); 1848 * Do this after blk_queue_make_request() overrides it...
1849 */
1850 q->nr_requests = set->queue_depth;
1851
1852 if (set->ops->complete)
1853 blk_queue_softirq_done(q, set->ops->complete);
1368 1854
1369 blk_mq_init_flush(q); 1855 blk_mq_init_flush(q);
1370 blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1856 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1371 1857
1372 q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, 1858 q->flush_rq = kzalloc(round_up(sizeof(struct request) +
1373 cache_line_size()), GFP_KERNEL); 1859 set->cmd_size, cache_line_size()),
1860 GFP_KERNEL);
1374 if (!q->flush_rq) 1861 if (!q->flush_rq)
1375 goto err_hw; 1862 goto err_hw;
1376 1863
1377 if (blk_mq_init_hw_queues(q, reg, driver_data)) 1864 if (blk_mq_init_hw_queues(q, set))
1378 goto err_flush_rq; 1865 goto err_flush_rq;
1379 1866
1380 blk_mq_map_swqueue(q);
1381
1382 mutex_lock(&all_q_mutex); 1867 mutex_lock(&all_q_mutex);
1383 list_add_tail(&q->all_q_node, &all_q_list); 1868 list_add_tail(&q->all_q_node, &all_q_list);
1384 mutex_unlock(&all_q_mutex); 1869 mutex_unlock(&all_q_mutex);
1385 1870
1871 blk_mq_add_queue_tag_set(set, q);
1872
1873 blk_mq_map_swqueue(q);
1874
1386 return q; 1875 return q;
1387 1876
1388err_flush_rq: 1877err_flush_rq:
1389 kfree(q->flush_rq); 1878 kfree(q->flush_rq);
1390err_hw: 1879err_hw:
1391 kfree(q->mq_map);
1392err_map:
1393 blk_cleanup_queue(q); 1880 blk_cleanup_queue(q);
1394err_hctxs: 1881err_hctxs:
1395 for (i = 0; i < reg->nr_hw_queues; i++) { 1882 kfree(map);
1883 for (i = 0; i < set->nr_hw_queues; i++) {
1396 if (!hctxs[i]) 1884 if (!hctxs[i])
1397 break; 1885 break;
1398 reg->ops->free_hctx(hctxs[i], i); 1886 free_cpumask_var(hctxs[i]->cpumask);
1887 kfree(hctxs[i]);
1399 } 1888 }
1889err_map:
1400 kfree(hctxs); 1890 kfree(hctxs);
1401err_percpu: 1891err_percpu:
1402 free_percpu(ctx); 1892 free_percpu(ctx);
@@ -1406,18 +1896,14 @@ EXPORT_SYMBOL(blk_mq_init_queue);
1406 1896
1407void blk_mq_free_queue(struct request_queue *q) 1897void blk_mq_free_queue(struct request_queue *q)
1408{ 1898{
1409 struct blk_mq_hw_ctx *hctx; 1899 struct blk_mq_tag_set *set = q->tag_set;
1410 int i;
1411 1900
1412 queue_for_each_hw_ctx(q, hctx, i) { 1901 blk_mq_del_queue_tag_set(q);
1413 kfree(hctx->ctx_map); 1902
1414 kfree(hctx->ctxs); 1903 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1415 blk_mq_free_rq_map(hctx); 1904 blk_mq_free_hw_queues(q, set);
1416 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1905
1417 if (q->mq_ops->exit_hctx) 1906 percpu_counter_destroy(&q->mq_usage_counter);
1418 q->mq_ops->exit_hctx(hctx, i);
1419 q->mq_ops->free_hctx(hctx, i);
1420 }
1421 1907
1422 free_percpu(q->queue_ctx); 1908 free_percpu(q->queue_ctx);
1423 kfree(q->queue_hw_ctx); 1909 kfree(q->queue_hw_ctx);
@@ -1456,10 +1942,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1456 struct request_queue *q; 1942 struct request_queue *q;
1457 1943
1458 /* 1944 /*
1459 * Before new mapping is established, hotadded cpu might already start 1945 * Before new mappings are established, hotadded cpu might already
1460 * handling requests. This doesn't break anything as we map offline 1946 * start handling requests. This doesn't break anything as we map
1461 * CPUs to first hardware queue. We will re-init queue below to get 1947 * offline CPUs to first hardware queue. We will re-init the queue
1462 * optimal settings. 1948 * below to get optimal settings.
1463 */ 1949 */
1464 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1950 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1465 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1951 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
@@ -1472,6 +1958,81 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1472 return NOTIFY_OK; 1958 return NOTIFY_OK;
1473} 1959}
1474 1960
1961int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
1962{
1963 int i;
1964
1965 if (!set->nr_hw_queues)
1966 return -EINVAL;
1967 if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
1968 return -EINVAL;
1969 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
1970 return -EINVAL;
1971
1972 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
1973 return -EINVAL;
1974
1975
1976 set->tags = kmalloc_node(set->nr_hw_queues *
1977 sizeof(struct blk_mq_tags *),
1978 GFP_KERNEL, set->numa_node);
1979 if (!set->tags)
1980 goto out;
1981
1982 for (i = 0; i < set->nr_hw_queues; i++) {
1983 set->tags[i] = blk_mq_init_rq_map(set, i);
1984 if (!set->tags[i])
1985 goto out_unwind;
1986 }
1987
1988 mutex_init(&set->tag_list_lock);
1989 INIT_LIST_HEAD(&set->tag_list);
1990
1991 return 0;
1992
1993out_unwind:
1994 while (--i >= 0)
1995 blk_mq_free_rq_map(set, set->tags[i], i);
1996out:
1997 return -ENOMEM;
1998}
1999EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2000
2001void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2002{
2003 int i;
2004
2005 for (i = 0; i < set->nr_hw_queues; i++) {
2006 if (set->tags[i])
2007 blk_mq_free_rq_map(set, set->tags[i], i);
2008 }
2009
2010 kfree(set->tags);
2011}
2012EXPORT_SYMBOL(blk_mq_free_tag_set);
2013
2014int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2015{
2016 struct blk_mq_tag_set *set = q->tag_set;
2017 struct blk_mq_hw_ctx *hctx;
2018 int i, ret;
2019
2020 if (!set || nr > set->queue_depth)
2021 return -EINVAL;
2022
2023 ret = 0;
2024 queue_for_each_hw_ctx(q, hctx, i) {
2025 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2026 if (ret)
2027 break;
2028 }
2029
2030 if (!ret)
2031 q->nr_requests = nr;
2032
2033 return ret;
2034}
2035
1475void blk_mq_disable_hotplug(void) 2036void blk_mq_disable_hotplug(void)
1476{ 2037{
1477 mutex_lock(&all_q_mutex); 2038 mutex_lock(&all_q_mutex);
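Note: for drivers converting to the tag-set interface introduced above, the expected call order is to populate a struct blk_mq_tag_set, register it with blk_mq_alloc_tag_set(), build the queue with blk_mq_init_queue(), and tear everything down in reverse. A minimal sketch follows; the mydrv names, callbacks and per-command struct are placeholders and not part of this commit, and blk_mq_map_queue is the stock mapping helper drivers of this era typically pass as .map_queue.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/* mydrv_queue_rq()/mydrv_complete() and struct mydrv{,_cmd} are
 * hypothetical; only the blk-mq calls below come from this commit. */
static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq  = mydrv_queue_rq,
	.map_queue = blk_mq_map_queue,		/* default CPU -> hctx mapping */
	.complete  = mydrv_complete,
};

static int mydrv_init_queue(struct mydrv *drv)
{
	struct request_queue *q;
	int ret;

	memset(&drv->tag_set, 0, sizeof(drv->tag_set));
	drv->tag_set.ops          = &mydrv_mq_ops;
	drv->tag_set.nr_hw_queues = 1;
	drv->tag_set.queue_depth  = 64;		/* must stay <= BLK_MQ_MAX_DEPTH */
	drv->tag_set.numa_node    = NUMA_NO_NODE;
	drv->tag_set.cmd_size     = sizeof(struct mydrv_cmd);
	drv->tag_set.timeout      = 30 * HZ;

	ret = blk_mq_alloc_tag_set(&drv->tag_set);	/* allocates per-hctx tags/requests */
	if (ret)
		return ret;

	q = blk_mq_init_queue(&drv->tag_set);		/* builds the queue on the shared set */
	if (IS_ERR(q)) {
		blk_mq_free_tag_set(&drv->tag_set);
		return PTR_ERR(q);
	}
	drv->queue = q;
	return 0;
}

static void mydrv_free_queue(struct mydrv *drv)
{
	blk_cleanup_queue(drv->queue);		/* ends up in blk_mq_free_queue() */
	blk_mq_free_tag_set(&drv->tag_set);
}
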
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ebbe6bac9d61..ff5e6bf0f691 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
1#ifndef INT_BLK_MQ_H 1#ifndef INT_BLK_MQ_H
2#define INT_BLK_MQ_H 2#define INT_BLK_MQ_H
3 3
4struct blk_mq_tag_set;
5
4struct blk_mq_ctx { 6struct blk_mq_ctx {
5 struct { 7 struct {
6 spinlock_t lock; 8 spinlock_t lock;
@@ -9,7 +11,8 @@ struct blk_mq_ctx {
9 11
10 unsigned int cpu; 12 unsigned int cpu;
11 unsigned int index_hw; 13 unsigned int index_hw;
12 unsigned int ipi_redirect; 14
15 unsigned int last_tag ____cacheline_aligned_in_smp;
13 16
14 /* incremented at dispatch time */ 17 /* incremented at dispatch time */
15 unsigned long rq_dispatched[2]; 18 unsigned long rq_dispatched[2];
@@ -20,21 +23,23 @@ struct blk_mq_ctx {
20 23
21 struct request_queue *queue; 24 struct request_queue *queue;
22 struct kobject kobj; 25 struct kobject kobj;
23}; 26} ____cacheline_aligned_in_smp;
24 27
25void __blk_mq_complete_request(struct request *rq); 28void __blk_mq_complete_request(struct request *rq);
26void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 29void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
27void blk_mq_init_flush(struct request_queue *q); 30void blk_mq_init_flush(struct request_queue *q);
28void blk_mq_drain_queue(struct request_queue *q); 31void blk_mq_drain_queue(struct request_queue *q);
29void blk_mq_free_queue(struct request_queue *q); 32void blk_mq_free_queue(struct request_queue *q);
30void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); 33void blk_mq_clone_flush_request(struct request *flush_rq,
34 struct request *orig_rq);
35int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
31 36
32/* 37/*
33 * CPU hotplug helpers 38 * CPU hotplug helpers
34 */ 39 */
35struct blk_mq_cpu_notifier; 40struct blk_mq_cpu_notifier;
36void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 41void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
37 void (*fn)(void *, unsigned long, unsigned int), 42 int (*fn)(void *, unsigned long, unsigned int),
38 void *data); 43 void *data);
39void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 44void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
40void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 45void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
@@ -45,10 +50,17 @@ void blk_mq_disable_hotplug(void);
45/* 50/*
46 * CPU -> queue mappings 51 * CPU -> queue mappings
47 */ 52 */
48struct blk_mq_reg; 53extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
49extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
50extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); 54extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
55extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
51 56
52void blk_mq_add_timer(struct request *rq); 57/*
58 * Basic implementation of sparser bitmap, allowing the user to spread
59 * the bits over more cachelines.
60 */
61struct blk_align_bitmap {
62 unsigned long word;
63 unsigned long depth;
64} ____cacheline_aligned_in_smp;
53 65
54#endif 66#endif
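Note: the blk_align_bitmap declared at the end of this header is just a word/depth pair padded to a full cacheline, so an array of them spreads consecutive groups of bits across cachelines instead of packing them densely. A rough illustration of the addressing such a layout implies; the way blk-mq actually sizes and splits its ctx_map is not shown in this hunk, so treat the per-word math as an assumption.

/* Illustration only: bit 'nr' lives in word nr / BITS_PER_LONG, and each
 * word sits on its own cacheline thanks to ____cacheline_aligned_in_smp.
 * 'depth' records how many bits of that word are actually in use. */
static inline void sparse_bitmap_set(struct blk_align_bitmap *map,
				     unsigned int nr)
{
	struct blk_align_bitmap *bm = &map[nr / BITS_PER_LONG];

	set_bit(nr % BITS_PER_LONG, &bm->word);
}
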
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7500f876dae4..23321fbab293 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
48static ssize_t 48static ssize_t
49queue_requests_store(struct request_queue *q, const char *page, size_t count) 49queue_requests_store(struct request_queue *q, const char *page, size_t count)
50{ 50{
51 struct request_list *rl;
52 unsigned long nr; 51 unsigned long nr;
53 int ret; 52 int ret, err;
54 53
55 if (!q->request_fn) 54 if (!q->request_fn && !q->mq_ops)
56 return -EINVAL; 55 return -EINVAL;
57 56
58 ret = queue_var_store(&nr, page, count); 57 ret = queue_var_store(&nr, page, count);
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
62 if (nr < BLKDEV_MIN_RQ) 61 if (nr < BLKDEV_MIN_RQ)
63 nr = BLKDEV_MIN_RQ; 62 nr = BLKDEV_MIN_RQ;
64 63
65 spin_lock_irq(q->queue_lock); 64 if (q->request_fn)
66 q->nr_requests = nr; 65 err = blk_update_nr_requests(q, nr);
67 blk_queue_congestion_threshold(q); 66 else
68 67 err = blk_mq_update_nr_requests(q, nr);
69 /* congestion isn't cgroup aware and follows root blkcg for now */ 68
70 rl = &q->root_rl; 69 if (err)
71 70 return err;
72 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
73 blk_set_queue_congested(q, BLK_RW_SYNC);
74 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
75 blk_clear_queue_congested(q, BLK_RW_SYNC);
76
77 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
78 blk_set_queue_congested(q, BLK_RW_ASYNC);
79 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
80 blk_clear_queue_congested(q, BLK_RW_ASYNC);
81
82 blk_queue_for_each_rl(rl, q) {
83 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
84 blk_set_rl_full(rl, BLK_RW_SYNC);
85 } else {
86 blk_clear_rl_full(rl, BLK_RW_SYNC);
87 wake_up(&rl->wait[BLK_RW_SYNC]);
88 }
89
90 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
91 blk_set_rl_full(rl, BLK_RW_ASYNC);
92 } else {
93 blk_clear_rl_full(rl, BLK_RW_ASYNC);
94 wake_up(&rl->wait[BLK_RW_ASYNC]);
95 }
96 }
97 71
98 spin_unlock_irq(q->queue_lock);
99 return ret; 72 return ret;
100} 73}
101 74
@@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj)
544 if (q->queue_tags) 517 if (q->queue_tags)
545 __blk_queue_free_tags(q); 518 __blk_queue_free_tags(q);
546 519
547 percpu_counter_destroy(&q->mq_usage_counter);
548
549 if (q->mq_ops) 520 if (q->mq_ops)
550 blk_mq_free_queue(q); 521 blk_mq_free_queue(q);
551 522
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 033745cd7fba..9353b4683359 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
744static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 744static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
745{ 745{
746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
747 return 0; 747 return false;
748 748
749 return 1; 749 return 1;
750} 750}
@@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
842 if (tg->io_disp[rw] + 1 <= io_allowed) { 842 if (tg->io_disp[rw] + 1 <= io_allowed) {
843 if (wait) 843 if (wait)
844 *wait = 0; 844 *wait = 0;
845 return 1; 845 return true;
846 } 846 }
847 847
848 /* Calc approx time to dispatch */ 848 /* Calc approx time to dispatch */
@@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { 880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
881 if (wait) 881 if (wait)
882 *wait = 0; 882 *wait = 0;
883 return 1; 883 return true;
884 } 884 }
885 885
886 /* Calc approx time to dispatch */ 886 /* Calc approx time to dispatch */
@@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
924 if (wait) 924 if (wait)
925 *wait = 0; 925 *wait = 0;
926 return 1; 926 return true;
927 } 927 }
928 928
929 /* 929 /*
@@ -1258,7 +1258,7 @@ out_unlock:
1258 * of throtl_data->service_queue. Those bio's are ready and issued by this 1258 * of throtl_data->service_queue. Those bio's are ready and issued by this
1259 * function. 1259 * function.
1260 */ 1260 */
1261void blk_throtl_dispatch_work_fn(struct work_struct *work) 1261static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1262{ 1262{
1263 struct throtl_data *td = container_of(work, struct throtl_data, 1263 struct throtl_data *td = container_of(work, struct throtl_data,
1264 dispatch_work); 1264 dispatch_work);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index d96f7061c6fd..43e8b515806f 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req)
96 __blk_complete_request(req); 96 __blk_complete_request(req);
97 break; 97 break;
98 case BLK_EH_RESET_TIMER: 98 case BLK_EH_RESET_TIMER:
99 if (q->mq_ops) 99 blk_add_timer(req);
100 blk_mq_add_timer(req);
101 else
102 blk_add_timer(req);
103
104 blk_clear_rq_complete(req); 100 blk_clear_rq_complete(req);
105 break; 101 break;
106 case BLK_EH_NOT_HANDLED: 102 case BLK_EH_NOT_HANDLED:
@@ -170,7 +166,26 @@ void blk_abort_request(struct request *req)
170} 166}
171EXPORT_SYMBOL_GPL(blk_abort_request); 167EXPORT_SYMBOL_GPL(blk_abort_request);
172 168
173void __blk_add_timer(struct request *req, struct list_head *timeout_list) 169unsigned long blk_rq_timeout(unsigned long timeout)
170{
171 unsigned long maxt;
172
173 maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
174 if (time_after(timeout, maxt))
175 timeout = maxt;
176
177 return timeout;
178}
179
180/**
181 * blk_add_timer - Start timeout timer for a single request
182 * @req: request that is about to start running.
183 *
184 * Notes:
185 * Each request has its own timer, and as it is added to the queue, we
186 * set up the timer. When the request completes, we cancel the timer.
187 */
188void blk_add_timer(struct request *req)
174{ 189{
175 struct request_queue *q = req->q; 190 struct request_queue *q = req->q;
176 unsigned long expiry; 191 unsigned long expiry;
@@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list)
188 req->timeout = q->rq_timeout; 203 req->timeout = q->rq_timeout;
189 204
190 req->deadline = jiffies + req->timeout; 205 req->deadline = jiffies + req->timeout;
191 if (timeout_list) 206 if (!q->mq_ops)
192 list_add_tail(&req->timeout_list, timeout_list); 207 list_add_tail(&req->timeout_list, &req->q->timeout_list);
193 208
194 /* 209 /*
195 * If the timer isn't already pending or this timeout is earlier 210 * If the timer isn't already pending or this timeout is earlier
196 * than an existing one, modify the timer. Round up to next nearest 211 * than an existing one, modify the timer. Round up to next nearest
197 * second. 212 * second.
198 */ 213 */
199 expiry = round_jiffies_up(req->deadline); 214 expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
200 215
201 if (!timer_pending(&q->timeout) || 216 if (!timer_pending(&q->timeout) ||
202 time_before(expiry, q->timeout.expires)) 217 time_before(expiry, q->timeout.expires)) {
203 mod_timer(&q->timeout, expiry); 218 unsigned long diff = q->timeout.expires - expiry;
204 219
205} 220 /*
221 * Due to added timer slack to group timers, the timer
222 * will often be a little in front of what we asked for.
223 * So apply some tolerance here too, otherwise we keep
224 * modifying the timer because expires for value X
225 * will be X + something.
226 */
227 if (diff >= HZ / 2)
228 mod_timer(&q->timeout, expiry);
229 }
206 230
207/**
208 * blk_add_timer - Start timeout timer for a single request
209 * @req: request that is about to start running.
210 *
211 * Notes:
212 * Each request has its own timer, and as it is added to the queue, we
213 * set up the timer. When the request completes, we cancel the timer.
214 */
215void blk_add_timer(struct request *req)
216{
217 __blk_add_timer(req, &req->q->timeout_list);
218} 231}
219
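Note: to make the reworked blk_add_timer() concrete (numbers assume HZ = 1000): a request queued at jiffies = 10000 with the default 30 s timeout gets deadline = 40000, but blk_rq_timeout() clamps the timer target to round_jiffies_up(10000 + 5 * HZ), roughly 15000, so q->timeout never fires more than about BLK_MAX_TIMEOUT ahead and outstanding deadlines are simply re-checked when it does. The HZ/2 tolerance then keeps the timer from being re-armed on every submission: if q->timeout is already pending at, say, 15200 because timer slack pushed it slightly past an earlier target, a new expiry of 15000 differs by only 200 jiffies, which is below HZ/2, so mod_timer() is skipped.
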
diff --git a/block/blk.h b/block/blk.h
index 1d880f1f957f..45385e9abf6f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -9,6 +9,9 @@
9/* Number of requests a "batching" process may submit */ 9/* Number of requests a "batching" process may submit */
10#define BLK_BATCH_REQ 32 10#define BLK_BATCH_REQ 32
11 11
12/* Max future timer expiry for timeouts */
13#define BLK_MAX_TIMEOUT (5 * HZ)
14
12extern struct kmem_cache *blk_requestq_cachep; 15extern struct kmem_cache *blk_requestq_cachep;
13extern struct kmem_cache *request_cachep; 16extern struct kmem_cache *request_cachep;
14extern struct kobj_type blk_queue_ktype; 17extern struct kobj_type blk_queue_ktype;
@@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error,
37void blk_rq_timed_out_timer(unsigned long data); 40void blk_rq_timed_out_timer(unsigned long data);
38void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 41void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
39 unsigned int *next_set); 42 unsigned int *next_set);
40void __blk_add_timer(struct request *req, struct list_head *timeout_list); 43unsigned long blk_rq_timeout(unsigned long timeout);
44void blk_add_timer(struct request *req);
41void blk_delete_timer(struct request *); 45void blk_delete_timer(struct request *);
42void blk_add_timer(struct request *);
43 46
44 47
45bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 48bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
@@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
185 return q->nr_congestion_off; 188 return q->nr_congestion_off;
186} 189}
187 190
191extern int blk_update_nr_requests(struct request_queue *, unsigned int);
192
188/* 193/*
189 * Contribute to IO statistics IFF: 194 * Contribute to IO statistics IFF:
190 * 195 *
diff --git a/mm/bounce.c b/block/bounce.c
index 523918b8c6dc..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/block/bounce.c
diff --git a/block/bsg.c b/block/bsg.c
index 420a5a9f1b23..e5214c148096 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -1008,7 +1008,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
1008 /* 1008 /*
1009 * we need a proper transport to send commands, not a stacked device 1009 * we need a proper transport to send commands, not a stacked device
1010 */ 1010 */
1011 if (!q->request_fn) 1011 if (!queue_is_rq_based(q))
1012 return 0; 1012 return 0;
1013 1013
1014 bcd = &q->bsg_dev; 1014 bcd = &q->bsg_dev;
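Note: the bsg change above means transport queues driven by blk-mq now register a bsg node too, instead of only legacy ->request_fn queues. queue_is_rq_based() itself is not part of this hunk; in this series it is a one-line predicate in blkdev.h along the lines of:

/* Paraphrased helper (lives in include/linux/blkdev.h, not in this hunk):
 * a queue counts as request based if it has either a legacy request_fn
 * or blk-mq ops, as opposed to a bio-based (stacking) make_request queue. */
static inline bool queue_is_rq_based(struct request_queue *q)
{
	return q->request_fn || q->mq_ops;
}
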
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e0985f1955e7..22dffebc7c73 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
908{ 908{
909 if (cfqd->busy_queues) { 909 if (cfqd->busy_queues) {
910 cfq_log(cfqd, "schedule dispatch"); 910 cfq_log(cfqd, "schedule dispatch");
911 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 911 kblockd_schedule_work(&cfqd->unplug_work);
912 } 912 }
913} 913}
914 914
@@ -4460,7 +4460,7 @@ out_free:
4460static ssize_t 4460static ssize_t
4461cfq_var_show(unsigned int var, char *page) 4461cfq_var_show(unsigned int var, char *page)
4462{ 4462{
4463 return sprintf(page, "%d\n", var); 4463 return sprintf(page, "%u\n", var);
4464} 4464}
4465 4465
4466static ssize_t 4466static ssize_t
diff --git a/fs/ioprio.c b/block/ioprio.c
index e50170ca7c33..e50170ca7c33 100644
--- a/fs/ioprio.c
+++ b/block/ioprio.c
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 748dea4f34dc..758da2287d9a 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1406,7 +1406,7 @@ next_segment:
1406 1406
1407 track = block / (floppy->dtype->sects * floppy->type->sect_mult); 1407 track = block / (floppy->dtype->sects * floppy->type->sect_mult);
1408 sector = block % (floppy->dtype->sects * floppy->type->sect_mult); 1408 sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
1409 data = rq->buffer + 512 * cnt; 1409 data = bio_data(rq->bio) + 512 * cnt;
1410#ifdef DEBUG 1410#ifdef DEBUG
1411 printk("access to track %d, sector %d, with buffer at " 1411 printk("access to track %d, sector %d, with buffer at "
1412 "0x%08lx\n", track, sector, data); 1412 "0x%08lx\n", track, sector, data);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 96b629e1f0c9..7e8a55f8917c 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1484,7 +1484,7 @@ repeat:
1484 ReqCnt = 0; 1484 ReqCnt = 0;
1485 ReqCmd = rq_data_dir(fd_request); 1485 ReqCmd = rq_data_dir(fd_request);
1486 ReqBlock = blk_rq_pos(fd_request); 1486 ReqBlock = blk_rq_pos(fd_request);
1487 ReqBuffer = fd_request->buffer; 1487 ReqBuffer = bio_data(fd_request->bio);
1488 setup_req_params( drive ); 1488 setup_req_params( drive );
1489 do_fd_action( drive ); 1489 do_fd_action( drive );
1490 1490
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 73894ca33956..4595c22f33f7 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -4080,7 +4080,7 @@ static void cciss_interrupt_mode(ctlr_info_t *h)
4080 goto default_int_mode; 4080 goto default_int_mode;
4081 4081
4082 if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) { 4082 if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) {
4083 err = pci_enable_msix(h->pdev, cciss_msix_entries, 4); 4083 err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4);
4084 if (!err) { 4084 if (!err) {
4085 h->intr[0] = cciss_msix_entries[0].vector; 4085 h->intr[0] = cciss_msix_entries[0].vector;
4086 h->intr[1] = cciss_msix_entries[1].vector; 4086 h->intr[1] = cciss_msix_entries[1].vector;
@@ -4088,10 +4088,6 @@ static void cciss_interrupt_mode(ctlr_info_t *h)
4088 h->intr[3] = cciss_msix_entries[3].vector; 4088 h->intr[3] = cciss_msix_entries[3].vector;
4089 h->msix_vector = 1; 4089 h->msix_vector = 1;
4090 return; 4090 return;
4091 }
4092 if (err > 0) {
4093 dev_warn(&h->pdev->dev,
4094 "only %d MSI-X vectors available\n", err);
4095 } else { 4091 } else {
4096 dev_warn(&h->pdev->dev, 4092 dev_warn(&h->pdev->dev,
4097 "MSI-X init failed %d\n", err); 4093 "MSI-X init failed %d\n", err);
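Note: the old "only %d MSI-X vectors available" branch can be dropped in the cciss hunk because pci_enable_msix_exact(), unlike pci_enable_msix(), never succeeds partially: it either enables exactly the requested number of vectors and returns 0, or fails with a negative errno. The call site therefore reduces to a success path and a single failure warning:

/* Sketch of the resulting pattern; variable names follow the hunk above. */
err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4);
if (err < 0)			/* never a positive "N available" count */
	dev_warn(&h->pdev->dev, "MSI-X init failed %d\n", err);
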
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 90ae4ba8f9ee..05a1780ffa85 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -29,7 +29,6 @@
29#include <linux/drbd_limits.h> 29#include <linux/drbd_limits.h>
30#include <linux/dynamic_debug.h> 30#include <linux/dynamic_debug.h>
31#include "drbd_int.h" 31#include "drbd_int.h"
32#include "drbd_wrappers.h"
33 32
34 33
35enum al_transaction_types { 34enum al_transaction_types {
@@ -204,7 +203,7 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
204 203
205 BUG_ON(!bdev->md_bdev); 204 BUG_ON(!bdev->md_bdev);
206 205
207 drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", 206 dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
208 current->comm, current->pid, __func__, 207 current->comm, current->pid, __func__,
209 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", 208 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
210 (void*)_RET_IP_ ); 209 (void*)_RET_IP_ );
@@ -276,7 +275,6 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval
276 return _al_get(device, first, true); 275 return _al_get(device, first, true);
277} 276}
278 277
279static
280bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) 278bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
281{ 279{
282 /* for bios crossing activity log extent boundaries, 280 /* for bios crossing activity log extent boundaries,
@@ -846,7 +844,7 @@ void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
846 int wake_up = 0; 844 int wake_up = 0;
847 unsigned long flags; 845 unsigned long flags;
848 846
849 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 847 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
850 drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 848 drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
851 (unsigned long long)sector, size); 849 (unsigned long long)sector, size);
852 return; 850 return;
@@ -920,7 +918,7 @@ int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size
920 if (size == 0) 918 if (size == 0)
921 return 0; 919 return 0;
922 920
923 if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 921 if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
924 drbd_err(device, "sector: %llus, size: %d\n", 922 drbd_err(device, "sector: %llus, size: %d\n",
925 (unsigned long long)sector, size); 923 (unsigned long long)sector, size);
926 return 0; 924 return 0;
@@ -1023,8 +1021,7 @@ int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
1023 unsigned int enr = BM_SECT_TO_EXT(sector); 1021 unsigned int enr = BM_SECT_TO_EXT(sector);
1024 struct bm_extent *bm_ext; 1022 struct bm_extent *bm_ext;
1025 int i, sig; 1023 int i, sig;
1026 int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. 1024 bool sa;
1027 200 times -> 20 seconds. */
1028 1025
1029retry: 1026retry:
1030 sig = wait_event_interruptible(device->al_wait, 1027 sig = wait_event_interruptible(device->al_wait,
@@ -1035,12 +1032,15 @@ retry:
1035 if (test_bit(BME_LOCKED, &bm_ext->flags)) 1032 if (test_bit(BME_LOCKED, &bm_ext->flags))
1036 return 0; 1033 return 0;
1037 1034
1035 /* step aside only while we are above c-min-rate; unless disabled. */
1036 sa = drbd_rs_c_min_rate_throttle(device);
1037
1038 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 1038 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
1039 sig = wait_event_interruptible(device->al_wait, 1039 sig = wait_event_interruptible(device->al_wait,
1040 !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || 1040 !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
1041 test_bit(BME_PRIORITY, &bm_ext->flags)); 1041 (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));
1042 1042
1043 if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { 1043 if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
1044 spin_lock_irq(&device->al_lock); 1044 spin_lock_irq(&device->al_lock);
1045 if (lc_put(device->resync, &bm_ext->lce) == 0) { 1045 if (lc_put(device->resync, &bm_ext->lce) == 0) {
1046 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ 1046 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
@@ -1052,9 +1052,6 @@ retry:
1052 return -EINTR; 1052 return -EINTR;
1053 if (schedule_timeout_interruptible(HZ/10)) 1053 if (schedule_timeout_interruptible(HZ/10))
1054 return -EINTR; 1054 return -EINTR;
1055 if (sa && --sa == 0)
1056 drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec."
1057 "Resync stalled?\n");
1058 goto retry; 1055 goto retry;
1059 } 1056 }
1060 } 1057 }
@@ -1288,7 +1285,7 @@ void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
1288 sector_t esector, nr_sectors; 1285 sector_t esector, nr_sectors;
1289 int wake_up = 0; 1286 int wake_up = 0;
1290 1287
1291 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 1288 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
1292 drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", 1289 drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
1293 (unsigned long long)sector, size); 1290 (unsigned long long)sector, size);
1294 return; 1291 return;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e7093d4291f1..a76ceb344d64 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -382,6 +382,12 @@ enum {
382 __EE_CALL_AL_COMPLETE_IO, 382 __EE_CALL_AL_COMPLETE_IO,
383 __EE_MAY_SET_IN_SYNC, 383 __EE_MAY_SET_IN_SYNC,
384 384
385 /* is this a TRIM aka REQ_DISCARD? */
386 __EE_IS_TRIM,
387 /* our lower level cannot handle trim,
388 * and we want to fall back to zeroout instead */
389 __EE_IS_TRIM_USE_ZEROOUT,
390
385 /* In case a barrier failed, 391 /* In case a barrier failed,
386 * we need to resubmit without the barrier flag. */ 392 * we need to resubmit without the barrier flag. */
387 __EE_RESUBMITTED, 393 __EE_RESUBMITTED,
@@ -405,7 +411,9 @@ enum {
405}; 411};
406#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) 412#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
407#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) 413#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
408#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) 414#define EE_IS_TRIM (1<<__EE_IS_TRIM)
415#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
416#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
409#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) 417#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
410#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) 418#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
411#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) 419#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
@@ -579,6 +587,7 @@ struct drbd_resource {
579 struct list_head resources; 587 struct list_head resources;
580 struct res_opts res_opts; 588 struct res_opts res_opts;
581 struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ 589 struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
590 struct mutex adm_mutex; /* mutex to serialize administrative requests */
582 spinlock_t req_lock; 591 spinlock_t req_lock;
583 592
584 unsigned susp:1; /* IO suspended by user */ 593 unsigned susp:1; /* IO suspended by user */
@@ -609,6 +618,7 @@ struct drbd_connection {
609 struct drbd_socket data; /* data/barrier/cstate/parameter packets */ 618 struct drbd_socket data; /* data/barrier/cstate/parameter packets */
610 struct drbd_socket meta; /* ping/ack (metadata) packets */ 619 struct drbd_socket meta; /* ping/ack (metadata) packets */
611 int agreed_pro_version; /* actually used protocol version */ 620 int agreed_pro_version; /* actually used protocol version */
621 u32 agreed_features;
612 unsigned long last_received; /* in jiffies, either socket */ 622 unsigned long last_received; /* in jiffies, either socket */
613 unsigned int ko_count; 623 unsigned int ko_count;
614 624
@@ -814,6 +824,28 @@ struct drbd_device {
814 struct submit_worker submit; 824 struct submit_worker submit;
815}; 825};
816 826
827struct drbd_config_context {
828 /* assigned from drbd_genlmsghdr */
829 unsigned int minor;
830 /* assigned from request attributes, if present */
831 unsigned int volume;
832#define VOLUME_UNSPECIFIED (-1U)
833 /* pointer into the request skb,
834 * limited lifetime! */
835 char *resource_name;
836 struct nlattr *my_addr;
837 struct nlattr *peer_addr;
838
839 /* reply buffer */
840 struct sk_buff *reply_skb;
841 /* pointer into reply buffer */
842 struct drbd_genlmsghdr *reply_dh;
843 /* resolved from attributes, if possible */
844 struct drbd_device *device;
845 struct drbd_resource *resource;
846 struct drbd_connection *connection;
847};
848
817static inline struct drbd_device *minor_to_device(unsigned int minor) 849static inline struct drbd_device *minor_to_device(unsigned int minor)
818{ 850{
819 return (struct drbd_device *)idr_find(&drbd_devices, minor); 851 return (struct drbd_device *)idr_find(&drbd_devices, minor);
@@ -821,7 +853,7 @@ static inline struct drbd_device *minor_to_device(unsigned int minor)
821 853
822static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device) 854static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
823{ 855{
824 return list_first_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); 856 return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
825} 857}
826 858
827#define for_each_resource(resource, _resources) \ 859#define for_each_resource(resource, _resources) \
@@ -1139,6 +1171,12 @@ struct bm_extent {
1139#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ 1171#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
1140#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ 1172#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
1141 1173
1174/* For now, don't allow more than one activity log extent worth of data
1175 * to be discarded in one go. We may need to rework drbd_al_begin_io()
1176 * to allow for even larger discard ranges */
1177#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE
1178#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
1179
1142extern int drbd_bm_init(struct drbd_device *device); 1180extern int drbd_bm_init(struct drbd_device *device);
1143extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); 1181extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
1144extern void drbd_bm_cleanup(struct drbd_device *device); 1182extern void drbd_bm_cleanup(struct drbd_device *device);
@@ -1229,9 +1267,9 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
1229extern rwlock_t global_state_lock; 1267extern rwlock_t global_state_lock;
1230 1268
1231extern int conn_lowest_minor(struct drbd_connection *connection); 1269extern int conn_lowest_minor(struct drbd_connection *connection);
1232enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr); 1270extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
1233extern void drbd_destroy_device(struct kref *kref); 1271extern void drbd_destroy_device(struct kref *kref);
1234extern void drbd_delete_device(struct drbd_device *mdev); 1272extern void drbd_delete_device(struct drbd_device *device);
1235 1273
1236extern struct drbd_resource *drbd_create_resource(const char *name); 1274extern struct drbd_resource *drbd_create_resource(const char *name);
1237extern void drbd_free_resource(struct drbd_resource *resource); 1275extern void drbd_free_resource(struct drbd_resource *resource);
@@ -1257,7 +1295,7 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
1257 1295
1258 1296
1259/* drbd_nl.c */ 1297/* drbd_nl.c */
1260extern int drbd_msg_put_info(const char *info); 1298extern int drbd_msg_put_info(struct sk_buff *skb, const char *info);
1261extern void drbd_suspend_io(struct drbd_device *device); 1299extern void drbd_suspend_io(struct drbd_device *device);
1262extern void drbd_resume_io(struct drbd_device *device); 1300extern void drbd_resume_io(struct drbd_device *device);
1263extern char *ppsize(char *buf, unsigned long long size); 1301extern char *ppsize(char *buf, unsigned long long size);
@@ -1283,6 +1321,10 @@ extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
1283extern int drbd_khelper(struct drbd_device *device, char *cmd); 1321extern int drbd_khelper(struct drbd_device *device, char *cmd);
1284 1322
1285/* drbd_worker.c */ 1323/* drbd_worker.c */
1324/* bi_end_io handlers */
1325extern void drbd_md_io_complete(struct bio *bio, int error);
1326extern void drbd_peer_request_endio(struct bio *bio, int error);
1327extern void drbd_request_endio(struct bio *bio, int error);
1286extern int drbd_worker(struct drbd_thread *thi); 1328extern int drbd_worker(struct drbd_thread *thi);
1287enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); 1329enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
1288void drbd_resync_after_changed(struct drbd_device *device); 1330void drbd_resync_after_changed(struct drbd_device *device);
@@ -1332,16 +1374,20 @@ extern int w_start_resync(struct drbd_work *, int);
1332extern void resync_timer_fn(unsigned long data); 1374extern void resync_timer_fn(unsigned long data);
1333extern void start_resync_timer_fn(unsigned long data); 1375extern void start_resync_timer_fn(unsigned long data);
1334 1376
1377extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
1378
1335/* drbd_receiver.c */ 1379/* drbd_receiver.c */
1336extern int drbd_receiver(struct drbd_thread *thi); 1380extern int drbd_receiver(struct drbd_thread *thi);
1337extern int drbd_asender(struct drbd_thread *thi); 1381extern int drbd_asender(struct drbd_thread *thi);
1338extern int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector); 1382extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
1383extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector);
1339extern int drbd_submit_peer_request(struct drbd_device *, 1384extern int drbd_submit_peer_request(struct drbd_device *,
1340 struct drbd_peer_request *, const unsigned, 1385 struct drbd_peer_request *, const unsigned,
1341 const int); 1386 const int);
1342extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); 1387extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
1343extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, 1388extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
1344 sector_t, unsigned int, 1389 sector_t, unsigned int,
1390 bool,
1345 gfp_t) __must_hold(local); 1391 gfp_t) __must_hold(local);
1346extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, 1392extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
1347 int); 1393 int);
@@ -1401,6 +1447,37 @@ static inline void drbd_tcp_quickack(struct socket *sock)
1401 (char*)&val, sizeof(val)); 1447 (char*)&val, sizeof(val));
1402} 1448}
1403 1449
1450/* sets the number of 512 byte sectors of our virtual device */
1451static inline void drbd_set_my_capacity(struct drbd_device *device,
1452 sector_t size)
1453{
1454 /* set_capacity(device->this_bdev->bd_disk, size); */
1455 set_capacity(device->vdisk, size);
1456 device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
1457}
1458
1459/*
1460 * used to submit our private bio
1461 */
1462static inline void drbd_generic_make_request(struct drbd_device *device,
1463 int fault_type, struct bio *bio)
1464{
1465 __release(local);
1466 if (!bio->bi_bdev) {
1467 printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
1468 "bio->bi_bdev == NULL\n",
1469 device_to_minor(device));
1470 dump_stack();
1471 bio_endio(bio, -ENODEV);
1472 return;
1473 }
1474
1475 if (drbd_insert_fault(device, fault_type))
1476 bio_endio(bio, -EIO);
1477 else
1478 generic_make_request(bio);
1479}
1480
1404void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo); 1481void drbd_bump_write_ordering(struct drbd_connection *connection, enum write_ordering_e wo);
1405 1482
1406/* drbd_proc.c */ 1483/* drbd_proc.c */
@@ -1410,6 +1487,7 @@ extern const char *drbd_conn_str(enum drbd_conns s);
1410extern const char *drbd_role_str(enum drbd_role s); 1487extern const char *drbd_role_str(enum drbd_role s);
1411 1488
1412/* drbd_actlog.c */ 1489/* drbd_actlog.c */
1490extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
1413extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); 1491extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
1414extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate); 1492extern void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate);
1415extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); 1493extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
@@ -2144,7 +2222,7 @@ static inline void drbd_md_flush(struct drbd_device *device)
2144 2222
2145static inline struct drbd_connection *first_connection(struct drbd_resource *resource) 2223static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
2146{ 2224{
2147 return list_first_entry(&resource->connections, 2225 return list_first_entry_or_null(&resource->connections,
2148 struct drbd_connection, connections); 2226 struct drbd_connection, connections);
2149} 2227}
2150 2228
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 331e5cc1227d..960645c26e6f 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1607,8 +1607,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long b
1607 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; 1607 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
1608} 1608}
1609 1609
1610/* Used to send write requests 1610/* Used to send write or TRIM aka REQ_DISCARD requests
1611 * R_PRIMARY -> Peer (P_DATA) 1611 * R_PRIMARY -> Peer (P_DATA, P_TRIM)
1612 */ 1612 */
1613int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) 1613int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
1614{ 1614{
@@ -1640,6 +1640,16 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
1640 dp_flags |= DP_SEND_WRITE_ACK; 1640 dp_flags |= DP_SEND_WRITE_ACK;
1641 } 1641 }
1642 p->dp_flags = cpu_to_be32(dp_flags); 1642 p->dp_flags = cpu_to_be32(dp_flags);
1643
1644 if (dp_flags & DP_DISCARD) {
1645 struct p_trim *t = (struct p_trim*)p;
1646 t->size = cpu_to_be32(req->i.size);
1647 err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
1648 goto out;
1649 }
1650
1651 /* our digest is still only over the payload.
1652 * TRIM does not carry any payload. */
1643 if (dgs) 1653 if (dgs)
1644 drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); 1654 drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
1645 err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size); 1655 err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
@@ -1675,6 +1685,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
1675 ... Be noisy about digest too large ... 1685 ... Be noisy about digest too large ...
1676 } */ 1686 } */
1677 } 1687 }
1688out:
1678 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ 1689 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
1679 1690
1680 return err; 1691 return err;
@@ -2570,6 +2581,7 @@ struct drbd_resource *drbd_create_resource(const char *name)
2570 INIT_LIST_HEAD(&resource->connections); 2581 INIT_LIST_HEAD(&resource->connections);
2571 list_add_tail_rcu(&resource->resources, &drbd_resources); 2582 list_add_tail_rcu(&resource->resources, &drbd_resources);
2572 mutex_init(&resource->conf_update); 2583 mutex_init(&resource->conf_update);
2584 mutex_init(&resource->adm_mutex);
2573 spin_lock_init(&resource->req_lock); 2585 spin_lock_init(&resource->req_lock);
2574 return resource; 2586 return resource;
2575 2587
@@ -2687,14 +2699,16 @@ static int init_submitter(struct drbd_device *device)
2687 return 0; 2699 return 0;
2688} 2700}
2689 2701
2690enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned int minor, int vnr) 2702enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
2691{ 2703{
2704 struct drbd_resource *resource = adm_ctx->resource;
2692 struct drbd_connection *connection; 2705 struct drbd_connection *connection;
2693 struct drbd_device *device; 2706 struct drbd_device *device;
2694 struct drbd_peer_device *peer_device, *tmp_peer_device; 2707 struct drbd_peer_device *peer_device, *tmp_peer_device;
2695 struct gendisk *disk; 2708 struct gendisk *disk;
2696 struct request_queue *q; 2709 struct request_queue *q;
2697 int id; 2710 int id;
2711 int vnr = adm_ctx->volume;
2698 enum drbd_ret_code err = ERR_NOMEM; 2712 enum drbd_ret_code err = ERR_NOMEM;
2699 2713
2700 device = minor_to_device(minor); 2714 device = minor_to_device(minor);
@@ -2763,7 +2777,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
2763 if (id < 0) { 2777 if (id < 0) {
2764 if (id == -ENOSPC) { 2778 if (id == -ENOSPC) {
2765 err = ERR_MINOR_EXISTS; 2779 err = ERR_MINOR_EXISTS;
2766 drbd_msg_put_info("requested minor exists already"); 2780 drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
2767 } 2781 }
2768 goto out_no_minor_idr; 2782 goto out_no_minor_idr;
2769 } 2783 }
@@ -2773,7 +2787,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
2773 if (id < 0) { 2787 if (id < 0) {
2774 if (id == -ENOSPC) { 2788 if (id == -ENOSPC) {
2775 err = ERR_MINOR_EXISTS; 2789 err = ERR_MINOR_EXISTS;
2776 drbd_msg_put_info("requested minor exists already"); 2790 drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already");
2777 } 2791 }
2778 goto out_idr_remove_minor; 2792 goto out_idr_remove_minor;
2779 } 2793 }
@@ -2794,7 +2808,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
2794 if (id < 0) { 2808 if (id < 0) {
2795 if (id == -ENOSPC) { 2809 if (id == -ENOSPC) {
2796 err = ERR_INVALID_REQUEST; 2810 err = ERR_INVALID_REQUEST;
2797 drbd_msg_put_info("requested volume exists already"); 2811 drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already");
2798 } 2812 }
2799 goto out_idr_remove_from_resource; 2813 goto out_idr_remove_from_resource;
2800 } 2814 }
@@ -2803,7 +2817,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_resource *resource, unsigned i
2803 2817
2804 if (init_submitter(device)) { 2818 if (init_submitter(device)) {
2805 err = ERR_NOMEM; 2819 err = ERR_NOMEM;
2806 drbd_msg_put_info("unable to create submit workqueue"); 2820 drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue");
2807 goto out_idr_remove_vol; 2821 goto out_idr_remove_vol;
2808 } 2822 }
2809 2823
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 526414bc2cab..1b35c45c92b7 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -34,7 +34,6 @@
34#include "drbd_int.h" 34#include "drbd_int.h"
35#include "drbd_protocol.h" 35#include "drbd_protocol.h"
36#include "drbd_req.h" 36#include "drbd_req.h"
37#include "drbd_wrappers.h"
38#include <asm/unaligned.h> 37#include <asm/unaligned.h>
39#include <linux/drbd_limits.h> 38#include <linux/drbd_limits.h>
40#include <linux/kthread.h> 39#include <linux/kthread.h>
@@ -82,32 +81,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
82/* used blkdev_get_by_path, to claim our meta data device(s) */ 81/* used blkdev_get_by_path, to claim our meta data device(s) */
83static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; 82static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
84 83
85/* Configuration is strictly serialized, because generic netlink message
86 * processing is strictly serialized by the genl_lock().
87 * Which means we can use one static global drbd_config_context struct.
88 */
89static struct drbd_config_context {
90 /* assigned from drbd_genlmsghdr */
91 unsigned int minor;
92 /* assigned from request attributes, if present */
93 unsigned int volume;
94#define VOLUME_UNSPECIFIED (-1U)
95 /* pointer into the request skb,
96 * limited lifetime! */
97 char *resource_name;
98 struct nlattr *my_addr;
99 struct nlattr *peer_addr;
100
101 /* reply buffer */
102 struct sk_buff *reply_skb;
103 /* pointer into reply buffer */
104 struct drbd_genlmsghdr *reply_dh;
105 /* resolved from attributes, if possible */
106 struct drbd_device *device;
107 struct drbd_resource *resource;
108 struct drbd_connection *connection;
109} adm_ctx;
110
111static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) 84static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 85{
113 genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); 86 genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
@@ -117,9 +90,8 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
117 90
118/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only 91/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
119 * reason it could fail was no space in skb, and there are 4k available. */ 92 * reason it could fail was no space in skb, and there are 4k available. */
120int drbd_msg_put_info(const char *info) 93int drbd_msg_put_info(struct sk_buff *skb, const char *info)
121{ 94{
122 struct sk_buff *skb = adm_ctx.reply_skb;
123 struct nlattr *nla; 95 struct nlattr *nla;
124 int err = -EMSGSIZE; 96 int err = -EMSGSIZE;
125 97
@@ -143,42 +115,46 @@ int drbd_msg_put_info(const char *info)
143 * and per-family private info->pointers. 115 * and per-family private info->pointers.
144 * But we need to stay compatible with older kernels. 116 * But we need to stay compatible with older kernels.
145 * If it returns successfully, adm_ctx members are valid. 117 * If it returns successfully, adm_ctx members are valid.
118 *
119 * At this point, we still rely on the global genl_lock().
120 * If we want to avoid that, and allow "genl_family.parallel_ops", we may need
121 * to add additional synchronization against object destruction/modification.
146 */ 122 */
147#define DRBD_ADM_NEED_MINOR 1 123#define DRBD_ADM_NEED_MINOR 1
148#define DRBD_ADM_NEED_RESOURCE 2 124#define DRBD_ADM_NEED_RESOURCE 2
149#define DRBD_ADM_NEED_CONNECTION 4 125#define DRBD_ADM_NEED_CONNECTION 4
150static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info, 126static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
151 unsigned flags) 127 struct sk_buff *skb, struct genl_info *info, unsigned flags)
152{ 128{
153 struct drbd_genlmsghdr *d_in = info->userhdr; 129 struct drbd_genlmsghdr *d_in = info->userhdr;
154 const u8 cmd = info->genlhdr->cmd; 130 const u8 cmd = info->genlhdr->cmd;
155 int err; 131 int err;
156 132
157 memset(&adm_ctx, 0, sizeof(adm_ctx)); 133 memset(adm_ctx, 0, sizeof(*adm_ctx));
158 134
159 /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ 135 /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
160 if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) 136 if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
161 return -EPERM; 137 return -EPERM;
162 138
163 adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); 139 adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
164 if (!adm_ctx.reply_skb) { 140 if (!adm_ctx->reply_skb) {
165 err = -ENOMEM; 141 err = -ENOMEM;
166 goto fail; 142 goto fail;
167 } 143 }
168 144
169 adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb, 145 adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
170 info, &drbd_genl_family, 0, cmd); 146 info, &drbd_genl_family, 0, cmd);
171 /* put of a few bytes into a fresh skb of >= 4k will always succeed. 147 /* put of a few bytes into a fresh skb of >= 4k will always succeed.
172 * but anyways */ 148 * but anyways */
173 if (!adm_ctx.reply_dh) { 149 if (!adm_ctx->reply_dh) {
174 err = -ENOMEM; 150 err = -ENOMEM;
175 goto fail; 151 goto fail;
176 } 152 }
177 153
178 adm_ctx.reply_dh->minor = d_in->minor; 154 adm_ctx->reply_dh->minor = d_in->minor;
179 adm_ctx.reply_dh->ret_code = NO_ERROR; 155 adm_ctx->reply_dh->ret_code = NO_ERROR;
180 156
181 adm_ctx.volume = VOLUME_UNSPECIFIED; 157 adm_ctx->volume = VOLUME_UNSPECIFIED;
182 if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { 158 if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
183 struct nlattr *nla; 159 struct nlattr *nla;
184 /* parse and validate only */ 160 /* parse and validate only */
@@ -188,111 +164,131 @@ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
188 164
189 /* It was present, and valid, 165 /* It was present, and valid,
190 * copy it over to the reply skb. */ 166 * copy it over to the reply skb. */
191 err = nla_put_nohdr(adm_ctx.reply_skb, 167 err = nla_put_nohdr(adm_ctx->reply_skb,
192 info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, 168 info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
193 info->attrs[DRBD_NLA_CFG_CONTEXT]); 169 info->attrs[DRBD_NLA_CFG_CONTEXT]);
194 if (err) 170 if (err)
195 goto fail; 171 goto fail;
196 172
197 /* and assign stuff to the global adm_ctx */ 173 /* and assign stuff to the adm_ctx */
198 nla = nested_attr_tb[__nla_type(T_ctx_volume)]; 174 nla = nested_attr_tb[__nla_type(T_ctx_volume)];
199 if (nla) 175 if (nla)
200 adm_ctx.volume = nla_get_u32(nla); 176 adm_ctx->volume = nla_get_u32(nla);
201 nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; 177 nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
202 if (nla) 178 if (nla)
203 adm_ctx.resource_name = nla_data(nla); 179 adm_ctx->resource_name = nla_data(nla);
204 adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; 180 adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
205 adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; 181 adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
206 if ((adm_ctx.my_addr && 182 if ((adm_ctx->my_addr &&
207 nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.connection->my_addr)) || 183 nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
208 (adm_ctx.peer_addr && 184 (adm_ctx->peer_addr &&
209 nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.connection->peer_addr))) { 185 nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
210 err = -EINVAL; 186 err = -EINVAL;
211 goto fail; 187 goto fail;
212 } 188 }
213 } 189 }
214 190
215 adm_ctx.minor = d_in->minor; 191 adm_ctx->minor = d_in->minor;
216 adm_ctx.device = minor_to_device(d_in->minor); 192 adm_ctx->device = minor_to_device(d_in->minor);
217 if (adm_ctx.resource_name) { 193
218 adm_ctx.resource = drbd_find_resource(adm_ctx.resource_name); 194 /* We are protected by the global genl_lock().
195 * But we may explicitly drop it/retake it in drbd_adm_set_role(),
196 * so make sure this object stays around. */
197 if (adm_ctx->device)
198 kref_get(&adm_ctx->device->kref);
199
200 if (adm_ctx->resource_name) {
201 adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
219 } 202 }
220 203
221 if (!adm_ctx.device && (flags & DRBD_ADM_NEED_MINOR)) { 204 if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
222 drbd_msg_put_info("unknown minor"); 205 drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor");
223 return ERR_MINOR_INVALID; 206 return ERR_MINOR_INVALID;
224 } 207 }
225 if (!adm_ctx.resource && (flags & DRBD_ADM_NEED_RESOURCE)) { 208 if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
226 drbd_msg_put_info("unknown resource"); 209 drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource");
227 if (adm_ctx.resource_name) 210 if (adm_ctx->resource_name)
228 return ERR_RES_NOT_KNOWN; 211 return ERR_RES_NOT_KNOWN;
229 return ERR_INVALID_REQUEST; 212 return ERR_INVALID_REQUEST;
230 } 213 }
231 214
232 if (flags & DRBD_ADM_NEED_CONNECTION) { 215 if (flags & DRBD_ADM_NEED_CONNECTION) {
233 if (adm_ctx.resource) { 216 if (adm_ctx->resource) {
234 drbd_msg_put_info("no resource name expected"); 217 drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected");
235 return ERR_INVALID_REQUEST; 218 return ERR_INVALID_REQUEST;
236 } 219 }
237 if (adm_ctx.device) { 220 if (adm_ctx->device) {
238 drbd_msg_put_info("no minor number expected"); 221 drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected");
239 return ERR_INVALID_REQUEST; 222 return ERR_INVALID_REQUEST;
240 } 223 }
241 if (adm_ctx.my_addr && adm_ctx.peer_addr) 224 if (adm_ctx->my_addr && adm_ctx->peer_addr)
242 adm_ctx.connection = conn_get_by_addrs(nla_data(adm_ctx.my_addr), 225 adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
243 nla_len(adm_ctx.my_addr), 226 nla_len(adm_ctx->my_addr),
244 nla_data(adm_ctx.peer_addr), 227 nla_data(adm_ctx->peer_addr),
245 nla_len(adm_ctx.peer_addr)); 228 nla_len(adm_ctx->peer_addr));
246 if (!adm_ctx.connection) { 229 if (!adm_ctx->connection) {
247 drbd_msg_put_info("unknown connection"); 230 drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection");
248 return ERR_INVALID_REQUEST; 231 return ERR_INVALID_REQUEST;
249 } 232 }
250 } 233 }
251 234
252 /* some more paranoia, if the request was over-determined */ 235 /* some more paranoia, if the request was over-determined */
253 if (adm_ctx.device && adm_ctx.resource && 236 if (adm_ctx->device && adm_ctx->resource &&
254 adm_ctx.device->resource != adm_ctx.resource) { 237 adm_ctx->device->resource != adm_ctx->resource) {
255 pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", 238 pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
256 adm_ctx.minor, adm_ctx.resource->name, 239 adm_ctx->minor, adm_ctx->resource->name,
257 adm_ctx.device->resource->name); 240 adm_ctx->device->resource->name);
258 drbd_msg_put_info("minor exists in different resource"); 241 drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource");
259 return ERR_INVALID_REQUEST; 242 return ERR_INVALID_REQUEST;
260 } 243 }
261 if (adm_ctx.device && 244 if (adm_ctx->device &&
262 adm_ctx.volume != VOLUME_UNSPECIFIED && 245 adm_ctx->volume != VOLUME_UNSPECIFIED &&
263 adm_ctx.volume != adm_ctx.device->vnr) { 246 adm_ctx->volume != adm_ctx->device->vnr) {
264 pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", 247 pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
265 adm_ctx.minor, adm_ctx.volume, 248 adm_ctx->minor, adm_ctx->volume,
266 adm_ctx.device->vnr, 249 adm_ctx->device->vnr,
267 adm_ctx.device->resource->name); 250 adm_ctx->device->resource->name);
268 drbd_msg_put_info("minor exists as different volume"); 251 drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume");
269 return ERR_INVALID_REQUEST; 252 return ERR_INVALID_REQUEST;
270 } 253 }
271 254
255 /* still, provide adm_ctx->resource always, if possible. */
256 if (!adm_ctx->resource) {
257 adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
258 : adm_ctx->connection ? adm_ctx->connection->resource : NULL;
259 if (adm_ctx->resource)
260 kref_get(&adm_ctx->resource->kref);
261 }
262
272 return NO_ERROR; 263 return NO_ERROR;
273 264
274fail: 265fail:
275 nlmsg_free(adm_ctx.reply_skb); 266 nlmsg_free(adm_ctx->reply_skb);
276 adm_ctx.reply_skb = NULL; 267 adm_ctx->reply_skb = NULL;
277 return err; 268 return err;
278} 269}
279 270
280static int drbd_adm_finish(struct genl_info *info, int retcode) 271static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
272 struct genl_info *info, int retcode)
281{ 273{
282 if (adm_ctx.connection) { 274 if (adm_ctx->device) {
283 kref_put(&adm_ctx.connection->kref, drbd_destroy_connection); 275 kref_put(&adm_ctx->device->kref, drbd_destroy_device);
284 adm_ctx.connection = NULL; 276 adm_ctx->device = NULL;
285 } 277 }
286 if (adm_ctx.resource) { 278 if (adm_ctx->connection) {
287 kref_put(&adm_ctx.resource->kref, drbd_destroy_resource); 279 kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection);
288 adm_ctx.resource = NULL; 280 adm_ctx->connection = NULL;
281 }
282 if (adm_ctx->resource) {
283 kref_put(&adm_ctx->resource->kref, drbd_destroy_resource);
284 adm_ctx->resource = NULL;
289 } 285 }
290 286
291 if (!adm_ctx.reply_skb) 287 if (!adm_ctx->reply_skb)
292 return -ENOMEM; 288 return -ENOMEM;
293 289
294 adm_ctx.reply_dh->ret_code = retcode; 290 adm_ctx->reply_dh->ret_code = retcode;
295 drbd_adm_send_reply(adm_ctx.reply_skb, info); 291 drbd_adm_send_reply(adm_ctx->reply_skb, info);
296 return 0; 292 return 0;
297} 293}
298 294
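
The two hunks above convert drbd_adm_prepare() and drbd_adm_finish() from operating on the old file-scope adm_ctx to a caller-supplied struct drbd_config_context; prepare() additionally pins the device (and, when it can, the resource) with a kref, and finish() now drops those references before sending the reply. A condensed sketch of the calling pattern the simpler drbd_adm_* handlers follow after this change — drbd_adm_example is a placeholder name, everything else is taken from the hunks in this diff:

    static int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
    {
    	struct drbd_config_context adm_ctx;	/* on the stack, no longer global */
    	enum drbd_ret_code retcode;

    	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
    	if (!adm_ctx.reply_skb)			/* reply skb allocation failed */
    		return retcode;
    	if (retcode != NO_ERROR)
    		goto out;

    	mutex_lock(&adm_ctx.resource->adm_mutex);
    	/* ... act on adm_ctx.device / adm_ctx.resource ... */
    	mutex_unlock(&adm_ctx.resource->adm_mutex);
    out:
    	drbd_adm_finish(&adm_ctx, info, retcode);	/* drops refs, sends reply */
    	return 0;
    }
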
@@ -426,6 +422,14 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec
426 } 422 }
427 rcu_read_unlock(); 423 rcu_read_unlock();
428 424
425 if (fp == FP_NOT_AVAIL) {
426 /* IO Suspending works on the whole resource.
427 Do it only for one device. */
428 vnr = 0;
429 peer_device = idr_get_next(&connection->peer_devices, &vnr);
430 drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
431 }
432
429 return fp; 433 return fp;
430} 434}
431 435
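
The block added to highest_fencing_policy() above only runs when no attached disk provides a fencing policy. Reassembled in one piece as it reads on the new side of the hunk (vnr and peer_device are locals declared earlier in the function), it clears the fencing-suspend flag once, which is enough because IO suspension acts on the whole resource:

    	if (fp == FP_NOT_AVAIL) {
    		/* IO suspending works on the whole resource;
    		 * clearing susp_fen on one device is sufficient */
    		vnr = 0;
    		peer_device = idr_get_next(&connection->peer_devices, &vnr);
    		drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD,
    				  NS(susp_fen, 0));
    	}
    	return fp;
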
@@ -438,12 +442,13 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
438 char *ex_to_string; 442 char *ex_to_string;
439 int r; 443 int r;
440 444
445 spin_lock_irq(&connection->resource->req_lock);
441 if (connection->cstate >= C_WF_REPORT_PARAMS) { 446 if (connection->cstate >= C_WF_REPORT_PARAMS) {
442 drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); 447 drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
448 spin_unlock_irq(&connection->resource->req_lock);
443 return false; 449 return false;
444 } 450 }
445 451
446 spin_lock_irq(&connection->resource->req_lock);
447 connect_cnt = connection->connect_cnt; 452 connect_cnt = connection->connect_cnt;
448 spin_unlock_irq(&connection->resource->req_lock); 453 spin_unlock_irq(&connection->resource->req_lock);
449 454
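
The hunk above moves the spin_lock_irq() in conn_try_outdate_peer() up so that the cstate check and the connect_cnt snapshot sit in the same critical section; the early-return path therefore has to drop the lock itself now. The resulting sequence, read off the new side of the hunk (connect_cnt is a local declared earlier in the function):

    	spin_lock_irq(&connection->resource->req_lock);
    	if (connection->cstate >= C_WF_REPORT_PARAMS) {
    		drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
    		spin_unlock_irq(&connection->resource->req_lock);
    		return false;
    	}
    	connect_cnt = connection->connect_cnt;
    	spin_unlock_irq(&connection->resource->req_lock);
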
@@ -654,11 +659,11 @@ drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force)
654 put_ldev(device); 659 put_ldev(device);
655 } 660 }
656 } else { 661 } else {
657 mutex_lock(&device->resource->conf_update); 662 /* Called from drbd_adm_set_role only.
663 * We are still holding the conf_update mutex. */
658 nc = first_peer_device(device)->connection->net_conf; 664 nc = first_peer_device(device)->connection->net_conf;
659 if (nc) 665 if (nc)
660 nc->discard_my_data = 0; /* without copy; single bit op is atomic */ 666 nc->discard_my_data = 0; /* without copy; single bit op is atomic */
661 mutex_unlock(&device->resource->conf_update);
662 667
663 set_disk_ro(device->vdisk, false); 668 set_disk_ro(device->vdisk, false);
664 if (get_ldev(device)) { 669 if (get_ldev(device)) {
@@ -700,11 +705,12 @@ static const char *from_attrs_err_to_txt(int err)
700 705
701int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) 706int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
702{ 707{
708 struct drbd_config_context adm_ctx;
703 struct set_role_parms parms; 709 struct set_role_parms parms;
704 int err; 710 int err;
705 enum drbd_ret_code retcode; 711 enum drbd_ret_code retcode;
706 712
707 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 713 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
708 if (!adm_ctx.reply_skb) 714 if (!adm_ctx.reply_skb)
709 return retcode; 715 return retcode;
710 if (retcode != NO_ERROR) 716 if (retcode != NO_ERROR)
@@ -715,17 +721,22 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
715 err = set_role_parms_from_attrs(&parms, info); 721 err = set_role_parms_from_attrs(&parms, info);
716 if (err) { 722 if (err) {
717 retcode = ERR_MANDATORY_TAG; 723 retcode = ERR_MANDATORY_TAG;
718 drbd_msg_put_info(from_attrs_err_to_txt(err)); 724 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
719 goto out; 725 goto out;
720 } 726 }
721 } 727 }
728 genl_unlock();
729 mutex_lock(&adm_ctx.resource->adm_mutex);
722 730
723 if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) 731 if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
724 retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); 732 retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
725 else 733 else
726 retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); 734 retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
735
736 mutex_unlock(&adm_ctx.resource->adm_mutex);
737 genl_lock();
727out: 738out:
728 drbd_adm_finish(info, retcode); 739 drbd_adm_finish(&adm_ctx, info, retcode);
729 return 0; 740 return 0;
730} 741}
731 742
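
drbd_adm_set_role() is the one handler in this series that drops the genl lock while it runs; it serializes on the per-resource adm_mutex instead, which is also why drbd_adm_prepare() now takes an extra kref on the device (see the comment in the prepare hunk above). A condensed view of that lock ordering, with names taken from this hunk:

    	genl_unlock();				/* let other genl requests proceed */
    	mutex_lock(&adm_ctx.resource->adm_mutex);

    	if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
    		retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
    	else
    		retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);

    	mutex_unlock(&adm_ctx.resource->adm_mutex);
    	genl_lock();				/* reacquire before drbd_adm_finish() */
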
@@ -1104,15 +1115,18 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
1104 struct request_queue * const q = device->rq_queue; 1115 struct request_queue * const q = device->rq_queue;
1105 unsigned int max_hw_sectors = max_bio_size >> 9; 1116 unsigned int max_hw_sectors = max_bio_size >> 9;
1106 unsigned int max_segments = 0; 1117 unsigned int max_segments = 0;
1118 struct request_queue *b = NULL;
1107 1119
1108 if (get_ldev_if_state(device, D_ATTACHING)) { 1120 if (get_ldev_if_state(device, D_ATTACHING)) {
1109 struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; 1121 b = device->ldev->backing_bdev->bd_disk->queue;
1110 1122
1111 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); 1123 max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
1112 rcu_read_lock(); 1124 rcu_read_lock();
1113 max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; 1125 max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
1114 rcu_read_unlock(); 1126 rcu_read_unlock();
1115 put_ldev(device); 1127
1128 blk_set_stacking_limits(&q->limits);
1129 blk_queue_max_write_same_sectors(q, 0);
1116 } 1130 }
1117 1131
1118 blk_queue_logical_block_size(q, 512); 1132 blk_queue_logical_block_size(q, 512);
@@ -1121,8 +1135,25 @@ static void drbd_setup_queue_param(struct drbd_device *device, unsigned int max_
1121 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); 1135 blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
1122 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); 1136 blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
1123 1137
1124 if (get_ldev_if_state(device, D_ATTACHING)) { 1138 if (b) {
1125 struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; 1139 struct drbd_connection *connection = first_peer_device(device)->connection;
1140
1141 if (blk_queue_discard(b) &&
1142 (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
1143 /* For now, don't allow more than one activity log extent worth of data
1144 * to be discarded in one go. We may need to rework drbd_al_begin_io()
1145 * to allow for even larger discard ranges */
1146 q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
1147
1148 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1149 /* REALLY? Is stacking secdiscard "legal"? */
1150 if (blk_queue_secdiscard(b))
1151 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
1152 } else {
1153 q->limits.max_discard_sectors = 0;
1154 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1155 queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
1156 }
1126 1157
1127 blk_queue_stack_limits(q, b); 1158 blk_queue_stack_limits(q, b);
1128 1159
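
The block added to drbd_setup_queue_param() above decides whether the DRBD device advertises discard support: the backing queue must support discard, and the peer must either not be connected yet or have negotiated FF_TRIM; otherwise the limits and flags are explicitly cleared again. Reassembled from the new side of the hunk (b and connection are set up in the earlier hunk of this function):

    	if (blk_queue_discard(b) &&
    	    (connection->cstate < C_CONNECTED ||
    	     connection->agreed_features & FF_TRIM)) {
    		/* at most one activity-log extent per discard, for now */
    		q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
    		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
    		if (blk_queue_secdiscard(b))
    			queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
    	} else {
    		q->limits.max_discard_sectors = 0;
    		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
    		queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
    	}
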
@@ -1164,8 +1195,14 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device)
1164 peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ 1195 peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
1165 else 1196 else
1166 peer = DRBD_MAX_BIO_SIZE; 1197 peer = DRBD_MAX_BIO_SIZE;
1167 }
1168 1198
1199 /* We may later detach and re-attach on a disconnected Primary.
1200 * Avoid this setting to jump back in that case.
1201 * We want to store what we know the peer DRBD can handle,
1202 * not what the peer IO backend can handle. */
1203 if (peer > device->peer_max_bio_size)
1204 device->peer_max_bio_size = peer;
1205 }
1169 new = min(local, peer); 1206 new = min(local, peer);
1170 1207
1171 if (device->state.role == R_PRIMARY && new < now) 1208 if (device->state.role == R_PRIMARY && new < now)
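
The rewritten tail of drbd_reconsider_max_bio_size() above makes device->peer_max_bio_size monotonically non-decreasing: only larger advertised values are recorded, so a detach/re-attach on a disconnected Primary cannot shrink the remembered peer limit. The relevant lines, pulled together from the new side of the hunk:

    	/* keep the largest bio size the peer DRBD ever advertised */
    	if (peer > device->peer_max_bio_size)
    		device->peer_max_bio_size = peer;

    	new = min(local, peer);
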
@@ -1258,19 +1295,21 @@ static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
1258 1295
1259int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) 1296int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1260{ 1297{
1298 struct drbd_config_context adm_ctx;
1261 enum drbd_ret_code retcode; 1299 enum drbd_ret_code retcode;
1262 struct drbd_device *device; 1300 struct drbd_device *device;
1263 struct disk_conf *new_disk_conf, *old_disk_conf; 1301 struct disk_conf *new_disk_conf, *old_disk_conf;
1264 struct fifo_buffer *old_plan = NULL, *new_plan = NULL; 1302 struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
1265 int err, fifo_size; 1303 int err, fifo_size;
1266 1304
1267 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 1305 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
1268 if (!adm_ctx.reply_skb) 1306 if (!adm_ctx.reply_skb)
1269 return retcode; 1307 return retcode;
1270 if (retcode != NO_ERROR) 1308 if (retcode != NO_ERROR)
1271 goto out; 1309 goto finish;
1272 1310
1273 device = adm_ctx.device; 1311 device = adm_ctx.device;
1312 mutex_lock(&adm_ctx.resource->adm_mutex);
1274 1313
1275 /* we also need a disk 1314 /* we also need a disk
1276 * to change the options on */ 1315 * to change the options on */
@@ -1294,7 +1333,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
1294 err = disk_conf_from_attrs_for_change(new_disk_conf, info); 1333 err = disk_conf_from_attrs_for_change(new_disk_conf, info);
1295 if (err && err != -ENOMSG) { 1334 if (err && err != -ENOMSG) {
1296 retcode = ERR_MANDATORY_TAG; 1335 retcode = ERR_MANDATORY_TAG;
1297 drbd_msg_put_info(from_attrs_err_to_txt(err)); 1336 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
1298 goto fail_unlock; 1337 goto fail_unlock;
1299 } 1338 }
1300 1339
@@ -1385,12 +1424,15 @@ fail_unlock:
1385success: 1424success:
1386 put_ldev(device); 1425 put_ldev(device);
1387 out: 1426 out:
1388 drbd_adm_finish(info, retcode); 1427 mutex_unlock(&adm_ctx.resource->adm_mutex);
1428 finish:
1429 drbd_adm_finish(&adm_ctx, info, retcode);
1389 return 0; 1430 return 0;
1390} 1431}
1391 1432
1392int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) 1433int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1393{ 1434{
1435 struct drbd_config_context adm_ctx;
1394 struct drbd_device *device; 1436 struct drbd_device *device;
1395 int err; 1437 int err;
1396 enum drbd_ret_code retcode; 1438 enum drbd_ret_code retcode;
@@ -1406,13 +1448,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1406 enum drbd_state_rv rv; 1448 enum drbd_state_rv rv;
1407 struct net_conf *nc; 1449 struct net_conf *nc;
1408 1450
1409 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 1451 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
1410 if (!adm_ctx.reply_skb) 1452 if (!adm_ctx.reply_skb)
1411 return retcode; 1453 return retcode;
1412 if (retcode != NO_ERROR) 1454 if (retcode != NO_ERROR)
1413 goto finish; 1455 goto finish;
1414 1456
1415 device = adm_ctx.device; 1457 device = adm_ctx.device;
1458 mutex_lock(&adm_ctx.resource->adm_mutex);
1416 conn_reconfig_start(first_peer_device(device)->connection); 1459 conn_reconfig_start(first_peer_device(device)->connection);
1417 1460
1418 /* if you want to reconfigure, please tear down first */ 1461 /* if you want to reconfigure, please tear down first */
@@ -1455,7 +1498,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1455 err = disk_conf_from_attrs(new_disk_conf, info); 1498 err = disk_conf_from_attrs(new_disk_conf, info);
1456 if (err) { 1499 if (err) {
1457 retcode = ERR_MANDATORY_TAG; 1500 retcode = ERR_MANDATORY_TAG;
1458 drbd_msg_put_info(from_attrs_err_to_txt(err)); 1501 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
1459 goto fail; 1502 goto fail;
1460 } 1503 }
1461 1504
@@ -1619,7 +1662,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1619 } 1662 }
1620 1663
1621 if (device->state.conn < C_CONNECTED && 1664 if (device->state.conn < C_CONNECTED &&
1622 device->state.role == R_PRIMARY && 1665 device->state.role == R_PRIMARY && device->ed_uuid &&
1623 (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { 1666 (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
1624 drbd_err(device, "Can only attach to data with current UUID=%016llX\n", 1667 drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
1625 (unsigned long long)device->ed_uuid); 1668 (unsigned long long)device->ed_uuid);
@@ -1797,7 +1840,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1797 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); 1840 kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
1798 put_ldev(device); 1841 put_ldev(device);
1799 conn_reconfig_done(first_peer_device(device)->connection); 1842 conn_reconfig_done(first_peer_device(device)->connection);
1800 drbd_adm_finish(info, retcode); 1843 mutex_unlock(&adm_ctx.resource->adm_mutex);
1844 drbd_adm_finish(&adm_ctx, info, retcode);
1801 return 0; 1845 return 0;
1802 1846
1803 force_diskless_dec: 1847 force_diskless_dec:
@@ -1819,9 +1863,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1819 kfree(new_disk_conf); 1863 kfree(new_disk_conf);
1820 lc_destroy(resync_lru); 1864 lc_destroy(resync_lru);
1821 kfree(new_plan); 1865 kfree(new_plan);
1822 1866 mutex_unlock(&adm_ctx.resource->adm_mutex);
1823 finish: 1867 finish:
1824 drbd_adm_finish(info, retcode); 1868 drbd_adm_finish(&adm_ctx, info, retcode);
1825 return 0; 1869 return 0;
1826} 1870}
1827 1871
@@ -1860,11 +1904,12 @@ out:
1860 * Only then we have finally detached. */ 1904 * Only then we have finally detached. */
1861int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) 1905int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
1862{ 1906{
1907 struct drbd_config_context adm_ctx;
1863 enum drbd_ret_code retcode; 1908 enum drbd_ret_code retcode;
1864 struct detach_parms parms = { }; 1909 struct detach_parms parms = { };
1865 int err; 1910 int err;
1866 1911
1867 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 1912 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
1868 if (!adm_ctx.reply_skb) 1913 if (!adm_ctx.reply_skb)
1869 return retcode; 1914 return retcode;
1870 if (retcode != NO_ERROR) 1915 if (retcode != NO_ERROR)
@@ -1874,14 +1919,16 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
1874 err = detach_parms_from_attrs(&parms, info); 1919 err = detach_parms_from_attrs(&parms, info);
1875 if (err) { 1920 if (err) {
1876 retcode = ERR_MANDATORY_TAG; 1921 retcode = ERR_MANDATORY_TAG;
1877 drbd_msg_put_info(from_attrs_err_to_txt(err)); 1922 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
1878 goto out; 1923 goto out;
1879 } 1924 }
1880 } 1925 }
1881 1926
1927 mutex_lock(&adm_ctx.resource->adm_mutex);
1882 retcode = adm_detach(adm_ctx.device, parms.force_detach); 1928 retcode = adm_detach(adm_ctx.device, parms.force_detach);
1929 mutex_unlock(&adm_ctx.resource->adm_mutex);
1883out: 1930out:
1884 drbd_adm_finish(info, retcode); 1931 drbd_adm_finish(&adm_ctx, info, retcode);
1885 return 0; 1932 return 0;
1886} 1933}
1887 1934
@@ -2055,6 +2102,7 @@ static void free_crypto(struct crypto *crypto)
2055 2102
2056int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) 2103int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2057{ 2104{
2105 struct drbd_config_context adm_ctx;
2058 enum drbd_ret_code retcode; 2106 enum drbd_ret_code retcode;
2059 struct drbd_connection *connection; 2107 struct drbd_connection *connection;
2060 struct net_conf *old_net_conf, *new_net_conf = NULL; 2108 struct net_conf *old_net_conf, *new_net_conf = NULL;
@@ -2063,13 +2111,14 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2063 int rsr; /* re-sync running */ 2111 int rsr; /* re-sync running */
2064 struct crypto crypto = { }; 2112 struct crypto crypto = { };
2065 2113
2066 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); 2114 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
2067 if (!adm_ctx.reply_skb) 2115 if (!adm_ctx.reply_skb)
2068 return retcode; 2116 return retcode;
2069 if (retcode != NO_ERROR) 2117 if (retcode != NO_ERROR)
2070 goto out; 2118 goto finish;
2071 2119
2072 connection = adm_ctx.connection; 2120 connection = adm_ctx.connection;
2121 mutex_lock(&adm_ctx.resource->adm_mutex);
2073 2122
2074 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); 2123 new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
2075 if (!new_net_conf) { 2124 if (!new_net_conf) {
@@ -2084,7 +2133,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2084 old_net_conf = connection->net_conf; 2133 old_net_conf = connection->net_conf;
2085 2134
2086 if (!old_net_conf) { 2135 if (!old_net_conf) {
2087 drbd_msg_put_info("net conf missing, try connect"); 2136 drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect");
2088 retcode = ERR_INVALID_REQUEST; 2137 retcode = ERR_INVALID_REQUEST;
2089 goto fail; 2138 goto fail;
2090 } 2139 }
@@ -2096,7 +2145,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2096 err = net_conf_from_attrs_for_change(new_net_conf, info); 2145 err = net_conf_from_attrs_for_change(new_net_conf, info);
2097 if (err && err != -ENOMSG) { 2146 if (err && err != -ENOMSG) {
2098 retcode = ERR_MANDATORY_TAG; 2147 retcode = ERR_MANDATORY_TAG;
2099 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2148 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
2100 goto fail; 2149 goto fail;
2101 } 2150 }
2102 2151
@@ -2167,12 +2216,15 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
2167 done: 2216 done:
2168 conn_reconfig_done(connection); 2217 conn_reconfig_done(connection);
2169 out: 2218 out:
2170 drbd_adm_finish(info, retcode); 2219 mutex_unlock(&adm_ctx.resource->adm_mutex);
2220 finish:
2221 drbd_adm_finish(&adm_ctx, info, retcode);
2171 return 0; 2222 return 0;
2172} 2223}
2173 2224
2174int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) 2225int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2175{ 2226{
2227 struct drbd_config_context adm_ctx;
2176 struct drbd_peer_device *peer_device; 2228 struct drbd_peer_device *peer_device;
2177 struct net_conf *old_net_conf, *new_net_conf = NULL; 2229 struct net_conf *old_net_conf, *new_net_conf = NULL;
2178 struct crypto crypto = { }; 2230 struct crypto crypto = { };
@@ -2182,14 +2234,14 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2182 int i; 2234 int i;
2183 int err; 2235 int err;
2184 2236
2185 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 2237 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
2186 2238
2187 if (!adm_ctx.reply_skb) 2239 if (!adm_ctx.reply_skb)
2188 return retcode; 2240 return retcode;
2189 if (retcode != NO_ERROR) 2241 if (retcode != NO_ERROR)
2190 goto out; 2242 goto out;
2191 if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { 2243 if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
2192 drbd_msg_put_info("connection endpoint(s) missing"); 2244 drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing");
2193 retcode = ERR_INVALID_REQUEST; 2245 retcode = ERR_INVALID_REQUEST;
2194 goto out; 2246 goto out;
2195 } 2247 }
@@ -2215,6 +2267,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2215 } 2267 }
2216 } 2268 }
2217 2269
2270 mutex_lock(&adm_ctx.resource->adm_mutex);
2218 connection = first_connection(adm_ctx.resource); 2271 connection = first_connection(adm_ctx.resource);
2219 conn_reconfig_start(connection); 2272 conn_reconfig_start(connection);
2220 2273
@@ -2235,7 +2288,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2235 err = net_conf_from_attrs(new_net_conf, info); 2288 err = net_conf_from_attrs(new_net_conf, info);
2236 if (err && err != -ENOMSG) { 2289 if (err && err != -ENOMSG) {
2237 retcode = ERR_MANDATORY_TAG; 2290 retcode = ERR_MANDATORY_TAG;
2238 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2291 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
2239 goto fail; 2292 goto fail;
2240 } 2293 }
2241 2294
@@ -2284,7 +2337,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
2284 retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); 2337 retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
2285 2338
2286 conn_reconfig_done(connection); 2339 conn_reconfig_done(connection);
2287 drbd_adm_finish(info, retcode); 2340 mutex_unlock(&adm_ctx.resource->adm_mutex);
2341 drbd_adm_finish(&adm_ctx, info, retcode);
2288 return 0; 2342 return 0;
2289 2343
2290fail: 2344fail:
@@ -2292,8 +2346,9 @@ fail:
2292 kfree(new_net_conf); 2346 kfree(new_net_conf);
2293 2347
2294 conn_reconfig_done(connection); 2348 conn_reconfig_done(connection);
2349 mutex_unlock(&adm_ctx.resource->adm_mutex);
2295out: 2350out:
2296 drbd_adm_finish(info, retcode); 2351 drbd_adm_finish(&adm_ctx, info, retcode);
2297 return 0; 2352 return 0;
2298} 2353}
2299 2354
@@ -2356,13 +2411,14 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection
2356 2411
2357int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) 2412int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
2358{ 2413{
2414 struct drbd_config_context adm_ctx;
2359 struct disconnect_parms parms; 2415 struct disconnect_parms parms;
2360 struct drbd_connection *connection; 2416 struct drbd_connection *connection;
2361 enum drbd_state_rv rv; 2417 enum drbd_state_rv rv;
2362 enum drbd_ret_code retcode; 2418 enum drbd_ret_code retcode;
2363 int err; 2419 int err;
2364 2420
2365 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION); 2421 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
2366 if (!adm_ctx.reply_skb) 2422 if (!adm_ctx.reply_skb)
2367 return retcode; 2423 return retcode;
2368 if (retcode != NO_ERROR) 2424 if (retcode != NO_ERROR)
@@ -2374,18 +2430,20 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
2374 err = disconnect_parms_from_attrs(&parms, info); 2430 err = disconnect_parms_from_attrs(&parms, info);
2375 if (err) { 2431 if (err) {
2376 retcode = ERR_MANDATORY_TAG; 2432 retcode = ERR_MANDATORY_TAG;
2377 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2433 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
2378 goto fail; 2434 goto fail;
2379 } 2435 }
2380 } 2436 }
2381 2437
2438 mutex_lock(&adm_ctx.resource->adm_mutex);
2382 rv = conn_try_disconnect(connection, parms.force_disconnect); 2439 rv = conn_try_disconnect(connection, parms.force_disconnect);
2383 if (rv < SS_SUCCESS) 2440 if (rv < SS_SUCCESS)
2384 retcode = rv; /* FIXME: Type mismatch. */ 2441 retcode = rv; /* FIXME: Type mismatch. */
2385 else 2442 else
2386 retcode = NO_ERROR; 2443 retcode = NO_ERROR;
2444 mutex_unlock(&adm_ctx.resource->adm_mutex);
2387 fail: 2445 fail:
2388 drbd_adm_finish(info, retcode); 2446 drbd_adm_finish(&adm_ctx, info, retcode);
2389 return 0; 2447 return 0;
2390} 2448}
2391 2449
@@ -2407,6 +2465,7 @@ void resync_after_online_grow(struct drbd_device *device)
2407 2465
2408int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) 2466int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2409{ 2467{
2468 struct drbd_config_context adm_ctx;
2410 struct disk_conf *old_disk_conf, *new_disk_conf = NULL; 2469 struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
2411 struct resize_parms rs; 2470 struct resize_parms rs;
2412 struct drbd_device *device; 2471 struct drbd_device *device;
@@ -2417,12 +2476,13 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2417 sector_t u_size; 2476 sector_t u_size;
2418 int err; 2477 int err;
2419 2478
2420 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2479 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2421 if (!adm_ctx.reply_skb) 2480 if (!adm_ctx.reply_skb)
2422 return retcode; 2481 return retcode;
2423 if (retcode != NO_ERROR) 2482 if (retcode != NO_ERROR)
2424 goto fail; 2483 goto finish;
2425 2484
2485 mutex_lock(&adm_ctx.resource->adm_mutex);
2426 device = adm_ctx.device; 2486 device = adm_ctx.device;
2427 if (!get_ldev(device)) { 2487 if (!get_ldev(device)) {
2428 retcode = ERR_NO_DISK; 2488 retcode = ERR_NO_DISK;
@@ -2436,7 +2496,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2436 err = resize_parms_from_attrs(&rs, info); 2496 err = resize_parms_from_attrs(&rs, info);
2437 if (err) { 2497 if (err) {
2438 retcode = ERR_MANDATORY_TAG; 2498 retcode = ERR_MANDATORY_TAG;
2439 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2499 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
2440 goto fail_ldev; 2500 goto fail_ldev;
2441 } 2501 }
2442 } 2502 }
@@ -2482,7 +2542,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2482 goto fail_ldev; 2542 goto fail_ldev;
2483 } 2543 }
2484 2544
2485 if (device->state.conn != C_CONNECTED) { 2545 if (device->state.conn != C_CONNECTED && !rs.resize_force) {
2486 retcode = ERR_MD_LAYOUT_CONNECTED; 2546 retcode = ERR_MD_LAYOUT_CONNECTED;
2487 goto fail_ldev; 2547 goto fail_ldev;
2488 } 2548 }
@@ -2528,7 +2588,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2528 } 2588 }
2529 2589
2530 fail: 2590 fail:
2531 drbd_adm_finish(info, retcode); 2591 mutex_unlock(&adm_ctx.resource->adm_mutex);
2592 finish:
2593 drbd_adm_finish(&adm_ctx, info, retcode);
2532 return 0; 2594 return 0;
2533 2595
2534 fail_ldev: 2596 fail_ldev:
@@ -2538,11 +2600,12 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2538 2600
2539int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) 2601int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
2540{ 2602{
2603 struct drbd_config_context adm_ctx;
2541 enum drbd_ret_code retcode; 2604 enum drbd_ret_code retcode;
2542 struct res_opts res_opts; 2605 struct res_opts res_opts;
2543 int err; 2606 int err;
2544 2607
2545 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 2608 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
2546 if (!adm_ctx.reply_skb) 2609 if (!adm_ctx.reply_skb)
2547 return retcode; 2610 return retcode;
2548 if (retcode != NO_ERROR) 2611 if (retcode != NO_ERROR)
@@ -2555,33 +2618,37 @@ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
2555 err = res_opts_from_attrs(&res_opts, info); 2618 err = res_opts_from_attrs(&res_opts, info);
2556 if (err && err != -ENOMSG) { 2619 if (err && err != -ENOMSG) {
2557 retcode = ERR_MANDATORY_TAG; 2620 retcode = ERR_MANDATORY_TAG;
2558 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2621 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
2559 goto fail; 2622 goto fail;
2560 } 2623 }
2561 2624
2625 mutex_lock(&adm_ctx.resource->adm_mutex);
2562 err = set_resource_options(adm_ctx.resource, &res_opts); 2626 err = set_resource_options(adm_ctx.resource, &res_opts);
2563 if (err) { 2627 if (err) {
2564 retcode = ERR_INVALID_REQUEST; 2628 retcode = ERR_INVALID_REQUEST;
2565 if (err == -ENOMEM) 2629 if (err == -ENOMEM)
2566 retcode = ERR_NOMEM; 2630 retcode = ERR_NOMEM;
2567 } 2631 }
2632 mutex_unlock(&adm_ctx.resource->adm_mutex);
2568 2633
2569fail: 2634fail:
2570 drbd_adm_finish(info, retcode); 2635 drbd_adm_finish(&adm_ctx, info, retcode);
2571 return 0; 2636 return 0;
2572} 2637}
2573 2638
2574int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) 2639int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2575{ 2640{
2641 struct drbd_config_context adm_ctx;
2576 struct drbd_device *device; 2642 struct drbd_device *device;
2577 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ 2643 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2578 2644
2579 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2645 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2580 if (!adm_ctx.reply_skb) 2646 if (!adm_ctx.reply_skb)
2581 return retcode; 2647 return retcode;
2582 if (retcode != NO_ERROR) 2648 if (retcode != NO_ERROR)
2583 goto out; 2649 goto out;
2584 2650
2651 mutex_lock(&adm_ctx.resource->adm_mutex);
2585 device = adm_ctx.device; 2652 device = adm_ctx.device;
2586 2653
2587 /* If there is still bitmap IO pending, probably because of a previous 2654 /* If there is still bitmap IO pending, probably because of a previous
@@ -2605,26 +2672,29 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
2605 } else 2672 } else
2606 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); 2673 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
2607 drbd_resume_io(device); 2674 drbd_resume_io(device);
2608 2675 mutex_unlock(&adm_ctx.resource->adm_mutex);
2609out: 2676out:
2610 drbd_adm_finish(info, retcode); 2677 drbd_adm_finish(&adm_ctx, info, retcode);
2611 return 0; 2678 return 0;
2612} 2679}
2613 2680
2614static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, 2681static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
2615 union drbd_state mask, union drbd_state val) 2682 union drbd_state mask, union drbd_state val)
2616{ 2683{
2684 struct drbd_config_context adm_ctx;
2617 enum drbd_ret_code retcode; 2685 enum drbd_ret_code retcode;
2618 2686
2619 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2687 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2620 if (!adm_ctx.reply_skb) 2688 if (!adm_ctx.reply_skb)
2621 return retcode; 2689 return retcode;
2622 if (retcode != NO_ERROR) 2690 if (retcode != NO_ERROR)
2623 goto out; 2691 goto out;
2624 2692
2693 mutex_lock(&adm_ctx.resource->adm_mutex);
2625 retcode = drbd_request_state(adm_ctx.device, mask, val); 2694 retcode = drbd_request_state(adm_ctx.device, mask, val);
2695 mutex_unlock(&adm_ctx.resource->adm_mutex);
2626out: 2696out:
2627 drbd_adm_finish(info, retcode); 2697 drbd_adm_finish(&adm_ctx, info, retcode);
2628 return 0; 2698 return 0;
2629} 2699}
2630 2700
@@ -2639,15 +2709,17 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device)
2639 2709
2640int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) 2710int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2641{ 2711{
2712 struct drbd_config_context adm_ctx;
2642 int retcode; /* drbd_ret_code, drbd_state_rv */ 2713 int retcode; /* drbd_ret_code, drbd_state_rv */
2643 struct drbd_device *device; 2714 struct drbd_device *device;
2644 2715
2645 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2716 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2646 if (!adm_ctx.reply_skb) 2717 if (!adm_ctx.reply_skb)
2647 return retcode; 2718 return retcode;
2648 if (retcode != NO_ERROR) 2719 if (retcode != NO_ERROR)
2649 goto out; 2720 goto out;
2650 2721
2722 mutex_lock(&adm_ctx.resource->adm_mutex);
2651 device = adm_ctx.device; 2723 device = adm_ctx.device;
2652 2724
2653 /* If there is still bitmap IO pending, probably because of a previous 2725 /* If there is still bitmap IO pending, probably because of a previous
@@ -2674,40 +2746,45 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
2674 } else 2746 } else
2675 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); 2747 retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
2676 drbd_resume_io(device); 2748 drbd_resume_io(device);
2677 2749 mutex_unlock(&adm_ctx.resource->adm_mutex);
2678out: 2750out:
2679 drbd_adm_finish(info, retcode); 2751 drbd_adm_finish(&adm_ctx, info, retcode);
2680 return 0; 2752 return 0;
2681} 2753}
2682 2754
2683int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) 2755int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
2684{ 2756{
2757 struct drbd_config_context adm_ctx;
2685 enum drbd_ret_code retcode; 2758 enum drbd_ret_code retcode;
2686 2759
2687 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2760 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2688 if (!adm_ctx.reply_skb) 2761 if (!adm_ctx.reply_skb)
2689 return retcode; 2762 return retcode;
2690 if (retcode != NO_ERROR) 2763 if (retcode != NO_ERROR)
2691 goto out; 2764 goto out;
2692 2765
2766 mutex_lock(&adm_ctx.resource->adm_mutex);
2693 if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) 2767 if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
2694 retcode = ERR_PAUSE_IS_SET; 2768 retcode = ERR_PAUSE_IS_SET;
2769 mutex_unlock(&adm_ctx.resource->adm_mutex);
2695out: 2770out:
2696 drbd_adm_finish(info, retcode); 2771 drbd_adm_finish(&adm_ctx, info, retcode);
2697 return 0; 2772 return 0;
2698} 2773}
2699 2774
2700int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) 2775int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
2701{ 2776{
2777 struct drbd_config_context adm_ctx;
2702 union drbd_dev_state s; 2778 union drbd_dev_state s;
2703 enum drbd_ret_code retcode; 2779 enum drbd_ret_code retcode;
2704 2780
2705 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2781 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2706 if (!adm_ctx.reply_skb) 2782 if (!adm_ctx.reply_skb)
2707 return retcode; 2783 return retcode;
2708 if (retcode != NO_ERROR) 2784 if (retcode != NO_ERROR)
2709 goto out; 2785 goto out;
2710 2786
2787 mutex_lock(&adm_ctx.resource->adm_mutex);
2711 if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { 2788 if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
2712 s = adm_ctx.device->state; 2789 s = adm_ctx.device->state;
2713 if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { 2790 if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
@@ -2717,9 +2794,9 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
2717 retcode = ERR_PAUSE_IS_CLEAR; 2794 retcode = ERR_PAUSE_IS_CLEAR;
2718 } 2795 }
2719 } 2796 }
2720 2797 mutex_unlock(&adm_ctx.resource->adm_mutex);
2721out: 2798out:
2722 drbd_adm_finish(info, retcode); 2799 drbd_adm_finish(&adm_ctx, info, retcode);
2723 return 0; 2800 return 0;
2724} 2801}
2725 2802
@@ -2730,15 +2807,17 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
2730 2807
2731int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) 2808int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2732{ 2809{
2810 struct drbd_config_context adm_ctx;
2733 struct drbd_device *device; 2811 struct drbd_device *device;
2734 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ 2812 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
2735 2813
2736 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 2814 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2737 if (!adm_ctx.reply_skb) 2815 if (!adm_ctx.reply_skb)
2738 return retcode; 2816 return retcode;
2739 if (retcode != NO_ERROR) 2817 if (retcode != NO_ERROR)
2740 goto out; 2818 goto out;
2741 2819
2820 mutex_lock(&adm_ctx.resource->adm_mutex);
2742 device = adm_ctx.device; 2821 device = adm_ctx.device;
2743 if (test_bit(NEW_CUR_UUID, &device->flags)) { 2822 if (test_bit(NEW_CUR_UUID, &device->flags)) {
2744 drbd_uuid_new_current(device); 2823 drbd_uuid_new_current(device);
@@ -2753,9 +2832,9 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
2753 tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); 2832 tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
2754 } 2833 }
2755 drbd_resume_io(device); 2834 drbd_resume_io(device);
2756 2835 mutex_unlock(&adm_ctx.resource->adm_mutex);
2757out: 2836out:
2758 drbd_adm_finish(info, retcode); 2837 drbd_adm_finish(&adm_ctx, info, retcode);
2759 return 0; 2838 return 0;
2760} 2839}
2761 2840
@@ -2931,10 +3010,11 @@ nla_put_failure:
2931 3010
2932int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) 3011int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
2933{ 3012{
3013 struct drbd_config_context adm_ctx;
2934 enum drbd_ret_code retcode; 3014 enum drbd_ret_code retcode;
2935 int err; 3015 int err;
2936 3016
2937 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3017 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
2938 if (!adm_ctx.reply_skb) 3018 if (!adm_ctx.reply_skb)
2939 return retcode; 3019 return retcode;
2940 if (retcode != NO_ERROR) 3020 if (retcode != NO_ERROR)
@@ -2946,7 +3026,7 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
2946 return err; 3026 return err;
2947 } 3027 }
2948out: 3028out:
2949 drbd_adm_finish(info, retcode); 3029 drbd_adm_finish(&adm_ctx, info, retcode);
2950 return 0; 3030 return 0;
2951} 3031}
2952 3032
@@ -3133,11 +3213,12 @@ dump:
3133 3213
3134int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) 3214int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
3135{ 3215{
3216 struct drbd_config_context adm_ctx;
3136 enum drbd_ret_code retcode; 3217 enum drbd_ret_code retcode;
3137 struct timeout_parms tp; 3218 struct timeout_parms tp;
3138 int err; 3219 int err;
3139 3220
3140 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3221 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
3141 if (!adm_ctx.reply_skb) 3222 if (!adm_ctx.reply_skb)
3142 return retcode; 3223 return retcode;
3143 if (retcode != NO_ERROR) 3224 if (retcode != NO_ERROR)
@@ -3154,17 +3235,18 @@ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
3154 return err; 3235 return err;
3155 } 3236 }
3156out: 3237out:
3157 drbd_adm_finish(info, retcode); 3238 drbd_adm_finish(&adm_ctx, info, retcode);
3158 return 0; 3239 return 0;
3159} 3240}
3160 3241
3161int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) 3242int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
3162{ 3243{
3244 struct drbd_config_context adm_ctx;
3163 struct drbd_device *device; 3245 struct drbd_device *device;
3164 enum drbd_ret_code retcode; 3246 enum drbd_ret_code retcode;
3165 struct start_ov_parms parms; 3247 struct start_ov_parms parms;
3166 3248
3167 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3249 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
3168 if (!adm_ctx.reply_skb) 3250 if (!adm_ctx.reply_skb)
3169 return retcode; 3251 return retcode;
3170 if (retcode != NO_ERROR) 3252 if (retcode != NO_ERROR)
@@ -3179,10 +3261,12 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
3179 int err = start_ov_parms_from_attrs(&parms, info); 3261 int err = start_ov_parms_from_attrs(&parms, info);
3180 if (err) { 3262 if (err) {
3181 retcode = ERR_MANDATORY_TAG; 3263 retcode = ERR_MANDATORY_TAG;
3182 drbd_msg_put_info(from_attrs_err_to_txt(err)); 3264 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
3183 goto out; 3265 goto out;
3184 } 3266 }
3185 } 3267 }
3268 mutex_lock(&adm_ctx.resource->adm_mutex);
3269
3186 /* w_make_ov_request expects position to be aligned */ 3270 /* w_make_ov_request expects position to be aligned */
3187 device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); 3271 device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
3188 device->ov_stop_sector = parms.ov_stop_sector; 3272 device->ov_stop_sector = parms.ov_stop_sector;
@@ -3193,21 +3277,24 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
3193 wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); 3277 wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
3194 retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); 3278 retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
3195 drbd_resume_io(device); 3279 drbd_resume_io(device);
3280
3281 mutex_unlock(&adm_ctx.resource->adm_mutex);
3196out: 3282out:
3197 drbd_adm_finish(info, retcode); 3283 drbd_adm_finish(&adm_ctx, info, retcode);
3198 return 0; 3284 return 0;
3199} 3285}
3200 3286
3201 3287
3202int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) 3288int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
3203{ 3289{
3290 struct drbd_config_context adm_ctx;
3204 struct drbd_device *device; 3291 struct drbd_device *device;
3205 enum drbd_ret_code retcode; 3292 enum drbd_ret_code retcode;
3206 int skip_initial_sync = 0; 3293 int skip_initial_sync = 0;
3207 int err; 3294 int err;
3208 struct new_c_uuid_parms args; 3295 struct new_c_uuid_parms args;
3209 3296
3210 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3297 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
3211 if (!adm_ctx.reply_skb) 3298 if (!adm_ctx.reply_skb)
3212 return retcode; 3299 return retcode;
3213 if (retcode != NO_ERROR) 3300 if (retcode != NO_ERROR)
@@ -3219,11 +3306,12 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
3219 err = new_c_uuid_parms_from_attrs(&args, info); 3306 err = new_c_uuid_parms_from_attrs(&args, info);
3220 if (err) { 3307 if (err) {
3221 retcode = ERR_MANDATORY_TAG; 3308 retcode = ERR_MANDATORY_TAG;
3222 drbd_msg_put_info(from_attrs_err_to_txt(err)); 3309 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
3223 goto out_nolock; 3310 goto out_nolock;
3224 } 3311 }
3225 } 3312 }
3226 3313
3314 mutex_lock(&adm_ctx.resource->adm_mutex);
3227 mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ 3315 mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
3228 3316
3229 if (!get_ldev(device)) { 3317 if (!get_ldev(device)) {
@@ -3268,22 +3356,24 @@ out_dec:
3268 put_ldev(device); 3356 put_ldev(device);
3269out: 3357out:
3270 mutex_unlock(device->state_mutex); 3358 mutex_unlock(device->state_mutex);
3359 mutex_unlock(&adm_ctx.resource->adm_mutex);
3271out_nolock: 3360out_nolock:
3272 drbd_adm_finish(info, retcode); 3361 drbd_adm_finish(&adm_ctx, info, retcode);
3273 return 0; 3362 return 0;
3274} 3363}
3275 3364
3276static enum drbd_ret_code 3365static enum drbd_ret_code
3277drbd_check_resource_name(const char *name) 3366drbd_check_resource_name(struct drbd_config_context *adm_ctx)
3278{ 3367{
3368 const char *name = adm_ctx->resource_name;
3279 if (!name || !name[0]) { 3369 if (!name || !name[0]) {
3280 drbd_msg_put_info("resource name missing"); 3370 drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing");
3281 return ERR_MANDATORY_TAG; 3371 return ERR_MANDATORY_TAG;
3282 } 3372 }
3283 /* if we want to use these in sysfs/configfs/debugfs some day, 3373 /* if we want to use these in sysfs/configfs/debugfs some day,
3284 * we must not allow slashes */ 3374 * we must not allow slashes */
3285 if (strchr(name, '/')) { 3375 if (strchr(name, '/')) {
3286 drbd_msg_put_info("invalid resource name"); 3376 drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name");
3287 return ERR_INVALID_REQUEST; 3377 return ERR_INVALID_REQUEST;
3288 } 3378 }
3289 return NO_ERROR; 3379 return NO_ERROR;
@@ -3291,11 +3381,12 @@ drbd_check_resource_name(const char *name)
3291 3381
3292int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) 3382int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3293{ 3383{
3384 struct drbd_config_context adm_ctx;
3294 enum drbd_ret_code retcode; 3385 enum drbd_ret_code retcode;
3295 struct res_opts res_opts; 3386 struct res_opts res_opts;
3296 int err; 3387 int err;
3297 3388
3298 retcode = drbd_adm_prepare(skb, info, 0); 3389 retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0);
3299 if (!adm_ctx.reply_skb) 3390 if (!adm_ctx.reply_skb)
3300 return retcode; 3391 return retcode;
3301 if (retcode != NO_ERROR) 3392 if (retcode != NO_ERROR)
@@ -3305,48 +3396,50 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
3305 err = res_opts_from_attrs(&res_opts, info); 3396 err = res_opts_from_attrs(&res_opts, info);
3306 if (err && err != -ENOMSG) { 3397 if (err && err != -ENOMSG) {
3307 retcode = ERR_MANDATORY_TAG; 3398 retcode = ERR_MANDATORY_TAG;
3308 drbd_msg_put_info(from_attrs_err_to_txt(err)); 3399 drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
3309 goto out; 3400 goto out;
3310 } 3401 }
3311 3402
3312 retcode = drbd_check_resource_name(adm_ctx.resource_name); 3403 retcode = drbd_check_resource_name(&adm_ctx);
3313 if (retcode != NO_ERROR) 3404 if (retcode != NO_ERROR)
3314 goto out; 3405 goto out;
3315 3406
3316 if (adm_ctx.resource) { 3407 if (adm_ctx.resource) {
3317 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { 3408 if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
3318 retcode = ERR_INVALID_REQUEST; 3409 retcode = ERR_INVALID_REQUEST;
3319 drbd_msg_put_info("resource exists"); 3410 drbd_msg_put_info(adm_ctx.reply_skb, "resource exists");
3320 } 3411 }
3321 /* else: still NO_ERROR */ 3412 /* else: still NO_ERROR */
3322 goto out; 3413 goto out;
3323 } 3414 }
3324 3415
3416 /* not yet safe for genl_family.parallel_ops */
3325 if (!conn_create(adm_ctx.resource_name, &res_opts)) 3417 if (!conn_create(adm_ctx.resource_name, &res_opts))
3326 retcode = ERR_NOMEM; 3418 retcode = ERR_NOMEM;
3327out: 3419out:
3328 drbd_adm_finish(info, retcode); 3420 drbd_adm_finish(&adm_ctx, info, retcode);
3329 return 0; 3421 return 0;
3330} 3422}
3331 3423
3332int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) 3424int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3333{ 3425{
3426 struct drbd_config_context adm_ctx;
3334 struct drbd_genlmsghdr *dh = info->userhdr; 3427 struct drbd_genlmsghdr *dh = info->userhdr;
3335 enum drbd_ret_code retcode; 3428 enum drbd_ret_code retcode;
3336 3429
3337 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 3430 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
3338 if (!adm_ctx.reply_skb) 3431 if (!adm_ctx.reply_skb)
3339 return retcode; 3432 return retcode;
3340 if (retcode != NO_ERROR) 3433 if (retcode != NO_ERROR)
3341 goto out; 3434 goto out;
3342 3435
3343 if (dh->minor > MINORMASK) { 3436 if (dh->minor > MINORMASK) {
3344 drbd_msg_put_info("requested minor out of range"); 3437 drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range");
3345 retcode = ERR_INVALID_REQUEST; 3438 retcode = ERR_INVALID_REQUEST;
3346 goto out; 3439 goto out;
3347 } 3440 }
3348 if (adm_ctx.volume > DRBD_VOLUME_MAX) { 3441 if (adm_ctx.volume > DRBD_VOLUME_MAX) {
3349 drbd_msg_put_info("requested volume id out of range"); 3442 drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range");
3350 retcode = ERR_INVALID_REQUEST; 3443 retcode = ERR_INVALID_REQUEST;
3351 goto out; 3444 goto out;
3352 } 3445 }
@@ -3360,9 +3453,11 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
3360 goto out; 3453 goto out;
3361 } 3454 }
3362 3455
3363 retcode = drbd_create_device(adm_ctx.resource, dh->minor, adm_ctx.volume); 3456 mutex_lock(&adm_ctx.resource->adm_mutex);
3457 retcode = drbd_create_device(&adm_ctx, dh->minor);
3458 mutex_unlock(&adm_ctx.resource->adm_mutex);
3364out: 3459out:
3365 drbd_adm_finish(info, retcode); 3460 drbd_adm_finish(&adm_ctx, info, retcode);
3366 return 0; 3461 return 0;
3367} 3462}
3368 3463
@@ -3383,35 +3478,40 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
3383 3478
3384int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) 3479int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
3385{ 3480{
3481 struct drbd_config_context adm_ctx;
3386 enum drbd_ret_code retcode; 3482 enum drbd_ret_code retcode;
3387 3483
3388 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR); 3484 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
3389 if (!adm_ctx.reply_skb) 3485 if (!adm_ctx.reply_skb)
3390 return retcode; 3486 return retcode;
3391 if (retcode != NO_ERROR) 3487 if (retcode != NO_ERROR)
3392 goto out; 3488 goto out;
3393 3489
3490 mutex_lock(&adm_ctx.resource->adm_mutex);
3394 retcode = adm_del_minor(adm_ctx.device); 3491 retcode = adm_del_minor(adm_ctx.device);
3492 mutex_unlock(&adm_ctx.resource->adm_mutex);
3395out: 3493out:
3396 drbd_adm_finish(info, retcode); 3494 drbd_adm_finish(&adm_ctx, info, retcode);
3397 return 0; 3495 return 0;
3398} 3496}
3399 3497
3400int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) 3498int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3401{ 3499{
3500 struct drbd_config_context adm_ctx;
3402 struct drbd_resource *resource; 3501 struct drbd_resource *resource;
3403 struct drbd_connection *connection; 3502 struct drbd_connection *connection;
3404 struct drbd_device *device; 3503 struct drbd_device *device;
3405 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ 3504 int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
3406 unsigned i; 3505 unsigned i;
3407 3506
3408 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 3507 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
3409 if (!adm_ctx.reply_skb) 3508 if (!adm_ctx.reply_skb)
3410 return retcode; 3509 return retcode;
3411 if (retcode != NO_ERROR) 3510 if (retcode != NO_ERROR)
3412 goto out; 3511 goto finish;
3413 3512
3414 resource = adm_ctx.resource; 3513 resource = adm_ctx.resource;
3514 mutex_lock(&resource->adm_mutex);
3415 /* demote */ 3515 /* demote */
3416 for_each_connection(connection, resource) { 3516 for_each_connection(connection, resource) {
3417 struct drbd_peer_device *peer_device; 3517 struct drbd_peer_device *peer_device;
@@ -3419,14 +3519,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3419 idr_for_each_entry(&connection->peer_devices, peer_device, i) { 3519 idr_for_each_entry(&connection->peer_devices, peer_device, i) {
3420 retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); 3520 retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
3421 if (retcode < SS_SUCCESS) { 3521 if (retcode < SS_SUCCESS) {
3422 drbd_msg_put_info("failed to demote"); 3522 drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote");
3423 goto out; 3523 goto out;
3424 } 3524 }
3425 } 3525 }
3426 3526
3427 retcode = conn_try_disconnect(connection, 0); 3527 retcode = conn_try_disconnect(connection, 0);
3428 if (retcode < SS_SUCCESS) { 3528 if (retcode < SS_SUCCESS) {
3429 drbd_msg_put_info("failed to disconnect"); 3529 drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect");
3430 goto out; 3530 goto out;
3431 } 3531 }
3432 } 3532 }
@@ -3435,7 +3535,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3435 idr_for_each_entry(&resource->devices, device, i) { 3535 idr_for_each_entry(&resource->devices, device, i) {
3436 retcode = adm_detach(device, 0); 3536 retcode = adm_detach(device, 0);
3437 if (retcode < SS_SUCCESS || retcode > NO_ERROR) { 3537 if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
3438 drbd_msg_put_info("failed to detach"); 3538 drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach");
3439 goto out; 3539 goto out;
3440 } 3540 }
3441 } 3541 }
@@ -3453,7 +3553,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3453 retcode = adm_del_minor(device); 3553 retcode = adm_del_minor(device);
3454 if (retcode != NO_ERROR) { 3554 if (retcode != NO_ERROR) {
3455 /* "can not happen" */ 3555 /* "can not happen" */
3456 drbd_msg_put_info("failed to delete volume"); 3556 drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume");
3457 goto out; 3557 goto out;
3458 } 3558 }
3459 } 3559 }
@@ -3462,25 +3562,28 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
3462 synchronize_rcu(); 3562 synchronize_rcu();
3463 drbd_free_resource(resource); 3563 drbd_free_resource(resource);
3464 retcode = NO_ERROR; 3564 retcode = NO_ERROR;
3465
3466out: 3565out:
3467 drbd_adm_finish(info, retcode); 3566 mutex_unlock(&resource->adm_mutex);
3567finish:
3568 drbd_adm_finish(&adm_ctx, info, retcode);
3468 return 0; 3569 return 0;
3469} 3570}
3470 3571
3471int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) 3572int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3472{ 3573{
3574 struct drbd_config_context adm_ctx;
3473 struct drbd_resource *resource; 3575 struct drbd_resource *resource;
3474 struct drbd_connection *connection; 3576 struct drbd_connection *connection;
3475 enum drbd_ret_code retcode; 3577 enum drbd_ret_code retcode;
3476 3578
3477 retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE); 3579 retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
3478 if (!adm_ctx.reply_skb) 3580 if (!adm_ctx.reply_skb)
3479 return retcode; 3581 return retcode;
3480 if (retcode != NO_ERROR) 3582 if (retcode != NO_ERROR)
3481 goto out; 3583 goto finish;
3482 3584
3483 resource = adm_ctx.resource; 3585 resource = adm_ctx.resource;
3586 mutex_lock(&resource->adm_mutex);
3484 for_each_connection(connection, resource) { 3587 for_each_connection(connection, resource) {
3485 if (connection->cstate > C_STANDALONE) { 3588 if (connection->cstate > C_STANDALONE) {
3486 retcode = ERR_NET_CONFIGURED; 3589 retcode = ERR_NET_CONFIGURED;
@@ -3499,7 +3602,9 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
3499 drbd_free_resource(resource); 3602 drbd_free_resource(resource);
3500 retcode = NO_ERROR; 3603 retcode = NO_ERROR;
3501out: 3604out:
3502 drbd_adm_finish(info, retcode); 3605 mutex_unlock(&resource->adm_mutex);
3606finish:
3607 drbd_adm_finish(&adm_ctx, info, retcode);
3503 return 0; 3608 return 0;
3504} 3609}
3505 3610
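
The drbd_nl.c hunks above all converge on the same handler shape: the config context is now declared on the handler's stack, and the new per-resource adm_mutex is held only around the actual work, with separate out/finish labels so the mutex is never unlocked unless it was actually taken. A minimal sketch of that shape (do_the_work() is a placeholder, not a real drbd function):

int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
{
        struct drbd_config_context adm_ctx;     /* now on the handler's stack */
        enum drbd_ret_code retcode;

        retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
        if (!adm_ctx.reply_skb)
                return retcode;
        if (retcode != NO_ERROR)
                goto finish;                    /* adm_mutex not taken yet */

        mutex_lock(&adm_ctx.resource->adm_mutex);
        retcode = do_the_work(&adm_ctx);        /* placeholder for the per-command body */
        mutex_unlock(&adm_ctx.resource->adm_mutex);
finish:
        drbd_adm_finish(&adm_ctx, info, retcode);
        return 0;
}
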
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
index fa672b6df8d6..b2d4791498a6 100644
--- a/drivers/block/drbd/drbd_nla.c
+++ b/drivers/block/drbd/drbd_nla.c
@@ -1,4 +1,3 @@
1#include "drbd_wrappers.h"
2#include <linux/kernel.h> 1#include <linux/kernel.h>
3#include <net/netlink.h> 2#include <net/netlink.h>
4#include <linux/drbd_genl_api.h> 3#include <linux/drbd_genl_api.h>
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 2f26e8ffa45b..89736bdbbc70 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -116,7 +116,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
116 /* ------------------------ ~18s average ------------------------ */ 116 /* ------------------------ ~18s average ------------------------ */
117 i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; 117 i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
118 dt = (jiffies - device->rs_mark_time[i]) / HZ; 118 dt = (jiffies - device->rs_mark_time[i]) / HZ;
119 if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS)) 119 if (dt > 180)
120 stalled = 1; 120 stalled = 1;
121 121
122 if (!dt) 122 if (!dt)
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index 3c04ec0ea333..2da9104a3851 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -54,6 +54,11 @@ enum drbd_packet {
54 P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ 54 P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
55 P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ 55 P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
56 P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ 56 P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
57 /* 0x2e to 0x30 reserved, used in drbd 9 */
58
59 /* REQ_DISCARD. We used "discard" in different contexts before,
60 * which is why I chose TRIM here, to disambiguate. */
61 P_TRIM = 0x31,
57 62
58 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ 63 P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
59 P_MAX_OPT_CMD = 0x101, 64 P_MAX_OPT_CMD = 0x101,
@@ -119,6 +124,11 @@ struct p_data {
119 u32 dp_flags; 124 u32 dp_flags;
120} __packed; 125} __packed;
121 126
127struct p_trim {
128 struct p_data p_data;
129 u32 size; /* == bio->bi_size */
130} __packed;
131
122/* 132/*
123 * commands which share a struct: 133 * commands which share a struct:
124 * p_block_ack: 134 * p_block_ack:
@@ -150,6 +160,8 @@ struct p_block_req {
150 * ReportParams 160 * ReportParams
151 */ 161 */
152 162
163#define FF_TRIM 1
164
153struct p_connection_features { 165struct p_connection_features {
154 u32 protocol_min; 166 u32 protocol_min;
155 u32 feature_flags; 167 u32 feature_flags;
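
Two wire-format additions appear here: P_TRIM carries the discard size as an extra big-endian u32 behind the usual p_data header, and FF_TRIM is a feature bit in p_connection_features. A minimal receiver-side sketch (helper names are illustrative, not drbd API):

static bool example_peer_supports_trim(u32 agreed_features)
{
        return (agreed_features & FF_TRIM) != 0;
}

static u32 example_trim_bytes(const struct p_trim *trim)
{
        /* the sender stored bio->bi_size here, in network byte order */
        return be32_to_cpu(trim->size);
}
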
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 68e3992e8838..b6c8aaf4931b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -46,9 +46,10 @@
46#include "drbd_int.h" 46#include "drbd_int.h"
47#include "drbd_protocol.h" 47#include "drbd_protocol.h"
48#include "drbd_req.h" 48#include "drbd_req.h"
49
50#include "drbd_vli.h" 49#include "drbd_vli.h"
51 50
51#define PRO_FEATURES (FF_TRIM)
52
52struct packet_info { 53struct packet_info {
53 enum drbd_packet cmd; 54 enum drbd_packet cmd;
54 unsigned int size; 55 unsigned int size;
@@ -65,7 +66,7 @@ enum finish_epoch {
65static int drbd_do_features(struct drbd_connection *connection); 66static int drbd_do_features(struct drbd_connection *connection);
66static int drbd_do_auth(struct drbd_connection *connection); 67static int drbd_do_auth(struct drbd_connection *connection);
67static int drbd_disconnected(struct drbd_peer_device *); 68static int drbd_disconnected(struct drbd_peer_device *);
68 69static void conn_wait_active_ee_empty(struct drbd_connection *connection);
69static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); 70static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
70static int e_end_block(struct drbd_work *, int); 71static int e_end_block(struct drbd_work *, int);
71 72
@@ -234,9 +235,17 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
234 * @retry: whether to retry, if not enough pages are available right now 235 * @retry: whether to retry, if not enough pages are available right now
235 * 236 *
236 * Tries to allocate number pages, first from our own page pool, then from 237 * Tries to allocate number pages, first from our own page pool, then from
237 * the kernel, unless this allocation would exceed the max_buffers setting. 238 * the kernel.
238 * Possibly retry until DRBD frees sufficient pages somewhere else. 239 * Possibly retry until DRBD frees sufficient pages somewhere else.
239 * 240 *
241 * If this allocation would exceed the max_buffers setting, we throttle
242 * allocation (schedule_timeout) to give the system some room to breathe.
243 *
244 * We do not use max-buffers as hard limit, because it could lead to
245 * congestion and further to a distributed deadlock during online-verify or
246 * (checksum based) resync, if the max-buffers, socket buffer sizes and
247 * resync-rate settings are mis-configured.
248 *
240 * Returns a page chain linked via page->private. 249 * Returns a page chain linked via page->private.
241 */ 250 */
242struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, 251struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
@@ -246,10 +255,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
246 struct page *page = NULL; 255 struct page *page = NULL;
247 struct net_conf *nc; 256 struct net_conf *nc;
248 DEFINE_WAIT(wait); 257 DEFINE_WAIT(wait);
249 int mxb; 258 unsigned int mxb;
250 259
251 /* Yes, we may run up to @number over max_buffers. If we
252 * follow it strictly, the admin will get it wrong anyways. */
253 rcu_read_lock(); 260 rcu_read_lock();
254 nc = rcu_dereference(peer_device->connection->net_conf); 261 nc = rcu_dereference(peer_device->connection->net_conf);
255 mxb = nc ? nc->max_buffers : 1000000; 262 mxb = nc ? nc->max_buffers : 1000000;
@@ -277,7 +284,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
277 break; 284 break;
278 } 285 }
279 286
280 schedule(); 287 if (schedule_timeout(HZ/10) == 0)
288 mxb = UINT_MAX;
281 } 289 }
282 finish_wait(&drbd_pp_wait, &wait); 290 finish_wait(&drbd_pp_wait, &wait);
283 291
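
max_buffers thus turns from a hard limit into a soft one: instead of sleeping indefinitely, the allocator naps for at most HZ/10 per round, and once a full nap passes without a wake-up it stops honouring the limit, trading a brief memory overshoot against the distributed deadlock described in the comment above. A sketch of that loop shape only, with try_alloc() as a hypothetical stand-in for the pool/kernel allocation attempt:

static struct page *example_alloc_throttled(struct drbd_device *device,
                                            unsigned int number, unsigned int mxb)
{
        struct page *page = NULL;
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
                page = try_alloc(device, number, mxb);  /* hypothetical helper */
                if (page || signal_pending(current))
                        break;
                /* after ~100ms without progress, stop honouring the soft limit so a
                 * mis-configured max-buffers cannot become a distributed deadlock */
                if (schedule_timeout(HZ / 10) == 0)
                        mxb = UINT_MAX;
        }
        finish_wait(&drbd_pp_wait, &wait);
        return page;
}
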
@@ -331,7 +339,7 @@ You must not have the req_lock:
331 339
332struct drbd_peer_request * 340struct drbd_peer_request *
333drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 341drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
334 unsigned int data_size, gfp_t gfp_mask) __must_hold(local) 342 unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
335{ 343{
336 struct drbd_device *device = peer_device->device; 344 struct drbd_device *device = peer_device->device;
337 struct drbd_peer_request *peer_req; 345 struct drbd_peer_request *peer_req;
@@ -348,7 +356,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
348 return NULL; 356 return NULL;
349 } 357 }
350 358
351 if (data_size) { 359 if (has_payload && data_size) {
352 page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); 360 page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
353 if (!page) 361 if (!page)
354 goto fail; 362 goto fail;
@@ -1026,24 +1034,27 @@ randomize:
1026 if (drbd_send_protocol(connection) == -EOPNOTSUPP) 1034 if (drbd_send_protocol(connection) == -EOPNOTSUPP)
1027 return -1; 1035 return -1;
1028 1036
1037 /* Prevent a race between resync-handshake and
1038 * being promoted to Primary.
1039 *
1040 * Grab and release the state mutex, so we know that any current
1041 * drbd_set_role() is finished, and any incoming drbd_set_role
1042 * will see the STATE_SENT flag, and wait for it to be cleared.
1043 */
1044 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1045 mutex_lock(peer_device->device->state_mutex);
1046
1029 set_bit(STATE_SENT, &connection->flags); 1047 set_bit(STATE_SENT, &connection->flags);
1030 1048
1049 idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
1050 mutex_unlock(peer_device->device->state_mutex);
1051
1031 rcu_read_lock(); 1052 rcu_read_lock();
1032 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1053 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1033 struct drbd_device *device = peer_device->device; 1054 struct drbd_device *device = peer_device->device;
1034 kref_get(&device->kref); 1055 kref_get(&device->kref);
1035 rcu_read_unlock(); 1056 rcu_read_unlock();
1036 1057
1037 /* Prevent a race between resync-handshake and
1038 * being promoted to Primary.
1039 *
1040 * Grab and release the state mutex, so we know that any current
1041 * drbd_set_role() is finished, and any incoming drbd_set_role
1042 * will see the STATE_SENT flag, and wait for it to be cleared.
1043 */
1044 mutex_lock(device->state_mutex);
1045 mutex_unlock(device->state_mutex);
1046
1047 if (discard_my_data) 1058 if (discard_my_data)
1048 set_bit(DISCARD_MY_DATA, &device->flags); 1059 set_bit(DISCARD_MY_DATA, &device->flags);
1049 else 1060 else
@@ -1315,6 +1326,20 @@ int drbd_submit_peer_request(struct drbd_device *device,
1315 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; 1326 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
1316 int err = -ENOMEM; 1327 int err = -ENOMEM;
1317 1328
1329 if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
1330 /* wait for all pending IO completions, before we start
1331 * zeroing things out. */
1332 conn_wait_active_ee_empty(first_peer_device(device)->connection);
1333 if (blkdev_issue_zeroout(device->ldev->backing_bdev,
1334 sector, ds >> 9, GFP_NOIO))
1335 peer_req->flags |= EE_WAS_ERROR;
1336 drbd_endio_write_sec_final(peer_req);
1337 return 0;
1338 }
1339
1340 if (peer_req->flags & EE_IS_TRIM)
1341 nr_pages = 0; /* discards don't have any payload. */
1342
1318 /* In most cases, we will only need one bio. But in case the lower 1343 /* In most cases, we will only need one bio. But in case the lower
1319 * level restrictions happen to be different at this offset on this 1344 * level restrictions happen to be different at this offset on this
1320 * side than those of the sending peer, we may need to submit the 1345 * side than those of the sending peer, we may need to submit the
@@ -1326,7 +1351,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
1326next_bio: 1351next_bio:
1327 bio = bio_alloc(GFP_NOIO, nr_pages); 1352 bio = bio_alloc(GFP_NOIO, nr_pages);
1328 if (!bio) { 1353 if (!bio) {
1329 drbd_err(device, "submit_ee: Allocation of a bio failed\n"); 1354 drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
1330 goto fail; 1355 goto fail;
1331 } 1356 }
1332 /* > peer_req->i.sector, unless this is the first bio */ 1357 /* > peer_req->i.sector, unless this is the first bio */
@@ -1340,6 +1365,11 @@ next_bio:
1340 bios = bio; 1365 bios = bio;
1341 ++n_bios; 1366 ++n_bios;
1342 1367
1368 if (rw & REQ_DISCARD) {
1369 bio->bi_iter.bi_size = ds;
1370 goto submit;
1371 }
1372
1343 page_chain_for_each(page) { 1373 page_chain_for_each(page) {
1344 unsigned len = min_t(unsigned, ds, PAGE_SIZE); 1374 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1345 if (!bio_add_page(bio, page, len, 0)) { 1375 if (!bio_add_page(bio, page, len, 0)) {
@@ -1360,8 +1390,9 @@ next_bio:
1360 sector += len >> 9; 1390 sector += len >> 9;
1361 --nr_pages; 1391 --nr_pages;
1362 } 1392 }
1363 D_ASSERT(device, page == NULL);
1364 D_ASSERT(device, ds == 0); 1393 D_ASSERT(device, ds == 0);
1394submit:
1395 D_ASSERT(device, page == NULL);
1365 1396
1366 atomic_set(&peer_req->pending_bios, n_bios); 1397 atomic_set(&peer_req->pending_bios, n_bios);
1367 do { 1398 do {
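
A replicated discard therefore takes one of two local paths: if the backing queue advertises discard support, it becomes a payload-less REQ_DISCARD bio whose bi_size is set directly (nr_pages is forced to 0); otherwise drbd falls back to explicit zero-out, after waiting for active_ee to drain so no in-flight write races with the zeroing. A condensed sketch of that decision; the real path builds the discard bio itself, blkdev_issue_discard() only stands in here to keep the example short:

static int example_apply_peer_discard(struct drbd_device *device,
                                      sector_t sector, unsigned int ds)
{
        struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);

        if (blk_queue_discard(q))
                /* fast path: let the device discard the range itself */
                return blkdev_issue_discard(device->ldev->backing_bdev,
                                            sector, ds >> 9, GFP_NOIO, 0);

        /* fallback (EE_IS_TRIM_USE_ZEROOUT above): write zeros explicitly */
        return blkdev_issue_zeroout(device->ldev->backing_bdev,
                                    sector, ds >> 9, GFP_NOIO);
}
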
@@ -1490,19 +1521,21 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
1490 * and from receive_Data */ 1521 * and from receive_Data */
1491static struct drbd_peer_request * 1522static struct drbd_peer_request *
1492read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, 1523read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1493 int data_size) __must_hold(local) 1524 struct packet_info *pi) __must_hold(local)
1494{ 1525{
1495 struct drbd_device *device = peer_device->device; 1526 struct drbd_device *device = peer_device->device;
1496 const sector_t capacity = drbd_get_capacity(device->this_bdev); 1527 const sector_t capacity = drbd_get_capacity(device->this_bdev);
1497 struct drbd_peer_request *peer_req; 1528 struct drbd_peer_request *peer_req;
1498 struct page *page; 1529 struct page *page;
1499 int dgs, ds, err; 1530 int dgs, ds, err;
1531 int data_size = pi->size;
1500 void *dig_in = peer_device->connection->int_dig_in; 1532 void *dig_in = peer_device->connection->int_dig_in;
1501 void *dig_vv = peer_device->connection->int_dig_vv; 1533 void *dig_vv = peer_device->connection->int_dig_vv;
1502 unsigned long *data; 1534 unsigned long *data;
1535 struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1503 1536
1504 dgs = 0; 1537 dgs = 0;
1505 if (peer_device->connection->peer_integrity_tfm) { 1538 if (!trim && peer_device->connection->peer_integrity_tfm) {
1506 dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); 1539 dgs = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
1507 /* 1540 /*
1508 * FIXME: Receive the incoming digest into the receive buffer 1541 * FIXME: Receive the incoming digest into the receive buffer
@@ -1514,9 +1547,15 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1514 data_size -= dgs; 1547 data_size -= dgs;
1515 } 1548 }
1516 1549
1550 if (trim) {
1551 D_ASSERT(peer_device, data_size == 0);
1552 data_size = be32_to_cpu(trim->size);
1553 }
1554
1517 if (!expect(IS_ALIGNED(data_size, 512))) 1555 if (!expect(IS_ALIGNED(data_size, 512)))
1518 return NULL; 1556 return NULL;
1519 if (!expect(data_size <= DRBD_MAX_BIO_SIZE)) 1557 /* prepare for larger trim requests. */
1558 if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
1520 return NULL; 1559 return NULL;
1521 1560
1522 /* even though we trust out peer, 1561 /* even though we trust out peer,
@@ -1532,11 +1571,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1532 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 1571 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1533 * "criss-cross" setup, that might cause write-out on some other DRBD, 1572 * "criss-cross" setup, that might cause write-out on some other DRBD,
1534 * which in turn might block on the other node at this very place. */ 1573 * which in turn might block on the other node at this very place. */
1535 peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, GFP_NOIO); 1574 peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
1536 if (!peer_req) 1575 if (!peer_req)
1537 return NULL; 1576 return NULL;
1538 1577
1539 if (!data_size) 1578 if (trim)
1540 return peer_req; 1579 return peer_req;
1541 1580
1542 ds = data_size; 1581 ds = data_size;
@@ -1676,12 +1715,12 @@ static int e_end_resync_block(struct drbd_work *w, int unused)
1676} 1715}
1677 1716
1678static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, 1717static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
1679 int data_size) __releases(local) 1718 struct packet_info *pi) __releases(local)
1680{ 1719{
1681 struct drbd_device *device = peer_device->device; 1720 struct drbd_device *device = peer_device->device;
1682 struct drbd_peer_request *peer_req; 1721 struct drbd_peer_request *peer_req;
1683 1722
1684 peer_req = read_in_block(peer_device, ID_SYNCER, sector, data_size); 1723 peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
1685 if (!peer_req) 1724 if (!peer_req)
1686 goto fail; 1725 goto fail;
1687 1726
@@ -1697,7 +1736,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
1697 list_add(&peer_req->w.list, &device->sync_ee); 1736 list_add(&peer_req->w.list, &device->sync_ee);
1698 spin_unlock_irq(&device->resource->req_lock); 1737 spin_unlock_irq(&device->resource->req_lock);
1699 1738
1700 atomic_add(data_size >> 9, &device->rs_sect_ev); 1739 atomic_add(pi->size >> 9, &device->rs_sect_ev);
1701 if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) 1740 if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
1702 return 0; 1741 return 0;
1703 1742
@@ -1785,7 +1824,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet
1785 /* data is submitted to disk within recv_resync_read. 1824 /* data is submitted to disk within recv_resync_read.
1786 * corresponding put_ldev done below on error, 1825 * corresponding put_ldev done below on error,
1787 * or in drbd_peer_request_endio. */ 1826 * or in drbd_peer_request_endio. */
1788 err = recv_resync_read(peer_device, sector, pi->size); 1827 err = recv_resync_read(peer_device, sector, pi);
1789 } else { 1828 } else {
1790 if (__ratelimit(&drbd_ratelimit_state)) 1829 if (__ratelimit(&drbd_ratelimit_state))
1791 drbd_err(device, "Can not write resync data to local disk.\n"); 1830 drbd_err(device, "Can not write resync data to local disk.\n");
@@ -2196,7 +2235,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2196 */ 2235 */
2197 2236
2198 sector = be64_to_cpu(p->sector); 2237 sector = be64_to_cpu(p->sector);
2199 peer_req = read_in_block(peer_device, p->block_id, sector, pi->size); 2238 peer_req = read_in_block(peer_device, p->block_id, sector, pi);
2200 if (!peer_req) { 2239 if (!peer_req) {
2201 put_ldev(device); 2240 put_ldev(device);
2202 return -EIO; 2241 return -EIO;
@@ -2206,7 +2245,15 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2206 2245
2207 dp_flags = be32_to_cpu(p->dp_flags); 2246 dp_flags = be32_to_cpu(p->dp_flags);
2208 rw |= wire_flags_to_bio(dp_flags); 2247 rw |= wire_flags_to_bio(dp_flags);
2209 if (peer_req->pages == NULL) { 2248 if (pi->cmd == P_TRIM) {
2249 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
2250 peer_req->flags |= EE_IS_TRIM;
2251 if (!blk_queue_discard(q))
2252 peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
2253 D_ASSERT(peer_device, peer_req->i.size > 0);
2254 D_ASSERT(peer_device, rw & REQ_DISCARD);
2255 D_ASSERT(peer_device, peer_req->pages == NULL);
2256 } else if (peer_req->pages == NULL) {
2210 D_ASSERT(device, peer_req->i.size == 0); 2257 D_ASSERT(device, peer_req->i.size == 0);
2211 D_ASSERT(device, dp_flags & DP_FLUSH); 2258 D_ASSERT(device, dp_flags & DP_FLUSH);
2212 } 2259 }
@@ -2242,7 +2289,12 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
2242 update_peer_seq(peer_device, peer_seq); 2289 update_peer_seq(peer_device, peer_seq);
2243 spin_lock_irq(&device->resource->req_lock); 2290 spin_lock_irq(&device->resource->req_lock);
2244 } 2291 }
2245 list_add(&peer_req->w.list, &device->active_ee); 2292 /* if we use the zeroout fallback code, we process synchronously
2293 * and we wait for all pending requests, respectively wait for
2294 * active_ee to become empty in drbd_submit_peer_request();
2295 * better not add ourselves here. */
2296 if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
2297 list_add(&peer_req->w.list, &device->active_ee);
2246 spin_unlock_irq(&device->resource->req_lock); 2298 spin_unlock_irq(&device->resource->req_lock);
2247 2299
2248 if (device->state.conn == C_SYNC_TARGET) 2300 if (device->state.conn == C_SYNC_TARGET)
@@ -2313,39 +2365,45 @@ out_interrupted:
2313 * The current sync rate used here uses only the most recent two step marks, 2365 * The current sync rate used here uses only the most recent two step marks,
2314 * to have a short time average so we can react faster. 2366 * to have a short time average so we can react faster.
2315 */ 2367 */
2316int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector) 2368bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
2317{ 2369{
2318 struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
2319 unsigned long db, dt, dbdt;
2320 struct lc_element *tmp; 2370 struct lc_element *tmp;
2321 int curr_events; 2371 bool throttle = true;
2322 int throttle = 0;
2323 unsigned int c_min_rate;
2324
2325 rcu_read_lock();
2326 c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2327 rcu_read_unlock();
2328 2372
2329 /* feature disabled? */ 2373 if (!drbd_rs_c_min_rate_throttle(device))
2330 if (c_min_rate == 0) 2374 return false;
2331 return 0;
2332 2375
2333 spin_lock_irq(&device->al_lock); 2376 spin_lock_irq(&device->al_lock);
2334 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); 2377 tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
2335 if (tmp) { 2378 if (tmp) {
2336 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 2379 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
2337 if (test_bit(BME_PRIORITY, &bm_ext->flags)) { 2380 if (test_bit(BME_PRIORITY, &bm_ext->flags))
2338 spin_unlock_irq(&device->al_lock); 2381 throttle = false;
2339 return 0;
2340 }
2341 /* Do not slow down if app IO is already waiting for this extent */ 2382 /* Do not slow down if app IO is already waiting for this extent */
2342 } 2383 }
2343 spin_unlock_irq(&device->al_lock); 2384 spin_unlock_irq(&device->al_lock);
2344 2385
2386 return throttle;
2387}
2388
2389bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2390{
2391 struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
2392 unsigned long db, dt, dbdt;
2393 unsigned int c_min_rate;
2394 int curr_events;
2395
2396 rcu_read_lock();
2397 c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2398 rcu_read_unlock();
2399
2400 /* feature disabled? */
2401 if (c_min_rate == 0)
2402 return false;
2403
2345 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 2404 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
2346 (int)part_stat_read(&disk->part0, sectors[1]) - 2405 (int)part_stat_read(&disk->part0, sectors[1]) -
2347 atomic_read(&device->rs_sect_ev); 2406 atomic_read(&device->rs_sect_ev);
2348
2349 if (!device->rs_last_events || curr_events - device->rs_last_events > 64) { 2407 if (!device->rs_last_events || curr_events - device->rs_last_events > 64) {
2350 unsigned long rs_left; 2408 unsigned long rs_left;
2351 int i; 2409 int i;
@@ -2368,12 +2426,11 @@ int drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector)
2368 dbdt = Bit2KB(db/dt); 2426 dbdt = Bit2KB(db/dt);
2369 2427
2370 if (dbdt > c_min_rate) 2428 if (dbdt > c_min_rate)
2371 throttle = 1; 2429 return true;
2372 } 2430 }
2373 return throttle; 2431 return false;
2374} 2432}
2375 2433
2376
2377static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) 2434static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
2378{ 2435{
2379 struct drbd_peer_device *peer_device; 2436 struct drbd_peer_device *peer_device;
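
The throttle logic is thereby split in two: drbd_rs_c_min_rate_throttle() answers only "is application IO currently moving faster than c-min-rate?", while drbd_rs_should_slow_down() keeps the per-extent exception and never throttles a resync extent that application IO is already waiting on (BME_PRIORITY). Put back together as one illustrative function:

static bool example_should_slow_down(struct drbd_device *device, sector_t sector)
{
        struct lc_element *tmp;
        bool throttle = drbd_rs_c_min_rate_throttle(device);

        if (!throttle)
                return false;

        spin_lock_irq(&device->al_lock);
        tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
        if (tmp) {
                struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
                /* application IO already waits for this extent: do not slow it down */
                if (test_bit(BME_PRIORITY, &bm_ext->flags))
                        throttle = false;
        }
        spin_unlock_irq(&device->al_lock);

        return throttle;
}
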
@@ -2436,7 +2493,8 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
2436 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD 2493 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2437 * "criss-cross" setup, that might cause write-out on some other DRBD, 2494 * "criss-cross" setup, that might cause write-out on some other DRBD,
2438 * which in turn might block on the other node at this very place. */ 2495 * which in turn might block on the other node at this very place. */
2439 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, GFP_NOIO); 2496 peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2497 true /* has real payload */, GFP_NOIO);
2440 if (!peer_req) { 2498 if (!peer_req) {
2441 put_ldev(device); 2499 put_ldev(device);
2442 return -ENOMEM; 2500 return -ENOMEM;
@@ -3648,6 +3706,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3648 put_ldev(device); 3706 put_ldev(device);
3649 } 3707 }
3650 3708
3709 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3710 drbd_reconsider_max_bio_size(device);
3711 /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
3712 In case we cleared the QUEUE_FLAG_DISCARD from our queue in
3713 drbd_reconsider_max_bio_size(), we can be sure that after
3714 drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
3715
3651 ddsf = be16_to_cpu(p->dds_flags); 3716 ddsf = be16_to_cpu(p->dds_flags);
3652 if (get_ldev(device)) { 3717 if (get_ldev(device)) {
3653 dd = drbd_determine_dev_size(device, ddsf, NULL); 3718 dd = drbd_determine_dev_size(device, ddsf, NULL);
@@ -3660,9 +3725,6 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
3660 drbd_set_my_capacity(device, p_size); 3725 drbd_set_my_capacity(device, p_size);
3661 } 3726 }
3662 3727
3663 device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3664 drbd_reconsider_max_bio_size(device);
3665
3666 if (get_ldev(device)) { 3728 if (get_ldev(device)) {
3667 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { 3729 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
3668 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); 3730 device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
@@ -4423,6 +4485,7 @@ static struct data_cmd drbd_cmd_handler[] = {
4423 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, 4485 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
4424 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, 4486 [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
4425 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, 4487 [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
4488 [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
4426}; 4489};
4427 4490
4428static void drbdd(struct drbd_connection *connection) 4491static void drbdd(struct drbd_connection *connection)
@@ -4630,6 +4693,7 @@ static int drbd_send_features(struct drbd_connection *connection)
4630 memset(p, 0, sizeof(*p)); 4693 memset(p, 0, sizeof(*p));
4631 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); 4694 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
4632 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); 4695 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
4696 p->feature_flags = cpu_to_be32(PRO_FEATURES);
4633 return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); 4697 return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
4634} 4698}
4635 4699
@@ -4683,10 +4747,14 @@ static int drbd_do_features(struct drbd_connection *connection)
4683 goto incompat; 4747 goto incompat;
4684 4748
4685 connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); 4749 connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
4750 connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
4686 4751
4687 drbd_info(connection, "Handshake successful: " 4752 drbd_info(connection, "Handshake successful: "
4688 "Agreed network protocol version %d\n", connection->agreed_pro_version); 4753 "Agreed network protocol version %d\n", connection->agreed_pro_version);
4689 4754
4755 drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
4756 connection->agreed_features & FF_TRIM ? " " : " not ");
4757
4690 return 1; 4758 return 1;
4691 4759
4692 incompat: 4760 incompat:
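
Feature negotiation stays deliberately simple: each side advertises PRO_FEATURES in the handshake packet, and the agreed set is the bitwise AND of what we support and what the peer sent, so FF_TRIM (and any later flag) is only used when both ends set it. As a sketch:

static u32 example_agree_features(__be32 peer_feature_flags)
{
        /* a feature is usable only if both sides advertise it */
        return PRO_FEATURES & be32_to_cpu(peer_feature_flags);
}

static bool example_can_send_trim(const struct drbd_connection *connection)
{
        return (connection->agreed_features & FF_TRIM) != 0;
}
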
@@ -4778,6 +4846,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
4778 goto fail; 4846 goto fail;
4779 } 4847 }
4780 4848
4849 if (pi.size < CHALLENGE_LEN) {
4850 drbd_err(connection, "AuthChallenge payload too small.\n");
4851 rv = -1;
4852 goto fail;
4853 }
4854
4781 peers_ch = kmalloc(pi.size, GFP_NOIO); 4855 peers_ch = kmalloc(pi.size, GFP_NOIO);
4782 if (peers_ch == NULL) { 4856 if (peers_ch == NULL) {
4783 drbd_err(connection, "kmalloc of peers_ch failed\n"); 4857 drbd_err(connection, "kmalloc of peers_ch failed\n");
@@ -4791,6 +4865,12 @@ static int drbd_do_auth(struct drbd_connection *connection)
4791 goto fail; 4865 goto fail;
4792 } 4866 }
4793 4867
4868 if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
4869 drbd_err(connection, "Peer presented the same challenge!\n");
4870 rv = -1;
4871 goto fail;
4872 }
4873
4794 resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); 4874 resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
4795 response = kmalloc(resp_size, GFP_NOIO); 4875 response = kmalloc(resp_size, GFP_NOIO);
4796 if (response == NULL) { 4876 if (response == NULL) {
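
Both new checks guard the challenge-response exchange: an undersized AuthChallenge would otherwise drive a short read and compare from pi.size, and a peer that mirrors our own challenge back could simply echo our HMAC answer, the classic reflection attack. As one illustrative predicate (not a drbd function):

static bool example_challenge_acceptable(const void *peers_ch, unsigned int size,
                                         const void *my_challenge)
{
        if (size < CHALLENGE_LEN)
                return false;           /* payload too small */
        if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN))
                return false;           /* peer reflected our own challenge */
        return true;
}
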
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3779c8d2875b..09803d0d5207 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -522,6 +522,13 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
522 mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); 522 mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
523 break; 523 break;
524 524
525 case DISCARD_COMPLETED_NOTSUPP:
526 case DISCARD_COMPLETED_WITH_ERROR:
527 /* I'd rather not detach from local disk just because it
528 * failed a REQ_DISCARD. */
529 mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
530 break;
531
525 case QUEUE_FOR_NET_READ: 532 case QUEUE_FOR_NET_READ:
526 /* READ or READA, and 533 /* READ or READA, and
527 * no local disk, 534 * no local disk,
@@ -1235,6 +1242,7 @@ void do_submit(struct work_struct *ws)
1235 if (list_empty(&incoming)) 1242 if (list_empty(&incoming))
1236 break; 1243 break;
1237 1244
1245skip_fast_path:
1238 wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending)); 1246 wait_event(device->al_wait, prepare_al_transaction_nonblock(device, &incoming, &pending));
1239 /* Maybe more was queued, while we prepared the transaction? 1247 /* Maybe more was queued, while we prepared the transaction?
1240 * Try to stuff them into this transaction as well. 1248 * Try to stuff them into this transaction as well.
@@ -1273,6 +1281,25 @@ void do_submit(struct work_struct *ws)
1273 list_del_init(&req->tl_requests); 1281 list_del_init(&req->tl_requests);
1274 drbd_send_and_submit(device, req); 1282 drbd_send_and_submit(device, req);
1275 } 1283 }
1284
1285 /* If all currently hot activity log extents are kept busy by
1286 * incoming requests, we still must not totally starve new
1287 * requests to cold extents. In that case, prepare one request
1288 * in blocking mode. */
1289 list_for_each_entry_safe(req, tmp, &incoming, tl_requests) {
1290 list_del_init(&req->tl_requests);
1291 req->rq_state |= RQ_IN_ACT_LOG;
1292 if (!drbd_al_begin_io_prepare(device, &req->i)) {
1293 /* Corresponding extent was hot after all? */
1294 drbd_send_and_submit(device, req);
1295 } else {
1296 /* Found a request to a cold extent.
1297 * Put on "pending" list,
1298 * and try to cumulate with more. */
1299 list_add(&req->tl_requests, &pending);
1300 goto skip_fast_path;
1301 }
1302 }
1276 } 1303 }
1277} 1304}
1278 1305
@@ -1326,23 +1353,35 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
1326 return limit; 1353 return limit;
1327} 1354}
1328 1355
1329static struct drbd_request *find_oldest_request(struct drbd_connection *connection) 1356static void find_oldest_requests(
1357 struct drbd_connection *connection,
1358 struct drbd_device *device,
1359 struct drbd_request **oldest_req_waiting_for_peer,
1360 struct drbd_request **oldest_req_waiting_for_disk)
1330{ 1361{
1331 /* Walk the transfer log,
1332 * and find the oldest not yet completed request */
1333 struct drbd_request *r; 1362 struct drbd_request *r;
1363 *oldest_req_waiting_for_peer = NULL;
1364 *oldest_req_waiting_for_disk = NULL;
1334 list_for_each_entry(r, &connection->transfer_log, tl_requests) { 1365 list_for_each_entry(r, &connection->transfer_log, tl_requests) {
1335 if (atomic_read(&r->completion_ref)) 1366 const unsigned s = r->rq_state;
1336 return r; 1367 if (!*oldest_req_waiting_for_peer
1368 && ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE)))
1369 *oldest_req_waiting_for_peer = r;
1370
1371 if (!*oldest_req_waiting_for_disk
1372 && (s & RQ_LOCAL_PENDING) && r->device == device)
1373 *oldest_req_waiting_for_disk = r;
1374
1375 if (*oldest_req_waiting_for_peer && *oldest_req_waiting_for_disk)
1376 break;
1337 } 1377 }
1338 return NULL;
1339} 1378}
1340 1379
1341void request_timer_fn(unsigned long data) 1380void request_timer_fn(unsigned long data)
1342{ 1381{
1343 struct drbd_device *device = (struct drbd_device *) data; 1382 struct drbd_device *device = (struct drbd_device *) data;
1344 struct drbd_connection *connection = first_peer_device(device)->connection; 1383 struct drbd_connection *connection = first_peer_device(device)->connection;
1345 struct drbd_request *req; /* oldest request */ 1384 struct drbd_request *req_disk, *req_peer; /* oldest request */
1346 struct net_conf *nc; 1385 struct net_conf *nc;
1347 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ 1386 unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
1348 unsigned long now; 1387 unsigned long now;
@@ -1366,8 +1405,8 @@ void request_timer_fn(unsigned long data)
1366 now = jiffies; 1405 now = jiffies;
1367 1406
1368 spin_lock_irq(&device->resource->req_lock); 1407 spin_lock_irq(&device->resource->req_lock);
1369 req = find_oldest_request(connection); 1408 find_oldest_requests(connection, device, &req_peer, &req_disk);
1370 if (!req) { 1409 if (req_peer == NULL && req_disk == NULL) {
1371 spin_unlock_irq(&device->resource->req_lock); 1410 spin_unlock_irq(&device->resource->req_lock);
1372 mod_timer(&device->request_timer, now + et); 1411 mod_timer(&device->request_timer, now + et);
1373 return; 1412 return;
@@ -1389,19 +1428,26 @@ void request_timer_fn(unsigned long data)
1389 * ~198 days with 250 HZ, we have a window where the timeout would need 1428 * ~198 days with 250 HZ, we have a window where the timeout would need
1390 * to expire twice (worst case) to become effective. Good enough. 1429 * to expire twice (worst case) to become effective. Good enough.
1391 */ 1430 */
1392 if (ent && req->rq_state & RQ_NET_PENDING && 1431 if (ent && req_peer &&
1393 time_after(now, req->start_time + ent) && 1432 time_after(now, req_peer->start_time + ent) &&
1394 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { 1433 !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
1395 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); 1434 drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
1396 _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); 1435 _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
1397 } 1436 }
1398 if (dt && req->rq_state & RQ_LOCAL_PENDING && req->device == device && 1437 if (dt && req_disk &&
1399 time_after(now, req->start_time + dt) && 1438 time_after(now, req_disk->start_time + dt) &&
1400 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { 1439 !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
1401 drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); 1440 drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
1402 __drbd_chk_io_error(device, DRBD_FORCE_DETACH); 1441 __drbd_chk_io_error(device, DRBD_FORCE_DETACH);
1403 } 1442 }
1404 nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; 1443
1444 /* Reschedule timer for the nearest not already expired timeout.
1445 * Fallback to now + min(effective network timeout, disk timeout). */
1446 ent = (ent && req_peer && time_before(now, req_peer->start_time + ent))
1447 ? req_peer->start_time + ent : now + et;
1448 dt = (dt && req_disk && time_before(now, req_disk->start_time + dt))
1449 ? req_disk->start_time + dt : now + et;
1450 nt = time_before(ent, dt) ? ent : dt;
1405 spin_unlock_irq(&connection->resource->req_lock); 1451 spin_unlock_irq(&connection->resource->req_lock);
1406 mod_timer(&device->request_timer, nt); 1452 mod_timer(&device->request_timer, nt);
1407} 1453}
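
The timer is then re-armed for the nearest deadline that has not yet expired, tracked separately for the oldest request still waiting on the peer and the oldest still waiting on the local disk; a slot with no watched request, or one already past due, falls back to now + effective timeout. The rule in isolation, as an illustrative helper using the same jiffies arithmetic as the hunk:

static unsigned long example_next_timer(unsigned long now, unsigned long et,
                                        unsigned long ent, struct drbd_request *req_peer,
                                        unsigned long dt, struct drbd_request *req_disk)
{
        unsigned long peer_deadline = (ent && req_peer &&
                                       time_before(now, req_peer->start_time + ent))
                ? req_peer->start_time + ent : now + et;
        unsigned long disk_deadline = (dt && req_disk &&
                                       time_before(now, req_disk->start_time + dt))
                ? req_disk->start_time + dt : now + et;

        return time_before(peer_deadline, disk_deadline) ? peer_deadline : disk_deadline;
}
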
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index c684c963538e..8566cd5866b4 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -30,7 +30,6 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/drbd.h> 31#include <linux/drbd.h>
32#include "drbd_int.h" 32#include "drbd_int.h"
33#include "drbd_wrappers.h"
34 33
35/* The request callbacks will be called in irq context by the IDE drivers, 34/* The request callbacks will be called in irq context by the IDE drivers,
36 and in Softirqs/Tasklets/BH context by the SCSI drivers, 35 and in Softirqs/Tasklets/BH context by the SCSI drivers,
@@ -111,11 +110,14 @@ enum drbd_req_event {
111 BARRIER_ACKED, /* in protocol A and B */ 110 BARRIER_ACKED, /* in protocol A and B */
112 DATA_RECEIVED, /* (remote read) */ 111 DATA_RECEIVED, /* (remote read) */
113 112
113 COMPLETED_OK,
114 READ_COMPLETED_WITH_ERROR, 114 READ_COMPLETED_WITH_ERROR,
115 READ_AHEAD_COMPLETED_WITH_ERROR, 115 READ_AHEAD_COMPLETED_WITH_ERROR,
116 WRITE_COMPLETED_WITH_ERROR, 116 WRITE_COMPLETED_WITH_ERROR,
117 DISCARD_COMPLETED_NOTSUPP,
118 DISCARD_COMPLETED_WITH_ERROR,
119
117 ABORT_DISK_IO, 120 ABORT_DISK_IO,
118 COMPLETED_OK,
119 RESEND, 121 RESEND,
120 FAIL_FROZEN_DISK_IO, 122 FAIL_FROZEN_DISK_IO,
121 RESTART_FROZEN_DISK_IO, 123 RESTART_FROZEN_DISK_IO,
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 1a84345a3868..a5d8aae00e04 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -54,8 +54,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
54static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); 54static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
55static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); 55static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
56static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); 56static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
57static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, 57static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
58 enum sanitize_state_warnings *warn); 58 union drbd_state ns, enum sanitize_state_warnings *warn);
59 59
60static inline bool is_susp(union drbd_state s) 60static inline bool is_susp(union drbd_state s)
61{ 61{
@@ -287,7 +287,7 @@ _req_st_cond(struct drbd_device *device, union drbd_state mask,
287 287
288 spin_lock_irqsave(&device->resource->req_lock, flags); 288 spin_lock_irqsave(&device->resource->req_lock, flags);
289 os = drbd_read_state(device); 289 os = drbd_read_state(device);
290 ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); 290 ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
291 rv = is_valid_transition(os, ns); 291 rv = is_valid_transition(os, ns);
292 if (rv >= SS_SUCCESS) 292 if (rv >= SS_SUCCESS)
293 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ 293 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
@@ -333,7 +333,7 @@ drbd_req_state(struct drbd_device *device, union drbd_state mask,
333 333
334 spin_lock_irqsave(&device->resource->req_lock, flags); 334 spin_lock_irqsave(&device->resource->req_lock, flags);
335 os = drbd_read_state(device); 335 os = drbd_read_state(device);
336 ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); 336 ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
337 rv = is_valid_transition(os, ns); 337 rv = is_valid_transition(os, ns);
338 if (rv < SS_SUCCESS) { 338 if (rv < SS_SUCCESS) {
339 spin_unlock_irqrestore(&device->resource->req_lock, flags); 339 spin_unlock_irqrestore(&device->resource->req_lock, flags);
@@ -740,8 +740,8 @@ static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_st
740 * When we loose connection, we have to set the state of the peers disk (pdsk) 740 * When we loose connection, we have to set the state of the peers disk (pdsk)
741 * to D_UNKNOWN. This rule and many more along those lines are in this function. 741 * to D_UNKNOWN. This rule and many more along those lines are in this function.
742 */ 742 */
743static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state ns, 743static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
744 enum sanitize_state_warnings *warn) 744 union drbd_state ns, enum sanitize_state_warnings *warn)
745{ 745{
746 enum drbd_fencing_p fp; 746 enum drbd_fencing_p fp;
747 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; 747 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
@@ -882,11 +882,13 @@ static union drbd_state sanitize_state(struct drbd_device *device, union drbd_st
882 } 882 }
883 883
884 if (fp == FP_STONITH && 884 if (fp == FP_STONITH &&
885 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED)) 885 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
886 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
886 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ 887 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
887 888
888 if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && 889 if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
889 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) 890 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
891 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
890 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ 892 ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
891 893
892 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { 894 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
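
Both hunks apply the same edge-trigger idea: IO is suspended only when the triggering condition becomes true in the new state while it was not yet true in the old one, presumably so that re-sanitizing an already-degraded state cannot keep re-asserting a suspend that has since been resolved. Schematically, with condition_holds() as a placeholder predicate:

static bool example_assert_suspend(union drbd_state os, union drbd_state ns)
{
        /* act on the transition, not on the steady state */
        return condition_holds(ns) && !condition_holds(os);
}
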
@@ -958,7 +960,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns,
958 960
959 os = drbd_read_state(device); 961 os = drbd_read_state(device);
960 962
961 ns = sanitize_state(device, ns, &ssw); 963 ns = sanitize_state(device, os, ns, &ssw);
962 if (ns.i == os.i) 964 if (ns.i == os.i)
963 return SS_NOTHING_TO_DO; 965 return SS_NOTHING_TO_DO;
964 966
@@ -1656,7 +1658,7 @@ conn_is_valid_transition(struct drbd_connection *connection, union drbd_state ma
1656 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { 1658 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1657 struct drbd_device *device = peer_device->device; 1659 struct drbd_device *device = peer_device->device;
1658 os = drbd_read_state(device); 1660 os = drbd_read_state(device);
1659 ns = sanitize_state(device, apply_mask_val(os, mask, val), NULL); 1661 ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
1660 1662
1661 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) 1663 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
1662 ns.disk = os.disk; 1664 ns.disk = os.disk;
@@ -1718,7 +1720,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
1718 number_of_volumes++; 1720 number_of_volumes++;
1719 os = drbd_read_state(device); 1721 os = drbd_read_state(device);
1720 ns = apply_mask_val(os, mask, val); 1722 ns = apply_mask_val(os, mask, val);
1721 ns = sanitize_state(device, ns, NULL); 1723 ns = sanitize_state(device, os, ns, NULL);
1722 1724
1723 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) 1725 if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
1724 ns.disk = os.disk; 1726 ns.disk = os.disk;
@@ -1763,19 +1765,19 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
1763static enum drbd_state_rv 1765static enum drbd_state_rv
1764_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) 1766_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
1765{ 1767{
1766 enum drbd_state_rv rv; 1768 enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;
1767 1769
1768 if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) 1770 if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
1769 return SS_CW_SUCCESS; 1771 rv = SS_CW_SUCCESS;
1770 1772
1771 if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) 1773 if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
1772 return SS_CW_FAILED_BY_PEER; 1774 rv = SS_CW_FAILED_BY_PEER;
1773 1775
1774 rv = conn_is_valid_transition(connection, mask, val, 0); 1776 err = conn_is_valid_transition(connection, mask, val, 0);
1775 if (rv == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) 1777 if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
1776 rv = SS_UNKNOWN_ERROR; /* continue waiting */ 1778 return rv;
1777 1779
1778 return rv; 1780 return err;
1779} 1781}
1780 1782
1781enum drbd_state_rv 1783enum drbd_state_rv
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 2c4ce42c3657..d8f57b6305cd 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele
118 118
119/* writes on behalf of the partner, or resync writes, 119/* writes on behalf of the partner, or resync writes,
120 * "submitted" by the receiver, final stage. */ 120 * "submitted" by the receiver, final stage. */
121static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) 121void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
122{ 122{
123 unsigned long flags = 0; 123 unsigned long flags = 0;
124 struct drbd_peer_device *peer_device = peer_req->peer_device; 124 struct drbd_peer_device *peer_device = peer_req->peer_device;
@@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel
150 150
151 do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); 151 do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
152 152
153 if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) 153 /* FIXME do we want to detach for failed REQ_DISCARD?
154 * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
155 if (peer_req->flags & EE_WAS_ERROR)
154 __drbd_chk_io_error(device, DRBD_WRITE_ERROR); 156 __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
155 spin_unlock_irqrestore(&device->resource->req_lock, flags); 157 spin_unlock_irqrestore(&device->resource->req_lock, flags);
156 158
@@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error)
176 struct drbd_device *device = peer_req->peer_device->device; 178 struct drbd_device *device = peer_req->peer_device->device;
177 int uptodate = bio_flagged(bio, BIO_UPTODATE); 179 int uptodate = bio_flagged(bio, BIO_UPTODATE);
178 int is_write = bio_data_dir(bio) == WRITE; 180 int is_write = bio_data_dir(bio) == WRITE;
181 int is_discard = !!(bio->bi_rw & REQ_DISCARD);
179 182
180 if (error && __ratelimit(&drbd_ratelimit_state)) 183 if (error && __ratelimit(&drbd_ratelimit_state))
181 drbd_warn(device, "%s: error=%d s=%llus\n", 184 drbd_warn(device, "%s: error=%d s=%llus\n",
182 is_write ? "write" : "read", error, 185 is_write ? (is_discard ? "discard" : "write")
186 : "read", error,
183 (unsigned long long)peer_req->i.sector); 187 (unsigned long long)peer_req->i.sector);
184 if (!error && !uptodate) { 188 if (!error && !uptodate) {
185 if (__ratelimit(&drbd_ratelimit_state)) 189 if (__ratelimit(&drbd_ratelimit_state))
@@ -263,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error)
263 267
264 /* to avoid recursion in __req_mod */ 268 /* to avoid recursion in __req_mod */
265 if (unlikely(error)) { 269 if (unlikely(error)) {
266 what = (bio_data_dir(bio) == WRITE) 270 if (bio->bi_rw & REQ_DISCARD)
271 what = (error == -EOPNOTSUPP)
272 ? DISCARD_COMPLETED_NOTSUPP
273 : DISCARD_COMPLETED_WITH_ERROR;
274 else
275 what = (bio_data_dir(bio) == WRITE)
267 ? WRITE_COMPLETED_WITH_ERROR 276 ? WRITE_COMPLETED_WITH_ERROR
268 : (bio_rw(bio) == READ) 277 : (bio_rw(bio) == READ)
269 ? READ_COMPLETED_WITH_ERROR 278 ? READ_COMPLETED_WITH_ERROR
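
Discard failures are thus classified separately from write failures: -EOPNOTSUPP maps to DISCARD_COMPLETED_NOTSUPP, anything else to DISCARD_COMPLETED_WITH_ERROR, and per the __req_mod hunk earlier both complete the local request without detaching the backing disk. The mapping on its own:

static enum drbd_req_event example_discard_completion(int error)
{
        return error == -EOPNOTSUPP ? DISCARD_COMPLETED_NOTSUPP
                                    : DISCARD_COMPLETED_WITH_ERROR;
}
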
@@ -395,7 +404,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
395 /* GFP_TRY, because if there is no memory available right now, this may 404 /* GFP_TRY, because if there is no memory available right now, this may
396 * be rescheduled for later. It is "only" background resync, after all. */ 405 * be rescheduled for later. It is "only" background resync, after all. */
397 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, 406 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
398 size, GFP_TRY); 407 size, true /* has real payload */, GFP_TRY);
399 if (!peer_req) 408 if (!peer_req)
400 goto defer; 409 goto defer;
401 410
@@ -492,10 +501,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
492 return fb; 501 return fb;
493} 502}
494 503
495static int drbd_rs_controller(struct drbd_device *device) 504static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
496{ 505{
497 struct disk_conf *dc; 506 struct disk_conf *dc;
498 unsigned int sect_in; /* Number of sectors that came in since the last turn */
499 unsigned int want; /* The number of sectors we want in the proxy */ 507 unsigned int want; /* The number of sectors we want in the proxy */
500 int req_sect; /* Number of sectors to request in this turn */ 508 int req_sect; /* Number of sectors to request in this turn */
501 int correction; /* Number of sectors more we need in the proxy*/ 509 int correction; /* Number of sectors more we need in the proxy*/
@@ -505,9 +513,6 @@ static int drbd_rs_controller(struct drbd_device *device)
505 int max_sect; 513 int max_sect;
506 struct fifo_buffer *plan; 514 struct fifo_buffer *plan;
507 515
508 sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */
509 device->rs_in_flight -= sect_in;
510
511 dc = rcu_dereference(device->ldev->disk_conf); 516 dc = rcu_dereference(device->ldev->disk_conf);
512 plan = rcu_dereference(device->rs_plan_s); 517 plan = rcu_dereference(device->rs_plan_s);
513 518
@@ -550,11 +555,16 @@ static int drbd_rs_controller(struct drbd_device *device)
550 555
551static int drbd_rs_number_requests(struct drbd_device *device) 556static int drbd_rs_number_requests(struct drbd_device *device)
552{ 557{
553 int number; 558 unsigned int sect_in; /* Number of sectors that came in since the last turn */
559 int number, mxb;
560
561 sect_in = atomic_xchg(&device->rs_sect_in, 0);
562 device->rs_in_flight -= sect_in;
554 563
555 rcu_read_lock(); 564 rcu_read_lock();
565 mxb = drbd_get_max_buffers(device) / 2;
556 if (rcu_dereference(device->rs_plan_s)->size) { 566 if (rcu_dereference(device->rs_plan_s)->size) {
557 number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9); 567 number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
558 device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; 568 device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
559 } else { 569 } else {
560 device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; 570 device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
@@ -562,8 +572,14 @@ static int drbd_rs_number_requests(struct drbd_device *device)
562 } 572 }
563 rcu_read_unlock(); 573 rcu_read_unlock();
564 574
565 /* ignore the amount of pending requests, the resync controller should 575 /* Don't have more than "max-buffers"/2 in-flight.
566 * throttle down to incoming reply rate soon enough anyways. */ 576 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
577 * potentially causing a distributed deadlock on congestion during
578 * online-verify or (checksum-based) resync, if max-buffers,
579 * socket buffer sizes and resync rate settings are mis-configured. */
580 if (mxb - device->rs_in_flight < number)
581 number = mxb - device->rs_in_flight;
582
567 return number; 583 return number;
568} 584}
569 585
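
The resync requester now also caps itself at max-buffers/2 requests in flight instead of trusting the rate controller alone: with mis-tuned max-buffers, socket buffers and resync rate, unbounded in-flight resync requests can pin the peer in drbd_alloc_pages() and deadlock the pair. The cap in isolation (illustrative helper; a non-positive result means "requeue and retry later", which is what the number <= 0 check in the next hunk does):

static int example_cap_resync_requests(int wanted, int max_buffers, int in_flight)
{
        int mxb = max_buffers / 2;

        if (mxb - in_flight < wanted)
                wanted = mxb - in_flight;       /* may become <= 0 */
        return wanted;
}
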
@@ -597,7 +613,7 @@ static int make_resync_request(struct drbd_device *device, int cancel)
597 613
598 max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; 614 max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
599 number = drbd_rs_number_requests(device); 615 number = drbd_rs_number_requests(device);
600 if (number == 0) 616 if (number <= 0)
601 goto requeue; 617 goto requeue;
602 618
603 for (i = 0; i < number; i++) { 619 for (i = 0; i < number; i++) {
@@ -647,7 +663,7 @@ next_sector:
647 */ 663 */
648 align = 1; 664 align = 1;
649 rollback_i = i; 665 rollback_i = i;
650 for (;;) { 666 while (i < number) {
651 if (size + BM_BLOCK_SIZE > max_bio_size) 667 if (size + BM_BLOCK_SIZE > max_bio_size)
652 break; 668 break;
653 669
@@ -1670,11 +1686,15 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1670 } 1686 }
1671 clear_bit(B_RS_H_DONE, &device->flags); 1687 clear_bit(B_RS_H_DONE, &device->flags);
1672 1688
1673 write_lock_irq(&global_state_lock); 1689 /* req_lock: serialize with drbd_send_and_submit() and others
1690 * global_state_lock: for stable sync-after dependencies */
1691 spin_lock_irq(&device->resource->req_lock);
1692 write_lock(&global_state_lock);
1674 /* Did some connection breakage or IO error race with us? */ 1693 /* Did some connection breakage or IO error race with us? */
1675 if (device->state.conn < C_CONNECTED 1694 if (device->state.conn < C_CONNECTED
1676 || !get_ldev_if_state(device, D_NEGOTIATING)) { 1695 || !get_ldev_if_state(device, D_NEGOTIATING)) {
1677 write_unlock_irq(&global_state_lock); 1696 write_unlock(&global_state_lock);
1697 spin_unlock_irq(&device->resource->req_lock);
1678 mutex_unlock(device->state_mutex); 1698 mutex_unlock(device->state_mutex);
1679 return; 1699 return;
1680 } 1700 }
@@ -1714,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1714 } 1734 }
1715 _drbd_pause_after(device); 1735 _drbd_pause_after(device);
1716 } 1736 }
1717 write_unlock_irq(&global_state_lock); 1737 write_unlock(&global_state_lock);
1738 spin_unlock_irq(&device->resource->req_lock);
1718 1739
1719 if (r == SS_SUCCESS) { 1740 if (r == SS_SUCCESS) {
1720 /* reset rs_last_bcast when a resync or verify is started, 1741 /* reset rs_last_bcast when a resync or verify is started,
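
These two hunks nest the locks in drbd_start_resync(): the resource's req_lock is taken first with interrupts disabled, and global_state_lock is taken inside it without a second irq flip. A hedged outline of the resulting ordering, with the state work elided:

	/* outer: serializes with drbd_send_and_submit() and friends */
	spin_lock_irq(&device->resource->req_lock);
	/* inner: keeps sync-after dependencies stable */
	write_lock(&global_state_lock);

	/* ... connection/ldev checks, state change, _drbd_pause_after() ... */

	write_unlock(&global_state_lock);
	spin_unlock_irq(&device->resource->req_lock);

Unlocking strictly in reverse keeps the old write_lock_irq()/write_unlock_irq() semantics while adding the req_lock serialization.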
@@ -1778,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
1778 mutex_unlock(device->state_mutex); 1799 mutex_unlock(device->state_mutex);
1779} 1800}
1780 1801
1781/* If the resource already closed the current epoch, but we did not
1782 * (because we have not yet seen new requests), we should send the
1783 * corresponding barrier now. Must be checked within the same spinlock
1784 * that is used to check for new requests. */
1785static bool need_to_send_barrier(struct drbd_connection *connection)
1786{
1787 if (!connection->send.seen_any_write_yet)
1788 return false;
1789
1790 /* Skip barriers that do not contain any writes.
1791 * This may happen during AHEAD mode. */
1792 if (!connection->send.current_epoch_writes)
1793 return false;
1794
1795 /* ->req_lock is held when requests are queued on
1796 * connection->sender_work, and put into ->transfer_log.
1797 * It is also held when ->current_tle_nr is increased.
1798 * So either there are already new requests queued,
1799 * and corresponding barriers will be send there.
1800 * Or nothing new is queued yet, so the difference will be 1.
1801 */
1802 if (atomic_read(&connection->current_tle_nr) !=
1803 connection->send.current_epoch_nr + 1)
1804 return false;
1805
1806 return true;
1807}
1808
1809static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) 1802static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
1810{ 1803{
1811 spin_lock_irq(&queue->q_lock); 1804 spin_lock_irq(&queue->q_lock);
@@ -1864,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
1864 spin_unlock_irq(&connection->resource->req_lock); 1857 spin_unlock_irq(&connection->resource->req_lock);
1865 break; 1858 break;
1866 } 1859 }
1867 send_barrier = need_to_send_barrier(connection); 1860
1861 /* We found nothing new to do, no to-be-communicated request,
1862 * no other work item. We may still need to close the last
1863 * epoch. Next incoming request epoch will be connection ->
1864 * current transfer log epoch number. If that is different
1865 * from the epoch of the last request we communicated, it is
1866 * safe to send the epoch separating barrier now.
1867 */
1868 send_barrier =
1869 atomic_read(&connection->current_tle_nr) !=
1870 connection->send.current_epoch_nr;
1868 spin_unlock_irq(&connection->resource->req_lock); 1871 spin_unlock_irq(&connection->resource->req_lock);
1869 if (send_barrier) { 1872
1870 drbd_send_barrier(connection); 1873 if (send_barrier)
1871 connection->send.current_epoch_nr++; 1874 maybe_send_barrier(connection,
1872 } 1875 connection->send.current_epoch_nr + 1);
1873 schedule(); 1876 schedule();
1874 /* may be woken up for other things but new work, too, 1877 /* may be woken up for other things but new work, too,
1875 * e.g. if the current epoch got closed. 1878 * e.g. if the current epoch got closed.
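
With need_to_send_barrier() removed, wait_for_work() compares the transfer-log epoch counters directly and leaves the bookkeeping to maybe_send_barrier(). That helper's body is not part of this section, so the following is an assumed shape inferred from the deleted checks above, not the literal kernel code:

/* Assumed: send the barrier closing epoch "epoch - 1" only if a write
 * was ever seen and the epoch actually advanced; skip write-less epochs. */
static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
{
	if (!connection->send.seen_any_write_yet)
		return;
	if (connection->send.current_epoch_nr != epoch) {
		if (connection->send.current_epoch_writes)
			drbd_send_barrier(connection);
		connection->send.current_epoch_nr = epoch;
	}
}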
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
deleted file mode 100644
index 3db9ebaf64f6..000000000000
--- a/drivers/block/drbd/drbd_wrappers.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef _DRBD_WRAPPERS_H
2#define _DRBD_WRAPPERS_H
3
4#include <linux/ctype.h>
5#include <linux/mm.h>
6#include "drbd_int.h"
7
8/* see get_sb_bdev and bd_claim */
9extern char *drbd_sec_holder;
10
11/* sets the number of 512 byte sectors of our virtual device */
12static inline void drbd_set_my_capacity(struct drbd_device *device,
13 sector_t size)
14{
15 /* set_capacity(device->this_bdev->bd_disk, size); */
16 set_capacity(device->vdisk, size);
17 device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
18}
19
20#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
21
22/* bi_end_io handlers */
23extern void drbd_md_io_complete(struct bio *bio, int error);
24extern void drbd_peer_request_endio(struct bio *bio, int error);
25extern void drbd_request_endio(struct bio *bio, int error);
26
27/*
28 * used to submit our private bio
29 */
30static inline void drbd_generic_make_request(struct drbd_device *device,
31 int fault_type, struct bio *bio)
32{
33 __release(local);
34 if (!bio->bi_bdev) {
35 printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
36 "bio->bi_bdev == NULL\n",
37 device_to_minor(device));
38 dump_stack();
39 bio_endio(bio, -ENODEV);
40 return;
41 }
42
43 if (drbd_insert_fault(device, fault_type))
44 bio_endio(bio, -EIO);
45 else
46 generic_make_request(bio);
47}
48
49#ifndef __CHECKER__
50# undef __cond_lock
51# define __cond_lock(x,c) (c)
52#endif
53
54#endif
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8f5565bf34cd..8e767bb7995e 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2351,7 +2351,7 @@ static void rw_interrupt(void)
2351 } 2351 }
2352 2352
2353 if (CT(COMMAND) != FD_READ || 2353 if (CT(COMMAND) != FD_READ ||
2354 raw_cmd->kernel_data == current_req->buffer) { 2354 raw_cmd->kernel_data == bio_data(current_req->bio)) {
2355 /* transfer directly from buffer */ 2355 /* transfer directly from buffer */
2356 cont->done(1); 2356 cont->done(1);
2357 } else if (CT(COMMAND) == FD_READ) { 2357 } else if (CT(COMMAND) == FD_READ) {
@@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void)
2640 raw_cmd->flags &= ~FD_RAW_WRITE; 2640 raw_cmd->flags &= ~FD_RAW_WRITE;
2641 raw_cmd->flags |= FD_RAW_READ; 2641 raw_cmd->flags |= FD_RAW_READ;
2642 COMMAND = FM_MODE(_floppy, FD_READ); 2642 COMMAND = FM_MODE(_floppy, FD_READ);
2643 } else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) { 2643 } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) {
2644 unsigned long dma_limit; 2644 unsigned long dma_limit;
2645 int direct, indirect; 2645 int direct, indirect;
2646 2646
@@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void)
2654 */ 2654 */
2655 max_size = buffer_chain_size(); 2655 max_size = buffer_chain_size();
2656 dma_limit = (MAX_DMA_ADDRESS - 2656 dma_limit = (MAX_DMA_ADDRESS -
2657 ((unsigned long)current_req->buffer)) >> 9; 2657 ((unsigned long)bio_data(current_req->bio))) >> 9;
2658 if ((unsigned long)max_size > dma_limit) 2658 if ((unsigned long)max_size > dma_limit)
2659 max_size = dma_limit; 2659 max_size = dma_limit;
2660 /* 64 kb boundaries */ 2660 /* 64 kb boundaries */
2661 if (CROSS_64KB(current_req->buffer, max_size << 9)) 2661 if (CROSS_64KB(bio_data(current_req->bio), max_size << 9))
2662 max_size = (K_64 - 2662 max_size = (K_64 -
2663 ((unsigned long)current_req->buffer) % 2663 ((unsigned long)bio_data(current_req->bio)) %
2664 K_64) >> 9; 2664 K_64) >> 9;
2665 direct = transfer_size(ssize, max_sector, max_size) - fsector_t; 2665 direct = transfer_size(ssize, max_sector, max_size) - fsector_t;
2666 /* 2666 /*
@@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void)
2677 (DP->read_track & (1 << DRS->probed_format)))))) { 2677 (DP->read_track & (1 << DRS->probed_format)))))) {
2678 max_size = blk_rq_sectors(current_req); 2678 max_size = blk_rq_sectors(current_req);
2679 } else { 2679 } else {
2680 raw_cmd->kernel_data = current_req->buffer; 2680 raw_cmd->kernel_data = bio_data(current_req->bio);
2681 raw_cmd->length = current_count_sectors << 9; 2681 raw_cmd->length = current_count_sectors << 9;
2682 if (raw_cmd->length == 0) { 2682 if (raw_cmd->length == 0) {
2683 DPRINT("%s: zero dma transfer attempted\n", __func__); 2683 DPRINT("%s: zero dma transfer attempted\n", __func__);
@@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void)
2731 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; 2731 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
2732 raw_cmd->length <<= 9; 2732 raw_cmd->length <<= 9;
2733 if ((raw_cmd->length < current_count_sectors << 9) || 2733 if ((raw_cmd->length < current_count_sectors << 9) ||
2734 (raw_cmd->kernel_data != current_req->buffer && 2734 (raw_cmd->kernel_data != bio_data(current_req->bio) &&
2735 CT(COMMAND) == FD_WRITE && 2735 CT(COMMAND) == FD_WRITE &&
2736 (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || 2736 (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
2737 aligned_sector_t < buffer_min)) || 2737 aligned_sector_t < buffer_min)) ||
@@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void)
2739 raw_cmd->length <= 0 || current_count_sectors <= 0) { 2739 raw_cmd->length <= 0 || current_count_sectors <= 0) {
2740 DPRINT("fractionary current count b=%lx s=%lx\n", 2740 DPRINT("fractionary current count b=%lx s=%lx\n",
2741 raw_cmd->length, current_count_sectors); 2741 raw_cmd->length, current_count_sectors);
2742 if (raw_cmd->kernel_data != current_req->buffer) 2742 if (raw_cmd->kernel_data != bio_data(current_req->bio))
2743 pr_info("addr=%d, length=%ld\n", 2743 pr_info("addr=%d, length=%ld\n",
2744 (int)((raw_cmd->kernel_data - 2744 (int)((raw_cmd->kernel_data -
2745 floppy_track_buffer) >> 9), 2745 floppy_track_buffer) >> 9),
@@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void)
2756 return 0; 2756 return 0;
2757 } 2757 }
2758 2758
2759 if (raw_cmd->kernel_data != current_req->buffer) { 2759 if (raw_cmd->kernel_data != bio_data(current_req->bio)) {
2760 if (raw_cmd->kernel_data < floppy_track_buffer || 2760 if (raw_cmd->kernel_data < floppy_track_buffer ||
2761 current_count_sectors < 0 || 2761 current_count_sectors < 0 ||
2762 raw_cmd->length < 0 || 2762 raw_cmd->length < 0 ||
@@ -3809,7 +3809,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
3809 bio.bi_iter.bi_size = size; 3809 bio.bi_iter.bi_size = size;
3810 bio.bi_bdev = bdev; 3810 bio.bi_bdev = bdev;
3811 bio.bi_iter.bi_sector = 0; 3811 bio.bi_iter.bi_sector = 0;
3812 bio.bi_flags = (1 << BIO_QUIET); 3812 bio.bi_flags |= (1 << BIO_QUIET);
3813 bio.bi_private = &cbdata; 3813 bio.bi_private = &cbdata;
3814 bio.bi_end_io = floppy_rb0_cb; 3814 bio.bi_end_io = floppy_rb0_cb;
3815 3815
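
The floppy.c changes above (and the hd.c and mg_disk.c hunks that follow) are the same mechanical conversion: struct request no longer exposes a ->buffer pointer, so legacy request-based drivers fetch the data address of the first bio instead. For reference, bio_data() in this kernel generation is roughly:

/* Existing helper from include/linux/bio.h (sketched here, not new code):
 * kernel-virtual address of the first segment of the bio. */
static inline void *bio_data(struct bio *bio)
{
	if (bio->bi_iter.bi_size)
		return page_address(bio_page(bio)) + bio_offset(bio);
	return NULL;
}

For the single-segment requests these drivers build, this yields the same pointer the removed req->buffer field used to carry.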
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index bf397bf108b7..8a290c08262f 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -464,11 +464,11 @@ static void read_intr(void)
464 464
465ok_to_read: 465ok_to_read:
466 req = hd_req; 466 req = hd_req;
467 insw(HD_DATA, req->buffer, 256); 467 insw(HD_DATA, bio_data(req->bio), 256);
468#ifdef DEBUG 468#ifdef DEBUG
469 printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", 469 printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
470 req->rq_disk->disk_name, blk_rq_pos(req) + 1, 470 req->rq_disk->disk_name, blk_rq_pos(req) + 1,
471 blk_rq_sectors(req) - 1, req->buffer+512); 471 blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
472#endif 472#endif
473 if (hd_end_request(0, 512)) { 473 if (hd_end_request(0, 512)) {
474 SET_HANDLER(&read_intr); 474 SET_HANDLER(&read_intr);
@@ -505,7 +505,7 @@ static void write_intr(void)
505ok_to_write: 505ok_to_write:
506 if (hd_end_request(0, 512)) { 506 if (hd_end_request(0, 512)) {
507 SET_HANDLER(&write_intr); 507 SET_HANDLER(&write_intr);
508 outsw(HD_DATA, req->buffer, 256); 508 outsw(HD_DATA, bio_data(req->bio), 256);
509 return; 509 return;
510 } 510 }
511 511
@@ -624,7 +624,7 @@ repeat:
624 printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", 624 printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
625 req->rq_disk->disk_name, 625 req->rq_disk->disk_name,
626 req_data_dir(req) == READ ? "read" : "writ", 626 req_data_dir(req) == READ ? "read" : "writ",
627 cyl, head, sec, nsect, req->buffer); 627 cyl, head, sec, nsect, bio_data(req->bio));
628#endif 628#endif
629 if (req->cmd_type == REQ_TYPE_FS) { 629 if (req->cmd_type == REQ_TYPE_FS) {
630 switch (rq_data_dir(req)) { 630 switch (rq_data_dir(req)) {
@@ -643,7 +643,7 @@ repeat:
643 bad_rw_intr(); 643 bad_rw_intr();
644 goto repeat; 644 goto repeat;
645 } 645 }
646 outsw(HD_DATA, req->buffer, 256); 646 outsw(HD_DATA, bio_data(req->bio), 256);
647 break; 647 break;
648 default: 648 default:
649 printk("unknown hd-command\n"); 649 printk("unknown hd-command\n");
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index eb59b1241366..e352cac707e8 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host,
479 479
480static void mg_read_one(struct mg_host *host, struct request *req) 480static void mg_read_one(struct mg_host *host, struct request *req)
481{ 481{
482 u16 *buff = (u16 *)req->buffer; 482 u16 *buff = (u16 *)bio_data(req->bio);
483 u32 i; 483 u32 i;
484 484
485 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) 485 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -496,7 +496,7 @@ static void mg_read(struct request *req)
496 mg_bad_rw_intr(host); 496 mg_bad_rw_intr(host);
497 497
498 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", 498 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
499 blk_rq_sectors(req), blk_rq_pos(req), req->buffer); 499 blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
500 500
501 do { 501 do {
502 if (mg_wait(host, ATA_DRQ, 502 if (mg_wait(host, ATA_DRQ,
@@ -514,7 +514,7 @@ static void mg_read(struct request *req)
514 514
515static void mg_write_one(struct mg_host *host, struct request *req) 515static void mg_write_one(struct mg_host *host, struct request *req)
516{ 516{
517 u16 *buff = (u16 *)req->buffer; 517 u16 *buff = (u16 *)bio_data(req->bio);
518 u32 i; 518 u32 i;
519 519
520 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) 520 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -534,7 +534,7 @@ static void mg_write(struct request *req)
534 } 534 }
535 535
536 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", 536 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
537 rem, blk_rq_pos(req), req->buffer); 537 rem, blk_rq_pos(req), bio_data(req->bio));
538 538
539 if (mg_wait(host, ATA_DRQ, 539 if (mg_wait(host, ATA_DRQ,
540 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { 540 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
@@ -585,7 +585,7 @@ ok_to_read:
585 mg_read_one(host, req); 585 mg_read_one(host, req);
586 586
587 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 587 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
588 blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); 588 blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
589 589
590 /* send read confirm */ 590 /* send read confirm */
591 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); 591 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@@ -624,7 +624,7 @@ ok_to_write:
624 /* write 1 sector and set handler if remains */ 624 /* write 1 sector and set handler if remains */
625 mg_write_one(host, req); 625 mg_write_one(host, req);
626 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 626 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
627 blk_rq_pos(req), blk_rq_sectors(req), req->buffer); 627 blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
628 host->mg_do_intr = mg_write_intr; 628 host->mg_do_intr = mg_write_intr;
629 mod_timer(&host->timer, jiffies + 3 * HZ); 629 mod_timer(&host->timer, jiffies + 3 * HZ);
630 } 630 }
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 59c5abe32f06..74abd49fabdc 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -31,6 +31,7 @@
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/genhd.h> 32#include <linux/genhd.h>
33#include <linux/blkdev.h> 33#include <linux/blkdev.h>
34#include <linux/blk-mq.h>
34#include <linux/bio.h> 35#include <linux/bio.h>
35#include <linux/dma-mapping.h> 36#include <linux/dma-mapping.h>
36#include <linux/idr.h> 37#include <linux/idr.h>
@@ -173,60 +174,36 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
173 return false; /* device present */ 174 return false; /* device present */
174} 175}
175 176
176/* 177static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
177 * Obtain an empty command slot.
178 *
179 * This function needs to be reentrant since it could be called
180 * at the same time on multiple CPUs. The allocation of the
181 * command slot must be atomic.
182 *
183 * @port Pointer to the port data structure.
184 *
185 * return value
186 * >= 0 Index of command slot obtained.
187 * -1 No command slots available.
188 */
189static int get_slot(struct mtip_port *port)
190{ 178{
191 int slot, i; 179 struct request *rq;
192 unsigned int num_command_slots = port->dd->slot_groups * 32;
193 180
194 /* 181 rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true);
195 * Try 10 times, because there is a small race here. 182 return blk_mq_rq_to_pdu(rq);
196 * that's ok, because it's still cheaper than a lock. 183}
197 *
198 * Race: Since this section is not protected by lock, same bit
199 * could be chosen by different process contexts running in
200 * different processor. So instead of costly lock, we are going
201 * with loop.
202 */
203 for (i = 0; i < 10; i++) {
204 slot = find_next_zero_bit(port->allocated,
205 num_command_slots, 1);
206 if ((slot < num_command_slots) &&
207 (!test_and_set_bit(slot, port->allocated)))
208 return slot;
209 }
210 dev_warn(&port->dd->pdev->dev, "Failed to get a tag.\n");
211 184
212 mtip_check_surprise_removal(port->dd->pdev); 185static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd)
213 return -1; 186{
187 blk_put_request(blk_mq_rq_from_pdu(cmd));
214} 188}
215 189
216/* 190/*
217 * Release a command slot. 191 * Once we add support for one hctx per mtip group, this will change a bit
218 *
219 * @port Pointer to the port data structure.
220 * @tag Tag of command to release
221 *
222 * return value
223 * None
224 */ 192 */
225static inline void release_slot(struct mtip_port *port, int tag) 193static struct request *mtip_rq_from_tag(struct driver_data *dd,
194 unsigned int tag)
195{
196 struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0];
197
198 return blk_mq_tag_to_rq(hctx->tags, tag);
199}
200
201static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
202 unsigned int tag)
226{ 203{
227 smp_mb__before_clear_bit(); 204 struct request *rq = mtip_rq_from_tag(dd, tag);
228 clear_bit(tag, port->allocated); 205
229 smp_mb__after_clear_bit(); 206 return blk_mq_rq_to_pdu(rq);
230} 207}
231 208
232/* 209/*
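
The slot bitmap (get_slot()/release_slot()) is gone: the internal command is now an ordinary blk-mq request allocated from the reserved tag pool, with struct mtip_cmd living in the request's driver payload. The round trip, using only the calls shown in the hunk above:

	struct request *rq;
	struct mtip_cmd *cmd;

	/* 3.16-era signature: blk_mq_alloc_request(q, rw, gfp, reserved) */
	rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true);
	cmd = blk_mq_rq_to_pdu(rq);			/* per-request PDU */

	/* ... build the FIS in cmd->command, issue, wait ... */

	blk_put_request(blk_mq_rq_from_pdu(cmd));	/* return the reserved tag */

Using a reserved tag means the internal command can no longer race with normal I/O for a free slot, which is what the old ten-try find_next_zero_bit() loop was working around.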
@@ -248,93 +225,28 @@ static inline void release_slot(struct mtip_port *port, int tag)
248 * None 225 * None
249 */ 226 */
250static void mtip_async_complete(struct mtip_port *port, 227static void mtip_async_complete(struct mtip_port *port,
251 int tag, 228 int tag, struct mtip_cmd *cmd, int status)
252 void *data,
253 int status)
254{ 229{
255 struct mtip_cmd *cmd; 230 struct driver_data *dd = port->dd;
256 struct driver_data *dd = data; 231 struct request *rq;
257 int unaligned, cb_status = status ? -EIO : 0;
258 void (*func)(void *, int);
259 232
260 if (unlikely(!dd) || unlikely(!port)) 233 if (unlikely(!dd) || unlikely(!port))
261 return; 234 return;
262 235
263 cmd = &port->commands[tag];
264
265 if (unlikely(status == PORT_IRQ_TF_ERR)) { 236 if (unlikely(status == PORT_IRQ_TF_ERR)) {
266 dev_warn(&port->dd->pdev->dev, 237 dev_warn(&port->dd->pdev->dev,
267 "Command tag %d failed due to TFE\n", tag); 238 "Command tag %d failed due to TFE\n", tag);
268 } 239 }
269 240
270 /* Clear the active flag */ 241 /* Unmap the DMA scatter list entries */
271 atomic_set(&port->commands[tag].active, 0); 242 dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction);
272
273 /* Upper layer callback */
274 func = cmd->async_callback;
275 if (likely(func && cmpxchg(&cmd->async_callback, func, 0) == func)) {
276 243
277 /* Unmap the DMA scatter list entries */ 244 rq = mtip_rq_from_tag(dd, tag);
278 dma_unmap_sg(&dd->pdev->dev,
279 cmd->sg,
280 cmd->scatter_ents,
281 cmd->direction);
282 245
283 func(cmd->async_data, cb_status); 246 if (unlikely(cmd->unaligned))
284 unaligned = cmd->unaligned; 247 up(&port->cmd_slot_unal);
285 248
286 /* Clear the allocated bit for the command */ 249 blk_mq_end_io(rq, status ? -EIO : 0);
287 release_slot(port, tag);
288
289 if (unlikely(unaligned))
290 up(&port->cmd_slot_unal);
291 else
292 up(&port->cmd_slot);
293 }
294}
295
296/*
297 * This function is called for clean the pending command in the
298 * command slot during the surprise removal of device and return
299 * error to the upper layer.
300 *
301 * @dd Pointer to the DRIVER_DATA structure.
302 *
303 * return value
304 * None
305 */
306static void mtip_command_cleanup(struct driver_data *dd)
307{
308 int tag = 0;
309 struct mtip_cmd *cmd;
310 struct mtip_port *port = dd->port;
311 unsigned int num_cmd_slots = dd->slot_groups * 32;
312
313 if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
314 return;
315
316 if (!port)
317 return;
318
319 cmd = &port->commands[MTIP_TAG_INTERNAL];
320 if (atomic_read(&cmd->active))
321 if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) &
322 (1 << MTIP_TAG_INTERNAL))
323 if (cmd->comp_func)
324 cmd->comp_func(port, MTIP_TAG_INTERNAL,
325 cmd->comp_data, -ENODEV);
326
327 while (1) {
328 tag = find_next_bit(port->allocated, num_cmd_slots, tag);
329 if (tag >= num_cmd_slots)
330 break;
331
332 cmd = &port->commands[tag];
333 if (atomic_read(&cmd->active))
334 mtip_async_complete(port, tag, dd, -ENODEV);
335 }
336
337 set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
338} 250}
339 251
340/* 252/*
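
mtip_async_complete() now ends the request through blk-mq instead of invoking a stored callback and juggling the per-port slot bitmap and semaphores. The essential completion path after this hunk is:

	dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction);

	rq = mtip_rq_from_tag(dd, tag);		/* blk_mq_tag_to_rq() underneath */

	if (unlikely(cmd->unaligned))
		up(&port->cmd_slot_unal);	/* only the unaligned semaphore remains */

	blk_mq_end_io(rq, status ? -EIO : 0);	/* this kernel's call for ending a blk-mq request */

mtip_command_cleanup() goes away with it, since blk-mq now tracks the outstanding requests that the hand-rolled surprise-removal sweep used to walk.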
@@ -388,8 +300,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
388{ 300{
389 int group = tag >> 5; 301 int group = tag >> 5;
390 302
391 atomic_set(&port->commands[tag].active, 1);
392
393 /* guard SACT and CI registers */ 303 /* guard SACT and CI registers */
394 spin_lock(&port->cmd_issue_lock[group]); 304 spin_lock(&port->cmd_issue_lock[group]);
395 writel((1 << MTIP_TAG_BIT(tag)), 305 writel((1 << MTIP_TAG_BIT(tag)),
@@ -397,10 +307,6 @@ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
397 writel((1 << MTIP_TAG_BIT(tag)), 307 writel((1 << MTIP_TAG_BIT(tag)),
398 port->cmd_issue[MTIP_TAG_INDEX(tag)]); 308 port->cmd_issue[MTIP_TAG_INDEX(tag)]);
399 spin_unlock(&port->cmd_issue_lock[group]); 309 spin_unlock(&port->cmd_issue_lock[group]);
400
401 /* Set the command's timeout value.*/
402 port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
403 MTIP_NCQ_COMMAND_TIMEOUT_MS);
404} 310}
405 311
406/* 312/*
@@ -648,132 +554,13 @@ static void print_tags(struct driver_data *dd,
648 554
649 memset(tagmap, 0, sizeof(tagmap)); 555 memset(tagmap, 0, sizeof(tagmap));
650 for (group = SLOTBITS_IN_LONGS; group > 0; group--) 556 for (group = SLOTBITS_IN_LONGS; group > 0; group--)
651 tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ", 557 tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ",
652 tagbits[group-1]); 558 tagbits[group-1]);
653 dev_warn(&dd->pdev->dev, 559 dev_warn(&dd->pdev->dev,
654 "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); 560 "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
655} 561}
656 562
657/* 563/*
658 * Called periodically to see if any read/write commands are
659 * taking too long to complete.
660 *
661 * @data Pointer to the PORT data structure.
662 *
663 * return value
664 * None
665 */
666static void mtip_timeout_function(unsigned long int data)
667{
668 struct mtip_port *port = (struct mtip_port *) data;
669 struct host_to_dev_fis *fis;
670 struct mtip_cmd *cmd;
671 int unaligned, tag, cmdto_cnt = 0;
672 unsigned int bit, group;
673 unsigned int num_command_slots;
674 unsigned long to, tagaccum[SLOTBITS_IN_LONGS];
675 void (*func)(void *, int);
676
677 if (unlikely(!port))
678 return;
679
680 if (unlikely(port->dd->sr))
681 return;
682
683 if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
684 mod_timer(&port->cmd_timer,
685 jiffies + msecs_to_jiffies(30000));
686 return;
687 }
688 /* clear the tag accumulator */
689 memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
690 num_command_slots = port->dd->slot_groups * 32;
691
692 for (tag = 0; tag < num_command_slots; tag++) {
693 /*
694 * Skip internal command slot as it has
695 * its own timeout mechanism
696 */
697 if (tag == MTIP_TAG_INTERNAL)
698 continue;
699
700 if (atomic_read(&port->commands[tag].active) &&
701 (time_after(jiffies, port->commands[tag].comp_time))) {
702 group = tag >> 5;
703 bit = tag & 0x1F;
704
705 cmd = &port->commands[tag];
706 fis = (struct host_to_dev_fis *) cmd->command;
707
708 set_bit(tag, tagaccum);
709 cmdto_cnt++;
710 if (cmdto_cnt == 1)
711 set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
712
713 /*
714 * Clear the completed bit. This should prevent
715 * any interrupt handlers from trying to retire
716 * the command.
717 */
718 writel(1 << bit, port->completed[group]);
719
720 /* Clear the active flag for the command */
721 atomic_set(&port->commands[tag].active, 0);
722
723 func = cmd->async_callback;
724 if (func &&
725 cmpxchg(&cmd->async_callback, func, 0) == func) {
726
727 /* Unmap the DMA scatter list entries */
728 dma_unmap_sg(&port->dd->pdev->dev,
729 cmd->sg,
730 cmd->scatter_ents,
731 cmd->direction);
732
733 func(cmd->async_data, -EIO);
734 unaligned = cmd->unaligned;
735
736 /* Clear the allocated bit for the command. */
737 release_slot(port, tag);
738
739 if (unaligned)
740 up(&port->cmd_slot_unal);
741 else
742 up(&port->cmd_slot);
743 }
744 }
745 }
746
747 if (cmdto_cnt) {
748 print_tags(port->dd, "timed out", tagaccum, cmdto_cnt);
749 if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
750 mtip_device_reset(port->dd);
751 wake_up_interruptible(&port->svc_wait);
752 }
753 clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
754 }
755
756 if (port->ic_pause_timer) {
757 to = port->ic_pause_timer + msecs_to_jiffies(1000);
758 if (time_after(jiffies, to)) {
759 if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
760 port->ic_pause_timer = 0;
761 clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
762 clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
763 clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
764 wake_up_interruptible(&port->svc_wait);
765 }
766
767
768 }
769 }
770
771 /* Restart the timer */
772 mod_timer(&port->cmd_timer,
773 jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
774}
775
776/*
777 * Internal command completion callback function. 564 * Internal command completion callback function.
778 * 565 *
779 * This function is normally called by the driver ISR when an internal 566 * This function is normally called by the driver ISR when an internal
@@ -789,28 +576,19 @@ static void mtip_timeout_function(unsigned long int data)
789 * None 576 * None
790 */ 577 */
791static void mtip_completion(struct mtip_port *port, 578static void mtip_completion(struct mtip_port *port,
792 int tag, 579 int tag, struct mtip_cmd *command, int status)
793 void *data,
794 int status)
795{ 580{
796 struct mtip_cmd *command = &port->commands[tag]; 581 struct completion *waiting = command->comp_data;
797 struct completion *waiting = data;
798 if (unlikely(status == PORT_IRQ_TF_ERR)) 582 if (unlikely(status == PORT_IRQ_TF_ERR))
799 dev_warn(&port->dd->pdev->dev, 583 dev_warn(&port->dd->pdev->dev,
800 "Internal command %d completed with TFE\n", tag); 584 "Internal command %d completed with TFE\n", tag);
801 585
802 command->async_callback = NULL;
803 command->comp_func = NULL;
804
805 complete(waiting); 586 complete(waiting);
806} 587}
807 588
808static void mtip_null_completion(struct mtip_port *port, 589static void mtip_null_completion(struct mtip_port *port,
809 int tag, 590 int tag, struct mtip_cmd *command, int status)
810 void *data,
811 int status)
812{ 591{
813 return;
814} 592}
815 593
816static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, 594static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
@@ -842,19 +620,16 @@ static void mtip_handle_tfe(struct driver_data *dd)
842 620
843 port = dd->port; 621 port = dd->port;
844 622
845 /* Stop the timer to prevent command timeouts. */
846 del_timer(&port->cmd_timer);
847 set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); 623 set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
848 624
849 if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && 625 if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
850 test_bit(MTIP_TAG_INTERNAL, port->allocated)) { 626 test_bit(MTIP_TAG_INTERNAL, port->allocated)) {
851 cmd = &port->commands[MTIP_TAG_INTERNAL]; 627 cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
852 dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); 628 dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
853 629
854 atomic_inc(&cmd->active); /* active > 1 indicates error */
855 if (cmd->comp_data && cmd->comp_func) { 630 if (cmd->comp_data && cmd->comp_func) {
856 cmd->comp_func(port, MTIP_TAG_INTERNAL, 631 cmd->comp_func(port, MTIP_TAG_INTERNAL,
857 cmd->comp_data, PORT_IRQ_TF_ERR); 632 cmd, PORT_IRQ_TF_ERR);
858 } 633 }
859 goto handle_tfe_exit; 634 goto handle_tfe_exit;
860 } 635 }
@@ -866,6 +641,8 @@ static void mtip_handle_tfe(struct driver_data *dd)
866 for (group = 0; group < dd->slot_groups; group++) { 641 for (group = 0; group < dd->slot_groups; group++) {
867 completed = readl(port->completed[group]); 642 completed = readl(port->completed[group]);
868 643
644 dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed);
645
869 /* clear completed status register in the hardware.*/ 646 /* clear completed status register in the hardware.*/
870 writel(completed, port->completed[group]); 647 writel(completed, port->completed[group]);
871 648
@@ -879,15 +656,11 @@ static void mtip_handle_tfe(struct driver_data *dd)
879 if (tag == MTIP_TAG_INTERNAL) 656 if (tag == MTIP_TAG_INTERNAL)
880 continue; 657 continue;
881 658
882 cmd = &port->commands[tag]; 659 cmd = mtip_cmd_from_tag(dd, tag);
883 if (likely(cmd->comp_func)) { 660 if (likely(cmd->comp_func)) {
884 set_bit(tag, tagaccum); 661 set_bit(tag, tagaccum);
885 cmd_cnt++; 662 cmd_cnt++;
886 atomic_set(&cmd->active, 0); 663 cmd->comp_func(port, tag, cmd, 0);
887 cmd->comp_func(port,
888 tag,
889 cmd->comp_data,
890 0);
891 } else { 664 } else {
892 dev_err(&port->dd->pdev->dev, 665 dev_err(&port->dd->pdev->dev,
893 "Missing completion func for tag %d", 666 "Missing completion func for tag %d",
@@ -947,11 +720,7 @@ static void mtip_handle_tfe(struct driver_data *dd)
947 for (bit = 0; bit < 32; bit++) { 720 for (bit = 0; bit < 32; bit++) {
948 reissue = 1; 721 reissue = 1;
949 tag = (group << 5) + bit; 722 tag = (group << 5) + bit;
950 cmd = &port->commands[tag]; 723 cmd = mtip_cmd_from_tag(dd, tag);
951
952 /* If the active bit is set re-issue the command */
953 if (atomic_read(&cmd->active) == 0)
954 continue;
955 724
956 fis = (struct host_to_dev_fis *)cmd->command; 725 fis = (struct host_to_dev_fis *)cmd->command;
957 726
@@ -970,11 +739,9 @@ static void mtip_handle_tfe(struct driver_data *dd)
970 tag, 739 tag,
971 fail_reason != NULL ? 740 fail_reason != NULL ?
972 fail_reason : "unknown"); 741 fail_reason : "unknown");
973 atomic_set(&cmd->active, 0);
974 if (cmd->comp_func) { 742 if (cmd->comp_func) {
975 cmd->comp_func(port, tag, 743 cmd->comp_func(port, tag,
976 cmd->comp_data, 744 cmd, -ENODATA);
977 -ENODATA);
978 } 745 }
979 continue; 746 continue;
980 } 747 }
@@ -997,14 +764,9 @@ static void mtip_handle_tfe(struct driver_data *dd)
997 /* Retire a command that will not be reissued */ 764 /* Retire a command that will not be reissued */
998 dev_warn(&port->dd->pdev->dev, 765 dev_warn(&port->dd->pdev->dev,
999 "retiring tag %d\n", tag); 766 "retiring tag %d\n", tag);
1000 atomic_set(&cmd->active, 0);
1001 767
1002 if (cmd->comp_func) 768 if (cmd->comp_func)
1003 cmd->comp_func( 769 cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR);
1004 port,
1005 tag,
1006 cmd->comp_data,
1007 PORT_IRQ_TF_ERR);
1008 else 770 else
1009 dev_warn(&port->dd->pdev->dev, 771 dev_warn(&port->dd->pdev->dev,
1010 "Bad completion for tag %d\n", 772 "Bad completion for tag %d\n",
@@ -1017,9 +779,6 @@ handle_tfe_exit:
1017 /* clear eh_active */ 779 /* clear eh_active */
1018 clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); 780 clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
1019 wake_up_interruptible(&port->svc_wait); 781 wake_up_interruptible(&port->svc_wait);
1020
1021 mod_timer(&port->cmd_timer,
1022 jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
1023} 782}
1024 783
1025/* 784/*
@@ -1048,15 +807,10 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
1048 if (unlikely(tag == MTIP_TAG_INTERNAL)) 807 if (unlikely(tag == MTIP_TAG_INTERNAL))
1049 continue; 808 continue;
1050 809
1051 command = &port->commands[tag]; 810 command = mtip_cmd_from_tag(dd, tag);
1052 /* make internal callback */ 811 if (likely(command->comp_func))
1053 if (likely(command->comp_func)) { 812 command->comp_func(port, tag, command, 0);
1054 command->comp_func( 813 else {
1055 port,
1056 tag,
1057 command->comp_data,
1058 0);
1059 } else {
1060 dev_dbg(&dd->pdev->dev, 814 dev_dbg(&dd->pdev->dev,
1061 "Null completion for tag %d", 815 "Null completion for tag %d",
1062 tag); 816 tag);
@@ -1081,16 +835,13 @@ static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
1081static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) 835static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
1082{ 836{
1083 struct mtip_port *port = dd->port; 837 struct mtip_port *port = dd->port;
1084 struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; 838 struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
1085 839
1086 if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && 840 if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
1087 (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) 841 (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
1088 & (1 << MTIP_TAG_INTERNAL))) { 842 & (1 << MTIP_TAG_INTERNAL))) {
1089 if (cmd->comp_func) { 843 if (cmd->comp_func) {
1090 cmd->comp_func(port, 844 cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0);
1091 MTIP_TAG_INTERNAL,
1092 cmd->comp_data,
1093 0);
1094 return; 845 return;
1095 } 846 }
1096 } 847 }
@@ -1103,8 +854,6 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
1103 */ 854 */
1104static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) 855static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
1105{ 856{
1106 if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR)))
1107 mtip_handle_tfe(dd);
1108 857
1109 if (unlikely(port_stat & PORT_IRQ_CONNECT)) { 858 if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
1110 dev_warn(&dd->pdev->dev, 859 dev_warn(&dd->pdev->dev,
@@ -1122,6 +871,12 @@ static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
1122 dev_warn(&dd->pdev->dev, 871 dev_warn(&dd->pdev->dev,
1123 "Port stat errors %x unhandled\n", 872 "Port stat errors %x unhandled\n",
1124 (port_stat & ~PORT_IRQ_HANDLED)); 873 (port_stat & ~PORT_IRQ_HANDLED));
874 if (mtip_check_surprise_removal(dd->pdev))
875 return;
876 }
877 if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) {
878 set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags);
879 wake_up_interruptible(&dd->port->svc_wait);
1125 } 880 }
1126} 881}
1127 882
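
mtip_process_errors() no longer runs the taskfile-error handler from interrupt context; it only records the condition and wakes the service thread, which performs the recovery from process context (see the mtip_service_thread() hunks later in this file). Schematically:

	/* IRQ path: note the error, defer the heavy lifting */
	if (port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR)) {
		set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags);
		wake_up_interruptible(&dd->port->svc_wait);
	}

	/* service thread (sketch of the restart_eh loop): handle, clear, re-check */
	while (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) {
		mtip_handle_tfe(dd);
		clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
	}

This is consistent with the removal of the driver's own command timer (mtip_timeout_function()) earlier in this file.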
@@ -1222,7 +977,6 @@ static irqreturn_t mtip_irq_handler(int irq, void *instance)
1222 977
1223static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) 978static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
1224{ 979{
1225 atomic_set(&port->commands[tag].active, 1);
1226 writel(1 << MTIP_TAG_BIT(tag), 980 writel(1 << MTIP_TAG_BIT(tag),
1227 port->cmd_issue[MTIP_TAG_INDEX(tag)]); 981 port->cmd_issue[MTIP_TAG_INDEX(tag)]);
1228} 982}
@@ -1280,6 +1034,8 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
1280 unsigned int n; 1034 unsigned int n;
1281 unsigned int active = 1; 1035 unsigned int active = 1;
1282 1036
1037 blk_mq_stop_hw_queues(port->dd->queue);
1038
1283 to = jiffies + msecs_to_jiffies(timeout); 1039 to = jiffies + msecs_to_jiffies(timeout);
1284 do { 1040 do {
1285 if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && 1041 if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
@@ -1287,8 +1043,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
1287 msleep(20); 1043 msleep(20);
1288 continue; /* svc thd is actively issuing commands */ 1044 continue; /* svc thd is actively issuing commands */
1289 } 1045 }
1046
1047 msleep(100);
1048 if (mtip_check_surprise_removal(port->dd->pdev))
1049 goto err_fault;
1290 if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) 1050 if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
1291 return -EFAULT; 1051 goto err_fault;
1052
1292 /* 1053 /*
1293 * Ignore s_active bit 0 of array element 0. 1054 * Ignore s_active bit 0 of array element 0.
1294 * This bit will always be set 1055 * This bit will always be set
@@ -1299,11 +1060,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
1299 1060
1300 if (!active) 1061 if (!active)
1301 break; 1062 break;
1302
1303 msleep(20);
1304 } while (time_before(jiffies, to)); 1063 } while (time_before(jiffies, to));
1305 1064
1065 blk_mq_start_stopped_hw_queues(port->dd->queue, true);
1306 return active ? -EBUSY : 0; 1066 return active ? -EBUSY : 0;
1067err_fault:
1068 blk_mq_start_stopped_hw_queues(port->dd->queue, true);
1069 return -EFAULT;
1307} 1070}
1308 1071
1309/* 1072/*
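
mtip_quiesce_io() now brackets its polling with blk-mq queue control so no new requests are dispatched while it waits for the active tags to drain, and both the normal and the surprise-removal exits restart the queues. In outline (error handling trimmed; port_has_active_commands() stands in for the inline s_active[] scan and is not a real helper):

	blk_mq_stop_hw_queues(dd->queue);		/* no further ->queue_rq() calls */

	to = jiffies + msecs_to_jiffies(timeout);
	do {
		msleep(100);
		if (mtip_check_surprise_removal(dd->pdev))
			break;					/* device is gone */
		active = port_has_active_commands(port);	/* assumed helper */
	} while (active && time_before(jiffies, to));

	blk_mq_start_stopped_hw_queues(dd->queue, true);	/* resume dispatch */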
@@ -1335,10 +1098,9 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1335{ 1098{
1336 struct mtip_cmd_sg *command_sg; 1099 struct mtip_cmd_sg *command_sg;
1337 DECLARE_COMPLETION_ONSTACK(wait); 1100 DECLARE_COMPLETION_ONSTACK(wait);
1338 int rv = 0, ready2go = 1; 1101 struct mtip_cmd *int_cmd;
1339 struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL];
1340 unsigned long to;
1341 struct driver_data *dd = port->dd; 1102 struct driver_data *dd = port->dd;
1103 int rv = 0;
1342 1104
1343 /* Make sure the buffer is 8 byte aligned. This is asic specific. */ 1105 /* Make sure the buffer is 8 byte aligned. This is asic specific. */
1344 if (buffer & 0x00000007) { 1106 if (buffer & 0x00000007) {
@@ -1346,19 +1108,8 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1346 return -EFAULT; 1108 return -EFAULT;
1347 } 1109 }
1348 1110
1349 to = jiffies + msecs_to_jiffies(timeout); 1111 int_cmd = mtip_get_int_command(dd);
1350 do { 1112
1351 ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL,
1352 port->allocated);
1353 if (ready2go)
1354 break;
1355 mdelay(100);
1356 } while (time_before(jiffies, to));
1357 if (!ready2go) {
1358 dev_warn(&dd->pdev->dev,
1359 "Internal cmd active. new cmd [%02X]\n", fis->command);
1360 return -EBUSY;
1361 }
1362 set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); 1113 set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
1363 port->ic_pause_timer = 0; 1114 port->ic_pause_timer = 0;
1364 1115
@@ -1368,10 +1119,11 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1368 if (atomic == GFP_KERNEL) { 1119 if (atomic == GFP_KERNEL) {
1369 if (fis->command != ATA_CMD_STANDBYNOW1) { 1120 if (fis->command != ATA_CMD_STANDBYNOW1) {
1370 /* wait for io to complete if non atomic */ 1121 /* wait for io to complete if non atomic */
1371 if (mtip_quiesce_io(port, 5000) < 0) { 1122 if (mtip_quiesce_io(port,
1123 MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) {
1372 dev_warn(&dd->pdev->dev, 1124 dev_warn(&dd->pdev->dev,
1373 "Failed to quiesce IO\n"); 1125 "Failed to quiesce IO\n");
1374 release_slot(port, MTIP_TAG_INTERNAL); 1126 mtip_put_int_command(dd, int_cmd);
1375 clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); 1127 clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
1376 wake_up_interruptible(&port->svc_wait); 1128 wake_up_interruptible(&port->svc_wait);
1377 return -EBUSY; 1129 return -EBUSY;
@@ -1416,9 +1168,9 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1416 1168
1417 if (atomic == GFP_KERNEL) { 1169 if (atomic == GFP_KERNEL) {
1418 /* Wait for the command to complete or timeout. */ 1170 /* Wait for the command to complete or timeout. */
1419 if (wait_for_completion_interruptible_timeout( 1171 if ((rv = wait_for_completion_interruptible_timeout(
1420 &wait, 1172 &wait,
1421 msecs_to_jiffies(timeout)) <= 0) { 1173 msecs_to_jiffies(timeout))) <= 0) {
1422 if (rv == -ERESTARTSYS) { /* interrupted */ 1174 if (rv == -ERESTARTSYS) { /* interrupted */
1423 dev_err(&dd->pdev->dev, 1175 dev_err(&dd->pdev->dev,
1424 "Internal command [%02X] was interrupted after %lu ms\n", 1176 "Internal command [%02X] was interrupted after %lu ms\n",
@@ -1497,8 +1249,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
1497 } 1249 }
1498exec_ic_exit: 1250exec_ic_exit:
1499 /* Clear the allocated and active bits for the internal command. */ 1251 /* Clear the allocated and active bits for the internal command. */
1500 atomic_set(&int_cmd->active, 0); 1252 mtip_put_int_command(dd, int_cmd);
1501 release_slot(port, MTIP_TAG_INTERNAL);
1502 if (rv >= 0 && mtip_pause_ncq(port, fis)) { 1253 if (rv >= 0 && mtip_pause_ncq(port, fis)) {
1503 /* NCQ paused */ 1254 /* NCQ paused */
1504 return rv; 1255 return rv;
@@ -1529,6 +1280,37 @@ static inline void ata_swap_string(u16 *buf, unsigned int len)
1529 be16_to_cpus(&buf[i]); 1280 be16_to_cpus(&buf[i]);
1530} 1281}
1531 1282
1283static void mtip_set_timeout(struct driver_data *dd,
1284 struct host_to_dev_fis *fis,
1285 unsigned int *timeout, u8 erasemode)
1286{
1287 switch (fis->command) {
1288 case ATA_CMD_DOWNLOAD_MICRO:
1289 *timeout = 120000; /* 2 minutes */
1290 break;
1291 case ATA_CMD_SEC_ERASE_UNIT:
1292 case 0xFC:
1293 if (erasemode)
1294 *timeout = ((*(dd->port->identify + 90) * 2) * 60000);
1295 else
1296 *timeout = ((*(dd->port->identify + 89) * 2) * 60000);
1297 break;
1298 case ATA_CMD_STANDBYNOW1:
1299 *timeout = 120000; /* 2 minutes */
1300 break;
1301 case 0xF7:
1302 case 0xFA:
1303 *timeout = 60000; /* 60 seconds */
1304 break;
1305 case ATA_CMD_SMART:
1306 *timeout = 15000; /* 15 seconds */
1307 break;
1308 default:
1309 *timeout = MTIP_IOCTL_CMD_TIMEOUT_MS;
1310 break;
1311 }
1312}
1313
1532/* 1314/*
1533 * Request the device identity information. 1315 * Request the device identity information.
1534 * 1316 *
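
mtip_set_timeout() is moved ahead of its new callers (with the timeout macro renamed), so the standby-immediate, drive-task and drive-command paths now pick a command-specific timeout instead of hard-coded values. As a worked example of the data-dependent case, with a hypothetical drive whose identify word 89 reads 60 (units of two minutes):

	unsigned int to;
	struct host_to_dev_fis fis = { .command = ATA_CMD_SEC_ERASE_UNIT };

	mtip_set_timeout(dd, &fis, &to, 0);	/* to = 60 * 2 * 60000 = 7,200,000 ms (2 hours) */

With erasemode set, word 90 (the enhanced-erase time) is used instead.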
@@ -1576,7 +1358,7 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
1576 sizeof(u16) * ATA_ID_WORDS, 1358 sizeof(u16) * ATA_ID_WORDS,
1577 0, 1359 0,
1578 GFP_KERNEL, 1360 GFP_KERNEL,
1579 MTIP_INTERNAL_COMMAND_TIMEOUT_MS) 1361 MTIP_INT_CMD_TIMEOUT_MS)
1580 < 0) { 1362 < 0) {
1581 rv = -1; 1363 rv = -1;
1582 goto out; 1364 goto out;
@@ -1644,6 +1426,7 @@ static int mtip_standby_immediate(struct mtip_port *port)
1644 int rv; 1426 int rv;
1645 struct host_to_dev_fis fis; 1427 struct host_to_dev_fis fis;
1646 unsigned long start; 1428 unsigned long start;
1429 unsigned int timeout;
1647 1430
1648 /* Build the FIS. */ 1431 /* Build the FIS. */
1649 memset(&fis, 0, sizeof(struct host_to_dev_fis)); 1432 memset(&fis, 0, sizeof(struct host_to_dev_fis));
@@ -1651,6 +1434,8 @@ static int mtip_standby_immediate(struct mtip_port *port)
1651 fis.opts = 1 << 7; 1434 fis.opts = 1 << 7;
1652 fis.command = ATA_CMD_STANDBYNOW1; 1435 fis.command = ATA_CMD_STANDBYNOW1;
1653 1436
1437 mtip_set_timeout(port->dd, &fis, &timeout, 0);
1438
1654 start = jiffies; 1439 start = jiffies;
1655 rv = mtip_exec_internal_command(port, 1440 rv = mtip_exec_internal_command(port,
1656 &fis, 1441 &fis,
@@ -1659,7 +1444,7 @@ static int mtip_standby_immediate(struct mtip_port *port)
1659 0, 1444 0,
1660 0, 1445 0,
1661 GFP_ATOMIC, 1446 GFP_ATOMIC,
1662 15000); 1447 timeout);
1663 dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", 1448 dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
1664 jiffies_to_msecs(jiffies - start)); 1449 jiffies_to_msecs(jiffies - start));
1665 if (rv) 1450 if (rv)
@@ -1705,7 +1490,7 @@ static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
1705 sectors * ATA_SECT_SIZE, 1490 sectors * ATA_SECT_SIZE,
1706 0, 1491 0,
1707 GFP_ATOMIC, 1492 GFP_ATOMIC,
1708 MTIP_INTERNAL_COMMAND_TIMEOUT_MS); 1493 MTIP_INT_CMD_TIMEOUT_MS);
1709} 1494}
1710 1495
1711/* 1496/*
@@ -1998,6 +1783,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
1998{ 1783{
1999 struct host_to_dev_fis fis; 1784 struct host_to_dev_fis fis;
2000 struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); 1785 struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
1786 unsigned int to;
2001 1787
2002 /* Build the FIS. */ 1788 /* Build the FIS. */
2003 memset(&fis, 0, sizeof(struct host_to_dev_fis)); 1789 memset(&fis, 0, sizeof(struct host_to_dev_fis));
@@ -2011,6 +1797,8 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
2011 fis.cyl_hi = command[5]; 1797 fis.cyl_hi = command[5];
2012 fis.device = command[6] & ~0x10; /* Clear the dev bit*/ 1798 fis.device = command[6] & ~0x10; /* Clear the dev bit*/
2013 1799
1800 mtip_set_timeout(port->dd, &fis, &to, 0);
1801
2014 dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n", 1802 dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
2015 __func__, 1803 __func__,
2016 command[0], 1804 command[0],
@@ -2029,7 +1817,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
2029 0, 1817 0,
2030 0, 1818 0,
2031 GFP_KERNEL, 1819 GFP_KERNEL,
2032 MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { 1820 to) < 0) {
2033 return -1; 1821 return -1;
2034 } 1822 }
2035 1823
@@ -2069,6 +1857,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
2069 u8 *buf = NULL; 1857 u8 *buf = NULL;
2070 dma_addr_t dma_addr = 0; 1858 dma_addr_t dma_addr = 0;
2071 int rv = 0, xfer_sz = command[3]; 1859 int rv = 0, xfer_sz = command[3];
1860 unsigned int to;
2072 1861
2073 if (xfer_sz) { 1862 if (xfer_sz) {
2074 if (!user_buffer) 1863 if (!user_buffer)
@@ -2100,6 +1889,8 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
2100 fis.cyl_hi = 0xC2; 1889 fis.cyl_hi = 0xC2;
2101 } 1890 }
2102 1891
1892 mtip_set_timeout(port->dd, &fis, &to, 0);
1893
2103 if (xfer_sz) 1894 if (xfer_sz)
2104 reply = (port->rxfis + RX_FIS_PIO_SETUP); 1895 reply = (port->rxfis + RX_FIS_PIO_SETUP);
2105 else 1896 else
@@ -2122,7 +1913,7 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
2122 (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), 1913 (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
2123 0, 1914 0,
2124 GFP_KERNEL, 1915 GFP_KERNEL,
2125 MTIP_IOCTL_COMMAND_TIMEOUT_MS) 1916 to)
2126 < 0) { 1917 < 0) {
2127 rv = -EFAULT; 1918 rv = -EFAULT;
2128 goto exit_drive_command; 1919 goto exit_drive_command;
@@ -2202,36 +1993,6 @@ static unsigned int implicit_sector(unsigned char command,
2202 } 1993 }
2203 return rv; 1994 return rv;
2204} 1995}
2205static void mtip_set_timeout(struct driver_data *dd,
2206 struct host_to_dev_fis *fis,
2207 unsigned int *timeout, u8 erasemode)
2208{
2209 switch (fis->command) {
2210 case ATA_CMD_DOWNLOAD_MICRO:
2211 *timeout = 120000; /* 2 minutes */
2212 break;
2213 case ATA_CMD_SEC_ERASE_UNIT:
2214 case 0xFC:
2215 if (erasemode)
2216 *timeout = ((*(dd->port->identify + 90) * 2) * 60000);
2217 else
2218 *timeout = ((*(dd->port->identify + 89) * 2) * 60000);
2219 break;
2220 case ATA_CMD_STANDBYNOW1:
2221 *timeout = 120000; /* 2 minutes */
2222 break;
2223 case 0xF7:
2224 case 0xFA:
2225 *timeout = 60000; /* 60 seconds */
2226 break;
2227 case ATA_CMD_SMART:
2228 *timeout = 15000; /* 15 seconds */
2229 break;
2230 default:
2231 *timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
2232 break;
2233 }
2234}
2235 1996
2236/* 1997/*
2237 * Executes a taskfile 1998 * Executes a taskfile
@@ -2606,22 +2367,21 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
2606 * return value 2367 * return value
2607 * None 2368 * None
2608 */ 2369 */
2609static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, 2370static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
2610 int nsect, int nents, int tag, void *callback, 2371 struct mtip_cmd *command, int nents,
2611 void *data, int dir, int unaligned) 2372 struct blk_mq_hw_ctx *hctx)
2612{ 2373{
2613 struct host_to_dev_fis *fis; 2374 struct host_to_dev_fis *fis;
2614 struct mtip_port *port = dd->port; 2375 struct mtip_port *port = dd->port;
2615 struct mtip_cmd *command = &port->commands[tag]; 2376 int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2616 int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; 2377 u64 start = blk_rq_pos(rq);
2617 u64 start = sector; 2378 unsigned int nsect = blk_rq_sectors(rq);
2618 2379
2619 /* Map the scatter list for DMA access */ 2380 /* Map the scatter list for DMA access */
2620 nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); 2381 nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
2621 2382
2622 command->scatter_ents = nents; 2383 command->scatter_ents = nents;
2623 2384
2624 command->unaligned = unaligned;
2625 /* 2385 /*
2626 * The number of retries for this command before it is 2386 * The number of retries for this command before it is
2627 * reported as a failure to the upper layers. 2387 * reported as a failure to the upper layers.
@@ -2632,8 +2392,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
2632 fis = command->command; 2392 fis = command->command;
2633 fis->type = 0x27; 2393 fis->type = 0x27;
2634 fis->opts = 1 << 7; 2394 fis->opts = 1 << 7;
2635 fis->command = 2395 if (rq_data_dir(rq) == READ)
2636 (dir == READ ? ATA_CMD_FPDMA_READ : ATA_CMD_FPDMA_WRITE); 2396 fis->command = ATA_CMD_FPDMA_READ;
2397 else
2398 fis->command = ATA_CMD_FPDMA_WRITE;
2637 fis->lba_low = start & 0xFF; 2399 fis->lba_low = start & 0xFF;
2638 fis->lba_mid = (start >> 8) & 0xFF; 2400 fis->lba_mid = (start >> 8) & 0xFF;
2639 fis->lba_hi = (start >> 16) & 0xFF; 2401 fis->lba_hi = (start >> 16) & 0xFF;
@@ -2643,14 +2405,14 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
2643 fis->device = 1 << 6; 2405 fis->device = 1 << 6;
2644 fis->features = nsect & 0xFF; 2406 fis->features = nsect & 0xFF;
2645 fis->features_ex = (nsect >> 8) & 0xFF; 2407 fis->features_ex = (nsect >> 8) & 0xFF;
2646 fis->sect_count = ((tag << 3) | (tag >> 5)); 2408 fis->sect_count = ((rq->tag << 3) | (rq->tag >> 5));
2647 fis->sect_cnt_ex = 0; 2409 fis->sect_cnt_ex = 0;
2648 fis->control = 0; 2410 fis->control = 0;
2649 fis->res2 = 0; 2411 fis->res2 = 0;
2650 fis->res3 = 0; 2412 fis->res3 = 0;
2651 fill_command_sg(dd, command, nents); 2413 fill_command_sg(dd, command, nents);
2652 2414
2653 if (unaligned) 2415 if (command->unaligned)
2654 fis->device |= 1 << 7; 2416 fis->device |= 1 << 7;
2655 2417
2656 /* Populate the command header */ 2418 /* Populate the command header */
@@ -2668,81 +2430,17 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector,
2668 command->direction = dma_dir; 2430 command->direction = dma_dir;
2669 2431
2670 /* 2432 /*
2671 * Set the completion function and data for the command passed
2672 * from the upper layer.
2673 */
2674 command->async_data = data;
2675 command->async_callback = callback;
2676
2677 /*
2678 * To prevent this command from being issued 2433 * To prevent this command from being issued
2679 * if an internal command is in progress or error handling is active. 2434 * if an internal command is in progress or error handling is active.
2680 */ 2435 */
2681 if (port->flags & MTIP_PF_PAUSE_IO) { 2436 if (port->flags & MTIP_PF_PAUSE_IO) {
2682 set_bit(tag, port->cmds_to_issue); 2437 set_bit(rq->tag, port->cmds_to_issue);
2683 set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); 2438 set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
2684 return; 2439 return;
2685 } 2440 }
2686 2441
2687 /* Issue the command to the hardware */ 2442 /* Issue the command to the hardware */
2688 mtip_issue_ncq_command(port, tag); 2443 mtip_issue_ncq_command(port, rq->tag);
2689
2690 return;
2691}
2692
2693/*
2694 * Release a command slot.
2695 *
2696 * @dd Pointer to the driver data structure.
2697 * @tag Slot tag
2698 *
2699 * return value
2700 * None
2701 */
2702static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag,
2703 int unaligned)
2704{
2705 struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
2706 &dd->port->cmd_slot;
2707 release_slot(dd->port, tag);
2708 up(sem);
2709}
2710
2711/*
2712 * Obtain a command slot and return its associated scatter list.
2713 *
2714 * @dd Pointer to the driver data structure.
2715 * @tag Pointer to an int that will receive the allocated command
2716 * slot tag.
2717 *
2718 * return value
2719 * Pointer to the scatter list for the allocated command slot
2720 * or NULL if no command slots are available.
2721 */
2722static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
2723 int *tag, int unaligned)
2724{
2725 struct semaphore *sem = unaligned ? &dd->port->cmd_slot_unal :
2726 &dd->port->cmd_slot;
2727
2728 /*
2729 * It is possible that, even with this semaphore, a thread
2730 * may think that no command slots are available. Therefore, we
2731 * need to make an attempt to get_slot().
2732 */
2733 down(sem);
2734 *tag = get_slot(dd->port);
2735
2736 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
2737 up(sem);
2738 return NULL;
2739 }
2740 if (unlikely(*tag < 0)) {
2741 up(sem);
2742 return NULL;
2743 }
2744
2745 return dd->port->commands[*tag].sg;
2746} 2444}
2747 2445
2748/* 2446/*
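
mtip_hw_submit_io() now takes the request and its pre-built struct mtip_cmd directly from blk-mq, which is why the scatterlist get/release helpers above could be deleted. The ->queue_rq() hook that feeds it is outside this section; a hedged sketch of how such a handler would tie the pieces together (the function name and the queuedata lookup are assumptions, not the literal driver code):

/* Sketch only; not the mtip_queue_rq() from this patch. */
static int mtip_queue_rq_sketch(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct driver_data *dd = hctx->queue->queuedata;
	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
	int nents;

	/* build the DMA scatterlist from the request's bios */
	nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg);

	/* sector, length and tag are all taken from the request itself */
	mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
	return BLK_MQ_RQ_QUEUE_OK;
}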
@@ -3113,6 +2811,7 @@ static int mtip_free_orphan(struct driver_data *dd)
3113 if (dd->queue) { 2811 if (dd->queue) {
3114 dd->queue->queuedata = NULL; 2812 dd->queue->queuedata = NULL;
3115 blk_cleanup_queue(dd->queue); 2813 blk_cleanup_queue(dd->queue);
2814 blk_mq_free_tag_set(&dd->tags);
3116 dd->queue = NULL; 2815 dd->queue = NULL;
3117 } 2816 }
3118 } 2817 }
@@ -3270,6 +2969,11 @@ static int mtip_service_thread(void *data)
3270 int ret; 2969 int ret;
3271 2970
3272 while (1) { 2971 while (1) {
2972 if (kthread_should_stop() ||
2973 test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
2974 goto st_out;
2975 clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
2976
3273 /* 2977 /*
3274 * the condition is to check neither an internal command is 2978 * the condition is to check neither an internal command is
3275 * is in progress nor error handling is active 2979 * is in progress nor error handling is active
@@ -3277,11 +2981,12 @@ static int mtip_service_thread(void *data)
3277 wait_event_interruptible(port->svc_wait, (port->flags) && 2981 wait_event_interruptible(port->svc_wait, (port->flags) &&
3278 !(port->flags & MTIP_PF_PAUSE_IO)); 2982 !(port->flags & MTIP_PF_PAUSE_IO));
3279 2983
3280 if (kthread_should_stop())
3281 goto st_out;
3282
3283 set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); 2984 set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
3284 2985
2986 if (kthread_should_stop() ||
2987 test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
2988 goto st_out;
2989
3285 /* If I am an orphan, start self cleanup */ 2990 /* If I am an orphan, start self cleanup */
3286 if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) 2991 if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags))
3287 break; 2992 break;
@@ -3290,6 +2995,16 @@ static int mtip_service_thread(void *data)
3290 &dd->dd_flag))) 2995 &dd->dd_flag)))
3291 goto st_out; 2996 goto st_out;
3292 2997
2998restart_eh:
2999 /* Demux bits: start with error handling */
3000 if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) {
3001 mtip_handle_tfe(dd);
3002 clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
3003 }
3004
3005 if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags))
3006 goto restart_eh;
3007
3293 if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { 3008 if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
3294 slot = 1; 3009 slot = 1;
3295 /* used to restrict the loop to one iteration */ 3010 /* used to restrict the loop to one iteration */
@@ -3319,16 +3034,14 @@ static int mtip_service_thread(void *data)
3319 } 3034 }
3320 3035
3321 clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); 3036 clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
3322 } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { 3037 }
3038
3039 if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
3323 if (mtip_ftl_rebuild_poll(dd) < 0) 3040 if (mtip_ftl_rebuild_poll(dd) < 0)
3324 set_bit(MTIP_DDF_REBUILD_FAILED_BIT, 3041 set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
3325 &dd->dd_flag); 3042 &dd->dd_flag);
3326 clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); 3043 clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
3327 } 3044 }
3328 clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
3329
3330 if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
3331 goto st_out;
3332 } 3045 }
3333 3046
3334 /* wait for pci remove to exit */ 3047 /* wait for pci remove to exit */
@@ -3365,7 +3078,6 @@ st_out:
3365 */ 3078 */
3366static void mtip_dma_free(struct driver_data *dd) 3079static void mtip_dma_free(struct driver_data *dd)
3367{ 3080{
3368 int i;
3369 struct mtip_port *port = dd->port; 3081 struct mtip_port *port = dd->port;
3370 3082
3371 if (port->block1) 3083 if (port->block1)
@@ -3376,13 +3088,6 @@ static void mtip_dma_free(struct driver_data *dd)
3376 dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, 3088 dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
3377 port->command_list, port->command_list_dma); 3089 port->command_list, port->command_list_dma);
3378 } 3090 }
3379
3380 for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
3381 if (port->commands[i].command)
3382 dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
3383 port->commands[i].command,
3384 port->commands[i].command_dma);
3385 }
3386} 3091}
3387 3092
3388/* 3093/*
@@ -3396,8 +3101,6 @@ static void mtip_dma_free(struct driver_data *dd)
3396static int mtip_dma_alloc(struct driver_data *dd) 3101static int mtip_dma_alloc(struct driver_data *dd)
3397{ 3102{
3398 struct mtip_port *port = dd->port; 3103 struct mtip_port *port = dd->port;
3399 int i, rv = 0;
3400 u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
3401 3104
3402 /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ 3105 /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
3403 port->block1 = 3106 port->block1 =
@@ -3430,41 +3133,63 @@ static int mtip_dma_alloc(struct driver_data *dd)
3430 port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET; 3133 port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET;
3431 port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; 3134 port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET;
3432 3135
3433 /* Setup per command SGL DMA region */ 3136 return 0;
3434 3137}
3435 /* Point the command headers at the command tables */
3436 for (i = 0; i < MTIP_MAX_COMMAND_SLOTS; i++) {
3437 port->commands[i].command =
3438 dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
3439 &port->commands[i].command_dma, GFP_KERNEL);
3440 if (!port->commands[i].command) {
3441 rv = -ENOMEM;
3442 mtip_dma_free(dd);
3443 return rv;
3444 }
3445 memset(port->commands[i].command, 0, CMD_DMA_ALLOC_SZ);
3446
3447 port->commands[i].command_header = port->command_list +
3448 (sizeof(struct mtip_cmd_hdr) * i);
3449 port->commands[i].command_header_dma =
3450 dd->port->command_list_dma +
3451 (sizeof(struct mtip_cmd_hdr) * i);
3452 3138
3453 if (host_cap_64) 3139static int mtip_hw_get_identify(struct driver_data *dd)
3454 port->commands[i].command_header->ctbau = 3140{
3455 __force_bit2int cpu_to_le32( 3141 struct smart_attr attr242;
3456 (port->commands[i].command_dma >> 16) >> 16); 3142 unsigned char *buf;
3143 int rv;
3457 3144
3458 port->commands[i].command_header->ctba = 3145 if (mtip_get_identify(dd->port, NULL) < 0)
3459 __force_bit2int cpu_to_le32( 3146 return -EFAULT;
3460 port->commands[i].command_dma & 0xFFFFFFFF);
3461 3147
3462 sg_init_table(port->commands[i].sg, MTIP_MAX_SG); 3148 if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
3149 MTIP_FTL_REBUILD_MAGIC) {
3150 set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
3151 return MTIP_FTL_REBUILD_MAGIC;
3152 }
3153 mtip_dump_identify(dd->port);
3463 3154
3464 /* Mark command as currently inactive */ 3155 /* check write protect, over temp and rebuild statuses */
3465 atomic_set(&dd->port->commands[i].active, 0); 3156 rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
3157 dd->port->log_buf,
3158 dd->port->log_buf_dma, 1);
3159 if (rv) {
3160 dev_warn(&dd->pdev->dev,
3161 "Error in READ LOG EXT (10h) command\n");
3162 /* non-critical error, don't fail the load */
3163 } else {
3164 buf = (unsigned char *)dd->port->log_buf;
3165 if (buf[259] & 0x1) {
3166 dev_info(&dd->pdev->dev,
3167 "Write protect bit is set.\n");
3168 set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
3169 }
3170 if (buf[288] == 0xF7) {
3171 dev_info(&dd->pdev->dev,
3172 "Exceeded Tmax, drive in thermal shutdown.\n");
3173 set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
3174 }
3175 if (buf[288] == 0xBF) {
3176 dev_info(&dd->pdev->dev,
3177 "Drive indicates rebuild has failed.\n");
3178 /* TODO */
3179 }
3466 } 3180 }
3467 return 0; 3181
3182 /* get write protect progess */
3183 memset(&attr242, 0, sizeof(struct smart_attr));
3184 if (mtip_get_smart_attr(dd->port, 242, &attr242))
3185 dev_warn(&dd->pdev->dev,
3186 "Unable to check write protect progress\n");
3187 else
3188 dev_info(&dd->pdev->dev,
3189 "Write protect progress: %u%% (%u blocks)\n",
3190 attr242.cur, le32_to_cpu(attr242.data));
3191
3192 return rv;
3468} 3193}
3469 3194
3470/* 3195/*
@@ -3481,8 +3206,6 @@ static int mtip_hw_init(struct driver_data *dd)
3481 int rv; 3206 int rv;
3482 unsigned int num_command_slots; 3207 unsigned int num_command_slots;
3483 unsigned long timeout, timetaken; 3208 unsigned long timeout, timetaken;
3484 unsigned char *buf;
3485 struct smart_attr attr242;
3486 3209
3487 dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; 3210 dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
3488 3211
@@ -3513,8 +3236,6 @@ static int mtip_hw_init(struct driver_data *dd)
3513 else 3236 else
3514 dd->unal_qdepth = 0; 3237 dd->unal_qdepth = 0;
3515 3238
3516 /* Counting semaphore to track command slot usage */
3517 sema_init(&dd->port->cmd_slot, num_command_slots - 1 - dd->unal_qdepth);
3518 sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); 3239 sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
3519 3240
3520 /* Spinlock to prevent concurrent issue */ 3241 /* Spinlock to prevent concurrent issue */
@@ -3599,73 +3320,16 @@ static int mtip_hw_init(struct driver_data *dd)
3599 writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, 3320 writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
3600 dd->mmio + HOST_CTL); 3321 dd->mmio + HOST_CTL);
3601 3322
3602 init_timer(&dd->port->cmd_timer);
3603 init_waitqueue_head(&dd->port->svc_wait); 3323 init_waitqueue_head(&dd->port->svc_wait);
3604 3324
3605 dd->port->cmd_timer.data = (unsigned long int) dd->port;
3606 dd->port->cmd_timer.function = mtip_timeout_function;
3607 mod_timer(&dd->port->cmd_timer,
3608 jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
3609
3610
3611 if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { 3325 if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
3612 rv = -EFAULT; 3326 rv = -EFAULT;
3613 goto out3; 3327 goto out3;
3614 } 3328 }
3615 3329
3616 if (mtip_get_identify(dd->port, NULL) < 0) {
3617 rv = -EFAULT;
3618 goto out3;
3619 }
3620 mtip_dump_identify(dd->port);
3621
3622 if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
3623 MTIP_FTL_REBUILD_MAGIC) {
3624 set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
3625 return MTIP_FTL_REBUILD_MAGIC;
3626 }
3627
3628 /* check write protect, over temp and rebuild statuses */
3629 rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
3630 dd->port->log_buf,
3631 dd->port->log_buf_dma, 1);
3632 if (rv) {
3633 dev_warn(&dd->pdev->dev,
3634 "Error in READ LOG EXT (10h) command\n");
3635 /* non-critical error, don't fail the load */
3636 } else {
3637 buf = (unsigned char *)dd->port->log_buf;
3638 if (buf[259] & 0x1) {
3639 dev_info(&dd->pdev->dev,
3640 "Write protect bit is set.\n");
3641 set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
3642 }
3643 if (buf[288] == 0xF7) {
3644 dev_info(&dd->pdev->dev,
3645 "Exceeded Tmax, drive in thermal shutdown.\n");
3646 set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
3647 }
3648 if (buf[288] == 0xBF) {
3649 dev_info(&dd->pdev->dev,
3650 "Drive is in security locked state.\n");
3651 set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
3652 }
3653 }
3654
3655 /* get write protect progess */
3656 memset(&attr242, 0, sizeof(struct smart_attr));
3657 if (mtip_get_smart_attr(dd->port, 242, &attr242))
3658 dev_warn(&dd->pdev->dev,
3659 "Unable to check write protect progress\n");
3660 else
3661 dev_info(&dd->pdev->dev,
3662 "Write protect progress: %u%% (%u blocks)\n",
3663 attr242.cur, le32_to_cpu(attr242.data));
3664 return rv; 3330 return rv;
3665 3331
3666out3: 3332out3:
3667 del_timer_sync(&dd->port->cmd_timer);
3668
3669 /* Disable interrupts on the HBA. */ 3333 /* Disable interrupts on the HBA. */
3670 writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, 3334 writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
3671 dd->mmio + HOST_CTL); 3335 dd->mmio + HOST_CTL);
@@ -3685,6 +3349,22 @@ out1:
3685 return rv; 3349 return rv;
3686} 3350}
3687 3351
3352static void mtip_standby_drive(struct driver_data *dd)
3353{
3354 if (dd->sr)
3355 return;
3356
3357 /*
3358 * Send standby immediate (E0h) to the drive so that it
3359 * saves its state.
3360 */
3361 if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
3362 !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
3363 if (mtip_standby_immediate(dd->port))
3364 dev_warn(&dd->pdev->dev,
3365 "STANDBY IMMEDIATE failed\n");
3366}
3367
3688/* 3368/*
3689 * Called to deinitialize an interface. 3369 * Called to deinitialize an interface.
3690 * 3370 *
@@ -3700,12 +3380,6 @@ static int mtip_hw_exit(struct driver_data *dd)
3700 * saves its state. 3380 * saves its state.
3701 */ 3381 */
3702 if (!dd->sr) { 3382 if (!dd->sr) {
3703 if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
3704 !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
3705 if (mtip_standby_immediate(dd->port))
3706 dev_warn(&dd->pdev->dev,
3707 "STANDBY IMMEDIATE failed\n");
3708
3709 /* de-initialize the port. */ 3383 /* de-initialize the port. */
3710 mtip_deinit_port(dd->port); 3384 mtip_deinit_port(dd->port);
3711 3385
@@ -3714,8 +3388,6 @@ static int mtip_hw_exit(struct driver_data *dd)
3714 dd->mmio + HOST_CTL); 3388 dd->mmio + HOST_CTL);
3715 } 3389 }
3716 3390
3717 del_timer_sync(&dd->port->cmd_timer);
3718
3719 /* Release the IRQ. */ 3391 /* Release the IRQ. */
3720 irq_set_affinity_hint(dd->pdev->irq, NULL); 3392 irq_set_affinity_hint(dd->pdev->irq, NULL);
3721 devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); 3393 devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
@@ -4032,100 +3704,138 @@ static const struct block_device_operations mtip_block_ops = {
4032 * 3704 *
4033 * @queue Pointer to the request queue. Unused other than to obtain 3705 * @queue Pointer to the request queue. Unused other than to obtain
4034 * the driver data structure. 3706 * the driver data structure.
4035 * @bio Pointer to the BIO. 3707 * @rq Pointer to the request.
4036 * 3708 *
4037 */ 3709 */
4038static void mtip_make_request(struct request_queue *queue, struct bio *bio) 3710static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
4039{ 3711{
4040 struct driver_data *dd = queue->queuedata; 3712 struct driver_data *dd = hctx->queue->queuedata;
4041 struct scatterlist *sg; 3713 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
4042 struct bio_vec bvec; 3714 unsigned int nents;
4043 struct bvec_iter iter;
4044 int nents = 0;
4045 int tag = 0, unaligned = 0;
4046 3715
4047 if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { 3716 if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
4048 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, 3717 if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
4049 &dd->dd_flag))) { 3718 &dd->dd_flag))) {
4050 bio_endio(bio, -ENXIO); 3719 return -ENXIO;
4051 return;
4052 } 3720 }
4053 if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { 3721 if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
4054 bio_endio(bio, -ENODATA); 3722 return -ENODATA;
4055 return;
4056 } 3723 }
4057 if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, 3724 if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
4058 &dd->dd_flag) && 3725 &dd->dd_flag) &&
4059 bio_data_dir(bio))) { 3726 rq_data_dir(rq))) {
4060 bio_endio(bio, -ENODATA); 3727 return -ENODATA;
4061 return;
4062 }
4063 if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) {
4064 bio_endio(bio, -ENODATA);
4065 return;
4066 }
4067 if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
4068 bio_endio(bio, -ENXIO);
4069 return;
4070 } 3728 }
3729 if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)))
3730 return -ENODATA;
3731 if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
3732 return -ENXIO;
4071 } 3733 }
4072 3734
4073 if (unlikely(bio->bi_rw & REQ_DISCARD)) { 3735 if (rq->cmd_flags & REQ_DISCARD) {
4074 bio_endio(bio, mtip_send_trim(dd, bio->bi_iter.bi_sector, 3736 int err;
4075 bio_sectors(bio)));
4076 return;
4077 }
4078 3737
4079 if (unlikely(!bio_has_data(bio))) { 3738 err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
4080 blk_queue_flush(queue, 0); 3739 blk_mq_end_io(rq, err);
4081 bio_endio(bio, 0); 3740 return 0;
4082 return;
4083 } 3741 }
4084 3742
4085 if (bio_data_dir(bio) == WRITE && bio_sectors(bio) <= 64 && 3743 /* Create the scatter list for this request. */
4086 dd->unal_qdepth) { 3744 nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg);
4087 if (bio->bi_iter.bi_sector % 8 != 0) 3745
4088 /* Unaligned on 4k boundaries */ 3746 /* Issue the read/write. */
4089 unaligned = 1; 3747 mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
4090 else if (bio_sectors(bio) % 8 != 0) /* Aligned but not 4k/8k */ 3748 return 0;
4091 unaligned = 1; 3749}
3750
3751static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
3752 struct request *rq)
3753{
3754 struct driver_data *dd = hctx->queue->queuedata;
3755 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
3756
3757 if (!dd->unal_qdepth || rq_data_dir(rq) == READ)
3758 return false;
3759
3760 /*
3761 * If unaligned depth must be limited on this controller, mark it
3762 * as unaligned if the IO isn't on a 4k boundary (start of length).
3763 */
3764 if (blk_rq_sectors(rq) <= 64) {
3765 if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7))
3766 cmd->unaligned = 1;
4092 } 3767 }
4093 3768
4094 sg = mtip_hw_get_scatterlist(dd, &tag, unaligned); 3769 if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal))
4095 if (likely(sg != NULL)) { 3770 return true;
4096 blk_queue_bounce(queue, &bio);
4097 3771
4098 if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) { 3772 return false;
4099 dev_warn(&dd->pdev->dev, 3773}
4100 "Maximum number of SGL entries exceeded\n");
4101 bio_io_error(bio);
4102 mtip_hw_release_scatterlist(dd, tag, unaligned);
4103 return;
4104 }
4105 3774
4106 /* Create the scatter list for this bio. */ 3775static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
4107 bio_for_each_segment(bvec, bio, iter) { 3776{
4108 sg_set_page(&sg[nents], 3777 int ret;
4109 bvec.bv_page,
4110 bvec.bv_len,
4111 bvec.bv_offset);
4112 nents++;
4113 }
4114 3778
4115 /* Issue the read/write. */ 3779 if (mtip_check_unal_depth(hctx, rq))
4116 mtip_hw_submit_io(dd, 3780 return BLK_MQ_RQ_QUEUE_BUSY;
4117 bio->bi_iter.bi_sector, 3781
4118 bio_sectors(bio), 3782 ret = mtip_submit_request(hctx, rq);
4119 nents, 3783 if (!ret)
4120 tag, 3784 return BLK_MQ_RQ_QUEUE_OK;
4121 bio_endio, 3785
4122 bio, 3786 rq->errors = ret;
4123 bio_data_dir(bio), 3787 return BLK_MQ_RQ_QUEUE_ERROR;
4124 unaligned); 3788}
4125 } else 3789
4126 bio_io_error(bio); 3790static void mtip_free_cmd(void *data, struct request *rq,
3791 unsigned int hctx_idx, unsigned int request_idx)
3792{
3793 struct driver_data *dd = data;
3794 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
3795
3796 if (!cmd->command)
3797 return;
3798
3799 dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
3800 cmd->command, cmd->command_dma);
3801}
3802
3803static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
3804 unsigned int request_idx, unsigned int numa_node)
3805{
3806 struct driver_data *dd = data;
3807 struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
3808 u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
3809
3810 cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
3811 &cmd->command_dma, GFP_KERNEL);
3812 if (!cmd->command)
3813 return -ENOMEM;
3814
3815 memset(cmd->command, 0, CMD_DMA_ALLOC_SZ);
3816
3817 /* Point the command headers at the command tables. */
3818 cmd->command_header = dd->port->command_list +
3819 (sizeof(struct mtip_cmd_hdr) * request_idx);
3820 cmd->command_header_dma = dd->port->command_list_dma +
3821 (sizeof(struct mtip_cmd_hdr) * request_idx);
3822
3823 if (host_cap_64)
3824 cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
3825
3826 cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
3827
3828 sg_init_table(cmd->sg, MTIP_MAX_SG);
3829 return 0;
4127} 3830}
4128 3831
3832static struct blk_mq_ops mtip_mq_ops = {
3833 .queue_rq = mtip_queue_rq,
3834 .map_queue = blk_mq_map_queue,
3835 .init_request = mtip_init_cmd,
3836 .exit_request = mtip_free_cmd,
3837};
3838
4129/* 3839/*
4130 * Block layer initialization function. 3840 * Block layer initialization function.
4131 * 3841 *
@@ -4148,11 +3858,7 @@ static int mtip_block_initialize(struct driver_data *dd)
4148 if (dd->disk) 3858 if (dd->disk)
4149 goto skip_create_disk; /* hw init done, before rebuild */ 3859 goto skip_create_disk; /* hw init done, before rebuild */
4150 3860
4151 /* Initialize the protocol layer. */ 3861 if (mtip_hw_init(dd)) {
4152 wait_for_rebuild = mtip_hw_init(dd);
4153 if (wait_for_rebuild < 0) {
4154 dev_err(&dd->pdev->dev,
4155 "Protocol layer initialization failed\n");
4156 rv = -EINVAL; 3862 rv = -EINVAL;
4157 goto protocol_init_error; 3863 goto protocol_init_error;
4158 } 3864 }
@@ -4194,29 +3900,53 @@ static int mtip_block_initialize(struct driver_data *dd)
4194 3900
4195 mtip_hw_debugfs_init(dd); 3901 mtip_hw_debugfs_init(dd);
4196 3902
4197 /*
4198 * if rebuild pending, start the service thread, and delay the block
4199 * queue creation and add_disk()
4200 */
4201 if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
4202 goto start_service_thread;
4203
4204skip_create_disk: 3903skip_create_disk:
4205 /* Allocate the request queue. */ 3904 memset(&dd->tags, 0, sizeof(dd->tags));
4206 dd->queue = blk_alloc_queue_node(GFP_KERNEL, dd->numa_node); 3905 dd->tags.ops = &mtip_mq_ops;
4207 if (dd->queue == NULL) { 3906 dd->tags.nr_hw_queues = 1;
3907 dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS;
3908 dd->tags.reserved_tags = 1;
3909 dd->tags.cmd_size = sizeof(struct mtip_cmd);
3910 dd->tags.numa_node = dd->numa_node;
3911 dd->tags.flags = BLK_MQ_F_SHOULD_MERGE;
3912 dd->tags.driver_data = dd;
3913
3914 rv = blk_mq_alloc_tag_set(&dd->tags);
3915 if (rv) {
4208 dev_err(&dd->pdev->dev, 3916 dev_err(&dd->pdev->dev,
4209 "Unable to allocate request queue\n"); 3917 "Unable to allocate request queue\n");
4210 rv = -ENOMEM; 3918 rv = -ENOMEM;
4211 goto block_queue_alloc_init_error; 3919 goto block_queue_alloc_init_error;
4212 } 3920 }
4213 3921
4214 /* Attach our request function to the request queue. */ 3922 /* Allocate the request queue. */
4215 blk_queue_make_request(dd->queue, mtip_make_request); 3923 dd->queue = blk_mq_init_queue(&dd->tags);
3924 if (IS_ERR(dd->queue)) {
3925 dev_err(&dd->pdev->dev,
3926 "Unable to allocate request queue\n");
3927 rv = -ENOMEM;
3928 goto block_queue_alloc_init_error;
3929 }
4216 3930
4217 dd->disk->queue = dd->queue; 3931 dd->disk->queue = dd->queue;
4218 dd->queue->queuedata = dd; 3932 dd->queue->queuedata = dd;
4219 3933
3934 /* Initialize the protocol layer. */
3935 wait_for_rebuild = mtip_hw_get_identify(dd);
3936 if (wait_for_rebuild < 0) {
3937 dev_err(&dd->pdev->dev,
3938 "Protocol layer initialization failed\n");
3939 rv = -EINVAL;
3940 goto init_hw_cmds_error;
3941 }
3942
3943 /*
3944 * if rebuild pending, start the service thread, and delay the block
3945 * queue creation and add_disk()
3946 */
3947 if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
3948 goto start_service_thread;
3949
4220 /* Set device limits. */ 3950 /* Set device limits. */
4221 set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); 3951 set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
4222 blk_queue_max_segments(dd->queue, MTIP_MAX_SG); 3952 blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
@@ -4295,8 +4025,9 @@ kthread_run_error:
4295 del_gendisk(dd->disk); 4025 del_gendisk(dd->disk);
4296 4026
4297read_capacity_error: 4027read_capacity_error:
4028init_hw_cmds_error:
4298 blk_cleanup_queue(dd->queue); 4029 blk_cleanup_queue(dd->queue);
4299 4030 blk_mq_free_tag_set(&dd->tags);
4300block_queue_alloc_init_error: 4031block_queue_alloc_init_error:
4301 mtip_hw_debugfs_exit(dd); 4032 mtip_hw_debugfs_exit(dd);
4302disk_index_error: 4033disk_index_error:
@@ -4345,6 +4076,9 @@ static int mtip_block_remove(struct driver_data *dd)
4345 kobject_put(kobj); 4076 kobject_put(kobj);
4346 } 4077 }
4347 } 4078 }
4079
4080 mtip_standby_drive(dd);
4081
4348 /* 4082 /*
4349 * Delete our gendisk structure. This also removes the device 4083 * Delete our gendisk structure. This also removes the device
4350 * from /dev 4084 * from /dev
@@ -4357,6 +4091,7 @@ static int mtip_block_remove(struct driver_data *dd)
4357 if (dd->disk->queue) { 4091 if (dd->disk->queue) {
4358 del_gendisk(dd->disk); 4092 del_gendisk(dd->disk);
4359 blk_cleanup_queue(dd->queue); 4093 blk_cleanup_queue(dd->queue);
4094 blk_mq_free_tag_set(&dd->tags);
4360 dd->queue = NULL; 4095 dd->queue = NULL;
4361 } else 4096 } else
4362 put_disk(dd->disk); 4097 put_disk(dd->disk);
@@ -4391,6 +4126,8 @@ static int mtip_block_remove(struct driver_data *dd)
4391 */ 4126 */
4392static int mtip_block_shutdown(struct driver_data *dd) 4127static int mtip_block_shutdown(struct driver_data *dd)
4393{ 4128{
4129 mtip_hw_shutdown(dd);
4130
4394 /* Delete our gendisk structure, and cleanup the blk queue. */ 4131 /* Delete our gendisk structure, and cleanup the blk queue. */
4395 if (dd->disk) { 4132 if (dd->disk) {
4396 dev_info(&dd->pdev->dev, 4133 dev_info(&dd->pdev->dev,
@@ -4399,6 +4136,7 @@ static int mtip_block_shutdown(struct driver_data *dd)
4399 if (dd->disk->queue) { 4136 if (dd->disk->queue) {
4400 del_gendisk(dd->disk); 4137 del_gendisk(dd->disk);
4401 blk_cleanup_queue(dd->queue); 4138 blk_cleanup_queue(dd->queue);
4139 blk_mq_free_tag_set(&dd->tags);
4402 } else 4140 } else
4403 put_disk(dd->disk); 4141 put_disk(dd->disk);
4404 dd->disk = NULL; 4142 dd->disk = NULL;
@@ -4408,8 +4146,6 @@ static int mtip_block_shutdown(struct driver_data *dd)
4408 spin_lock(&rssd_index_lock); 4146 spin_lock(&rssd_index_lock);
4409 ida_remove(&rssd_index_ida, dd->index); 4147 ida_remove(&rssd_index_ida, dd->index);
4410 spin_unlock(&rssd_index_lock); 4148 spin_unlock(&rssd_index_lock);
4411
4412 mtip_hw_shutdown(dd);
4413 return 0; 4149 return 0;
4414} 4150}
4415 4151
@@ -4479,6 +4215,57 @@ static DEFINE_HANDLER(5);
4479static DEFINE_HANDLER(6); 4215static DEFINE_HANDLER(6);
4480static DEFINE_HANDLER(7); 4216static DEFINE_HANDLER(7);
4481 4217
4218static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
4219{
4220 int pos;
4221 unsigned short pcie_dev_ctrl;
4222
4223 pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
4224 if (pos) {
4225 pci_read_config_word(pdev,
4226 pos + PCI_EXP_DEVCTL,
4227 &pcie_dev_ctrl);
4228 if (pcie_dev_ctrl & (1 << 11) ||
4229 pcie_dev_ctrl & (1 << 4)) {
4230 dev_info(&dd->pdev->dev,
4231 "Disabling ERO/No-Snoop on bridge device %04x:%04x\n",
4232 pdev->vendor, pdev->device);
4233 pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN |
4234 PCI_EXP_DEVCTL_RELAX_EN);
4235 pci_write_config_word(pdev,
4236 pos + PCI_EXP_DEVCTL,
4237 pcie_dev_ctrl);
4238 }
4239 }
4240}
4241
4242static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev)
4243{
4244 /*
4245 * This workaround is specific to AMD/ATI chipset with a PCI upstream
4246 * device with device id 0x5aXX
4247 */
4248 if (pdev->bus && pdev->bus->self) {
4249 if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI &&
4250 ((pdev->bus->self->device & 0xff00) == 0x5a00)) {
4251 mtip_disable_link_opts(dd, pdev->bus->self);
4252 } else {
4253 /* Check further up the topology */
4254 struct pci_dev *parent_dev = pdev->bus->self;
4255 if (parent_dev->bus &&
4256 parent_dev->bus->parent &&
4257 parent_dev->bus->parent->self &&
4258 parent_dev->bus->parent->self->vendor ==
4259 PCI_VENDOR_ID_ATI &&
4260 (parent_dev->bus->parent->self->device &
4261 0xff00) == 0x5a00) {
4262 mtip_disable_link_opts(dd,
4263 parent_dev->bus->parent->self);
4264 }
4265 }
4266 }
4267}
4268
4482/* 4269/*
4483 * Called for each supported PCI device detected. 4270 * Called for each supported PCI device detected.
4484 * 4271 *
@@ -4630,6 +4417,8 @@ static int mtip_pci_probe(struct pci_dev *pdev,
4630 goto msi_initialize_err; 4417 goto msi_initialize_err;
4631 } 4418 }
4632 4419
4420 mtip_fix_ero_nosnoop(dd, pdev);
4421
4633 /* Initialize the block layer. */ 4422 /* Initialize the block layer. */
4634 rv = mtip_block_initialize(dd); 4423 rv = mtip_block_initialize(dd);
4635 if (rv < 0) { 4424 if (rv < 0) {
@@ -4710,8 +4499,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
4710 dev_warn(&dd->pdev->dev, 4499 dev_warn(&dd->pdev->dev,
4711 "Completion workers still active!\n"); 4500 "Completion workers still active!\n");
4712 } 4501 }
4713 /* Cleanup the outstanding commands */
4714 mtip_command_cleanup(dd);
4715 4502
4716 /* Clean up the block layer. */ 4503 /* Clean up the block layer. */
4717 mtip_block_remove(dd); 4504 mtip_block_remove(dd);
@@ -4737,8 +4524,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
4737 4524
4738 pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); 4525 pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
4739 pci_set_drvdata(pdev, NULL); 4526 pci_set_drvdata(pdev, NULL);
4740 pci_dev_put(pdev);
4741
4742} 4527}
4743 4528
4744/* 4529/*
@@ -4935,13 +4720,13 @@ static int __init mtip_init(void)
4935 */ 4720 */
4936static void __exit mtip_exit(void) 4721static void __exit mtip_exit(void)
4937{ 4722{
4938 debugfs_remove_recursive(dfs_parent);
4939
4940 /* Release the allocated major block device number. */ 4723 /* Release the allocated major block device number. */
4941 unregister_blkdev(mtip_major, MTIP_DRV_NAME); 4724 unregister_blkdev(mtip_major, MTIP_DRV_NAME);
4942 4725
4943 /* Unregister the PCI driver. */ 4726 /* Unregister the PCI driver. */
4944 pci_unregister_driver(&mtip_pci_driver); 4727 pci_unregister_driver(&mtip_pci_driver);
4728
4729 debugfs_remove_recursive(dfs_parent);
4945} 4730}
4946 4731
4947MODULE_AUTHOR("Micron Technology, Inc"); 4732MODULE_AUTHOR("Micron Technology, Inc");
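
The mtip32xx conversion above drops the driver's private command-slot array, timer and slot semaphore in favour of per-request data that blk-mq preallocates next to each struct request and hands back through blk_mq_rq_to_pdu(). What follows is a minimal sketch of that init_request/exit_request pattern under the 3.16-era blk-mq API used in this series; the mydrv_* names, the struct layout and the buffer size are illustrative, not part of the patch.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/dma-mapping.h>
#include <linux/pci.h>

struct mydrv_dev {                       /* stands in for the driver data */
        struct pci_dev *pdev;
};

struct mydrv_cmd {                       /* lives in the request pdu */
        void *buf;
        dma_addr_t buf_dma;
};

/* Called once per preallocated request when the tag set is allocated. */
static int mydrv_init_request(void *data, struct request *rq,
                              unsigned int hctx_idx,
                              unsigned int request_idx,
                              unsigned int numa_node)
{
        struct mydrv_dev *dev = data;    /* tag_set.driver_data */
        struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->buf = dmam_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
                                       &cmd->buf_dma, GFP_KERNEL);
        return cmd->buf ? 0 : -ENOMEM;
}

/* Called once per request when the tag set is torn down. */
static void mydrv_exit_request(void *data, struct request *rq,
                               unsigned int hctx_idx,
                               unsigned int request_idx)
{
        struct mydrv_dev *dev = data;
        struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

        if (cmd->buf)
                dmam_free_coherent(&dev->pdev->dev, PAGE_SIZE,
                                   cmd->buf, cmd->buf_dma);
}

At queue time the driver recovers the same pdu with blk_mq_rq_to_pdu(rq), exactly as mtip_queue_rq() and mtip_submit_request() do in the hunks above, and the tag is simply rq->tag, so the old get_slot()/release_slot() bookkeeping disappears.
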
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index ffb955e7ccb9..4b9b554234bc 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -40,9 +40,11 @@
40#define MTIP_MAX_RETRIES 2 40#define MTIP_MAX_RETRIES 2
41 41
42/* Various timeout values in ms */ 42/* Various timeout values in ms */
43#define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000 43#define MTIP_NCQ_CMD_TIMEOUT_MS 15000
44#define MTIP_IOCTL_COMMAND_TIMEOUT_MS 5000 44#define MTIP_IOCTL_CMD_TIMEOUT_MS 5000
45#define MTIP_INTERNAL_COMMAND_TIMEOUT_MS 5000 45#define MTIP_INT_CMD_TIMEOUT_MS 5000
46#define MTIP_QUIESCE_IO_TIMEOUT_MS (MTIP_NCQ_CMD_TIMEOUT_MS * \
47 (MTIP_MAX_RETRIES + 1))
46 48
47/* check for timeouts every 500ms */ 49/* check for timeouts every 500ms */
48#define MTIP_TIMEOUT_CHECK_PERIOD 500 50#define MTIP_TIMEOUT_CHECK_PERIOD 500
@@ -331,12 +333,8 @@ struct mtip_cmd {
331 */ 333 */
332 void (*comp_func)(struct mtip_port *port, 334 void (*comp_func)(struct mtip_port *port,
333 int tag, 335 int tag,
334 void *data, 336 struct mtip_cmd *cmd,
335 int status); 337 int status);
336 /* Additional callback function that may be called by comp_func() */
337 void (*async_callback)(void *data, int status);
338
339 void *async_data; /* Addl. data passed to async_callback() */
340 338
341 int scatter_ents; /* Number of scatter list entries used */ 339 int scatter_ents; /* Number of scatter list entries used */
342 340
@@ -347,10 +345,6 @@ struct mtip_cmd {
347 int retries; /* The number of retries left for this command. */ 345 int retries; /* The number of retries left for this command. */
348 346
349 int direction; /* Data transfer direction */ 347 int direction; /* Data transfer direction */
350
351 unsigned long comp_time; /* command completion time, in jiffies */
352
353 atomic_t active; /* declares if this command sent to the drive. */
354}; 348};
355 349
356/* Structure used to describe a port. */ 350/* Structure used to describe a port. */
@@ -436,12 +430,6 @@ struct mtip_port {
436 * or error handling is active 430 * or error handling is active
437 */ 431 */
438 unsigned long cmds_to_issue[SLOTBITS_IN_LONGS]; 432 unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
439 /*
440 * Array of command slots. Structure includes pointers to the
441 * command header and command table, and completion function and data
442 * pointers.
443 */
444 struct mtip_cmd commands[MTIP_MAX_COMMAND_SLOTS];
445 /* Used by mtip_service_thread to wait for an event */ 433 /* Used by mtip_service_thread to wait for an event */
446 wait_queue_head_t svc_wait; 434 wait_queue_head_t svc_wait;
447 /* 435 /*
@@ -452,13 +440,7 @@ struct mtip_port {
452 /* 440 /*
453 * Timer used to complete commands that have been active for too long. 441 * Timer used to complete commands that have been active for too long.
454 */ 442 */
455 struct timer_list cmd_timer;
456 unsigned long ic_pause_timer; 443 unsigned long ic_pause_timer;
457 /*
458 * Semaphore used to block threads if there are no
459 * command slots available.
460 */
461 struct semaphore cmd_slot;
462 444
463 /* Semaphore to control queue depth of unaligned IOs */ 445 /* Semaphore to control queue depth of unaligned IOs */
464 struct semaphore cmd_slot_unal; 446 struct semaphore cmd_slot_unal;
@@ -485,6 +467,8 @@ struct driver_data {
485 467
486 struct request_queue *queue; /* Our request queue. */ 468 struct request_queue *queue; /* Our request queue. */
487 469
470 struct blk_mq_tag_set tags; /* blk_mq tags */
471
488 struct mtip_port *port; /* Pointer to the port data structure. */ 472 struct mtip_port *port; /* Pointer to the port data structure. */
489 473
490 unsigned product_type; /* magic value declaring the product type */ 474 unsigned product_type; /* magic value declaring the product type */
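
The header change above embeds a struct blk_mq_tag_set in struct driver_data, and the .c hunks pair every blk_cleanup_queue() with a matching blk_mq_free_tag_set(). A condensed sketch of that setup/teardown ordering, using only API calls visible in this patch (the mydrv_* names and the queue depth are illustrative):

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

struct mydrv_cmd {
        unsigned int flags;              /* per-request pdu payload */
};

static struct blk_mq_ops mydrv_mq_ops = {
        .map_queue = blk_mq_map_queue,
        /* .queue_rq, .init_request, .exit_request as in the driver above */
};

struct mydrv_dev {
        struct request_queue *queue;
        struct blk_mq_tag_set tags;
};

/* Setup: allocate the tag set first, then the queue that uses it. */
static int mydrv_setup_queue(struct mydrv_dev *dev)
{
        int rv;

        memset(&dev->tags, 0, sizeof(dev->tags));
        dev->tags.ops          = &mydrv_mq_ops;
        dev->tags.nr_hw_queues = 1;
        dev->tags.queue_depth  = 256;
        dev->tags.cmd_size     = sizeof(struct mydrv_cmd);
        dev->tags.numa_node    = NUMA_NO_NODE;
        dev->tags.flags        = BLK_MQ_F_SHOULD_MERGE;
        dev->tags.driver_data  = dev;

        rv = blk_mq_alloc_tag_set(&dev->tags);
        if (rv)
                return rv;

        dev->queue = blk_mq_init_queue(&dev->tags);
        if (IS_ERR(dev->queue)) {
                blk_mq_free_tag_set(&dev->tags);
                return PTR_ERR(dev->queue);
        }
        dev->queue->queuedata = dev;
        return 0;
}

/* Teardown mirrors setup: queue first, then the tag set. */
static void mydrv_destroy_queue(struct mydrv_dev *dev)
{
        blk_cleanup_queue(dev->queue);
        blk_mq_free_tag_set(&dev->tags);
        dev->queue = NULL;
}

The null_blk and virtio-blk hunks below follow the same ordering.
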
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 091b9ea14feb..77087a29b127 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -32,6 +32,7 @@ struct nullb {
32 unsigned int index; 32 unsigned int index;
33 struct request_queue *q; 33 struct request_queue *q;
34 struct gendisk *disk; 34 struct gendisk *disk;
35 struct blk_mq_tag_set tag_set;
35 struct hrtimer timer; 36 struct hrtimer timer;
36 unsigned int queue_depth; 37 unsigned int queue_depth;
37 spinlock_t lock; 38 spinlock_t lock;
@@ -202,8 +203,8 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
202 entry = llist_reverse_order(entry); 203 entry = llist_reverse_order(entry);
203 do { 204 do {
204 cmd = container_of(entry, struct nullb_cmd, ll_list); 205 cmd = container_of(entry, struct nullb_cmd, ll_list);
205 end_cmd(cmd);
206 entry = entry->next; 206 entry = entry->next;
207 end_cmd(cmd);
207 } while (entry); 208 } while (entry);
208 } 209 }
209 210
@@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
226 227
227static void null_softirq_done_fn(struct request *rq) 228static void null_softirq_done_fn(struct request *rq)
228{ 229{
229 end_cmd(rq->special); 230 end_cmd(blk_mq_rq_to_pdu(rq));
230} 231}
231 232
232static inline void null_handle_cmd(struct nullb_cmd *cmd) 233static inline void null_handle_cmd(struct nullb_cmd *cmd)
@@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q)
311 312
312static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) 313static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
313{ 314{
314 struct nullb_cmd *cmd = rq->special; 315 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
315 316
316 cmd->rq = rq; 317 cmd->rq = rq;
317 cmd->nq = hctx->driver_data; 318 cmd->nq = hctx->driver_data;
@@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
320 return BLK_MQ_RQ_QUEUE_OK; 321 return BLK_MQ_RQ_QUEUE_OK;
321} 322}
322 323
323static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
324{
325 int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
326 int tip = (reg->nr_hw_queues % nr_online_nodes);
327 int node = 0, i, n;
328
329 /*
330 * Split submit queues evenly wrt to the number of nodes. If uneven,
331 * fill the first buckets with one extra, until the rest is filled with
332 * no extra.
333 */
334 for (i = 0, n = 1; i < hctx_index; i++, n++) {
335 if (n % b_size == 0) {
336 n = 0;
337 node++;
338
339 tip--;
340 if (!tip)
341 b_size = reg->nr_hw_queues / nr_online_nodes;
342 }
343 }
344
345 /*
346 * A node might not be online, therefore map the relative node id to the
347 * real node id.
348 */
349 for_each_online_node(n) {
350 if (!node)
351 break;
352 node--;
353 }
354
355 return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n);
356}
357
358static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
359{
360 kfree(hctx);
361}
362
363static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) 324static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
364{ 325{
365 BUG_ON(!nullb); 326 BUG_ON(!nullb);
@@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = {
389 .complete = null_softirq_done_fn, 350 .complete = null_softirq_done_fn,
390}; 351};
391 352
392static struct blk_mq_reg null_mq_reg = {
393 .ops = &null_mq_ops,
394 .queue_depth = 64,
395 .cmd_size = sizeof(struct nullb_cmd),
396 .flags = BLK_MQ_F_SHOULD_MERGE,
397};
398
399static void null_del_dev(struct nullb *nullb) 353static void null_del_dev(struct nullb *nullb)
400{ 354{
401 list_del_init(&nullb->list); 355 list_del_init(&nullb->list);
402 356
403 del_gendisk(nullb->disk); 357 del_gendisk(nullb->disk);
404 blk_cleanup_queue(nullb->q); 358 blk_cleanup_queue(nullb->q);
359 if (queue_mode == NULL_Q_MQ)
360 blk_mq_free_tag_set(&nullb->tag_set);
405 put_disk(nullb->disk); 361 put_disk(nullb->disk);
406 kfree(nullb); 362 kfree(nullb);
407} 363}
@@ -506,7 +462,7 @@ static int null_add_dev(void)
506 462
507 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); 463 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
508 if (!nullb) 464 if (!nullb)
509 return -ENOMEM; 465 goto out;
510 466
511 spin_lock_init(&nullb->lock); 467 spin_lock_init(&nullb->lock);
512 468
@@ -514,49 +470,44 @@ static int null_add_dev(void)
514 submit_queues = nr_online_nodes; 470 submit_queues = nr_online_nodes;
515 471
516 if (setup_queues(nullb)) 472 if (setup_queues(nullb))
517 goto err; 473 goto out_free_nullb;
518 474
519 if (queue_mode == NULL_Q_MQ) { 475 if (queue_mode == NULL_Q_MQ) {
520 null_mq_reg.numa_node = home_node; 476 nullb->tag_set.ops = &null_mq_ops;
521 null_mq_reg.queue_depth = hw_queue_depth; 477 nullb->tag_set.nr_hw_queues = submit_queues;
522 null_mq_reg.nr_hw_queues = submit_queues; 478 nullb->tag_set.queue_depth = hw_queue_depth;
523 479 nullb->tag_set.numa_node = home_node;
524 if (use_per_node_hctx) { 480 nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
525 null_mq_reg.ops->alloc_hctx = null_alloc_hctx; 481 nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
526 null_mq_reg.ops->free_hctx = null_free_hctx; 482 nullb->tag_set.driver_data = nullb;
527 } else { 483
528 null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; 484 if (blk_mq_alloc_tag_set(&nullb->tag_set))
529 null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; 485 goto out_cleanup_queues;
530 } 486
531 487 nullb->q = blk_mq_init_queue(&nullb->tag_set);
532 nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); 488 if (!nullb->q)
489 goto out_cleanup_tags;
533 } else if (queue_mode == NULL_Q_BIO) { 490 } else if (queue_mode == NULL_Q_BIO) {
534 nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); 491 nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
492 if (!nullb->q)
493 goto out_cleanup_queues;
535 blk_queue_make_request(nullb->q, null_queue_bio); 494 blk_queue_make_request(nullb->q, null_queue_bio);
536 init_driver_queues(nullb); 495 init_driver_queues(nullb);
537 } else { 496 } else {
538 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); 497 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
498 if (!nullb->q)
499 goto out_cleanup_queues;
539 blk_queue_prep_rq(nullb->q, null_rq_prep_fn); 500 blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
540 if (nullb->q) 501 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
541 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
542 init_driver_queues(nullb); 502 init_driver_queues(nullb);
543 } 503 }
544 504
545 if (!nullb->q)
546 goto queue_fail;
547
548 nullb->q->queuedata = nullb; 505 nullb->q->queuedata = nullb;
549 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); 506 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
550 507
551 disk = nullb->disk = alloc_disk_node(1, home_node); 508 disk = nullb->disk = alloc_disk_node(1, home_node);
552 if (!disk) { 509 if (!disk)
553queue_fail: 510 goto out_cleanup_blk_queue;
554 blk_cleanup_queue(nullb->q);
555 cleanup_queues(nullb);
556err:
557 kfree(nullb);
558 return -ENOMEM;
559 }
560 511
561 mutex_lock(&lock); 512 mutex_lock(&lock);
562 list_add_tail(&nullb->list, &nullb_list); 513 list_add_tail(&nullb->list, &nullb_list);
@@ -579,6 +530,18 @@ err:
579 sprintf(disk->disk_name, "nullb%d", nullb->index); 530 sprintf(disk->disk_name, "nullb%d", nullb->index);
580 add_disk(disk); 531 add_disk(disk);
581 return 0; 532 return 0;
533
534out_cleanup_blk_queue:
535 blk_cleanup_queue(nullb->q);
536out_cleanup_tags:
537 if (queue_mode == NULL_Q_MQ)
538 blk_mq_free_tag_set(&nullb->tag_set);
539out_cleanup_queues:
540 cleanup_queues(nullb);
541out_free_nullb:
542 kfree(nullb);
543out:
544 return -ENOMEM;
582} 545}
583 546
584static int __init null_init(void) 547static int __init null_init(void)
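
null_add_dev() above also replaces the single catch-all error label with one label per allocated resource, unwound in reverse order of acquisition. A reduced sketch of that shape, with hypothetical names and the tag-set fields assumed to be filled in as in the earlier sketch:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/slab.h>

struct mydrv_dev {
        struct request_queue *queue;
        struct blk_mq_tag_set tags;
};

static struct mydrv_dev *mydrv_add_dev(void)
{
        struct mydrv_dev *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                goto out;

        /* dev->tags.ops, queue_depth, cmd_size, ... populated as earlier */
        if (blk_mq_alloc_tag_set(&dev->tags))
                goto out_free_dev;

        dev->queue = blk_mq_init_queue(&dev->tags);
        if (IS_ERR(dev->queue))
                goto out_free_tags;

        dev->queue->queuedata = dev;
        return dev;

out_free_tags:
        blk_mq_free_tag_set(&dev->tags);
out_free_dev:
        kfree(dev);
out:
        return NULL;
}

Each failure jumps to the label that frees everything acquired so far and nothing more, which is the structure the new out_cleanup_* labels give null_add_dev().
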
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index e76bdc074dbe..719cb1bc1640 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * q)
747 pcd_current = cd; 747 pcd_current = cd;
748 pcd_sector = blk_rq_pos(pcd_req); 748 pcd_sector = blk_rq_pos(pcd_req);
749 pcd_count = blk_rq_cur_sectors(pcd_req); 749 pcd_count = blk_rq_cur_sectors(pcd_req);
750 pcd_buf = pcd_req->buffer; 750 pcd_buf = bio_data(pcd_req->bio);
751 pcd_busy = 1; 751 pcd_busy = 1;
752 ps_set_intr(do_pcd_read, NULL, 0, nice); 752 ps_set_intr(do_pcd_read, NULL, 0, nice);
753 return; 753 return;
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 19ad8f0c83ef..fea7e76a00de 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -454,7 +454,7 @@ static enum action do_pd_io_start(void)
454 if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) 454 if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
455 return Fail; 455 return Fail;
456 pd_run = blk_rq_sectors(pd_req); 456 pd_run = blk_rq_sectors(pd_req);
457 pd_buf = pd_req->buffer; 457 pd_buf = bio_data(pd_req->bio);
458 pd_retries = 0; 458 pd_retries = 0;
459 if (pd_cmd == READ) 459 if (pd_cmd == READ)
460 return do_pd_read_start(); 460 return do_pd_read_start();
@@ -485,7 +485,7 @@ static int pd_next_buf(void)
485 spin_lock_irqsave(&pd_lock, saved_flags); 485 spin_lock_irqsave(&pd_lock, saved_flags);
486 __blk_end_request_cur(pd_req, 0); 486 __blk_end_request_cur(pd_req, 0);
487 pd_count = blk_rq_cur_sectors(pd_req); 487 pd_count = blk_rq_cur_sectors(pd_req);
488 pd_buf = pd_req->buffer; 488 pd_buf = bio_data(pd_req->bio);
489 spin_unlock_irqrestore(&pd_lock, saved_flags); 489 spin_unlock_irqrestore(&pd_lock, saved_flags);
490 return 0; 490 return 0;
491} 491}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index f5c86d523ba0..9a15fd3c9349 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -795,7 +795,7 @@ repeat:
795 } 795 }
796 796
797 pf_cmd = rq_data_dir(pf_req); 797 pf_cmd = rq_data_dir(pf_req);
798 pf_buf = pf_req->buffer; 798 pf_buf = bio_data(pf_req->bio);
799 pf_retries = 0; 799 pf_retries = 0;
800 800
801 pf_busy = 1; 801 pf_busy = 1;
@@ -827,7 +827,7 @@ static int pf_next_buf(void)
827 if (!pf_req) 827 if (!pf_req)
828 return 1; 828 return 1;
829 pf_count = blk_rq_cur_sectors(pf_req); 829 pf_count = blk_rq_cur_sectors(pf_req);
830 pf_buf = pf_req->buffer; 830 pf_buf = bio_data(pf_req->bio);
831 } 831 }
832 return 0; 832 return 0;
833} 833}
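
The paride changes above, like the swim, swim3, xsysace and z2ram hunks further down, replace the request's cached ->buffer pointer with bio_data(req->bio), which yields the kernel-virtual address of the current segment of the request's first bio. A minimal sketch of an old-style request_fn using that accessor; mydrv_transfer() is a hypothetical stand-in for the actual device I/O:

#include <linux/blkdev.h>
#include <linux/bio.h>

/* Hypothetical synchronous transfer routine; a real driver would talk to
 * hardware here. Always "succeeds" in this sketch. */
static int mydrv_transfer(sector_t pos, unsigned int nsect, void *buf,
                          int write)
{
        return 0;
}

/* Runs with the queue lock held, like any request_fn. */
static void mydrv_request_fn(struct request_queue *q)
{
        struct request *req;

        while ((req = blk_fetch_request(q)) != NULL) {
                /* Error handling elided: each chunk is completed in turn. */
                do {
                        void *buf = bio_data(req->bio); /* was req->buffer */

                        mydrv_transfer(blk_rq_pos(req),
                                       blk_rq_cur_sectors(req),
                                       buf, rq_data_dir(req));
                } while (__blk_end_request_cur(req, 0));
        }
}

bio_data() only covers the current segment, which is why these drivers keep working a blk_rq_cur_sectors()-sized chunk at a time, just as they did with req->buffer.
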
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index a69dd93d1bd5..608532d3f8c9 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
563 563
564 req = skreq->req; 564 req = skreq->req;
565 blk_add_request_payload(req, page, len); 565 blk_add_request_payload(req, page, len);
566 req->buffer = buf;
567} 566}
568 567
569static void skd_request_fn_not_online(struct request_queue *q); 568static void skd_request_fn_not_online(struct request_queue *q);
@@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q)
744 break; 743 break;
745 } 744 }
746 skreq->discard_page = 1; 745 skreq->discard_page = 1;
746 req->completion_data = page;
747 skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); 747 skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
748 748
749 } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { 749 } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
@@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev,
858 (skreq->discard_page == 1)) { 858 (skreq->discard_page == 1)) {
859 pr_debug("%s:%s:%d, free the page!", 859 pr_debug("%s:%s:%d, free the page!",
860 skdev->name, __func__, __LINE__); 860 skdev->name, __func__, __LINE__);
861 free_page((unsigned long)req->buffer); 861 __free_page(req->completion_data);
862 req->buffer = NULL;
863 } 862 }
864 863
865 if (unlikely(error)) { 864 if (unlikely(error)) {
@@ -3945,15 +3944,14 @@ static int skd_acquire_msix(struct skd_device *skdev)
3945 for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) 3944 for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
3946 entries[i].entry = i; 3945 entries[i].entry = i;
3947 3946
3948 rc = pci_enable_msix_range(pdev, entries, 3947 rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT);
3949 SKD_MIN_MSIX_COUNT, SKD_MAX_MSIX_COUNT); 3948 if (rc) {
3950 if (rc < 0) {
3951 pr_err("(%s): failed to enable MSI-X %d\n", 3949 pr_err("(%s): failed to enable MSI-X %d\n",
3952 skd_name(skdev), rc); 3950 skd_name(skdev), rc);
3953 goto msix_out; 3951 goto msix_out;
3954 } 3952 }
3955 3953
3956 skdev->msix_count = rc; 3954 skdev->msix_count = SKD_MAX_MSIX_COUNT;
3957 skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) * 3955 skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
3958 skdev->msix_count, GFP_KERNEL); 3956 skdev->msix_count, GFP_KERNEL);
3959 if (!skdev->msix_entries) { 3957 if (!skdev->msix_entries) {
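
The skd change above swaps pci_enable_msix_range(), which may succeed with fewer vectors than requested, for pci_enable_msix_exact(), which returns 0 only when every requested vector is granted. A short sketch of the all-or-nothing form; the vector count and mydrv_* names are placeholders:

#include <linux/pci.h>

#define MYDRV_NR_MSIX   4                /* illustrative vector count */

static struct msix_entry mydrv_msix[MYDRV_NR_MSIX];

static int mydrv_enable_msix(struct pci_dev *pdev)
{
        int i, rc;

        for (i = 0; i < MYDRV_NR_MSIX; i++)
                mydrv_msix[i].entry = i;

        /* 0 on success; a negative errno if exactly MYDRV_NR_MSIX vectors
         * cannot be allocated, so there is no partial grant to handle. */
        rc = pci_enable_msix_exact(pdev, mydrv_msix, MYDRV_NR_MSIX);
        if (rc)
                return rc;

        /* mydrv_msix[i].vector is now valid for request_irq(). */
        return 0;
}
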
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b02d53a399f3..6b44bbe528b7 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q)
549 case READ: 549 case READ:
550 err = floppy_read_sectors(fs, blk_rq_pos(req), 550 err = floppy_read_sectors(fs, blk_rq_pos(req),
551 blk_rq_cur_sectors(req), 551 blk_rq_cur_sectors(req),
552 req->buffer); 552 bio_data(req->bio));
553 break; 553 break;
554 } 554 }
555 done: 555 done:
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c74f7b56e7c4..523ee8fd4c15 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs)
342 swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", 342 swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
343 req->rq_disk->disk_name, req->cmd, 343 req->rq_disk->disk_name, req->cmd,
344 (long)blk_rq_pos(req), blk_rq_sectors(req), 344 (long)blk_rq_pos(req), blk_rq_sectors(req),
345 req->buffer); 345 bio_data(req->bio));
346 swim3_dbg(" errors=%d current_nr_sectors=%u\n", 346 swim3_dbg(" errors=%d current_nr_sectors=%u\n",
347 req->errors, blk_rq_cur_sectors(req)); 347 req->errors, blk_rq_cur_sectors(req));
348#endif 348#endif
@@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs)
479 /* Set up 3 dma commands: write preamble, data, postamble */ 479 /* Set up 3 dma commands: write preamble, data, postamble */
480 init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); 480 init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
481 ++cp; 481 ++cp;
482 init_dma(cp, OUTPUT_MORE, req->buffer, 512); 482 init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
483 ++cp; 483 ++cp;
484 init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); 484 init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
485 } else { 485 } else {
486 init_dma(cp, INPUT_LAST, req->buffer, n * 512); 486 init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
487 } 487 }
488 ++cp; 488 ++cp;
489 out_le16(&cp->command, DBDMA_STOP); 489 out_le16(&cp->command, DBDMA_STOP);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6d8a87f252de..c8f286e8d80f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,6 +30,9 @@ struct virtio_blk
30 /* The disk structure for the kernel. */ 30 /* The disk structure for the kernel. */
31 struct gendisk *disk; 31 struct gendisk *disk;
32 32
33 /* Block layer tags. */
34 struct blk_mq_tag_set tag_set;
35
33 /* Process context for config space updates */ 36 /* Process context for config space updates */
34 struct work_struct config_work; 37 struct work_struct config_work;
35 38
@@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
112 115
113static inline void virtblk_request_done(struct request *req) 116static inline void virtblk_request_done(struct request *req)
114{ 117{
115 struct virtblk_req *vbr = req->special; 118 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
116 int error = virtblk_result(vbr); 119 int error = virtblk_result(vbr);
117 120
118 if (req->cmd_type == REQ_TYPE_BLOCK_PC) { 121 if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -144,17 +147,17 @@ static void virtblk_done(struct virtqueue *vq)
144 if (unlikely(virtqueue_is_broken(vq))) 147 if (unlikely(virtqueue_is_broken(vq)))
145 break; 148 break;
146 } while (!virtqueue_enable_cb(vq)); 149 } while (!virtqueue_enable_cb(vq));
147 spin_unlock_irqrestore(&vblk->vq_lock, flags);
148 150
149 /* In case queue is stopped waiting for more buffers. */ 151 /* In case queue is stopped waiting for more buffers. */
150 if (req_done) 152 if (req_done)
151 blk_mq_start_stopped_hw_queues(vblk->disk->queue); 153 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
154 spin_unlock_irqrestore(&vblk->vq_lock, flags);
152} 155}
153 156
154static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) 157static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
155{ 158{
156 struct virtio_blk *vblk = hctx->queue->queuedata; 159 struct virtio_blk *vblk = hctx->queue->queuedata;
157 struct virtblk_req *vbr = req->special; 160 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
158 unsigned long flags; 161 unsigned long flags;
159 unsigned int num; 162 unsigned int num;
160 const bool last = (req->cmd_flags & REQ_END) != 0; 163 const bool last = (req->cmd_flags & REQ_END) != 0;
@@ -202,8 +205,8 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
202 err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num); 205 err = __virtblk_add_req(vblk->vq, vbr, vbr->sg, num);
203 if (err) { 206 if (err) {
204 virtqueue_kick(vblk->vq); 207 virtqueue_kick(vblk->vq);
205 spin_unlock_irqrestore(&vblk->vq_lock, flags);
206 blk_mq_stop_hw_queue(hctx); 208 blk_mq_stop_hw_queue(hctx);
209 spin_unlock_irqrestore(&vblk->vq_lock, flags);
207 /* Out of mem doesn't actually happen, since we fall back 210 /* Out of mem doesn't actually happen, since we fall back
208 * to direct descriptors */ 211 * to direct descriptors */
209 if (err == -ENOMEM || err == -ENOSPC) 212 if (err == -ENOMEM || err == -ENOSPC)
@@ -480,33 +483,27 @@ static const struct device_attribute dev_attr_cache_type_rw =
480 __ATTR(cache_type, S_IRUGO|S_IWUSR, 483 __ATTR(cache_type, S_IRUGO|S_IWUSR,
481 virtblk_cache_type_show, virtblk_cache_type_store); 484 virtblk_cache_type_show, virtblk_cache_type_store);
482 485
483static struct blk_mq_ops virtio_mq_ops = { 486static int virtblk_init_request(void *data, struct request *rq,
484 .queue_rq = virtio_queue_rq, 487 unsigned int hctx_idx, unsigned int request_idx,
485 .map_queue = blk_mq_map_queue, 488 unsigned int numa_node)
486 .alloc_hctx = blk_mq_alloc_single_hw_queue,
487 .free_hctx = blk_mq_free_single_hw_queue,
488 .complete = virtblk_request_done,
489};
490
491static struct blk_mq_reg virtio_mq_reg = {
492 .ops = &virtio_mq_ops,
493 .nr_hw_queues = 1,
494 .queue_depth = 0, /* Set in virtblk_probe */
495 .numa_node = NUMA_NO_NODE,
496 .flags = BLK_MQ_F_SHOULD_MERGE,
497};
498module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
499
500static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
501 struct request *rq, unsigned int nr)
502{ 489{
503 struct virtio_blk *vblk = data; 490 struct virtio_blk *vblk = data;
504 struct virtblk_req *vbr = rq->special; 491 struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
505 492
506 sg_init_table(vbr->sg, vblk->sg_elems); 493 sg_init_table(vbr->sg, vblk->sg_elems);
507 return 0; 494 return 0;
508} 495}
509 496
497static struct blk_mq_ops virtio_mq_ops = {
498 .queue_rq = virtio_queue_rq,
499 .map_queue = blk_mq_map_queue,
500 .complete = virtblk_request_done,
501 .init_request = virtblk_init_request,
502};
503
504static unsigned int virtblk_queue_depth;
505module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
506
510static int virtblk_probe(struct virtio_device *vdev) 507static int virtblk_probe(struct virtio_device *vdev)
511{ 508{
512 struct virtio_blk *vblk; 509 struct virtio_blk *vblk;
@@ -561,24 +558,34 @@ static int virtblk_probe(struct virtio_device *vdev)
561 } 558 }
562 559
563 /* Default queue sizing is to fill the ring. */ 560 /* Default queue sizing is to fill the ring. */
564 if (!virtio_mq_reg.queue_depth) { 561 if (!virtblk_queue_depth) {
565 virtio_mq_reg.queue_depth = vblk->vq->num_free; 562 virtblk_queue_depth = vblk->vq->num_free;
566 /* ... but without indirect descs, we use 2 descs per req */ 563 /* ... but without indirect descs, we use 2 descs per req */
567 if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) 564 if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
568 virtio_mq_reg.queue_depth /= 2; 565 virtblk_queue_depth /= 2;
569 } 566 }
570 virtio_mq_reg.cmd_size = 567
568 memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
569 vblk->tag_set.ops = &virtio_mq_ops;
570 vblk->tag_set.nr_hw_queues = 1;
571 vblk->tag_set.queue_depth = virtblk_queue_depth;
572 vblk->tag_set.numa_node = NUMA_NO_NODE;
573 vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
574 vblk->tag_set.cmd_size =
571 sizeof(struct virtblk_req) + 575 sizeof(struct virtblk_req) +
572 sizeof(struct scatterlist) * sg_elems; 576 sizeof(struct scatterlist) * sg_elems;
577 vblk->tag_set.driver_data = vblk;
573 578
574 q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); 579 err = blk_mq_alloc_tag_set(&vblk->tag_set);
580 if (err)
581 goto out_put_disk;
582
583 q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
575 if (!q) { 584 if (!q) {
576 err = -ENOMEM; 585 err = -ENOMEM;
577 goto out_put_disk; 586 goto out_free_tags;
578 } 587 }
579 588
580 blk_mq_init_commands(q, virtblk_init_vbr, vblk);
581
582 q->queuedata = vblk; 589 q->queuedata = vblk;
583 590
584 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); 591 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -679,6 +686,8 @@ static int virtblk_probe(struct virtio_device *vdev)
679out_del_disk: 686out_del_disk:
680 del_gendisk(vblk->disk); 687 del_gendisk(vblk->disk);
681 blk_cleanup_queue(vblk->disk->queue); 688 blk_cleanup_queue(vblk->disk->queue);
689out_free_tags:
690 blk_mq_free_tag_set(&vblk->tag_set);
682out_put_disk: 691out_put_disk:
683 put_disk(vblk->disk); 692 put_disk(vblk->disk);
684out_free_vq: 693out_free_vq:
@@ -705,6 +714,8 @@ static void virtblk_remove(struct virtio_device *vdev)
705 del_gendisk(vblk->disk); 714 del_gendisk(vblk->disk);
706 blk_cleanup_queue(vblk->disk->queue); 715 blk_cleanup_queue(vblk->disk->queue);
707 716
717 blk_mq_free_tag_set(&vblk->tag_set);
718
708 /* Stop all the virtqueues. */ 719 /* Stop all the virtqueues. */
709 vdev->config->reset(vdev); 720 vdev->config->reset(vdev);
710 721
@@ -749,7 +760,7 @@ static int virtblk_restore(struct virtio_device *vdev)
749 vblk->config_enable = true; 760 vblk->config_enable = true;
750 ret = init_vq(vdev->priv); 761 ret = init_vq(vdev->priv);
751 if (!ret) 762 if (!ret)
752 blk_mq_start_stopped_hw_queues(vblk->disk->queue); 763 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
753 764
754 return ret; 765 return ret;
755} 766}
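
virtio-blk above sizes cmd_size as sizeof(struct virtblk_req) plus room for sg_elems scatterlist entries, so every request's pdu carries its own scatterlist with no separate allocation. A sketch of that layout under the same API assumptions; struct mydrv_io, MYDRV_NR_SG and the immediate completion are illustrative:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>

#define MYDRV_NR_SG     16               /* illustrative segment limit */

struct mydrv_io {
        unsigned int nents;
        struct scatterlist sg[];         /* flexible tail sized via cmd_size */
};

/* Tag-set sizing: base struct plus the scatterlist tail. */
static void mydrv_size_pdu(struct blk_mq_tag_set *set)
{
        set->cmd_size = sizeof(struct mydrv_io) +
                        sizeof(struct scatterlist) * MYDRV_NR_SG;
}

/* The embedded table is set up once in .init_request (sg_init_table(), as
 * virtblk_init_request() does above); .queue_rq then just maps into it. */
static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        struct mydrv_io *io = blk_mq_rq_to_pdu(rq);

        io->nents = blk_rq_map_sg(hctx->queue, rq, io->sg);

        /* A real driver hands io->sg to the hardware here; this sketch
         * simply completes the request. */
        blk_mq_end_io(rq, 0);
        return BLK_MQ_RQ_QUEUE_OK;
}
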
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 25c11ad34184..5deb235bd18f 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq)
612 } 612 }
613 613
614 pr_debug("do_blk_req %p: cmd %p, sec %lx, " 614 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
615 "(%u/%u) buffer:%p [%s]\n", 615 "(%u/%u) [%s]\n",
616 req, req->cmd, (unsigned long)blk_rq_pos(req), 616 req, req->cmd, (unsigned long)blk_rq_pos(req),
617 blk_rq_cur_sectors(req), blk_rq_sectors(req), 617 blk_rq_cur_sectors(req), blk_rq_sectors(req),
618 req->buffer, rq_data_dir(req) ? "write" : "read"); 618 rq_data_dir(req) ? "write" : "read");
619 619
620 if (blkif_queue_request(req)) { 620 if (blkif_queue_request(req)) {
621 blk_requeue_request(rq, req); 621 blk_requeue_request(rq, req);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 1393b8871a28..ab3ea62e5dfc 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
661 rq_data_dir(req)); 661 rq_data_dir(req));
662 662
663 ace->req = req; 663 ace->req = req;
664 ace->data_ptr = req->buffer; 664 ace->data_ptr = bio_data(req->bio);
665 ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; 665 ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
666 ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); 666 ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
667 667
@@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
733 * blk_rq_sectors(ace->req), 733 * blk_rq_sectors(ace->req),
734 * blk_rq_cur_sectors(ace->req)); 734 * blk_rq_cur_sectors(ace->req));
735 */ 735 */
736 ace->data_ptr = ace->req->buffer; 736 ace->data_ptr = bio_data(ace->req->bio);
737 ace->data_count = blk_rq_cur_sectors(ace->req) * 16; 737 ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
738 ace_fsm_yieldirq(ace); 738 ace_fsm_yieldirq(ace);
739 break; 739 break;
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 27de5046708a..968f9e52effa 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q)
87 while (len) { 87 while (len) {
88 unsigned long addr = start & Z2RAM_CHUNKMASK; 88 unsigned long addr = start & Z2RAM_CHUNKMASK;
89 unsigned long size = Z2RAM_CHUNKSIZE - addr; 89 unsigned long size = Z2RAM_CHUNKSIZE - addr;
90 void *buffer = bio_data(req->bio);
91
90 if (len < size) 92 if (len < size)
91 size = len; 93 size = len;
92 addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; 94 addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ];
93 if (rq_data_dir(req) == READ) 95 if (rq_data_dir(req) == READ)
94 memcpy(req->buffer, (char *)addr, size); 96 memcpy(buffer, (char *)addr, size);
95 else 97 else
96 memcpy((char *)addr, req->buffer, size); 98 memcpy((char *)addr, buffer, size);
97 start += size; 99 start += size;
98 len -= size; 100 len -= size;
99 } 101 }
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 8a3aff724d98..49ac5662585b 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -312,36 +312,24 @@ static const char *mrw_format_status[] = {
312 312
313static const char *mrw_address_space[] = { "DMA", "GAA" }; 313static const char *mrw_address_space[] = { "DMA", "GAA" };
314 314
315#if (ERRLOGMASK!=CD_NOTHING) 315#if (ERRLOGMASK != CD_NOTHING)
316#define cdinfo(type, fmt, args...) \ 316#define cd_dbg(type, fmt, ...) \
317do { \ 317do { \
318 if ((ERRLOGMASK & type) || debug == 1) \ 318 if ((ERRLOGMASK & type) || debug == 1) \
319 pr_info(fmt, ##args); \ 319 pr_debug(fmt, ##__VA_ARGS__); \
320} while (0) 320} while (0)
321#else 321#else
322#define cdinfo(type, fmt, args...) \ 322#define cd_dbg(type, fmt, ...) \
323do { \ 323do { \
324 if (0 && (ERRLOGMASK & type) || debug == 1) \ 324 if (0 && (ERRLOGMASK & type) || debug == 1) \
325 pr_info(fmt, ##args); \ 325 pr_debug(fmt, ##__VA_ARGS__); \
326} while (0) 326} while (0)
327#endif 327#endif
328 328
329/* These are used to simplify getting data in from and back to user land */
330#define IOCTL_IN(arg, type, in) \
331 if (copy_from_user(&(in), (type __user *) (arg), sizeof (in))) \
332 return -EFAULT;
333
334#define IOCTL_OUT(arg, type, out) \
335 if (copy_to_user((type __user *) (arg), &(out), sizeof (out))) \
336 return -EFAULT;
337
338/* The (cdo->capability & ~cdi->mask & CDC_XXX) construct was used in 329/* The (cdo->capability & ~cdi->mask & CDC_XXX) construct was used in
339 a lot of places. This macro makes the code more clear. */ 330 a lot of places. This macro makes the code more clear. */
340#define CDROM_CAN(type) (cdi->ops->capability & ~cdi->mask & (type)) 331#define CDROM_CAN(type) (cdi->ops->capability & ~cdi->mask & (type))
341 332
342/* used in the audio ioctls */
343#define CHECKAUDIO if ((ret=check_for_audio_disc(cdi, cdo))) return ret
344
345/* 333/*
346 * Another popular OS uses 7 seconds as the hard timeout for default 334 * Another popular OS uses 7 seconds as the hard timeout for default
347 * commands, so it is a good choice for us as well. 335 * commands, so it is a good choice for us as well.
@@ -349,21 +337,6 @@ do { \
349#define CDROM_DEF_TIMEOUT (7 * HZ) 337#define CDROM_DEF_TIMEOUT (7 * HZ)
350 338
351/* Not-exported routines. */ 339/* Not-exported routines. */
352static int open_for_data(struct cdrom_device_info * cdi);
353static int check_for_audio_disc(struct cdrom_device_info * cdi,
354 struct cdrom_device_ops * cdo);
355static void sanitize_format(union cdrom_addr *addr,
356 u_char * curr, u_char requested);
357static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
358 unsigned long arg);
359
360int cdrom_get_last_written(struct cdrom_device_info *, long *);
361static int cdrom_get_next_writable(struct cdrom_device_info *, long *);
362static void cdrom_count_tracks(struct cdrom_device_info *, tracktype*);
363
364static int cdrom_mrw_exit(struct cdrom_device_info *cdi);
365
366static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di);
367 340
368static void cdrom_sysctl_register(void); 341static void cdrom_sysctl_register(void);
369 342
@@ -382,113 +355,65 @@ static int cdrom_dummy_generic_packet(struct cdrom_device_info *cdi,
382 return -EIO; 355 return -EIO;
383} 356}
384 357
385/* This macro makes sure we don't have to check on cdrom_device_ops 358static int cdrom_flush_cache(struct cdrom_device_info *cdi)
386 * existence in the run-time routines below. Change_capability is a
387 * hack to have the capability flags defined const, while we can still
388 * change it here without gcc complaining at every line.
389 */
390#define ENSURE(call, bits) if (cdo->call == NULL) *change_capability &= ~(bits)
391
392int register_cdrom(struct cdrom_device_info *cdi)
393{
394 static char banner_printed;
395 struct cdrom_device_ops *cdo = cdi->ops;
396 int *change_capability = (int *)&cdo->capability; /* hack */
397
398 cdinfo(CD_OPEN, "entering register_cdrom\n");
399
400 if (cdo->open == NULL || cdo->release == NULL)
401 return -EINVAL;
402 if (!banner_printed) {
403 pr_info("Uniform CD-ROM driver " REVISION "\n");
404 banner_printed = 1;
405 cdrom_sysctl_register();
406 }
407
408 ENSURE(drive_status, CDC_DRIVE_STATUS );
409 if (cdo->check_events == NULL && cdo->media_changed == NULL)
410 *change_capability = ~(CDC_MEDIA_CHANGED | CDC_SELECT_DISC);
411 ENSURE(tray_move, CDC_CLOSE_TRAY | CDC_OPEN_TRAY);
412 ENSURE(lock_door, CDC_LOCK);
413 ENSURE(select_speed, CDC_SELECT_SPEED);
414 ENSURE(get_last_session, CDC_MULTI_SESSION);
415 ENSURE(get_mcn, CDC_MCN);
416 ENSURE(reset, CDC_RESET);
417 ENSURE(generic_packet, CDC_GENERIC_PACKET);
418 cdi->mc_flags = 0;
419 cdo->n_minors = 0;
420 cdi->options = CDO_USE_FFLAGS;
421
422 if (autoclose==1 && CDROM_CAN(CDC_CLOSE_TRAY))
423 cdi->options |= (int) CDO_AUTO_CLOSE;
424 if (autoeject==1 && CDROM_CAN(CDC_OPEN_TRAY))
425 cdi->options |= (int) CDO_AUTO_EJECT;
426 if (lockdoor==1)
427 cdi->options |= (int) CDO_LOCK;
428 if (check_media_type==1)
429 cdi->options |= (int) CDO_CHECK_TYPE;
430
431 if (CDROM_CAN(CDC_MRW_W))
432 cdi->exit = cdrom_mrw_exit;
433
434 if (cdi->disk)
435 cdi->cdda_method = CDDA_BPC_FULL;
436 else
437 cdi->cdda_method = CDDA_OLD;
438
439 if (!cdo->generic_packet)
440 cdo->generic_packet = cdrom_dummy_generic_packet;
441
442 cdinfo(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
443 mutex_lock(&cdrom_mutex);
444 list_add(&cdi->list, &cdrom_list);
445 mutex_unlock(&cdrom_mutex);
446 return 0;
447}
448#undef ENSURE
449
450void unregister_cdrom(struct cdrom_device_info *cdi)
451{ 359{
452 cdinfo(CD_OPEN, "entering unregister_cdrom\n"); 360 struct packet_command cgc;
453 361
454 mutex_lock(&cdrom_mutex); 362 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
455 list_del(&cdi->list); 363 cgc.cmd[0] = GPCMD_FLUSH_CACHE;
456 mutex_unlock(&cdrom_mutex);
457 364
458 if (cdi->exit) 365 cgc.timeout = 5 * 60 * HZ;
459 cdi->exit(cdi);
460 366
461 cdi->ops->n_minors--; 367 return cdi->ops->generic_packet(cdi, &cgc);
462 cdinfo(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
463} 368}
464 369
465int cdrom_get_media_event(struct cdrom_device_info *cdi, 370/* requires CD R/RW */
466 struct media_event_desc *med) 371static int cdrom_get_disc_info(struct cdrom_device_info *cdi,
372 disc_information *di)
467{ 373{
374 struct cdrom_device_ops *cdo = cdi->ops;
468 struct packet_command cgc; 375 struct packet_command cgc;
469 unsigned char buffer[8]; 376 int ret, buflen;
470 struct event_header *eh = (struct event_header *) buffer;
471 377
472 init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ); 378 /* set up command and get the disc info */
473 cgc.cmd[0] = GPCMD_GET_EVENT_STATUS_NOTIFICATION; 379 init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
474 cgc.cmd[1] = 1; /* IMMED */ 380 cgc.cmd[0] = GPCMD_READ_DISC_INFO;
475 cgc.cmd[4] = 1 << 4; /* media event */ 381 cgc.cmd[8] = cgc.buflen = 2;
476 cgc.cmd[8] = sizeof(buffer);
477 cgc.quiet = 1; 382 cgc.quiet = 1;
478 383
479 if (cdi->ops->generic_packet(cdi, &cgc)) 384 ret = cdo->generic_packet(cdi, &cgc);
480 return 1; 385 if (ret)
386 return ret;
481 387
482 if (be16_to_cpu(eh->data_len) < sizeof(*med)) 388 /* not all drives have the same disc_info length, so requeue
483 return 1; 389 * packet with the length the drive tells us it can supply
390 */
391 buflen = be16_to_cpu(di->disc_information_length) +
392 sizeof(di->disc_information_length);
484 393
485 if (eh->nea || eh->notification_class != 0x4) 394 if (buflen > sizeof(disc_information))
486 return 1; 395 buflen = sizeof(disc_information);
487 396
488 memcpy(med, &buffer[sizeof(*eh)], sizeof(*med)); 397 cgc.cmd[8] = cgc.buflen = buflen;
489 return 0; 398 ret = cdo->generic_packet(cdi, &cgc);
399 if (ret)
400 return ret;
401
402 /* return actual fill size */
403 return buflen;
490} 404}
491 405
406/* This macro makes sure we don't have to check on cdrom_device_ops
407 * existence in the run-time routines below. Change_capability is a
408 * hack to have the capability flags defined const, while we can still
409 * change it here without gcc complaining at every line.
410 */
411#define ENSURE(call, bits) \
412do { \
413 if (cdo->call == NULL) \
414 *change_capability &= ~(bits); \
415} while (0)
416
492/* 417/*
493 * the first prototypes used 0x2c as the page code for the mrw mode page, 418 * the first prototypes used 0x2c as the page code for the mrw mode page,
494 * subsequently this was changed to 0x03. probe the one used by this drive 419 * subsequently this was changed to 0x03. probe the one used by this drive
@@ -605,18 +530,6 @@ static int cdrom_mrw_bgformat_susp(struct cdrom_device_info *cdi, int immed)
605 return cdi->ops->generic_packet(cdi, &cgc); 530 return cdi->ops->generic_packet(cdi, &cgc);
606} 531}
607 532
608static int cdrom_flush_cache(struct cdrom_device_info *cdi)
609{
610 struct packet_command cgc;
611
612 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
613 cgc.cmd[0] = GPCMD_FLUSH_CACHE;
614
615 cgc.timeout = 5 * 60 * HZ;
616
617 return cdi->ops->generic_packet(cdi, &cgc);
618}
619
620static int cdrom_mrw_exit(struct cdrom_device_info *cdi) 533static int cdrom_mrw_exit(struct cdrom_device_info *cdi)
621{ 534{
622 disc_information di; 535 disc_information di;
@@ -650,17 +563,19 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space)
650 cgc.buffer = buffer; 563 cgc.buffer = buffer;
651 cgc.buflen = sizeof(buffer); 564 cgc.buflen = sizeof(buffer);
652 565
653 if ((ret = cdrom_mode_sense(cdi, &cgc, cdi->mrw_mode_page, 0))) 566 ret = cdrom_mode_sense(cdi, &cgc, cdi->mrw_mode_page, 0);
567 if (ret)
654 return ret; 568 return ret;
655 569
656 mph = (struct mode_page_header *) buffer; 570 mph = (struct mode_page_header *)buffer;
657 offset = be16_to_cpu(mph->desc_length); 571 offset = be16_to_cpu(mph->desc_length);
658 size = be16_to_cpu(mph->mode_data_length) + 2; 572 size = be16_to_cpu(mph->mode_data_length) + 2;
659 573
660 buffer[offset + 3] = space; 574 buffer[offset + 3] = space;
661 cgc.buflen = size; 575 cgc.buflen = size;
662 576
663 if ((ret = cdrom_mode_select(cdi, &cgc))) 577 ret = cdrom_mode_select(cdi, &cgc);
578 if (ret)
664 return ret; 579 return ret;
665 580
666 pr_info("%s: mrw address space %s selected\n", 581 pr_info("%s: mrw address space %s selected\n",
@@ -668,6 +583,106 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space)
668 return 0; 583 return 0;
669} 584}
670 585
586int register_cdrom(struct cdrom_device_info *cdi)
587{
588 static char banner_printed;
589 struct cdrom_device_ops *cdo = cdi->ops;
590 int *change_capability = (int *)&cdo->capability; /* hack */
591
592 cd_dbg(CD_OPEN, "entering register_cdrom\n");
593
594 if (cdo->open == NULL || cdo->release == NULL)
595 return -EINVAL;
596 if (!banner_printed) {
597 pr_info("Uniform CD-ROM driver " REVISION "\n");
598 banner_printed = 1;
599 cdrom_sysctl_register();
600 }
601
602 ENSURE(drive_status, CDC_DRIVE_STATUS);
603 if (cdo->check_events == NULL && cdo->media_changed == NULL)
604 *change_capability = ~(CDC_MEDIA_CHANGED | CDC_SELECT_DISC);
605 ENSURE(tray_move, CDC_CLOSE_TRAY | CDC_OPEN_TRAY);
606 ENSURE(lock_door, CDC_LOCK);
607 ENSURE(select_speed, CDC_SELECT_SPEED);
608 ENSURE(get_last_session, CDC_MULTI_SESSION);
609 ENSURE(get_mcn, CDC_MCN);
610 ENSURE(reset, CDC_RESET);
611 ENSURE(generic_packet, CDC_GENERIC_PACKET);
612 cdi->mc_flags = 0;
613 cdo->n_minors = 0;
614 cdi->options = CDO_USE_FFLAGS;
615
616 if (autoclose == 1 && CDROM_CAN(CDC_CLOSE_TRAY))
617 cdi->options |= (int) CDO_AUTO_CLOSE;
618 if (autoeject == 1 && CDROM_CAN(CDC_OPEN_TRAY))
619 cdi->options |= (int) CDO_AUTO_EJECT;
620 if (lockdoor == 1)
621 cdi->options |= (int) CDO_LOCK;
622 if (check_media_type == 1)
623 cdi->options |= (int) CDO_CHECK_TYPE;
624
625 if (CDROM_CAN(CDC_MRW_W))
626 cdi->exit = cdrom_mrw_exit;
627
628 if (cdi->disk)
629 cdi->cdda_method = CDDA_BPC_FULL;
630 else
631 cdi->cdda_method = CDDA_OLD;
632
633 if (!cdo->generic_packet)
634 cdo->generic_packet = cdrom_dummy_generic_packet;
635
636 cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name);
637 mutex_lock(&cdrom_mutex);
638 list_add(&cdi->list, &cdrom_list);
639 mutex_unlock(&cdrom_mutex);
640 return 0;
641}
642#undef ENSURE
643
644void unregister_cdrom(struct cdrom_device_info *cdi)
645{
646 cd_dbg(CD_OPEN, "entering unregister_cdrom\n");
647
648 mutex_lock(&cdrom_mutex);
649 list_del(&cdi->list);
650 mutex_unlock(&cdrom_mutex);
651
652 if (cdi->exit)
653 cdi->exit(cdi);
654
655 cdi->ops->n_minors--;
656 cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
657}
658
659int cdrom_get_media_event(struct cdrom_device_info *cdi,
660 struct media_event_desc *med)
661{
662 struct packet_command cgc;
663 unsigned char buffer[8];
664 struct event_header *eh = (struct event_header *)buffer;
665
666 init_cdrom_command(&cgc, buffer, sizeof(buffer), CGC_DATA_READ);
667 cgc.cmd[0] = GPCMD_GET_EVENT_STATUS_NOTIFICATION;
668 cgc.cmd[1] = 1; /* IMMED */
669 cgc.cmd[4] = 1 << 4; /* media event */
670 cgc.cmd[8] = sizeof(buffer);
671 cgc.quiet = 1;
672
673 if (cdi->ops->generic_packet(cdi, &cgc))
674 return 1;
675
676 if (be16_to_cpu(eh->data_len) < sizeof(*med))
677 return 1;
678
679 if (eh->nea || eh->notification_class != 0x4)
680 return 1;
681
682 memcpy(med, &buffer[sizeof(*eh)], sizeof(*med));
683 return 0;
684}
685
671static int cdrom_get_random_writable(struct cdrom_device_info *cdi, 686static int cdrom_get_random_writable(struct cdrom_device_info *cdi,
672 struct rwrt_feature_desc *rfd) 687 struct rwrt_feature_desc *rfd)
673{ 688{
@@ -839,7 +854,7 @@ static int cdrom_ram_open_write(struct cdrom_device_info *cdi)
839 else if (CDF_RWRT == be16_to_cpu(rfd.feature_code)) 854 else if (CDF_RWRT == be16_to_cpu(rfd.feature_code))
840 ret = !rfd.curr; 855 ret = !rfd.curr;
841 856
842 cdinfo(CD_OPEN, "can open for random write\n"); 857 cd_dbg(CD_OPEN, "can open for random write\n");
843 return ret; 858 return ret;
844} 859}
845 860
@@ -928,12 +943,12 @@ static void cdrom_dvd_rw_close_write(struct cdrom_device_info *cdi)
928 struct packet_command cgc; 943 struct packet_command cgc;
929 944
930 if (cdi->mmc3_profile != 0x1a) { 945 if (cdi->mmc3_profile != 0x1a) {
931 cdinfo(CD_CLOSE, "%s: No DVD+RW\n", cdi->name); 946 cd_dbg(CD_CLOSE, "%s: No DVD+RW\n", cdi->name);
932 return; 947 return;
933 } 948 }
934 949
935 if (!cdi->media_written) { 950 if (!cdi->media_written) {
936 cdinfo(CD_CLOSE, "%s: DVD+RW media clean\n", cdi->name); 951 cd_dbg(CD_CLOSE, "%s: DVD+RW media clean\n", cdi->name);
937 return; 952 return;
938 } 953 }
939 954
@@ -969,82 +984,74 @@ static int cdrom_close_write(struct cdrom_device_info *cdi)
969#endif 984#endif
970} 985}
971 986
972/* We use the open-option O_NONBLOCK to indicate that the 987/* badly broken, I know. Is due for a fixup anytime. */
973 * purpose of opening is only for subsequent ioctl() calls; no device 988static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype *tracks)
974 * integrity checks are performed.
975 *
976 * We hope that all cd-player programs will adopt this convention. It
977 * is in their own interest: device control becomes a lot easier
978 * this way.
979 */
980int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode)
981{ 989{
982 int ret; 990 struct cdrom_tochdr header;
983 991 struct cdrom_tocentry entry;
984 cdinfo(CD_OPEN, "entering cdrom_open\n"); 992 int ret, i;
985 993 tracks->data = 0;
986 /* open is event synchronization point, check events first */ 994 tracks->audio = 0;
987 check_disk_change(bdev); 995 tracks->cdi = 0;
988 996 tracks->xa = 0;
989 /* if this was a O_NONBLOCK open and we should honor the flags, 997 tracks->error = 0;
990 * do a quick open without drive/disc integrity checks. */ 998 cd_dbg(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n");
991 cdi->use_count++; 999 /* Grab the TOC header so we can see how many tracks there are */
992 if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) { 1000 ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header);
993 ret = cdi->ops->open(cdi, 1); 1001 if (ret) {
994 } else { 1002 if (ret == -ENOMEDIUM)
995 ret = open_for_data(cdi); 1003 tracks->error = CDS_NO_DISC;
996 if (ret) 1004 else
997 goto err; 1005 tracks->error = CDS_NO_INFO;
998 cdrom_mmc3_profile(cdi); 1006 return;
999 if (mode & FMODE_WRITE) {
1000 ret = -EROFS;
1001 if (cdrom_open_write(cdi))
1002 goto err_release;
1003 if (!CDROM_CAN(CDC_RAM))
1004 goto err_release;
1005 ret = 0;
1006 cdi->media_written = 0;
1007 }
1008 } 1007 }
1009 1008 /* check what type of tracks are on this disc */
1010 if (ret) 1009 entry.cdte_format = CDROM_MSF;
1011 goto err; 1010 for (i = header.cdth_trk0; i <= header.cdth_trk1; i++) {
1012 1011 entry.cdte_track = i;
1013 cdinfo(CD_OPEN, "Use count for \"/dev/%s\" now %d\n", 1012 if (cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry)) {
1014 cdi->name, cdi->use_count); 1013 tracks->error = CDS_NO_INFO;
1015 return 0; 1014 return;
1016err_release: 1015 }
1017 if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) { 1016 if (entry.cdte_ctrl & CDROM_DATA_TRACK) {
1018 cdi->ops->lock_door(cdi, 0); 1017 if (entry.cdte_format == 0x10)
1019 cdinfo(CD_OPEN, "door unlocked.\n"); 1018 tracks->cdi++;
1019 else if (entry.cdte_format == 0x20)
1020 tracks->xa++;
1021 else
1022 tracks->data++;
1023 } else {
1024 tracks->audio++;
1025 }
1026 cd_dbg(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n",
1027 i, entry.cdte_format, entry.cdte_ctrl);
1020 } 1028 }
1021 cdi->ops->release(cdi); 1029 cd_dbg(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n",
1022err: 1030 header.cdth_trk1, tracks->audio, tracks->data,
1023 cdi->use_count--; 1031 tracks->cdi, tracks->xa);
1024 return ret;
1025} 1032}
1026 1033
1027static 1034static
1028int open_for_data(struct cdrom_device_info * cdi) 1035int open_for_data(struct cdrom_device_info *cdi)
1029{ 1036{
1030 int ret; 1037 int ret;
1031 struct cdrom_device_ops *cdo = cdi->ops; 1038 struct cdrom_device_ops *cdo = cdi->ops;
1032 tracktype tracks; 1039 tracktype tracks;
1033 cdinfo(CD_OPEN, "entering open_for_data\n"); 1040 cd_dbg(CD_OPEN, "entering open_for_data\n");
1034 /* Check if the driver can report drive status. If it can, we 1041 /* Check if the driver can report drive status. If it can, we
1035 can do clever things. If it can't, well, we at least tried! */ 1042 can do clever things. If it can't, well, we at least tried! */
1036 if (cdo->drive_status != NULL) { 1043 if (cdo->drive_status != NULL) {
1037 ret = cdo->drive_status(cdi, CDSL_CURRENT); 1044 ret = cdo->drive_status(cdi, CDSL_CURRENT);
1038 cdinfo(CD_OPEN, "drive_status=%d\n", ret); 1045 cd_dbg(CD_OPEN, "drive_status=%d\n", ret);
1039 if (ret == CDS_TRAY_OPEN) { 1046 if (ret == CDS_TRAY_OPEN) {
1040 cdinfo(CD_OPEN, "the tray is open...\n"); 1047 cd_dbg(CD_OPEN, "the tray is open...\n");
1041 /* can/may i close it? */ 1048 /* can/may i close it? */
1042 if (CDROM_CAN(CDC_CLOSE_TRAY) && 1049 if (CDROM_CAN(CDC_CLOSE_TRAY) &&
1043 cdi->options & CDO_AUTO_CLOSE) { 1050 cdi->options & CDO_AUTO_CLOSE) {
1044 cdinfo(CD_OPEN, "trying to close the tray.\n"); 1051 cd_dbg(CD_OPEN, "trying to close the tray\n");
1045 ret=cdo->tray_move(cdi,0); 1052 ret=cdo->tray_move(cdi,0);
1046 if (ret) { 1053 if (ret) {
1047 cdinfo(CD_OPEN, "bummer. tried to close the tray but failed.\n"); 1054 cd_dbg(CD_OPEN, "bummer. tried to close the tray but failed.\n");
1048 /* Ignore the error from the low 1055 /* Ignore the error from the low
1049 level driver. We don't care why it 1056 level driver. We don't care why it
1050 couldn't close the tray. We only care 1057 couldn't close the tray. We only care
@@ -1054,19 +1061,19 @@ int open_for_data(struct cdrom_device_info * cdi)
1054 goto clean_up_and_return; 1061 goto clean_up_and_return;
1055 } 1062 }
1056 } else { 1063 } else {
1057 cdinfo(CD_OPEN, "bummer. this drive can't close the tray.\n"); 1064 cd_dbg(CD_OPEN, "bummer. this drive can't close the tray.\n");
1058 ret=-ENOMEDIUM; 1065 ret=-ENOMEDIUM;
1059 goto clean_up_and_return; 1066 goto clean_up_and_return;
1060 } 1067 }
1061 /* Ok, the door should be closed now.. Check again */ 1068 /* Ok, the door should be closed now.. Check again */
1062 ret = cdo->drive_status(cdi, CDSL_CURRENT); 1069 ret = cdo->drive_status(cdi, CDSL_CURRENT);
1063 if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) { 1070 if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) {
1064 cdinfo(CD_OPEN, "bummer. the tray is still not closed.\n"); 1071 cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n");
1065 cdinfo(CD_OPEN, "tray might not contain a medium.\n"); 1072 cd_dbg(CD_OPEN, "tray might not contain a medium\n");
1066 ret=-ENOMEDIUM; 1073 ret=-ENOMEDIUM;
1067 goto clean_up_and_return; 1074 goto clean_up_and_return;
1068 } 1075 }
1069 cdinfo(CD_OPEN, "the tray is now closed.\n"); 1076 cd_dbg(CD_OPEN, "the tray is now closed\n");
1070 } 1077 }
1071 /* the door should be closed now, check for the disc */ 1078 /* the door should be closed now, check for the disc */
1072 ret = cdo->drive_status(cdi, CDSL_CURRENT); 1079 ret = cdo->drive_status(cdi, CDSL_CURRENT);
@@ -1077,7 +1084,7 @@ int open_for_data(struct cdrom_device_info * cdi)
1077 } 1084 }
1078 cdrom_count_tracks(cdi, &tracks); 1085 cdrom_count_tracks(cdi, &tracks);
1079 if (tracks.error == CDS_NO_DISC) { 1086 if (tracks.error == CDS_NO_DISC) {
1080 cdinfo(CD_OPEN, "bummer. no disc.\n"); 1087 cd_dbg(CD_OPEN, "bummer. no disc.\n");
1081 ret=-ENOMEDIUM; 1088 ret=-ENOMEDIUM;
1082 goto clean_up_and_return; 1089 goto clean_up_and_return;
1083 } 1090 }
@@ -1087,34 +1094,34 @@ int open_for_data(struct cdrom_device_info * cdi)
1087 if (cdi->options & CDO_CHECK_TYPE) { 1094 if (cdi->options & CDO_CHECK_TYPE) {
1088 /* give people a warning shot, now that CDO_CHECK_TYPE 1095 /* give people a warning shot, now that CDO_CHECK_TYPE
1089 is the default case! */ 1096 is the default case! */
1090 cdinfo(CD_OPEN, "bummer. wrong media type.\n"); 1097 cd_dbg(CD_OPEN, "bummer. wrong media type.\n");
1091 cdinfo(CD_WARNING, "pid %d must open device O_NONBLOCK!\n", 1098 cd_dbg(CD_WARNING, "pid %d must open device O_NONBLOCK!\n",
1092 (unsigned int)task_pid_nr(current)); 1099 (unsigned int)task_pid_nr(current));
1093 ret=-EMEDIUMTYPE; 1100 ret=-EMEDIUMTYPE;
1094 goto clean_up_and_return; 1101 goto clean_up_and_return;
1095 } 1102 }
1096 else { 1103 else {
1097 cdinfo(CD_OPEN, "wrong media type, but CDO_CHECK_TYPE not set.\n"); 1104 cd_dbg(CD_OPEN, "wrong media type, but CDO_CHECK_TYPE not set\n");
1098 } 1105 }
1099 } 1106 }
1100 1107
1101 	cdinfo(CD_OPEN, "all seems well, opening the device.\n"); 1108 	cd_dbg(CD_OPEN, "all seems well, opening the device\n");
1102 1109
1103 /* all seems well, we can open the device */ 1110 /* all seems well, we can open the device */
1104 ret = cdo->open(cdi, 0); /* open for data */ 1111 ret = cdo->open(cdi, 0); /* open for data */
1105 cdinfo(CD_OPEN, "opening the device gave me %d.\n", ret); 1112 cd_dbg(CD_OPEN, "opening the device gave me %d\n", ret);
1106 /* After all this careful checking, we shouldn't have problems 1113 /* After all this careful checking, we shouldn't have problems
1107 opening the device, but we don't want the device locked if 1114 opening the device, but we don't want the device locked if
1108 this somehow fails... */ 1115 this somehow fails... */
1109 if (ret) { 1116 if (ret) {
1110 cdinfo(CD_OPEN, "open device failed.\n"); 1117 cd_dbg(CD_OPEN, "open device failed\n");
1111 goto clean_up_and_return; 1118 goto clean_up_and_return;
1112 } 1119 }
1113 if (CDROM_CAN(CDC_LOCK) && (cdi->options & CDO_LOCK)) { 1120 if (CDROM_CAN(CDC_LOCK) && (cdi->options & CDO_LOCK)) {
1114 cdo->lock_door(cdi, 1); 1121 cdo->lock_door(cdi, 1);
1115 cdinfo(CD_OPEN, "door locked.\n"); 1122 cd_dbg(CD_OPEN, "door locked\n");
1116 } 1123 }
1117 cdinfo(CD_OPEN, "device opened successfully.\n"); 1124 cd_dbg(CD_OPEN, "device opened successfully\n");
1118 return ret; 1125 return ret;
1119 1126
1120 /* Something failed. Try to unlock the drive, because some drivers 1127 /* Something failed. Try to unlock the drive, because some drivers
@@ -1123,14 +1130,70 @@ int open_for_data(struct cdrom_device_info * cdi)
1123 This ensures that the drive gets unlocked after a mount fails. This 1130 This ensures that the drive gets unlocked after a mount fails. This
1124 is a goto to avoid bloating the driver with redundant code. */ 1131 is a goto to avoid bloating the driver with redundant code. */
1125clean_up_and_return: 1132clean_up_and_return:
1126 cdinfo(CD_OPEN, "open failed.\n"); 1133 cd_dbg(CD_OPEN, "open failed\n");
1127 if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) { 1134 if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) {
1128 cdo->lock_door(cdi, 0); 1135 cdo->lock_door(cdi, 0);
1129 cdinfo(CD_OPEN, "door unlocked.\n"); 1136 cd_dbg(CD_OPEN, "door unlocked\n");
1130 } 1137 }
1131 return ret; 1138 return ret;
1132} 1139}
1133 1140
1141/* We use the open-option O_NONBLOCK to indicate that the
1142 * purpose of opening is only for subsequent ioctl() calls; no device
1143 * integrity checks are performed.
1144 *
1145 * We hope that all cd-player programs will adopt this convention. It
1146 * is in their own interest: device control becomes a lot easier
1147 * this way.
1148 */
1149int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev,
1150 fmode_t mode)
1151{
1152 int ret;
1153
1154 cd_dbg(CD_OPEN, "entering cdrom_open\n");
1155
1156 /* open is event synchronization point, check events first */
1157 check_disk_change(bdev);
1158
1159 /* if this was a O_NONBLOCK open and we should honor the flags,
1160 * do a quick open without drive/disc integrity checks. */
1161 cdi->use_count++;
1162 if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) {
1163 ret = cdi->ops->open(cdi, 1);
1164 } else {
1165 ret = open_for_data(cdi);
1166 if (ret)
1167 goto err;
1168 cdrom_mmc3_profile(cdi);
1169 if (mode & FMODE_WRITE) {
1170 ret = -EROFS;
1171 if (cdrom_open_write(cdi))
1172 goto err_release;
1173 if (!CDROM_CAN(CDC_RAM))
1174 goto err_release;
1175 ret = 0;
1176 cdi->media_written = 0;
1177 }
1178 }
1179
1180 if (ret)
1181 goto err;
1182
1183 cd_dbg(CD_OPEN, "Use count for \"/dev/%s\" now %d\n",
1184 cdi->name, cdi->use_count);
1185 return 0;
1186err_release:
1187 if (CDROM_CAN(CDC_LOCK) && cdi->options & CDO_LOCK) {
1188 cdi->ops->lock_door(cdi, 0);
1189 cd_dbg(CD_OPEN, "door unlocked\n");
1190 }
1191 cdi->ops->release(cdi);
1192err:
1193 cdi->use_count--;
1194 return ret;
1195}
1196
1134/* This code is similar to that in open_for_data. The routine is called 1197/* This code is similar to that in open_for_data. The routine is called
1135 whenever an audio play operation is requested. 1198 whenever an audio play operation is requested.
1136*/ 1199*/
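
The comment moved above cdrom_open() in the previous hunk keeps the long-standing convention that an O_NONBLOCK open skips the drive/disc integrity checks and exists purely for subsequent ioctl() calls. From user space that looks roughly like the snippet below; /dev/sr0 and the particular ioctl are examples for illustration only, not part of this patch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/cdrom.h>

	int main(void)
	{
		/* O_NONBLOCK: open for ioctl() use only, no medium required */
		int fd = open("/dev/sr0", O_RDONLY | O_NONBLOCK);

		if (fd < 0)
			return 1;
		/* CDROM_DRIVE_STATUS works with the tray open or the drive empty */
		printf("drive status: %d\n",
		       ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT));
		close(fd);
		return 0;
	}
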
@@ -1139,21 +1202,21 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi,
1139{ 1202{
1140 int ret; 1203 int ret;
1141 tracktype tracks; 1204 tracktype tracks;
1142 cdinfo(CD_OPEN, "entering check_for_audio_disc\n"); 1205 cd_dbg(CD_OPEN, "entering check_for_audio_disc\n");
1143 if (!(cdi->options & CDO_CHECK_TYPE)) 1206 if (!(cdi->options & CDO_CHECK_TYPE))
1144 return 0; 1207 return 0;
1145 if (cdo->drive_status != NULL) { 1208 if (cdo->drive_status != NULL) {
1146 ret = cdo->drive_status(cdi, CDSL_CURRENT); 1209 ret = cdo->drive_status(cdi, CDSL_CURRENT);
1147 cdinfo(CD_OPEN, "drive_status=%d\n", ret); 1210 cd_dbg(CD_OPEN, "drive_status=%d\n", ret);
1148 if (ret == CDS_TRAY_OPEN) { 1211 if (ret == CDS_TRAY_OPEN) {
1149 cdinfo(CD_OPEN, "the tray is open...\n"); 1212 cd_dbg(CD_OPEN, "the tray is open...\n");
1150 /* can/may i close it? */ 1213 /* can/may i close it? */
1151 if (CDROM_CAN(CDC_CLOSE_TRAY) && 1214 if (CDROM_CAN(CDC_CLOSE_TRAY) &&
1152 cdi->options & CDO_AUTO_CLOSE) { 1215 cdi->options & CDO_AUTO_CLOSE) {
1153 cdinfo(CD_OPEN, "trying to close the tray.\n"); 1216 cd_dbg(CD_OPEN, "trying to close the tray\n");
1154 ret=cdo->tray_move(cdi,0); 1217 ret=cdo->tray_move(cdi,0);
1155 if (ret) { 1218 if (ret) {
1156 cdinfo(CD_OPEN, "bummer. tried to close tray but failed.\n"); 1219 cd_dbg(CD_OPEN, "bummer. tried to close tray but failed.\n");
1157 /* Ignore the error from the low 1220 /* Ignore the error from the low
1158 level driver. We don't care why it 1221 level driver. We don't care why it
1159 couldn't close the tray. We only care 1222 couldn't close the tray. We only care
@@ -1162,20 +1225,20 @@ static int check_for_audio_disc(struct cdrom_device_info * cdi,
1162 return -ENOMEDIUM; 1225 return -ENOMEDIUM;
1163 } 1226 }
1164 } else { 1227 } else {
1165 cdinfo(CD_OPEN, "bummer. this driver can't close the tray.\n"); 1228 cd_dbg(CD_OPEN, "bummer. this driver can't close the tray.\n");
1166 return -ENOMEDIUM; 1229 return -ENOMEDIUM;
1167 } 1230 }
1168 /* Ok, the door should be closed now.. Check again */ 1231 /* Ok, the door should be closed now.. Check again */
1169 ret = cdo->drive_status(cdi, CDSL_CURRENT); 1232 ret = cdo->drive_status(cdi, CDSL_CURRENT);
1170 if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) { 1233 if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) {
1171 cdinfo(CD_OPEN, "bummer. the tray is still not closed.\n"); 1234 cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n");
1172 return -ENOMEDIUM; 1235 return -ENOMEDIUM;
1173 } 1236 }
1174 if (ret!=CDS_DISC_OK) { 1237 if (ret!=CDS_DISC_OK) {
1175 cdinfo(CD_OPEN, "bummer. disc isn't ready.\n"); 1238 cd_dbg(CD_OPEN, "bummer. disc isn't ready.\n");
1176 return -EIO; 1239 return -EIO;
1177 } 1240 }
1178 cdinfo(CD_OPEN, "the tray is now closed.\n"); 1241 cd_dbg(CD_OPEN, "the tray is now closed\n");
1179 } 1242 }
1180 } 1243 }
1181 cdrom_count_tracks(cdi, &tracks); 1244 cdrom_count_tracks(cdi, &tracks);
@@ -1193,17 +1256,18 @@ void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode)
1193 struct cdrom_device_ops *cdo = cdi->ops; 1256 struct cdrom_device_ops *cdo = cdi->ops;
1194 int opened_for_data; 1257 int opened_for_data;
1195 1258
1196 cdinfo(CD_CLOSE, "entering cdrom_release\n"); 1259 cd_dbg(CD_CLOSE, "entering cdrom_release\n");
1197 1260
1198 if (cdi->use_count > 0) 1261 if (cdi->use_count > 0)
1199 cdi->use_count--; 1262 cdi->use_count--;
1200 1263
1201 if (cdi->use_count == 0) { 1264 if (cdi->use_count == 0) {
1202 cdinfo(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n", cdi->name); 1265 cd_dbg(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n",
1266 cdi->name);
1203 cdrom_dvd_rw_close_write(cdi); 1267 cdrom_dvd_rw_close_write(cdi);
1204 1268
1205 if ((cdo->capability & CDC_LOCK) && !cdi->keeplocked) { 1269 if ((cdo->capability & CDC_LOCK) && !cdi->keeplocked) {
1206 cdinfo(CD_CLOSE, "Unlocking door!\n"); 1270 cd_dbg(CD_CLOSE, "Unlocking door!\n");
1207 cdo->lock_door(cdi, 0); 1271 cdo->lock_door(cdi, 0);
1208 } 1272 }
1209 } 1273 }
@@ -1262,7 +1326,7 @@ static int cdrom_slot_status(struct cdrom_device_info *cdi, int slot)
1262 struct cdrom_changer_info *info; 1326 struct cdrom_changer_info *info;
1263 int ret; 1327 int ret;
1264 1328
1265 cdinfo(CD_CHANGER, "entering cdrom_slot_status()\n"); 1329 cd_dbg(CD_CHANGER, "entering cdrom_slot_status()\n");
1266 if (cdi->sanyo_slot) 1330 if (cdi->sanyo_slot)
1267 return CDS_NO_INFO; 1331 return CDS_NO_INFO;
1268 1332
@@ -1292,7 +1356,7 @@ int cdrom_number_of_slots(struct cdrom_device_info *cdi)
1292 int nslots = 1; 1356 int nslots = 1;
1293 struct cdrom_changer_info *info; 1357 struct cdrom_changer_info *info;
1294 1358
1295 cdinfo(CD_CHANGER, "entering cdrom_number_of_slots()\n"); 1359 cd_dbg(CD_CHANGER, "entering cdrom_number_of_slots()\n");
1296 /* cdrom_read_mech_status requires a valid value for capacity: */ 1360 /* cdrom_read_mech_status requires a valid value for capacity: */
1297 cdi->capacity = 0; 1361 cdi->capacity = 0;
1298 1362
@@ -1313,7 +1377,7 @@ static int cdrom_load_unload(struct cdrom_device_info *cdi, int slot)
1313{ 1377{
1314 struct packet_command cgc; 1378 struct packet_command cgc;
1315 1379
1316 cdinfo(CD_CHANGER, "entering cdrom_load_unload()\n"); 1380 cd_dbg(CD_CHANGER, "entering cdrom_load_unload()\n");
1317 if (cdi->sanyo_slot && slot < 0) 1381 if (cdi->sanyo_slot && slot < 0)
1318 return 0; 1382 return 0;
1319 1383
@@ -1342,7 +1406,7 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
1342 int curslot; 1406 int curslot;
1343 int ret; 1407 int ret;
1344 1408
1345 cdinfo(CD_CHANGER, "entering cdrom_select_disc()\n"); 1409 cd_dbg(CD_CHANGER, "entering cdrom_select_disc()\n");
1346 if (!CDROM_CAN(CDC_SELECT_DISC)) 1410 if (!CDROM_CAN(CDC_SELECT_DISC))
1347 return -EDRIVE_CANT_DO_THIS; 1411 return -EDRIVE_CANT_DO_THIS;
1348 1412
@@ -1476,51 +1540,6 @@ int cdrom_media_changed(struct cdrom_device_info *cdi)
1476 return media_changed(cdi, 0); 1540 return media_changed(cdi, 0);
1477} 1541}
1478 1542
1479/* badly broken, I know. Is due for a fixup anytime. */
1480static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype* tracks)
1481{
1482 struct cdrom_tochdr header;
1483 struct cdrom_tocentry entry;
1484 int ret, i;
1485 tracks->data=0;
1486 tracks->audio=0;
1487 tracks->cdi=0;
1488 tracks->xa=0;
1489 tracks->error=0;
1490 cdinfo(CD_COUNT_TRACKS, "entering cdrom_count_tracks\n");
1491 /* Grab the TOC header so we can see how many tracks there are */
1492 if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCHDR, &header))) {
1493 if (ret == -ENOMEDIUM)
1494 tracks->error = CDS_NO_DISC;
1495 else
1496 tracks->error = CDS_NO_INFO;
1497 return;
1498 }
1499 /* check what type of tracks are on this disc */
1500 entry.cdte_format = CDROM_MSF;
1501 for (i = header.cdth_trk0; i <= header.cdth_trk1; i++) {
1502 entry.cdte_track = i;
1503 if (cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry)) {
1504 tracks->error=CDS_NO_INFO;
1505 return;
1506 }
1507 if (entry.cdte_ctrl & CDROM_DATA_TRACK) {
1508 if (entry.cdte_format == 0x10)
1509 tracks->cdi++;
1510 else if (entry.cdte_format == 0x20)
1511 tracks->xa++;
1512 else
1513 tracks->data++;
1514 } else
1515 tracks->audio++;
1516 cdinfo(CD_COUNT_TRACKS, "track %d: format=%d, ctrl=%d\n",
1517 i, entry.cdte_format, entry.cdte_ctrl);
1518 }
1519 cdinfo(CD_COUNT_TRACKS, "disc has %d tracks: %d=audio %d=data %d=Cd-I %d=XA\n",
1520 header.cdth_trk1, tracks->audio, tracks->data,
1521 tracks->cdi, tracks->xa);
1522}
1523
1524/* Requests to the low-level drivers will /always/ be done in the 1543/* Requests to the low-level drivers will /always/ be done in the
1525 following format convention: 1544 following format convention:
1526 1545
@@ -1632,7 +1651,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1632 switch (ai->type) { 1651 switch (ai->type) {
1633 /* LU data send */ 1652 /* LU data send */
1634 case DVD_LU_SEND_AGID: 1653 case DVD_LU_SEND_AGID:
1635 cdinfo(CD_DVD, "entering DVD_LU_SEND_AGID\n"); 1654 cd_dbg(CD_DVD, "entering DVD_LU_SEND_AGID\n");
1636 cgc.quiet = 1; 1655 cgc.quiet = 1;
1637 setup_report_key(&cgc, ai->lsa.agid, 0); 1656 setup_report_key(&cgc, ai->lsa.agid, 0);
1638 1657
@@ -1644,7 +1663,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1644 break; 1663 break;
1645 1664
1646 case DVD_LU_SEND_KEY1: 1665 case DVD_LU_SEND_KEY1:
1647 cdinfo(CD_DVD, "entering DVD_LU_SEND_KEY1\n"); 1666 cd_dbg(CD_DVD, "entering DVD_LU_SEND_KEY1\n");
1648 setup_report_key(&cgc, ai->lsk.agid, 2); 1667 setup_report_key(&cgc, ai->lsk.agid, 2);
1649 1668
1650 if ((ret = cdo->generic_packet(cdi, &cgc))) 1669 if ((ret = cdo->generic_packet(cdi, &cgc)))
@@ -1655,7 +1674,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1655 break; 1674 break;
1656 1675
1657 case DVD_LU_SEND_CHALLENGE: 1676 case DVD_LU_SEND_CHALLENGE:
1658 cdinfo(CD_DVD, "entering DVD_LU_SEND_CHALLENGE\n"); 1677 cd_dbg(CD_DVD, "entering DVD_LU_SEND_CHALLENGE\n");
1659 setup_report_key(&cgc, ai->lsc.agid, 1); 1678 setup_report_key(&cgc, ai->lsc.agid, 1);
1660 1679
1661 if ((ret = cdo->generic_packet(cdi, &cgc))) 1680 if ((ret = cdo->generic_packet(cdi, &cgc)))
@@ -1667,7 +1686,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1667 1686
1668 /* Post-auth key */ 1687 /* Post-auth key */
1669 case DVD_LU_SEND_TITLE_KEY: 1688 case DVD_LU_SEND_TITLE_KEY:
1670 cdinfo(CD_DVD, "entering DVD_LU_SEND_TITLE_KEY\n"); 1689 cd_dbg(CD_DVD, "entering DVD_LU_SEND_TITLE_KEY\n");
1671 cgc.quiet = 1; 1690 cgc.quiet = 1;
1672 setup_report_key(&cgc, ai->lstk.agid, 4); 1691 setup_report_key(&cgc, ai->lstk.agid, 4);
1673 cgc.cmd[5] = ai->lstk.lba; 1692 cgc.cmd[5] = ai->lstk.lba;
@@ -1686,7 +1705,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1686 break; 1705 break;
1687 1706
1688 case DVD_LU_SEND_ASF: 1707 case DVD_LU_SEND_ASF:
1689 cdinfo(CD_DVD, "entering DVD_LU_SEND_ASF\n"); 1708 cd_dbg(CD_DVD, "entering DVD_LU_SEND_ASF\n");
1690 setup_report_key(&cgc, ai->lsasf.agid, 5); 1709 setup_report_key(&cgc, ai->lsasf.agid, 5);
1691 1710
1692 if ((ret = cdo->generic_packet(cdi, &cgc))) 1711 if ((ret = cdo->generic_packet(cdi, &cgc)))
@@ -1697,7 +1716,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1697 1716
1698 /* LU data receive (LU changes state) */ 1717 /* LU data receive (LU changes state) */
1699 case DVD_HOST_SEND_CHALLENGE: 1718 case DVD_HOST_SEND_CHALLENGE:
1700 cdinfo(CD_DVD, "entering DVD_HOST_SEND_CHALLENGE\n"); 1719 cd_dbg(CD_DVD, "entering DVD_HOST_SEND_CHALLENGE\n");
1701 setup_send_key(&cgc, ai->hsc.agid, 1); 1720 setup_send_key(&cgc, ai->hsc.agid, 1);
1702 buf[1] = 0xe; 1721 buf[1] = 0xe;
1703 copy_chal(&buf[4], ai->hsc.chal); 1722 copy_chal(&buf[4], ai->hsc.chal);
@@ -1709,7 +1728,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1709 break; 1728 break;
1710 1729
1711 case DVD_HOST_SEND_KEY2: 1730 case DVD_HOST_SEND_KEY2:
1712 cdinfo(CD_DVD, "entering DVD_HOST_SEND_KEY2\n"); 1731 cd_dbg(CD_DVD, "entering DVD_HOST_SEND_KEY2\n");
1713 setup_send_key(&cgc, ai->hsk.agid, 3); 1732 setup_send_key(&cgc, ai->hsk.agid, 3);
1714 buf[1] = 0xa; 1733 buf[1] = 0xa;
1715 copy_key(&buf[4], ai->hsk.key); 1734 copy_key(&buf[4], ai->hsk.key);
@@ -1724,7 +1743,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1724 /* Misc */ 1743 /* Misc */
1725 case DVD_INVALIDATE_AGID: 1744 case DVD_INVALIDATE_AGID:
1726 cgc.quiet = 1; 1745 cgc.quiet = 1;
1727 cdinfo(CD_DVD, "entering DVD_INVALIDATE_AGID\n"); 1746 cd_dbg(CD_DVD, "entering DVD_INVALIDATE_AGID\n");
1728 setup_report_key(&cgc, ai->lsa.agid, 0x3f); 1747 setup_report_key(&cgc, ai->lsa.agid, 0x3f);
1729 if ((ret = cdo->generic_packet(cdi, &cgc))) 1748 if ((ret = cdo->generic_packet(cdi, &cgc)))
1730 return ret; 1749 return ret;
@@ -1732,7 +1751,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1732 1751
1733 /* Get region settings */ 1752 /* Get region settings */
1734 case DVD_LU_SEND_RPC_STATE: 1753 case DVD_LU_SEND_RPC_STATE:
1735 cdinfo(CD_DVD, "entering DVD_LU_SEND_RPC_STATE\n"); 1754 cd_dbg(CD_DVD, "entering DVD_LU_SEND_RPC_STATE\n");
1736 setup_report_key(&cgc, 0, 8); 1755 setup_report_key(&cgc, 0, 8);
1737 memset(&rpc_state, 0, sizeof(rpc_state_t)); 1756 memset(&rpc_state, 0, sizeof(rpc_state_t));
1738 cgc.buffer = (char *) &rpc_state; 1757 cgc.buffer = (char *) &rpc_state;
@@ -1749,7 +1768,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1749 1768
1750 /* Set region settings */ 1769 /* Set region settings */
1751 case DVD_HOST_SEND_RPC_STATE: 1770 case DVD_HOST_SEND_RPC_STATE:
1752 cdinfo(CD_DVD, "entering DVD_HOST_SEND_RPC_STATE\n"); 1771 cd_dbg(CD_DVD, "entering DVD_HOST_SEND_RPC_STATE\n");
1753 setup_send_key(&cgc, 0, 6); 1772 setup_send_key(&cgc, 0, 6);
1754 buf[1] = 6; 1773 buf[1] = 6;
1755 buf[4] = ai->hrpcs.pdrc; 1774 buf[4] = ai->hrpcs.pdrc;
@@ -1759,7 +1778,7 @@ static int dvd_do_auth(struct cdrom_device_info *cdi, dvd_authinfo *ai)
1759 break; 1778 break;
1760 1779
1761 default: 1780 default:
1762 cdinfo(CD_WARNING, "Invalid DVD key ioctl (%d)\n", ai->type); 1781 cd_dbg(CD_WARNING, "Invalid DVD key ioctl (%d)\n", ai->type);
1763 return -ENOTTY; 1782 return -ENOTTY;
1764 } 1783 }
1765 1784
@@ -1891,7 +1910,8 @@ static int dvd_read_bca(struct cdrom_device_info *cdi, dvd_struct *s,
1891 1910
1892 s->bca.len = buf[0] << 8 | buf[1]; 1911 s->bca.len = buf[0] << 8 | buf[1];
1893 if (s->bca.len < 12 || s->bca.len > 188) { 1912 if (s->bca.len < 12 || s->bca.len > 188) {
1894 cdinfo(CD_WARNING, "Received invalid BCA length (%d)\n", s->bca.len); 1913 cd_dbg(CD_WARNING, "Received invalid BCA length (%d)\n",
1914 s->bca.len);
1895 ret = -EIO; 1915 ret = -EIO;
1896 goto out; 1916 goto out;
1897 } 1917 }
@@ -1927,14 +1947,13 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s,
1927 1947
1928 s->manufact.len = buf[0] << 8 | buf[1]; 1948 s->manufact.len = buf[0] << 8 | buf[1];
1929 if (s->manufact.len < 0) { 1949 if (s->manufact.len < 0) {
1930 cdinfo(CD_WARNING, "Received invalid manufacture info length" 1950 cd_dbg(CD_WARNING, "Received invalid manufacture info length (%d)\n",
1931 " (%d)\n", s->manufact.len); 1951 s->manufact.len);
1932 ret = -EIO; 1952 ret = -EIO;
1933 } else { 1953 } else {
1934 if (s->manufact.len > 2048) { 1954 if (s->manufact.len > 2048) {
1935 cdinfo(CD_WARNING, "Received invalid manufacture info " 1955 cd_dbg(CD_WARNING, "Received invalid manufacture info length (%d): truncating to 2048\n",
1936 "length (%d): truncating to 2048\n", 1956 s->manufact.len);
1937 s->manufact.len);
1938 s->manufact.len = 2048; 1957 s->manufact.len = 2048;
1939 } 1958 }
1940 memcpy(s->manufact.value, &buf[4], s->manufact.len); 1959 memcpy(s->manufact.value, &buf[4], s->manufact.len);
@@ -1965,8 +1984,8 @@ static int dvd_read_struct(struct cdrom_device_info *cdi, dvd_struct *s,
1965 return dvd_read_manufact(cdi, s, cgc); 1984 return dvd_read_manufact(cdi, s, cgc);
1966 1985
1967 default: 1986 default:
1968 cdinfo(CD_WARNING, ": Invalid DVD structure read requested (%d)\n", 1987 cd_dbg(CD_WARNING, ": Invalid DVD structure read requested (%d)\n",
1969 s->type); 1988 s->type);
1970 return -EINVAL; 1989 return -EINVAL;
1971 } 1990 }
1972} 1991}
@@ -2255,7 +2274,7 @@ static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi,
2255 u8 requested_format; 2274 u8 requested_format;
2256 int ret; 2275 int ret;
2257 2276
2258 cdinfo(CD_DO_IOCTL, "entering CDROMMULTISESSION\n"); 2277 cd_dbg(CD_DO_IOCTL, "entering CDROMMULTISESSION\n");
2259 2278
2260 if (!(cdi->ops->capability & CDC_MULTI_SESSION)) 2279 if (!(cdi->ops->capability & CDC_MULTI_SESSION))
2261 return -ENOSYS; 2280 return -ENOSYS;
@@ -2277,13 +2296,13 @@ static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi,
2277 if (copy_to_user(argp, &ms_info, sizeof(ms_info))) 2296 if (copy_to_user(argp, &ms_info, sizeof(ms_info)))
2278 return -EFAULT; 2297 return -EFAULT;
2279 2298
2280 cdinfo(CD_DO_IOCTL, "CDROMMULTISESSION successful\n"); 2299 cd_dbg(CD_DO_IOCTL, "CDROMMULTISESSION successful\n");
2281 return 0; 2300 return 0;
2282} 2301}
2283 2302
2284static int cdrom_ioctl_eject(struct cdrom_device_info *cdi) 2303static int cdrom_ioctl_eject(struct cdrom_device_info *cdi)
2285{ 2304{
2286 cdinfo(CD_DO_IOCTL, "entering CDROMEJECT\n"); 2305 cd_dbg(CD_DO_IOCTL, "entering CDROMEJECT\n");
2287 2306
2288 if (!CDROM_CAN(CDC_OPEN_TRAY)) 2307 if (!CDROM_CAN(CDC_OPEN_TRAY))
2289 return -ENOSYS; 2308 return -ENOSYS;
@@ -2300,7 +2319,7 @@ static int cdrom_ioctl_eject(struct cdrom_device_info *cdi)
2300 2319
2301static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi) 2320static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi)
2302{ 2321{
2303 cdinfo(CD_DO_IOCTL, "entering CDROMCLOSETRAY\n"); 2322 cd_dbg(CD_DO_IOCTL, "entering CDROMCLOSETRAY\n");
2304 2323
2305 if (!CDROM_CAN(CDC_CLOSE_TRAY)) 2324 if (!CDROM_CAN(CDC_CLOSE_TRAY))
2306 return -ENOSYS; 2325 return -ENOSYS;
@@ -2310,7 +2329,7 @@ static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi)
2310static int cdrom_ioctl_eject_sw(struct cdrom_device_info *cdi, 2329static int cdrom_ioctl_eject_sw(struct cdrom_device_info *cdi,
2311 unsigned long arg) 2330 unsigned long arg)
2312{ 2331{
2313 cdinfo(CD_DO_IOCTL, "entering CDROMEJECT_SW\n"); 2332 cd_dbg(CD_DO_IOCTL, "entering CDROMEJECT_SW\n");
2314 2333
2315 if (!CDROM_CAN(CDC_OPEN_TRAY)) 2334 if (!CDROM_CAN(CDC_OPEN_TRAY))
2316 return -ENOSYS; 2335 return -ENOSYS;
@@ -2329,7 +2348,7 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi,
2329 struct cdrom_changer_info *info; 2348 struct cdrom_changer_info *info;
2330 int ret; 2349 int ret;
2331 2350
2332 cdinfo(CD_DO_IOCTL, "entering CDROM_MEDIA_CHANGED\n"); 2351 cd_dbg(CD_DO_IOCTL, "entering CDROM_MEDIA_CHANGED\n");
2333 2352
2334 if (!CDROM_CAN(CDC_MEDIA_CHANGED)) 2353 if (!CDROM_CAN(CDC_MEDIA_CHANGED))
2335 return -ENOSYS; 2354 return -ENOSYS;
@@ -2355,7 +2374,7 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi,
2355static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi, 2374static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi,
2356 unsigned long arg) 2375 unsigned long arg)
2357{ 2376{
2358 cdinfo(CD_DO_IOCTL, "entering CDROM_SET_OPTIONS\n"); 2377 cd_dbg(CD_DO_IOCTL, "entering CDROM_SET_OPTIONS\n");
2359 2378
2360 /* 2379 /*
2361 * Options need to be in sync with capability. 2380 * Options need to be in sync with capability.
@@ -2383,7 +2402,7 @@ static int cdrom_ioctl_set_options(struct cdrom_device_info *cdi,
2383static int cdrom_ioctl_clear_options(struct cdrom_device_info *cdi, 2402static int cdrom_ioctl_clear_options(struct cdrom_device_info *cdi,
2384 unsigned long arg) 2403 unsigned long arg)
2385{ 2404{
2386 cdinfo(CD_DO_IOCTL, "entering CDROM_CLEAR_OPTIONS\n"); 2405 cd_dbg(CD_DO_IOCTL, "entering CDROM_CLEAR_OPTIONS\n");
2387 2406
2388 cdi->options &= ~(int) arg; 2407 cdi->options &= ~(int) arg;
2389 return cdi->options; 2408 return cdi->options;
@@ -2392,7 +2411,7 @@ static int cdrom_ioctl_clear_options(struct cdrom_device_info *cdi,
2392static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi, 2411static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi,
2393 unsigned long arg) 2412 unsigned long arg)
2394{ 2413{
2395 cdinfo(CD_DO_IOCTL, "entering CDROM_SELECT_SPEED\n"); 2414 cd_dbg(CD_DO_IOCTL, "entering CDROM_SELECT_SPEED\n");
2396 2415
2397 if (!CDROM_CAN(CDC_SELECT_SPEED)) 2416 if (!CDROM_CAN(CDC_SELECT_SPEED))
2398 return -ENOSYS; 2417 return -ENOSYS;
@@ -2402,7 +2421,7 @@ static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi,
2402static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi, 2421static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi,
2403 unsigned long arg) 2422 unsigned long arg)
2404{ 2423{
2405 cdinfo(CD_DO_IOCTL, "entering CDROM_SELECT_DISC\n"); 2424 cd_dbg(CD_DO_IOCTL, "entering CDROM_SELECT_DISC\n");
2406 2425
2407 if (!CDROM_CAN(CDC_SELECT_DISC)) 2426 if (!CDROM_CAN(CDC_SELECT_DISC))
2408 return -ENOSYS; 2427 return -ENOSYS;
@@ -2420,14 +2439,14 @@ static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi,
2420 if (cdi->ops->select_disc) 2439 if (cdi->ops->select_disc)
2421 return cdi->ops->select_disc(cdi, arg); 2440 return cdi->ops->select_disc(cdi, arg);
2422 2441
2423 cdinfo(CD_CHANGER, "Using generic cdrom_select_disc()\n"); 2442 cd_dbg(CD_CHANGER, "Using generic cdrom_select_disc()\n");
2424 return cdrom_select_disc(cdi, arg); 2443 return cdrom_select_disc(cdi, arg);
2425} 2444}
2426 2445
2427static int cdrom_ioctl_reset(struct cdrom_device_info *cdi, 2446static int cdrom_ioctl_reset(struct cdrom_device_info *cdi,
2428 struct block_device *bdev) 2447 struct block_device *bdev)
2429{ 2448{
2430 cdinfo(CD_DO_IOCTL, "entering CDROM_RESET\n"); 2449 cd_dbg(CD_DO_IOCTL, "entering CDROM_RESET\n");
2431 2450
2432 if (!capable(CAP_SYS_ADMIN)) 2451 if (!capable(CAP_SYS_ADMIN))
2433 return -EACCES; 2452 return -EACCES;
@@ -2440,7 +2459,7 @@ static int cdrom_ioctl_reset(struct cdrom_device_info *cdi,
2440static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi, 2459static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi,
2441 unsigned long arg) 2460 unsigned long arg)
2442{ 2461{
2443 cdinfo(CD_DO_IOCTL, "%socking door.\n", arg ? "L" : "Unl"); 2462 cd_dbg(CD_DO_IOCTL, "%socking door\n", arg ? "L" : "Unl");
2444 2463
2445 if (!CDROM_CAN(CDC_LOCK)) 2464 if (!CDROM_CAN(CDC_LOCK))
2446 return -EDRIVE_CANT_DO_THIS; 2465 return -EDRIVE_CANT_DO_THIS;
@@ -2459,7 +2478,7 @@ static int cdrom_ioctl_lock_door(struct cdrom_device_info *cdi,
2459static int cdrom_ioctl_debug(struct cdrom_device_info *cdi, 2478static int cdrom_ioctl_debug(struct cdrom_device_info *cdi,
2460 unsigned long arg) 2479 unsigned long arg)
2461{ 2480{
2462 cdinfo(CD_DO_IOCTL, "%sabling debug.\n", arg ? "En" : "Dis"); 2481 cd_dbg(CD_DO_IOCTL, "%sabling debug\n", arg ? "En" : "Dis");
2463 2482
2464 if (!capable(CAP_SYS_ADMIN)) 2483 if (!capable(CAP_SYS_ADMIN))
2465 return -EACCES; 2484 return -EACCES;
@@ -2469,7 +2488,7 @@ static int cdrom_ioctl_debug(struct cdrom_device_info *cdi,
2469 2488
2470static int cdrom_ioctl_get_capability(struct cdrom_device_info *cdi) 2489static int cdrom_ioctl_get_capability(struct cdrom_device_info *cdi)
2471{ 2490{
2472 cdinfo(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n"); 2491 cd_dbg(CD_DO_IOCTL, "entering CDROM_GET_CAPABILITY\n");
2473 return (cdi->ops->capability & ~cdi->mask); 2492 return (cdi->ops->capability & ~cdi->mask);
2474} 2493}
2475 2494
@@ -2485,7 +2504,7 @@ static int cdrom_ioctl_get_mcn(struct cdrom_device_info *cdi,
2485 struct cdrom_mcn mcn; 2504 struct cdrom_mcn mcn;
2486 int ret; 2505 int ret;
2487 2506
2488 cdinfo(CD_DO_IOCTL, "entering CDROM_GET_MCN\n"); 2507 cd_dbg(CD_DO_IOCTL, "entering CDROM_GET_MCN\n");
2489 2508
2490 if (!(cdi->ops->capability & CDC_MCN)) 2509 if (!(cdi->ops->capability & CDC_MCN))
2491 return -ENOSYS; 2510 return -ENOSYS;
@@ -2495,14 +2514,14 @@ static int cdrom_ioctl_get_mcn(struct cdrom_device_info *cdi,
2495 2514
2496 if (copy_to_user(argp, &mcn, sizeof(mcn))) 2515 if (copy_to_user(argp, &mcn, sizeof(mcn)))
2497 return -EFAULT; 2516 return -EFAULT;
2498 cdinfo(CD_DO_IOCTL, "CDROM_GET_MCN successful\n"); 2517 cd_dbg(CD_DO_IOCTL, "CDROM_GET_MCN successful\n");
2499 return 0; 2518 return 0;
2500} 2519}
2501 2520
2502static int cdrom_ioctl_drive_status(struct cdrom_device_info *cdi, 2521static int cdrom_ioctl_drive_status(struct cdrom_device_info *cdi,
2503 unsigned long arg) 2522 unsigned long arg)
2504{ 2523{
2505 cdinfo(CD_DO_IOCTL, "entering CDROM_DRIVE_STATUS\n"); 2524 cd_dbg(CD_DO_IOCTL, "entering CDROM_DRIVE_STATUS\n");
2506 2525
2507 if (!(cdi->ops->capability & CDC_DRIVE_STATUS)) 2526 if (!(cdi->ops->capability & CDC_DRIVE_STATUS))
2508 return -ENOSYS; 2527 return -ENOSYS;
@@ -2535,7 +2554,7 @@ static int cdrom_ioctl_disc_status(struct cdrom_device_info *cdi)
2535{ 2554{
2536 tracktype tracks; 2555 tracktype tracks;
2537 2556
2538 cdinfo(CD_DO_IOCTL, "entering CDROM_DISC_STATUS\n"); 2557 cd_dbg(CD_DO_IOCTL, "entering CDROM_DISC_STATUS\n");
2539 2558
2540 cdrom_count_tracks(cdi, &tracks); 2559 cdrom_count_tracks(cdi, &tracks);
2541 if (tracks.error) 2560 if (tracks.error)
@@ -2557,13 +2576,13 @@ static int cdrom_ioctl_disc_status(struct cdrom_device_info *cdi)
2557 return CDS_DATA_1; 2576 return CDS_DATA_1;
2558 /* Policy mode off */ 2577 /* Policy mode off */
2559 2578
2560 cdinfo(CD_WARNING,"This disc doesn't have any tracks I recognize!\n"); 2579 cd_dbg(CD_WARNING, "This disc doesn't have any tracks I recognize!\n");
2561 return CDS_NO_INFO; 2580 return CDS_NO_INFO;
2562} 2581}
2563 2582
2564static int cdrom_ioctl_changer_nslots(struct cdrom_device_info *cdi) 2583static int cdrom_ioctl_changer_nslots(struct cdrom_device_info *cdi)
2565{ 2584{
2566 cdinfo(CD_DO_IOCTL, "entering CDROM_CHANGER_NSLOTS\n"); 2585 cd_dbg(CD_DO_IOCTL, "entering CDROM_CHANGER_NSLOTS\n");
2567 return cdi->capacity; 2586 return cdi->capacity;
2568} 2587}
2569 2588
@@ -2574,7 +2593,7 @@ static int cdrom_ioctl_get_subchnl(struct cdrom_device_info *cdi,
2574 u8 requested, back; 2593 u8 requested, back;
2575 int ret; 2594 int ret;
2576 2595
2577 /* cdinfo(CD_DO_IOCTL,"entering CDROMSUBCHNL\n");*/ 2596 /* cd_dbg(CD_DO_IOCTL,"entering CDROMSUBCHNL\n");*/
2578 2597
2579 if (copy_from_user(&q, argp, sizeof(q))) 2598 if (copy_from_user(&q, argp, sizeof(q)))
2580 return -EFAULT; 2599 return -EFAULT;
@@ -2594,7 +2613,7 @@ static int cdrom_ioctl_get_subchnl(struct cdrom_device_info *cdi,
2594 2613
2595 if (copy_to_user(argp, &q, sizeof(q))) 2614 if (copy_to_user(argp, &q, sizeof(q)))
2596 return -EFAULT; 2615 return -EFAULT;
2597 /* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ 2616 /* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
2598 return 0; 2617 return 0;
2599} 2618}
2600 2619
@@ -2604,7 +2623,7 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi,
2604 struct cdrom_tochdr header; 2623 struct cdrom_tochdr header;
2605 int ret; 2624 int ret;
2606 2625
2607 /* cdinfo(CD_DO_IOCTL, "entering CDROMREADTOCHDR\n"); */ 2626 /* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCHDR\n"); */
2608 2627
2609 if (copy_from_user(&header, argp, sizeof(header))) 2628 if (copy_from_user(&header, argp, sizeof(header)))
2610 return -EFAULT; 2629 return -EFAULT;
@@ -2615,7 +2634,7 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi,
2615 2634
2616 if (copy_to_user(argp, &header, sizeof(header))) 2635 if (copy_to_user(argp, &header, sizeof(header)))
2617 return -EFAULT; 2636 return -EFAULT;
2618 /* cdinfo(CD_DO_IOCTL, "CDROMREADTOCHDR successful\n"); */ 2637 /* cd_dbg(CD_DO_IOCTL, "CDROMREADTOCHDR successful\n"); */
2619 return 0; 2638 return 0;
2620} 2639}
2621 2640
@@ -2626,7 +2645,7 @@ static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi,
2626 u8 requested_format; 2645 u8 requested_format;
2627 int ret; 2646 int ret;
2628 2647
2629 /* cdinfo(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */ 2648 /* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */
2630 2649
2631 if (copy_from_user(&entry, argp, sizeof(entry))) 2650 if (copy_from_user(&entry, argp, sizeof(entry)))
2632 return -EFAULT; 2651 return -EFAULT;
@@ -2643,7 +2662,7 @@ static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi,
2643 2662
2644 if (copy_to_user(argp, &entry, sizeof(entry))) 2663 if (copy_to_user(argp, &entry, sizeof(entry)))
2645 return -EFAULT; 2664 return -EFAULT;
2646 /* cdinfo(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */ 2665 /* cd_dbg(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */
2647 return 0; 2666 return 0;
2648} 2667}
2649 2668
@@ -2652,7 +2671,7 @@ static int cdrom_ioctl_play_msf(struct cdrom_device_info *cdi,
2652{ 2671{
2653 struct cdrom_msf msf; 2672 struct cdrom_msf msf;
2654 2673
2655 cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); 2674 cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
2656 2675
2657 if (!CDROM_CAN(CDC_PLAY_AUDIO)) 2676 if (!CDROM_CAN(CDC_PLAY_AUDIO))
2658 return -ENOSYS; 2677 return -ENOSYS;
@@ -2667,7 +2686,7 @@ static int cdrom_ioctl_play_trkind(struct cdrom_device_info *cdi,
2667 struct cdrom_ti ti; 2686 struct cdrom_ti ti;
2668 int ret; 2687 int ret;
2669 2688
2670 cdinfo(CD_DO_IOCTL, "entering CDROMPLAYTRKIND\n"); 2689 cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYTRKIND\n");
2671 2690
2672 if (!CDROM_CAN(CDC_PLAY_AUDIO)) 2691 if (!CDROM_CAN(CDC_PLAY_AUDIO))
2673 return -ENOSYS; 2692 return -ENOSYS;
@@ -2684,7 +2703,7 @@ static int cdrom_ioctl_volctrl(struct cdrom_device_info *cdi,
2684{ 2703{
2685 struct cdrom_volctrl volume; 2704 struct cdrom_volctrl volume;
2686 2705
2687 cdinfo(CD_DO_IOCTL, "entering CDROMVOLCTRL\n"); 2706 cd_dbg(CD_DO_IOCTL, "entering CDROMVOLCTRL\n");
2688 2707
2689 if (!CDROM_CAN(CDC_PLAY_AUDIO)) 2708 if (!CDROM_CAN(CDC_PLAY_AUDIO))
2690 return -ENOSYS; 2709 return -ENOSYS;
@@ -2699,7 +2718,7 @@ static int cdrom_ioctl_volread(struct cdrom_device_info *cdi,
2699 struct cdrom_volctrl volume; 2718 struct cdrom_volctrl volume;
2700 int ret; 2719 int ret;
2701 2720
2702 cdinfo(CD_DO_IOCTL, "entering CDROMVOLREAD\n"); 2721 cd_dbg(CD_DO_IOCTL, "entering CDROMVOLREAD\n");
2703 2722
2704 if (!CDROM_CAN(CDC_PLAY_AUDIO)) 2723 if (!CDROM_CAN(CDC_PLAY_AUDIO))
2705 return -ENOSYS; 2724 return -ENOSYS;
@@ -2718,7 +2737,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
2718{ 2737{
2719 int ret; 2738 int ret;
2720 2739
2721 cdinfo(CD_DO_IOCTL, "doing audio ioctl (start/stop/pause/resume)\n"); 2740 cd_dbg(CD_DO_IOCTL, "doing audio ioctl (start/stop/pause/resume)\n");
2722 2741
2723 if (!CDROM_CAN(CDC_PLAY_AUDIO)) 2742 if (!CDROM_CAN(CDC_PLAY_AUDIO))
2724 return -ENOSYS; 2743 return -ENOSYS;
@@ -2729,103 +2748,6 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
2729} 2748}
2730 2749
2731/* 2750/*
2732 * Just about every imaginable ioctl is supported in the Uniform layer
2733 * these days.
2734 * ATAPI / SCSI specific code now mainly resides in mmc_ioctl().
2735 */
2736int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
2737 fmode_t mode, unsigned int cmd, unsigned long arg)
2738{
2739 void __user *argp = (void __user *)arg;
2740 int ret;
2741
2742 /*
2743 * Try the generic SCSI command ioctl's first.
2744 */
2745 ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
2746 if (ret != -ENOTTY)
2747 return ret;
2748
2749 switch (cmd) {
2750 case CDROMMULTISESSION:
2751 return cdrom_ioctl_multisession(cdi, argp);
2752 case CDROMEJECT:
2753 return cdrom_ioctl_eject(cdi);
2754 case CDROMCLOSETRAY:
2755 return cdrom_ioctl_closetray(cdi);
2756 case CDROMEJECT_SW:
2757 return cdrom_ioctl_eject_sw(cdi, arg);
2758 case CDROM_MEDIA_CHANGED:
2759 return cdrom_ioctl_media_changed(cdi, arg);
2760 case CDROM_SET_OPTIONS:
2761 return cdrom_ioctl_set_options(cdi, arg);
2762 case CDROM_CLEAR_OPTIONS:
2763 return cdrom_ioctl_clear_options(cdi, arg);
2764 case CDROM_SELECT_SPEED:
2765 return cdrom_ioctl_select_speed(cdi, arg);
2766 case CDROM_SELECT_DISC:
2767 return cdrom_ioctl_select_disc(cdi, arg);
2768 case CDROMRESET:
2769 return cdrom_ioctl_reset(cdi, bdev);
2770 case CDROM_LOCKDOOR:
2771 return cdrom_ioctl_lock_door(cdi, arg);
2772 case CDROM_DEBUG:
2773 return cdrom_ioctl_debug(cdi, arg);
2774 case CDROM_GET_CAPABILITY:
2775 return cdrom_ioctl_get_capability(cdi);
2776 case CDROM_GET_MCN:
2777 return cdrom_ioctl_get_mcn(cdi, argp);
2778 case CDROM_DRIVE_STATUS:
2779 return cdrom_ioctl_drive_status(cdi, arg);
2780 case CDROM_DISC_STATUS:
2781 return cdrom_ioctl_disc_status(cdi);
2782 case CDROM_CHANGER_NSLOTS:
2783 return cdrom_ioctl_changer_nslots(cdi);
2784 }
2785
2786 /*
2787 * Use the ioctls that are implemented through the generic_packet()
2788 * interface. this may look at bit funny, but if -ENOTTY is
2789 * returned that particular ioctl is not implemented and we
2790 * let it go through the device specific ones.
2791 */
2792 if (CDROM_CAN(CDC_GENERIC_PACKET)) {
2793 ret = mmc_ioctl(cdi, cmd, arg);
2794 if (ret != -ENOTTY)
2795 return ret;
2796 }
2797
2798 /*
2799 * Note: most of the cdinfo() calls are commented out here,
2800 * because they fill up the sys log when CD players poll
2801 * the drive.
2802 */
2803 switch (cmd) {
2804 case CDROMSUBCHNL:
2805 return cdrom_ioctl_get_subchnl(cdi, argp);
2806 case CDROMREADTOCHDR:
2807 return cdrom_ioctl_read_tochdr(cdi, argp);
2808 case CDROMREADTOCENTRY:
2809 return cdrom_ioctl_read_tocentry(cdi, argp);
2810 case CDROMPLAYMSF:
2811 return cdrom_ioctl_play_msf(cdi, argp);
2812 case CDROMPLAYTRKIND:
2813 return cdrom_ioctl_play_trkind(cdi, argp);
2814 case CDROMVOLCTRL:
2815 return cdrom_ioctl_volctrl(cdi, argp);
2816 case CDROMVOLREAD:
2817 return cdrom_ioctl_volread(cdi, argp);
2818 case CDROMSTART:
2819 case CDROMSTOP:
2820 case CDROMPAUSE:
2821 case CDROMRESUME:
2822 return cdrom_ioctl_audioctl(cdi, cmd);
2823 }
2824
2825 return -ENOSYS;
2826}
2827
2828/*
2829 * Required when we need to use READ_10 to issue other than 2048 block 2751 * Required when we need to use READ_10 to issue other than 2048 block
2830 * reads 2752 * reads
2831 */ 2753 */
@@ -2854,10 +2776,158 @@ static int cdrom_switch_blocksize(struct cdrom_device_info *cdi, int size)
2854 return cdo->generic_packet(cdi, &cgc); 2776 return cdo->generic_packet(cdi, &cgc);
2855} 2777}
2856 2778
2779static int cdrom_get_track_info(struct cdrom_device_info *cdi,
2780 __u16 track, __u8 type, track_information *ti)
2781{
2782 struct cdrom_device_ops *cdo = cdi->ops;
2783 struct packet_command cgc;
2784 int ret, buflen;
2785
2786 init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
2787 cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
2788 cgc.cmd[1] = type & 3;
2789 cgc.cmd[4] = (track & 0xff00) >> 8;
2790 cgc.cmd[5] = track & 0xff;
2791 cgc.cmd[8] = 8;
2792 cgc.quiet = 1;
2793
2794 ret = cdo->generic_packet(cdi, &cgc);
2795 if (ret)
2796 return ret;
2797
2798 buflen = be16_to_cpu(ti->track_information_length) +
2799 sizeof(ti->track_information_length);
2800
2801 if (buflen > sizeof(track_information))
2802 buflen = sizeof(track_information);
2803
2804 cgc.cmd[8] = cgc.buflen = buflen;
2805 ret = cdo->generic_packet(cdi, &cgc);
2806 if (ret)
2807 return ret;
2808
2809 /* return actual fill size */
2810 return buflen;
2811}
2812
2813/* return the last written block on the CD-R media. this is for the udf
2814 file system. */
2815int cdrom_get_last_written(struct cdrom_device_info *cdi, long *last_written)
2816{
2817 struct cdrom_tocentry toc;
2818 disc_information di;
2819 track_information ti;
2820 __u32 last_track;
2821 int ret = -1, ti_size;
2822
2823 if (!CDROM_CAN(CDC_GENERIC_PACKET))
2824 goto use_toc;
2825
2826 ret = cdrom_get_disc_info(cdi, &di);
2827 if (ret < (int)(offsetof(typeof(di), last_track_lsb)
2828 + sizeof(di.last_track_lsb)))
2829 goto use_toc;
2830
2831 /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */
2832 last_track = (di.last_track_msb << 8) | di.last_track_lsb;
2833 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
2834 if (ti_size < (int)offsetof(typeof(ti), track_start))
2835 goto use_toc;
2836
2837 /* if this track is blank, try the previous. */
2838 if (ti.blank) {
2839 if (last_track == 1)
2840 goto use_toc;
2841 last_track--;
2842 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
2843 }
2844
2845 if (ti_size < (int)(offsetof(typeof(ti), track_size)
2846 + sizeof(ti.track_size)))
2847 goto use_toc;
2848
2849 /* if last recorded field is valid, return it. */
2850 if (ti.lra_v && ti_size >= (int)(offsetof(typeof(ti), last_rec_address)
2851 + sizeof(ti.last_rec_address))) {
2852 *last_written = be32_to_cpu(ti.last_rec_address);
2853 } else {
2854 /* make it up instead */
2855 *last_written = be32_to_cpu(ti.track_start) +
2856 be32_to_cpu(ti.track_size);
2857 if (ti.free_blocks)
2858 *last_written -= (be32_to_cpu(ti.free_blocks) + 7);
2859 }
2860 return 0;
2861
2862 /* this is where we end up if the drive either can't do a
2863 GPCMD_READ_DISC_INFO or GPCMD_READ_TRACK_RZONE_INFO or if
2864 it doesn't give enough information or fails. then we return
2865 the toc contents. */
2866use_toc:
2867 toc.cdte_format = CDROM_MSF;
2868 toc.cdte_track = CDROM_LEADOUT;
2869 if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &toc)))
2870 return ret;
2871 sanitize_format(&toc.cdte_addr, &toc.cdte_format, CDROM_LBA);
2872 *last_written = toc.cdte_addr.lba;
2873 return 0;
2874}
2875
2876/* return the next writable block. also for udf file system. */
2877static int cdrom_get_next_writable(struct cdrom_device_info *cdi,
2878 long *next_writable)
2879{
2880 disc_information di;
2881 track_information ti;
2882 __u16 last_track;
2883 int ret, ti_size;
2884
2885 if (!CDROM_CAN(CDC_GENERIC_PACKET))
2886 goto use_last_written;
2887
2888 ret = cdrom_get_disc_info(cdi, &di);
2889 if (ret < 0 || ret < offsetof(typeof(di), last_track_lsb)
2890 + sizeof(di.last_track_lsb))
2891 goto use_last_written;
2892
2893 /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */
2894 last_track = (di.last_track_msb << 8) | di.last_track_lsb;
2895 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
2896 if (ti_size < 0 || ti_size < offsetof(typeof(ti), track_start))
2897 goto use_last_written;
2898
2899 /* if this track is blank, try the previous. */
2900 if (ti.blank) {
2901 if (last_track == 1)
2902 goto use_last_written;
2903 last_track--;
2904 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
2905 if (ti_size < 0)
2906 goto use_last_written;
2907 }
2908
2909 /* if next recordable address field is valid, use it. */
2910 if (ti.nwa_v && ti_size >= offsetof(typeof(ti), next_writable)
2911 + sizeof(ti.next_writable)) {
2912 *next_writable = be32_to_cpu(ti.next_writable);
2913 return 0;
2914 }
2915
2916use_last_written:
2917 ret = cdrom_get_last_written(cdi, next_writable);
2918 if (ret) {
2919 *next_writable = 0;
2920 return ret;
2921 } else {
2922 *next_writable += 7;
2923 return 0;
2924 }
2925}
2926
2857static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, 2927static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
2858 void __user *arg, 2928 void __user *arg,
2859 struct packet_command *cgc, 2929 struct packet_command *cgc,
2860 int cmd) 2930 int cmd)
2861{ 2931{
2862 struct request_sense sense; 2932 struct request_sense sense;
2863 struct cdrom_msf msf; 2933 struct cdrom_msf msf;
@@ -2876,7 +2946,8 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
2876 blocksize = CD_FRAMESIZE_RAW0; 2946 blocksize = CD_FRAMESIZE_RAW0;
2877 break; 2947 break;
2878 } 2948 }
2879 IOCTL_IN(arg, struct cdrom_msf, msf); 2949 if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
2950 return -EFAULT;
2880 lba = msf_to_lba(msf.cdmsf_min0, msf.cdmsf_sec0, msf.cdmsf_frame0); 2951 lba = msf_to_lba(msf.cdmsf_min0, msf.cdmsf_sec0, msf.cdmsf_frame0);
2881 /* FIXME: we need upper bound checking, too!! */ 2952 /* FIXME: we need upper bound checking, too!! */
2882 if (lba < 0) 2953 if (lba < 0)
@@ -2891,8 +2962,8 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi,
2891 cgc->data_direction = CGC_DATA_READ; 2962 cgc->data_direction = CGC_DATA_READ;
2892 ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize); 2963 ret = cdrom_read_block(cdi, cgc, lba, 1, format, blocksize);
2893 if (ret && sense.sense_key == 0x05 && 2964 if (ret && sense.sense_key == 0x05 &&
2894 sense.asc == 0x20 && 2965 sense.asc == 0x20 &&
2895 sense.ascq == 0x00) { 2966 sense.ascq == 0x00) {
2896 /* 2967 /*
2897 * SCSI-II devices are not required to support 2968 * SCSI-II devices are not required to support
2898 * READ_CD, so let's try switching block size 2969 * READ_CD, so let's try switching block size
@@ -2913,12 +2984,14 @@ out:
2913} 2984}
2914 2985
2915static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi, 2986static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi,
2916 void __user *arg) 2987 void __user *arg)
2917{ 2988{
2918 struct cdrom_read_audio ra; 2989 struct cdrom_read_audio ra;
2919 int lba; 2990 int lba;
2920 2991
2921 IOCTL_IN(arg, struct cdrom_read_audio, ra); 2992 if (copy_from_user(&ra, (struct cdrom_read_audio __user *)arg,
2993 sizeof(ra)))
2994 return -EFAULT;
2922 2995
2923 if (ra.addr_format == CDROM_MSF) 2996 if (ra.addr_format == CDROM_MSF)
2924 lba = msf_to_lba(ra.addr.msf.minute, 2997 lba = msf_to_lba(ra.addr.msf.minute,
@@ -2937,12 +3010,13 @@ static noinline int mmc_ioctl_cdrom_read_audio(struct cdrom_device_info *cdi,
2937} 3010}
2938 3011
2939static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi, 3012static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
2940 void __user *arg) 3013 void __user *arg)
2941{ 3014{
2942 int ret; 3015 int ret;
2943 struct cdrom_subchnl q; 3016 struct cdrom_subchnl q;
2944 u_char requested, back; 3017 u_char requested, back;
2945 IOCTL_IN(arg, struct cdrom_subchnl, q); 3018 if (copy_from_user(&q, (struct cdrom_subchnl __user *)arg, sizeof(q)))
3019 return -EFAULT;
2946 requested = q.cdsc_format; 3020 requested = q.cdsc_format;
2947 if (!((requested == CDROM_MSF) || 3021 if (!((requested == CDROM_MSF) ||
2948 (requested == CDROM_LBA))) 3022 (requested == CDROM_LBA)))
@@ -2954,19 +3028,21 @@ static noinline int mmc_ioctl_cdrom_subchannel(struct cdrom_device_info *cdi,
2954 back = q.cdsc_format; /* local copy */ 3028 back = q.cdsc_format; /* local copy */
2955 sanitize_format(&q.cdsc_absaddr, &back, requested); 3029 sanitize_format(&q.cdsc_absaddr, &back, requested);
2956 sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested); 3030 sanitize_format(&q.cdsc_reladdr, &q.cdsc_format, requested);
2957 IOCTL_OUT(arg, struct cdrom_subchnl, q); 3031 if (copy_to_user((struct cdrom_subchnl __user *)arg, &q, sizeof(q)))
2958 /* cdinfo(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */ 3032 return -EFAULT;
3033 /* cd_dbg(CD_DO_IOCTL, "CDROMSUBCHNL successful\n"); */
2959 return 0; 3034 return 0;
2960} 3035}
2961 3036
2962static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi, 3037static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
2963 void __user *arg, 3038 void __user *arg,
2964 struct packet_command *cgc) 3039 struct packet_command *cgc)
2965{ 3040{
2966 struct cdrom_device_ops *cdo = cdi->ops; 3041 struct cdrom_device_ops *cdo = cdi->ops;
2967 struct cdrom_msf msf; 3042 struct cdrom_msf msf;
2968 cdinfo(CD_DO_IOCTL, "entering CDROMPLAYMSF\n"); 3043 cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYMSF\n");
2969 IOCTL_IN(arg, struct cdrom_msf, msf); 3044 if (copy_from_user(&msf, (struct cdrom_msf __user *)arg, sizeof(msf)))
3045 return -EFAULT;
2970 cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF; 3046 cgc->cmd[0] = GPCMD_PLAY_AUDIO_MSF;
2971 cgc->cmd[3] = msf.cdmsf_min0; 3047 cgc->cmd[3] = msf.cdmsf_min0;
2972 cgc->cmd[4] = msf.cdmsf_sec0; 3048 cgc->cmd[4] = msf.cdmsf_sec0;
@@ -2979,13 +3055,14 @@ static noinline int mmc_ioctl_cdrom_play_msf(struct cdrom_device_info *cdi,
2979} 3055}
2980 3056
2981static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi, 3057static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
2982 void __user *arg, 3058 void __user *arg,
2983 struct packet_command *cgc) 3059 struct packet_command *cgc)
2984{ 3060{
2985 struct cdrom_device_ops *cdo = cdi->ops; 3061 struct cdrom_device_ops *cdo = cdi->ops;
2986 struct cdrom_blk blk; 3062 struct cdrom_blk blk;
2987 cdinfo(CD_DO_IOCTL, "entering CDROMPLAYBLK\n"); 3063 cd_dbg(CD_DO_IOCTL, "entering CDROMPLAYBLK\n");
2988 IOCTL_IN(arg, struct cdrom_blk, blk); 3064 if (copy_from_user(&blk, (struct cdrom_blk __user *)arg, sizeof(blk)))
3065 return -EFAULT;
2989 cgc->cmd[0] = GPCMD_PLAY_AUDIO_10; 3066 cgc->cmd[0] = GPCMD_PLAY_AUDIO_10;
2990 cgc->cmd[2] = (blk.from >> 24) & 0xff; 3067 cgc->cmd[2] = (blk.from >> 24) & 0xff;
2991 cgc->cmd[3] = (blk.from >> 16) & 0xff; 3068 cgc->cmd[3] = (blk.from >> 16) & 0xff;
@@ -2998,9 +3075,9 @@ static noinline int mmc_ioctl_cdrom_play_blk(struct cdrom_device_info *cdi,
2998} 3075}
2999 3076
3000static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi, 3077static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
3001 void __user *arg, 3078 void __user *arg,
3002 struct packet_command *cgc, 3079 struct packet_command *cgc,
3003 unsigned int cmd) 3080 unsigned int cmd)
3004{ 3081{
3005 struct cdrom_volctrl volctrl; 3082 struct cdrom_volctrl volctrl;
3006 unsigned char buffer[32]; 3083 unsigned char buffer[32];
@@ -3008,9 +3085,11 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
3008 unsigned short offset; 3085 unsigned short offset;
3009 int ret; 3086 int ret;
3010 3087
3011 cdinfo(CD_DO_IOCTL, "entering CDROMVOLUME\n"); 3088 cd_dbg(CD_DO_IOCTL, "entering CDROMVOLUME\n");
3012 3089
3013 IOCTL_IN(arg, struct cdrom_volctrl, volctrl); 3090 if (copy_from_user(&volctrl, (struct cdrom_volctrl __user *)arg,
3091 sizeof(volctrl)))
3092 return -EFAULT;
3014 3093
3015 cgc->buffer = buffer; 3094 cgc->buffer = buffer;
3016 cgc->buflen = 24; 3095 cgc->buflen = 24;
@@ -3030,14 +3109,14 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
3030 if (offset + 16 > cgc->buflen) { 3109 if (offset + 16 > cgc->buflen) {
3031 cgc->buflen = offset + 16; 3110 cgc->buflen = offset + 16;
3032 ret = cdrom_mode_sense(cdi, cgc, 3111 ret = cdrom_mode_sense(cdi, cgc,
3033 GPMODE_AUDIO_CTL_PAGE, 0); 3112 GPMODE_AUDIO_CTL_PAGE, 0);
3034 if (ret) 3113 if (ret)
3035 return ret; 3114 return ret;
3036 } 3115 }
3037 3116
3038 /* sanity check */ 3117 /* sanity check */
3039 if ((buffer[offset] & 0x3f) != GPMODE_AUDIO_CTL_PAGE || 3118 if ((buffer[offset] & 0x3f) != GPMODE_AUDIO_CTL_PAGE ||
3040 buffer[offset + 1] < 14) 3119 buffer[offset + 1] < 14)
3041 return -EINVAL; 3120 return -EINVAL;
3042 3121
3043 /* now we have the current volume settings. if it was only 3122 /* now we have the current volume settings. if it was only
@@ -3047,7 +3126,9 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
3047 volctrl.channel1 = buffer[offset+11]; 3126 volctrl.channel1 = buffer[offset+11];
3048 volctrl.channel2 = buffer[offset+13]; 3127 volctrl.channel2 = buffer[offset+13];
3049 volctrl.channel3 = buffer[offset+15]; 3128 volctrl.channel3 = buffer[offset+15];
3050 IOCTL_OUT(arg, struct cdrom_volctrl, volctrl); 3129 if (copy_to_user((struct cdrom_volctrl __user *)arg, &volctrl,
3130 sizeof(volctrl)))
3131 return -EFAULT;
3051 return 0; 3132 return 0;
3052 } 3133 }
3053 3134
@@ -3069,11 +3150,11 @@ static noinline int mmc_ioctl_cdrom_volume(struct cdrom_device_info *cdi,
3069} 3150}
3070 3151
3071static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi, 3152static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi,
3072 struct packet_command *cgc, 3153 struct packet_command *cgc,
3073 int cmd) 3154 int cmd)
3074{ 3155{
3075 struct cdrom_device_ops *cdo = cdi->ops; 3156 struct cdrom_device_ops *cdo = cdi->ops;
3076 cdinfo(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n"); 3157 cd_dbg(CD_DO_IOCTL, "entering CDROMSTART/CDROMSTOP\n");
3077 cgc->cmd[0] = GPCMD_START_STOP_UNIT; 3158 cgc->cmd[0] = GPCMD_START_STOP_UNIT;
3078 cgc->cmd[1] = 1; 3159 cgc->cmd[1] = 1;
3079 cgc->cmd[4] = (cmd == CDROMSTART) ? 1 : 0; 3160 cgc->cmd[4] = (cmd == CDROMSTART) ? 1 : 0;
@@ -3082,11 +3163,11 @@ static noinline int mmc_ioctl_cdrom_start_stop(struct cdrom_device_info *cdi,
3082} 3163}
3083 3164
3084static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi, 3165static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi,
3085 struct packet_command *cgc, 3166 struct packet_command *cgc,
3086 int cmd) 3167 int cmd)
3087{ 3168{
3088 struct cdrom_device_ops *cdo = cdi->ops; 3169 struct cdrom_device_ops *cdo = cdi->ops;
3089 cdinfo(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n"); 3170 cd_dbg(CD_DO_IOCTL, "entering CDROMPAUSE/CDROMRESUME\n");
3090 cgc->cmd[0] = GPCMD_PAUSE_RESUME; 3171 cgc->cmd[0] = GPCMD_PAUSE_RESUME;
3091 cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0; 3172 cgc->cmd[8] = (cmd == CDROMRESUME) ? 1 : 0;
3092 cgc->data_direction = CGC_DATA_NONE; 3173 cgc->data_direction = CGC_DATA_NONE;
@@ -3094,8 +3175,8 @@ static noinline int mmc_ioctl_cdrom_pause_resume(struct cdrom_device_info *cdi,
3094} 3175}
3095 3176
3096static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi, 3177static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi,
3097 void __user *arg, 3178 void __user *arg,
3098 struct packet_command *cgc) 3179 struct packet_command *cgc)
3099{ 3180{
3100 int ret; 3181 int ret;
3101 dvd_struct *s; 3182 dvd_struct *s;
@@ -3108,7 +3189,7 @@ static noinline int mmc_ioctl_dvd_read_struct(struct cdrom_device_info *cdi,
3108 if (!s) 3189 if (!s)
3109 return -ENOMEM; 3190 return -ENOMEM;
3110 3191
3111 cdinfo(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n"); 3192 cd_dbg(CD_DO_IOCTL, "entering DVD_READ_STRUCT\n");
3112 if (copy_from_user(s, arg, size)) { 3193 if (copy_from_user(s, arg, size)) {
3113 kfree(s); 3194 kfree(s);
3114 return -EFAULT; 3195 return -EFAULT;
@@ -3126,44 +3207,48 @@ out:
3126} 3207}
3127 3208
3128static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi, 3209static noinline int mmc_ioctl_dvd_auth(struct cdrom_device_info *cdi,
3129 void __user *arg) 3210 void __user *arg)
3130{ 3211{
3131 int ret; 3212 int ret;
3132 dvd_authinfo ai; 3213 dvd_authinfo ai;
3133 if (!CDROM_CAN(CDC_DVD)) 3214 if (!CDROM_CAN(CDC_DVD))
3134 return -ENOSYS; 3215 return -ENOSYS;
3135 cdinfo(CD_DO_IOCTL, "entering DVD_AUTH\n"); 3216 cd_dbg(CD_DO_IOCTL, "entering DVD_AUTH\n");
3136 IOCTL_IN(arg, dvd_authinfo, ai); 3217 if (copy_from_user(&ai, (dvd_authinfo __user *)arg, sizeof(ai)))
3218 return -EFAULT;
3137 ret = dvd_do_auth(cdi, &ai); 3219 ret = dvd_do_auth(cdi, &ai);
3138 if (ret) 3220 if (ret)
3139 return ret; 3221 return ret;
3140 IOCTL_OUT(arg, dvd_authinfo, ai); 3222 if (copy_to_user((dvd_authinfo __user *)arg, &ai, sizeof(ai)))
3223 return -EFAULT;
3141 return 0; 3224 return 0;
3142} 3225}
3143 3226
3144static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi, 3227static noinline int mmc_ioctl_cdrom_next_writable(struct cdrom_device_info *cdi,
3145 void __user *arg) 3228 void __user *arg)
3146{ 3229{
3147 int ret; 3230 int ret;
3148 long next = 0; 3231 long next = 0;
3149 cdinfo(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n"); 3232 cd_dbg(CD_DO_IOCTL, "entering CDROM_NEXT_WRITABLE\n");
3150 ret = cdrom_get_next_writable(cdi, &next); 3233 ret = cdrom_get_next_writable(cdi, &next);
3151 if (ret) 3234 if (ret)
3152 return ret; 3235 return ret;
3153 IOCTL_OUT(arg, long, next); 3236 if (copy_to_user((long __user *)arg, &next, sizeof(next)))
3237 return -EFAULT;
3154 return 0; 3238 return 0;
3155} 3239}
3156 3240
3157static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi, 3241static noinline int mmc_ioctl_cdrom_last_written(struct cdrom_device_info *cdi,
3158 void __user *arg) 3242 void __user *arg)
3159{ 3243{
3160 int ret; 3244 int ret;
3161 long last = 0; 3245 long last = 0;
3162 cdinfo(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n"); 3246 cd_dbg(CD_DO_IOCTL, "entering CDROM_LAST_WRITTEN\n");
3163 ret = cdrom_get_last_written(cdi, &last); 3247 ret = cdrom_get_last_written(cdi, &last);
3164 if (ret) 3248 if (ret)
3165 return ret; 3249 return ret;
3166 IOCTL_OUT(arg, long, last); 3250 if (copy_to_user((long __user *)arg, &last, sizeof(last)))
3251 return -EFAULT;
3167 return 0; 3252 return 0;
3168} 3253}
3169 3254
@@ -3212,181 +3297,101 @@ static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
3212 return -ENOTTY; 3297 return -ENOTTY;
3213} 3298}
3214 3299
3215static int cdrom_get_track_info(struct cdrom_device_info *cdi, __u16 track, __u8 type, 3300/*
3216 track_information *ti) 3301 * Just about every imaginable ioctl is supported in the Uniform layer
3217{ 3302 * these days.
3218 struct cdrom_device_ops *cdo = cdi->ops; 3303 * ATAPI / SCSI specific code now mainly resides in mmc_ioctl().
3219 struct packet_command cgc; 3304 */
3220 int ret, buflen; 3305int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
3221 3306 fmode_t mode, unsigned int cmd, unsigned long arg)
3222 init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
3223 cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
3224 cgc.cmd[1] = type & 3;
3225 cgc.cmd[4] = (track & 0xff00) >> 8;
3226 cgc.cmd[5] = track & 0xff;
3227 cgc.cmd[8] = 8;
3228 cgc.quiet = 1;
3229
3230 if ((ret = cdo->generic_packet(cdi, &cgc)))
3231 return ret;
3232
3233 buflen = be16_to_cpu(ti->track_information_length) +
3234 sizeof(ti->track_information_length);
3235
3236 if (buflen > sizeof(track_information))
3237 buflen = sizeof(track_information);
3238
3239 cgc.cmd[8] = cgc.buflen = buflen;
3240 if ((ret = cdo->generic_packet(cdi, &cgc)))
3241 return ret;
3242
3243 /* return actual fill size */
3244 return buflen;
3245}
3246
3247/* requires CD R/RW */
3248static int cdrom_get_disc_info(struct cdrom_device_info *cdi, disc_information *di)
3249{ 3307{
3250 struct cdrom_device_ops *cdo = cdi->ops; 3308 void __user *argp = (void __user *)arg;
3251 struct packet_command cgc; 3309 int ret;
3252 int ret, buflen;
3253
3254 /* set up command and get the disc info */
3255 init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
3256 cgc.cmd[0] = GPCMD_READ_DISC_INFO;
3257 cgc.cmd[8] = cgc.buflen = 2;
3258 cgc.quiet = 1;
3259
3260 if ((ret = cdo->generic_packet(cdi, &cgc)))
3261 return ret;
3262 3310
3263 /* not all drives have the same disc_info length, so requeue 3311 /*
3264 * packet with the length the drive tells us it can supply 3312 * Try the generic SCSI command ioctl's first.
3265 */ 3313 */
3266 buflen = be16_to_cpu(di->disc_information_length) + 3314 ret = scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
3267 sizeof(di->disc_information_length); 3315 if (ret != -ENOTTY)
3268
3269 if (buflen > sizeof(disc_information))
3270 buflen = sizeof(disc_information);
3271
3272 cgc.cmd[8] = cgc.buflen = buflen;
3273 if ((ret = cdo->generic_packet(cdi, &cgc)))
3274 return ret; 3316 return ret;
3275 3317
3276 /* return actual fill size */ 3318 switch (cmd) {
3277 return buflen; 3319 case CDROMMULTISESSION:
3278} 3320 return cdrom_ioctl_multisession(cdi, argp);
3279 3321 case CDROMEJECT:
3280/* return the last written block on the CD-R media. this is for the udf 3322 return cdrom_ioctl_eject(cdi);
3281 file system. */ 3323 case CDROMCLOSETRAY:
3282int cdrom_get_last_written(struct cdrom_device_info *cdi, long *last_written) 3324 return cdrom_ioctl_closetray(cdi);
3283{ 3325 case CDROMEJECT_SW:
3284 struct cdrom_tocentry toc; 3326 return cdrom_ioctl_eject_sw(cdi, arg);
3285 disc_information di; 3327 case CDROM_MEDIA_CHANGED:
3286 track_information ti; 3328 return cdrom_ioctl_media_changed(cdi, arg);
3287 __u32 last_track; 3329 case CDROM_SET_OPTIONS:
3288 int ret = -1, ti_size; 3330 return cdrom_ioctl_set_options(cdi, arg);
3289 3331 case CDROM_CLEAR_OPTIONS:
3290 if (!CDROM_CAN(CDC_GENERIC_PACKET)) 3332 return cdrom_ioctl_clear_options(cdi, arg);
3291 goto use_toc; 3333 case CDROM_SELECT_SPEED:
3292 3334 return cdrom_ioctl_select_speed(cdi, arg);
3293 ret = cdrom_get_disc_info(cdi, &di); 3335 case CDROM_SELECT_DISC:
3294 if (ret < (int)(offsetof(typeof(di), last_track_lsb) 3336 return cdrom_ioctl_select_disc(cdi, arg);
3295 + sizeof(di.last_track_lsb))) 3337 case CDROMRESET:
3296 goto use_toc; 3338 return cdrom_ioctl_reset(cdi, bdev);
3297 3339 case CDROM_LOCKDOOR:
3298 /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */ 3340 return cdrom_ioctl_lock_door(cdi, arg);
3299 last_track = (di.last_track_msb << 8) | di.last_track_lsb; 3341 case CDROM_DEBUG:
3300 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); 3342 return cdrom_ioctl_debug(cdi, arg);
3301 if (ti_size < (int)offsetof(typeof(ti), track_start)) 3343 case CDROM_GET_CAPABILITY:
3302 goto use_toc; 3344 return cdrom_ioctl_get_capability(cdi);
3303 3345 case CDROM_GET_MCN:
3304 /* if this track is blank, try the previous. */ 3346 return cdrom_ioctl_get_mcn(cdi, argp);
3305 if (ti.blank) { 3347 case CDROM_DRIVE_STATUS:
3306 if (last_track==1) 3348 return cdrom_ioctl_drive_status(cdi, arg);
3307 goto use_toc; 3349 case CDROM_DISC_STATUS:
3308 last_track--; 3350 return cdrom_ioctl_disc_status(cdi);
3309 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti); 3351 case CDROM_CHANGER_NSLOTS:
3310 } 3352 return cdrom_ioctl_changer_nslots(cdi);
3311
3312 if (ti_size < (int)(offsetof(typeof(ti), track_size)
3313 + sizeof(ti.track_size)))
3314 goto use_toc;
3315
3316 /* if last recorded field is valid, return it. */
3317 if (ti.lra_v && ti_size >= (int)(offsetof(typeof(ti), last_rec_address)
3318 + sizeof(ti.last_rec_address))) {
3319 *last_written = be32_to_cpu(ti.last_rec_address);
3320 } else {
3321 /* make it up instead */
3322 *last_written = be32_to_cpu(ti.track_start) +
3323 be32_to_cpu(ti.track_size);
3324 if (ti.free_blocks)
3325 *last_written -= (be32_to_cpu(ti.free_blocks) + 7);
3326 } 3353 }
3327 return 0;
3328 3354
3329 /* this is where we end up if the drive either can't do a 3355 /*
3330 GPCMD_READ_DISC_INFO or GPCMD_READ_TRACK_RZONE_INFO or if 3356 * Use the ioctls that are implemented through the generic_packet()
3331 it doesn't give enough information or fails. then we return 3357 * interface. this may look at bit funny, but if -ENOTTY is
3332 the toc contents. */ 3358 * returned that particular ioctl is not implemented and we
3333use_toc: 3359 * let it go through the device specific ones.
3334 toc.cdte_format = CDROM_MSF; 3360 */
3335 toc.cdte_track = CDROM_LEADOUT; 3361 if (CDROM_CAN(CDC_GENERIC_PACKET)) {
3336 if ((ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &toc))) 3362 ret = mmc_ioctl(cdi, cmd, arg);
3337 return ret; 3363 if (ret != -ENOTTY)
3338 sanitize_format(&toc.cdte_addr, &toc.cdte_format, CDROM_LBA); 3364 return ret;
3339 *last_written = toc.cdte_addr.lba;
3340 return 0;
3341}
3342
3343/* return the next writable block. also for udf file system. */
3344static int cdrom_get_next_writable(struct cdrom_device_info *cdi, long *next_writable)
3345{
3346 disc_information di;
3347 track_information ti;
3348 __u16 last_track;
3349 int ret, ti_size;
3350
3351 if (!CDROM_CAN(CDC_GENERIC_PACKET))
3352 goto use_last_written;
3353
3354 ret = cdrom_get_disc_info(cdi, &di);
3355 if (ret < 0 || ret < offsetof(typeof(di), last_track_lsb)
3356 + sizeof(di.last_track_lsb))
3357 goto use_last_written;
3358
3359 /* if unit didn't return msb, it's zeroed by cdrom_get_disc_info */
3360 last_track = (di.last_track_msb << 8) | di.last_track_lsb;
3361 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
3362 if (ti_size < 0 || ti_size < offsetof(typeof(ti), track_start))
3363 goto use_last_written;
3364
3365 /* if this track is blank, try the previous. */
3366 if (ti.blank) {
3367 if (last_track == 1)
3368 goto use_last_written;
3369 last_track--;
3370 ti_size = cdrom_get_track_info(cdi, last_track, 1, &ti);
3371 if (ti_size < 0)
3372 goto use_last_written;
3373 } 3365 }
3374 3366
3375 /* if next recordable address field is valid, use it. */ 3367 /*
3376 if (ti.nwa_v && ti_size >= offsetof(typeof(ti), next_writable) 3368 * Note: most of the cd_dbg() calls are commented out here,
3377 + sizeof(ti.next_writable)) { 3369 * because they fill up the sys log when CD players poll
3378 *next_writable = be32_to_cpu(ti.next_writable); 3370 * the drive.
3379 return 0; 3371 */
3372 switch (cmd) {
3373 case CDROMSUBCHNL:
3374 return cdrom_ioctl_get_subchnl(cdi, argp);
3375 case CDROMREADTOCHDR:
3376 return cdrom_ioctl_read_tochdr(cdi, argp);
3377 case CDROMREADTOCENTRY:
3378 return cdrom_ioctl_read_tocentry(cdi, argp);
3379 case CDROMPLAYMSF:
3380 return cdrom_ioctl_play_msf(cdi, argp);
3381 case CDROMPLAYTRKIND:
3382 return cdrom_ioctl_play_trkind(cdi, argp);
3383 case CDROMVOLCTRL:
3384 return cdrom_ioctl_volctrl(cdi, argp);
3385 case CDROMVOLREAD:
3386 return cdrom_ioctl_volread(cdi, argp);
3387 case CDROMSTART:
3388 case CDROMSTOP:
3389 case CDROMPAUSE:
3390 case CDROMRESUME:
3391 return cdrom_ioctl_audioctl(cdi, cmd);
3380 } 3392 }
3381 3393
3382use_last_written: 3394 return -ENOSYS;
3383 if ((ret = cdrom_get_last_written(cdi, next_writable))) {
3384 *next_writable = 0;
3385 return ret;
3386 } else {
3387 *next_writable += 7;
3388 return 0;
3389 }
3390} 3395}
3391 3396
3392EXPORT_SYMBOL(cdrom_get_last_written); 3397EXPORT_SYMBOL(cdrom_get_last_written);
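The cdrom.c hunks above follow two recurring substitutions: the cdinfo() debug macro becomes cd_dbg(), and the IOCTL_IN()/IOCTL_OUT() wrappers are open-coded as explicit copy_from_user()/copy_to_user() calls that return -EFAULT on failure. A minimal sketch of the second conversion, using a hypothetical helper name that is not part of the patch:

#include <linux/cdrom.h>
#include <linux/uaccess.h>

/* Hypothetical helper illustrating the IOCTL_IN/IOCTL_OUT conversion. */
static int cdrom_example_roundtrip_volctrl(void __user *arg)
{
	struct cdrom_volctrl volctrl;

	/* was: IOCTL_IN(arg, struct cdrom_volctrl, volctrl); */
	if (copy_from_user(&volctrl, (struct cdrom_volctrl __user *)arg,
			   sizeof(volctrl)))
		return -EFAULT;

	/* ... act on volctrl here ... */

	/* was: IOCTL_OUT(arg, struct cdrom_volctrl, volctrl); */
	if (copy_to_user((struct cdrom_volctrl __user *)arg, &volctrl,
			 sizeof(volctrl)))
		return -EFAULT;
	return 0;
}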
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 51e75ad96422..584bc3126403 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -602,7 +602,7 @@ static void gdrom_readdisk_dma(struct work_struct *work)
602 spin_unlock(&gdrom_lock); 602 spin_unlock(&gdrom_lock);
603 block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; 603 block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET;
604 block_cnt = blk_rq_sectors(req)/GD_TO_BLK; 604 block_cnt = blk_rq_sectors(req)/GD_TO_BLK;
605 __raw_writel(virt_to_phys(req->buffer), GDROM_DMA_STARTADDR_REG); 605 __raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG);
606 __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); 606 __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG);
607 __raw_writel(1, GDROM_DMA_DIRECTION_REG); 607 __raw_writel(1, GDROM_DMA_DIRECTION_REG);
608 __raw_writel(1, GDROM_DMA_ENABLE_REG); 608 __raw_writel(1, GDROM_DMA_ENABLE_REG);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 6b75713d953a..0a19d866a153 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk)
902 add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); 902 add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
903 trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); 903 trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool));
904} 904}
905EXPORT_SYMBOL_GPL(add_disk_randomness);
905#endif 906#endif
906 907
907/********************************************************************* 908/*********************************************************************
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 16f69be820c7..ee880382e3bc 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -188,10 +188,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
188 188
189 ledtrig_ide_activity(); 189 ledtrig_ide_activity();
190 190
191 pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n", 191 pr_debug("%s: %sing: block=%llu, sectors=%u\n",
192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ", 192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
193 (unsigned long long)block, blk_rq_sectors(rq), 193 (unsigned long long)block, blk_rq_sectors(rq));
194 (unsigned long)rq->buffer);
195 194
196 if (hwif->rw_disk) 195 if (hwif->rw_disk)
197 hwif->rw_disk(drive, rq); 196 hwif->rw_disk(drive, rq);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 455e64916498..6a71bc7c9133 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq,
1544 clone->cmd = rq->cmd; 1544 clone->cmd = rq->cmd;
1545 clone->cmd_len = rq->cmd_len; 1545 clone->cmd_len = rq->cmd_len;
1546 clone->sense = rq->sense; 1546 clone->sense = rq->sense;
1547 clone->buffer = rq->buffer;
1548 clone->end_io = end_clone_request; 1547 clone->end_io = end_clone_request;
1549 clone->end_io_data = tio; 1548 clone->end_io_data = tio;
1550 1549
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 0b2ccb68c0d0..4dbfaee9aa95 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -82,8 +82,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
82 82
83 block = blk_rq_pos(req) << 9 >> tr->blkshift; 83 block = blk_rq_pos(req) << 9 >> tr->blkshift;
84 nsect = blk_rq_cur_bytes(req) >> tr->blkshift; 84 nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
85 85 buf = bio_data(req->bio);
86 buf = req->buffer;
87 86
88 if (req->cmd_type != REQ_TYPE_FS) 87 if (req->cmd_type != REQ_TYPE_FS)
89 return -EIO; 88 return -EIO;
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 7ff473c871a9..ee774ba3728d 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -253,7 +253,7 @@ static int do_ubiblock_request(struct ubiblock *dev, struct request *req)
253 * flash access anyway. 253 * flash access anyway.
254 */ 254 */
255 mutex_lock(&dev->dev_mutex); 255 mutex_lock(&dev->dev_mutex);
256 ret = ubiblock_read(dev, req->buffer, sec, len); 256 ret = ubiblock_read(dev, bio_data(req->bio), sec, len);
257 mutex_unlock(&dev->dev_mutex); 257 mutex_unlock(&dev->dev_mutex);
258 258
259 return ret; 259 return ret;
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 4ccb5d869389..a40ee1e37486 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -207,7 +207,7 @@ static void jsfd_do_request(struct request_queue *q)
207 goto end; 207 goto end;
208 } 208 }
209 209
210 jsfd_read(req->buffer, jdp->dbase + offset, len); 210 jsfd_read(bio_data(req->bio), jdp->dbase + offset, len);
211 err = 0; 211 err = 0;
212 end: 212 end:
213 if (!__blk_end_request_cur(req, err)) 213 if (!__blk_end_request_cur(req, err))
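The gdrom, mtd_blkdevs, ubi block and jsflash hunks above, together with the ide-disk and dm cleanups, drop uses of the removed request->buffer field; drivers that still need the kernel address of the current data segment now take it from the first bio via bio_data(req->bio). A minimal sketch of that substitution, with a hypothetical request handler name:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/string.h>

/* Hypothetical request handler showing the req->buffer replacement. */
static void example_do_request(struct request *req, char *dev_base,
			       unsigned long offset, unsigned long len)
{
	/* was: char *buf = req->buffer; */
	char *buf = bio_data(req->bio);

	memcpy(buf, dev_base + offset, len);
}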
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 65a123d9c676..3cc82d3dec78 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -139,7 +139,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
139 */ 139 */
140 spin_lock_irqsave(q->queue_lock, flags); 140 spin_lock_irqsave(q->queue_lock, flags);
141 blk_requeue_request(q, cmd->request); 141 blk_requeue_request(q, cmd->request);
142 kblockd_schedule_work(q, &device->requeue_work); 142 kblockd_schedule_work(&device->requeue_work);
143 spin_unlock_irqrestore(q->queue_lock, flags); 143 spin_unlock_irqrestore(q->queue_lock, flags);
144} 144}
145 145
@@ -1018,8 +1018,6 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
1018 return BLKPREP_DEFER; 1018 return BLKPREP_DEFER;
1019 } 1019 }
1020 1020
1021 req->buffer = NULL;
1022
1023 /* 1021 /*
1024 * Next, walk the list, and fill in the addresses and sizes of 1022 * Next, walk the list, and fill in the addresses and sizes of
1025 * each segment. 1023 * each segment.
@@ -1156,7 +1154,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
1156 BUG_ON(blk_rq_bytes(req)); 1154 BUG_ON(blk_rq_bytes(req));
1157 1155
1158 memset(&cmd->sdb, 0, sizeof(cmd->sdb)); 1156 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
1159 req->buffer = NULL;
1160 } 1157 }
1161 1158
1162 cmd->cmd_len = req->cmd_len; 1159 cmd->cmd_len = req->cmd_len;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index efcbcd182863..96af195224f2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -737,16 +737,14 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
737 goto out; 737 goto out;
738 } 738 }
739 739
740 rq->completion_data = page;
740 blk_add_request_payload(rq, page, len); 741 blk_add_request_payload(rq, page, len);
741 ret = scsi_setup_blk_pc_cmnd(sdp, rq); 742 ret = scsi_setup_blk_pc_cmnd(sdp, rq);
742 rq->buffer = page_address(page);
743 rq->__data_len = nr_bytes; 743 rq->__data_len = nr_bytes;
744 744
745out: 745out:
746 if (ret != BLKPREP_OK) { 746 if (ret != BLKPREP_OK)
747 __free_page(page); 747 __free_page(page);
748 rq->buffer = NULL;
749 }
750 return ret; 748 return ret;
751} 749}
752 750
@@ -842,10 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq)
842{ 840{
843 struct scsi_cmnd *SCpnt = rq->special; 841 struct scsi_cmnd *SCpnt = rq->special;
844 842
845 if (rq->cmd_flags & REQ_DISCARD) { 843 if (rq->cmd_flags & REQ_DISCARD)
846 free_page((unsigned long)rq->buffer); 844 __free_page(rq->completion_data);
847 rq->buffer = NULL; 845
848 }
849 if (SCpnt->cmnd != rq->cmd) { 846 if (SCpnt->cmnd != rq->cmd) {
850 mempool_free(SCpnt->cmnd, sd_cdb_pool); 847 mempool_free(SCpnt->cmnd, sd_cdb_pool);
851 SCpnt->cmnd = NULL; 848 SCpnt->cmnd = NULL;
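In the sd.c hunks above the discard path loses request->buffer as well: the payload page is now remembered in rq->completion_data when the command is prepared and released with __free_page() in the unprep handler. A minimal sketch of the unprep side, with a hypothetical function name:

#include <linux/blkdev.h>
#include <linux/gfp.h>

/* Hypothetical unprep handler mirroring the sd.c discard cleanup above. */
static void example_sd_unprep(struct request_queue *q, struct request *rq)
{
	if (rq->cmd_flags & REQ_DISCARD)
		__free_page(rq->completion_data);
}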
diff --git a/fs/Makefile b/fs/Makefile
index f9cb9876e466..4030cbfbc9af 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \
14 stack.o fs_struct.o statfs.o 14 stack.o fs_struct.o statfs.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o 17obj-y += buffer.o block_dev.o direct-io.o mpage.o
18else 18else
19obj-y += no-block.o 19obj-y += no-block.o
20endif 20endif
21 21
22obj-$(CONFIG_PROC_FS) += proc_namespace.o 22obj-$(CONFIG_PROC_FS) += proc_namespace.o
23 23
24obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
25obj-y += notify/ 24obj-y += notify/
26obj-$(CONFIG_EPOLL) += eventpoll.o 25obj-$(CONFIG_EPOLL) += eventpoll.o
27obj-$(CONFIG_ANON_INODES) += anon_inodes.o 26obj-$(CONFIG_ANON_INODES) += anon_inodes.o
diff --git a/include/linux/bio.h b/include/linux/bio.h
index bba550826921..5a645769f020 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
333 333
334extern struct bio_set *bioset_create(unsigned int, unsigned int); 334extern struct bio_set *bioset_create(unsigned int, unsigned int);
335extern void bioset_free(struct bio_set *); 335extern void bioset_free(struct bio_set *);
336extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); 336extern mempool_t *biovec_create_pool(int pool_entries);
337 337
338extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 338extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
339extern void bio_put(struct bio *); 339extern void bio_put(struct bio *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0120451545d8..91dfb75ce39f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -8,7 +8,13 @@ struct blk_mq_tags;
8struct blk_mq_cpu_notifier { 8struct blk_mq_cpu_notifier {
9 struct list_head list; 9 struct list_head list;
10 void *data; 10 void *data;
11 void (*notify)(void *data, unsigned long action, unsigned int cpu); 11 int (*notify)(void *data, unsigned long action, unsigned int cpu);
12};
13
14struct blk_mq_ctxmap {
15 unsigned int map_size;
16 unsigned int bits_per_word;
17 struct blk_align_bitmap *map;
12}; 18};
13 19
14struct blk_mq_hw_ctx { 20struct blk_mq_hw_ctx {
@@ -18,7 +24,11 @@ struct blk_mq_hw_ctx {
18 } ____cacheline_aligned_in_smp; 24 } ____cacheline_aligned_in_smp;
19 25
20 unsigned long state; /* BLK_MQ_S_* flags */ 26 unsigned long state; /* BLK_MQ_S_* flags */
21 struct delayed_work delayed_work; 27 struct delayed_work run_work;
28 struct delayed_work delay_work;
29 cpumask_var_t cpumask;
30 int next_cpu;
31 int next_cpu_batch;
22 32
23 unsigned long flags; /* BLK_MQ_F_* flags */ 33 unsigned long flags; /* BLK_MQ_F_* flags */
24 34
@@ -27,13 +37,13 @@ struct blk_mq_hw_ctx {
27 37
28 void *driver_data; 38 void *driver_data;
29 39
40 struct blk_mq_ctxmap ctx_map;
41
30 unsigned int nr_ctx; 42 unsigned int nr_ctx;
31 struct blk_mq_ctx **ctxs; 43 struct blk_mq_ctx **ctxs;
32 unsigned int nr_ctx_map;
33 unsigned long *ctx_map;
34 44
35 struct request **rqs; 45 unsigned int wait_index;
36 struct list_head page_list; 46
37 struct blk_mq_tags *tags; 47 struct blk_mq_tags *tags;
38 48
39 unsigned long queued; 49 unsigned long queued;
@@ -41,31 +51,40 @@ struct blk_mq_hw_ctx {
41#define BLK_MQ_MAX_DISPATCH_ORDER 10 51#define BLK_MQ_MAX_DISPATCH_ORDER 10
42 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 52 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
43 53
44 unsigned int queue_depth;
45 unsigned int numa_node; 54 unsigned int numa_node;
46 unsigned int cmd_size; /* per-request extra data */ 55 unsigned int cmd_size; /* per-request extra data */
47 56
57 atomic_t nr_active;
58
48 struct blk_mq_cpu_notifier cpu_notifier; 59 struct blk_mq_cpu_notifier cpu_notifier;
49 struct kobject kobj; 60 struct kobject kobj;
50}; 61};
51 62
52struct blk_mq_reg { 63struct blk_mq_tag_set {
53 struct blk_mq_ops *ops; 64 struct blk_mq_ops *ops;
54 unsigned int nr_hw_queues; 65 unsigned int nr_hw_queues;
55 unsigned int queue_depth; 66 unsigned int queue_depth; /* max hw supported */
56 unsigned int reserved_tags; 67 unsigned int reserved_tags;
57 unsigned int cmd_size; /* per-request extra data */ 68 unsigned int cmd_size; /* per-request extra data */
58 int numa_node; 69 int numa_node;
59 unsigned int timeout; 70 unsigned int timeout;
60 unsigned int flags; /* BLK_MQ_F_* */ 71 unsigned int flags; /* BLK_MQ_F_* */
72 void *driver_data;
73
74 struct blk_mq_tags **tags;
75
76 struct mutex tag_list_lock;
77 struct list_head tag_list;
61}; 78};
62 79
63typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); 80typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
64typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); 81typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
65typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
66typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
67typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 82typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
68typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 83typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
84typedef int (init_request_fn)(void *, struct request *, unsigned int,
85 unsigned int, unsigned int);
86typedef void (exit_request_fn)(void *, struct request *, unsigned int,
87 unsigned int);
69 88
70struct blk_mq_ops { 89struct blk_mq_ops {
71 /* 90 /*
@@ -86,18 +105,20 @@ struct blk_mq_ops {
86 softirq_done_fn *complete; 105 softirq_done_fn *complete;
87 106
88 /* 107 /*
89 * Override for hctx allocations (should probably go)
90 */
91 alloc_hctx_fn *alloc_hctx;
92 free_hctx_fn *free_hctx;
93
94 /*
95 * Called when the block layer side of a hardware queue has been 108 * Called when the block layer side of a hardware queue has been
96 * set up, allowing the driver to allocate/init matching structures. 109 * set up, allowing the driver to allocate/init matching structures.
97 * Ditto for exit/teardown. 110 * Ditto for exit/teardown.
98 */ 111 */
99 init_hctx_fn *init_hctx; 112 init_hctx_fn *init_hctx;
100 exit_hctx_fn *exit_hctx; 113 exit_hctx_fn *exit_hctx;
114
115 /*
116 * Called for every command allocated by the block layer to allow
117 * the driver to set up driver specific data.
118 * Ditto for exit/teardown.
119 */
120 init_request_fn *init_request;
121 exit_request_fn *exit_request;
101}; 122};
102 123
103enum { 124enum {
@@ -107,18 +128,22 @@ enum {
107 128
108 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 129 BLK_MQ_F_SHOULD_MERGE = 1 << 0,
109 BLK_MQ_F_SHOULD_SORT = 1 << 1, 130 BLK_MQ_F_SHOULD_SORT = 1 << 1,
110 BLK_MQ_F_SHOULD_IPI = 1 << 2, 131 BLK_MQ_F_TAG_SHARED = 1 << 2,
111 132
112 BLK_MQ_S_STOPPED = 0, 133 BLK_MQ_S_STOPPED = 0,
134 BLK_MQ_S_TAG_ACTIVE = 1,
113 135
114 BLK_MQ_MAX_DEPTH = 2048, 136 BLK_MQ_MAX_DEPTH = 2048,
137
138 BLK_MQ_CPU_WORK_BATCH = 8,
115}; 139};
116 140
117struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); 141struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
118int blk_mq_register_disk(struct gendisk *); 142int blk_mq_register_disk(struct gendisk *);
119void blk_mq_unregister_disk(struct gendisk *); 143void blk_mq_unregister_disk(struct gendisk *);
120int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 144
121void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 145int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
146void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
122 147
123void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 148void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
124 149
@@ -126,28 +151,28 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
126void blk_mq_run_queues(struct request_queue *q, bool async); 151void blk_mq_run_queues(struct request_queue *q, bool async);
127void blk_mq_free_request(struct request *rq); 152void blk_mq_free_request(struct request *rq);
128bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 153bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
129struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); 154struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
130struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); 155 gfp_t gfp, bool reserved);
131struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); 156struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
132 157
133struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); 158struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
134struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); 159struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
135void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
136 160
137bool blk_mq_end_io_partial(struct request *rq, int error, 161void blk_mq_end_io(struct request *rq, int error);
138 unsigned int nr_bytes); 162void __blk_mq_end_io(struct request *rq, int error);
139static inline void blk_mq_end_io(struct request *rq, int error)
140{
141 bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq));
142 BUG_ON(!done);
143}
144 163
164void blk_mq_requeue_request(struct request *rq);
165void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
166void blk_mq_kick_requeue_list(struct request_queue *q);
145void blk_mq_complete_request(struct request *rq); 167void blk_mq_complete_request(struct request *rq);
146 168
147void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); 169void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
148void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); 170void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
149void blk_mq_stop_hw_queues(struct request_queue *q); 171void blk_mq_stop_hw_queues(struct request_queue *q);
150void blk_mq_start_stopped_hw_queues(struct request_queue *q); 172void blk_mq_start_hw_queues(struct request_queue *q);
173void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
174void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
175void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
151 176
152/* 177/*
153 * Driver command data is immediately after the request. So subtract request 178 * Driver command data is immediately after the request. So subtract request
@@ -162,12 +187,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
162 return (void *) rq + sizeof(*rq); 187 return (void *) rq + sizeof(*rq);
163} 188}
164 189
165static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
166 unsigned int tag)
167{
168 return hctx->rqs[tag];
169}
170
171#define queue_for_each_hw_ctx(q, hctx, i) \ 190#define queue_for_each_hw_ctx(q, hctx, i) \
172 for ((i) = 0; (i) < (q)->nr_hw_queues && \ 191 for ((i) = 0; (i) < (q)->nr_hw_queues && \
173 ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) 192 ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
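The blk-mq.h changes above replace struct blk_mq_reg with a long-lived struct blk_mq_tag_set and move per-command setup from blk_mq_init_commands()/blk_mq_free_commands() into the new init_request/exit_request callbacks in blk_mq_ops. A rough sketch of driver setup against the reworked interface follows; the mydrv_* names are hypothetical, only the blk_mq_* calls and structure fields come from this header, and the ERR_PTR handling is an assumption about the common convention rather than something mandated here:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>

static struct blk_mq_tag_set mydrv_tag_set;

static struct request_queue *mydrv_init_queue(struct blk_mq_ops *ops,
					      void *driver_data)
{
	struct request_queue *q;
	int ret;

	mydrv_tag_set.ops = ops;		/* includes init_request/exit_request */
	mydrv_tag_set.nr_hw_queues = 1;
	mydrv_tag_set.queue_depth = 64;		/* max hw supported */
	mydrv_tag_set.cmd_size = 0;		/* per-request extra data */
	mydrv_tag_set.numa_node = NUMA_NO_NODE;
	mydrv_tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	mydrv_tag_set.driver_data = driver_data;

	/* replaces the old blk_mq_reg + blk_mq_init_commands() scheme */
	ret = blk_mq_alloc_tag_set(&mydrv_tag_set);
	if (ret)
		return ERR_PTR(ret);

	q = blk_mq_init_queue(&mydrv_tag_set);
	if (IS_ERR(q))
		blk_mq_free_tag_set(&mydrv_tag_set);
	return q;
}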
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index aa0eaa2d0bd8..d8e4cea23a25 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -190,6 +190,7 @@ enum rq_flag_bits {
190 __REQ_PM, /* runtime pm request */ 190 __REQ_PM, /* runtime pm request */
191 __REQ_END, /* last of chain of requests */ 191 __REQ_END, /* last of chain of requests */
192 __REQ_HASHED, /* on IO scheduler merge hash */ 192 __REQ_HASHED, /* on IO scheduler merge hash */
193 __REQ_MQ_INFLIGHT, /* track inflight for MQ */
193 __REQ_NR_BITS, /* stops here */ 194 __REQ_NR_BITS, /* stops here */
194}; 195};
195 196
@@ -243,5 +244,6 @@ enum rq_flag_bits {
243#define REQ_PM (1ULL << __REQ_PM) 244#define REQ_PM (1ULL << __REQ_PM)
244#define REQ_END (1ULL << __REQ_END) 245#define REQ_END (1ULL << __REQ_END)
245#define REQ_HASHED (1ULL << __REQ_HASHED) 246#define REQ_HASHED (1ULL << __REQ_HASHED)
247#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
246 248
247#endif /* __LINUX_BLK_TYPES_H */ 249#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0d84981ee03f..e90e1692e052 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -90,15 +90,15 @@ enum rq_cmd_type_bits {
90#define BLK_MAX_CDB 16 90#define BLK_MAX_CDB 16
91 91
92/* 92/*
93 * try to put the fields that are referenced together in the same cacheline. 93 * Try to put the fields that are referenced together in the same cacheline.
94 * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() 94 *
95 * as well! 95 * If you modify this structure, make sure to update blk_rq_init() and
96 * especially blk_mq_rq_ctx_init() to take care of the added fields.
96 */ 97 */
97struct request { 98struct request {
98 struct list_head queuelist; 99 struct list_head queuelist;
99 union { 100 union {
100 struct call_single_data csd; 101 struct call_single_data csd;
101 struct work_struct mq_flush_work;
102 unsigned long fifo_time; 102 unsigned long fifo_time;
103 }; 103 };
104 104
@@ -178,7 +178,6 @@ struct request {
178 unsigned short ioprio; 178 unsigned short ioprio;
179 179
180 void *special; /* opaque pointer available for LLD use */ 180 void *special; /* opaque pointer available for LLD use */
181 char *buffer; /* kaddr of the current segment if available */
182 181
183 int tag; 182 int tag;
184 int errors; 183 int errors;
@@ -463,6 +462,10 @@ struct request_queue {
463 struct request *flush_rq; 462 struct request *flush_rq;
464 spinlock_t mq_flush_lock; 463 spinlock_t mq_flush_lock;
465 464
465 struct list_head requeue_list;
466 spinlock_t requeue_lock;
467 struct work_struct requeue_work;
468
466 struct mutex sysfs_lock; 469 struct mutex sysfs_lock;
467 470
468 int bypass_depth; 471 int bypass_depth;
@@ -481,6 +484,9 @@ struct request_queue {
481 wait_queue_head_t mq_freeze_wq; 484 wait_queue_head_t mq_freeze_wq;
482 struct percpu_counter mq_usage_counter; 485 struct percpu_counter mq_usage_counter;
483 struct list_head all_q_node; 486 struct list_head all_q_node;
487
488 struct blk_mq_tag_set *tag_set;
489 struct list_head tag_set_list;
484}; 490};
485 491
486#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 492#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
@@ -613,6 +619,15 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
613 619
614#define rq_data_dir(rq) (((rq)->cmd_flags & 1) != 0) 620#define rq_data_dir(rq) (((rq)->cmd_flags & 1) != 0)
615 621
622/*
623 * Driver can handle struct request, if it either has an old style
624 * request_fn defined, or is blk-mq based.
625 */
626static inline bool queue_is_rq_based(struct request_queue *q)
627{
628 return q->request_fn || q->mq_ops;
629}
630
616static inline unsigned int blk_queue_cluster(struct request_queue *q) 631static inline unsigned int blk_queue_cluster(struct request_queue *q)
617{ 632{
618 return q->limits.cluster; 633 return q->limits.cluster;
@@ -937,6 +952,7 @@ extern struct request *blk_fetch_request(struct request_queue *q);
937 */ 952 */
938extern bool blk_update_request(struct request *rq, int error, 953extern bool blk_update_request(struct request *rq, int error,
939 unsigned int nr_bytes); 954 unsigned int nr_bytes);
955extern void blk_finish_request(struct request *rq, int error);
940extern bool blk_end_request(struct request *rq, int error, 956extern bool blk_end_request(struct request *rq, int error,
941 unsigned int nr_bytes); 957 unsigned int nr_bytes);
942extern void blk_end_request_all(struct request *rq, int error); 958extern void blk_end_request_all(struct request *rq, int error);
@@ -1102,7 +1118,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
1102/* 1118/*
1103 * tag stuff 1119 * tag stuff
1104 */ 1120 */
1105#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) 1121#define blk_rq_tagged(rq) \
1122 ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED))
1106extern int blk_queue_start_tag(struct request_queue *, struct request *); 1123extern int blk_queue_start_tag(struct request_queue *, struct request *);
1107extern struct request *blk_queue_find_tag(struct request_queue *, int); 1124extern struct request *blk_queue_find_tag(struct request_queue *, int);
1108extern void blk_queue_end_tag(struct request_queue *, struct request *); 1125extern void blk_queue_end_tag(struct request_queue *, struct request *);
@@ -1370,8 +1387,9 @@ static inline void put_dev_sector(Sector p)
1370} 1387}
1371 1388
1372struct work_struct; 1389struct work_struct;
1373int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1390int kblockd_schedule_work(struct work_struct *work);
1374int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); 1391int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
1392int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
1375 1393
1376#ifdef CONFIG_BLK_CGROUP 1394#ifdef CONFIG_BLK_CGROUP
1377/* 1395/*
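The blkdev.h hunk above also reworks the kblockd helpers: kblockd_schedule_work() and kblockd_schedule_delayed_work() no longer take a request_queue argument, matching the scsi_lib.c caller earlier in this patch. A minimal sketch of a converted caller, with a hypothetical function name:

#include <linux/blkdev.h>
#include <scsi/scsi_device.h>

/* Hypothetical caller showing the dropped request_queue argument. */
static void example_requeue(struct scsi_device *sdev)
{
	/* was: kblockd_schedule_work(q, &sdev->requeue_work); */
	kblockd_schedule_work(&sdev->requeue_work);
}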
diff --git a/mm/Makefile b/mm/Makefile
index b484452dac57..0173940407f6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,7 +30,6 @@ endif
30 30
31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
32 32
33obj-$(CONFIG_BOUNCE) += bounce.o
34obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
35obj-$(CONFIG_FRONTSWAP) += frontswap.o 34obj-$(CONFIG_FRONTSWAP) += frontswap.o
36obj-$(CONFIG_ZSWAP) += zswap.o 35obj-$(CONFIG_ZSWAP) += zswap.o